diff --git a/README.MD b/README.MD index 1c5f854..58ab651 100644 --- a/README.MD +++ b/README.MD @@ -21,6 +21,7 @@ - 地区分类:按国家的数据、按省份统计的数据、按城市统计的数据 - 中国截止目前累积数据、每日累积的数据(1月13日开始进行每日数据统计) -#### 4. 数据可视化 -- 方式1: -- 方式2:使用 basemap + matplotlib +#### 4. 疫情数据可视化 +- 使用 basemap + matplotlib +- 每日疫情变化时间图 +- 中国疫情地图 diff --git "a/nCov_basemap_visualization/2019-nCoV\347\226\253\346\203\205\345\210\206\345\270\203\345\234\260\345\233\276.png" "b/nCov_basemap_visualization/2019-nCoV\347\226\253\346\203\205\345\210\206\345\270\203\345\234\260\345\233\276.png" new file mode 100644 index 0000000..beadaae Binary files /dev/null and "b/nCov_basemap_visualization/2019-nCoV\347\226\253\346\203\205\345\210\206\345\270\203\345\234\260\345\233\276.png" differ diff --git a/nCov_basemap_visualization/nCoV_daily_change.py b/nCov_basemap_visualization/nCoV_daily_change.py new file mode 100644 index 0000000..18c7fcc --- /dev/null +++ b/nCov_basemap_visualization/nCoV_daily_change.py @@ -0,0 +1,59 @@ +# -*- coding:utf-8 -*- +# project_xxx\venv\Scripts python + +''' +Author: Felix +WeiXin: AXiaShuBai +Email: xiashubai@gmail.com +Blog: https://blog.csdn.net/u011318077 +Date: 2020/1/31 17:18 +Desc: +''' + +from china_data_analysis import ChinaData +import matplotlib.pyplot as plt + +def daily_change(): + + # 获取每日疫情数据,日期,确诊,疑似,死亡,治愈 + date_list, everyday_confirm, everyday_suspect, everyday_dead, everyday_heal = ChinaData().china_everyday_data() + + # 显示中文和显示负号 + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False + + # 绘制画布和子图对象 + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # 左Y轴绘制确诊和疑似病例曲线 + ax1.plot(date_list, everyday_confirm, lw=2, ls='--', marker='o', color='red', label='确诊') + ax1.plot(date_list, everyday_suspect, lw=2, ls='--', marker='o', color='orange', label='疑似') + # 设置标题,XY轴标题,刻度 + ax1.set_title("2019-nCoV疫情变化时间图", fontsize=16) + ax1.set_xlabel("2020年1月", fontsize=16) + ax1.set_xticklabels(date_list, rotation=30) + ax1.set_ylabel(r"确诊及疑似人数", fontsize=16) + ax1.set_ylim(0, 16000) + # 显示网格线和显示图例 + plt.grid(which='major', axis='both', color='grey', linestyle='--', alpha=0.2) + plt.legend(loc='upper left', bbox_to_anchor=(0.3,1)) + + # 右Y轴绘制死亡和治愈病例曲线,共用ax1的X轴 + ax2 = ax1.twinx() + ax2.plot(date_list, everyday_dead, lw=1, ls='--', marker='.', color='cyan', label='死亡') + ax2.plot(date_list, everyday_heal, lw=1, ls='--', marker='.', color='green', label='治愈') + # 设置标题刻度 + ax2.set_ylabel(r"死亡及治愈人数", fontsize=16) + ax2.set_ylim(0, 400) + # 显示网格线和显示图例 + plt.grid(which='major', axis='both', color='grey', linestyle='--', alpha=0.2) + plt.legend(loc='upper center') + + # 展示图形 + # plt.show() + # 保存图形为图片,第一个参数保存路径,第二个参数裁掉多余的空白部分 + plt.savefig('2019-nCoV疫情变化时间图.png', bbox_inches='tight') + +if __name__ == '__main__': + daily_change() + diff --git a/nCov_basemap_visualization/nCov_basemap_matplotlib_visual.py b/nCov_basemap_visualization/nCov_basemap_matplotlib_visual.py deleted file mode 100644 index 1dabf8f..0000000 --- a/nCov_basemap_visualization/nCov_basemap_matplotlib_visual.py +++ /dev/null @@ -1,57 +0,0 @@ -# -*- coding:utf-8 -*- -# project_xxx\venv\Scripts python - -''' -Author: Felix -WeiXin: AXiaShuBai -Email: xiashubai@gmail.com -Blog: https://blog.csdn.net/u011318077 -Date: 2020/1/31 17:18 -Desc: -''' - -import matplotlib.pyplot as plt -from china_data_analysis import ChinaData - -# 显示中文和显示负号 -plt.rcParams['font.sans-serif'] = ['SimHei'] -plt.rcParams['axes.unicode_minus'] = False - -# 绘制画布和子图对象 -fig, ax1 = plt.subplots(figsize=(10, 6)) - -# 获取每日疫情数据,日期,确诊,疑似,死亡,治愈 -date_list, 
everyday_confirm, everyday_suspect, everyday_dead, everyday_heal = ChinaData().china_everyday_data() - -# 左Y轴绘制确诊和疑似病例曲线 -ax1.plot(date_list, everyday_confirm, lw=2, ls='--', marker='o', color='red', label='确诊') -ax1.plot(date_list, everyday_suspect, lw=2, ls='--', marker='o', color='orange', label='疑似') - -# 设置标题,XY轴标题,刻度 -ax1.set_title("2019-nCoV疫情变化时间图", fontsize=16) -ax1.set_xlabel("2020年1月", fontsize=16) -ax1.set_xticklabels(date_list, rotation=30) -ax1.set_ylabel(r"确诊及疑似人数", fontsize=16) -ax1.set_ylim(0, 16000) -# 显示网格线和显示图例 -plt.grid(which='major', axis='both', color='grey', linestyle='--', alpha=0.2) -plt.legend(loc='upper left', bbox_to_anchor=(0.3,1)) - - -# 右Y轴绘制死亡和治愈病例曲线,共用ax1的X轴 -ax2 = ax1.twinx() -ax2.plot(date_list, everyday_dead, lw=1, ls='--', marker='.', color='cyan', label='死亡') -ax2.plot(date_list, everyday_heal, lw=1, ls='--', marker='.', color='green', label='治愈') - -# 设置标题刻度 -ax2.set_ylabel(r"死亡及治愈人数", fontsize=16) -ax2.set_ylim(0, 400) -# 显示网格线和显示图例 -plt.grid(which='major', axis='both', color='grey', linestyle='--', alpha=0.2) -plt.legend(loc='upper center') - - -# 展示图形 -# plt.show() -# 保存图形为图片,第一个参数保存路径,第二个参数裁掉多余的空白部分 -plt.savefig('2019-nCoV疫情变化时间图.png', bbox_inches='tight') diff --git a/nCov_basemap_visualization/nCov_distribution_map.py b/nCov_basemap_visualization/nCov_distribution_map.py new file mode 100644 index 0000000..a490c1e --- /dev/null +++ b/nCov_basemap_visualization/nCov_distribution_map.py @@ -0,0 +1,86 @@ +# -*- coding:utf-8 -*- +# project_xxx\venv\Scripts python + +''' +Author: Felix +WeiXin: AXiaShuBai +Email: xiashubai@gmail.com +Blog: https://blog.csdn.net/u011318077 +Date: 2020/1/31 17:18 +Desc: +''' +from province_data_analysis import ProvinceData +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from mpl_toolkits.basemap import Basemap + +def distribution_map(): + + # 显示中文和显示负号 + plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['axes.unicode_minus'] = False + + # 获取省份名称列表及确认病例列表原始数据,按照人数多到少排列 + province_name, province_total_confirm = ProvinceData().province_total_data() + province_confirm_dict = dict(zip(province_name, province_total_confirm)) + + # 设置图形大小 + plt.figure(figsize=(10, 8), dpi=300) + + # 设置中国的经纬度范围 + lon_min, lon_max = 77, 142 + lat_min, lat_max = 14, 51 + + # 绘制中国地图,设置经度纬度范围,使用兰伯特投影 + map = Basemap(llcrnrlon=lon_min, llcrnrlat=lat_min, urcrnrlon=lon_max, urcrnrlat=lat_max, projection='lcc', + lat_1=33, lat_2=45, lon_0=100) + map.readshapefile('../china_shapfiles/china-shapefiles-simple-version/china', 'china', drawbounds=True) + map.readshapefile('../china_shapfiles/china-shapefiles-simple-version/china_nine_dotted_line', 'china_nine', + drawbounds=True) + + # 读取各省份省委城市的经纬度数据 + posi = pd.read_excel('中国省会城市经度纬度表.xlsx') + province_list = list(posi['province']) + lat_list = np.array(posi["lat"][0:34]) + lon_list = np.array(posi["lon"][0:34]) + confirm_origin = list(posi["confirm"][0:34]) + province_dict = dict(zip(province_list,confirm_origin)) + + # 进行重新排序后的省份疫情表,省份排序与本地的经纬度表一致 + new_province_confirm= data_merge(province_dict, province_confirm_dict) + confirm_list = np.array(list(new_province_confirm.values())) + size = (confirm_list/np.max(confirm_list))*3000 + print(confirm_list) + + parallels = np.arange(0., 90, 10.) + map.drawparallels(parallels, labels=[1, 0, 0, 0], fontsize=10) # 绘制纬线 + meridians = np.arange(80., 140., 10.) 
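    # Worked example of the data_merge() call above (hypothetical figures):
    # the crawler lists provinces from most to fewest confirmed cases, while the
    # local Excel sheet follows a fixed order matching lat_list/lon_list, so the
    # two dicts are merged key by key and the result keeps the Excel ordering
    # (Python 3.7+ dicts preserve insertion order):
    #     province_dict         = {'浙江': 0, '湖北': 0}       # Excel order, placeholder zeros
    #     province_confirm_dict = {'湖北': 7153, '浙江': 661}  # crawler order
    #     data_merge(province_dict, province_confirm_dict)     # -> {'浙江': 661, '湖北': 7153}
    # The alignment only holds if both sources spell the province names identically.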
+ map.drawmeridians(meridians, labels=[0, 0, 0, 1], fontsize=10) # 绘制经线 + + x, y = map(lon_list, lat_list) + map.scatter(x, y, s=size, c='red') + # 设置数字标记 + for i in range(0, 34): + plt.text(x[i] + 5000, y[i] + 5000, str(confirm_list[i])) + + plt.title('2019-nCoV疫情分布地图', fontsize=16) + plt.savefig('2019-nCoV疫情分布地图.png') + plt.show() + +# 由于原始疫情数据是按确诊人数排列的,与本地经纬度表排序不一致 +# 我们将省份相同的名称对应的confirm(初始confirm都是0)值相加,得到重新排序后的确诊人数列表 +def data_merge(A, B): + C = dict() + for key in A: + if B.get(key): + C[key] = A[key] + B[key] + else: + C[key] = A[key] + for key in B: + if not A.get(key): + C[key] = B[key] + return C + +if __name__ == '__main__': + distribution_map() diff --git "a/nCov_basemap_visualization/\344\270\255\345\233\275\347\234\201\344\274\232\345\237\216\345\270\202\347\273\217\345\272\246\347\272\254\345\272\246\350\241\250.xlsx" "b/nCov_basemap_visualization/\344\270\255\345\233\275\347\234\201\344\274\232\345\237\216\345\270\202\347\273\217\345\272\246\347\272\254\345\272\246\350\241\250.xlsx" new file mode 100644 index 0000000..943dcb6 Binary files /dev/null and "b/nCov_basemap_visualization/\344\270\255\345\233\275\347\234\201\344\274\232\345\237\216\345\270\202\347\273\217\345\272\246\347\272\254\345\272\246\350\241\250.xlsx" differ diff --git a/nCov_data_analysis/province_data_analysis.py b/nCov_data_analysis/province_data_analysis.py index 08457c6..5f88893 100644 --- a/nCov_data_analysis/province_data_analysis.py +++ b/nCov_data_analysis/province_data_analysis.py @@ -33,8 +33,9 @@ def province_total_data(self): province_total_suspect.append(province['total']['suspect']) province_total_dead.append(province['total']['dead']) province_total_heal.append(province['total']['heal']) - print(province_name) - print(province_total_confirm) + # print(province_name) + # print(province_total_confirm) + return province_name, province_total_confirm def province_today_data(self): '''获取各省今日数据''' @@ -50,7 +51,7 @@ def province_today_data(self): province_today_suspect.append(province['today']['suspect']) province_today_dead.append(province['total']['dead']) province_today_heal.append(province['total']['heal']) - print(province_today_confirm) + # print(province_today_confirm) def main(self): self.province_total_data() diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/INSTALLER b/venv/Lib/site-packages/pandas-1.0.0.dist-info/INSTALLER new file mode 100644 index 0000000..a1b589e --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/LICENSE b/venv/Lib/site-packages/pandas-1.0.0.dist-info/LICENSE new file mode 100644 index 0000000..924de26 --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/METADATA b/venv/Lib/site-packages/pandas-1.0.0.dist-info/METADATA new file mode 100644 index 0000000..368b5c5 --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/METADATA @@ -0,0 +1,95 @@ +Metadata-Version: 2.1 +Name: pandas +Version: 1.0.0 +Summary: Powerful data structures for data analysis, time series, and statistics +Home-page: https://pandas.pydata.org +Maintainer: The PyData Development Team +Maintainer-email: pydata@googlegroups.com +License: BSD +Project-URL: Bug Tracker, https://github.com/pandas-dev/pandas/issues +Project-URL: Documentation, https://pandas.pydata.org/pandas-docs/stable/ +Project-URL: Source Code, https://github.com/pandas-dev/pandas +Platform: any +Classifier: Development Status :: 5 - Production/Stable +Classifier: Environment :: Console +Classifier: Operating System :: OS Independent +Classifier: Intended Audience :: Science/Research +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Cython +Classifier: Topic :: Scientific/Engineering +Requires-Python: >=3.6.1 +Requires-Dist: python-dateutil (>=2.6.1) +Requires-Dist: pytz (>=2017.2) +Requires-Dist: numpy (>=1.13.3) +Provides-Extra: test +Requires-Dist: pytest (>=4.0.2) ; extra == 'test' +Requires-Dist: pytest-xdist ; extra == 'test' +Requires-Dist: hypothesis (>=3.58) ; extra == 'test' + + +**pandas** is a Python package providing fast, flexible, and expressive data +structures designed to make working with structured (tabular, multidimensional, +potentially heterogeneous) and time series data both easy and intuitive. It +aims to be the fundamental high-level building block for doing practical, +**real world** data analysis in Python. Additionally, it has the broader goal +of becoming **the most powerful and flexible open source data analysis / +manipulation tool available in any language**. It is already well on its way +toward this goal. + +pandas is well suited for many different kinds of data: + + - Tabular data with heterogeneously-typed columns, as in an SQL table or + Excel spreadsheet + - Ordered and unordered (not necessarily fixed-frequency) time series data. + - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and + column labels + - Any other form of observational / statistical data sets. 
The data actually + need not be labeled at all to be placed into a pandas data structure + +The two primary data structures of pandas, Series (1-dimensional) and DataFrame +(2-dimensional), handle the vast majority of typical use cases in finance, +statistics, social science, and many areas of engineering. For R users, +DataFrame provides everything that R's ``data.frame`` provides and much +more. pandas is built on top of `NumPy `__ and is +intended to integrate well within a scientific computing environment with many +other 3rd party libraries. + +Here are just a few of the things that pandas does well: + + - Easy handling of **missing data** (represented as NaN) in floating point as + well as non-floating point data + - Size mutability: columns can be **inserted and deleted** from DataFrame and + higher dimensional objects + - Automatic and explicit **data alignment**: objects can be explicitly + aligned to a set of labels, or the user can simply ignore the labels and + let `Series`, `DataFrame`, etc. automatically align the data for you in + computations + - Powerful, flexible **group by** functionality to perform + split-apply-combine operations on data sets, for both aggregating and + transforming data + - Make it **easy to convert** ragged, differently-indexed data in other + Python and NumPy data structures into DataFrame objects + - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets + - Intuitive **merging** and **joining** data sets + - Flexible **reshaping** and pivoting of data sets + - **Hierarchical** labeling of axes (possible to have multiple labels per + tick) + - Robust IO tools for loading data from **flat files** (CSV and delimited), + Excel files, databases, and saving / loading data from the ultrafast **HDF5 + format** + - **Time series**-specific functionality: date range generation and frequency + conversion, moving window statistics, date shifting and lagging. + +Many of these principles are here to address the shortcomings frequently +experienced using other languages / scientific research environments. For data +scientists, working with data is typically divided into multiple stages: +munging and cleaning data, analyzing / modeling it, then organizing the results +of the analysis into a form suitable for plotting or tabular display. pandas is +the ideal tool for all of these tasks. 
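Of the capabilities listed in this vendored package description, the project itself only calls pd.read_excel, but the two most often cited ones above (NaN-aware aggregation and split-apply-combine grouping) take only a few lines to see. A minimal, self-contained sketch with toy values, not taken from the repo:

import numpy as np
import pandas as pd

# Missing values travel as NaN and are skipped by numeric aggregations by default
df = pd.DataFrame({
    'province': ['湖北', '湖北', '浙江'],
    'confirm': [1.0, np.nan, 2.0],
})
print(df['confirm'].sum())                      # 3.0 -- the NaN is ignored

# Split-apply-combine: group by a label column, then aggregate each group
print(df.groupby('province')['confirm'].sum())  # one summed row per province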
+ + diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/RECORD b/venv/Lib/site-packages/pandas-1.0.0.dist-info/RECORD new file mode 100644 index 0000000..c0d9d60 --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/RECORD @@ -0,0 +1,1749 @@ +pandas/__init__.py,sha256=Y8ztkcG-ZBjSYUiFeR8mp7urldgrN8-FPJoAUXKurCU,10644 +pandas/_testing.py,sha256=FlxqabYKiLLk2zyMIz8h13_EvL5URFOJarbNYU9IyzQ,83705 +pandas/_typing.py,sha256=b5dSJFjol73nqPiIWA8kNtgq6QrH_uj1JoNkVD_rCr4,2569 +pandas/_version.py,sha256=GnuuK3zYZ9qXGyzXjOCNYP61bwzuf7Es0PIBS8HQ0G8,568 +pandas/conftest.py,sha256=DwkxF_Xr9jPFXfWuPha83w5EBVktsgRrnVjzmiS9DQg,22364 +pandas/testing.py,sha256=R-qMoJX9R-8-XhM57EirjGYyY3hRnbqHlR8PgcP6Zk0,312 +pandas/_config/__init__.py,sha256=jzEp5Rpr9WGcaLUuXUKeHQcmWNh3OGBmRTFKTGVMq4s,669 +pandas/_config/config.py,sha256=857i73B2tRYERaMgTSbiCvaCK3y1hTcPqSTIs_o8ATE,23210 +pandas/_config/dates.py,sha256=u2rl3gkqVBRFadm4GcgijNpZjtxFm88EDH3sopO-3fs,632 +pandas/_config/display.py,sha256=pNyitveK3d48P5VUUk-vrlX2_qJ95ZJ7Fq3XKpULhYo,1748 +pandas/_config/localization.py,sha256=wIMwTI0UACpn3sUpF2Yvkphto93YCDW4tJv-vprb6xk,4981 +pandas/_libs/__init__.py,sha256=-WcumQLU3L4R4gQ0UsldabvJX39k0v4ZMeRMzRtwXPA,139 +pandas/_libs/algos.cp37-win_amd64.pyd,sha256=WEnmhlNpEzs_ZJSTEOVhyclhu62RPxPLjfettw46VrQ,1298432 +pandas/_libs/groupby.cp37-win_amd64.pyd,sha256=DRPv_PK7orlaguzBS27DG0Cfv11aVocjXrJJvwHxOpU,950784 +pandas/_libs/hashing.cp37-win_amd64.pyd,sha256=WGGrC_MzqUNqHxAqu0Pl6SbJuWE92TRsc9u4O10Nzvc,161792 +pandas/_libs/hashtable.cp37-win_amd64.pyd,sha256=ttd2K0vvQdCuI6ubv-yCK3tT2i5Sb36aB-_DrWGzHNU,534528 +pandas/_libs/index.cp37-win_amd64.pyd,sha256=0CnSLebnD_msZKZb4rmTyP27k-AFag9TZcLqBfd8978,530432 +pandas/_libs/indexing.cp37-win_amd64.pyd,sha256=VNlmpIytaHjRG4DAZ3jrIyZSa3zeOUUYtV6G-OQ0kB8,45568 +pandas/_libs/internals.cp37-win_amd64.pyd,sha256=zIOtgquSkGncEIUlmumR1k9d1Dr8jL5GTpTf1D7YFGQ,223232 +pandas/_libs/interval.cp37-win_amd64.pyd,sha256=Z632oIFsYapjOKx1Ex64Xe9GuVF7XxDDroJb4dehMRw,1167872 +pandas/_libs/join.cp37-win_amd64.pyd,sha256=7Y4PrXENELxo6T46cpfGD8DkOcb-MqxQaGcll8gQ5vQ,1988608 +pandas/_libs/json.cp37-win_amd64.pyd,sha256=YziU_36dbVvTO7cTTbphRJo4sqtxVeGA1O-QSV2FKzg,66048 +pandas/_libs/lib.cp37-win_amd64.pyd,sha256=jxaoCt2ffZ7VYy8abyeElrtZsmHwqo1nfLKPi0QcBmw,465920 +pandas/_libs/missing.cp37-win_amd64.pyd,sha256=IIZ8h_dRgqklVyyXe87pKtGkT78_XlLrVUtPYHmDppQ,164864 +pandas/_libs/ops.cp37-win_amd64.pyd,sha256=5qR4UkuD_sQrrELZu0zT32OAE-SO2kOyILzfQTa5c4o,193536 +pandas/_libs/ops_dispatch.cp37-win_amd64.pyd,sha256=Ei6soeXY-OuwugZlO6HFlgMR4irjavZNYV4jXeolth4,52224 +pandas/_libs/parsers.cp37-win_amd64.pyd,sha256=Tz1YOsWA_gntZO5vyOHUUFHKkxBmxKj-pwEmuVPHj7M,442880 +pandas/_libs/properties.cp37-win_amd64.pyd,sha256=juQidUv3NrmwSWeTNkJ1gnQxMqGNaRosR-8RnIIMGws,61440 +pandas/_libs/reduction.cp37-win_amd64.pyd,sha256=HTgCTxfG4Hb-Z0s-sym6zMRQQwuFTIy85CH-dTEHO44,302080 +pandas/_libs/reshape.cp37-win_amd64.pyd,sha256=AWL09kDkuI99DdfIxFQNT2Cu-DmLnDobL8zMAQkYcRA,236544 +pandas/_libs/sparse.cp37-win_amd64.pyd,sha256=Fkbw_QPtZdv3cd0VH1pBC6KeYV70iMgsBuKNyoWWsYM,845824 +pandas/_libs/testing.cp37-win_amd64.pyd,sha256=5zDNzlfuue_wkQ3wQT1DopBUb5I7VVwKIdoF-nnin94,70144 +pandas/_libs/tslib.cp37-win_amd64.pyd,sha256=XqW3wWB2Amhv1lLln_q95wT5jJLfEz3YAxNS2g8c1D0,258560 +pandas/_libs/writers.cp37-win_amd64.pyd,sha256=dPCkKkxWr5S7UwwbBRBK7-JVXz1DL_EVn9WIdCqlpY4,189440 +pandas/_libs/tslibs/__init__.py,sha256=gMmsMXjR3j11eZyn9oNDDHnXq3iIKjJXAwrQbjn2wrA,495 
+pandas/_libs/tslibs/c_timestamp.cp37-win_amd64.pyd,sha256=JtkGSJCJ4dPxdpw2Yxnk8jPYqYbq9S8DzxvcVAfjK4k,220160 +pandas/_libs/tslibs/ccalendar.cp37-win_amd64.pyd,sha256=jvmr3TVOVde9ONppt9LXA-vUd28bJas0xlHlqjRy9hc,57856 +pandas/_libs/tslibs/conversion.cp37-win_amd64.pyd,sha256=JEhGlihNhwV7-d3n3HhaBvcbXDFI_0PV4QWFLCl-hp4,242688 +pandas/_libs/tslibs/fields.cp37-win_amd64.pyd,sha256=ne3wx7dkQR-PdGe0cypELPjOmdVvdJJtPc2vjopTEQY,225280 +pandas/_libs/tslibs/frequencies.cp37-win_amd64.pyd,sha256=9rMZRtL8rm5JPZRYDH75krNa_rnG3cnG5RKLu9ysq_k,116224 +pandas/_libs/tslibs/nattype.cp37-win_amd64.pyd,sha256=-WV5wXacn5_LD5PsjkCm6D793STkDZksEowusTFP8Ow,184832 +pandas/_libs/tslibs/np_datetime.cp37-win_amd64.pyd,sha256=OthuQ-KPN-h-YiFcmZmP8siH3VraN1R6fvBJfnSjVbc,47616 +pandas/_libs/tslibs/offsets.cp37-win_amd64.pyd,sha256=4Gd4JYCdRfEKAlgkKv_fu0fl0G0IMMCQCw6d807Gy4w,351744 +pandas/_libs/tslibs/parsing.cp37-win_amd64.pyd,sha256=_HmSMIrbs7lBCXIeNoiVV2h5NMw47QUFwtHdVPZSOG4,331776 +pandas/_libs/tslibs/period.cp37-win_amd64.pyd,sha256=_qitMKtRT6XK6JMb3cSMX9-TGjVvCXEcXRkncigOMSw,372736 +pandas/_libs/tslibs/resolution.cp37-win_amd64.pyd,sha256=RkzMEYQ9yIZL43xLIp3Kg5aiJwwdj8ZJ3h7cr0bbFnU,218112 +pandas/_libs/tslibs/strptime.cp37-win_amd64.pyd,sha256=U7qIm_eDHTo3CS3jWmPi5s2YvmVu_PFzvMTMTVobUN4,323072 +pandas/_libs/tslibs/timedeltas.cp37-win_amd64.pyd,sha256=RLlFp2hVF6KcuTRmn0-vgFz-6vwn7531zvtwr-CGKMw,390656 +pandas/_libs/tslibs/timestamps.cp37-win_amd64.pyd,sha256=dUQGr1peVuFAkFxjqf6R1RM1l3nFSfe_vKzAJLUfGq0,250880 +pandas/_libs/tslibs/timezones.cp37-win_amd64.pyd,sha256=LSnrWutaHgUnc30M0Rzx0r5j5hc4c81emXPoF3BvN8Y,198144 +pandas/_libs/tslibs/tzconversion.cp37-win_amd64.pyd,sha256=tKjpPs8-WoJRYNhmwsEHzBdhZ_zn9qHg3rWdv-_zwHo,233472 +pandas/_libs/window/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/_libs/window/aggregations.cp37-win_amd64.pyd,sha256=ZSDInmMHUF_Q8Q0ifssFVe6wsuCWC2aky6xeuSt--zY,347136 +pandas/_libs/window/concrt140.dll,sha256=1AS0nCXMdtxMhuHYL8I3mUgvZQnoWnPtgXfvwyDsAZU,332568 +pandas/_libs/window/indexers.cp37-win_amd64.pyd,sha256=7Q-EFaY15M8B3ipES-yxnfYYeyW_k9Luo4b-KxCPRVw,152064 +pandas/_libs/window/msvcp140.dll,sha256=mePiXNpAQoP72Wslt2g6jSE-eVRnSt76InkSOo0HAf0,627992 +pandas/api/__init__.py,sha256=o_GBSB3OnMEmoldMWjubsJotWfgTl0v5gD-X3CXG_LY,86 +pandas/api/extensions/__init__.py,sha256=BFZv3cvva0zkcJJo2AlOGh9EO6mpcdFH7GVLTIQSPxk,661 +pandas/api/indexers/__init__.py,sha256=XVgjgDEjaeo1UaWdmy1andJeBBJRmlfBbLea6kDUX1M,204 +pandas/api/types/__init__.py,sha256=XO85YJNx9YdyBS9Ya5bpoWhNBqLJNIr-XORZZXT80oc,453 +pandas/arrays/__init__.py,sha256=Xz1E6LHWMuZ0cSyxlwbTLrdes4dPRdlrDPLgoADRYmQ,516 +pandas/compat/__init__.py,sha256=ZbJIKjgmyci9nyCtdLg6q64VQRfNLaopke7-ZmCpcWg,3035 +pandas/compat/_optional.py,sha256=3igVm00iTdHKMTLgFPmDznEG7HdjofgyeqzbY_6nMQM,3451 +pandas/compat/chainmap.py,sha256=19hloy2z6IpIgzWBZqxqOrneCH_dHgtvF3ybn3Nsnro,991 +pandas/compat/pickle_compat.py,sha256=FXXuBn1kV1Pky4rMfK1YaK72QEuO82lD8wNP8592di0,7007 +pandas/compat/numpy/__init__.py,sha256=LOjbtSK23Fo3ApsoS5_ROXhWP-CZF40Qt-HbETlzfeQ,2022 +pandas/compat/numpy/function.py,sha256=_DR0PywtLd03wemPXCS6FtSWqKe0sY7EP88Y4rqZji0,13084 +pandas/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/accessor.py,sha256=3sze-Ji6UivRxr3N7C7aB0Rrf59Awld1NX5lZ0siHe8,8467 +pandas/core/algorithms.py,sha256=H5ia2Ko4x_D4GOaASp80CmlZFK3O05jeJxLAC3Lekqo,65357 +pandas/core/api.py,sha256=3DUJbSkWY-I45C4ugFua3aj0zK33__DI7tJ8yKEK9KM,1727 +pandas/core/apply.py,sha256=J3dT9Ze_VZHXtN-pqmTBLidaw-9SrhsZsAH0iy2-IS4,13775 
+pandas/core/base.py,sha256=NveWqfQSdrkMD-7isCYlzcHNbdq9hIqJj7IX5Im5Zgc,46722 +pandas/core/common.py,sha256=18AHnDewjuGmuzHER664Uc3WrpRa_dyrPrVKD3GvEFw,12386 +pandas/core/config_init.py,sha256=06WAwJ5hxiRpFVbw_qA2oZKJbPtIIU70p_zUf8-FtVM,18442 +pandas/core/construction.py,sha256=qFlVnSpUoN5koXeuEG0_tQZRGSg_QAFlKaB735UftoM,20780 +pandas/core/frame.py,sha256=cxCbgaOXp5jcATdwTweM-OYu-lH3VybcJnn6Gu3Ibw8,293017 +pandas/core/generic.py,sha256=m3SOVSocu5rzbWZ8aSSM4iu-G9e7zlfa3azDKQcQJf8,375347 +pandas/core/index.py,sha256=S5UT4hZ99vuQgfz3G2yLLAE5MYCGmcVRKraBI76ohdQ,682 +pandas/core/indexers.py,sha256=ewWeIJUS9G43O9EGtOBdgvhBvrlCtNPD5HUw3nfAB_g,12202 +pandas/core/indexing.py,sha256=hGj-O-nLGELK4Pk68gk3yVnMuWHZzB0DnpkCkKAAID8,83115 +pandas/core/missing.py,sha256=nsn_eoinCteSIbhY9_tYsed5skWpQNCbigQL5VDZdQc,20105 +pandas/core/nanops.py,sha256=9KxvSYjv3q3kl5qcQBg84xeRS7sPzj1MF54AbkKadnM,41013 +pandas/core/resample.py,sha256=9tIZSwgdzoKBrvNoFs2kb6nmRB6L4zcn9RQUFleBOAc,55927 +pandas/core/series.py,sha256=GfssYwjOqFIqxDbOKOFIwAj1QObf2WsFZsRU0QTMxSE,142138 +pandas/core/sorting.py,sha256=-MdCfA1VkCM9Vjq6Yc_f8nXikW4z8LtkMMsdbxdTLC4,12790 +pandas/core/strings.py,sha256=x6ijJ7ZGpwsFq8lbwhaJJIOJ7U6cgkXfvcGBshgU4g8,107234 +pandas/core/arrays/__init__.py,sha256=RLUnDm5OunEHVHXMz-vEk3z5CSg1V2LQ9-k4E_DqG60,1074 +pandas/core/arrays/_arrow_utils.py,sha256=yLNwMKIJd483wI7re03TEdPhNWPffNPk-5oqGANBK2M,4174 +pandas/core/arrays/_ranges.py,sha256=uSxRjuwPc52lOEdU3p2mlpTPqB_c0JoKQpjiX0oxawI,6647 +pandas/core/arrays/base.py,sha256=ajwa0PxCR_wKR57QdRgXejE0MOl66wmeN97IgDEwwzo,42189 +pandas/core/arrays/boolean.py,sha256=fTVVQqSmy8NZDSXhE67lSQHG5Una8coPoylnrPjf59E,25095 +pandas/core/arrays/categorical.py,sha256=KzQ2OALDDkhu7srkWYzFD6NRszXg6lRp_gOVG_axM24,88409 +pandas/core/arrays/datetimelike.py,sha256=EqvkKab3Yx9F_4VfXrtJdUZ1_gEla33uj0Udmdj-kaY,55506 +pandas/core/arrays/datetimes.py,sha256=yOVk9JJyS1NP6J2WacYzjJdH2-FVl1lpajE0r2M7pWQ,70075 +pandas/core/arrays/integer.py,sha256=4hFEYDPyzhn9phYtfqA4u4LtGM5MqhZSNda0iaAiTyg,23465 +pandas/core/arrays/interval.py,sha256=lwoHTMWoY6ux0IS7qWDexelIGzoXHC9rthvhJRWwirU,43813 +pandas/core/arrays/masked.py,sha256=vRPFFgu8OJ6tN8zZSISh2PDUo1w4SxNYDomNrzmdxQA,7871 +pandas/core/arrays/numpy_.py,sha256=ZouEZ41jEwulZ_ph_37osP3u5tJwMsguBnwbWeJMyhA,14855 +pandas/core/arrays/period.py,sha256=8C6iVpXu3G6sWuay9obPrwtkROClaI2kTCa7iV-tX9w,31646 +pandas/core/arrays/string_.py,sha256=k1IOhRHqMWcmLG5rWaBv7XjfNtluNLwWMscL1AdqLDI,9708 +pandas/core/arrays/timedeltas.py,sha256=VEB7BWXmmHA7EDEykjGH-gljEvCtUFIZMKtJSpHauew,35611 +pandas/core/arrays/sparse/__init__.py,sha256=imJmbLnORYINVprWee-rDohsNU9Z-aFo_J4joLdCRPg,273 +pandas/core/arrays/sparse/accessor.py,sha256=0vnHX-bX4OoLkyqsIgSN5HWibF-nzeBXaTlXM1nd7Fc,10492 +pandas/core/arrays/sparse/array.py,sha256=bKfk8xalhovBaHewpMCYJx4-mvYBRw3d11457so0PFw,49799 +pandas/core/arrays/sparse/dtype.py,sha256=BMG4lGSKI6jei0z3vfMpHbjm6fY8BkfbuMm4n_dHKf0,10439 +pandas/core/arrays/sparse/scipy_sparse.py,sha256=LwY8Cswp2oACJyfFaxLv_SuEOzcDkw6HlwmbeVqtt_A,5340 +pandas/core/computation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/computation/align.py,sha256=rDv4Z7vv3dKejcULZ8xtFSjetxf3wSgYHj5FVIwnivw,5854 +pandas/core/computation/api.py,sha256=Q_hRn-f6r45ph3AJqKmXlodzOufxNc9masH1q-DbSjE,62 +pandas/core/computation/check.py,sha256=cjl9Lick6gKjnft8YlgUAZf_W6T_Q8Rf7iFBBWfoiaw,335 +pandas/core/computation/common.py,sha256=N0YWmWY01rCk36dHaZQNxbOdPmBJouy1-DBHVk39pIk,682 
+pandas/core/computation/engines.py,sha256=V0OAVtvlrcwVLCSeacUFodVp33Z-I0SIJ1V6a9xsEl8,3185 +pandas/core/computation/eval.py,sha256=RpcMlAueGR32fCvSMsqOGRevyWqAWTJkRBkSVHWq8IY,12636 +pandas/core/computation/expr.py,sha256=-YQx-F0OBbI7WJTlXsmQBMS7fNxn6dtI1QwX3fh-rns,23637 +pandas/core/computation/expressions.py,sha256=b4R3c1rRav_B4UA8k_TniUUJxjzc8VAyLcTBagLDrxs,6487 +pandas/core/computation/ops.py,sha256=GY6p6m6be5KYd3nLfVdhisb96byer8wwk-sTw960H_0,15844 +pandas/core/computation/parsing.py,sha256=A1I9CqGbLJBMFM2AdjP0h05gbRQYp-OhIgOem8jmLIU,6292 +pandas/core/computation/pytables.py,sha256=qhNiBYA1EX1Kcrwqegt4ZQMughqqu4RtUwwhZ4f_y3g,19133 +pandas/core/computation/scope.py,sha256=mK3zoJbiyvWDiEsZ2eCFmy6pJO4OExGqesG5DI2kr6o,9091 +pandas/core/dtypes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/dtypes/api.py,sha256=n2yY_gwcSayI-sfINVeGw2y3eJu3Rk2BDzs6Q_0YtAo,911 +pandas/core/dtypes/base.py,sha256=kXHpUo0K3WucIRwX6_Cp-_jDhm7Ef9vEryasPOESAFc,9594 +pandas/core/dtypes/cast.py,sha256=8sxCoRSAs0YLdaklynyDuxf8WgaobOOHU64Mgs8gUaw,50359 +pandas/core/dtypes/common.py,sha256=anr6xUC4JOCpc-jem_KJPIkN4YUQjKFjpBBm5P-7hck,48078 +pandas/core/dtypes/concat.py,sha256=RMz1W4gGn9P1-esZeGC5kEtrqrCb9XaQbhYE8ObBFKM,15266 +pandas/core/dtypes/dtypes.py,sha256=1Dhve6RdTTt-z6gVse-_8gQ58bbuUW7GY4BYMEBgWXI,36576 +pandas/core/dtypes/generic.py,sha256=4d71IREotNwC9XskcnyOR2mH2UwxOTUi-ojC5VN7r-A,3113 +pandas/core/dtypes/inference.py,sha256=i5yPa9mCgTwnDVA4p0hfzu2ypC7XUiyKiS2zhrghFwg,8401 +pandas/core/dtypes/missing.py,sha256=KnvpdrXEBk6hcuVdd_u08rSFI3er3l5g6_LeWl39m04,16561 +pandas/core/groupby/__init__.py,sha256=xZmPcq3GTA59FrudjUQ2e03mvF4jdow1KqIJQ57dXOQ,284 +pandas/core/groupby/base.py,sha256=7JTsZNjRWuOm5oF6lZ1WlnUHTCjl5fcGvnZwfqW8Cnw,4658 +pandas/core/groupby/categorical.py,sha256=fI5yMM9qErxv8Sg1Tg82j-040IKjlEvGoaG41-WbEdA,3079 +pandas/core/groupby/generic.py,sha256=OAwwM0FrDpcHnecbxioDOt7skk6JYsQ_SGb1HUI5qEU,68920 +pandas/core/groupby/groupby.py,sha256=_72Lfprv4E5URzTmPEwJ7S_J1KsMbBwCgsJd0zbOQrw,79769 +pandas/core/groupby/grouper.py,sha256=5db9FPXffSyOuYy9qypLkaJC6KN2XEKikLdPSOnhRV4,22093 +pandas/core/groupby/ops.py,sha256=Y7huBBKPwt4LhDPq50BF3ghvR-GkwsJnOonday-hpck,31348 +pandas/core/indexes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/indexes/accessors.py,sha256=t4IInmZVc3Ql73RNA785TNzUDEpHnScCzRGOQXwSzAY,10437 +pandas/core/indexes/api.py,sha256=xdeaRoBqXkZOAqj4ofSfaLu0hezS565BJyfYHK-zqm0,7838 +pandas/core/indexes/base.py,sha256=q5uPpyNYysRWi2F3uhEjntdFyibf-2Oxuqztabda3rU,180714 +pandas/core/indexes/category.py,sha256=kHMKKt1t5Rz_52EXsbLmxjGKtSX31G-ZtI-oc9osdMc,29633 +pandas/core/indexes/datetimelike.py,sha256=7J7Xe_QUlYmuCPO8KKeZBl99blWlUbKOz0e1iZyRDpE,29556 +pandas/core/indexes/datetimes.py,sha256=yJPfcUK5xRY2L87GlnhlNsakf_hm_lxYxWEn7owJ-dk,43845 +pandas/core/indexes/extension.py,sha256=gt0RD8GWcqfnlWS3Rmv9uxkkLYVUECm6VvfdW9VG250,8700 +pandas/core/indexes/frozen.py,sha256=y_iv6fx55WvUAAy3h1lfR9_xMRVAJbIYCMw1NDUP1Js,2938 +pandas/core/indexes/interval.py,sha256=_9kHGO1noBN_rXQA_Qncuswurgd-ZltZn8INetHBZ90,45752 +pandas/core/indexes/multi.py,sha256=np4pFHYB7oU1yFRcQ5clUAEpFNEo2ezfp60MSUTwfO4,117402 +pandas/core/indexes/numeric.py,sha256=nUzuSAskEEKJVoB-Dke8WwZIY1HeLmEAQl8UrBnBWWM,16911 +pandas/core/indexes/period.py,sha256=WRJD8Tq3DyIxqEAPdfFI08Z3_Y4lL3DW_EXebdIylQQ,31331 +pandas/core/indexes/range.py,sha256=aLuRQXCtp8rKT-RYxAG2v4gvPu_5ReMA-EE-BpWEmPE,27208 +pandas/core/indexes/timedeltas.py,sha256=-hCnzyVnD28uzzv0Ipyk52_VtOgF7fxFYK6I0GooWJw,17199 
+pandas/core/internals/__init__.py,sha256=kfuHnp0Ir6DnIPEHlcO6ye-5xraohpuxB9AbCRCXU5Q,1010 +pandas/core/internals/blocks.py,sha256=roUvBrMQVUEbLZx5LgTmWbN0SCwq_JVPGGdnctpg_BM,102621 +pandas/core/internals/concat.py,sha256=lHSxE9tKbx-ovw7RZL65zwUiupNd6kQbjluKsBdYkGs,16629 +pandas/core/internals/construction.py,sha256=u01hpON8uG8EAxnYsKAaLJVtKSRlFgvHvrakGPQnomg,19756 +pandas/core/internals/managers.py,sha256=2ZtERY6H3lFjUbSuJP2u_MMctHgesCw05PlZaSRi4T8,63298 +pandas/core/ops/__init__.py,sha256=qbF-uVfu1cuMrlw8Dk_VGf4EN6fk1WmQtDhRQY-yDw4,23106 +pandas/core/ops/array_ops.py,sha256=fsSPmlj55rGy0iHYluzU79gGFscqVYHt3L7HEK8zm5U,11988 +pandas/core/ops/common.py,sha256=ivTd8OV4EniqKCGSvx-wD2UGAt6iqlZjXOIoz9x0U2U,1569 +pandas/core/ops/dispatch.py,sha256=qxDEvYWA3NTgBbwXAyXRGYBLqg7PNwWwRGOSSTmNPmk,3362 +pandas/core/ops/docstrings.py,sha256=Dcw6ZHT62lnFzLJY_3F-7gp5GdkwBb3pheEuabqIDwg,16200 +pandas/core/ops/invalid.py,sha256=sUm_2E_QJhW9kiDWlvcDQjM2U8OdUgPEj2Dv2iQE8aM,1285 +pandas/core/ops/mask_ops.py,sha256=WqezxdvCmKEXHIWihblKYsKyGWjqx1YcCb9vkPTNlWM,4935 +pandas/core/ops/methods.py,sha256=LAwE8dTNoRgaP9b0uh1Uc72D2sqIBxjeuQL6ExB09-I,7754 +pandas/core/ops/missing.py,sha256=7HRLQNuA4ix9vR4bOkDqIcn1WL5BiQG0BO8AsLORxc4,5270 +pandas/core/ops/roperator.py,sha256=F8ULcax62DJD-0JEax2vBsIGf5vBLvuKhDjQR244AUc,1080 +pandas/core/reshape/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/reshape/api.py,sha256=pRwwOuS6LNFLQOEFVrVKgvJk2ZrjIPf14ZHmnrN_7vA,365 +pandas/core/reshape/concat.py,sha256=xlL-DptPjxlF6bOlMF7pfbuus9fT4A-IeWTZdwStgwY,22081 +pandas/core/reshape/melt.py,sha256=wknzTd0c8IfPYt397Pu7-thgH6MWJbHb1tDDmyHWJRM,15913 +pandas/core/reshape/merge.py,sha256=otULgCHGLXJDCndGvd-91t9YY5S-t4K0r0FY1F4u-nA,70735 +pandas/core/reshape/pivot.py,sha256=cwX6qWQ-ULOwj5c4Qvmqd0FSRx7Za16wfzHSbin85V8,22222 +pandas/core/reshape/reshape.py,sha256=pXc0KumZGMGrXhiAQdOehNpZ8AMg5fwTI1XsAdM7bpw,35445 +pandas/core/reshape/tile.py,sha256=3MTMqenXViEf1TXQ4AU0L2VxcJCfZEpJA_ZcNhGVy_Y,19572 +pandas/core/reshape/util.py,sha256=AcIjnrFDwbx1lt73GwEz94kwk1-APaKOeoRP6jYtcQc,1351 +pandas/core/sparse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/sparse/api.py,sha256=g4O6jA1PpcOWOFCAeR8SELO2CGaZVKjIAN6sqvA-KUw,105 +pandas/core/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/tools/datetimes.py,sha256=C9CT5eKe7q5MiIQ8-2OIj3HnQijMvUQyOLxwWLDRYLQ,33894 +pandas/core/tools/numeric.py,sha256=8kNkql5v3D8Y5Jq-sNIlfMWibGjmu6aBizR_987FaKY,6592 +pandas/core/tools/timedeltas.py,sha256=1xnl2K4SFtYDM0xG7jvN5txaDrCI1OPrF07dse-IyZw,5594 +pandas/core/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/core/util/hashing.py,sha256=7hE-U1J9oiyF_pE0Ay4hrRKap-X53srMloNhb7BMbVg,10613 +pandas/core/window/__init__.py,sha256=WnVXAxFWr3dNlCLXuyrjLyN4UzoQoDzkrsYYcPqTUt0,218 +pandas/core/window/common.py,sha256=4JLQOmr-i4Fnle9g9m-EGLeNjvijwCYxTaLbtfEQzL8,10781 +pandas/core/window/ewm.py,sha256=HtewNhJ4-6uuTghiPe-kCJrnyQutY8i54LMde8vIAe8,12796 +pandas/core/window/expanding.py,sha256=Daq0y9fBKsMPKQDosQxH7GMrkDkWrJrmI4Ng5Q0gCh4,7373 +pandas/core/window/indexers.py,sha256=6TiOxqDKpvQkJGK3znb8KsARATljlGaAIqTN11qSj9U,3689 +pandas/core/window/numba_.py,sha256=ocU96x-j-v6qBvfZ-Nz49vM4-vNw9M540en-GrPgF3Q,3597 +pandas/core/window/rolling.py,sha256=NFpt-1Iy5OFZCAdbXiCo4RbsQsVdiiN3DLIMRBV7llc,63624 +pandas/errors/__init__.py,sha256=-xymwYAipBXnz_PvLGO50rU1etP5gQhnvc2SL9eZUCg,5521 +pandas/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 
+pandas/io/api.py,sha256=pvcFoUIpJ3vb-eyIKxX6YyfxyZGrFT4Jrkyj7TSneDk,726 +pandas/io/clipboards.py,sha256=8kQPA49ub6sHinfEkOBdiEHJAykUW-tUoVuCk7kOU8c,4352 +pandas/io/common.py,sha256=UMV5p340Nu7VLry1B2zvzq1PKZQR97fWA5-K5LUiBBc,15838 +pandas/io/date_converters.py,sha256=P4gyzxRNsQtHPGEtose11Tnliws_w4uNFH5hpdXT8U4,1759 +pandas/io/feather_format.py,sha256=5aOtCyvejuFX3QiDtthaXr8Gh_ps7BCJnf_LeMERGk8,2989 +pandas/io/gbq.py,sha256=c0qmuvPQvMqcrH924_GY-9Fnv2StE0b128NtGBNJ2m8,8032 +pandas/io/gcs.py,sha256=Pm3eShHN8RYYTIFgjzgRNNoaEU-C_C_DEyljOAUnkiA,511 +pandas/io/html.py,sha256=0vzA8dJrdERgkU-P0sumoU2QU7qETihpMXM6ShaE_7k,33991 +pandas/io/orc.py,sha256=pTncOdk5jMjZwf6wA3IqwNs2CJ64jnWONXzDvBQ06R0,1677 +pandas/io/parquet.py,sha256=6mccQ8NC5cBG_NS1aLb2dzfOvZmsUB7vsi-Mg1ZyPm0,10008 +pandas/io/parsers.py,sha256=F1oX_aeE5TvHok7bbNTQBysbQIdnPviVVPpIo0PrgeI,126677 +pandas/io/pickle.py,sha256=2QBLrhqAnbdGY4XjRheycwDT42tHOYx3MVNeh0RO2Kk,6356 +pandas/io/pytables.py,sha256=PM1NKWGvaHaGL9L_b92TnXerPI9Fj0lCkHRhTGPpAl4,162475 +pandas/io/s3.py,sha256=cCGfMND5ER_WXSyFQ5pRzwC88jdGfneY6li4zSqzBIw,1665 +pandas/io/spss.py,sha256=j5yOQRMUNyL_8-7_Gp8iZuom76cIay2-uo6FORABxuA,1197 +pandas/io/sql.py,sha256=kLTqiO9kD7CIvOURnQB6Xiqdj7JE0lkT5xu3Cmtfxio,60490 +pandas/io/stata.py,sha256=IVN5_PBM3iUsGSewaehCV6FEIYJyQOWIMFxHHhsig2M,115398 +pandas/io/clipboard/__init__.py,sha256=yqHy18xTnkh0EDR_teUdwyvW1VSVyG-1HpR1xNzEvew,21552 +pandas/io/excel/__init__.py,sha256=RZTHCBmsMg-3htlzVQemOboT1kqbTomu9HhKtUHnDlU,422 +pandas/io/excel/_base.py,sha256=e4JC4dYGWzKXxg7uzVfC829goWWhzDjW8vNDDQ0R4rY,30329 +pandas/io/excel/_odfreader.py,sha256=cIq1cPZak_5m0cHmfgBC15ESX0MIpH-2FemyvWPw3ac,5959 +pandas/io/excel/_openpyxl.py,sha256=lThGHGNw1nwPe2iDn3zbO6gI4NtW6jmKWfVYEW_GMbo,16106 +pandas/io/excel/_pyxlsb.py,sha256=IBu7lA9RoQJpzEfgWB9hSKyi65aRIwhHDzwdqz44UXk,2324 +pandas/io/excel/_util.py,sha256=IASr_0wX4ECv4b-OqDji7fNoN_rz5o8s6IUKDrfMPnY,5507 +pandas/io/excel/_xlrd.py,sha256=S2bWcCYTY16l3GuS2q0loN52DIQ5KWaZz-EhArkAAXA,3368 +pandas/io/excel/_xlsxwriter.py,sha256=tU6bExazgN-qUeXjWDkD4onwdIYJDxPpYLrHDT6PG-c,7827 +pandas/io/excel/_xlwt.py,sha256=2UlSsvERZHW-dwDh-oNSrOh3ycNeLBlNOpamdGZT6a8,4415 +pandas/io/formats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/io/formats/console.py,sha256=-dCS1zHBN2D-EgjyMvqF8gCbuEpPPgfsR09GL1tf7Mk,2523 +pandas/io/formats/css.py,sha256=QJYTMG7D0vX11xU2LVuuaA30ugfb6_RnAQ3T-je0l78,7946 +pandas/io/formats/csvs.py,sha256=gNfyFE2VztuMTvI3R1q5DVrW9OnVopUjHzpwh85uEyE,11501 +pandas/io/formats/excel.py,sha256=6bf4CAV6z16sTMW602NcF2bP3OgAyWnGzfHbUPmQKuk,25530 +pandas/io/formats/format.py,sha256=ABghazqF0pdfHTR7W1btviEwB3k-aFLRLPz9Jf176mE,64452 +pandas/io/formats/html.py,sha256=HVi8hI4wwZ_dMmGKTPmLHon_FeToA9PA6n6yUxan0tM,23264 +pandas/io/formats/latex.py,sha256=2eWvUY0nN-XtWepRWfwqWhuD607g6T_UyUhuhUhILVg,13237 +pandas/io/formats/printing.py,sha256=KBwINnMEAJLW2ai-mjf19L3Ij7NcrgLLhxbFFnu_B8A,16798 +pandas/io/formats/style.py,sha256=hKHTe_e-hrkLwTVaJ-ypaLeBp6rva5jGIsQlxwW0q58,49205 +pandas/io/formats/templates/html.tpl,sha256=mCgH4jUkz-tgpB3rKqzIlCMb48cg7ffkrahQkyC4uIg,2040 +pandas/io/json/__init__.py,sha256=D1g5ROWGdB9OhvKZi6-mYt2QIllqc7Te_xXbesD-2Oc,340 +pandas/io/json/_json.py,sha256=K6mXeygXevOJa59MfjYk9_C9v0QUl-1Rygaa28VN-0o,36111 +pandas/io/json/_normalize.py,sha256=C9tS_Omle08JWC4M-UqafRWE8eXoSXNZC6l3HUfcxfU,12100 +pandas/io/json/_table_schema.py,sha256=1oRgCRR5cGsaDXbhWXwjXbiceT4ChL6Erg4zDbQ1faY,10255 +pandas/io/sas/__init__.py,sha256=l74xCbedVGG8EzuApVbpGlttu5KFgxVGQFdvevtAkQw,53 
+pandas/io/sas/_sas.cp37-win_amd64.pyd,sha256=SiS7q8-_8ZBHWMDLcyqBMuTzjtYT53GOGyTOBgEwW-Y,191488 +pandas/io/sas/sas7bdat.py,sha256=LQFxzWW_ORP8JcU1V14lhz9c-3cHSywopK02Gm09J58,27549 +pandas/io/sas/sas_constants.py,sha256=1osy4oIK4siNYqILPpHOmPqrDFhpZL8c06ywvGFEtmk,6731 +pandas/io/sas/sas_xport.py,sha256=tFDec43xy4E05mANks4arGoRYqTrzUrSlTfFU4-z7I0,14771 +pandas/io/sas/sasreader.py,sha256=An1bze35EDabGg7W8-qZHLmjPdWs6MUfXlL-ZwlGAb4,2866 +pandas/plotting/__init__.py,sha256=W_2wP9v02mNCK4lV5ekG1iJHYSF8dD1NbByJiNq3g8I,2826 +pandas/plotting/_core.py,sha256=lKtCfuzGP_xINtYlK0FKfzPd_mewicJAhF4hLtoRLx4,57099 +pandas/plotting/_misc.py,sha256=7DDMZfQHRjR9hB4heitzP1sfk0XUNWdBunNL82eAGr0,14465 +pandas/plotting/_matplotlib/__init__.py,sha256=G2FDXYuth9N0cIkupZAOQjffXmTmH-iuhYl1AFKjcGs,2002 +pandas/plotting/_matplotlib/boxplot.py,sha256=XTbR4jch-TvI9i9DnYhZ6uWnltcDw1DgM1HPQlg_xhQ,13036 +pandas/plotting/_matplotlib/compat.py,sha256=exJphwoAQCiqypLO1-M3jCrBCkbMyzPGbGPJUCiHIu8,554 +pandas/plotting/_matplotlib/converter.py,sha256=TTogksQ7sOtB8mZ9LyROzOt-AOwF0UJ95oHGb8DfAzQ,36895 +pandas/plotting/_matplotlib/core.py,sha256=lEUA_G7utZesCqFx5EBi68Dc49mRrOSPryEQrGNmBZM,49468 +pandas/plotting/_matplotlib/hist.py,sha256=ZByAHAfyuexLjan4MXSCug6MRIw6CucooOFvJLrZ_eM,10575 +pandas/plotting/_matplotlib/misc.py,sha256=-D4rNeN0JFtrDEeNw5P3xcRrPWWc2ZtR_8Tj0qtGlwA,12259 +pandas/plotting/_matplotlib/style.py,sha256=91exAF4XF38DO-yNU4Vaeze2bKkJkOtC5igR5H5OIyQ,3378 +pandas/plotting/_matplotlib/timeseries.py,sha256=mKA1c07uQENoZpbSnmVqljD-nWt2SbLAQPSpj0c0nM4,9541 +pandas/plotting/_matplotlib/tools.py,sha256=fx_8ddrvfNx87rWm05kHlI_3yDH1DTgsZBCTPzgw3hc,11914 +pandas/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/test_algos.py,sha256=dFuP9cgVqDyWyReoLCDwJVWxR8_SY8apty9sGS4iPk0,78032 +pandas/tests/test_common.py,sha256=MA2OlCU6LJ9yJwgps-XDoCqQBG_gPuO_2zWVch8Gek8,3517 +pandas/tests/test_compat.py,sha256=p--4vgr2SwtoNidbkkK1Zu4M-u9-0lMz2AsZsxGfKP0,60 +pandas/tests/test_downstream.py,sha256=zEG4djbvjMXuXE6W5jQqPaxQLsp9GI4hGTW0OU4oyas,4649 +pandas/tests/test_errors.py,sha256=rT-WOmv-ld8_V90iH_bCXnVlHrCh8fORpbplFATqYu8,1523 +pandas/tests/test_expressions.py,sha256=JwTrPIe5rv-t0OHiATT9IoOH9JAXY4_vwjIvN8fK_6A,12904 +pandas/tests/test_join.py,sha256=YaG3S3-qD5erhiWKYMoksRi9-POpCQEws4Qh2xGzxeY,9296 +pandas/tests/test_lib.py,sha256=XoE1va0rFr50MnKRCAbuUPlgB6gSPZZl2_scDLqCFR4,7543 +pandas/tests/test_multilevel.py,sha256=na845-Ix9_ai5Dez6O1GceCW2eMQcNoAU_Pk4JMUjM0,87221 +pandas/tests/test_nanops.py,sha256=S5-hkO0b8yio0u0zW1t5sWVZAueLGO8qJsZ9h5K76Rw,38312 +pandas/tests/test_optional_dependency.py,sha256=oX9RslBhLbUgfZGRTxE0ApeKYOPrgMgYNeRbLxc5_68,1456 +pandas/tests/test_register_accessor.py,sha256=B-LXZTt33BwfEZoge0P_CF-NoPwa52zoqyUDO8miYFU,2346 +pandas/tests/test_sorting.py,sha256=xDaERJw4I7ZPoVm4ANrJn7sqx8i35wKp68XLOgqDXlQ,17786 +pandas/tests/test_strings.py,sha256=tdmShFNRyPTL7N3azFIMJ8vhOebUgUac0p-pDyqPZyU,131537 +pandas/tests/test_take.py,sha256=v5vfrCEa7gg6lhfHOTkNSl2koeTcH4BmPeghHGHyuB0,16517 +pandas/tests/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/api/test_api.py,sha256=doQQBbdKOmiNh24pbh4lweWa8F2lzLbQzl_8Uph1buI,8579 +pandas/tests/api/test_types.py,sha256=hCK0T11SGVVGkWcB09oI8a88whWTBjeWmwuMSUJn_IA,1684 +pandas/tests/arithmetic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/arithmetic/common.py,sha256=xkx5xTBstm87_4LKafOyvBXEet3Re6g2DXFp2hQ4JWs,2448 
+pandas/tests/arithmetic/conftest.py,sha256=AN-xU8tP1nUkZgqOqDJwKEoNpuka6EcxZkVKWpaPUPg,6014 +pandas/tests/arithmetic/test_datetime64.py,sha256=HglClnkQRwmQqU-0vEyTvSoJTftpHBOvfyCwz8XMdWs,87353 +pandas/tests/arithmetic/test_interval.py,sha256=VtyfUlgpz1mbVTyD-bLrxAcxUMBYdVxhkXlYU7hqjyQ,9459 +pandas/tests/arithmetic/test_numeric.py,sha256=XEtYHRMRLI811z6Ix1uwA7hkPpmOnaMpYwLKiTOdtyg,44995 +pandas/tests/arithmetic/test_object.py,sha256=7eQslE9-Ytkj6TDFdKOXuelwtYy2CNphSieY6lPpr_Q,11617 +pandas/tests/arithmetic/test_period.py,sha256=cF4UllDraBuJvZ8ahrAKzNa-sj1rGkhFdnorVUir4dc,52585 +pandas/tests/arithmetic/test_timedelta64.py,sha256=iv4A-ix2tP5yU-DQJZ6_IzI26Ub7-IOEpRByYv0Ol8g,79314 +pandas/tests/arrays/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/arrays/test_array.py,sha256=G5VST5mxuR0hn5UH6t_YE2QS3AWeew3MBN0oD0BW0Ns,12345 +pandas/tests/arrays/test_boolean.py,sha256=ctJtH_zhzeLRbaqtjBCEAaofhKITd55qQwaOojoF8Mg,32429 +pandas/tests/arrays/test_datetimelike.py,sha256=gqSguvi2SNN2T7yQ-r5sRuQXMkfOlkxvi1-Y9mRlMOQ,27518 +pandas/tests/arrays/test_datetimes.py,sha256=uSyC_NcHu2IDN5OJVzIb_TZ4ioyLO9YAdlJv254lgL0,14051 +pandas/tests/arrays/test_integer.py,sha256=VG57N6XfEC5ifwxOHlCT_nLCUuO1Vn7sSRyHQ3lrPWw,35018 +pandas/tests/arrays/test_numpy.py,sha256=QdV0-H62PnKs8Wvmu7axcdJDtB_IjzNqXZjMddcdu2Q,6532 +pandas/tests/arrays/test_period.py,sha256=_0bW5sdB6nhMIkX3KD4KJVOIiuJAurAycDs_EKMW1g8,12468 +pandas/tests/arrays/test_timedeltas.py,sha256=J531jmP5FzaunXo06uFWmaorrb0ifVoewibnVEvZXT4,9416 +pandas/tests/arrays/categorical/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/arrays/categorical/common.py,sha256=lQ8b1Pb6EWPTcKE-PQ9cymIcuDWYzH1JDyxmU6PaOi4,204 +pandas/tests/arrays/categorical/conftest.py,sha256=AmIOXNVnHCRF2-kO0cLEU6YDahNTFvUrHAbYSv3HxrA,166 +pandas/tests/arrays/categorical/test_algos.py,sha256=rMHvcPdKBe55wnkO04HkafPe-cQV2o2udQfy2H4AJ8M,7186 +pandas/tests/arrays/categorical/test_analytics.py,sha256=kHlh_E_B16fy9CdJsG7fbeTyW-7frXS12Hv7bjhhuLA,12858 +pandas/tests/arrays/categorical/test_api.py,sha256=PVIF7iK2v9fntP9LDif2e0u64IInQet0ftaeEL1iyco,20382 +pandas/tests/arrays/categorical/test_constructors.py,sha256=vSur7VvcxXSIi8NbAXqOHhwc5In2mbkFTxhiw8mftL4,23672 +pandas/tests/arrays/categorical/test_dtypes.py,sha256=fQTwnS_bO-175ipXsUTwoGobQFh1pb_fe7CDCrnTlrA,6764 +pandas/tests/arrays/categorical/test_indexing.py,sha256=Wo8NXilG3yEvS198kluwALdFUKHkfn6rdRd_n_SO9rg,10003 +pandas/tests/arrays/categorical/test_missing.py,sha256=L97tJLjsuOTRK42jxLYxXdEg9WmklmMaanH4dLSGnyk,3037 +pandas/tests/arrays/categorical/test_operators.py,sha256=UenLs4Rq9KQTZWqOlro2qGsY6g7vVvZxrvsBBDL0AGQ,16771 +pandas/tests/arrays/categorical/test_repr.py,sha256=Ctuw-r7HHPHee3kk1qcqOwQAFJAKtFnJpc0j1RRRDjw,25894 +pandas/tests/arrays/categorical/test_sorting.py,sha256=GyomvjFYw3StEbhVI6NBgmxfrVp79c2T7qDD5z9zwUk,4966 +pandas/tests/arrays/categorical/test_subclass.py,sha256=v-VtvFFLSUt9zXoLOf7YLbda4q0NQVqRWck9x2qtXys,852 +pandas/tests/arrays/categorical/test_warnings.py,sha256=xm4bh8aymFC3fQYNU0pQxgCkDpTUXyBsyJbtcZNhWJw,919 +pandas/tests/arrays/interval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/arrays/interval/test_interval.py,sha256=h4PTw39lEvNZ-CS5yGXCx3KAa8nJWRzDG926yvoySWI,6739 +pandas/tests/arrays/interval/test_ops.py,sha256=hFceHva24OQigq_64P7SJHb_OFQarK9QeIu_4caUaQI,3246 +pandas/tests/arrays/sparse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 
+pandas/tests/arrays/sparse/test_accessor.py,sha256=8r7HDzRI3W08trEqj5UwPed_Rq32iHnVx_487ynQbYs,4396 +pandas/tests/arrays/sparse/test_arithmetics.py,sha256=UrkIwKqUfAplNbn2ikxNLYcuDqqVCMkHs0GLE8ROKs8,19260 +pandas/tests/arrays/sparse/test_array.py,sha256=JVWKLgYwSlTKsMWpM87Zr8WgHV1ryZGejrWuidsJTLI,43988 +pandas/tests/arrays/sparse/test_combine_concat.py,sha256=vezXzwdOppe3oyrx0bUiylaCasO1t_EjQo9MHVrt9lc,1176 +pandas/tests/arrays/sparse/test_dtype.py,sha256=CG0XYt649Hoz4Es8MXJ6QBY1Kjv2tfho8UBzGPQ_sYI,5385 +pandas/tests/arrays/sparse/test_libsparse.py,sha256=ejPO3fCGni1-k8OMIUAq5Z5zaSg61T6bdu9lCHETtH0,20759 +pandas/tests/arrays/string_/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/arrays/string_/test_string.py,sha256=taaKayTVCnzq1q2rXQmn9-A4tJlKjhxKcoMKl6G6fpQ,8672 +pandas/tests/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/base/test_constructors.py,sha256=ghK3mE1GFPykr6DiqV_Z76pDp8WcNE51VZHbG5WTJ-8,4032 +pandas/tests/base/test_conversion.py,sha256=LR-T4j9ZaidQD7B0I_p5V0jgbafoWITnfV4VjDdPHq0,14302 +pandas/tests/base/test_ops.py,sha256=jKZSUlwPNaea4803_k2ZCCMrQgkWearaNPqCoRQnWl4,33362 +pandas/tests/computation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/computation/test_compat.py,sha256=ydReMAt3D3t0eNMU55khkJ7fYuFusMsVcsj-J2H9-uY,1331 +pandas/tests/computation/test_eval.py,sha256=FuW1S8gl3Jhlz3FbOe__cbDfnuIINKA4PzSYwye-Lhw,71621 +pandas/tests/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/config/test_config.py,sha256=XsPEoVAynRSdNvn0yxFbqIdPhZVsp64BDAbnasq57nM,18088 +pandas/tests/config/test_localization.py,sha256=PrSp-149Qdk6Jb1iaDYgLqIPfG5dRhwG6YRJO0Go1cA,2861 +pandas/tests/dtypes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/dtypes/test_common.py,sha256=j6h0ATki0cNtRlIUBLhRKzNayQiNi2aOl73e8Ci-SZ4,24364 +pandas/tests/dtypes/test_concat.py,sha256=qsIIlT0FEtul4cI0wmoi9laU1IyX4o8CnYyuRkAzdRs,2435 +pandas/tests/dtypes/test_dtypes.py,sha256=lPbZB-wsEPxWTT-ODvDp4bzRnraY2XliIYLk-8iqreQ,33525 +pandas/tests/dtypes/test_generic.py,sha256=nZIYbs2NgsTzRxX18MbcOykKWk9fRbzW2pgtPgsJQcg,3782 +pandas/tests/dtypes/test_inference.py,sha256=_4wED05qjRPeOVVJdnzn3zgpiN-mLZ-Vow_LzmrAwo8,50348 +pandas/tests/dtypes/test_missing.py,sha256=MSJ8kzydYoAIuVEr3ZKZOmjdZzewTfvChmq96lvUePk,18797 +pandas/tests/dtypes/cast/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/dtypes/cast/test_construct_from_scalar.py,sha256=8l0YU427D5NqrEcAcHgI6BT3ju5hAOiGYMgfk5hDISY,709 +pandas/tests/dtypes/cast/test_construct_ndarray.py,sha256=FqWQthn7ViFSpVbLwcMSACr7CKTd0_roxkQ4Ybjcs8Y,708 +pandas/tests/dtypes/cast/test_construct_object_arr.py,sha256=eOmUu4q0ihGTbYpCleoCnYtvwh1TBCEZQQjLeJaUMNA,717 +pandas/tests/dtypes/cast/test_convert_objects.py,sha256=ca4MmPaGI-YlJ2dm2Xc8YR0bPDddrsTq-5WWbEseWRM,290 +pandas/tests/dtypes/cast/test_downcast.py,sha256=ow1CtCx2RWAv3MPS8r7A3KEsi7FdJ-1AmXyIZ5Y20lk,2790 +pandas/tests/dtypes/cast/test_find_common_type.py,sha256=oncXqfk4BWWV8-Y8UHPg3hMJZ1S0lZOi2xdMb3SrEFk,3956 +pandas/tests/dtypes/cast/test_infer_datetimelike.py,sha256=aA7RtfP1NktXICyIIr-M34ukfnhOOtW2vmNJCl-0gCI,582 +pandas/tests/dtypes/cast/test_infer_dtype.py,sha256=Hh_a73ANdWHa6uuoqa9_7F8thM8Gq-3wqrbxTqAykbg,5823 +pandas/tests/dtypes/cast/test_promote.py,sha256=kHA3swT1NKswss3bN8aNHVOgkRNuE5cjy1ieW3IPe_Y,23403 +pandas/tests/dtypes/cast/test_upcast.py,sha256=vwjaLOGEQwVODw8JMb5T6R034HS0GIZhJGxNZhcGY3I,2217 
+pandas/tests/extension/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/extension/conftest.py,sha256=_9dvqiKQcLhPyyeHXSZPNPYVVvA1wnW3JFTBPPiOQC4,3760 +pandas/tests/extension/test_boolean.py,sha256=xPtlqHnmLcw_iIJAJT6O6rPunDb2FzQwKkP8p5ccZ6U,11183 +pandas/tests/extension/test_categorical.py,sha256=dEnazTraX-1p3YG5Ds99bCFoBZhcY9q4lOwFbw-y488,8881 +pandas/tests/extension/test_common.py,sha256=X2dv8Vbo1vmkh8uKpKyAOaLMDZqTOgFLtz1N3EQttZk,2091 +pandas/tests/extension/test_datetime.py,sha256=w5T-a79odeDnG-0g2W7UpDkOFK2A5Chvq34ZeJd-fNw,6207 +pandas/tests/extension/test_external_block.py,sha256=5_dZirWO8VnG8thrcPmToWfX2Ox2yL-mz5H_QhEVxK4,1583 +pandas/tests/extension/test_integer.py,sha256=ooVtb98JfgVrv4tvl9jnAaPQAGb1D2fmQyG1HwxxPcE,7291 +pandas/tests/extension/test_interval.py,sha256=8uaSv1SMkAvFLdQg_JH_lYmcD3wJjNDLIZ7thlecnnc,4202 +pandas/tests/extension/test_numpy.py,sha256=R84a0NYNrksO274obT7ZfPd_ySyvLOdlFEKJAVo11xM,12861 +pandas/tests/extension/test_period.py,sha256=wILn_0GaxMwb3v_jAU3NByHzJpyJc5nG7p1wlBrt2sw,4075 +pandas/tests/extension/test_sparse.py,sha256=cMTxge-U3hmdXiAyfNuf1QYiVucMbPL5H7sWEmCUKJg,12321 +pandas/tests/extension/test_string.py,sha256=PJnkY1SlbkDw1Bic4NEiHTKX2FfE8amZj_alYnXd3E8,2327 +pandas/tests/extension/arrow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +pandas/tests/extension/arrow/arrays.py,sha256=lV6AEIaDPnBfzI2HU3xSwS3IWGb5qqLz3y7Zv4OyVDM,4862 +pandas/tests/extension/arrow/test_bool.py,sha256=wnCLQG55Y79MqYxhICfm5xba-9msfw3YPW6k6c0Nc14,1821 +pandas/tests/extension/arrow/test_string.py,sha256=0pt4mJ8VyFmkBtZMe35R_bt18EtgY__CNU2dLLfZBxM,301 +pandas/tests/extension/base/__init__.py,sha256=NF6b_J2GVTZ8s0jocB0JKngggaKU6akmB0mVVbZXy5I,2121 +pandas/tests/extension/base/base.py,sha256=F97RfRu3OrUBls4Rv2S4syi7ViRblngQHAh7RgU9Xvw,312 +pandas/tests/extension/base/casting.py,sha256=aEK9ZfZSWOxyeDRIvnImZ8BCcAeY7xG084YXWg4IX8E,976 +pandas/tests/extension/base/constructors.py,sha256=Djt4vgKh8W6_k5RoMhNEF8TgPFyttO7Ns-0b9E5l7us,3172 +pandas/tests/extension/base/dtype.py,sha256=Fc95y0jPD_Yf79nx18hbQf_CgrxJNhefMe32lD-KT3k,3439 +pandas/tests/extension/base/getitem.py,sha256=PDNQJetpJruZAMjXSA9M4jHh3rrncLwF-oX7_xyWIpU,12144 +pandas/tests/extension/base/groupby.py,sha256=OwXGR-HFShZMSSChhclH2TysjYo6iJDsDp3hkwYtJ_g,3433 +pandas/tests/extension/base/interface.py,sha256=OHNCf55ytBbkaVwkwre3WmeuQ4XH70rYfppjdz40vW0,2919 +pandas/tests/extension/base/io.py,sha256=O_d56AfwAE0YEkPNMTXNXdelYej2flUdjCWLSexX5Aw,602 +pandas/tests/extension/base/methods.py,sha256=9_ufPjEjaOXph02OJzBce5cVCgfZd-7sBe-MP0Il654,14501 +pandas/tests/extension/base/missing.py,sha256=7gVT9V7MVY46Teot7AQ1NIhxcA-nBVssG1wAZhg4aCU,4245 +pandas/tests/extension/base/ops.py,sha256=MXoSrC-WH5YXizcibFit1Q3voPmWWo9GgtiWOQcumzo,6057 +pandas/tests/extension/base/printing.py,sha256=yaso4LDPXYq1FWTgmnMEd0PJx0JZ9tZ0wvQZvREWZG4,1167 +pandas/tests/extension/base/reduce.py,sha256=MzovNiXU0l2t87_wtd9vfVFDf1T-ls-ns4WLnMTN0Q8,1906 +pandas/tests/extension/base/reshaping.py,sha256=8Z0e08rwDQO61V-I1wRzmIjyPapcZt_N0dk8Tx6a9_k,11990 +pandas/tests/extension/base/setitem.py,sha256=oFk6s7_HUAaen7lJcAG17TfBY5FTUxcPdwYyEk-Iv-Y,6473 +pandas/tests/extension/decimal/__init__.py,sha256=0FmFcmPOLYaQuKo2y872UA-3tYhtSJ1XQvdu0GJJQ0M,140 +pandas/tests/extension/decimal/array.py,sha256=qmcHR2WIGPw2wvtI8rIkj1YXZEX6Lrhn-qkl4SWxwGo,6127 +pandas/tests/extension/decimal/test_decimal.py,sha256=5QI0BheEQcntNzIleVqmydwkus0SEjD8wTRA-jrP0hA,16121 
+[... remainder of pandas-1.0.0.dist-info/RECORD omitted: auto-generated listing of installed pandas 1.0.0 files (module and test paths with sha256 hashes and sizes, dist-info metadata, and __pycache__ *.pyc entries), with no project-specific content ...]
+pandas/tests/groupby/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_apply.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_bin_groupby.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_categorical.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_counting.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_filters.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_function.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_groupby.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_grouping.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_index_as_string.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_nth.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_rank.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_timegrouper.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_transform.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_value_counts.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/test_whitelist.cpython-37.pyc,, +pandas/tests/groupby/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/categorical/__pycache__/test_category.cpython-37.pyc,, +pandas/tests/indexes/categorical/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/categorical/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_astype.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_datetime.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_datetimelike.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_date_range.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_formats.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_misc.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_missing.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_ops.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_partial_slicing.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_scalar_compat.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_shift.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_timezones.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/test_tools.cpython-37.pyc,, +pandas/tests/indexes/datetimes/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_astype.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_base.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_formats.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_interval.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_interval_range.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_interval_tree.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/interval/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_analytics.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_astype.cpython-37.pyc,, 
+pandas/tests/indexes/multi/__pycache__/test_compat.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_contains.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_conversion.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_copy.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_drop.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_duplicates.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_equivalence.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_format.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_get_set.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_integrity.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_join.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_missing.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_monotonic.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_names.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_partial_indexing.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_reindex.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_reshape.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/test_sorting.cpython-37.pyc,, +pandas/tests/indexes/multi/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_asfreq.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_astype.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_formats.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_ops.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_partial_slicing.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_period.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_period_range.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_scalar_compat.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_shift.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/test_tools.cpython-37.pyc,, +pandas/tests/indexes/period/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/ranges/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/ranges/__pycache__/test_range.cpython-37.pyc,, +pandas/tests/indexes/ranges/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/ranges/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_astype.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_formats.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_ops.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_partial_slicing.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_scalar_compat.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_shift.cpython-37.pyc,, 
+pandas/tests/indexes/timedeltas/__pycache__/test_timedelta.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_timedelta_range.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/test_tools.cpython-37.pyc,, +pandas/tests/indexes/timedeltas/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/common.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/datetimelike.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_base.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_common.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_frozen.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_numeric.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_numpy_compat.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/test_setops.cpython-37.pyc,, +pandas/tests/indexes/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexing/interval/__pycache__/test_interval.cpython-37.pyc,, +pandas/tests/indexing/interval/__pycache__/test_interval_new.cpython-37.pyc,, +pandas/tests/indexing/interval/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_chaining_and_caching.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_datetime.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_getitem.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_iloc.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_indexing_slow.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_ix.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_loc.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_multiindex.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_partial.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_setitem.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_set_ops.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_slice.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_sorted.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/test_xs.cpython-37.pyc,, +pandas/tests/indexing/multiindex/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/common.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_callable.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_categorical.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_chaining_and_caching.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_check_indexer.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_coercion.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_datetime.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_floats.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_iloc.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_indexing_engines.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_indexing_slow.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_loc.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_na_indexing.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_partial.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/test_scalar.cpython-37.pyc,, 
+pandas/tests/indexing/__pycache__/test_timedelta.cpython-37.pyc,, +pandas/tests/indexing/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/internals/__pycache__/test_internals.cpython-37.pyc,, +pandas/tests/internals/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_odf.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_openpyxl.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_readers.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_style.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_writers.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_xlrd.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_xlsxwriter.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/test_xlwt.cpython-37.pyc,, +pandas/tests/io/excel/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_console.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_css.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_eng_formatting.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_format.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_printing.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_style.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_to_csv.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_to_excel.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_to_html.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_to_latex.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/test_to_markdown.cpython-37.pyc,, +pandas/tests/io/formats/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_compression.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_json_table_schema.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_normalize.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_pandas.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_readlines.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/test_ujson.cpython-37.pyc,, +pandas/tests/io/json/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_comment.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_common.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_compression.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_converters.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_c_parser_only.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_dialect.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_dtypes.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_encoding.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_header.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_index_col.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_mangle_dupes.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_multi_thread.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_na_values.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_network.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_parse_dates.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_python_parser_only.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_quoting.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_read_fwf.cpython-37.pyc,, 
+pandas/tests/io/parser/__pycache__/test_skiprows.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_textreader.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_unsupported.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/test_usecols.cpython-37.pyc,, +pandas/tests/io/parser/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/common.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/test_compat.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/test_complex.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/test_pytables_missing.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/test_store.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/test_timezones.cpython-37.pyc,, +pandas/tests/io/pytables/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/sas/__pycache__/test_sas.cpython-37.pyc,, +pandas/tests/io/sas/__pycache__/test_sas7bdat.cpython-37.pyc,, +pandas/tests/io/sas/__pycache__/test_xport.cpython-37.pyc,, +pandas/tests/io/sas/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/io/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/io/__pycache__/generate_legacy_storage_files.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_clipboard.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_common.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_compression.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_date_converters.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_feather.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_gbq.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_gcs.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_html.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_orc.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_parquet.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_pickle.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_s3.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_spss.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_sql.cpython-37.pyc,, +pandas/tests/io/__pycache__/test_stata.cpython-37.pyc,, +pandas/tests/io/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/common.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_backend.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_boxplot_method.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_converter.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_datetimelike.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_frame.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_groupby.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_hist_method.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_misc.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/test_series.cpython-37.pyc,, +pandas/tests/plotting/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/reductions/__pycache__/test_reductions.cpython-37.pyc,, +pandas/tests/reductions/__pycache__/test_stat_reductions.cpython-37.pyc,, +pandas/tests/reductions/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/resample/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_base.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_datetime_index.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_period_index.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_resampler_grouper.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_resample_api.cpython-37.pyc,, 
+pandas/tests/resample/__pycache__/test_timedelta.cpython-37.pyc,, +pandas/tests/resample/__pycache__/test_time_grouper.cpython-37.pyc,, +pandas/tests/resample/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_join.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_merge.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_merge_asof.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_merge_index_as_string.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_merge_ordered.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_multi.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/test_pivot_old.cpython-37.pyc,, +pandas/tests/reshape/merge/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_concat.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_cut.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_melt.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_pivot.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_qcut.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_reshape.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_union_categoricals.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/test_util.cpython-37.pyc,, +pandas/tests/reshape/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/scalar/interval/__pycache__/test_interval.cpython-37.pyc,, +pandas/tests/scalar/interval/__pycache__/test_ops.cpython-37.pyc,, +pandas/tests/scalar/interval/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/scalar/period/__pycache__/test_asfreq.cpython-37.pyc,, +pandas/tests/scalar/period/__pycache__/test_period.cpython-37.pyc,, +pandas/tests/scalar/period/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/scalar/timedelta/__pycache__/test_arithmetic.cpython-37.pyc,, +pandas/tests/scalar/timedelta/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/scalar/timedelta/__pycache__/test_formats.cpython-37.pyc,, +pandas/tests/scalar/timedelta/__pycache__/test_timedelta.cpython-37.pyc,, +pandas/tests/scalar/timedelta/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_arithmetic.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_comparisons.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_rendering.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_timestamp.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_timezones.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/test_unary_ops.cpython-37.pyc,, +pandas/tests/scalar/timestamp/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/scalar/__pycache__/test_nat.cpython-37.pyc,, +pandas/tests/scalar/__pycache__/test_na_scalar.cpython-37.pyc,, +pandas/tests/scalar/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_alter_index.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_boolean.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_callable.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_datetime.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_iloc.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_indexing.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_loc.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/test_numeric.cpython-37.pyc,, +pandas/tests/series/indexing/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_append.cpython-37.pyc,, 
+pandas/tests/series/methods/__pycache__/test_argsort.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_asof.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_clip.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_count.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_cov_corr.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_describe.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_diff.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_drop_duplicates.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_duplicated.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_explode.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_isin.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_nlargest.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_pct_change.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_quantile.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_rank.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_replace.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_round.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_searchsorted.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_shift.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_sort_index.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_sort_values.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_to_dict.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_truncate.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/test_value_counts.cpython-37.pyc,, +pandas/tests/series/methods/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/series/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_alter_axes.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_analytics.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_api.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_apply.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_arithmetic.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_block_internals.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_combine_concat.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_constructors.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_convert_dtypes.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_cumulative.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_datetime_values.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_dtypes.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_duplicates.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_internals.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_io.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_missing.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_operators.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_period.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_repr.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_subclass.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_timeseries.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_timezones.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_ufunc.cpython-37.pyc,, +pandas/tests/series/__pycache__/test_validate.cpython-37.pyc,, +pandas/tests/series/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tools/__pycache__/test_numeric.cpython-37.pyc,, 
+pandas/tests/tools/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tseries/frequencies/__pycache__/test_freq_code.cpython-37.pyc,, +pandas/tests/tseries/frequencies/__pycache__/test_inference.cpython-37.pyc,, +pandas/tests/tseries/frequencies/__pycache__/test_to_offset.cpython-37.pyc,, +pandas/tests/tseries/frequencies/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tseries/holiday/__pycache__/test_calendar.cpython-37.pyc,, +pandas/tests/tseries/holiday/__pycache__/test_federal.cpython-37.pyc,, +pandas/tests/tseries/holiday/__pycache__/test_holiday.cpython-37.pyc,, +pandas/tests/tseries/holiday/__pycache__/test_observance.cpython-37.pyc,, +pandas/tests/tseries/holiday/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/common.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/test_fiscal.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/test_offsets.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/test_offsets_properties.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/test_ticks.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/test_yqm_offsets.cpython-37.pyc,, +pandas/tests/tseries/offsets/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tseries/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_api.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_array_to_datetime.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_ccalendar.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_conversion.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_fields.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_libfrequencies.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_liboffsets.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_normalize_date.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_parse_iso8601.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_parsing.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_period_asfreq.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_timedeltas.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/test_timezones.cpython-37.pyc,, +pandas/tests/tslibs/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/util/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_almost_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_categorical_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_extension_array_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_frame_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_index_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_interval_array_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_numpy_array_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_produces_warning.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_assert_series_equal.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_deprecate.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_deprecate_kwarg.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_hashing.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_safe_import.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_util.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_validate_args.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_validate_args_and_kwargs.cpython-37.pyc,, +pandas/tests/util/__pycache__/test_validate_kwargs.cpython-37.pyc,, 
+pandas/tests/util/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/window/__pycache__/common.cpython-37.pyc,, +pandas/tests/window/__pycache__/conftest.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_api.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_apply.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_base_indexer.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_dtypes.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_ewm.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_expanding.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_grouper.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_numba.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_pairwise.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_rolling.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_timeseries_window.cpython-37.pyc,, +pandas/tests/window/__pycache__/test_window.cpython-37.pyc,, +pandas/tests/window/__pycache__/__init__.cpython-37.pyc,, +pandas/tests/__pycache__/test_algos.cpython-37.pyc,, +pandas/tests/__pycache__/test_common.cpython-37.pyc,, +pandas/tests/__pycache__/test_compat.cpython-37.pyc,, +pandas/tests/__pycache__/test_downstream.cpython-37.pyc,, +pandas/tests/__pycache__/test_errors.cpython-37.pyc,, +pandas/tests/__pycache__/test_expressions.cpython-37.pyc,, +pandas/tests/__pycache__/test_join.cpython-37.pyc,, +pandas/tests/__pycache__/test_lib.cpython-37.pyc,, +pandas/tests/__pycache__/test_multilevel.cpython-37.pyc,, +pandas/tests/__pycache__/test_nanops.cpython-37.pyc,, +pandas/tests/__pycache__/test_optional_dependency.cpython-37.pyc,, +pandas/tests/__pycache__/test_register_accessor.cpython-37.pyc,, +pandas/tests/__pycache__/test_sorting.cpython-37.pyc,, +pandas/tests/__pycache__/test_strings.cpython-37.pyc,, +pandas/tests/__pycache__/test_take.cpython-37.pyc,, +pandas/tests/__pycache__/__init__.cpython-37.pyc,, +pandas/tseries/__pycache__/api.cpython-37.pyc,, +pandas/tseries/__pycache__/frequencies.cpython-37.pyc,, +pandas/tseries/__pycache__/holiday.cpython-37.pyc,, +pandas/tseries/__pycache__/offsets.cpython-37.pyc,, +pandas/tseries/__pycache__/__init__.cpython-37.pyc,, +pandas/util/__pycache__/testing.cpython-37.pyc,, +pandas/util/__pycache__/_decorators.cpython-37.pyc,, +pandas/util/__pycache__/_depr_module.cpython-37.pyc,, +pandas/util/__pycache__/_doctools.cpython-37.pyc,, +pandas/util/__pycache__/_exceptions.cpython-37.pyc,, +pandas/util/__pycache__/_print_versions.cpython-37.pyc,, +pandas/util/__pycache__/_tester.cpython-37.pyc,, +pandas/util/__pycache__/_test_decorators.cpython-37.pyc,, +pandas/util/__pycache__/_validators.cpython-37.pyc,, +pandas/util/__pycache__/__init__.cpython-37.pyc,, +pandas/_config/__pycache__/config.cpython-37.pyc,, +pandas/_config/__pycache__/dates.cpython-37.pyc,, +pandas/_config/__pycache__/display.cpython-37.pyc,, +pandas/_config/__pycache__/localization.cpython-37.pyc,, +pandas/_config/__pycache__/__init__.cpython-37.pyc,, +pandas/_libs/tslibs/__pycache__/__init__.cpython-37.pyc,, +pandas/_libs/window/__pycache__/__init__.cpython-37.pyc,, +pandas/_libs/__pycache__/__init__.cpython-37.pyc,, +pandas/__pycache__/conftest.cpython-37.pyc,, +pandas/__pycache__/testing.cpython-37.pyc,, +pandas/__pycache__/_testing.cpython-37.pyc,, +pandas/__pycache__/_typing.cpython-37.pyc,, +pandas/__pycache__/_version.cpython-37.pyc,, +pandas/__pycache__/__init__.cpython-37.pyc,, diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/WHEEL b/venv/Lib/site-packages/pandas-1.0.0.dist-info/WHEEL new file mode 100644 
index 0000000..c4dd0f9 --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.33.6) +Root-Is-Purelib: false +Tag: cp37-cp37m-win_amd64 + diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/entry_points.txt b/venv/Lib/site-packages/pandas-1.0.0.dist-info/entry_points.txt new file mode 100644 index 0000000..3c1b523 --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/entry_points.txt @@ -0,0 +1,3 @@ +[pandas_plotting_backends] +matplotlib = pandas:plotting._matplotlib + diff --git a/venv/Lib/site-packages/pandas-1.0.0.dist-info/top_level.txt b/venv/Lib/site-packages/pandas-1.0.0.dist-info/top_level.txt new file mode 100644 index 0000000..fb6c7ed --- /dev/null +++ b/venv/Lib/site-packages/pandas-1.0.0.dist-info/top_level.txt @@ -0,0 +1 @@ +pandas diff --git a/venv/Lib/site-packages/pandas/__init__.py b/venv/Lib/site-packages/pandas/__init__.py new file mode 100644 index 0000000..491bcb2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/__init__.py @@ -0,0 +1,406 @@ +# flake8: noqa + +__docformat__ = "restructuredtext" + +# Let users know if they're missing any of our hard dependencies +hard_dependencies = ("numpy", "pytz", "dateutil") +missing_dependencies = [] + +for dependency in hard_dependencies: + try: + __import__(dependency) + except ImportError as e: + missing_dependencies.append(f"{dependency}: {e}") + +if missing_dependencies: + raise ImportError( + "Unable to import required dependencies:\n" + "\n".join(missing_dependencies) + ) +del hard_dependencies, dependency, missing_dependencies + +# numpy compat +from pandas.compat.numpy import ( + _np_version_under1p14, + _np_version_under1p15, + _np_version_under1p16, + _np_version_under1p17, + _np_version_under1p18, +) + +try: + from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib +except ImportError as e: # pragma: no cover + # hack but overkill to use re + module = str(e).replace("cannot import name ", "") + raise ImportError( + f"C extension: {module} not built. If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace --force' to build " + "the C extensions first." 
+ ) + +from pandas._config import ( + get_option, + set_option, + reset_option, + describe_option, + option_context, + options, +) + +# let init-time option registration happen +import pandas.core.config_init + +from pandas.core.api import ( + # dtype + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, + StringDtype, + BooleanDtype, + # missing + NA, + isna, + isnull, + notna, + notnull, + # indexes + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + IndexSlice, + # tseries + NaT, + Period, + period_range, + Timedelta, + timedelta_range, + Timestamp, + date_range, + bdate_range, + Interval, + interval_range, + DateOffset, + # conversion + to_numeric, + to_datetime, + to_timedelta, + # misc + Grouper, + factorize, + unique, + value_counts, + NamedAgg, + array, + Categorical, + set_eng_float_format, + Series, + DataFrame, +) + +from pandas.core.arrays.sparse import SparseDtype + +from pandas.tseries.api import infer_freq +from pandas.tseries import offsets + +from pandas.core.computation.api import eval + +from pandas.core.reshape.api import ( + concat, + lreshape, + melt, + wide_to_long, + merge, + merge_asof, + merge_ordered, + crosstab, + pivot, + pivot_table, + get_dummies, + cut, + qcut, +) + +import pandas.api +from pandas.util._print_versions import show_versions + +from pandas.io.api import ( + # excel + ExcelFile, + ExcelWriter, + read_excel, + # parsers + read_csv, + read_fwf, + read_table, + # pickle + read_pickle, + to_pickle, + # pytables + HDFStore, + read_hdf, + # sql + read_sql, + read_sql_query, + read_sql_table, + # misc + read_clipboard, + read_parquet, + read_orc, + read_feather, + read_gbq, + read_html, + read_json, + read_stata, + read_sas, + read_spss, +) + +from pandas.io.json import _json_normalize as json_normalize + +from pandas.util._tester import test +import pandas.testing +import pandas.arrays + +# use the closest tagged version if possible +from ._version import get_versions + +v = get_versions() +__version__ = v.get("closest-tag", v["version"]) +__git_version__ = v.get("full-revisionid") +del get_versions, v + +# GH 27101 +# TODO: remove Panel compat in 1.0 +if pandas.compat.PY37: + + def __getattr__(name): + import warnings + + if name == "Panel": + + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in " + "the next version", + FutureWarning, + stacklevel=2, + ) + + class Panel: + pass + + return Panel + + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) + + from datetime import datetime as dt + + return dt + + elif name == "np": + + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np + + return np + + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + f"The {name} class is removed from pandas. 
Accessing it from " + "the top-level namespace will also be removed in the next " + "version", + FutureWarning, + stacklevel=2, + ) + + return type(name, (), {}) + + elif name == "SparseArray": + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray + + return _SparseArray + + raise AttributeError(f"module 'pandas' has no attribute '{name}'") + + +else: + + class Panel: + pass + + class SparseDataFrame: + pass + + class SparseSeries: + pass + + class __numpy: + def __init__(self): + import numpy as np + import warnings + + self.np = np + self.warnings = warnings + + def __getattr__(self, item): + self.warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + + try: + return getattr(self.np, item) + except AttributeError: + raise AttributeError(f"module numpy has no attribute {item}") + + np = __numpy() + + class __Datetime(type): + + from datetime import datetime as dt + + datetime = dt + + def __getattr__(cls, item): + cls.emit_warning() + + try: + return getattr(cls.datetime, item) + except AttributeError: + raise AttributeError(f"module datetime has no attribute {item}") + + def __instancecheck__(cls, other): + return isinstance(other, cls.datetime) + + class __DatetimeSub(metaclass=__Datetime): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from datetime import datetime as dt + + return dt(*args, **kwargs) + + datetime = __DatetimeSub + + class __SparseArray(type): + + from pandas.core.arrays.sparse import SparseArray as sa + + SparseArray = sa + + def __instancecheck__(cls, other): + return isinstance(other, cls.SparseArray) + + class __SparseArraySub(metaclass=__SparseArray): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from pandas.core.arrays.sparse import SparseArray as sa + + return sa(*args, **kwargs) + + SparseArray = __SparseArraySub + + +# module level doc-string +__doc__ = """ +pandas - a powerful data analysis and manipulation library for Python +===================================================================== + +**pandas** is a Python package providing fast, flexible, and expressive data +structures designed to make working with "relational" or "labeled" data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, **real world** data analysis in Python. Additionally, it has +the broader goal of becoming **the most powerful and flexible open source data +analysis / manipulation tool available in any language**. It is already well on +its way toward this goal. + +Main Features +------------- +Here are just a few of the things that pandas does well: + + - Easy handling of missing data in floating point as well as non-floating + point data. 
+ - Size mutability: columns can be inserted and deleted from DataFrame and + higher dimensional objects + - Automatic and explicit data alignment: objects can be explicitly aligned + to a set of labels, or the user can simply ignore the labels and let + `Series`, `DataFrame`, etc. automatically align the data for you in + computations. + - Powerful, flexible group by functionality to perform split-apply-combine + operations on data sets, for both aggregating and transforming data. + - Make it easy to convert ragged, differently-indexed data in other Python + and NumPy data structures into DataFrame objects. + - Intelligent label-based slicing, fancy indexing, and subsetting of large + data sets. + - Intuitive merging and joining data sets. + - Flexible reshaping and pivoting of data sets. + - Hierarchical labeling of axes (possible to have multiple labels per tick). + - Robust IO tools for loading data from flat files (CSV and delimited), + Excel files, databases, and saving/loading data from the ultrafast HDF5 + format. + - Time series-specific functionality: date range generation and frequency + conversion, moving window statistics, date shifting and lagging. +""" diff --git a/venv/Lib/site-packages/pandas/_config/__init__.py b/venv/Lib/site-packages/pandas/_config/__init__.py new file mode 100644 index 0000000..65936a9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_config/__init__.py @@ -0,0 +1,28 @@ +""" +pandas._config is considered explicitly upstream of everything else in pandas, +should have no intra-pandas dependencies. + +importing `dates` and `display` ensures that keys needed by _libs +are initialized. +""" +__all__ = [ + "config", + "detect_console_encoding", + "get_option", + "set_option", + "reset_option", + "describe_option", + "option_context", + "options", +] +from pandas._config import config +from pandas._config import dates # noqa:F401 +from pandas._config.config import ( + describe_option, + get_option, + option_context, + options, + reset_option, + set_option, +) +from pandas._config.display import detect_console_encoding diff --git a/venv/Lib/site-packages/pandas/_config/config.py b/venv/Lib/site-packages/pandas/_config/config.py new file mode 100644 index 0000000..0a3009f --- /dev/null +++ b/venv/Lib/site-packages/pandas/_config/config.py @@ -0,0 +1,871 @@ +""" +The config module holds package-wide configurables and provides +a uniform API for working with them. + +Overview +======== + +This module supports the following requirements: +- options are referenced using keys in dot.notation, e.g. "x.y.option - z". +- keys are case-insensitive. +- functions should accept partial/regex keys, when unambiguous. +- options can be registered by modules at import time. +- options can be registered at init-time (via core.config_init) +- options have a default value, and (optionally) a description and + validation function associated with them. +- options can be deprecated, in which case referencing them + should produce a warning. +- deprecated options can optionally be rerouted to a replacement + so that accessing a deprecated option reroutes to a differently + named option. +- options can be reset to their default value. +- all option can be reset to their default value at once. +- all options in a certain sub - namespace can be reset at once. +- the user can set / get / reset or ask for the description of an option. +- a developer can register and mark an option as deprecated. +- you can register a callback to be invoked when the option value + is set or reset. 
Changing the stored value is considered misuse, but + is not verboten. + +Implementation +============== + +- Data is stored using nested dictionaries, and should be accessed + through the provided API. + +- "Registered options" and "Deprecated options" have metadata associated + with them, which are stored in auxiliary dictionaries keyed on the + fully-qualified key, e.g. "x.y.z.option". + +- the config_init module is imported by the package's __init__.py file. + placing any register_option() calls there will ensure those options + are available as soon as pandas is loaded. If you use register_option + in a module, it will only be available after that module is imported, + which you should be aware of. + +- `config_prefix` is a context_manager (for use with the `with` keyword) + which can save developers some typing, see the docstring. + +""" + +from collections import namedtuple +from contextlib import contextmanager +import re +from typing import Any, Dict, Iterable, List +import warnings + +DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") +RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") + +# holds deprecated option metadata +_deprecated_options: Dict[str, DeprecatedOption] = {} + +# holds registered option metadata +_registered_options: Dict[str, RegisteredOption] = {} + +# holds the current values for registered options +_global_config: Dict[str, Any] = {} + +# keys which have a special meaning +_reserved_keys: List[str] = ["all"] + + +class OptionError(AttributeError, KeyError): + """Exception for pandas.options, backwards compatible with KeyError + checks + """ + + +# +# User API + + +def _get_single_key(pat, silent): + keys = _select_options(pat) + if len(keys) == 0: + if not silent: + _warn_if_deprecated(pat) + raise OptionError(f"No such keys(s): {repr(pat)}") + if len(keys) > 1: + raise OptionError("Pattern matched multiple keys") + key = keys[0] + + if not silent: + _warn_if_deprecated(key) + + key = _translate_key(key) + + return key + + +def _get_option(pat, silent=False): + key = _get_single_key(pat, silent) + + # walk the nested dict + root, k = _get_root(key) + return root[k] + + +def _set_option(*args, **kwargs): + # must at least 1 arg deal with constraints later + nargs = len(args) + if not nargs or nargs % 2 != 0: + raise ValueError("Must provide an even number of non-keyword arguments") + + # default to false + silent = kwargs.pop("silent", False) + + if kwargs: + kwarg = list(kwargs.keys())[0] + raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"') + + for k, v in zip(args[::2], args[1::2]): + key = _get_single_key(k, silent) + + o = _get_registered_option(key) + if o and o.validator: + o.validator(v) + + # walk the nested dict + root, k = _get_root(key) + root[k] = v + + if o.cb: + if silent: + with warnings.catch_warnings(record=True): + o.cb(key) + else: + o.cb(key) + + +def _describe_option(pat="", _print_desc=True): + + keys = _select_options(pat) + if len(keys) == 0: + raise OptionError("No such keys(s)") + + s = "" + for k in keys: # filter by pat + s += _build_option_description(k) + + if _print_desc: + print(s) + else: + return s + + +def _reset_option(pat, silent=False): + + keys = _select_options(pat) + + if len(keys) == 0: + raise OptionError("No such keys(s)") + + if len(keys) > 1 and len(pat) < 4 and pat != "all": + raise ValueError( + "You must specify at least 4 characters when " + "resetting multiple keys, use the special keyword " + '"all" to reset all the options 
to their default ' + "value" + ) + + for k in keys: + _set_option(k, _registered_options[k].defval, silent=silent) + + +def get_default_val(pat): + key = _get_single_key(pat, silent=True) + return _get_registered_option(key).defval + + +class DictWrapper: + """ provide attribute-style access to a nested dict""" + + def __init__(self, d, prefix=""): + object.__setattr__(self, "d", d) + object.__setattr__(self, "prefix", prefix) + + def __setattr__(self, key, val): + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + # you can't set new keys + # can you can't overwrite subtrees + if key in self.d and not isinstance(self.d[key], dict): + _set_option(prefix, val) + else: + raise OptionError("You can only set the value of existing options") + + def __getattr__(self, key: str): + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + try: + v = object.__getattribute__(self, "d")[key] + except KeyError: + raise OptionError("No such option") + if isinstance(v, dict): + return DictWrapper(v, prefix) + else: + return _get_option(prefix) + + def __dir__(self): + return list(self.d.keys()) + + +# For user convenience, we'd like to have the available options described +# in the docstring. For dev convenience we'd like to generate the docstrings +# dynamically instead of maintaining them by hand. To this, we use the +# class below which wraps functions inside a callable, and converts +# __doc__ into a property function. The doctsrings below are templates +# using the py2.6+ advanced formatting syntax to plug in a concise list +# of options, and option descriptions. + + +class CallableDynamicDoc: + def __init__(self, func, doc_tmpl): + self.__doc_tmpl__ = doc_tmpl + self.__func__ = func + + def __call__(self, *args, **kwds): + return self.__func__(*args, **kwds) + + @property + def __doc__(self): + opts_desc = _describe_option("all", _print_desc=False) + opts_list = pp_options_list(list(_registered_options.keys())) + return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list) + + +_get_option_tmpl = """ +get_option(pat) + +Retrieves the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. + +Returns +------- +result : the value of the option + +Raises +------ +OptionError : if no such option exists + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_set_option_tmpl = """ +set_option(pat, value) + +Sets the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. +value : object + New value of option. + +Returns +------- +None + +Raises +------ +OptionError if no such option exists + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_describe_option_tmpl = """ +describe_option(pat, _print_desc=False) + +Prints the description for one or more registered options. + +Call with not arguments to get a listing for all registered options. 
+ +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp pattern. All matching keys will have their description displayed. +_print_desc : bool, default True + If True (default) the description(s) will be printed to stdout. + Otherwise, the description(s) will be returned as a unicode string + (for testing). + +Returns +------- +None by default, the description(s) as a unicode string if _print_desc +is False + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_reset_option_tmpl = """ +reset_option(pat) + +Reset one or more options to their default value. + +Pass "all" as argument to reset all options. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str/regex + If specified only options matching `prefix*` will be reset. + Note: partial matches are supported for convenience, but unless you + use the full option name (e.g. x.y.z.option_name), your code may break + in future versions if new options with similar names are introduced. + +Returns +------- +None + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +# bind the functions with their docstrings into a Callable +# and use that as the functions exposed in pd.api +get_option = CallableDynamicDoc(_get_option, _get_option_tmpl) +set_option = CallableDynamicDoc(_set_option, _set_option_tmpl) +reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl) +describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl) +options = DictWrapper(_global_config) + +# +# Functions for use by pandas developers, in addition to User - api + + +class option_context: + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + + >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): + ... ... + """ + + def __init__(self, *args): + if not (len(args) % 2 == 0 and len(args) >= 2): + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." + ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self): + self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops] + + for pat, val in self.ops: + _set_option(pat, val, silent=True) + + def __exit__(self, *args): + if self.undo: + for pat, val in self.undo: + _set_option(pat, val, silent=True) + + +def register_option(key: str, defval: object, doc="", validator=None, cb=None): + """Register an option in the package-wide pandas config object + + Parameters + ---------- + key - a fully-qualified key, e.g. "x.y.option - z". + defval - the default value of the option + doc - a string description of the option + validator - a function of a single argument, should raise `ValueError` if + called with a value which is not a legal value for the option. + cb - a function of a single argument "key", which is called + immediately after an option value is set/reset. key is + the full name of the option. + + Returns + ------- + Nothing. + + Raises + ------ + ValueError if `validator` is specified and `defval` is not a valid value. 
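# --- Illustrative usage sketch (annotation; not part of the patched pandas files) ---
# option_context saves the current values on __enter__ and restores them on
# __exit__, so temporary display tweaks cannot leak out of the `with` block:
import pandas as pd

df = pd.DataFrame({"a": range(100)})
before = pd.get_option("display.max_rows")
with pd.option_context("display.max_rows", 5, "display.min_rows", 5):
    print(df)                                    # repr truncated inside the block
assert pd.get_option("display.max_rows") == before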
+ + """ + import tokenize + import keyword + + key = key.lower() + + if key in _registered_options: + raise OptionError(f"Option '{key}' has already been registered") + if key in _reserved_keys: + raise OptionError(f"Option '{key}' is a reserved key") + + # the default value should be legal + if validator: + validator(defval) + + # walk the nested dict, creating dicts as needed along the path + path = key.split(".") + + for k in path: + # NOTE: tokenize.Name is not a public constant + # error: Module has no attribute "Name" [attr-defined] + if not re.match("^" + tokenize.Name + "$", k): # type: ignore + raise ValueError(f"{k} is not a valid identifier") + if keyword.iskeyword(k): + raise ValueError(f"{k} is a python keyword") + + cursor = _global_config + msg = "Path prefix to option '{option}' is already an option" + + for i, p in enumerate(path[:-1]): + if not isinstance(cursor, dict): + raise OptionError(msg.format(option=".".join(path[:i]))) + if p not in cursor: + cursor[p] = {} + cursor = cursor[p] + + if not isinstance(cursor, dict): + raise OptionError(msg.format(option=".".join(path[:-1]))) + + cursor[path[-1]] = defval # initialize + + # save the option metadata + _registered_options[key] = RegisteredOption( + key=key, defval=defval, doc=doc, validator=validator, cb=cb + ) + + +def deprecate_option(key, msg=None, rkey=None, removal_ver=None): + """ + Mark option `key` as deprecated, if code attempts to access this option, + a warning will be produced, using `msg` if given, or a default message + if not. + if `rkey` is given, any access to the key will be re-routed to `rkey`. + + Neither the existence of `key` nor that if `rkey` is checked. If they + do not exist, any subsequence access will fail as usual, after the + deprecation warning is given. + + Parameters + ---------- + key - the name of the option to be deprecated. must be a fully-qualified + option name (e.g "x.y.z.rkey"). + + msg - (Optional) a warning message to output when the key is referenced. + if no message is given a default message will be emitted. + + rkey - (Optional) the name of an option to reroute access to. + If specified, any referenced `key` will be re-routed to `rkey` + including set/get/reset. + rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). + used by the default message if no `msg` is specified. + + removal_ver - (Optional) specifies the version in which this option will + be removed. used by the default message if no `msg` + is specified. + + Returns + ------- + Nothing + + Raises + ------ + OptionError - if key has already been deprecated. 
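# --- Illustrative developer sketch (hypothetical "myext.*" keys; annotation only) ---
# register_option validates the default, creates the nested path in
# _global_config and records the metadata; deprecate_option then re-routes an
# old key to its replacement while warning on every access:
import warnings
import pandas._config.config as cf

cf.register_option("myext.enabled", True, "enable the hypothetical extension",
                   validator=cf.is_bool)
cf.register_option("myext.use_cache", True, "old spelling, kept for back-compat",
                   validator=cf.is_bool)
cf.deprecate_option("myext.use_cache", rkey="myext.enabled")

with warnings.catch_warnings(record=True):
    warnings.simplefilter("always")
    # emits a FutureWarning and reads the value stored under myext.enabled
    assert cf.get_option("myext.use_cache") is True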
+ + """ + + key = key.lower() + + if key in _deprecated_options: + raise OptionError(f"Option '{key}' has already been defined as deprecated.") + + _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver) + + +# +# functions internal to the module + + +def _select_options(pat): + """returns a list of keys matching `pat` + + if pat=="all", returns all registered options + """ + + # short-circuit for exact key + if pat in _registered_options: + return [pat] + + # else look through all of them + keys = sorted(_registered_options.keys()) + if pat == "all": # reserved key + return keys + + return [k for k in keys if re.search(pat, k, re.I)] + + +def _get_root(key): + path = key.split(".") + cursor = _global_config + for p in path[:-1]: + cursor = cursor[p] + return cursor, path[-1] + + +def _is_deprecated(key): + """ Returns True if the given option has been deprecated """ + + key = key.lower() + return key in _deprecated_options + + +def _get_deprecated_option(key): + """ + Retrieves the metadata for a deprecated option, if `key` is deprecated. + + Returns + ------- + DeprecatedOption (namedtuple) if key is deprecated, None otherwise + """ + + try: + d = _deprecated_options[key] + except KeyError: + return None + else: + return d + + +def _get_registered_option(key): + """ + Retrieves the option metadata if `key` is a registered option. + + Returns + ------- + RegisteredOption (namedtuple) if key is deprecated, None otherwise + """ + return _registered_options.get(key) + + +def _translate_key(key): + """ + if key id deprecated and a replacement key defined, will return the + replacement key, otherwise returns `key` as - is + """ + + d = _get_deprecated_option(key) + if d: + return d.rkey or key + else: + return key + + +def _warn_if_deprecated(key): + """ + Checks if `key` is a deprecated option and if so, prints a warning. + + Returns + ------- + bool - True if `key` is deprecated, False otherwise. + """ + + d = _get_deprecated_option(key) + if d: + if d.msg: + print(d.msg) + warnings.warn(d.msg, FutureWarning) + else: + msg = f"'{key}' is deprecated" + if d.removal_ver: + msg += f" and will be removed in {d.removal_ver}" + if d.rkey: + msg += f", please use '{d.rkey}' instead." + else: + msg += ", please refrain from using it." + + warnings.warn(msg, FutureWarning) + return True + return False + + +def _build_option_description(k): + """ Builds a formatted description of a registered option and prints it """ + + o = _get_registered_option(k) + d = _get_deprecated_option(k) + + s = f"{k} " + + if o.doc: + s += "\n".join(o.doc.strip().split("\n")) + else: + s += "No description available." + + if o: + s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" + + if d: + rkey = d.rkey if d.rkey else "" + s += "\n (Deprecated" + s += f", use `{rkey}` instead." 
+ s += ")" + + return s + + +def pp_options_list(keys, width=80, _print=False): + """ Builds a concise listing of available options, grouped by prefix """ + + from textwrap import wrap + from itertools import groupby + + def pp(name: str, ks: Iterable[str]) -> List[str]: + pfx = "- " + name + ".[" if name else "" + ls = wrap( + ", ".join(ks), + width, + initial_indent=pfx, + subsequent_indent=" ", + break_long_words=False, + ) + if ls and ls[-1] and name: + ls[-1] = ls[-1] + "]" + return ls + + ls: List[str] = [] + singles = [x for x in sorted(keys) if x.find(".") < 0] + if singles: + ls += pp("", singles) + keys = [x for x in keys if x.find(".") >= 0] + + for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]): + ks = [x[len(k) + 1 :] for x in list(g)] + ls += pp(k, ks) + s = "\n".join(ls) + if _print: + print(s) + else: + return s + + +# +# helpers + + +@contextmanager +def config_prefix(prefix): + """contextmanager for multiple invocations of API with a common prefix + + supported API functions: (register / get / set )__option + + Warning: This is not thread - safe, and won't work properly if you import + the API functions into your module using the "from x import y" construct. + + Example: + + import pandas._config.config as cf + with cf.config_prefix("display.font"): + cf.register_option("color", "red") + cf.register_option("size", " 5 pt") + cf.set_option(size, " 6 pt") + cf.get_option(size) + ... + + etc' + + will register options "display.font.color", "display.font.size", set the + value of "display.font.size"... and so on. + """ + + # Note: reset_option relies on set_option, and on key directly + # it does not fit in to this monkey-patching scheme + + global register_option, get_option, set_option, reset_option + + def wrap(func): + def inner(key, *args, **kwds): + pkey = f"{prefix}.{key}" + return func(pkey, *args, **kwds) + + return inner + + _register_option = register_option + _get_option = get_option + _set_option = set_option + set_option = wrap(set_option) + get_option = wrap(get_option) + register_option = wrap(register_option) + yield None + set_option = _set_option + get_option = _get_option + register_option = _register_option + + +# These factories and methods are handy for use as the validator +# arg in register_option + + +def is_type_factory(_type): + """ + + Parameters + ---------- + `_type` - a type to be compared against (e.g. 
type(x) == `_type`) + + Returns + ------- + validator - a function of a single argument x , which raises + ValueError if type(x) is not equal to `_type` + + """ + + def inner(x): + if type(x) != _type: + raise ValueError(f"Value must have type '{_type}'") + + return inner + + +def is_instance_factory(_type): + """ + + Parameters + ---------- + `_type` - the type to be checked against + + Returns + ------- + validator - a function of a single argument x , which raises + ValueError if x is not an instance of `_type` + + """ + + if isinstance(_type, (tuple, list)): + _type = tuple(_type) + type_repr = "|".join(map(str, _type)) + else: + type_repr = f"'{_type}'" + + def inner(x): + if not isinstance(x, _type): + raise ValueError(f"Value must be an instance of {type_repr}") + + return inner + + +def is_one_of_factory(legal_values): + + callables = [c for c in legal_values if callable(c)] + legal_values = [c for c in legal_values if not callable(c)] + + def inner(x): + if x not in legal_values: + + if not any(c(x) for c in callables): + uvals = [str(lval) for lval in legal_values] + pp_values = "|".join(uvals) + msg = f"Value must be one of {pp_values}" + if len(callables): + msg += " or a callable" + raise ValueError(msg) + + return inner + + +def is_nonnegative_int(value): + """ + Verify that value is None or a positive int. + + Parameters + ---------- + value : None or int + The `value` to be checked. + + Raises + ------ + ValueError + When the value is not None or is a negative integer + """ + + if value is None: + return + + elif isinstance(value, int): + if value >= 0: + return + + msg = "Value must be a nonnegative integer or None" + raise ValueError(msg) + + +# common type validators, for convenience +# usage: register_option(... , validator = is_int) +is_int = is_type_factory(int) +is_bool = is_type_factory(bool) +is_float = is_type_factory(float) +is_str = is_type_factory(str) +is_text = is_instance_factory((str, bytes)) + + +def is_callable(obj): + """ + + Parameters + ---------- + `obj` - the object to be checked + + Returns + ------- + validator - returns True if object is callable + raises ValueError otherwise. + + """ + if not callable(obj): + raise ValueError("Value must be a callable") + return True diff --git a/venv/Lib/site-packages/pandas/_config/dates.py b/venv/Lib/site-packages/pandas/_config/dates.py new file mode 100644 index 0000000..5bf2b49 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_config/dates.py @@ -0,0 +1,23 @@ +""" +config for datetime formatting +""" +from pandas._config import config as cf + +pc_date_dayfirst_doc = """ +: boolean + When True, prints and parses dates with the day first, eg 20/01/2005 +""" + +pc_date_yearfirst_doc = """ +: boolean + When True, prints and parses dates with the year first, eg 2005/01/20 +""" + +with cf.config_prefix("display"): + # Needed upstream of `_libs` because these are used in tslibs.parsing + cf.register_option( + "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool + ) + cf.register_option( + "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool + ) diff --git a/venv/Lib/site-packages/pandas/_config/display.py b/venv/Lib/site-packages/pandas/_config/display.py new file mode 100644 index 0000000..067b7c5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_config/display.py @@ -0,0 +1,59 @@ +""" +Unopinionated display configuration. 
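# --- Illustrative developer sketch (hypothetical "myext.engine" key; annotation only) ---
# The factories above build the single-argument validators that register_option
# runs on the default value and on every later set_option call:
import pandas._config.config as cf

cf.register_option("myext.engine", "c", "hypothetical parser engine",
                   validator=cf.is_one_of_factory(["c", "python"]))
cf.set_option("myext.engine", "python")          # accepted
try:
    cf.set_option("myext.engine", "rust")        # rejected by the validator
except ValueError as err:
    print(err)                                   # "Value must be one of c|python"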
+""" +import locale +import sys + +from pandas._config import config as cf + +# ----------------------------------------------------------------------------- +# Global formatting options +_initial_defencoding = None + + +def detect_console_encoding(): + """ + Try to find the most capable encoding supported by the console. + slightly modified from the way IPython handles the same issue. + """ + global _initial_defencoding + + encoding = None + try: + encoding = sys.stdout.encoding or sys.stdin.encoding + except (AttributeError, IOError): + pass + + # try again for something better + if not encoding or "ascii" in encoding.lower(): + try: + encoding = locale.getpreferredencoding() + except locale.Error: + # can be raised by locale.setlocale(), which is + # called by getpreferredencoding + # (on some systems, see stdlib locale docs) + pass + + # when all else fails. this will usually be "ascii" + if not encoding or "ascii" in encoding.lower(): + encoding = sys.getdefaultencoding() + + # GH#3360, save the reported defencoding at import time + # MPL backends may change it. Make available for debugging. + if not _initial_defencoding: + _initial_defencoding = sys.getdefaultencoding() + + return encoding + + +pc_encoding_doc = """ +: str/unicode + Defaults to the detected encoding of the console. + Specifies the encoding to be used for strings returned by to_string, + these are generally strings meant to be displayed on the console. +""" + +with cf.config_prefix("display"): + cf.register_option( + "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text + ) diff --git a/venv/Lib/site-packages/pandas/_config/localization.py b/venv/Lib/site-packages/pandas/_config/localization.py new file mode 100644 index 0000000..dd1d494 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_config/localization.py @@ -0,0 +1,166 @@ +""" +Helpers for configuring locale settings. + +Name `localization` is chosen to avoid overlap with builtin `locale` module. +""" +from contextlib import contextmanager +import locale +import re +import subprocess + +from pandas._config.config import options + + +@contextmanager +def set_locale(new_locale, lc_var=locale.LC_ALL): + """ + Context manager for temporarily setting a locale. + + Parameters + ---------- + new_locale : str or tuple + A string of the form .. For example to set + the current locale to US English with a UTF8 encoding, you would pass + "en_US.UTF-8". + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. + + Notes + ----- + This is useful when you want to run a particular block of code under a + particular locale, without globally setting the locale. This probably isn't + thread-safe. + """ + current_locale = locale.getlocale() + + try: + locale.setlocale(lc_var, new_locale) + normalized_locale = locale.getlocale() + if all(x is not None for x in normalized_locale): + yield ".".join(normalized_locale) + else: + yield new_locale + finally: + locale.setlocale(lc_var, current_locale) + + +def can_set_locale(lc, lc_var=locale.LC_ALL): + """ + Check to see if we can set a locale, and subsequently get the locale, + without raising an Exception. + + Parameters + ---------- + lc : str + The locale to attempt to set. + lc_var : int, default `locale.LC_ALL` + The category of the locale being set. 
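# --- Illustrative sketch (locale names depend on what the host system has installed) ---
# set_locale applies a locale only inside the `with` block and always restores
# the previous one; can_set_locale probes availability first, and get_locales
# (defined just below) lists what `locale -a` reports, or None on platforms
# without that command:
import locale
from pandas._config.localization import can_set_locale, get_locales, set_locale

if can_set_locale("de_DE.UTF-8"):
    with set_locale("de_DE.UTF-8"):
        print(locale.localeconv()["decimal_point"])   # "," under the German locale
print(get_locales(prefix="en"))                       # e.g. ['en_US.UTF-8', ...] or None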
+ + Returns + ------- + is_valid : bool + Whether the passed locale can be set + """ + + try: + with set_locale(lc, lc_var=lc_var): + pass + except (ValueError, locale.Error): + # horrible name for a Exception subclass + return False + else: + return True + + +def _valid_locales(locales, normalize): + """ + Return a list of normalized locales that do not throw an ``Exception`` + when set. + + Parameters + ---------- + locales : str + A string where each locale is separated by a newline. + normalize : bool + Whether to call ``locale.normalize`` on each locale. + + Returns + ------- + valid_locales : list + A list of valid locales. + """ + if normalize: + normalizer = lambda x: locale.normalize(x.strip()) + else: + normalizer = lambda x: x.strip() + + return list(filter(can_set_locale, map(normalizer, locales))) + + +def _default_locale_getter(): + raw_locales = subprocess.check_output(["locale -a"], shell=True) + return raw_locales + + +def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): + """ + Get all the locales that are available on the system. + + Parameters + ---------- + prefix : str + If not ``None`` then return only those locales with the prefix + provided. For example to get all English language locales (those that + start with ``"en"``), pass ``prefix="en"``. + normalize : bool + Call ``locale.normalize`` on the resulting list of available locales. + If ``True``, only locales that can be set without throwing an + ``Exception`` are returned. + locale_getter : callable + The function to use to retrieve the current locales. This should return + a string with each locale separated by a newline character. + + Returns + ------- + locales : list of strings + A list of locale strings that can be set with ``locale.setlocale()``. + For example:: + + locale.setlocale(locale.LC_ALL, locale_string) + + On error will return None (no locale available, e.g. Windows) + + """ + try: + raw_locales = locale_getter() + except subprocess.CalledProcessError: + # Raised on (some? all?) Windows platforms because Note: "locale -a" + # is not defined + return None + + try: + # raw_locales is "\n" separated list of locales + # it may contain non-decodable parts, so split + # extract what we can and then rejoin. + raw_locales = raw_locales.split(b"\n") + out_locales = [] + for x in raw_locales: + try: + out_locales.append(str(x, encoding=options.display.encoding)) + except UnicodeError: + # 'locale -a' is used to populated 'raw_locales' and on + # Redhat 7 Linux (and maybe others) prints locale names + # using windows-1252 encoding. Bug only triggered by + # a few special characters and when there is an + # extensive list of installed locales. 
+ out_locales.append(str(x, encoding="windows-1252")) + + except TypeError: + pass + + if prefix is None: + return _valid_locales(out_locales, normalize) + + pattern = re.compile(f"{prefix}.*") + found = pattern.findall("\n".join(out_locales)) + return _valid_locales(found, normalize) diff --git a/venv/Lib/site-packages/pandas/_libs/__init__.py b/venv/Lib/site-packages/pandas/_libs/__init__.py new file mode 100644 index 0000000..af67cb3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_libs/__init__.py @@ -0,0 +1,11 @@ +# flake8: noqa + +from .tslibs import ( + NaT, + NaTType, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + iNaT, +) diff --git a/venv/Lib/site-packages/pandas/_libs/algos.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/algos.cp37-win_amd64.pyd new file mode 100644 index 0000000..6cdfe63 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/algos.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/groupby.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/groupby.cp37-win_amd64.pyd new file mode 100644 index 0000000..cd785f6 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/groupby.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/hashing.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/hashing.cp37-win_amd64.pyd new file mode 100644 index 0000000..a431e51 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/hashing.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/hashtable.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/hashtable.cp37-win_amd64.pyd new file mode 100644 index 0000000..8cebd10 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/hashtable.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/index.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/index.cp37-win_amd64.pyd new file mode 100644 index 0000000..7cbc6a2 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/index.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/indexing.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/indexing.cp37-win_amd64.pyd new file mode 100644 index 0000000..31c821e Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/indexing.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/internals.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/internals.cp37-win_amd64.pyd new file mode 100644 index 0000000..e5a4d3d Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/internals.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/interval.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/interval.cp37-win_amd64.pyd new file mode 100644 index 0000000..8717f75 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/interval.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/join.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/join.cp37-win_amd64.pyd new file mode 100644 index 0000000..2100106 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/join.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/json.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/json.cp37-win_amd64.pyd new file mode 100644 index 0000000..2dcb79c Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/json.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/lib.cp37-win_amd64.pyd 
b/venv/Lib/site-packages/pandas/_libs/lib.cp37-win_amd64.pyd new file mode 100644 index 0000000..ee4ac16 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/lib.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/missing.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/missing.cp37-win_amd64.pyd new file mode 100644 index 0000000..bdf3eac Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/missing.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/ops.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/ops.cp37-win_amd64.pyd new file mode 100644 index 0000000..a84af4d Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/ops.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/ops_dispatch.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/ops_dispatch.cp37-win_amd64.pyd new file mode 100644 index 0000000..10024b2 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/ops_dispatch.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/parsers.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/parsers.cp37-win_amd64.pyd new file mode 100644 index 0000000..acacf9f Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/parsers.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/properties.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/properties.cp37-win_amd64.pyd new file mode 100644 index 0000000..911ae25 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/properties.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/reduction.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/reduction.cp37-win_amd64.pyd new file mode 100644 index 0000000..1fdaa12 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/reduction.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/reshape.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/reshape.cp37-win_amd64.pyd new file mode 100644 index 0000000..3ac3544 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/reshape.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/sparse.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/sparse.cp37-win_amd64.pyd new file mode 100644 index 0000000..d12aa93 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/sparse.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/testing.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/testing.cp37-win_amd64.pyd new file mode 100644 index 0000000..33e14f1 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/testing.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslib.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslib.cp37-win_amd64.pyd new file mode 100644 index 0000000..ced6866 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslib.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/__init__.py b/venv/Lib/site-packages/pandas/_libs/tslibs/__init__.py new file mode 100644 index 0000000..8d3b00e --- /dev/null +++ b/venv/Lib/site-packages/pandas/_libs/tslibs/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa + +from .conversion import localize_pydatetime, normalize_date +from .nattype import NaT, NaTType, iNaT, is_null_datetimelike +from .np_datetime import OutOfBoundsDatetime +from .period import IncompatibleFrequency, Period +from 
.timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta +from .timestamps import Timestamp +from .tzconversion import tz_convert_single + +# import fails if we do this before np_datetime +from .c_timestamp import NullFrequencyError # isort:skip diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/c_timestamp.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/c_timestamp.cp37-win_amd64.pyd new file mode 100644 index 0000000..8faacfb Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/c_timestamp.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/ccalendar.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/ccalendar.cp37-win_amd64.pyd new file mode 100644 index 0000000..f0b4526 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/ccalendar.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/conversion.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/conversion.cp37-win_amd64.pyd new file mode 100644 index 0000000..6f53699 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/conversion.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/fields.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/fields.cp37-win_amd64.pyd new file mode 100644 index 0000000..6ac2727 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/fields.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/frequencies.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/frequencies.cp37-win_amd64.pyd new file mode 100644 index 0000000..90e3c2d Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/frequencies.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/nattype.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/nattype.cp37-win_amd64.pyd new file mode 100644 index 0000000..886926c Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/nattype.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/np_datetime.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/np_datetime.cp37-win_amd64.pyd new file mode 100644 index 0000000..83b06a1 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/np_datetime.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/offsets.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/offsets.cp37-win_amd64.pyd new file mode 100644 index 0000000..2a693fa Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/offsets.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/parsing.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/parsing.cp37-win_amd64.pyd new file mode 100644 index 0000000..8abafe1 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/parsing.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/period.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/period.cp37-win_amd64.pyd new file mode 100644 index 0000000..2ea9165 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/period.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/resolution.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/resolution.cp37-win_amd64.pyd new file mode 100644 index 
0000000..c12e513 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/resolution.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/strptime.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/strptime.cp37-win_amd64.pyd new file mode 100644 index 0000000..0f993b3 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/strptime.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/timedeltas.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/timedeltas.cp37-win_amd64.pyd new file mode 100644 index 0000000..abeb00d Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/timedeltas.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/timestamps.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/timestamps.cp37-win_amd64.pyd new file mode 100644 index 0000000..f9a141b Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/timestamps.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/timezones.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/timezones.cp37-win_amd64.pyd new file mode 100644 index 0000000..796c633 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/timezones.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/tslibs/tzconversion.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/tslibs/tzconversion.cp37-win_amd64.pyd new file mode 100644 index 0000000..cf5f609 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/tslibs/tzconversion.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/window/__init__.py b/venv/Lib/site-packages/pandas/_libs/window/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/_libs/window/aggregations.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/window/aggregations.cp37-win_amd64.pyd new file mode 100644 index 0000000..ff357ea Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/window/aggregations.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/window/concrt140.dll b/venv/Lib/site-packages/pandas/_libs/window/concrt140.dll new file mode 100644 index 0000000..1065145 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/window/concrt140.dll differ diff --git a/venv/Lib/site-packages/pandas/_libs/window/indexers.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/window/indexers.cp37-win_amd64.pyd new file mode 100644 index 0000000..1a42e87 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/window/indexers.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_libs/window/msvcp140.dll b/venv/Lib/site-packages/pandas/_libs/window/msvcp140.dll new file mode 100644 index 0000000..98313d4 Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/window/msvcp140.dll differ diff --git a/venv/Lib/site-packages/pandas/_libs/writers.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/_libs/writers.cp37-win_amd64.pyd new file mode 100644 index 0000000..33a92ee Binary files /dev/null and b/venv/Lib/site-packages/pandas/_libs/writers.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/_testing.py b/venv/Lib/site-packages/pandas/_testing.py new file mode 100644 index 0000000..1fdc5d4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/_testing.py @@ -0,0 +1,2745 @@ +import bz2 +from 
collections import Counter +from contextlib import contextmanager +from datetime import datetime +from functools import wraps +import gzip +import os +from shutil import rmtree +import string +import tempfile +from typing import Any, List, Optional, Union, cast +import warnings +import zipfile + +import numpy as np +from numpy.random import rand, randn + +from pandas._config.localization import ( # noqa:F401 + can_set_locale, + get_locales, + set_locale, +) + +import pandas._libs.testing as _testing +from pandas._typing import FilePathOrBuffer, FrameOrSeries +from pandas.compat import _get_lzma_file, _import_lzma + +from pandas.core.dtypes.common import ( + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) +from pandas.core.algorithms import take_1d +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + period_array, +) + +from pandas.io.common import urlopen +from pandas.io.formats.printing import pprint_thing + +lzma = _import_lzma() + +N = 30 +K = 4 +_RAISE_NETWORK_ERROR_DEFAULT = False + +# set testing_mode +_testing_mode_warnings = (DeprecationWarning, ResourceWarning) + + +def set_testing_mode(): + # set the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) + + +def reset_testing_mode(): + # reset the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) + + +set_testing_mode() + + +def reset_display_options(): + """ + Reset the display options for printing and representing objects. + """ + pd.reset_option("^display.", silent=True) + + +def round_trip_pickle( + obj: Any, path: Optional[FilePathOrBuffer] = None +) -> FrameOrSeries: + """ + Pickle an object and then read it again. + + Parameters + ---------- + obj : any object + The object to pickle and then re-read. + path : str, path object or file-like object, default None + The path where the pickled object is written and then read. + + Returns + ------- + pandas object + The original object that was pickled and then re-read. + """ + _path = path + if _path is None: + _path = f"__{rands(10)}__.pickle" + with ensure_clean(_path) as temp_path: + pd.to_pickle(obj, temp_path) + return pd.read_pickle(temp_path) + + +def round_trip_pathlib(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a pathlib.Path and read it back + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. 
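# --- Illustrative test sketch (pandas._testing is a private helper module) ---
# round_trip_pickle pairs ensure_clean with to_pickle/read_pickle, turning a
# serialization round-trip check into a one-liner:
import pandas as pd
import pandas._testing as tm

ser = pd.Series([1, 2, 3], name="x")
result = tm.round_trip_pickle(ser)
tm.assert_series_equal(result, ser)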
+ """ + import pytest + + Path = pytest.importorskip("pathlib").Path + if path is None: + path = "___pathlib___" + with ensure_clean(path) as path: + writer(Path(path)) + obj = reader(Path(path)) + return obj + + +def round_trip_localpath(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a py.path LocalPath and read it back. + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + LocalPath = pytest.importorskip("py.path").local + if path is None: + path = "___localpath___" + with ensure_clean(path) as path: + writer(LocalPath(path)) + obj = reader(LocalPath(path)) + return obj + + +@contextmanager +def decompress_file(path, compression): + """ + Open a compressed file and return a file object. + + Parameters + ---------- + path : str + The path where the file is read from. + + compression : {'gzip', 'bz2', 'zip', 'xz', None} + Name of the decompression to use + + Returns + ------- + file object + """ + if compression is None: + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = _get_lzma_file(lzma)(path, "rb") + elif compression == "zip": + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + try: + yield f + finally: + f.close() + if compression == "zip": + zip_file.close() + + +def write_to_compressed(compression, path, data, dest="test"): + """ + Write data to a compressed file. + + Parameters + ---------- + compression : {'gzip', 'bz2', 'zip', 'xz'} + The compression type to use. + path : str + The file path to write the data. + data : str + The data to write. + dest : str, default "test" + The destination file (for ZIP only) + + Raises + ------ + ValueError : An invalid compression value was passed in. + """ + if compression == "zip": + import zipfile + + compress_method = zipfile.ZipFile + elif compression == "gzip": + import gzip + + compress_method = gzip.GzipFile + elif compression == "bz2": + import bz2 + + compress_method = bz2.BZ2File + elif compression == "xz": + compress_method = _get_lzma_file(lzma) + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + if compression == "zip": + mode = "w" + args = (dest, data) + method = "writestr" + else: + mode = "wb" + args = (data,) + method = "write" + + with compress_method(path, mode=mode) as f: + getattr(f, method)(*args) + + +def assert_almost_equal( + left, + right, + check_dtype: Union[bool, str] = "equiv", + check_less_precise: Union[bool, int] = False, + **kwargs, +): + """ + Check that the left and right objects are approximately equal. + + By approximately equal, we refer to objects that are numbers or that + contain numbers which may be equivalent to specific levels of precision. + + Parameters + ---------- + left : object + right : object + check_dtype : bool or {'equiv'}, default 'equiv' + Check dtype if both a and b are the same type. 
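# --- Illustrative test sketch (private helpers defined in this module) ---
# write_to_compressed and decompress_file are the two halves of a compression
# round trip: write raw bytes with the chosen codec, then read them back:
import pandas._testing as tm

payload = b"a,b\n1,2\n"
with tm.ensure_clean("data.csv.gz") as path:
    tm.write_to_compressed("gzip", path, payload)
    with tm.decompress_file(path, compression="gzip") as fh:
        assert fh.read() == payload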
If 'equiv' is passed in, + then `RangeIndex` and `Int64Index` are also considered equivalent + when doing type checking. + check_less_precise : bool or int, default False + Specify comparison precision. 5 digits (False) or 3 digits (True) + after decimal points are compared. If int, then specify the number + of digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + """ + if isinstance(left, pd.Index): + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.Series): + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.DataFrame): + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + else: + # Other sequences. + if check_dtype: + if is_number(left) and is_number(right): + # Do not compare numeric classes, like np.float64 and float. + pass + elif is_bool(left) and is_bool(right): + # Do not compare bool classes, like np.bool_ and bool. + pass + else: + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): + obj = "numpy array" + else: + obj = "Input" + assert_class_equal(left, right, obj=obj) + _testing.assert_almost_equal( + left, + right, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + +def _check_isinstance(left, right, cls): + """ + Helper method for our assert_* methods that ensures that + the two objects being compared have the right type before + proceeding with the comparison. + + Parameters + ---------- + left : The first object being compared. + right : The second object being compared. + cls : The class type to check against. + + Raises + ------ + AssertionError : Either `left` or `right` is not an instance of `cls`. + """ + cls_name = cls.__name__ + + if not isinstance(left, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(left)} instead" + ) + if not isinstance(right, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(right)} instead" + ) + + +def assert_dict_equal(left, right, compare_keys: bool = True): + + _check_isinstance(left, right, dict) + _testing.assert_dict_equal(left, right, compare_keys=compare_keys) + + +def randbool(size=(), p: float = 0.5): + return rand(*size) <= p + + +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) + + +def rands_array(nchars, size, dtype="O"): + """ + Generate an array of byte strings. + """ + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def randu_array(nchars, size, dtype="O"): + """ + Generate an array of unicode strings. 
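# --- Illustrative test sketch (private helpers defined in this module) ---
# assert_almost_equal dispatches on the input type (Index/Series/DataFrame or
# plain sequences) and compares numbers to limited precision; rands_array
# generates throwaway string data for such tests:
import pandas as pd
import pandas._testing as tm

tm.assert_almost_equal(pd.Series([0.3000001, 1.0]), pd.Series([0.3, 1.0]))
labels = tm.rands_array(nchars=8, size=4)        # four random 8-character strings
assert labels.shape == (4,)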
+ """ + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def rands(nchars): + """ + Generate one random byte string. + + See `rands_array` if you want to create an array of random strings. + + """ + return "".join(np.random.choice(RANDS_CHARS, nchars)) + + +def randu(nchars): + """ + Generate one random unicode string. + + See `randu_array` if you want to create an array of random unicode strings. + + """ + return "".join(np.random.choice(RANDU_CHARS, nchars)) + + +def close(fignum=None): + from matplotlib.pyplot import get_fignums, close as _close + + if fignum is None: + for fignum in get_fignums(): + _close(fignum) + else: + _close(fignum) + + +# ----------------------------------------------------------------------------- +# contextmanager to ensure the file cleanup + + +@contextmanager +def ensure_clean(filename=None, return_filelike=False): + """ + Gets a temporary path and agrees to remove on close. + + Parameters + ---------- + filename : str (optional) + if None, creates a temporary file which is then removed when out of + scope. if passed, creates temporary file with filename as ending. + return_filelike : bool (default False) + if True, returns a file-like which is *always* cleaned. Necessary for + savefig and other functions which want to append extensions. + """ + filename = filename or "" + fd = None + + if return_filelike: + f = tempfile.TemporaryFile(suffix=filename) + try: + yield f + finally: + f.close() + else: + # don't generate tempfile if using a path with directory specified + if len(os.path.dirname(filename)): + raise ValueError("Can't pass a qualified name to ensure_clean()") + + try: + fd, filename = tempfile.mkstemp(suffix=filename) + except UnicodeEncodeError: + import pytest + + pytest.skip("no unicode file names on this system") + + try: + yield filename + finally: + try: + os.close(fd) + except OSError: + print(f"Couldn't close file descriptor: {fd} (file: {filename})") + try: + if os.path.exists(filename): + os.remove(filename) + except OSError as e: + print(f"Exception on removing file: {e}") + + +@contextmanager +def ensure_clean_dir(): + """ + Get a temporary directory path and agrees to remove on close. + + Yields + ------ + Temporary directory path + """ + directory_name = tempfile.mkdtemp(suffix="") + try: + yield directory_name + finally: + try: + rmtree(directory_name) + except OSError: + pass + + +@contextmanager +def ensure_safe_environment_variables(): + """ + Get a context manager to safely set environment variables + + All changes will be undone on close, hence environment variables set + within this contextmanager will neither persist nor change global state. + """ + saved_environ = dict(os.environ) + try: + yield + finally: + os.environ.clear() + os.environ.update(saved_environ) + + +# ----------------------------------------------------------------------------- +# Comparators + + +def equalContents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + +def assert_index_equal( + left: Index, + right: Index, + exact: Union[bool, str] = "equiv", + check_names: bool = True, + check_less_precise: Union[bool, int] = False, + check_exact: bool = True, + check_categorical: bool = True, + obj: str = "Index", +) -> None: + """ + Check that left and right Index are equal. 
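# --- Illustrative test sketch (private helpers defined in this module) ---
# ensure_clean yields a temporary file path and removes the file when the block
# exits, so IO round-trip tests never leave artifacts behind:
import pandas as pd
import pandas._testing as tm

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
with tm.ensure_clean("roundtrip.csv") as path:
    df.to_csv(path, index=False)
    result = pd.read_csv(path)
tm.assert_frame_equal(result, df)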
+ + Parameters + ---------- + left : Index + right : Index + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + check_names : bool, default True + Whether to check the names attribute. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default True + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + def _check_types(l, r, obj="Index"): + if exact: + assert_class_equal(l, r, exact=exact, obj=obj) + + # Skip exact dtype checking when `check_categorical` is False + if check_categorical: + assert_attr_equal("dtype", l, r, obj=obj) + + # allow string-like to have different inferred_types + if l.inferred_type in ("string", "unicode"): + assert r.inferred_type in ("string", "unicode") + else: + assert_attr_equal("inferred_type", l, r, obj=obj) + + def _get_ilevel_values(index, level): + # accept level number only + unique = index.levels[level] + level_codes = index.codes[level] + filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) + values = unique._shallow_copy(filled, name=index.names[level]) + return values + + # instance validation + _check_isinstance(left, right, Index) + + # class / dtype comparison + _check_types(left, right, obj=obj) + + # level comparison + if left.nlevels != right.nlevels: + msg1 = f"{obj} levels are different" + msg2 = f"{left.nlevels}, {left}" + msg3 = f"{right.nlevels}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # length comparison + if len(left) != len(right): + msg1 = f"{obj} length are different" + msg2 = f"{len(left)}, {left}" + msg3 = f"{len(right)}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # MultiIndex special comparison for little-friendly error messages + if left.nlevels > 1: + left = cast(MultiIndex, left) + right = cast(MultiIndex, right) + + for level in range(left.nlevels): + # cannot use get_level_values here because it can change dtype + llevel = _get_ilevel_values(left, level) + rlevel = _get_ilevel_values(right, level) + + lobj = f"MultiIndex level [{level}]" + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=lobj, + ) + # get_level_values may change dtype + _check_types(left.levels[level], right.levels[level], obj=obj) + + # skip exact index checking when `check_categorical` is False + if check_exact and check_categorical: + if not left.equals(right): + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + else: + _testing.assert_almost_equal( + left.values, + right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, + lobj=left, + robj=right, + ) + + # metadata comparison + if check_names: + assert_attr_equal("names", left, right, obj=obj) + if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): + assert_attr_equal("freq", left, 
right, obj=obj) + if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): + assert_interval_array_equal(left.values, right.values) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): + """ + Checks classes are equal. + """ + __tracebackhide__ = True + + def repr_class(x): + if isinstance(x, Index): + # return Index as it is to include values in the error message + return x + + try: + return type(x).__name__ + except AttributeError: + return repr(type(x)) + + if exact == "equiv": + if type(left) != type(right): + # allow equivalence of Int64Index/RangeIndex + types = {type(left).__name__, type(right).__name__} + if len(types - {"Int64Index", "RangeIndex"}): + msg = f"{obj} classes are not equivalent" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + elif exact: + if type(left) != type(right): + msg = f"{obj} classes are different" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + + +def assert_attr_equal(attr, left, right, obj="Attributes"): + """checks attributes are equal. Both objects must have attribute. + + Parameters + ---------- + attr : str + Attribute name being compared. + left : object + right : object + obj : str, default 'Attributes' + Specify object name being compared, internally used to show appropriate + assertion message + """ + __tracebackhide__ = True + + left_attr = getattr(left, attr) + right_attr = getattr(right, attr) + + if left_attr is right_attr: + return True + elif ( + is_number(left_attr) + and np.isnan(left_attr) + and is_number(right_attr) + and np.isnan(right_attr) + ): + # np.nan + return True + + try: + result = left_attr == right_attr + except TypeError: + # datetimetz on rhs may raise TypeError + result = False + if not isinstance(result, bool): + result = result.all() + + if result: + return True + else: + msg = f'Attribute "{attr}" are different' + raise_assert_detail(obj, msg, left_attr, right_attr) + + +def assert_is_valid_plot_return_object(objs): + import matplotlib.pyplot as plt + + if isinstance(objs, (pd.Series, np.ndarray)): + for el in objs.ravel(): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {repr(type(el).__name__)}" + ) + assert isinstance(el, (plt.Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "ArtistArtist instance, tuple, or dict, 'objs' is a " + f"{repr(type(objs).__name__)}" + ) + assert isinstance(objs, (plt.Artist, tuple, dict)), msg + + +def isiterable(obj): + return hasattr(obj, "__iter__") + + +def assert_is_sorted(seq): + """Assert that the sequence is sorted.""" + if isinstance(seq, (Index, Series)): + seq = seq.values + # sorting does not change precisions + assert_numpy_array_equal(seq, np.sort(np.array(seq))) + + +def assert_categorical_equal( + left, right, check_dtype=True, check_category_order=True, obj="Categorical" +): + """Test that Categoricals are equivalent. + + Parameters + ---------- + left : Categorical + right : Categorical + check_dtype : bool, default True + Check that integer dtype of the codes are the same + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. 
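# --- Illustrative test sketch (private helpers defined in this module) ---
# With the default exact="equiv", class checking treats RangeIndex and
# Int64Index holding the same values as equal; exact=True insists on the class:
import pandas as pd
import pandas._testing as tm

tm.assert_index_equal(pd.RangeIndex(3), pd.Index([0, 1, 2]))            # passes
try:
    tm.assert_index_equal(pd.RangeIndex(3), pd.Index([0, 1, 2]), exact=True)
except AssertionError as err:
    print(err)                                   # "Index classes are different"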
+ obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, Categorical) + + if check_category_order: + assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") + assert_numpy_array_equal( + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + ) + else: + assert_index_equal( + left.categories.sort_values(), + right.categories.sort_values(), + obj=f"{obj}.categories", + ) + assert_index_equal( + left.categories.take(left.codes), + right.categories.take(right.codes), + obj=f"{obj}.values", + ) + + assert_attr_equal("ordered", left, right, obj=obj) + + +def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): + """Test that two IntervalArrays are equivalent. + + Parameters + ---------- + left, right : IntervalArray + The IntervalArrays to compare. + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + obj : str, default 'IntervalArray' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, IntervalArray) + + assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") + assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") + assert_attr_equal("closed", left, right, obj=obj) + + +def assert_period_array_equal(left, right, obj="PeriodArray"): + _check_isinstance(left, right, PeriodArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values") + assert_attr_equal("freq", left, right, obj=obj) + + +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): + __tracebackhide__ = True + _check_isinstance(left, right, DatetimeArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("tz", left, right, obj=obj) + + +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): + __tracebackhide__ = True + _check_isinstance(left, right, TimedeltaArray) + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + + +def raise_assert_detail(obj, message, left, right, diff=None): + __tracebackhide__ = True + + if isinstance(left, np.ndarray): + left = pprint_thing(left) + elif is_categorical_dtype(left): + left = repr(left) + + if isinstance(right, np.ndarray): + right = pprint_thing(right) + elif is_categorical_dtype(right): + right = repr(right) + + msg = f"""{obj} are different + +{message} +[left]: {left} +[right]: {right}""" + + if diff is not None: + msg += f"\n[diff]: {diff}" + + raise AssertionError(msg) + + +def assert_numpy_array_equal( + left, + right, + strict_nan=False, + check_dtype=True, + err_msg=None, + check_same=None, + obj="numpy array", +): + """ + Check that 'np.ndarray' is equivalent. + + Parameters + ---------- + left, right : numpy.ndarray or iterable + The two arrays to be compared. + strict_nan : bool, default False + If True, consider NaN and None to be different. + check_dtype : bool, default True + Check dtype if both a and b are np.ndarray. + err_msg : str, default None + If provided, used as assertion message. + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area. 
+ obj : str, default 'numpy array' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + # Show a detailed error message when classes are different + assert_class_equal(left, right, obj=obj) + # both classes must be an np.ndarray + _check_isinstance(left, right, np.ndarray) + + def _get_base(obj): + return obj.base if getattr(obj, "base", None) is not None else obj + + left_base = _get_base(left) + right_base = _get_base(right) + + if check_same == "same": + if left_base is not right_base: + raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") + elif check_same == "copy": + if left_base is right_base: + raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") + + def _raise(left, right, err_msg): + if err_msg is None: + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shapes are different", left.shape, right.shape, + ) + + diff = 0 + for l, r in zip(left, right): + # count up differences + if not array_equivalent(l, r, strict_nan=strict_nan): + diff += 1 + + diff = diff * 100.0 / left.size + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + + raise AssertionError(err_msg) + + # compare shape and values + if not array_equivalent(left, right, strict_nan=strict_nan): + _raise(left, right, err_msg) + + if check_dtype: + if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): + assert_attr_equal("dtype", left, right, obj=obj) + + +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): + """Check that left and right ExtensionArrays are equal. + + Parameters + ---------- + left, right : ExtensionArray + The two arrays to compare + check_dtype : bool, default True + Whether to check if the ExtensionArray dtypes are identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default False + Whether to compare number exactly. + + Notes + ----- + Missing values are checked separately from valid values. + A mask of missing values is computed for each and checked to match. + The remaining all-valid values are cast to object dtype and checked. 
+ """ + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" + if check_dtype: + assert_attr_equal("dtype", left, right, obj="ExtensionArray") + + if hasattr(left, "asi8") and type(right) == type(left): + # Avoid slow object-dtype comparisons + assert_numpy_array_equal(left.asi8, right.asi8) + return + + left_na = np.asarray(left.isna()) + right_na = np.asarray(right.isna()) + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") + + left_valid = np.asarray(left[~left_na].astype(object)) + right_valid = np.asarray(right[~right_na].astype(object)) + if check_exact: + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") + else: + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) + + +# This could be refactored to use the NDFrame.equals method +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): + """ + Check that left and right Series are equal. + + Parameters + ---------- + left : Series + right : Series + check_dtype : bool, default True + Whether to check the Series dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_series_type : bool, default True + Whether to check the Series class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Series' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, Series) + + if check_series_type: + # ToDo: There are some tests using rhs is sparse + # lhs is dense. 
Should use assert_class_equal in future + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # length comparison + if len(left) != len(right): + msg1 = f"{len(left)}, {left.index}" + msg2 = f"{len(right)}, {right.index}" + raise_assert_detail(obj, "Series length are different", msg1, msg2) + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + if check_dtype: + # We want to skip exact dtype checking when `check_categorical` + # is False. We'll still raise if only one is a `Categorical`, + # regardless of `check_categorical` + if ( + is_categorical_dtype(left) + and is_categorical_dtype(right) + and not check_categorical + ): + pass + else: + assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") + + if check_exact: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + obj=str(obj), + ) + elif check_datetimelike_compat: + # we want to check only if we have compat dtypes + # e.g. integer and M|m are NOT compat, but we can simply check + # the values in that case + if needs_i8_conversion(left) or needs_i8_conversion(right): + + # datetimelike may have different objects (e.g. datetime.datetime + # vs Timestamp) but will compare equal + if not Index(left.values).equals(Index(right.values)): + msg = ( + f"[datetimelike_compat=True] {left.values} " + f"is not equal to {right.values}." + ) + raise AssertionError(msg) + else: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + ) + elif is_interval_dtype(left) or is_interval_dtype(right): + assert_interval_array_equal(left.array, right.array) + elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): + # .values is an ndarray, but ._values is the ExtensionArray. + # TODO: Use .array + assert is_extension_array_dtype(right.dtype) + assert_extension_array_equal(left._values, right._values) + elif ( + is_extension_array_dtype(left) + and not is_categorical_dtype(left) + and is_extension_array_dtype(right) + and not is_categorical_dtype(right) + ): + assert_extension_array_equal(left.array, right.array) + else: + _testing.assert_almost_equal( + left._internal_get_values(), + right._internal_get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj=str(obj), + ) + + # metadata comparison + if check_names: + assert_attr_equal("name", left, right, obj=obj) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +# This could be refactored to use the NDFrame.equals method +def assert_frame_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_column_type="equiv", + check_frame_type=True, + check_less_precise=False, + check_names=True, + by_blocks=False, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_like=False, + obj="DataFrame", +): + """ + Check that left and right DataFrame are equal. + + This function is intended to compare two DataFrames and output any + differences. Is is mostly intended for use in unit tests. + Additional parameters allow varying the strictness of the + equality checks performed. 
+ + Parameters + ---------- + left : DataFrame + First DataFrame to compare. + right : DataFrame + Second DataFrame to compare. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool or {'equiv'}, default 'equiv' + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + by_blocks : bool, default False + Specify how to compare internal data. If False, compare by columns. + If True, compare by blocks. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. + Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + See Also + -------- + assert_series_equal : Equivalent method for asserting Series equality. + DataFrame.equals : Check DataFrame equality. + + Examples + -------- + This example shows comparing two DataFrames that are equal + but with columns of differing dtypes. + + >>> from pandas._testing import assert_frame_equal + >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) + >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + + df1 equals itself. + + >>> assert_frame_equal(df1, df1) + + df1 differs from df2 as column 'b' is of a different type. + + >>> assert_frame_equal(df1, df2) + Traceback (most recent call last): + ... + AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different + + Attribute "dtype" are different + [left]: int64 + [right]: float64 + + Ignore differing dtypes in columns with check_dtype. 
+ + >>> assert_frame_equal(df1, df2, check_dtype=False) + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, DataFrame) + + if check_frame_type: + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # shape comparison + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + ) + + if check_like: + left, right = left.reindex_like(right), right + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + # column comparison + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.columns", + ) + + # compare by blocks + if by_blocks: + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() + for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): + assert dtype in lblocks + assert dtype in rblocks + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) + + # compare by columns + else: + for i, col in enumerate(left.columns): + assert col in right + lcol = left.iloc[:, i] + rcol = right.iloc[:, i] + assert_series_equal( + lcol, + rcol, + check_dtype=check_dtype, + check_index_type=check_index_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_names=check_names, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + obj=f'{obj}.iloc[:, {i}] (column name="{col}")', + ) + + +def assert_equal(left, right, **kwargs): + """ + Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. + + Parameters + ---------- + left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray + The two items to be compared. + **kwargs + All keyword arguments are passed through to the underlying assert method. + """ + __tracebackhide__ = True + + if isinstance(left, pd.Index): + assert_index_equal(left, right, **kwargs) + elif isinstance(left, pd.Series): + assert_series_equal(left, right, **kwargs) + elif isinstance(left, pd.DataFrame): + assert_frame_equal(left, right, **kwargs) + elif isinstance(left, IntervalArray): + assert_interval_array_equal(left, right, **kwargs) + elif isinstance(left, PeriodArray): + assert_period_array_equal(left, right, **kwargs) + elif isinstance(left, DatetimeArray): + assert_datetime_array_equal(left, right, **kwargs) + elif isinstance(left, TimedeltaArray): + assert_timedelta_array_equal(left, right, **kwargs) + elif isinstance(left, ExtensionArray): + assert_extension_array_equal(left, right, **kwargs) + elif isinstance(left, np.ndarray): + assert_numpy_array_equal(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + assert left == right + else: + raise NotImplementedError(type(left)) + + +def box_expected(expected, box_cls, transpose=True): + """ + Helper function to wrap the expected output of a test in a given box_class. 
+
+    Parameters
+    ----------
+    expected : np.ndarray, Index, Series
+    box_cls : {Index, Series, DataFrame}
+
+    Returns
+    -------
+    subclass of box_cls
+    """
+    if box_cls is pd.Index:
+        expected = pd.Index(expected)
+    elif box_cls is pd.Series:
+        expected = pd.Series(expected)
+    elif box_cls is pd.DataFrame:
+        expected = pd.Series(expected).to_frame()
+        if transpose:
+            # for vector operations, we need a DataFrame to be a single-row,
+            # not a single-column, in order to operate against non-DataFrame
+            # vectors of the same length.
+            expected = expected.T
+    elif box_cls is PeriodArray:
+        # the PeriodArray constructor is not as flexible as period_array
+        expected = period_array(expected)
+    elif box_cls is DatetimeArray:
+        expected = DatetimeArray(expected)
+    elif box_cls is TimedeltaArray:
+        expected = TimedeltaArray(expected)
+    elif box_cls is np.ndarray:
+        expected = np.array(expected)
+    elif box_cls is to_array:
+        expected = to_array(expected)
+    else:
+        raise NotImplementedError(box_cls)
+    return expected
+
+
+def to_array(obj):
+    # temporary implementation until we get pd.array in place
+    if is_period_dtype(obj):
+        return period_array(obj)
+    elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj):
+        return DatetimeArray._from_sequence(obj)
+    elif is_timedelta64_dtype(obj):
+        return TimedeltaArray._from_sequence(obj)
+    else:
+        return np.array(obj)
+
+
+# -----------------------------------------------------------------------------
+# Sparse
+
+
+def assert_sp_array_equal(
+    left,
+    right,
+    check_dtype=True,
+    check_kind=True,
+    check_fill_value=True,
+    consolidate_block_indices=False,
+):
+    """Check that the left and right SparseArray are equal.
+
+    Parameters
+    ----------
+    left : SparseArray
+    right : SparseArray
+    check_dtype : bool, default True
+        Whether to check the data dtype is identical.
+    check_kind : bool, default True
+        Whether to check just the kind of the sparse index for each column.
+    check_fill_value : bool, default True
+        Whether to check that left.fill_value matches right.fill_value
+    consolidate_block_indices : bool, default False
+        Whether to consolidate contiguous blocks for sparse arrays with
+        a BlockIndex. Some operations, e.g. concat, will end up with
+        block indices that could be consolidated. Setting this to true will
+        create a new BlockIndex for that array, with consolidated
+        block indices.
+    """
+
+    _check_isinstance(left, right, pd.arrays.SparseArray)
+
+    assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype)
+
+    # SparseIndex comparison
+    assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex)
+    assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex)
+
+    if not check_kind:
+        left_index = left.sp_index.to_block_index()
+        right_index = right.sp_index.to_block_index()
+    else:
+        left_index = left.sp_index
+        right_index = right.sp_index
+
+    if consolidate_block_indices and left.kind == "block":
+        # we'll probably remove this hack...
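+        # Round-tripping BlockIndex -> IntIndex -> BlockIndex merges contiguous
+        # blocks, so arrays holding identical data split across differently
+        # shaped blocks still compare equal below.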
+ left_index = left_index.to_int_index().to_block_index() + right_index = right_index.to_int_index().to_block_index() + + if not left_index.equals(right_index): + raise_assert_detail( + "SparseArray.index", "index are not equal", left_index, right_index + ) + else: + # Just ensure a + pass + + if check_fill_value: + assert_attr_equal("fill_value", left, right) + if check_dtype: + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + + +# ----------------------------------------------------------------------------- +# Others + + +def assert_contains_all(iterable, dic): + for k in iterable: + assert k in dic, f"Did not contain item: {repr(k)}" + + +def assert_copy(iter1, iter2, **eql_kwargs): + """ + iter1, iter2: iterables that produce elements + comparable with assert_almost_equal + + Checks that the elements are equal, but not + the same object. (Does not check that items + in sequences are also not the same object) + """ + for elem1, elem2 in zip(iter1, iter2): + assert_almost_equal(elem1, elem2, **eql_kwargs) + msg = ( + f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " + "different objects, but they were the same object." + ) + assert elem1 is not elem2, msg + + +def getCols(k): + return string.ascii_uppercase[:k] + + +# make index +def makeStringIndex(k=10, name=None): + return Index(rands_array(nchars=10, size=k), name=name) + + +def makeUnicodeIndex(k=10, name=None): + return Index(randu_array(nchars=10, size=k), name=name) + + +def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): + """ make a length k index or n categories """ + x = rands_array(nchars=4, size=n) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) + + +def makeIntervalIndex(k=10, name=None, **kwargs): + """ make a length k IntervalIndex """ + x = np.linspace(0, 100, num=(k + 1)) + return IntervalIndex.from_breaks(x, name=name, **kwargs) + + +def makeBoolIndex(k=10, name=None): + if k == 1: + return Index([True], name=name) + elif k == 2: + return Index([False, True], name=name) + return Index([False, True] + [False] * (k - 2), name=name) + + +def makeIntIndex(k=10, name=None): + return Index(list(range(k)), name=name) + + +def makeUIntIndex(k=10, name=None): + return Index([2 ** 63 + i for i in range(k)], name=name) + + +def makeRangeIndex(k=10, name=None, **kwargs): + return RangeIndex(0, k, 1, name=name, **kwargs) + + +def makeFloatIndex(k=10, name=None): + values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) + return Index(values * (10 ** np.random.randint(0, 9)), name=name) + + +def makeDateIndex(k=10, freq="B", name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = bdate_range(dt, periods=k, freq=freq, name=name) + return DatetimeIndex(dr, name=name, **kwargs) + + +def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): + return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) + + +def makePeriodIndex(k=10, name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) + return dr + + +def makeMultiIndex(k=10, names=None, **kwargs): + return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) + + +_names = [ + "Alice", + "Bob", + "Charlie", + "Dan", + "Edith", + "Frank", + "George", + "Hannah", + "Ingrid", + "Jerry", + "Kevin", + "Laura", + "Michael", + "Norbert", + "Oliver", + "Patricia", + "Quinn", + "Ray", + 
"Sarah", + "Tim", + "Ursula", + "Victor", + "Wendy", + "Xavier", + "Yvonne", + "Zelda", +] + + +def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): + """ + Make a DataFrame with a DatetimeIndex + + Parameters + ---------- + start : str or Timestamp, default "2000-01-01" + The start of the index. Passed to date_range with `freq`. + end : str or Timestamp, default "2000-12-31" + The end of the index. Passed to date_range with `freq`. + freq : str or Freq + The frequency to use for the DatetimeIndex + seed : int, optional + The random state seed. + + * name : object dtype with string names + * id : int dtype with + * x, y : float dtype + + Examples + -------- + >>> _make_timeseries() + id name x y + timestamp + 2000-01-01 982 Frank 0.031261 0.986727 + 2000-01-02 1025 Edith -0.086358 -0.032920 + 2000-01-03 982 Edith 0.473177 0.298654 + 2000-01-04 1009 Sarah 0.534344 -0.750377 + 2000-01-05 963 Zelda -0.271573 0.054424 + ... ... ... ... ... + 2000-12-27 980 Ingrid -0.132333 -0.422195 + 2000-12-28 972 Frank -0.376007 -0.298687 + 2000-12-29 1009 Ursula -0.865047 -0.503133 + 2000-12-30 1000 Hannah -0.063757 -0.507336 + 2000-12-31 972 Tim -0.869120 0.531685 + """ + index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") + n = len(index) + state = np.random.RandomState(seed) + columns = { + "name": state.choice(_names, size=n), + "id": state.poisson(1000, size=n), + "x": state.rand(n) * 2 - 1, + "y": state.rand(n) * 2 - 1, + } + df = pd.DataFrame(columns, index=index, columns=sorted(columns)) + if df.index[-1] == end: + df = df.iloc[:-1] + return df + + +def all_index_generator(k=10): + """Generator which can be iterated over to get instances of all the various + index classes. + + Parameters + ---------- + k: length of each of the index instances + """ + all_make_index_funcs = [ + makeIntIndex, + makeFloatIndex, + makeStringIndex, + makeUnicodeIndex, + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeBoolIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + ] + for make_index_func in all_make_index_funcs: + yield make_index_func(k=k) + + +def index_subclass_makers_generator(): + make_index_funcs = [ + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + makeMultiIndex, + ] + for make_index_func in make_index_funcs: + yield make_index_func + + +def all_timeseries_index_generator(k=10): + """Generator which can be iterated over to get instances of all the classes + which represent time-series. 
+ + Parameters + ---------- + k: length of each of the index instances + """ + make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] + for make_index_func in make_index_funcs: + yield make_index_func(k=k) + + +# make series +def makeFloatSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeStringSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeObjectSeries(name=None): + data = makeStringIndex(N) + data = Index(data, dtype=object) + index = makeStringIndex(N) + return Series(data, index=index, name=name) + + +def getSeriesData(): + index = makeStringIndex(N) + return {c: Series(randn(N), index=index) for c in getCols(K)} + + +def makeTimeSeries(nper=None, freq="B", name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) + + +def makePeriodSeries(nper=None, name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makePeriodIndex(nper), name=name) + + +def getTimeSeriesData(nper=None, freq="B"): + return {c: makeTimeSeries(nper, freq) for c in getCols(K)} + + +def getPeriodData(nper=None): + return {c: makePeriodSeries(nper) for c in getCols(K)} + + +# make frame +def makeTimeDataFrame(nper=None, freq="B"): + data = getTimeSeriesData(nper, freq) + return DataFrame(data) + + +def makeDataFrame(): + data = getSeriesData() + return DataFrame(data) + + +def getMixedTypeDict(): + index = Index(["a", "b", "c", "d", "e"]) + + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + + return index, data + + +def makeMixedDataFrame(): + return DataFrame(getMixedTypeDict()[1]) + + +def makePeriodFrame(nper=None): + data = getPeriodData(nper) + return DataFrame(data) + + +def makeCustomIndex( + nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None +): + """Create an index/multindex with given dimensions, levels, names, etc' + + nentries - number of entries in index + nlevels - number of levels (> 1 produces multindex) + prefix - a string prefix for labels + names - (Optional), bool or list of strings. if True will use default + names, if false will use no names, if a list is given, the name of + each level in the index will be taken from the list. + ndupe_l - (Optional), list of ints, the number of rows for which the + label will repeated at the corresponding level, you can specify just + the first few, the rest will use the default ndupe_l of 1. + len(ndupe_l) <= nlevels. + idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". + If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a datetime index. + + if unspecified, string labels will be generated. 
+ """ + + if ndupe_l is None: + ndupe_l = [1] * nlevels + assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels + assert names is None or names is False or names is True or len(names) is nlevels + assert idx_type is None or ( + idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 + ) + + if names is True: + # build default names + names = [prefix + str(i) for i in range(nlevels)] + if names is False: + # pass None to index constructor for no name + names = None + + # make singleton case uniform + if isinstance(names, str) and nlevels == 1: + names = [names] + + # specific 1D index type requested? + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) + if idx_func: + idx = idx_func(nentries) + # but we need to fill in the name + if names: + idx.name = names[0] + return idx + elif idx_type is not None: + raise ValueError( + f"{repr(idx_type)} is not a legal value for `idx_type`, " + "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." + ) + + if len(ndupe_l) < nlevels: + ndupe_l.extend([1] * (nlevels - len(ndupe_l))) + assert len(ndupe_l) == nlevels + + assert all(x > 0 for x in ndupe_l) + + tuples = [] + for i in range(nlevels): + + def keyfunc(x): + import re + + numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") + return [int(num) for num in numeric_tuple] + + # build a list of lists to create the index from + div_factor = nentries // ndupe_l[i] + 1 + cnt = Counter() + for j in range(div_factor): + label = f"{prefix}_l{i}_g{j}" + cnt[label] = ndupe_l[i] + # cute Counter trick + result = sorted(cnt.elements(), key=keyfunc)[:nentries] + tuples.append(result) + + tuples = list(zip(*tuples)) + + # convert tuples to index + if nentries == 1: + # we have a single level of tuples, i.e. a regular Index + index = Index(tuples[0], name=names[0]) + elif nlevels == 1: + name = None if names is None else names[0] + index = Index((x[0] for x in tuples), name=name) + else: + index = MultiIndex.from_tuples(tuples, names=names) + return index + + +def makeCustomDataframe( + nrows, + ncols, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + nrows, ncols - number of data rows/cols + c_idx_names, idx_names - False/True/list of strings, yields No names , + default names or uses the provided names for the levels of the + corresponding index. You can provide a single string when + c_idx_nlevels ==1. + c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex + r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex + data_gen_f - a function f(row,col) which return the data value + at that position, the default generator used yields values of the form + "RxCy" based on position. + c_ndupe_l, r_ndupe_l - list of integers, determines the number + of duplicates for each label at a given level of the corresponding + index. The default `None` value produces a multiplicity of 1 across + all levels, i.e. a unique index. Will accept a partial list of length + N < idx_nlevels, for just the first N levels. If ndupe doesn't divide + nrows/ncol, the last label might have lower multiplicity. + dtype - passed to the DataFrame constructor as is, in case you wish to + have more control in conjunction with a custom `data_gen_f` + r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". 
+ If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a timedelta index. + + if unspecified, string labels will be generated. + + Examples: + + # 5 row, 3 columns, default names on both, single index on both axis + >> makeCustomDataframe(5,3) + + # make the data a random int between 1 and 100 + >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) + + # 2-level multiindex on rows with each label duplicated + # twice on first level, default names on both axis, single + # index on both axis + >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) + + # DatetimeIndex on row, index with unicode labels on columns + # no names on either axis + >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, + r_idx_type="dt",c_idx_type="u") + + # 4-level multindex on rows with names provided, 2-level multindex + # on columns with default labels and default names. + >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, + r_idx_names=["FEE","FI","FO","FAM"], + c_idx_nlevels=2) + + >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + """ + + assert c_idx_nlevels > 0 + assert r_idx_nlevels > 0 + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) + + # by default, generate data based on location + if data_gen_f is None: + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] + + return DataFrame(data, index, columns, dtype=dtype) + + +def _create_missing_idx(nrows, ncols, density, random_state=None): + if random_state is None: + random_state = np.random + else: + random_state = np.random.RandomState(random_state) + + # below is cribbed from scipy.sparse + size = int(np.round((1 - density) * nrows * ncols)) + # generate a few more to ensure unique values + min_rows = 5 + fac = 1.02 + extra_size = min(size + min_rows, fac * size) + + def _gen_unique_rand(rng, _extra_size): + ind = rng.rand(int(_extra_size)) + return np.unique(np.floor(ind * nrows * ncols))[:size] + + ind = _gen_unique_rand(random_state, extra_size) + while ind.size < size: + extra_size *= 1.05 + ind = _gen_unique_rand(random_state, extra_size) + + j = np.floor(ind * 1.0 / nrows).astype(int) + i = (ind - j * nrows).astype(int) + return i.tolist(), j.tolist() + + +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + Parameters + ---------- + Density : float, optional + Float in (0, 1) that gives the percentage of non-missing numbers in + the DataFrame. + random_state : {np.random.RandomState, int}, optional + Random number generator or random seed. + + See makeCustomDataframe for descriptions of the rest of the parameters. 
+ """ + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.values[i, j] = np.nan + return df + + +def makeMissingDataframe(density=0.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) + df.values[i, j] = np.nan + return df + + +def optional_args(decorator): + """allows a decorator to take optional positional and keyword arguments. + Assumes that taking a single, callable, positional argument means that + it is decorating a function, i.e. something like this:: + + @my_decorator + def function(): pass + + Calls decorator with decorator(f, *args, **kwargs)""" + + @wraps(decorator) + def wrapper(*args, **kwargs): + def dec(f): + return decorator(f, *args, **kwargs) + + is_decorating = not kwargs and len(args) == 1 and callable(args[0]) + if is_decorating: + f = args[0] + args = [] + return dec(f) + else: + return dec + + return wrapper + + +# skip tests on exceptions with this message +_network_error_messages = ( + # 'urlopen error timed out', + # 'timeout: timed out', + # 'socket.timeout: timed out', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", +) + +# or this e.errno/e.reason.errno +_network_errno_vals = ( + 101, # Network is unreachable + 111, # Connection refused + 110, # Connection timed out + 104, # Connection reset Error + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out +) + +# Both of the above shouldn't mask real issues such as 404's +# or refused connections (changed DNS). +# But some tests (test_data yahoo) contact incredibly flakey +# servers. + +# and conditionally raise on exception types in _get_default_network_errors + + +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): + """Try to connect to the given url. True if succeeds, False if IOError + raised + + Parameters + ---------- + url : basestring + The URL to try to connect to + + Returns + ------- + connectable : bool + Return True if no IOError (unable to connect) or URLError (bad url) was + raised + """ + + if error_classes is None: + error_classes = _get_default_network_errors() + + try: + with urlopen(url): + pass + except error_classes: + return False + else: + return True + + +@optional_args +def network( + t, + url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=None, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): + """ + Label a test as requiring network connection and, if an error is + encountered, only raise if it does not find a network connection. 
+ + In comparison to ``network``, this assumes an added contract to your test: + you must assert that, under normal conditions, your test will ONLY fail if + it does not have network connectivity. + + You can call this in 3 ways: as a standard decorator, with keyword + arguments, or with a positional argument that is the url to check. + + Parameters + ---------- + t : callable + The test requiring network connectivity. + url : path + The url to test via ``pandas.io.common.urlopen`` to check + for connectivity. Defaults to 'http://www.google.com'. + raise_on_error : bool + If True, never catches errors. + check_before_test : bool + If True, checks connectivity before running the test case. + error_classes : tuple or Exception + error classes to ignore. If not in ``error_classes``, raises the error. + defaults to IOError. Be careful about changing the error classes here. + skip_errnos : iterable of int + Any exception that has .errno or .reason.erno set to one + of these values will be skipped with an appropriate + message. + _skip_on_messages: iterable of string + any exception e for which one of the strings is + a substring of str(e) will be skipped with an appropriate + message. Intended to suppress errors where an errno isn't available. + + Notes + ----- + * ``raise_on_error`` supercedes ``check_before_test`` + + Returns + ------- + t : callable + The decorated test ``t``, with checks for connectivity errors. + + Example + ------- + + Tests decorated with @network will fail if it's possible to make a network + connection to another URL (defaults to google.com):: + + >>> from pandas._testing import network + >>> from pandas.io.common import urlopen + >>> @network + ... def test_network(): + ... with urlopen("rabbit://bonanza.com"): + ... pass + Traceback + ... + URLError: + + You can specify alternative URLs:: + + >>> @network("http://www.yahoo.com") + ... def test_something_with_yahoo(): + ... raise IOError("Failure Message") + >>> test_something_with_yahoo() + Traceback (most recent call last): + ... + IOError: Failure Message + + If you set check_before_test, it will check the url first and not run the + test on failure:: + + >>> @network("failing://url.blaher", check_before_test=True) + ... def test_something(): + ... print("I ran!") + ... raise ValueError("Failure") + >>> test_something() + Traceback (most recent call last): + ... + + Errors not related to networking will always be raised. 
+    """
+    from pytest import skip
+
+    if error_classes is None:
+        error_classes = _get_default_network_errors()
+
+    t.network = True
+
+    @wraps(t)
+    def wrapper(*args, **kwargs):
+        if check_before_test and not raise_on_error:
+            if not can_connect(url, error_classes):
+                skip()
+        try:
+            return t(*args, **kwargs)
+        except Exception as err:
+            errno = getattr(err, "errno", None)
+            if not errno and hasattr(err, "reason"):
+                errno = getattr(err.reason, "errno", None)
+
+            if errno in skip_errnos:
+                skip(f"Skipping test due to known errno and error {err}")
+
+            e_str = str(err)
+
+            if any(m.lower() in e_str.lower() for m in _skip_on_messages):
+                skip(
+                    f"Skipping test because exception message is known and error {err}"
+                )
+
+            if not isinstance(err, error_classes):
+                raise
+
+            if raise_on_error or can_connect(url, error_classes):
+                raise
+            else:
+                skip(f"Skipping test due to lack of connectivity and error {err}")
+
+    return wrapper
+
+
+with_connectivity_check = network
+
+
+@contextmanager
+def assert_produces_warning(
+    expected_warning=Warning,
+    filter_level="always",
+    clear=None,
+    check_stacklevel=True,
+    raise_on_extra_warnings=True,
+):
+    """
+    Context manager for running code expected to either raise a specific
+    warning, or not raise any warnings. Verifies that the code raises the
+    expected warning, and that it does not raise any other unexpected
+    warnings. It is basically a wrapper around ``warnings.catch_warnings``.
+
+    Parameters
+    ----------
+    expected_warning : {Warning, False, None}, default Warning
+        The type of Exception raised. ``exception.Warning`` is the base
+        class for all warnings. To check that no warning is returned,
+        specify ``False`` or ``None``.
+    filter_level : str or None, default "always"
+        Specifies whether warnings are ignored, displayed, or turned
+        into errors.
+        Valid values are:
+
+        * "error" - turns matching warnings into exceptions
+        * "ignore" - discard the warning
+        * "always" - always emit a warning
+        * "default" - print the warning the first time it is generated
+          from each location
+        * "module" - print the warning the first time it is generated
+          from each module
+        * "once" - print the warning the first time it is generated
+
+    clear : str, default None
+        If not ``None`` then remove any previously raised warnings from
+        the ``__warningsregistry__`` to ensure that no warning messages are
+        suppressed by this context manager. If ``None`` is specified,
+        the ``__warningsregistry__`` keeps track of which warnings have been
+        shown, and does not show them again.
+    check_stacklevel : bool, default True
+        If True, displays the line that called the function containing
+        the warning to show where the function is called. Otherwise, the
+        line that implements the function is displayed.
+    raise_on_extra_warnings : bool, default True
+        Whether extra warnings not of the type `expected_warning` should
+        cause the test to fail.
+
+    Examples
+    --------
+    >>> import warnings
+    >>> with assert_produces_warning():
+    ...     warnings.warn(UserWarning())
+    ...
+    >>> with assert_produces_warning(False):
+    ...     warnings.warn(RuntimeWarning())
+    ...
+    Traceback (most recent call last):
+        ...
+    AssertionError: Caused unexpected warning(s): ['RuntimeWarning'].
+    >>> with assert_produces_warning(UserWarning):
+    ...     warnings.warn(RuntimeWarning())
+    Traceback (most recent call last):
+        ...
+    AssertionError: Did not see expected warning of class 'UserWarning'.
+
+    .. warning:: This is *not* thread-safe.
+ """ + __tracebackhide__ = True + + with warnings.catch_warnings(record=True) as w: + + if clear is not None: + # make sure that we are clearing these warnings + # if they have happened before + # to guarantee that we will catch them + if not is_list_like(clear): + clear = [clear] + for m in clear: + try: + m.__warningregistry__.clear() + except AttributeError: + # module may not have __warningregistry__ + pass + + saw_warning = False + warnings.simplefilter(filter_level) + yield w + extra_warnings = [] + + for actual_warning in w: + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): + saw_warning = True + + if check_stacklevel and issubclass( + actual_warning.category, (FutureWarning, DeprecationWarning) + ): + from inspect import getframeinfo, stack + + caller = getframeinfo(stack()[2][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg + else: + extra_warnings.append( + ( + actual_warning.category.__name__, + actual_warning.message, + actual_warning.filename, + actual_warning.lineno, + ) + ) + if expected_warning: + msg = ( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + assert saw_warning, msg + if raise_on_extra_warnings and extra_warnings: + raise AssertionError( + f"Caused unexpected warning(s): {repr(extra_warnings)}" + ) + + +class RNGContext: + """ + Context manager to set the numpy random number generator speed. Returns + to the original value upon exiting the context manager. + + Parameters + ---------- + seed : int + Seed for numpy.random.seed + + Examples + -------- + + with RNGContext(42): + np.random.randn() + """ + + def __init__(self, seed): + self.seed = seed + + def __enter__(self): + + self.start_state = np.random.get_state() + np.random.seed(self.seed) + + def __exit__(self, exc_type, exc_value, traceback): + + np.random.set_state(self.start_state) + + +@contextmanager +def with_csv_dialect(name, **kwargs): + """ + Context manager to temporarily register a CSV dialect for parsing CSV. + + Parameters + ---------- + name : str + The name of the dialect. + kwargs : mapping + The parameters for the dialect. + + Raises + ------ + ValueError : the name of the dialect conflicts with a builtin one. + + See Also + -------- + csv : Python's CSV library. + """ + import csv + + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} + + if name in _BUILTIN_DIALECTS: + raise ValueError("Cannot override builtin dialect.") + + csv.register_dialect(name, **kwargs) + yield + csv.unregister_dialect(name) + + +@contextmanager +def use_numexpr(use, min_elements=None): + from pandas.core.computation import expressions as expr + + if min_elements is None: + min_elements = expr._MIN_ELEMENTS + + olduse = expr._USE_NUMEXPR + oldmin = expr._MIN_ELEMENTS + expr.set_use_numexpr(use) + expr._MIN_ELEMENTS = min_elements + yield + expr._MIN_ELEMENTS = oldmin + expr.set_use_numexpr(olduse) + + +def test_parallel(num_threads=2, kwargs_list=None): + """Decorator to run the same function multiple times in parallel. + + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + Notes + ----- + This decorator does not pass the return value of the decorated function. 
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper + + +class SubclassedSeries(Series): + _metadata = ["testattr", "name"] + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + +class SubclassedDataFrame(DataFrame): + _metadata = ["testattr"] + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + +class SubclassedCategorical(Categorical): + @property + def _constructor(self): + return SubclassedCategorical + + +@contextmanager +def set_timezone(tz: str): + """ + Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... + 'EDT' + """ + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() + + orig_tz = os.environ.get("TZ") + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) + + +def _make_skipna_wrapper(alternative, skipna_alternative=None): + """ + Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. + skipna_alternative : function + The function to be called on the original array + + Returns + ------- + function + """ + if skipna_alternative: + + def skipna_wrapper(x): + return skipna_alternative(x.values) + + else: + + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper + + +def convert_rows_list_to_csv_str(rows_list: List[str]): + """ + Convert list of CSV rows to single CSV-formatted string for current OS. + + This method is used for creating expected value of to_csv() method. + + Parameters + ---------- + rows_list : List[str] + Each element represents the row of csv. + + Returns + ------- + str + Expected output of to_csv() in current OS. 
+ """ + sep = os.linesep + expected = sep.join(rows_list) + sep + return expected diff --git a/venv/Lib/site-packages/pandas/_typing.py b/venv/Lib/site-packages/pandas/_typing.py new file mode 100644 index 0000000..171b76b --- /dev/null +++ b/venv/Lib/site-packages/pandas/_typing.py @@ -0,0 +1,73 @@ +from pathlib import Path +from typing import ( + IO, + TYPE_CHECKING, + Any, + AnyStr, + Callable, + Collection, + Dict, + Hashable, + List, + Mapping, + Optional, + TypeVar, + Union, +) + +import numpy as np + +# To prevent import cycles place any internal imports in the branch below +# and use a string literal forward reference to it in subsequent types +# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 + from pandas.core.arrays.base import ExtensionArray # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 + from pandas.core.series import Series # noqa: F401 + from pandas.core.frame import DataFrame # noqa: F401 + +# array-like + +AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) +ArrayLike = TypeVar("ArrayLike", "ExtensionArray", np.ndarray) + +# scalars + +PythonScalar = Union[str, int, float, bool] +DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") +PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar = Union[PythonScalar, PandasScalar] + +# other + +Dtype = Union[str, np.dtype, "ExtensionDtype"] +FilePathOrBuffer = Union[str, Path, IO[AnyStr]] + +# FrameOrSeriesUnion means either a DataFrame or a Series. E.g. +# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series +# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed +# in, either a DataFrame or a Series is returned. +FrameOrSeriesUnion = Union["DataFrame", "Series"] + +# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is +# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a +# Series is passed into a function, a Series is always returned and if a DataFrame is +# passed in, a DataFrame is always returned. +FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") + +Axis = Union[str, int] +Label = Optional[Hashable] +Level = Union[Label, int] +Ordered = Optional[bool] +JSONSerializable = Union[PythonScalar, List, Dict] +Axes = Collection + +# For functions like rename that convert one label to another +Renamer = Union[Mapping[Label, Any], Callable[[Label], Label]] + +# to maintain type information across generic functions and parametrization +T = TypeVar("T") diff --git a/venv/Lib/site-packages/pandas/_version.py b/venv/Lib/site-packages/pandas/_version.py new file mode 100644 index 0000000..bb2564a --- /dev/null +++ b/venv/Lib/site-packages/pandas/_version.py @@ -0,0 +1,23 @@ + +# This file was generated by 'versioneer.py' (0.15) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. 
+ +from warnings import catch_warnings +with catch_warnings(record=True): + import json +import sys + +version_json = ''' +{ + "dirty": false, + "error": null, + "full-revisionid": "fd9ceb9dce3d62d8a9caa6ba7c127b512939452e", + "version": "1.0.0" +} +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) diff --git a/venv/Lib/site-packages/pandas/api/__init__.py b/venv/Lib/site-packages/pandas/api/__init__.py new file mode 100644 index 0000000..bebbb38 --- /dev/null +++ b/venv/Lib/site-packages/pandas/api/__init__.py @@ -0,0 +1,2 @@ +""" public toolkit API """ +from pandas.api import extensions, indexers, types # noqa diff --git a/venv/Lib/site-packages/pandas/api/extensions/__init__.py b/venv/Lib/site-packages/pandas/api/extensions/__init__.py new file mode 100644 index 0000000..3019dd0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/api/extensions/__init__.py @@ -0,0 +1,27 @@ +""" +Public API for extending pandas objects. +""" + +from pandas._libs.lib import no_default + +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + +from pandas.core.accessor import ( + register_dataframe_accessor, + register_index_accessor, + register_series_accessor, +) +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin + +__all__ = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", +] diff --git a/venv/Lib/site-packages/pandas/api/indexers/__init__.py b/venv/Lib/site-packages/pandas/api/indexers/__init__.py new file mode 100644 index 0000000..826297e --- /dev/null +++ b/venv/Lib/site-packages/pandas/api/indexers/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for Rolling Window Indexers. +""" + +from pandas.core.indexers import check_array_indexer +from pandas.core.window.indexers import BaseIndexer + +__all__ = ["check_array_indexer", "BaseIndexer"] diff --git a/venv/Lib/site-packages/pandas/api/types/__init__.py b/venv/Lib/site-packages/pandas/api/types/__init__.py new file mode 100644 index 0000000..3495b49 --- /dev/null +++ b/venv/Lib/site-packages/pandas/api/types/__init__.py @@ -0,0 +1,23 @@ +""" +Public toolkit API. +""" + +from pandas._libs.lib import infer_dtype + +from pandas.core.dtypes.api import * # noqa: F403, F401 +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) + +__all__ = [ + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", +] diff --git a/venv/Lib/site-packages/pandas/arrays/__init__.py b/venv/Lib/site-packages/pandas/arrays/__init__.py new file mode 100644 index 0000000..61832a8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/arrays/__init__.py @@ -0,0 +1,30 @@ +""" +All of pandas' ExtensionArrays. + +See :ref:`extending.extension-types` for more. 
+""" +from pandas.core.arrays import ( + BooleanArray, + Categorical, + DatetimeArray, + IntegerArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + StringArray, + TimedeltaArray, +) + +__all__ = [ + "BooleanArray", + "Categorical", + "DatetimeArray", + "IntegerArray", + "IntervalArray", + "PandasArray", + "PeriodArray", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/venv/Lib/site-packages/pandas/compat/__init__.py b/venv/Lib/site-packages/pandas/compat/__init__.py new file mode 100644 index 0000000..60cfecd --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/__init__.py @@ -0,0 +1,140 @@ +""" +compat +====== + +Cross-compatible functions for different versions of Python. + +Other items: +* platform checker +""" +import platform +import struct +import sys +import warnings + +PY37 = sys.version_info >= (3, 7) +PY38 = sys.version_info >= (3, 8) +PYPY = platform.python_implementation() == "PyPy" + + +# ---------------------------------------------------------------------------- +# functions largely based / taken from the six module + +# Much of the code in this module comes from Benjamin Peterson's six library. +# The license for this library can be found in LICENSES/SIX and the code can be +# found at https://bitbucket.org/gutworth/six + + +def set_function_name(f, name, cls): + """ + Bind the name/qualname attributes of the function. + """ + f.__name__ = name + f.__qualname__ = f"{cls.__name__}.{name}" + f.__module__ = cls.__module__ + return f + + +# https://github.com/pandas-dev/pandas/pull/9123 +def is_platform_little_endian() -> bool: + """ + Checking if the running platform is little endian. + + Returns + ------- + bool + True if the running platform is little endian. + """ + return sys.byteorder == "little" + + +def is_platform_windows() -> bool: + """ + Checking if the running platform is windows. + + Returns + ------- + bool + True if the running platform is windows. + """ + return sys.platform == "win32" or sys.platform == "cygwin" + + +def is_platform_linux() -> bool: + """ + Checking if the running platform is linux. + + Returns + ------- + bool + True if the running platform is linux. + """ + return sys.platform == "linux2" + + +def is_platform_mac() -> bool: + """ + Checking if the running platform is mac. + + Returns + ------- + bool + True if the running platform is mac. + """ + return sys.platform == "darwin" + + +def is_platform_32bit() -> bool: + """ + Checking if the running platform is 32-bit. + + Returns + ------- + bool + True if the running platform is 32-bit. + """ + return struct.calcsize("P") * 8 < 64 + + +def _import_lzma(): + """ + Importing the `lzma` module. + + Warns + ----- + When the `lzma` module is not available. + """ + try: + import lzma + + return lzma + except ImportError: + msg = ( + "Could not import the lzma module. " + "Your installed Python is incomplete. " + "Attempting to use lzma compression will result in a RuntimeError." + ) + warnings.warn(msg) + + +def _get_lzma_file(lzma): + """ + Importing the `LZMAFile` class from the `lzma` module. + + Returns + ------- + class + The `LZMAFile` class from the `lzma` module. + + Raises + ------ + RuntimeError + If the `lzma` module was not imported correctly, or didn't exist. + """ + if lzma is None: + raise RuntimeError( + "lzma module not available. " + "A Python re-install with the proper dependencies, " + "might be required to solve this issue." 
+ ) + return lzma.LZMAFile diff --git a/venv/Lib/site-packages/pandas/compat/_optional.py b/venv/Lib/site-packages/pandas/compat/_optional.py new file mode 100644 index 0000000..cd711bc --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/_optional.py @@ -0,0 +1,111 @@ +import distutils.version +import importlib +import types +import warnings + +# Update install.rst when updating versions! + +VERSIONS = { + "bs4": "4.6.0", + "bottleneck": "1.2.1", + "fastparquet": "0.3.2", + "gcsfs": "0.2.2", + "lxml.etree": "3.8.0", + "matplotlib": "2.2.2", + "numexpr": "2.6.2", + "odfpy": "1.3.0", + "openpyxl": "2.5.7", + "pandas_gbq": "0.8.0", + "pyarrow": "0.13.0", + "pytables": "3.4.2", + "pytest": "5.0.1", + "pyxlsb": "1.0.6", + "s3fs": "0.3.0", + "scipy": "0.19.0", + "sqlalchemy": "1.1.4", + "tables": "3.4.2", + "tabulate": "0.8.3", + "xarray": "0.8.2", + "xlrd": "1.1.0", + "xlwt": "1.2.0", + "xlsxwriter": "0.9.8", + "numba": "0.46.0", +} + + +def _get_version(module: types.ModuleType) -> str: + version = getattr(module, "__version__", None) + if version is None: + # xlrd uses a capitalized attribute name + version = getattr(module, "__VERSION__", None) + + if version is None: + raise ImportError(f"Can't determine version for {module.__name__}") + return version + + +def import_optional_dependency( + name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" +): + """ + Import an optional dependency. + + By default, if a dependency is missing an ImportError with a nice + message will be raised. If a dependency is present, but too old, + we raise. + + Parameters + ---------- + name : str + The module name. This should be top-level only, so that the + version may be checked. + extra : str + Additional text to include in the ImportError message. + raise_on_missing : bool, default True + Whether to raise if the optional dependency is not found. + When False and the module is not present, None is returned. + on_version : str {'raise', 'warn'} + What to do when a dependency's version is too old. + + * raise : Raise an ImportError + * warn : Warn that the version is too old. Returns None + * ignore: Return the module, even if the version is too old. + It's expected that users validate the version locally when + using ``on_version="ignore"`` (see. ``io/html.py``) + + Returns + ------- + maybe_module : Optional[ModuleType] + The imported module, when found and the version is correct. + None is returned when the package is not found and `raise_on_missing` + is False, or when the package's version is too old and `on_version` + is ``'warn'``. + """ + msg = ( + f"Missing optional dependency '{name}'. {extra} " + f"Use pip or conda to install {name}." + ) + try: + module = importlib.import_module(name) + except ImportError: + if raise_on_missing: + raise ImportError(msg) from None + else: + return None + + minimum_version = VERSIONS.get(name) + if minimum_version: + version = _get_version(module) + if distutils.version.LooseVersion(version) < minimum_version: + assert on_version in {"warn", "raise", "ignore"} + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." 
+ ) + if on_version == "warn": + warnings.warn(msg, UserWarning) + return None + elif on_version == "raise": + raise ImportError(msg) + + return module diff --git a/venv/Lib/site-packages/pandas/compat/chainmap.py b/venv/Lib/site-packages/pandas/compat/chainmap.py new file mode 100644 index 0000000..588bd24 --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/chainmap.py @@ -0,0 +1,33 @@ +from typing import ChainMap, MutableMapping, TypeVar, cast + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +class DeepChainMap(ChainMap[_KT, _VT]): + """Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: + for mapping in self.maps: + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) + if key in mutable_mapping: + mutable_mapping[key] = value + return + cast(MutableMapping[_KT, _VT], self.maps[0])[key] = value + + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. + """ + for mapping in self.maps: + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) + if key in mapping: + del mutable_mapping[key] + return + raise KeyError(key) diff --git a/venv/Lib/site-packages/pandas/compat/numpy/__init__.py b/venv/Lib/site-packages/pandas/compat/numpy/__init__.py new file mode 100644 index 0000000..27f1c32 --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/numpy/__init__.py @@ -0,0 +1,75 @@ +""" support numpy compatibility across versions """ + +from distutils.version import LooseVersion +import re + +import numpy as np + +# numpy versioning +_np_version = np.__version__ +_nlv = LooseVersion(_np_version) +_np_version_under1p14 = _nlv < LooseVersion("1.14") +_np_version_under1p15 = _nlv < LooseVersion("1.15") +_np_version_under1p16 = _nlv < LooseVersion("1.16") +_np_version_under1p17 = _nlv < LooseVersion("1.17") +_np_version_under1p18 = _nlv < LooseVersion("1.18") +_is_numpy_dev = ".dev" in str(_nlv) + + +if _nlv < "1.13.3": + raise ImportError( + f"this version of pandas is incompatible with " + f"numpy < 1.13.3\n" + f"your numpy version is {_np_version}.\n" + f"Please upgrade numpy to >= 1.13.3 to use " + f"this pandas version" + ) + + +_tz_regex = re.compile("[+-]0000$") + + +def tz_replacer(s): + if isinstance(s, str): + if s.endswith("Z"): + s = s[:-1] + elif _tz_regex.search(s): + s = s[:-5] + return s + + +def np_datetime64_compat(s, *args, **kwargs): + """ + provide compat for construction of strings to numpy datetime64's with + tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation + warning, when need to pass '2015-01-01 09:00:00' + """ + s = tz_replacer(s) + return np.datetime64(s, *args, **kwargs) + + +def np_array_datetime64_compat(arr, *args, **kwargs): + """ + provide compat for construction of an array of strings to a + np.array(..., dtype=np.datetime64(..)) + tz-changes in 1.11 that make '2015-01-01 09:00:00Z' show a deprecation + warning, when need to pass '2015-01-01 09:00:00' + """ + # is_list_like + if hasattr(arr, "__iter__") and not isinstance(arr, (str, bytes)): + arr = [tz_replacer(s) for s in arr] + else: + arr = tz_replacer(arr) + + return np.array(arr, *args, **kwargs) + + +__all__ = [ + "np", + "_np_version", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_is_numpy_dev", +] diff --git a/venv/Lib/site-packages/pandas/compat/numpy/function.py b/venv/Lib/site-packages/pandas/compat/numpy/function.py new file mode 
100644 index 0000000..7158f25 --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/numpy/function.py @@ -0,0 +1,405 @@ +""" +For compatibility with numpy libraries, pandas functions or +methods have to accept '*args' and '**kwargs' parameters to +accommodate numpy arguments that are not actually used or +respected in the pandas implementation. + +To ensure that users do not abuse these parameters, validation +is performed in 'validators.py' to make sure that any extra +parameters passed correspond ONLY to those in the numpy signature. +Part of that validation includes whether or not the user attempted +to pass in non-default values for these extraneous parameters. As we +want to discourage users from relying on these parameters when calling +the pandas implementation, we want them only to pass in the default values +for these parameters. + +This module provides a set of commonly used default arguments for functions +and methods that are spread throughout the codebase. This module will make it +easier to adjust to future upstream changes in the analogous numpy signatures. +""" +from collections import OrderedDict +from distutils.version import LooseVersion +from typing import Any, Dict, Optional, Union + +from numpy import __version__ as _np_version, ndarray + +from pandas._libs.lib import is_bool, is_integer +from pandas.errors import UnsupportedFunctionCall +from pandas.util._validators import ( + validate_args, + validate_args_and_kwargs, + validate_kwargs, +) + + +class CompatValidator: + def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): + self.fname = fname + self.method = method + self.defaults = defaults + self.max_fname_arg_count = max_fname_arg_count + + def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): + if args or kwargs: + fname = self.fname if fname is None else fname + max_fname_arg_count = ( + self.max_fname_arg_count + if max_fname_arg_count is None + else max_fname_arg_count + ) + method = self.method if method is None else method + + if method == "args": + validate_args(fname, args, max_fname_arg_count, self.defaults) + elif method == "kwargs": + validate_kwargs(fname, kwargs, self.defaults) + elif method == "both": + validate_args_and_kwargs( + fname, args, kwargs, max_fname_arg_count, self.defaults + ) + else: + raise ValueError(f"invalid validation method '{method}'") + + +ARGMINMAX_DEFAULTS = dict(out=None) +validate_argmin = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 +) +validate_argmax = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1 +) + + +def process_skipna(skipna, args): + if isinstance(skipna, ndarray) or skipna is None: + args = (skipna,) + args + skipna = True + + return skipna, args + + +def validate_argmin_with_skipna(skipna, args, kwargs): + """ + If 'Series.argmin' is called via the 'numpy' library, + the third parameter in its signature is 'out', which + takes either an ndarray or 'None', so check if the + 'skipna' parameter is either an instance of ndarray or + is None, since 'skipna' itself should be a boolean + """ + + skipna, args = process_skipna(skipna, args) + validate_argmin(args, kwargs) + return skipna + + +def validate_argmax_with_skipna(skipna, args, kwargs): + """ + If 'Series.argmax' is called via the 'numpy' library, + the third parameter in its signature is 'out', which + takes either an ndarray or 'None', so check if the + 'skipna' parameter is either an instance of ndarray or 
+ is None, since 'skipna' itself should be a boolean + """ + + skipna, args = process_skipna(skipna, args) + validate_argmax(args, kwargs) + return skipna + + +ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +ARGSORT_DEFAULTS["axis"] = -1 +ARGSORT_DEFAULTS["kind"] = "quicksort" +ARGSORT_DEFAULTS["order"] = None + +if LooseVersion(_np_version) >= LooseVersion("1.17.0"): + # GH-26361. NumPy added radix sort and changed default to None. + ARGSORT_DEFAULTS["kind"] = None + + +validate_argsort = CompatValidator( + ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" +) + +# two different signatures of argsort, this second validation +# for when the `kind` param is supported +ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() +ARGSORT_DEFAULTS_KIND["axis"] = -1 +ARGSORT_DEFAULTS_KIND["order"] = None +validate_argsort_kind = CompatValidator( + ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" +) + + +def validate_argsort_with_ascending(ascending, args, kwargs): + """ + If 'Categorical.argsort' is called via the 'numpy' library, the + first parameter in its signature is 'axis', which takes either + an integer or 'None', so check if the 'ascending' parameter has + either integer type or is None, since 'ascending' itself should + be a boolean + """ + + if is_integer(ascending) or ascending is None: + args = (ascending,) + args + ascending = True + + validate_argsort_kind(args, kwargs, max_fname_arg_count=3) + return ascending + + +CLIP_DEFAULTS = dict(out=None) # type Dict[str, Any] +validate_clip = CompatValidator( + CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 +) + + +def validate_clip_with_axis(axis, args, kwargs): + """ + If 'NDFrame.clip' is called via the numpy library, the third + parameter in its signature is 'out', which can takes an ndarray, + so check if the 'axis' parameter is an instance of ndarray, since + 'axis' itself should either be an integer or None + """ + + if isinstance(axis, ndarray): + args = (axis,) + args + axis = None + + validate_clip(args, kwargs) + return axis + + +CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() +CUM_FUNC_DEFAULTS["dtype"] = None +CUM_FUNC_DEFAULTS["out"] = None +validate_cum_func = CompatValidator( + CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1 +) +validate_cumsum = CompatValidator( + CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1 +) + + +def validate_cum_func_with_skipna(skipna, args, kwargs, name): + """ + If this function is called via the 'numpy' library, the third + parameter in its signature is 'dtype', which takes either a + 'numpy' dtype or 'None', so check if the 'skipna' parameter is + a boolean or not + """ + if not is_bool(skipna): + args = (skipna,) + args + skipna = True + + validate_cum_func(args, kwargs, fname=name) + return skipna + + +ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +ALLANY_DEFAULTS["dtype"] = None +ALLANY_DEFAULTS["out"] = None +ALLANY_DEFAULTS["keepdims"] = False +validate_all = CompatValidator( + ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1 +) +validate_any = CompatValidator( + ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 +) + +LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) +validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") + +MINMAX_DEFAULTS = dict(out=None, keepdims=False) +validate_min = CompatValidator( + MINMAX_DEFAULTS, fname="min", 
method="both", max_fname_arg_count=1 +) +validate_max = CompatValidator( + MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 +) + +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") +validate_reshape = CompatValidator( + RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 +) + +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) +validate_repeat = CompatValidator( + REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 +) + +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) +validate_round = CompatValidator( + ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 +) + +SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +SORT_DEFAULTS["axis"] = -1 +SORT_DEFAULTS["kind"] = "quicksort" +SORT_DEFAULTS["order"] = None +validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") + +STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() +STAT_FUNC_DEFAULTS["dtype"] = None +STAT_FUNC_DEFAULTS["out"] = None + +PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +SUM_DEFAULTS["keepdims"] = False +SUM_DEFAULTS["initial"] = None + +MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +MEDIAN_DEFAULTS["overwrite_input"] = False +MEDIAN_DEFAULTS["keepdims"] = False + +STAT_FUNC_DEFAULTS["keepdims"] = False + +validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs") +validate_sum = CompatValidator( + SUM_DEFAULTS, fname="sum", method="both", max_fname_arg_count=1 +) +validate_prod = CompatValidator( + PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1 +) +validate_mean = CompatValidator( + STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1 +) +validate_median = CompatValidator( + MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 +) + +STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +STAT_DDOF_FUNC_DEFAULTS["dtype"] = None +STAT_DDOF_FUNC_DEFAULTS["out"] = None +STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False +validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") + +TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() +TAKE_DEFAULTS["out"] = None +TAKE_DEFAULTS["mode"] = "raise" +validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") + + +def validate_take_with_convert(convert, args, kwargs): + """ + If this function is called via the 'numpy' library, the third + parameter in its signature is 'axis', which takes either an + ndarray or 'None', so check if the 'convert' parameter is either + an instance of ndarray or is None + """ + + if isinstance(convert, ndarray) or convert is None: + args = (convert,) + args + convert = True + + validate_take(args, kwargs, max_fname_arg_count=3, method="both") + return convert + + +TRANSPOSE_DEFAULTS = dict(axes=None) +validate_transpose = CompatValidator( + TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 +) + + +def validate_window_func(name, args, kwargs): + numpy_args = ("axis", "dtype", "out") + msg = ( + f"numpy operations are not valid with window objects. " + f"Use .{name}() directly instead " + ) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_rolling_func(name, args, kwargs): + numpy_args = ("axis", "dtype", "out") + msg = ( + f"numpy operations are not valid with window objects. 
" + f"Use .rolling(...).{name}() instead " + ) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_expanding_func(name, args, kwargs): + numpy_args = ("axis", "dtype", "out") + msg = ( + f"numpy operations are not valid with window objects. " + f"Use .expanding(...).{name}() instead " + ) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_groupby_func(name, args, kwargs, allowed=None): + """ + 'args' and 'kwargs' should be empty, except for allowed + kwargs because all of + their necessary parameters are explicitly listed in + the function signature + """ + if allowed is None: + allowed = [] + + kwargs = set(kwargs) - set(allowed) + + if len(args) + len(kwargs) > 0: + raise UnsupportedFunctionCall( + f"numpy operations are not valid with " + f"groupby. Use .groupby(...).{name}() " + f"instead" + ) + + +RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") + + +def validate_resampler_func(method, args, kwargs): + """ + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature + """ + if len(args) + len(kwargs) > 0: + if method in RESAMPLER_NUMPY_OPS: + raise UnsupportedFunctionCall( + f"numpy operations are not " + f"valid with resample. Use " + f".resample(...).{method}() instead" + ) + else: + raise TypeError("too many arguments passed in") + + +def validate_minmax_axis(axis): + """ + Ensure that the axis argument passed to min, max, argmin, or argmax is + zero or None, as otherwise it will be incorrectly ignored. + + Parameters + ---------- + axis : int or None + + Raises + ------ + ValueError + """ + ndim = 1 # hard-coded for Index + if axis is None: + return + if axis >= ndim or (axis < 0 and ndim + axis < 0): + raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})") diff --git a/venv/Lib/site-packages/pandas/compat/pickle_compat.py b/venv/Lib/site-packages/pandas/compat/pickle_compat.py new file mode 100644 index 0000000..0a1a137 --- /dev/null +++ b/venv/Lib/site-packages/pandas/compat/pickle_compat.py @@ -0,0 +1,243 @@ +""" +Support pre-0.12 series pickle compatibility. +""" + +import copy +import pickle as pkl +from typing import TYPE_CHECKING, Optional +import warnings + +from pandas import Index + +if TYPE_CHECKING: + from pandas import Series, DataFrame + + +def load_reduce(self): + stack = self.stack + args = stack.pop() + func = stack[-1] + + if len(args) and type(args[0]) is type: + n = args[0].__name__ # noqa + + try: + stack[-1] = func(*args) + return + except TypeError as err: + + # If we have a deprecated function, + # try to replace and try again. + + msg = "_reconstruct: First argument must be a sub-type of ndarray" + + if msg in str(err): + try: + cls = args[0] + stack[-1] = object.__new__(cls) + return + except TypeError: + pass + + raise + + +_sparse_msg = """\ + +Loading a saved '{cls}' as a {new} with sparse values. +'{cls}' is now removed. You should re-save this dataset in its new format. 
+""" + + +class _LoadSparseSeries: + # To load a SparseSeries as a Series[Sparse] + + # https://github.com/python/mypy/issues/1020 + # error: Incompatible return type for "__new__" (returns "Series", but must return + # a subtype of "_LoadSparseSeries") + def __new__(cls) -> "Series": # type: ignore + from pandas import Series + + warnings.warn( + _sparse_msg.format(cls="SparseSeries", new="Series"), + FutureWarning, + stacklevel=6, + ) + + return Series(dtype=object) + + +class _LoadSparseFrame: + # To load a SparseDataFrame as a DataFrame[Sparse] + + # https://github.com/python/mypy/issues/1020 + # error: Incompatible return type for "__new__" (returns "DataFrame", but must + # return a subtype of "_LoadSparseFrame") + def __new__(cls) -> "DataFrame": # type: ignore + from pandas import DataFrame + + warnings.warn( + _sparse_msg.format(cls="SparseDataFrame", new="DataFrame"), + FutureWarning, + stacklevel=6, + ) + + return DataFrame() + + +# If classes are moved, provide compat here. +_class_locations_map = { + ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), + # 15477 + ("pandas.core.base", "FrozenNDArray"): ("numpy", "ndarray"), + ("pandas.core.indexes.frozen", "FrozenNDArray"): ("numpy", "ndarray"), + ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), + # 10890 + ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), + ("pandas.sparse.series", "SparseTimeSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), + # 12588, extensions moving + ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"), + ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"), + # 18543 moving period + ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"), + ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"), + # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype + ("pandas.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + ("pandas._libs.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + # 15998 top-level dirs moving + ("pandas.sparse.array", "SparseArray"): ( + "pandas.core.arrays.sparse", + "SparseArray", + ), + ("pandas.sparse.series", "SparseSeries"): ( + "pandas.compat.pickle_compat", + "_LoadSparseSeries", + ), + ("pandas.sparse.frame", "SparseDataFrame"): ( + "pandas.core.sparse.frame", + "_LoadSparseFrame", + ), + ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), + ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), + ("pandas.indexes.numeric", "Int64Index"): ( + "pandas.core.indexes.numeric", + "Int64Index", + ), + ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"), + ("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"), + ("pandas.tseries.index", "_new_DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "_new_DatetimeIndex", + ), + ("pandas.tseries.index", "DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "DatetimeIndex", + ), + ("pandas.tseries.period", "PeriodIndex"): ( + "pandas.core.indexes.period", + "PeriodIndex", + ), + # 19269, arrays moving + ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"), + # 19939, add timedeltaindex, float64index compat from 15998 move + ("pandas.tseries.tdi", "TimedeltaIndex"): ( + "pandas.core.indexes.timedeltas", + "TimedeltaIndex", + ), + 
("pandas.indexes.numeric", "Float64Index"): ( + "pandas.core.indexes.numeric", + "Float64Index", + ), + ("pandas.core.sparse.series", "SparseSeries"): ( + "pandas.compat.pickle_compat", + "_LoadSparseSeries", + ), + ("pandas.core.sparse.frame", "SparseDataFrame"): ( + "pandas.compat.pickle_compat", + "_LoadSparseFrame", + ), +} + + +# our Unpickler sub-class to override methods and some dispatcher +# functions for compat and uses a non-public class of the pickle module. + +# error: Name 'pkl._Unpickler' is not defined +class Unpickler(pkl._Unpickler): # type: ignore + def find_class(self, module, name): + # override superclass + key = (module, name) + module, name = _class_locations_map.get(key, key) + return super().find_class(module, name) + + +Unpickler.dispatch = copy.copy(Unpickler.dispatch) +Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce + + +def load_newobj(self): + args = self.stack.pop() + cls = self.stack[-1] + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + else: + obj = cls.__new__(cls, *args) + + self.stack[-1] = obj + + +Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj + + +def load_newobj_ex(self): + kwargs = self.stack.pop() + args = self.stack.pop() + cls = self.stack.pop() + + # compat + if issubclass(cls, Index): + obj = object.__new__(cls) + else: + obj = cls.__new__(cls, *args, **kwargs) + self.append(obj) + + +try: + Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex +except (AttributeError, KeyError): + pass + + +def load(fh, encoding: Optional[str] = None, is_verbose: bool = False): + """ + Load a pickle, with a provided encoding, + + Parameters + ---------- + fh : a filelike object + encoding : an optional encoding + is_verbose : show exception output + """ + + try: + fh.seek(0) + if encoding is not None: + up = Unpickler(fh, encoding=encoding) + else: + up = Unpickler(fh) + up.is_verbose = is_verbose + + return up.load() + except (ValueError, TypeError): + raise diff --git a/venv/Lib/site-packages/pandas/conftest.py b/venv/Lib/site-packages/pandas/conftest.py new file mode 100644 index 0000000..3eab218 --- /dev/null +++ b/venv/Lib/site-packages/pandas/conftest.py @@ -0,0 +1,932 @@ +from collections import abc +from datetime import date, time, timedelta, timezone +from decimal import Decimal +import operator +import os + +from dateutil.tz import tzlocal, tzutc +import hypothesis +from hypothesis import strategies as st +import numpy as np +import pytest +from pytz import FixedOffset, utc + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.core import ops + +hypothesis.settings.register_profile( + "ci", + # Hypothesis timing checks are tuned for scalars by default, so we bump + # them from 200ms to 500ms per test case as the global default. If this + # is too short for a specific test, (a) try to make it faster, and (b) + # if it really is slow add `@settings(deadline=...)` with a working value, + # or `deadline=None` to entirely disable timeouts for that test. 
+ deadline=500, + suppress_health_check=(hypothesis.HealthCheck.too_slow,), +) +hypothesis.settings.load_profile("ci") + + +def pytest_addoption(parser): + parser.addoption("--skip-slow", action="store_true", help="skip slow tests") + parser.addoption("--skip-network", action="store_true", help="skip network tests") + parser.addoption("--skip-db", action="store_true", help="skip db tests") + parser.addoption( + "--run-high-memory", action="store_true", help="run high memory tests" + ) + parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption( + "--strict-data-files", + action="store_true", + help="Fail if a test is skipped for missing data file.", + ) + + +def pytest_runtest_setup(item): + if "slow" in item.keywords and item.config.getoption("--skip-slow"): + pytest.skip("skipping due to --skip-slow") + + if "slow" not in item.keywords and item.config.getoption("--only-slow"): + pytest.skip("skipping due to --only-slow") + + if "network" in item.keywords and item.config.getoption("--skip-network"): + pytest.skip("skipping due to --skip-network") + + if "db" in item.keywords and item.config.getoption("--skip-db"): + pytest.skip("skipping due to --skip-db") + + if "high_memory" in item.keywords and not item.config.getoption( + "--run-high-memory" + ): + pytest.skip("skipping high memory test since --run-high-memory was not set") + + +# Configurations for all tests and all test modules + + +@pytest.fixture(autouse=True) +def configure_tests(): + pd.set_option("chained_assignment", "raise") + + +# For running doctests: make np and pd names available + + +@pytest.fixture(autouse=True) +def add_imports(doctest_namespace): + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + + +@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) +def spmatrix(request): + from scipy import sparse + + return getattr(sparse, request.param + "_matrix") + + +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") +def axis(request): + """ + Fixture for returning the axis numbers of a DataFrame. + """ + return request.param + + +axis_frame = axis + + +@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") +def axis_series(request): + """ + Fixture for returning the axis numbers of a Series. + """ + return request.param + + +@pytest.fixture +def ip(): + """ + Get an instance of IPython.InteractiveShell. + + Will raise a skip if IPython is not installed. + """ + + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.interactiveshell import InteractiveShell + + return InteractiveShell() + + +@pytest.fixture(params=[True, False, None]) +def observed(request): + """ + Pass in the observed keyword to groupby for [True, False] + This indicates whether categoricals should return values for + values which are not in the grouper [False / None], or only values which + appear in the grouper [True]. [None] is supported for future compatibility + if we decide to change the default (and would need to warn if this + parameter is not passed). + """ + return request.param + + +@pytest.fixture(params=[True, False, None]) +def ordered_fixture(request): + """ + Boolean 'ordered' parameter for Categorical. 
+ """ + return request.param + + +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mul__", + "__rmul__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", +] + + +@pytest.fixture(params=_all_arithmetic_operators) +def all_arithmetic_operators(request): + """ + Fixture for dunder names for common arithmetic operations. + """ + return request.param + + +@pytest.fixture( + params=[ + operator.add, + ops.radd, + operator.sub, + ops.rsub, + operator.mul, + ops.rmul, + operator.truediv, + ops.rtruediv, + operator.floordiv, + ops.rfloordiv, + operator.mod, + ops.rmod, + operator.pow, + ops.rpow, + ] +) +def all_arithmetic_functions(request): + """ + Fixture for operator and roperator arithmetic functions. + + Notes + ----- + This includes divmod and rdivmod, whereas all_arithmetic_operators + does not. + """ + return request.param + + +_all_numeric_reductions = [ + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", +] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names. + """ + return request.param + + +_all_boolean_reductions = ["all", "any"] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names. + """ + return request.param + + +_cython_table = pd.core.base.SelectionMixin._cython_table.items() + + +@pytest.fixture(params=list(_cython_table)) +def cython_table_items(request): + return request.param + + +def _get_cython_table_params(ndframe, func_names_and_expected): + """ + Combine frame, functions from SelectionMixin._cython_table + keys and expected result. + + Parameters + ---------- + ndframe : DataFrame or Series + func_names_and_expected : Sequence of two items + The first item is a name of a NDFrame method ('sum', 'prod') etc. + The second item is the expected return value. + + Returns + ------- + list + List of three items (DataFrame, function, expected result) + """ + results = [] + for func_name, expected in func_names_and_expected: + results.append((ndframe, func_name, expected)) + results += [ + (ndframe, func, expected) + for func, name in _cython_table + if name == func_name + ] + return results + + +@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) +def all_compare_operators(request): + """ + Fixture for dunder names for common compare operations + + * >= + * > + * == + * != + * < + * <= + """ + return request.param + + +@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"]) +def compare_operators_no_eq_ne(request): + """ + Fixture for dunder names for compare operations except == and != + + * >= + * > + * < + * <= + """ + return request.param + + +@pytest.fixture( + params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] +) +def all_logical_operators(request): + """ + Fixture for dunder names for common logical operations + + * | + * & + * ^ + """ + return request.param + + +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) +def compression(request): + """ + Fixture for trying common compression types in compression tests. + """ + return request.param + + +@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +def compression_only(request): + """ + Fixture for trying common compression types in compression tests excluding + uncompressed case. 
+ """ + return request.param + + +@pytest.fixture(params=[True, False]) +def writable(request): + """ + Fixture that an array is writable. + """ + return request.param + + +@pytest.fixture(scope="module") +def datetime_tz_utc(): + return timezone.utc + + +@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) +def utc_fixture(request): + """ + Fixture to provide variants of UTC timezone strings and tzinfo objects. + """ + return request.param + + +@pytest.fixture(params=["inner", "outer", "left", "right"]) +def join_type(request): + """ + Fixture for trying all types of join operations. + """ + return request.param + + +@pytest.fixture +def strict_data_files(pytestconfig): + return pytestconfig.getoption("--strict-data-files") + + +@pytest.fixture +def datapath(strict_data_files): + """ + Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path including ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") + + def deco(*args): + path = os.path.join(BASE_PATH, *args) + if not os.path.exists(path): + if strict_data_files: + raise ValueError( + f"Could not find file {path} and --strict-data-files is set." + ) + else: + pytest.skip(f"Could not find {path}.") + return path + + return deco + + +@pytest.fixture +def iris(datapath): + """ + The iris dataset as a DataFrame. + """ + return pd.read_csv(datapath("data", "iris.csv")) + + +@pytest.fixture(params=["nlargest", "nsmallest"]) +def nselect_method(request): + """ + Fixture for trying all nselect methods. + """ + return request.param + + +@pytest.fixture(params=["left", "right", "both", "neither"]) +def closed(request): + """ + Fixture for trying all interval closed parameters. + """ + return request.param + + +@pytest.fixture(params=["left", "right", "both", "neither"]) +def other_closed(request): + """ + Secondary closed fixture to allow parametrizing over all pairs of closed. + """ + return request.param + + +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")]) +def nulls_fixture(request): + """ + Fixture for each null type in pandas. + """ + return request.param + + +nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture + + +@pytest.fixture(params=[None, np.nan, pd.NaT]) +def unique_nulls_fixture(request): + """ + Fixture for each null type in pandas, each null type exactly once. 
+ """ + return request.param + + +# Generate cartesian product of unique_nulls_fixture: +unique_nulls_fixture2 = unique_nulls_fixture + + +TIMEZONES = [ + None, + "UTC", + "US/Eastern", + "Asia/Tokyo", + "dateutil/US/Pacific", + "dateutil/Asia/Singapore", + tzutc(), + tzlocal(), + FixedOffset(300), + FixedOffset(0), + FixedOffset(-300), + timezone.utc, + timezone(timedelta(hours=1)), + timezone(timedelta(hours=-1), name="foo"), +] +TIMEZONE_IDS = [repr(i) for i in TIMEZONES] + + +@td.parametrize_fixture_doc(str(TIMEZONE_IDS)) +@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS) +def tz_naive_fixture(request): + """ + Fixture for trying timezones including default (None): {0} + """ + return request.param + + +@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) +@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) +def tz_aware_fixture(request): + """ + Fixture for trying explicit timezones: {0} + """ + return request.param + + +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + +# ---------------------------------------------------------------- +# Dtypes +# ---------------------------------------------------------------- + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] +SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] +SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES +ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES + +FLOAT_DTYPES = [float, "float32", "float64"] +COMPLEX_DTYPES = [complex, "complex64", "complex128"] +STRING_DTYPES = [str, "str", "U"] + +DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] +TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] + +BOOL_DTYPES = [bool, "bool"] +BYTES_DTYPES = [bytes, "bytes"] +OBJECT_DTYPES = [object, "object"] + +ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES +ALL_NUMPY_DTYPES = ( + ALL_REAL_DTYPES + + COMPLEX_DTYPES + + STRING_DTYPES + + DATETIME64_DTYPES + + TIMEDELTA64_DTYPES + + BOOL_DTYPES + + OBJECT_DTYPES + + BYTES_DTYPES +) + + +@pytest.fixture(params=STRING_DTYPES) +def string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * str + * 'str' + * 'U' + """ + return request.param + + +@pytest.fixture(params=BYTES_DTYPES) +def bytes_dtype(request): + """ + Parametrized fixture for bytes dtypes. + + * bytes + * 'bytes' + """ + return request.param + + +@pytest.fixture(params=OBJECT_DTYPES) +def object_dtype(request): + """ + Parametrized fixture for object dtypes. + + * object + * 'object' + """ + return request.param + + +@pytest.fixture(params=DATETIME64_DTYPES) +def datetime64_dtype(request): + """ + Parametrized fixture for datetime64 dtypes. + + * 'datetime64[ns]' + * 'M8[ns]' + """ + return request.param + + +@pytest.fixture(params=TIMEDELTA64_DTYPES) +def timedelta64_dtype(request): + """ + Parametrized fixture for timedelta64 dtypes. + + * 'timedelta64[ns]' + * 'm8[ns]' + """ + return request.param + + +@pytest.fixture(params=FLOAT_DTYPES) +def float_dtype(request): + """ + Parameterized fixture for float dtypes. + + * float + * 'float32' + * 'float64' + """ + return request.param + + +@pytest.fixture(params=COMPLEX_DTYPES) +def complex_dtype(request): + """ + Parameterized fixture for complex dtypes. 
+ + * complex + * 'complex64' + * 'complex128' + """ + return request.param + + +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + + * int + * 'int8' + * 'int16' + * 'int32' + * 'int64' + """ + return request.param + + +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. + + * 'uint8' + * 'uint16' + * 'uint32' + * 'uint64' + """ + return request.param + + +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): + """ + Parameterized fixture for any integer dtype. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + """ + return request.param + + +@pytest.fixture(params=ALL_EA_INT_DTYPES) +def any_nullable_int_dtype(request): + """ + Parameterized fixture for any nullable integer dtype. + + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + """ + + return request.param + + +@pytest.fixture(params=ALL_REAL_DTYPES) +def any_real_dtype(request): + """ + Parameterized fixture for any (purely) real numeric dtype. + + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + """ + return request.param + + +@pytest.fixture(params=ALL_NUMPY_DTYPES) +def any_numpy_dtype(request): + """ + Parameterized fixture for all numpy dtypes. + + * bool + * 'bool' + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' + * str + * 'str' + * 'U' + * bytes + * 'bytes' + * 'datetime64[ns]' + * 'M8[ns]' + * 'timedelta64[ns]' + * 'm8[ns]' + * object + * 'object' + """ + return request.param + + +# categoricals are handled separately +_any_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), + ("mixed", ["a", np.nan, 2.0]), + ("floating", [1.0, np.nan, 2.0]), + ("integer", [1, np.nan, 2]), + ("mixed-integer-float", [1, np.nan, 2.0]), + ("decimal", [Decimal(1), np.nan, Decimal(2)]), + ("boolean", [True, np.nan, False]), + ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), + ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), + ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ("timedelta", [timedelta(1), np.nan, timedelta(2)]), + ("time", [time(1), np.nan, time(2)]), + ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), +] +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id + + +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): + """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype + + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' + + Returns + ------- + inferred_dtype 
: str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + + +@pytest.fixture( + params=[ + getattr(pd.offsets, o) + for o in pd.offsets.__all__ + if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) + ] +) +def tick_classes(request): + """ + Fixture for Tick based datetime offsets available for a time series. + """ + return request.param + + +# ---------------------------------------------------------------- +# Global setup for tests using Hypothesis + + +# Registering these strategies makes them globally available via st.from_type, +# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py +for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) + ) + +for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-5, 5), + normalize=st.booleans(), + month=st.integers(min_value=1, max_value=12), + ), + ) + +for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-24, 24), + normalize=st.booleans(), + startingMonth=st.integers(min_value=1, max_value=12), + ), + ) + + +@pytest.fixture +def float_frame(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getSeriesData()) + + +@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) +def index_or_series(request): + """ + Fixture to parametrize over Index and Series, made necessary by a mypy + bug, giving an error: + + List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" + + See GH#29725 + """ + return request.param + + +@pytest.fixture +def dict_subclass(): + """ + Fixture for a dictionary subclass. 
+ """ + + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + return TestSubDict + + +@pytest.fixture +def non_mapping_dict_subclass(): + """ + Fixture for a non-mapping dictionary subclass. + """ + + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict + + def __getitem__(self, key): + return self._data.__getitem__(key) + + def __iter__(self): + return self._data.__iter__() + + def __len__(self): + return self._data.__len__() + + return TestNonDictMapping diff --git a/venv/Lib/site-packages/pandas/core/__init__.py b/venv/Lib/site-packages/pandas/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/accessor.py b/venv/Lib/site-packages/pandas/core/accessor.py new file mode 100644 index 0000000..3f1c7b1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/accessor.py @@ -0,0 +1,316 @@ +""" + +accessor.py contains base classes for implementing accessor properties +that can be mixed into or pinned onto other pandas classes. + +""" +from typing import FrozenSet, Set +import warnings + +from pandas.util._decorators import Appender + + +class DirNamesMixin: + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset() + + def _dir_deletions(self): + """ + Delete unwanted __dir__ for this object. + """ + return self._accessors | self._deprecations + + def _dir_additions(self): + """ + Add additional __dir__ for this object. + """ + rv = set() + for accessor in self._accessors: + try: + getattr(self, accessor) + rv.add(accessor) + except AttributeError: + pass + return rv + + def __dir__(self): + """ + Provide method name lookup and completion. + + Notes + ----- + Only provide 'public' methods. + """ + rv = set(dir(type(self))) + rv = (rv - self._dir_deletions()) | self._dir_additions() + return sorted(rv) + + +class PandasDelegate: + """ + Abstract base class for delegating methods/properties. + """ + + def _delegate_property_get(self, name, *args, **kwargs): + raise TypeError(f"You cannot access the property {name}") + + def _delegate_property_set(self, name, value, *args, **kwargs): + raise TypeError(f"The property {name} cannot be set") + + def _delegate_method(self, name, *args, **kwargs): + raise TypeError(f"You cannot call method {name}") + + @classmethod + def _add_delegate_accessors( + cls, delegate, accessors, typ: str, overwrite: bool = False + ): + """ + Add accessors to cls from the delegate class. + + Parameters + ---------- + cls + Class to add the methods/properties to. + delegate + Class to get methods/properties and doc-strings. + accessors : list of str + List of accessors to add. + typ : {'property', 'method'} + overwrite : bool, default False + Overwrite the method/property in the target class if it exists. 
+ """ + + def _create_delegator_property(name): + def _getter(self): + return self._delegate_property_get(name) + + def _setter(self, new_values): + return self._delegate_property_set(name, new_values) + + _getter.__name__ = name + _setter.__name__ = name + + return property( + fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__ + ) + + def _create_delegator_method(name): + def f(self, *args, **kwargs): + return self._delegate_method(name, *args, **kwargs) + + f.__name__ = name + f.__doc__ = getattr(delegate, name).__doc__ + + return f + + for name in accessors: + + if typ == "property": + f = _create_delegator_property(name) + else: + f = _create_delegator_method(name) + + # don't overwrite existing methods/properties + if overwrite or not hasattr(cls, name): + setattr(cls, name, f) + + +def delegate_names(delegate, accessors, typ: str, overwrite: bool = False): + """ + Add delegated names to a class using a class decorator. This provides + an alternative usage to directly calling `_add_delegate_accessors` + below a class definition. + + Parameters + ---------- + delegate : object + The class to get methods/properties & doc-strings. + accessors : Sequence[str] + List of accessor to add. + typ : {'property', 'method'} + overwrite : bool, default False + Overwrite the method/property in the target class if it exists. + + Returns + ------- + callable + A class decorator. + + Examples + -------- + @delegate_names(Categorical, ["categories", "ordered"], "property") + class CategoricalAccessor(PandasDelegate): + [...] + """ + + def add_delegate_accessors(cls): + cls._add_delegate_accessors(delegate, accessors, typ, overwrite=overwrite) + return cls + + return add_delegate_accessors + + +# Ported with modifications from xarray +# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py +# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors +# 2. We use a UserWarning instead of a custom Warning + + +class CachedAccessor: + """ + Custom property-like object. + + A descriptor for caching accessors. + + Parameters + ---------- + name : str + Namespace that will be accessed under, e.g. ``df.foo``. + accessor : cls + Class with the extension methods. + + Notes + ----- + For accessor, The class's __init__ method assumes that one of + ``Series``, ``DataFrame`` or ``Index`` as the + single argument ``data``. + """ + + def __init__(self, name: str, accessor) -> None: + self._name = name + self._accessor = accessor + + def __get__(self, obj, cls): + if obj is None: + # we're accessing the attribute of the class, i.e., Dataset.geo + return self._accessor + accessor_obj = self._accessor(obj) + # Replace the property with the accessor object. Inspired by: + # http://www.pydanny.com/cached-property.html + # We need to use object.__setattr__ because we overwrite __setattr__ on + # NDFrame + object.__setattr__(obj, self._name, accessor_obj) + return accessor_obj + + +def _register_accessor(name, cls): + def decorator(accessor): + if hasattr(cls, name): + warnings.warn( + f"registration of accessor {repr(accessor)} under name " + f"{repr(name)} for type {repr(cls)} is overriding a preexisting" + f"attribute with the same name.", + UserWarning, + stacklevel=2, + ) + setattr(cls, name, CachedAccessor(name, accessor)) + cls._accessors.add(name) + return accessor + + return decorator + + +_doc = """ +Register a custom accessor on %(klass)s objects. + +Parameters +---------- +name : str + Name under which the accessor should be registered. 
A warning is issued + if this name conflicts with a preexisting attribute. + +Returns +------- +callable + A class decorator. + +See Also +-------- +%(others)s + +Notes +----- +When accessed, your accessor will be initialized with the pandas object +the user is interacting with. So the signature must be + +.. code-block:: python + + def __init__(self, pandas_object): # noqa: E999 + ... + +For consistency with pandas methods, you should raise an ``AttributeError`` +if the data passed to your accessor has an incorrect dtype. + +>>> pd.Series(['a', 'b']).dt +Traceback (most recent call last): +... +AttributeError: Can only use .dt accessor with datetimelike values + +Examples +-------- + +In your library code:: + + import pandas as pd + + @pd.api.extensions.register_dataframe_accessor("geo") + class GeoAccessor: + def __init__(self, pandas_obj): + self._obj = pandas_obj + + @property + def center(self): + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude + return (float(lon.mean()), float(lat.mean())) + + def plot(self): + # plot this array's data on a map, e.g., using Cartopy + pass + +Back in an interactive IPython session: + + >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), + ... 'latitude': np.linspace(0, 20)}) + >>> ds.geo.center + (5.0, 10.0) + >>> ds.geo.plot() + # plots data on a map +""" + + +@Appender( + _doc + % dict( + klass="DataFrame", others=("register_series_accessor, register_index_accessor") + ) +) +def register_dataframe_accessor(name): + from pandas import DataFrame + + return _register_accessor(name, DataFrame) + + +@Appender( + _doc + % dict( + klass="Series", others=("register_dataframe_accessor, register_index_accessor") + ) +) +def register_series_accessor(name): + from pandas import Series + + return _register_accessor(name, Series) + + +@Appender( + _doc + % dict( + klass="Index", others=("register_dataframe_accessor, register_series_accessor") + ) +) +def register_index_accessor(name): + from pandas import Index + + return _register_accessor(name, Index) diff --git a/venv/Lib/site-packages/pandas/core/algorithms.py b/venv/Lib/site-packages/pandas/core/algorithms.py new file mode 100644 index 0000000..431aa56 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/algorithms.py @@ -0,0 +1,2061 @@ +""" +Generic data algorithms. 
This module is experimental at the moment and not +intended for public consumption +""" +import operator +from textwrap import dedent +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from warnings import catch_warnings, simplefilter, warn + +import numpy as np + +from pandas._libs import Timestamp, algos, hashtable as htable, lib +from pandas._libs.tslib import iNaT +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, + infer_dtype_from_array, + maybe_promote, +) +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_object, + ensure_platform_int, + ensure_uint64, + is_array_like, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna, na_value_for_dtype + +import pandas.core.common as com +from pandas.core.construction import array, extract_array +from pandas.core.indexers import validate_indices + +if TYPE_CHECKING: + from pandas import Series + +_shared_docs: Dict[str, str] = {} + + +# --------------- # +# dtype access # +# --------------- # +def _ensure_data(values, dtype=None): + """ + routine to ensure that our data is of the correct + input dtype for lower-level routines + + This will coerce: + - ints -> int64 + - uint -> uint64 + - bool -> uint64 (TODO this should be uint8) + - datetimelike -> i8 + - datetime64tz -> i8 (in local tz) + - categorical -> codes + + Parameters + ---------- + values : array-like + dtype : pandas_dtype, optional + coerce to this dtype + + Returns + ------- + values : ndarray + pandas_dtype : str or dtype + """ + + # we check some simple dtypes first + if is_object_dtype(dtype): + return ensure_object(np.asarray(values)), "object" + elif is_object_dtype(values) and dtype is None: + return ensure_object(np.asarray(values)), "object" + + try: + if is_bool_dtype(values) or is_bool_dtype(dtype): + # we are actually coercing to uint64 + # until our algos support uint8 directly (see TODO) + return np.asarray(values).astype("uint64"), "bool" + elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): + return ensure_int64(values), "int64" + elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): + return ensure_uint64(values), "uint64" + elif is_float_dtype(values) or is_float_dtype(dtype): + return ensure_float64(values), "float64" + elif is_complex_dtype(values) or is_complex_dtype(dtype): + + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(): + simplefilter("ignore", np.ComplexWarning) + values = ensure_float64(values) + return values, "float64" + + except (TypeError, ValueError, OverflowError): + # if we are trying to coerce to a dtype + # and it is incompat this will fall through to here + return ensure_object(values), "object" + + # datetimelike + if ( + needs_i8_conversion(values) + or is_period_dtype(dtype) + or is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): + if is_period_dtype(values) or is_period_dtype(dtype): + from pandas import PeriodIndex + + values = 
PeriodIndex(values) + dtype = values.dtype + elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): + from pandas import TimedeltaIndex + + values = TimedeltaIndex(values) + dtype = values.dtype + else: + # Datetime + if values.ndim > 1 and is_datetime64_ns_dtype(values): + # Avoid calling the DatetimeIndex constructor as it is 1D only + # Note: this is reached by DataFrame.rank calls GH#27027 + asi8 = values.view("i8") + dtype = values.dtype + return asi8, dtype + + from pandas import DatetimeIndex + + values = DatetimeIndex(values) + dtype = values.dtype + + return values.asi8, dtype + + elif is_categorical_dtype(values) and ( + is_categorical_dtype(dtype) or dtype is None + ): + values = getattr(values, "values", values) + values = values.codes + dtype = "category" + + # we are actually coercing to int64 + # until our algos support int* directly (not all do) + values = ensure_int64(values) + + return values, dtype + + # we have failed, return object + values = np.asarray(values, dtype=np.object) + return ensure_object(values), "object" + + +def _reconstruct_data(values, dtype, original): + """ + reverse of _ensure_data + + Parameters + ---------- + values : ndarray + dtype : pandas_dtype + original : ndarray-like + + Returns + ------- + Index for extension types, otherwise ndarray casted to dtype + """ + + if is_extension_array_dtype(dtype): + values = dtype.construct_array_type()._from_sequence(values) + elif is_bool_dtype(dtype): + values = values.astype(dtype, copy=False) + + # we only support object dtypes bool Index + if isinstance(original, ABCIndexClass): + values = values.astype(object, copy=False) + elif dtype is not None: + values = values.astype(dtype, copy=False) + + return values + + +def _ensure_arraylike(values): + """ + ensure that we are arraylike if not already + """ + if not is_array_like(values): + inferred = lib.infer_dtype(values, skipna=False) + if inferred in ["mixed", "string", "unicode"]: + if isinstance(values, tuple): + values = list(values) + values = construct_1d_object_array_from_listlike(values) + else: + values = np.asarray(values) + return values + + +_hashtables = { + "float64": htable.Float64HashTable, + "uint64": htable.UInt64HashTable, + "int64": htable.Int64HashTable, + "string": htable.StringHashTable, + "object": htable.PyObjectHashTable, +} + + +def _get_hashtable_algo(values): + """ + Parameters + ---------- + values : arraylike + + Returns + ------- + htable : HashTable subclass + values : ndarray + """ + values, _ = _ensure_data(values) + + ndtype = _check_object_for_strings(values) + htable = _hashtables[ndtype] + return htable, values + + +def _get_values_for_rank(values): + if is_categorical_dtype(values): + values = values._values_for_rank() + + values, _ = _ensure_data(values) + return values + + +def _get_data_algo(values): + values = _get_values_for_rank(values) + + ndtype = _check_object_for_strings(values) + htable = _hashtables.get(ndtype, _hashtables["object"]) + + return htable, values + + +def _check_object_for_strings(values) -> str: + """ + Check if we can use string hashtable instead of object hashtable. 
+ + Parameters + ---------- + values : ndarray + ndtype : str + + Returns + ------- + str + """ + ndtype = values.dtype.name + if ndtype == "object": + + # it's cheaper to use a String Hash Table than Object; we infer + # including nulls because that is the only difference between + # StringHashTable and ObjectHashtable + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" + return ndtype + + +# --------------- # +# top-level algos # +# --------------- # + + +def unique(values): + """ + Hash table-based unique. Uniques are returned in order + of appearance. This does NOT sort. + + Significantly faster than numpy.unique. Includes NA values. + + Parameters + ---------- + values : 1d array-like + + Returns + ------- + numpy.ndarray or ExtensionArray + + The return can be: + + * Index : when the input is an Index + * Categorical : when the input is a Categorical dtype + * ndarray : when the input is a Series/ndarray + + Return numpy.ndarray or ExtensionArray. + + See Also + -------- + Index.unique + Series.unique + + Examples + -------- + >>> pd.unique(pd.Series([2, 1, 3, 3])) + array([2, 1, 3]) + + >>> pd.unique(pd.Series([2] + [1] * 5)) + array([2, 1]) + + >>> pd.unique(pd.Series([pd.Timestamp('20160101'), + ... pd.Timestamp('20160101')])) + array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + + >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])) + array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')], + dtype=object) + + >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'), + ... pd.Timestamp('20160101', tz='US/Eastern')])) + DatetimeIndex(['2016-01-01 00:00:00-05:00'], + ... dtype='datetime64[ns, US/Eastern]', freq=None) + + >>> pd.unique(list('baabc')) + array(['b', 'a', 'c'], dtype=object) + + An unordered Categorical will return categories in the + order of appearance. + + >>> pd.unique(pd.Series(pd.Categorical(list('baabc')))) + [b, a, c] + Categories (3, object): [b, a, c] + + >>> pd.unique(pd.Series(pd.Categorical(list('baabc'), + ... categories=list('abc')))) + [b, a, c] + Categories (3, object): [b, a, c] + + An ordered Categorical preserves the category ordering. + + >>> pd.unique(pd.Series(pd.Categorical(list('baabc'), + ... categories=list('abc'), + ... ordered=True))) + [b, a, c] + Categories (3, object): [a < b < c] + + An array of tuples + + >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')]) + array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + """ + + values = _ensure_arraylike(values) + + if is_extension_array_dtype(values): + # Dispatch to extension dtype's unique. + return values.unique() + + original = values + htable, values = _get_hashtable_algo(values) + + table = htable(len(values)) + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, original.dtype, original) + return uniques + + +unique1d = unique + + +def isin(comps, values) -> np.ndarray: + """ + Compute the isin boolean array. + + Parameters + ---------- + comps : array-like + values : array-like + + Returns + ------- + ndarray[bool] + Same length as `comps`. 
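+
+    Examples
+    --------
+    A minimal usage sketch of this module-level helper (not ``Series.isin``),
+    assuming ``numpy`` is imported as ``np``:
+
+    >>> isin(np.array([1, 2, 3, 4]), [2, 4])
+    array([False,  True, False,  True])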
+ """ + if not is_list_like(comps): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(comps).__name__}]" + ) + if not is_list_like(values): + raise TypeError( + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" + ) + + if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): + values = construct_1d_object_array_from_listlike(list(values)) + + if is_categorical_dtype(comps): + # TODO(extension) + # handle categoricals + return comps._values.isin(values) + + comps = com.values_from_object(comps) + + comps, dtype = _ensure_data(comps) + values, _ = _ensure_data(values, dtype=dtype) + + # faster for larger cases to use np.in1d + f = htable.ismember_object + + # GH16012 + # Ensure np.in1d doesn't get object types or it *may* throw an exception + if len(comps) > 1_000_000 and not is_object_dtype(comps): + f = np.in1d + elif is_integer_dtype(comps): + try: + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) + f = htable.ismember_int64 + except (TypeError, ValueError, OverflowError): + values = values.astype(object) + comps = comps.astype(object) + + elif is_float_dtype(comps): + try: + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) + f = htable.ismember_float64 + except (TypeError, ValueError): + values = values.astype(object) + comps = comps.astype(object) + + return f(comps, values) + + +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None +) -> Tuple[np.ndarray, np.ndarray]: + """ + Factorize an array-like to codes and uniques. + + This doesn't do any coercion of types or unboxing before factorization. + + Parameters + ---------- + values : ndarray + na_sentinel : int, default -1 + size_hint : int, optional + Passsed through to the hashtable's 'get_labels' method + na_value : object, optional + A value in `values` to consider missing. Note: only use this + parameter when you know that you don't have any values pandas would + consider missing in the array (NaN for float data, iNaT for + datetimes, etc.). + + Returns + ------- + codes : ndarray + uniques : ndarray + """ + hash_klass, values = _get_data_algo(values) + + table = hash_klass(size_hint or len(values)) + uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) + + codes = ensure_platform_int(codes) + return codes, uniques + + +_shared_docs[ + "factorize" +] = """ + Encode the object as an enumerated type or categorical variable. + + This method is useful for obtaining a numeric representation of an + array when all that matters is identifying distinct values. `factorize` + is available as both a top-level function :func:`pandas.factorize`, + and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. + + Parameters + ---------- + %(values)s%(sort)s + na_sentinel : int, default -1 + Value to mark "not found". + %(size_hint)s\ + + Returns + ------- + codes : ndarray + An integer ndarray that's an indexer into `uniques`. + ``uniques.take(codes)`` will have the same values as `values`. + uniques : ndarray, Index, or Categorical + The unique valid values. When `values` is Categorical, `uniques` + is a Categorical. When `values` is some other pandas object, an + `Index` is returned. Otherwise, a 1-D ndarray is returned. + + .. note :: + + Even if there's a missing value in `values`, `uniques` will + *not* contain an entry for it. 
+ + See Also + -------- + cut : Discretize continuous-valued array. + unique : Find the unique value in an array. + + Examples + -------- + These examples all show factorize as a top-level method like + ``pd.factorize(values)``. The results are identical for methods like + :meth:`Series.factorize`. + + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> codes + array([0, 0, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + With ``sort=True``, the `uniques` will be sorted, and `codes` will be + shuffled so that the relationship is the maintained. + + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> codes + array([1, 1, 0, 2, 1]) + >>> uniques + array(['a', 'b', 'c'], dtype=object) + + Missing values are indicated in `codes` with `na_sentinel` + (``-1`` by default). Note that missing values are never + included in `uniques`. + + >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> codes + array([ 0, -1, 1, 2, 0]) + >>> uniques + array(['b', 'a', 'c'], dtype=object) + + Thus far, we've only factorized lists (which are internally coerced to + NumPy arrays). When factorizing pandas objects, the type of `uniques` + will differ. For Categoricals, a `Categorical` is returned. + + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> codes, uniques = pd.factorize(cat) + >>> codes + array([0, 0, 1]) + >>> uniques + [a, c] + Categories (3, object): [a, b, c] + + Notice that ``'b'`` is in ``uniques.categories``, despite not being + present in ``cat.values``. + + For all other pandas objects, an Index of the appropriate type is + returned. + + >>> cat = pd.Series(['a', 'a', 'c']) + >>> codes, uniques = pd.factorize(cat) + >>> codes + array([0, 0, 1]) + >>> uniques + Index(['a', 'c'], dtype='object') + """ + + +@Substitution( + values=dedent( + """\ + values : sequence + A 1-D sequence. Sequences that aren't pandas objects are + coerced to ndarrays before factorization. + """ + ), + sort=dedent( + """\ + sort : bool, default False + Sort `uniques` and shuffle `codes` to maintain the + relationship. + """ + ), + size_hint=dedent( + """\ + size_hint : int, optional + Hint to the hashtable sizer. + """ + ), +) +@Appender(_shared_docs["factorize"]) +def factorize( + values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None +) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: + # Implementation notes: This method is responsible for 3 things + # 1.) coercing data to array-like (ndarray, Index, extension array) + # 2.) factorizing codes and uniques + # 3.) Maybe boxing the uniques in an Index + # + # Step 2 is dispatched to extension types (like Categorical). They are + # responsible only for factorization. All data coercion, sorting and boxing + # should happen here. 
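+    # For example, factorizing a plain list like ['b', 'a', 'b'] gives
+    # codes array([0, 1, 0]) and an ndarray of uniques (['b', 'a']), while
+    # factorizing an Index returns the uniques boxed back into an Index.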
+ + values = _ensure_arraylike(values) + original = values + + if is_extension_array_dtype(values): + values = extract_array(values) + codes, uniques = values.factorize(na_sentinel=na_sentinel) + dtype = original.dtype + else: + values, dtype = _ensure_data(values) + + if original.dtype.kind in ["m", "M"]: + na_value = na_value_for_dtype(original.dtype) + else: + na_value = None + + codes, uniques = _factorize_array( + values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value + ) + + if sort and len(uniques) > 0: + uniques, codes = safe_sort( + uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False + ) + + uniques = _reconstruct_data(uniques, dtype, original) + + # return original tenor + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index + + uniques = Index(uniques) + + return codes, uniques + + +def value_counts( + values, + sort: bool = True, + ascending: bool = False, + normalize: bool = False, + bins=None, + dropna: bool = True, +) -> "Series": + """ + Compute a histogram of the counts of non-null values. + + Parameters + ---------- + values : ndarray (1-d) + sort : bool, default True + Sort by values + ascending : bool, default False + Sort in ascending order + normalize: bool, default False + If True then compute a relative histogram + bins : integer, optional + Rather than count values, group them into half-open bins, + convenience for pd.cut, only works with numeric data + dropna : bool, default True + Don't include counts of NaN + + Returns + ------- + Series + """ + from pandas.core.series import Series + + name = getattr(values, "name", None) + + if bins is not None: + from pandas.core.reshape.tile import cut + + values = Series(values) + try: + ii = cut(values, bins, include_lowest=True) + except TypeError: + raise TypeError("bins argument only works with numeric data.") + + # count, remove nulls (from the index), and but the bins + result = ii.value_counts(dropna=dropna) + result = result[result.index.notna()] + result.index = result.index.astype("interval") + result = result.sort_index() + + # if we are dropna and we have NO values + if dropna and (result.values == 0).all(): + result = result.iloc[0:0] + + # normalizing is by len of all (regardless of dropna) + counts = np.array([len(ii)]) + + else: + + if is_extension_array_dtype(values): + + # handle Categorical and sparse, + result = Series(values)._values.value_counts(dropna=dropna) + result.name = name + counts = result.values + + else: + keys, counts = _value_counts_arraylike(values, dropna) + + result = Series(counts, index=keys, name=name) + + if sort: + result = result.sort_values(ascending=ascending) + + if normalize: + result = result / float(counts.sum()) + + return result + + +def _value_counts_arraylike(values, dropna: bool): + """ + Parameters + ---------- + values : arraylike + dropna : bool + + Returns + ------- + uniques : np.ndarray or ExtensionArray + counts : np.ndarray + """ + values = _ensure_arraylike(values) + original = values + values, _ = _ensure_data(values) + ndtype = values.dtype.name + + if needs_i8_conversion(original.dtype): + # datetime, timedelta, or period + + keys, counts = htable.value_count_int64(values, dropna) + + if dropna: + msk = keys != iNaT + keys, counts = keys[msk], counts[msk] + + else: + # ndarray like + + # TODO: handle uint8 + f = getattr(htable, f"value_count_{ndtype}") + keys, counts = f(values, dropna) + + mask = isna(values) + 
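+        # when dropna=False, make sure NaN shows up as an explicit key whose
+        # count is the number of missing entries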
if not dropna and mask.any(): + if not isna(keys).any(): + keys = np.insert(keys, 0, np.NaN) + counts = np.insert(counts, 0, mask.sum()) + + keys = _reconstruct_data(keys, original.dtype, original) + + return keys, counts + + +def duplicated(values, keep="first") -> np.ndarray: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + values : ndarray-like + Array over which to check for duplicate values. + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + duplicated : ndarray + """ + + values, _ = _ensure_data(values) + ndtype = values.dtype.name + f = getattr(htable, f"duplicated_{ndtype}") + return f(values, keep=keep) + + +def mode(values, dropna: bool = True) -> "Series": + """ + Returns the mode(s) of an array. + + Parameters + ---------- + values : array-like + Array over which to check for duplicate values. + dropna : boolean, default True + Don't consider counts of NaN/NaT. + + .. versionadded:: 0.24.0 + + Returns + ------- + mode : Series + """ + from pandas import Series + + values = _ensure_arraylike(values) + original = values + + # categorical is a fast-path + if is_categorical_dtype(values): + if isinstance(values, Series): + return Series(values.values.mode(dropna=dropna), name=values.name) + return values.mode(dropna=dropna) + + if dropna and needs_i8_conversion(values.dtype): + mask = values.isnull() + values = values[~mask] + + values, _ = _ensure_data(values) + ndtype = values.dtype.name + + f = getattr(htable, f"mode_{ndtype}") + result = f(values, dropna=dropna) + try: + result = np.sort(result) + except TypeError as err: + warn(f"Unable to sort modes: {err}") + + result = _reconstruct_data(result, original.dtype, original) + return Series(result) + + +def rank( + values, + axis: int = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, +): + """ + Rank the values along a given axis. + + Parameters + ---------- + values : array-like + Array whose values will be ranked. The number of dimensions in this + array must not exceed 2. + axis : int, default 0 + Axis over which to perform rankings. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + The method by which tiebreaks are broken during the ranking. + na_option : {'keep', 'top'}, default 'keep' + The method by which NaNs are placed in the ranking. + - ``keep``: rank each NaN value with a NaN ranking + - ``top``: replace each NaN with either +/- inf so that they + there are ranked at the top + ascending : boolean, default True + Whether or not the elements should be ranked in ascending order. + pct : boolean, default False + Whether or not to the display the returned rankings in integer form + (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). 
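+
+    Returns
+    -------
+    np.ndarray
+        Array of ranks with the same shape as ``values``.
+
+    Examples
+    --------
+    A rough sketch, calling the module-level function directly with the
+    default ``method='average'``:
+
+    >>> rank(np.array([3.0, 1.0, 2.0]))
+    array([3., 1., 2.])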
+ """ + if values.ndim == 1: + values = _get_values_for_rank(values) + ranks = algos.rank_1d( + values, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + elif values.ndim == 2: + values = _get_values_for_rank(values) + ranks = algos.rank_2d( + values, + axis=axis, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + else: + raise TypeError("Array with ndim > 2 are not supported.") + + return ranks + + +def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): + """ + Perform array addition that checks for underflow and overflow. + + Performs the addition of an int64 array and an int64 integer (or array) + but checks that they do not result in overflow first. For elements that + are indicated to be NaN, whether or not there is overflow for that element + is automatically ignored. + + Parameters + ---------- + arr : array addend. + b : array or scalar addend. + arr_mask : boolean array or None + array indicating which elements to exclude from checking + b_mask : boolean array or boolean or None + array or scalar indicating which element(s) to exclude from checking + + Returns + ------- + sum : An array for elements x + b for each element x in arr if b is + a scalar or an array for elements x + y for each element pair + (x, y) in (arr, b). + + Raises + ------ + OverflowError if any x + y exceeds the maximum or minimum int64 value. + """ + # For performance reasons, we broadcast 'b' to the new array 'b2' + # so that it has the same size as 'arr'. + b2 = np.broadcast_to(b, arr.shape) + if b_mask is not None: + # We do the same broadcasting for b_mask as well. + b2_mask = np.broadcast_to(b_mask, arr.shape) + else: + b2_mask = None + + # For elements that are NaN, regardless of their value, we should + # ignore whether they overflow or not when doing the checked add. + if arr_mask is not None and b2_mask is not None: + not_nan = np.logical_not(arr_mask | b2_mask) + elif arr_mask is not None: + not_nan = np.logical_not(arr_mask) + elif b_mask is not None: + not_nan = np.logical_not(b2_mask) + else: + not_nan = np.empty(arr.shape, dtype=bool) + not_nan.fill(True) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() + elif not mask2.any(): + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() + else: + to_raise = ( + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) + + if to_raise: + raise OverflowError("Overflow in int64 addition") + return arr + b + + +def quantile(x, q, interpolation_method="fraction"): + """ + Compute sample quantile or quantiles of the input array. For example, q=0.5 + computes the median. + + The `interpolation_method` parameter supports three values, namely + `fraction` (default), `lower` and `higher`. Interpolation is done only, + if the desired quantile lies between two data points `i` and `j`. 
For + `fraction`, the result is an interpolated value between `i` and `j`; + for `lower`, the result is `i`, for `higher` the result is `j`. + + Parameters + ---------- + x : ndarray + Values from which to extract score. + q : scalar or array + Percentile at which to extract score. + interpolation_method : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + - fraction: `i + (j - i)*fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + -lower: `i`. + - higher: `j`. + + Returns + ------- + score : float + Score at percentile. + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + x = np.asarray(x) + mask = isna(x) + + x = x[~mask] + + values = np.sort(x) + + def _interpolate(a, b, fraction): + """ + Returns the point at the given fraction between a and b, where + 'fraction' must be between 0 and 1. + """ + return a + (b - a) * fraction + + def _get_score(at): + if len(values) == 0: + return np.nan + + idx = at * (len(values) - 1) + if idx % 1 == 0: + score = values[int(idx)] + else: + if interpolation_method == "fraction": + score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1) + elif interpolation_method == "lower": + score = values[np.floor(idx)] + elif interpolation_method == "higher": + score = values[np.ceil(idx)] + else: + raise ValueError( + "interpolation_method can only be 'fraction' " + ", 'lower' or 'higher'" + ) + + return score + + if is_scalar(q): + return _get_score(q) + else: + q = np.asarray(q, np.float64) + result = [_get_score(x) for x in q] + result = np.array(result, dtype=np.float64) + return result + + +# --------------- # +# select n # +# --------------- # + + +class SelectN: + def __init__(self, obj, n: int, keep: str): + self.obj = obj + self.n = n + self.keep = keep + + if self.keep not in ("first", "last", "all"): + raise ValueError('keep must be either "first", "last" or "all"') + + def nlargest(self): + return self.compute("nlargest") + + def nsmallest(self): + return self.compute("nsmallest") + + @staticmethod + def is_valid_dtype_n_method(dtype) -> bool: + """ + Helper function to determine if dtype is valid for + nsmallest/nlargest methods + """ + return ( + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) + + +class SelectNSeries(SelectN): + """ + Implement n largest/smallest for Series + + Parameters + ---------- + obj : Series + n : int + keep : {'first', 'last'}, default 'first' + + Returns + ------- + nordered : Series + """ + + def compute(self, method): + + n = self.n + dtype = self.obj.dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError(f"Cannot use method '{method}' with dtype {dtype}") + + if n <= 0: + return self.obj[[]] + + dropped = self.obj.dropna() + + # slow method + if n >= len(self.obj): + reverse_it = self.keep == "last" or method == "nlargest" + ascending = method == "nsmallest" + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].sort_values(ascending=ascending).head(n) + + # fast method + arr, pandas_dtype = _ensure_data(dropped.values) + if method == "nlargest": + arr = -arr + if is_integer_dtype(pandas_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 + + elif is_bool_dtype(pandas_dtype): + # GH 26154: ensure False is smaller than True + arr = 1 - (-arr) + + if self.keep == 
"last": + arr = arr[::-1] + + narr = len(arr) + n = min(n, narr) + + kth_val = algos.kth_smallest(arr.copy(), n - 1) + (ns,) = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind="mergesort")] + + if self.keep != "all": + inds = inds[:n] + + if self.keep == "last": + # reverse indices + inds = narr - 1 - inds + + return dropped.iloc[inds] + + +class SelectNFrame(SelectN): + """ + Implement n largest/smallest for DataFrame + + Parameters + ---------- + obj : DataFrame + n : int + keep : {'first', 'last'}, default 'first' + columns : list or str + + Returns + ------- + nordered : DataFrame + """ + + def __init__(self, obj, n: int, keep: str, columns): + super().__init__(obj, n, keep) + if not is_list_like(columns) or isinstance(columns, tuple): + columns = [columns] + columns = list(columns) + self.columns = columns + + def compute(self, method): + + from pandas import Int64Index + + n = self.n + frame = self.obj + columns = self.columns + + for column in columns: + dtype = frame[column].dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError( + f"Column {repr(column)} has dtype {dtype}, " + f"cannot use method {repr(method)} with this dtype" + ) + + def get_indexer(current_indexer, other_indexer): + """ + Helper function to concat `current_indexer` and `other_indexer` + depending on `method` + """ + if method == "nsmallest": + return current_indexer.append(other_indexer) + else: + return other_indexer.append(current_indexer) + + # Below we save and reset the index in case index contains duplicates + original_index = frame.index + cur_frame = frame = frame.reset_index(drop=True) + cur_n = n + indexer = Int64Index([]) + + for i, column in enumerate(columns): + # For each column we apply method to cur_frame[column]. + # If it's the last column or if we have the number of + # results desired we are done. + # Otherwise there are duplicates of the largest/smallest + # value and we need to look at the rest of the columns + # to determine which of the rows with the largest/smallest + # value in the column to keep. + series = cur_frame[column] + is_last_column = len(columns) - 1 == i + values = getattr(series, method)( + cur_n, keep=self.keep if is_last_column else "all" + ) + + if is_last_column or len(values) <= cur_n: + indexer = get_indexer(indexer, values.index) + break + + # Now find all values which are equal to + # the (nsmallest: largest)/(nlarrgest: smallest) + # from our series. + border_value = values == values[values.index[-1]] + + # Some of these values are among the top-n + # some aren't. + unsafe_values = values[border_value] + + # These values are definitely among the top-n + safe_values = values[~border_value] + indexer = get_indexer(indexer, safe_values.index) + + # Go on and separate the unsafe_values on the remaining + # columns. + cur_frame = cur_frame.loc[unsafe_values.index] + cur_n = n - len(indexer) + + frame = frame.take(indexer) + + # Restore the index on frame + frame.index = original_index.take(indexer) + + # If there is only one column, the frame is already sorted. 
+ if len(columns) == 1: + return frame + + ascending = method == "nsmallest" + + return frame.sort_values(columns, ascending=ascending, kind="mergesort") + + +# ---- # +# take # +# ---- # + + +def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): + def wrapper(arr, indexer, out, fill_value=np.nan): + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + fill_value = fill_wrap(fill_value) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +def _convert_wrapper(f, conv_dtype): + def wrapper(arr, indexer, out, fill_value=np.nan): + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info): + # this is not ideal, performance-wise, but it's better than raising + # an exception (best to optimize in Cython to avoid getting here) + row_idx, col_idx = indexer + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + if fill_value is not None: + if row_needs: + out[row_mask, :] = fill_value + if col_needs: + out[:, col_mask] = fill_value + for i in range(len(row_idx)): + u_ = row_idx[i] + for j in range(len(col_idx)): + v = col_idx[j] + out[i, j] = arr[u_, v] + + +def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info): + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + if arr.shape[axis] > 0: + arr.take(ensure_platform_int(indexer), axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +_take_1d_dict = { + ("int8", "int8"): algos.take_1d_int8_int8, + ("int8", "int32"): algos.take_1d_int8_int32, + ("int8", "int64"): algos.take_1d_int8_int64, + ("int8", "float64"): algos.take_1d_int8_float64, + ("int16", "int16"): algos.take_1d_int16_int16, + ("int16", "int32"): algos.take_1d_int16_int32, + ("int16", "int64"): algos.take_1d_int16_int64, + ("int16", "float64"): algos.take_1d_int16_float64, + ("int32", "int32"): algos.take_1d_int32_int32, + ("int32", "int64"): algos.take_1d_int32_int64, + ("int32", "float64"): algos.take_1d_int32_float64, + ("int64", "int64"): algos.take_1d_int64_int64, + ("int64", "float64"): algos.take_1d_int64_float64, + ("float32", "float32"): algos.take_1d_float32_float32, + ("float32", "float64"): algos.take_1d_float32_float64, + ("float64", "float64"): algos.take_1d_float64_float64, + ("object", "object"): algos.take_1d_object_object, + ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), +} + +_take_2d_axis0_dict = { + ("int8", "int8"): algos.take_2d_axis0_int8_int8, + ("int8", "int32"): algos.take_2d_axis0_int8_int32, + ("int8", "int64"): algos.take_2d_axis0_int8_int64, + ("int8", "float64"): algos.take_2d_axis0_int8_float64, + ("int16", "int16"): algos.take_2d_axis0_int16_int16, + ("int16", "int32"): algos.take_2d_axis0_int16_int32, + ("int16", "int64"): algos.take_2d_axis0_int16_int64, + ("int16", "float64"): algos.take_2d_axis0_int16_float64, + 
("int32", "int32"): algos.take_2d_axis0_int32_int32, + ("int32", "int64"): algos.take_2d_axis0_int32_int64, + ("int32", "float64"): algos.take_2d_axis0_int32_float64, + ("int64", "int64"): algos.take_2d_axis0_int64_int64, + ("int64", "float64"): algos.take_2d_axis0_int64_float64, + ("float32", "float32"): algos.take_2d_axis0_float32_float32, + ("float32", "float64"): algos.take_2d_axis0_float32_float64, + ("float64", "float64"): algos.take_2d_axis0_float64_float64, + ("object", "object"): algos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_axis1_dict = { + ("int8", "int8"): algos.take_2d_axis1_int8_int8, + ("int8", "int32"): algos.take_2d_axis1_int8_int32, + ("int8", "int64"): algos.take_2d_axis1_int8_int64, + ("int8", "float64"): algos.take_2d_axis1_int8_float64, + ("int16", "int16"): algos.take_2d_axis1_int16_int16, + ("int16", "int32"): algos.take_2d_axis1_int16_int32, + ("int16", "int64"): algos.take_2d_axis1_int16_int64, + ("int16", "float64"): algos.take_2d_axis1_int16_float64, + ("int32", "int32"): algos.take_2d_axis1_int32_int32, + ("int32", "int64"): algos.take_2d_axis1_int32_int64, + ("int32", "float64"): algos.take_2d_axis1_int32_float64, + ("int64", "int64"): algos.take_2d_axis1_int64_int64, + ("int64", "float64"): algos.take_2d_axis1_int64_float64, + ("float32", "float32"): algos.take_2d_axis1_float32_float32, + ("float32", "float64"): algos.take_2d_axis1_float32_float64, + ("float64", "float64"): algos.take_2d_axis1_float64_float64, + ("object", "object"): algos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_multi_dict = { + ("int8", "int8"): algos.take_2d_multi_int8_int8, + ("int8", "int32"): algos.take_2d_multi_int8_int32, + ("int8", "int64"): algos.take_2d_multi_int8_int64, + ("int8", "float64"): algos.take_2d_multi_int8_float64, + ("int16", "int16"): algos.take_2d_multi_int16_int16, + ("int16", "int32"): algos.take_2d_multi_int16_int32, + ("int16", "int64"): algos.take_2d_multi_int16_int64, + ("int16", "float64"): algos.take_2d_multi_int16_float64, + ("int32", "int32"): algos.take_2d_multi_int32_int32, + ("int32", "int64"): algos.take_2d_multi_int32_int64, + ("int32", "float64"): algos.take_2d_multi_int32_float64, + ("int64", "int64"): algos.take_2d_multi_int64_int64, + ("int64", "float64"): algos.take_2d_multi_int64_float64, + ("float32", "float32"): algos.take_2d_multi_float32_float32, + ("float32", "float64"): algos.take_2d_multi_float32_float64, + ("float64", "float64"): algos.take_2d_multi_float64_float64, + ("object", "object"): algos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + + +def _get_take_nd_function( + ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None +): + if 
ndim <= 2: + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + tup = (out_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + def func2(arr, indexer, out, fill_value=np.nan): + indexer = ensure_int64(indexer) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) + + return func2 + + +def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None): + """ + Take elements from an array. + + .. versionadded:: 0.23.0 + + Parameters + ---------- + arr : sequence + Non array-likes (sequences without a dtype) are coerced + to an ndarray. + indices : sequence of integers + Indices to be taken. + axis : int, default 0 + The axis over which to select values. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type (``self.dtype.na_value``) is used. + + For multi-dimensional `arr`, each *element* is filled with + `fill_value`. + + Returns + ------- + ndarray or ExtensionArray + Same type as the input. + + Raises + ------ + IndexError + When `indices` is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `allow_fill` is True. + + Notes + ----- + When `allow_fill` is False, `indices` may be whatever dimensionality + is accepted by NumPy for `arr`. + + When `allow_fill` is True, `indices` should be 1-D. + + See Also + -------- + numpy.take + + Examples + -------- + >>> from pandas.api.extensions import take + + With the default ``allow_fill=False``, negative numbers indicate + positional indices from the right. + + >>> take(np.array([10, 20, 30]), [0, 0, -1]) + array([10, 10, 30]) + + Setting ``allow_fill=True`` will place `fill_value` in those positions. + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True) + array([10., 10., nan]) + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, + ... fill_value=-10) + array([ 10, 10, -10]) + """ + if not is_array_like(arr): + arr = np.asarray(arr) + + indices = np.asarray(indices, dtype=np.intp) + + if allow_fill: + # Pandas style, -1 means NA + validate_indices(indices, arr.shape[axis]) + result = take_1d( + arr, indices, axis=axis, allow_fill=True, fill_value=fill_value + ) + else: + # NumPy style + result = arr.take(indices, axis=axis) + return result + + +def take_nd( + arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True +): + """ + Specialized Cython take which sets NaN values in one pass + + This dispatches to ``take`` defined on ExtensionArrays. It does not + currently dispatch to ``SparseArray.take`` for sparse ``arr``. 
+ + Parameters + ---------- + arr : array-like + Input array. + indexer : ndarray + 1-D array of indices to take, subarrays corresponding to -1 value + indices are filed with fill_value + axis : int, default 0 + Axis to take from + out : ndarray or None, default None + Optional output array, must be appropriate type to hold input and + fill_value together, if indexer has any -1 value entries; call + maybe_promote to determine this type for any fill_value + fill_value : any, default np.nan + Fill value to replace -1 values with + allow_fill : boolean, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : array-like + May be the same type as the input, or cast to an ndarray. + """ + mask_info = None + + if is_extension_array_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + arr = extract_array(arr) + arr = np.asarray(arr) + + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + flip_order = False + if arr.ndim == 2: + if arr.flags.f_contiguous: + flip_order = True + + if flip_order: + arr = arr.T + axis = arr.ndim - axis - 1 + if out is not None: + out = out.T + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + if out is None: + out_shape_ = list(arr.shape) + out_shape_[axis] = len(indexer) + out_shape = tuple(out_shape_) + if arr.flags.f_contiguous and axis == arr.ndim - 1: + # minor tweak that can make an order-of-magnitude difference + # for dataframes initialized directly from 2-d ndarrays + # (s.t. df.values is c-contiguous and df._data.blocks[0] is its + # f-contiguous transpose) + out = np.empty(out_shape, dtype=dtype, order="F") + else: + out = np.empty(out_shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + if flip_order: + out = out.T + return out + + +take_1d = take_nd + + +def take_2d_multi(arr, indexer, fill_value=np.nan): + """ + Specialized Cython take which sets NaN values in one pass. + """ + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. 
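+    # "well-behaved" means a (row_idx, col_idx) tuple with neither part None,
+    # which is what the asserts below check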
+ assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + + func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) + if func is None and arr.dtype != out.dtype: + func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) + if func is not None: + func = _convert_wrapper(func, out.dtype) + if func is None: + + def func(arr, indexer, out, fill_value=np.nan): + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) + + func(arr, indexer, out=out, fill_value=fill_value) + return out + + +# ------------ # +# searchsorted # +# ------------ # + + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `arr` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `arr` would be preserved. + + Assuming that `arr` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``arr[i-1] < value <= self[i]`` + right ``arr[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + arr: array-like + Input array. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + value : array_like + Values to insert into `arr`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if ( + isinstance(arr, np.ndarray) + and is_integer_dtype(arr) + and (is_integer(value) or is_integer_dtype(value)) + ): + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. 
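+        # e.g. for an int8 `arr` and value=300, casting the value to int8 would
+        # overflow, so in that case we keep the value's own (wider) dtype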
+ iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not ( + is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) + ): + # E.g. if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value + value_ser = array([value]) if is_scalar(value) else array(value) + value = value_ser[0] if is_scalar(value) else value_ser + if isinstance(value, Timestamp) and value.tzinfo is None: + value = value.to_datetime64() + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result + + +# ---- # +# diff # +# ---- # + +_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} + + +def diff(arr, n: int, axis: int = 0, stacklevel=3): + """ + difference of n between self, + analogous to s-s.shift(n) + + Parameters + ---------- + arr : ndarray + n : int + number of periods + axis : int + axis to shift on + stacklevel : int + The stacklevel for the lost dtype warning. + + Returns + ------- + shifted + """ + from pandas.core.arrays import PandasDtype + + n = int(n) + na = np.nan + dtype = arr.dtype + + if dtype.kind == "b": + op = operator.xor + else: + op = operator.sub + + if isinstance(dtype, PandasDtype): + # PandasArray cannot necessarily hold shifted versions of itself. + arr = np.asarray(arr) + dtype = arr.dtype + + if is_extension_array_dtype(dtype): + if hasattr(arr, f"__{op.__name__}__"): + return op(arr, arr.shift(n)) + else: + warn( + "dtype lost in 'diff()'. In the future this will raise a " + "TypeError. Convert to a suitable dtype prior to calling 'diff'.", + FutureWarning, + stacklevel=stacklevel, + ) + arr = np.asarray(arr) + dtype = arr.dtype + + is_timedelta = False + is_bool = False + if needs_i8_conversion(arr): + dtype = np.float64 + arr = arr.view("i8") + na = iNaT + is_timedelta = True + + elif is_bool_dtype(dtype): + dtype = np.object_ + is_bool = True + + elif is_integer_dtype(dtype): + dtype = np.float64 + + dtype = np.dtype(dtype) + out_arr = np.empty(arr.shape, dtype=dtype) + + na_indexer = [slice(None)] * arr.ndim + na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) + out_arr[tuple(na_indexer)] = na + + if arr.ndim == 2 and arr.dtype.name in _diff_special: + # TODO: can diff_2d dtype specialization troubles be fixed by defining + # out_arr inside diff_2d? + algos.diff_2d(arr, out_arr, n, axis) + else: + # To keep mypy happy, _res_indexer is a list while res_indexer is + # a tuple, ditto for lag_indexer. 
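+        # for numeric input with n=1 and axis=0 this amounts to
+        # out_arr[1:] = arr[1:] - arr[:-1]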
+ _res_indexer = [slice(None)] * arr.ndim + _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) + res_indexer = tuple(_res_indexer) + + _lag_indexer = [slice(None)] * arr.ndim + _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) + lag_indexer = tuple(_lag_indexer) + + # need to make sure that we account for na for datelike/timedelta + # we don't actually want to subtract these i8 numbers + if is_timedelta: + res = arr[res_indexer] + lag = arr[lag_indexer] + + mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na) + if mask.any(): + res = res.copy() + res[mask] = 0 + lag = lag.copy() + lag[mask] = 0 + + result = res - lag + result[mask] = na + out_arr[res_indexer] = result + elif is_bool: + out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] + else: + out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] + + if is_timedelta: + out_arr = out_arr.astype("int64").view("timedelta64[ns]") + + return out_arr + + +# -------------------------------------------------------------------- +# Helper functions + +# Note: safe_sort is in algorithms.py instead of sorting.py because it is +# low-dependency, is used in this module, and used private methods from +# this module. +def safe_sort( + values, + codes=None, + na_sentinel: int = -1, + assume_unique: bool = False, + verify: bool = True, +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: + """ + Sort ``values`` and reorder corresponding ``codes``. + + ``values`` should be unique if ``codes`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``codes`` is not None. + codes : list_like, optional + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``codes`` to mark "not found". + Ignored when ``codes`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``codes`` is None. + verify : bool, default True + Check if codes are out of bound for the values and put out of bound + codes equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound codes. Ignored when ``codes`` is None. + + .. versionadded:: 0.25.0 + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_codes : ndarray + Reordered ``codes``; returned when ``codes`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``codes`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``codes`` is not None and ``values`` contain duplicates. 
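+
+    Examples
+    --------
+    A minimal sketch, calling the module-level function directly; when
+    ``codes`` is given, the reordered codes are returned as well:
+
+    >>> safe_sort(np.array([3, 1, 2]))
+    array([1, 2, 3])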
+ """ + if not is_list_like(values): + raise TypeError( + "Only list-like objects are allowed to be passed to safe_sort as values" + ) + + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): + # don't convert to string types + dtype, _ = infer_dtype_from_array(values) + values = np.asarray(values, dtype=dtype) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + sorter = None + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # codes: + + if codes is None: + return ordered + + if not is_list_like(codes): + raise TypeError( + "Only list-like objects or None are allowed to " + "be passed to safe_sort as codes" + ) + codes = ensure_platform_int(np.asarray(codes)) + + from pandas import Index + + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if codes is not None") + + if sorter is None: + # mixed types + hash_klass, values = _get_data_algo(values) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = ensure_platform_int(t.lookup(ordered)) + + if na_sentinel == -1: + # take_1d is faster, but only works for na_sentinels of -1 + order2 = sorter.argsort() + new_codes = take_1d(order2, codes, fill_value=-1) + if verify: + mask = (codes < -len(values)) | (codes >= len(values)) + else: + mask = None + else: + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + # Out of bound indices will be masked with `na_sentinel` next, so we + # may deal with them here without performance loss using `mode='wrap'` + new_codes = reverse_indexer.take(codes, mode="wrap") + + mask = codes == na_sentinel + if verify: + mask = mask | (codes < -len(values)) | (codes >= len(values)) + + if mask is not None: + np.putmask(new_codes, mask, na_sentinel) + + return ordered, ensure_platform_int(new_codes) diff --git a/venv/Lib/site-packages/pandas/core/api.py b/venv/Lib/site-packages/pandas/core/api.py new file mode 100644 index 0000000..b0b65f9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/api.py @@ -0,0 +1,57 @@ +# flake8: noqa + +from pandas._libs import NaT, Period, Timedelta, Timestamp +from pandas._libs.missing import NA + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.construction import array +from pandas.core.groupby import Grouper, NamedAgg +from pandas.core.indexes.api import ( + CategoricalIndex, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + 
UInt64Index, +) +from pandas.core.indexes.datetimes import bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range +from pandas.core.indexes.period import period_range +from pandas.core.indexes.timedeltas import timedelta_range +from pandas.core.indexing import IndexSlice +from pandas.core.series import Series +from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric +from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip diff --git a/venv/Lib/site-packages/pandas/core/apply.py b/venv/Lib/site-packages/pandas/core/apply.py new file mode 100644 index 0000000..a496afe --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/apply.py @@ -0,0 +1,466 @@ +import abc +import inspect +from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union + +import numpy as np + +from pandas._libs import reduction as libreduction +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_sequence, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.construction import create_series_with_explicit_dtype + +if TYPE_CHECKING: + from pandas import DataFrame, Series, Index + +ResType = Dict[int, Any] + + +def frame_apply( + obj: "DataFrame", + func, + axis=0, + raw: bool = False, + result_type=None, + ignore_failures: bool = False, + args=None, + kwds=None, +): + """ construct and return a row or column based frame apply object """ + + axis = obj._get_axis_number(axis) + klass: Type[FrameApply] + if axis == 0: + klass = FrameRowApply + elif axis == 1: + klass = FrameColumnApply + + return klass( + obj, + func, + raw=raw, + result_type=result_type, + ignore_failures=ignore_failures, + args=args, + kwds=kwds, + ) + + +class FrameApply(metaclass=abc.ABCMeta): + + # --------------------------------------------------------------- + # Abstract Methods + axis: int + + @property + @abc.abstractmethod + def result_index(self) -> "Index": + pass + + @property + @abc.abstractmethod + def result_columns(self) -> "Index": + pass + + @property + @abc.abstractmethod + def series_generator(self) -> Iterator["Series"]: + pass + + @abc.abstractmethod + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + pass + + # --------------------------------------------------------------- + + def __init__( + self, + obj: "DataFrame", + func, + raw: bool, + result_type, + ignore_failures: bool, + args, + kwds, + ): + self.obj = obj + self.raw = raw + self.ignore_failures = ignore_failures + self.args = args or () + self.kwds = kwds or {} + + if result_type not in [None, "reduce", "broadcast", "expand"]: + raise ValueError( + "invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}" + ) + + self.result_type = result_type + + # curry if needed + if (kwds or args) and not isinstance(func, (np.ufunc, str)): + + def f(x): + return func(x, *args, **kwds) + + else: + f = func + + self.f = f + + @property + def res_columns(self) -> "Index": + return self.result_columns + + @property + def columns(self) -> "Index": + return self.obj.columns + + @property + def index(self) -> "Index": + return self.obj.index + + 
@cache_readonly + def values(self): + return self.obj.values + + @cache_readonly + def dtypes(self) -> "Series": + return self.obj.dtypes + + @property + def agg_axis(self) -> "Index": + return self.obj._get_agg_axis(self.axis) + + def get_result(self): + """ compute the results """ + + # dispatch to agg + if is_list_like(self.f) or is_dict_like(self.f): + return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds) + + # all empty + if len(self.columns) == 0 and len(self.index) == 0: + return self.apply_empty_result() + + # string dispatch + if isinstance(self.f, str): + # Support for `frame.transform('method')` + # Some methods (shift, etc.) require the axis argument, others + # don't, so inspect and insert if necessary. + func = getattr(self.obj, self.f) + sig = inspect.getfullargspec(func) + if "axis" in sig.args: + self.kwds["axis"] = self.axis + return func(*self.args, **self.kwds) + + # ufunc + elif isinstance(self.f, np.ufunc): + with np.errstate(all="ignore"): + results = self.obj._data.apply("apply", func=self.f) + return self.obj._constructor( + data=results, index=self.index, columns=self.columns, copy=False + ) + + # broadcasting + if self.result_type == "broadcast": + return self.apply_broadcast(self.obj) + + # one axis empty + elif not all(self.obj.shape): + return self.apply_empty_result() + + # raw + elif self.raw and not self.obj._is_mixed_type: + return self.apply_raw() + + return self.apply_standard() + + def apply_empty_result(self): + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ["reduce", None]: + return self.obj.copy() + + # we may need to infer + should_reduce = self.result_type == "reduce" + + from pandas import Series + + if not should_reduce: + try: + r = self.f(Series([], dtype=np.float64)) + except Exception: + pass + else: + should_reduce = not isinstance(r, Series) + + if should_reduce: + if len(self.agg_axis): + r = self.f(Series([], dtype=np.float64)) + else: + r = np.nan + + return self.obj._constructor_sliced(r, index=self.agg_axis) + else: + return self.obj.copy() + + def apply_raw(self): + """ apply to the values as a numpy array """ + try: + result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) + except ValueError as err: + if "Function does not reduce" not in str(err): + # catch only ValueError raised intentionally in libreduction + raise + # We expect np.apply_along_axis to give a two-dimensional result, or + # also raise. 
+ result = np.apply_along_axis(self.f, self.axis, self.values) + + # TODO: mixed type case + if result.ndim == 2: + return self.obj._constructor(result, index=self.index, columns=self.columns) + else: + return self.obj._constructor_sliced(result, index=self.agg_axis) + + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + result_values = np.empty_like(target.values) + + # axis which we want to compare compliance + result_compare = target.shape[0] + + for i, col in enumerate(target.columns): + res = self.f(target[col]) + ares = np.asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + elif ares == 1: + + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") + + result_values[:, i] = res + + # we *always* preserve the original index / columns + result = self.obj._constructor( + result_values, index=target.index, columns=target.columns + ) + return result + + def apply_standard(self): + + # try to reduce first (by default) + # this only matters if the reduction in values is of different dtype + # e.g. if we want to apply to a SparseFrame, then can't directly reduce + + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if ( + self.result_type in ["reduce", None] + and not self.dtypes.apply(is_extension_array_dtype).any() + # Disallow complex_internals since libreduction shortcut raises a TypeError + and not self.agg_axis._has_complex_internals + ): + + values = self.values + index = self.obj._get_axis(self.axis) + labels = self.agg_axis + empty_arr = np.empty(len(index), dtype=values.dtype) + + # Preserve subclass for e.g. test_subclassed_apply + dummy = self.obj._constructor_sliced( + empty_arr, index=index, dtype=values.dtype + ) + + try: + result = libreduction.compute_reduction( + values, self.f, axis=self.axis, dummy=dummy, labels=labels + ) + except ValueError as err: + if "Function does not reduce" not in str(err): + # catch only ValueError raised intentionally in libreduction + raise + except TypeError: + # e.g. 
test_apply_ignore_failures we just ignore + if not self.ignore_failures: + raise + except ZeroDivisionError: + # reached via numexpr; fall back to python implementation + pass + else: + return self.obj._constructor_sliced(result, index=labels) + + # compute the result using the series generator + results, res_index = self.apply_series_generator() + + # wrap results + return self.wrap_results(results, res_index) + + def apply_series_generator(self) -> Tuple[ResType, "Index"]: + series_gen = self.series_generator + res_index = self.result_index + + keys = [] + results = {} + if self.ignore_failures: + successes = [] + for i, v in enumerate(series_gen): + try: + results[i] = self.f(v) + except Exception: + pass + else: + keys.append(v.name) + successes.append(i) + + # so will work with MultiIndex + if len(successes) < len(res_index): + res_index = res_index.take(successes) + + else: + for i, v in enumerate(series_gen): + results[i] = self.f(v) + keys.append(v.name) + + return results, res_index + + def wrap_results( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + from pandas import Series + + # see if we can infer the results + if len(results) > 0 and 0 in results and is_sequence(results[0]): + + return self.wrap_results_for_axis(results, res_index) + + # dict of scalars + + # the default dtype of an empty Series will be `object`, but this + # code can be hit by df.mean() where the result should have dtype + # float64 even if it's an empty Series. + constructor_sliced = self.obj._constructor_sliced + if constructor_sliced is Series: + result = create_series_with_explicit_dtype( + results, dtype_if_empty=np.float64 + ) + else: + result = constructor_sliced(results) + result.index = res_index + + return result + + +class FrameRowApply(FrameApply): + axis = 0 + + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + return super().apply_broadcast(target) + + @property + def series_generator(self): + return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) + + @property + def result_index(self) -> "Index": + return self.columns + + @property + def result_columns(self) -> "Index": + return self.index + + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> "DataFrame": + """ return the results for the rows """ + + result = self.obj._constructor(data=results) + + if not isinstance(results[0], ABCSeries): + if len(result.index) == len(self.res_columns): + result.index = self.res_columns + + if len(result.columns) == len(res_index): + result.columns = res_index + + return result + + +class FrameColumnApply(FrameApply): + axis = 1 + + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + result = super().apply_broadcast(target.T) + return result.T + + @property + def series_generator(self): + constructor = self.obj._constructor_sliced + return ( + constructor(arr, index=self.columns, name=name) + for i, (arr, name) in enumerate(zip(self.values, self.index)) + ) + + @property + def result_index(self) -> "Index": + return self.index + + @property + def result_columns(self) -> "Index": + return self.columns + + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + """ return the results for the columns """ + result: Union["Series", "DataFrame"] + + # we have requested to expand + if self.result_type == "expand": + result = self.infer_to_same_shape(results, res_index) + + # we have a non-series and don't want inference + elif not isinstance(results[0], 
ABCSeries): + from pandas import Series + + result = Series(results) + result.index = res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape(results, res_index) + + return result + + def infer_to_same_shape(self, results: ResType, res_index: "Index") -> "DataFrame": + """ infer the results to the same shape as the input object """ + + result = self.obj._constructor(data=results) + result = result.T + + # set the index + result.index = res_index + + # infer dtypes + result = result.infer_objects() + + return result diff --git a/venv/Lib/site-packages/pandas/core/arrays/__init__.py b/venv/Lib/site-packages/pandas/core/arrays/__init__.py new file mode 100644 index 0000000..bf34699 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/__init__.py @@ -0,0 +1,36 @@ +from pandas.core.arrays.base import ( + ExtensionArray, + ExtensionOpsMixin, + ExtensionScalarOpsMixin, + try_cast_to_ea, +) +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.integer import IntegerArray, integer_array +from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +from pandas.core.arrays.period import PeriodArray, period_array +from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.timedeltas import TimedeltaArray + +__all__ = [ + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", + "try_cast_to_ea", + "BooleanArray", + "Categorical", + "DatetimeArray", + "IntegerArray", + "integer_array", + "IntervalArray", + "PandasArray", + "PandasDtype", + "PeriodArray", + "period_array", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/venv/Lib/site-packages/pandas/core/arrays/_arrow_utils.py b/venv/Lib/site-packages/pandas/core/arrays/_arrow_utils.py new file mode 100644 index 0000000..e0d33be --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/_arrow_utils.py @@ -0,0 +1,124 @@ +from distutils.version import LooseVersion +import json + +import numpy as np +import pyarrow + +from pandas.core.arrays.interval import _VALID_CLOSED + +_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") + + +def pyarrow_array_to_numpy_and_mask(arr, dtype): + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. 
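Stepping back to the apply machinery whose hunk ends above (before the `_arrow_utils.py` file begins): `result_type="expand"` is handled by `FrameColumnApply.wrap_results_for_axis` / `infer_to_same_shape`, while `"broadcast"` goes through `apply_broadcast`. A small sketch of the observable behaviour, not part of the patch:

import pandas as pd

df = pd.DataFrame({"x": [1, 2], "y": [10, 20]})

# list-like results per row are expanded into new columns
wide = df.apply(lambda row: [row["x"] + row["y"], row["x"] * row["y"]],
                axis=1, result_type="expand")
print(wide)

# 'broadcast' keeps the original shape and columns; every column of a row
# is filled with that row's scalar result
print(df.apply(lambda row: row.sum(), axis=1, result_type="broadcast"))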
+ + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask + + +if _pyarrow_version_ge_015: + # the pyarrow extension types are only available for pyarrow 0.15+ + + class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + # register the type with a dummy instance + _period_type = ArrowPeriodType("D") + pyarrow.register_extension_type(_period_type) + + class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + # register the type with a dummy instance + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/venv/Lib/site-packages/pandas/core/arrays/_ranges.py b/venv/Lib/site-packages/pandas/core/arrays/_ranges.py new file mode 100644 index 0000000..20e4cf7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/_ranges.py @@ -0,0 +1,190 @@ +""" +Helper functions to generate range-like data for DatetimeArray +(and possibly TimedeltaArray/PeriodArray) +""" + +from typing import Tuple + 
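A quick usage sketch for the `_arrow_utils.py` helper added above (assumes pyarrow is installed; the extension-type classes further down additionally require pyarrow >= 0.15):

import numpy as np
import pyarrow as pa
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.int64)
print(data)   # raw buffer values; the slot behind the null is unspecified
print(mask)   # [ True False  True] -- False marks the missing position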
+import numpy as np + +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp + +from pandas.tseries.offsets import DateOffset, Tick, generate_range + + +def generate_regular_range( + start: Timestamp, end: Timestamp, periods: int, freq: DateOffset +) -> Tuple[np.ndarray, str]: + """ + Generate a range of dates with the spans between dates described by + the given `freq` DateOffset. + + Parameters + ---------- + start : Timestamp or None + first point of produced date range + end : Timestamp or None + last point of produced date range + periods : int + number of periods in produced date range + freq : DateOffset + describes space between dates in produced date range + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + """ + if isinstance(freq, Tick): + stride = freq.nanos + if periods is None: + b = Timestamp(start).value + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1 + # end.tz == start.tz by this point due to _generate implementation + tz = start.tz + elif start is not None: + b = Timestamp(start).value + e = _generate_range_overflow_safe(b, periods, stride, side="start") + tz = start.tz + elif end is not None: + e = Timestamp(end).value + stride + b = _generate_range_overflow_safe(e, periods, stride, side="end") + tz = end.tz + else: + raise ValueError( + "at least 'start' or 'end' should be specified " + "if a 'period' is given." + ) + + with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. + try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + + else: + tz = None + # start and end should have the same timezone by this point + if start is not None: + tz = start.tz + elif end is not None: + tz = end.tz + + xdr = generate_range(start=start, end=end, periods=periods, offset=freq) + + values = np.array([x.value for x in xdr], dtype=np.int64) + + return values, tz + + +def _generate_range_overflow_safe( + endpoint: int, periods: int, stride: int, side: str = "start" +) -> int: + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. 
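For context (not part of the patch): `generate_regular_range` above is the internal helper behind `pd.date_range` for fixed (Tick) frequencies, and the evenly strided int64 nanosecond values it produces are visible through `.asi8`:

import pandas as pd

idx = pd.date_range(start="2020-01-31", periods=4, freq="6H")
print(idx)
print(idx.asi8)   # int64 nanosecond timestamps, constant stride of 6*3600*10**9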
+ + Parameters + ---------- + endpoint : int + nanosecond timestamp of the known endpoint of the desired range + periods : int + number of periods in the desired range + stride : int + nanoseconds between periods in the desired range + side : {'start', 'end'} + which end of the range `endpoint` refers to + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ["start", "end"] + + i64max = np.uint64(np.iinfo(np.int64).max) + msg = f"Cannot generate range with {side}={endpoint} and periods={periods}" + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError: + raise OutOfBoundsDatetime(msg) + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed(endpoint, periods, stride, side) + + elif (endpoint > 0 and side == "start" and stride > 0) or ( + endpoint < 0 and side == "end" and stride > 0 + ): + # no chance of not-overflowing + raise OutOfBoundsDatetime(msg) + + elif side == "end" and endpoint > i64max and endpoint - stride <= i64max: + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe( + endpoint - stride, periods - 1, stride, side + ) + + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + +def _generate_range_overflow_safe_signed( + endpoint: int, periods: int, stride: int, side: str +) -> int: + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ["start", "end"] + if side == "end": + stride *= -1 + + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + return np.int64(endpoint) + addend + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + assert result > i64max + if result <= i64max + np.uint64(stride): + return result + + raise OutOfBoundsDatetime( + f"Cannot generate range with {side}={endpoint} and periods={periods}" + ) diff --git a/venv/Lib/site-packages/pandas/core/arrays/base.py b/venv/Lib/site-packages/pandas/core/arrays/base.py new file mode 100644 index 0000000..9723343 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/base.py @@ -0,0 +1,1239 @@ +"""An interface for extending pandas with custom arrays. + +.. 
warning:: + + This is an experimental API and subject to breaking changes + without warning. +""" +import operator +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union + +import numpy as np + +from pandas._libs import lib +from pandas._typing import ArrayLike +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import ops +from pandas.core.algorithms import _factorize_array, unique +from pandas.core.missing import backfill_1d, pad_1d +from pandas.core.sorting import nargsort + +_extension_array_shared_docs: Dict[str, str] = dict() + + +def try_cast_to_ea(cls_or_instance, obj, dtype=None): + """ + Call to `_from_sequence` that returns the object unchanged on Exception. + + Parameters + ---------- + cls_or_instance : ExtensionArray subclass or instance + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + try: + result = cls_or_instance._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result + + +class ExtensionArray: + """ + Abstract base class for custom 1-D array types. + + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. They + may be stored directly inside a :class:`DataFrame` or :class:`Series`. + + .. versionadded:: 0.23.0 + + Attributes + ---------- + dtype + nbytes + ndim + shape + + Methods + ------- + argsort + astype + copy + dropna + factorize + fillna + isna + ravel + repeat + searchsorted + shift + take + unique + view + _concat_same_type + _formatter + _from_factorized + _from_sequence + _from_sequence_of_strings + _ndarray_values + _reduce + _values_for_argsort + _values_for_factorize + + Notes + ----- + The interface includes the following abstract methods that must be + implemented by subclasses: + + * _from_sequence + * _from_factorized + * __getitem__ + * __len__ + * dtype + * nbytes + * isna + * take + * copy + * _concat_same_type + + A default repr displaying the type, (truncated) data, length, + and dtype is provided. It can be customized or replaced by + by overriding: + + * __repr__ : A default repr for the ExtensionArray. + * _formatter : Print scalars inside a Series or DataFrame. + + Some methods require casting the ExtensionArray to an ndarray of Python + objects with ``self.astype(object)``, which may be expensive. When + performance is a concern, we highly recommend overriding the following + methods: + + * fillna + * dropna + * unique + * factorize / _values_for_factorize + * argsort / _values_for_argsort + * searchsorted + + The remaining methods implemented on this class should be performant, + as they only compose abstract methods. Still, a more efficient + implementation may be available, and these methods can be overridden. + + One can implement methods to handle array reductions. 
+ + * _reduce + + One can implement methods to handle parsing from strings that will be used + in methods such as ``pandas.io.parsers.read_csv``. + + * _from_sequence_of_strings + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + + ExtensionArrays are limited to 1 dimension. + + They may be backed by none, one, or many NumPy arrays. For example, + ``pandas.Categorical`` is an extension array backed by two arrays, + one for codes and one for categories. An array of IPv6 address may + be backed by a NumPy structured array with two fields, one for the + lower 64 bits and one for the upper 64 bits. Or they may be backed + by some other storage type, like Python lists. Pandas makes no + assumptions on how the data are stored, just that it can be converted + to a NumPy array. + The ExtensionArray interface does not impose any rules on how this data + is stored. However, currently, the backing data cannot be stored in + attributes called ``.values`` or ``._values`` to ensure full compatibility + with pandas internals. But other names as ``.data``, ``._data``, + ``._items``, ... can be freely used. + + If implementing NumPy's ``__array_ufunc__`` interface, pandas expects + that + + 1. You defer by returning ``NotImplemented`` when any Series are present + in `inputs`. Pandas will extract the arrays and call the ufunc again. + 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. + Pandas inspect this to determine whether the ufunc is valid for the + types present. + + See :ref:`extending.extension.ufunc` for more. + """ + + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. + # Don't override this. + _typ = "extension" + + # ------------------------------------------------------------------------ + # Constructors + # ------------------------------------------------------------------------ + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + """ + Construct a new ExtensionArray from a sequence of scalars. + + Parameters + ---------- + scalars : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : bool, default False + If True, copy the underlying data. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + """Construct a new ExtensionArray from a sequence of strings. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + strings : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : bool, default False + If True, copy the underlying data. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + + @classmethod + def _from_factorized(cls, values, original): + """ + Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. + original : ExtensionArray + The original ExtensionArray that factorize was called on. 
+ + See Also + -------- + factorize + ExtensionArray.factorize + """ + raise AbstractMethodError(cls) + + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + + def __getitem__(self, item): + # type (Any) -> Any + """ + Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + raise AbstractMethodError(self) + + def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: + """ + Set one or more values inplace. + + This method is not required to satisfy the pandas extension array + interface. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + # Some notes to the ExtensionArray implementor who may have ended up + # here. While this method is not required for the interface, if you + # *do* choose to implement __setitem__, then some semantics should be + # observed: + # + # * Setting multiple values : ExtensionArrays should support setting + # multiple values at once, 'key' will be a sequence of integers and + # 'value' will be a same-length sequence. + # + # * Broadcasting : For a sequence 'key' and a scalar 'value', + # each position in 'key' should be set to 'value'. + # + # * Coercion : Most users will expect basic coercion to work. For + # example, a string like '2018-01-01' is coerced to a datetime + # when setting on a datetime64ns array. In general, if the + # __init__ method coerces that value, then so should __setitem__ + # Note, also, that Series/DataFrame.where internally use __setitem__ + # on a copy of the data. + raise NotImplementedError(f"{type(self)} does not implement __setitem__.") + + def __len__(self) -> int: + """ + Length of this array + + Returns + ------- + length : int + """ + raise AbstractMethodError(self) + + def __iter__(self): + """ + Iterate over elements of the array. + """ + # This needs to be implemented so that pandas recognizes extension + # arrays as list-like. The default implementation makes successive + # calls to ``__getitem__``, which may be slower than necessary. + for i in range(len(self)): + yield self[i] + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. 
+ copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + + @property + def dtype(self) -> ExtensionDtype: + """ + An instance of 'ExtensionDtype'. + """ + raise AbstractMethodError(self) + + @property + def shape(self) -> Tuple[int, ...]: + """ + Return a tuple of the array dimensions. + """ + return (len(self),) + + @property + def ndim(self) -> int: + """ + Extension Arrays are only allowed to be 1-dimensional. + """ + return 1 + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + # If this is expensive to compute, return an approximate lower bound + # on the number of bytes needed. + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + + def astype(self, dtype, copy=True): + """ + Cast to a NumPy array with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + """ + return np.array(self, dtype=dtype, copy=copy) + + def isna(self) -> ArrayLike: + """ + A 1-D array indicating if each value is missing. + + Returns + ------- + na_values : Union[np.ndarray, ExtensionArray] + In most cases, this should return a NumPy ndarray. For + exceptional cases like ``SparseArray``, where returning + an ndarray would be expensive, an ExtensionArray may be + returned. + + Notes + ----- + If returning an ExtensionArray, then + + * ``na_values._is_boolean`` should be True + * `na_values` should implement :func:`ExtensionArray._reduce` + * ``na_values.any`` and ``na_values.all`` should be implemented + """ + raise AbstractMethodError(self) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + # Note: this is used in `ExtensionArray.argsort`. + return np.array(self) + + def argsort( + self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs + ) -> np.ndarray: + """ + Return the indices that would sort this array. + + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + *args, **kwargs: + passed through to :func:`numpy.argsort`. 
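As a usage sketch (assuming pandas >= 1.0): the nullable arrays shipped with pandas implement the interface documented above, so `to_numpy` with an explicit `na_value` and `argsort` can be exercised on an Int64 array:

import numpy as np
import pandas as pd

arr = pd.array([2, None, 1], dtype="Int64")              # IntegerArray
print(arr.to_numpy(dtype="float64", na_value=np.nan))    # [ 2. nan  1.]
print(arr.argsort())   # indices that sort the values, missing placed last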
+ + Returns + ------- + ndarray + Array of indices that sort ``self``. If NaN values are contained, + NaN values are placed at the end. + + See Also + -------- + numpy.argsort : Sorting implementation used internally. + """ + # Implementor note: You have two places to override the behavior of + # argsort. + # 1. _values_for_argsort : construct the values passed to np.argsort + # 2. argsort : total control over sorting. + ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) + + result = nargsort(self, kind=kind, ascending=ascending, na_position="last") + return result + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + ExtensionArray + With NA/NaN filled. + """ + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self.astype(object), limit=limit, mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def dropna(self): + """ + Return ExtensionArray without NA values. + + Returns + ------- + valid : ExtensionArray + """ + return self[~self.isna()] + + def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + """ + Shift values by desired number. + + Newly introduced missing values are filled with + ``self.dtype.na_value``. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + periods : int, default 1 + The number of periods to shift. Negative values are allowed + for shifting backwards. + + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default is ``self.dtype.na_value``. + + .. versionadded:: 0.24.0 + + Returns + ------- + ExtensionArray + Shifted. + + Notes + ----- + If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is + returned. + + If ``periods > len(self)``, then an array of size + len(self) is returned, with all values filled with + ``self.dtype.na_value``. + """ + # Note: this implementation assumes that `self.dtype.na_value` can be + # stored in an instance of your ExtensionArray with `self.dtype`. 
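A short sketch of the `fillna` / `dropna` / `shift` behaviour described above, shown on a concrete extension array rather than a custom subclass:

import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")
print(arr.fillna(0))                 # [1, 0, 3]
print(arr.fillna(method="ffill"))    # [1, 1, 3]
print(arr.dropna())                  # [1, 3]
print(arr.shift(1))                  # [<NA>, 1, <NA>] -- new slots get dtype.na_value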
+ if not len(self) or periods == 0: + return self.copy() + + if isna(fill_value): + fill_value = self.dtype.na_value + + empty = self._from_sequence( + [fill_value] * min(abs(periods), len(self)), dtype=self.dtype + ) + if periods > 0: + a = empty + b = self[:-periods] + else: + a = self[abs(periods) :] + b = empty + return self._concat_same_type([a, b]) + + def unique(self): + """ + Compute the ExtensionArray of unique values. + + Returns + ------- + uniques : ExtensionArray + """ + uniques = unique(self.astype(object)) + return self._from_sequence(uniques, dtype=self.dtype) + + def searchsorted(self, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.24.0 + + Find the indices into a sorted array `self` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Assuming that `self` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + # Note: the base tests provided by pandas only test the basics. + # We do not test + # 1. Values outside the range of the `data_for_sorting` fixture + # 2. Values between the values in the `data_for_sorting` fixture + # 3. Missing values. + arr = self.astype(object) + return arr.searchsorted(value, side=side, sorter=sorter) + + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + """ + Return an array and missing value suitable for factorization. + + Returns + ------- + values : ndarray + + An array suitable for factorization. This should maintain order + and be a supported dtype (Float64, Int64, UInt64, String, Object). + By default, the extension array is cast to object dtype. + na_value : object + The value in `values` to consider missing. This will be treated + as NA in the factorization routines, so it will be coded as + `na_sentinal` and not included in `uniques`. By default, + ``np.nan`` is used. + + Notes + ----- + The values returned by this method are also used in + :func:`pandas.util.hash_pandas_object`. + """ + return self.astype(object), np.nan + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: + """ + Encode the extension array as an enumerated type. + + Parameters + ---------- + na_sentinel : int, default -1 + Value to use in the `codes` array to indicate missing values. + + Returns + ------- + codes : ndarray + An integer NumPy array that's an indexer into the original + ExtensionArray. + uniques : ExtensionArray + An ExtensionArray containing the unique values of `self`. + + .. 
note:: + + uniques will *not* contain an entry for the NA value of + the ExtensionArray if there are any missing values present + in `self`. + + See Also + -------- + factorize : Top-level factorize method that dispatches here. + + Notes + ----- + :meth:`pandas.factorize` offers a `sort` keyword as well. + """ + # Implementer note: There are two ways to override the behavior of + # pandas.factorize + # 1. _values_for_factorize and _from_factorize. + # Specify the values passed to pandas' internal factorization + # routines, and how to convert from those values back to the + # original ExtensionArray. + # 2. ExtensionArray.factorize. + # Complete control over factorization. + arr, na_value = self._values_for_factorize() + + codes, uniques = _factorize_array( + arr, na_sentinel=na_sentinel, na_value=na_value + ) + + uniques = self._from_factorized(uniques, self) + return codes, uniques + + _extension_array_shared_docs[ + "repeat" + ] = """ + Repeat elements of a %(klass)s. + + Returns a new %(klass)s where each element of the current %(klass)s + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + %(klass)s. + axis : None + Must be ``None``. Has no effect but is accepted for compatibility + with numpy. + + Returns + ------- + repeated_array : %(klass)s + Newly created %(klass)s with repeated elements. + + See Also + -------- + Series.repeat : Equivalent function for Series. + Index.repeat : Equivalent function for Index. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + ExtensionArray.take : Take arbitrary positions. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat + [a, b, c] + Categories (3, object): [a, b, c] + >>> cat.repeat(2) + [a, a, b, b, c, c] + Categories (3, object): [a, b, c] + >>> cat.repeat([1, 2, 3]) + [a, b, b, c, c, c] + Categories (3, object): [a, b, c] + """ + + @Substitution(klass="ExtensionArray") + @Appender(_extension_array_shared_docs["repeat"]) + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + ind = np.arange(len(self)).repeat(repeats) + return self.take(ind) + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> ABCExtensionArray: + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. 
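To make the `factorize` contract above concrete, here is a sketch using the string extension array (the `"string"` dtype is a pandas 1.0 feature), not part of the patch:

import pandas as pd

arr = pd.array(["a", None, "b", "a"], dtype="string")
codes, uniques = arr.factorize()
print(codes)     # 0, -1, 1, 0 -- missing values are coded with the na_sentinel
print(uniques)   # ['a', 'b'] as a StringArray; NA is not included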
`fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + + Examples + -------- + Here's an example implementation, which relies on casting the + extension array to object dtype. This uses the helper method + :func:`pandas.api.extensions.take`. + + .. code-block:: python + + def take(self, indices, allow_fill=False, fill_value=None): + from pandas.core.algorithms import take + + # If the ExtensionArray is backed by an ndarray, then + # just pass that here instead of coercing to object. + data = self.astype(object) + + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + # fill value should always be translated from the scalar + # type for the array, to the physical storage type for + # the data, before passing to take. + + result = take(data, indices, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result, dtype=self.dtype) + """ + # Implementer note: The `fill_value` parameter should be a user-facing + # value, an instance of self.dtype.type. When passed `fill_value=None`, + # the default of `self.dtype.na_value` should be used. + # This may differ from the physical storage type your ExtensionArray + # uses. In this case, your implementation is responsible for casting + # the user-facing type to the storage type, before using + # pandas.api.extensions.take + raise AbstractMethodError(self) + + def copy(self) -> ABCExtensionArray: + """ + Return a copy of the array. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(self) + + def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + """ + Return a view on the array. + + Parameters + ---------- + dtype : str, np.dtype, or ExtensionDtype, optional + Default None. + + Returns + ------- + ExtensionArray + A view of the :class:`ExtensionArray`. + """ + # NB: + # - This must return a *new* object referencing the same data, not self. + # - The only case that *must* be implemented is with dtype=None, + # giving a view with the same dtype as self. + if dtype is not None: + raise NotImplementedError(dtype) + return self[:] + + # ------------------------------------------------------------------------ + # Printing + # ------------------------------------------------------------------------ + + def __repr__(self) -> str: + from pandas.io.formats.printing import format_object_summary + + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = format_object_summary( + self, self._formatter(), indent_for_name=False + ).rstrip(", \n") + class_name = f"<{type(self).__name__}>\n" + return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" + + def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: + """Formatting function for scalar values. 
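A sketch of `take` with `allow_fill`, using `Categorical` (a concrete ExtensionArray) instead of the abstract base:

import pandas as pd

cat = pd.Categorical(["a", "b", "c"])
print(cat.take([0, -1], allow_fill=True))                   # ['a', NaN]
print(cat.take([0, -1], allow_fill=True, fill_value="b"))   # ['a', 'b']
print(cat.take([0, -1], allow_fill=False))                  # ['a', 'c'] -- -1 counts from the end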
+ + This is used in the default '__repr__'. The returned formatting + function receives instances of your scalar type. + + Parameters + ---------- + boxed : bool, default False + An indicated for whether or not your array is being printed + within a Series, DataFrame, or Index (True), or just by + itself (False). This may be useful if you want scalar values + to appear differently within a Series versus on its own (e.g. + quoted or not). + + Returns + ------- + Callable[[Any], str] + A callable that gets instances of the scalar type and + returns a string. By default, :func:`repr` is used + when ``boxed=False`` and :func:`str` is used when + ``boxed=True``. + """ + if boxed: + return str + return repr + + # ------------------------------------------------------------------------ + # Reshaping + # ------------------------------------------------------------------------ + + def ravel(self, order="C") -> ABCExtensionArray: + """ + Return a flattened view on this array. + + Parameters + ---------- + order : {None, 'C', 'F', 'A', 'K'}, default 'C' + + Returns + ------- + ExtensionArray + + Notes + ----- + - Because ExtensionArrays are 1D-only, this is a no-op. + - The "order" argument is ignored, is for compatibility with NumPy. + """ + return self + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence[ABCExtensionArray] + ) -> ABCExtensionArray: + """ + Concatenate multiple array. + + Parameters + ---------- + to_concat : sequence of this type + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + + # The _can_hold_na attribute is set to True so that pandas internals + # will use the ExtensionDtype.na_value as the NA value in operations + # such as take(), reindex(), shift(), etc. In addition, those results + # will then be of the ExtensionArray subclass rather than an array + # of objects + _can_hold_na = True + + @property + def _ndarray_values(self) -> np.ndarray: + """ + Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + + Returns + ------- + array : ndarray + """ + return np.array(self) + + def _reduce(self, name, skipna=True, **kwargs): + """ + Return a scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + raise TypeError(f"cannot perform {name} with type {self.dtype}") + + +class ExtensionOpsMixin: + """ + A base class for linking the operators to their dunder names. + + .. note:: + + You may want to set ``__array_priority__`` if you want your + implementation to be called when involved in binary operations + with NumPy arrays. 
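For context on `_reduce` above: Series reductions on an extension-backed column dispatch to it, e.g. on a nullable integer Series (illustrative, not part of the patch):

import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
print(s.sum())           # 4 -- routed through IntegerArray._reduce, skipna=True by default
print(s.min(), s.max())  # 1 3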
+ """ + + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) + cls.__sub__ = cls._create_arithmetic_method(operator.sub) + cls.__rsub__ = cls._create_arithmetic_method(ops.rsub) + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) + cls.__pow__ = cls._create_arithmetic_method(operator.pow) + cls.__rpow__ = cls._create_arithmetic_method(ops.rpow) + cls.__mod__ = cls._create_arithmetic_method(operator.mod) + cls.__rmod__ = cls._create_arithmetic_method(ops.rmod) + cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv) + cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv) + cls.__truediv__ = cls._create_arithmetic_method(operator.truediv) + cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv) + cls.__divmod__ = cls._create_arithmetic_method(divmod) + cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) + + @classmethod + def _add_comparison_ops(cls): + cls.__eq__ = cls._create_comparison_method(operator.eq) + cls.__ne__ = cls._create_comparison_method(operator.ne) + cls.__lt__ = cls._create_comparison_method(operator.lt) + cls.__gt__ = cls._create_comparison_method(operator.gt) + cls.__le__ = cls._create_comparison_method(operator.le) + cls.__ge__ = cls._create_comparison_method(operator.ge) + + @classmethod + def _add_logical_ops(cls): + cls.__and__ = cls._create_logical_method(operator.and_) + cls.__rand__ = cls._create_logical_method(ops.rand_) + cls.__or__ = cls._create_logical_method(operator.or_) + cls.__ror__ = cls._create_logical_method(ops.ror_) + cls.__xor__ = cls._create_logical_method(operator.xor) + cls.__rxor__ = cls._create_logical_method(ops.rxor) + + +class ExtensionScalarOpsMixin(ExtensionOpsMixin): + """ + A mixin for defining ops on an ExtensionArray. + + It is assumed that the underlying scalar objects have the operators + already defined. + + Notes + ----- + If you have defined a subclass MyExtensionArray(ExtensionArray), then + use MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to + get the arithmetic operators. After the definition of MyExtensionArray, + insert the lines + + MyExtensionArray._add_arithmetic_ops() + MyExtensionArray._add_comparison_ops() + + to link the operators to your class. + + .. note:: + + You may want to set ``__array_priority__`` if you want your + implementation to be called when involved in binary operations + with NumPy arrays. + """ + + @classmethod + def _create_method(cls, op, coerce_to_dtype=True): + """ + A class method that returns a method that will correspond to an + operator for an ExtensionArray subclass, by dispatching to the + relevant operator defined on the individual elements of the + ExtensionArray. + + Parameters + ---------- + op : function + An operator that takes arguments op(a, b) + coerce_to_dtype : bool, default True + boolean indicating whether to attempt to convert + the result to the underlying ExtensionArray dtype. + If it's not possible to create a new ExtensionArray with the + values, an ndarray is returned instead. + + Returns + ------- + Callable[[Any, Any], Union[ndarray, ExtensionArray]] + A method that can be bound to a class. When used, the method + receives the two arguments, one of which is the instance of + this class, and should return an ExtensionArray or an ndarray. + + Returning an ndarray may be necessary when the result of the + `op` cannot be stored in the ExtensionArray. 
The dtype of the + ndarray uses NumPy's normal inference rules. + + Examples + -------- + Given an ExtensionArray subclass called MyExtensionArray, use + + >>> __add__ = cls._create_method(operator.add) + + in the class definition of MyExtensionArray to create the operator + for addition, that will be based on the operator implementation + of the underlying elements of the ExtensionArray + """ + + def _binop(self, other): + def convert_values(param): + if isinstance(param, ExtensionArray) or is_list_like(param): + ovalues = param + else: # Assume its an object + ovalues = [param] * len(self) + return ovalues + + if isinstance(other, (ABCSeries, ABCIndexClass)): + # rely on pandas to unbox and dispatch to us + return NotImplemented + + lvalues = self + rvalues = convert_values(other) + + # If the operator is not defined for the underlying objects, + # a TypeError should be raised + res = [op(a, b) for (a, b) in zip(lvalues, rvalues)] + + def _maybe_convert(arr): + if coerce_to_dtype: + # https://github.com/pandas-dev/pandas/issues/22850 + # We catch all regular exceptions here, and fall back + # to an ndarray. + res = try_cast_to_ea(self, arr) + if not isinstance(res, type(self)): + # exception raised in _from_sequence; ensure we have ndarray + res = np.asarray(arr) + else: + res = np.asarray(arr) + return res + + if op.__name__ in {"divmod", "rdivmod"}: + a, b = zip(*res) + return _maybe_convert(a), _maybe_convert(b) + + return _maybe_convert(res) + + op_name = ops._get_op_name(op, True) + return set_function_name(_binop, op_name, cls) + + @classmethod + def _create_arithmetic_method(cls, op): + return cls._create_method(op) + + @classmethod + def _create_comparison_method(cls, op): + return cls._create_method(op, coerce_to_dtype=False) diff --git a/venv/Lib/site-packages/pandas/core/arrays/boolean.py b/venv/Lib/site-packages/pandas/core/arrays/boolean.py new file mode 100644 index 0000000..7b12f33 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/boolean.py @@ -0,0 +1,789 @@ +import numbers +from typing import TYPE_CHECKING, Any, List, Tuple, Type +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import nanops, ops + +from .masked import BaseMaskedArray + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +@register_extension_dtype +class BooleanDtype(ExtensionDtype): + """ + Extension dtype for boolean data. + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.BooleanDtype() + BooleanDtype + """ + + name = "boolean" + + @property + def na_value(self) -> "Scalar": + """ + BooleanDtype uses :attr:`pandas.NA` as the missing NA value. + + .. warning:: + + `na_value` may change in a future release. 
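A minimal, self-contained sketch of the pattern the `ExtensionScalarOpsMixin` notes above describe, not part of the patch; `MoneyDtype` and `MoneyArray` are made-up illustration names, not pandas APIs:

import numpy as np
from pandas.api.extensions import ExtensionArray, ExtensionDtype, ExtensionScalarOpsMixin


class MoneyDtype(ExtensionDtype):
    # minimal dtype: just 'name', 'type' and construct_array_type()
    name = "money"
    type = float
    na_value = np.nan

    @classmethod
    def construct_array_type(cls):
        return MoneyArray


class MoneyArray(ExtensionArray, ExtensionScalarOpsMixin):
    """Toy 1-D extension array backed by a float64 ndarray."""

    def __init__(self, values):
        self._data = np.asarray(values, dtype="float64")

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values)

    def __getitem__(self, item):
        result = self._data[item]
        if np.ndim(result) == 0:       # scalar position
            return result
        return type(self)(result)      # slice / boolean mask -> new array

    def __len__(self):
        return len(self._data)

    @property
    def dtype(self):
        return MoneyDtype()

    @property
    def nbytes(self):
        return self._data.nbytes

    def isna(self):
        return np.isnan(self._data)

    def take(self, indices, allow_fill=False, fill_value=None):
        from pandas.core.algorithms import take

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value
        data = take(self._data, indices, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(data)

    def copy(self):
        return type(self)(self._data.copy())

    @classmethod
    def _concat_same_type(cls, to_concat):
        return cls(np.concatenate([arr._data for arr in to_concat]))


# Link the scalar-based operators onto the class, as the mixin docstring suggests.
MoneyArray._add_arithmetic_ops()
MoneyArray._add_comparison_ops()

arr = MoneyArray([1.0, 2.0, 3.0])
print(arr + arr)    # element-wise add via the scalar op, re-wrapped as MoneyArray
print(arr > 1.5)    # comparisons come back as a plain bool ndarray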
+ """ + return libmissing.NA + + @property + def type(self) -> Type: + return np.bool_ + + @property + def kind(self) -> str: + return "b" + + @classmethod + def construct_array_type(cls) -> "Type[BooleanArray]": + return BooleanArray + + def __repr__(self) -> str: + return "BooleanDtype" + + @property + def _is_boolean(self) -> bool: + return True + + def __from_arrow__(self, array): + """Construct BooleanArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # TODO should optimize this without going through object array + bool_arr = BooleanArray._from_sequence(np.array(arr)) + results.append(bool_arr) + + return BooleanArray._concat_same_type(results) + + +def coerce_to_array(values, mask=None, copy: bool = False): + """ + Coerce the input values array to numpy arrays with a mask. + + Parameters + ---------- + values : 1D list-like + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + if isinstance(values, BooleanArray): + if mask is not None: + raise ValueError("cannot pass mask for BooleanArray input") + values, mask = values._data, values._mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + mask_values = None + if isinstance(values, np.ndarray) and values.dtype == np.bool_: + if copy: + values = values.copy() + elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype): + mask_values = isna(values) + + values_bool = np.zeros(len(values), dtype=bool) + values_bool[~mask_values] = values[~mask_values].astype(bool) + + if not np.all( + values_bool[~mask_values].astype(values.dtype) == values[~mask_values] + ): + raise TypeError("Need to pass bool-like values") + + values = values_bool + else: + values_object = np.asarray(values, dtype=object) + + inferred_dtype = lib.infer_dtype(values_object, skipna=True) + integer_like = ("floating", "integer", "mixed-integer-float") + if inferred_dtype not in ("boolean", "empty") + integer_like: + raise TypeError("Need to pass bool-like values") + + mask_values = isna(values_object) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_object[~mask_values].astype(bool) + + # if the values were integer-like, validate it were actually 0/1's + if inferred_dtype in integer_like: + if not np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ): + raise TypeError("Need to pass bool-like values") + + if mask is None and mask_values is None: + mask = np.zeros(len(values), dtype=bool) + elif mask is None: + mask = mask_values + else: + if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: + if mask_values is not None: + mask = mask | mask_values + else: + if copy: + mask = mask.copy() + else: + mask = np.array(mask, dtype=bool) + if mask_values is not None: + mask = mask | mask_values + + if not values.ndim == 1: + raise ValueError("values must be a 1D list-like") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D list-like") + + return values, mask + + +class BooleanArray(BaseMaskedArray): + """ + Array of boolean (True/False) data with missing values. + + This is a pandas Extension array for boolean data, under the hood + represented by 2 numpy arrays: a boolean array with the data and + a boolean array with the mask (True indicating missing). 
+ + BooleanArray implements Kleene logic (sometimes called three-value + logic) for logical operations. See :ref:`boolean.kleene` for more. + + To construct an BooleanArray from generic array-like input, use + :func:`pandas.array` specifying ``dtype="boolean"`` (see examples + below). + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : numpy.ndarray + A 1-d boolean-dtype array with the data. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values (True + indicates missing). + copy : bool, default False + Whether to copy the `values` and `mask` arrays. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BooleanArray + + Examples + -------- + Create an BooleanArray with :func:`pandas.array`: + + >>> pd.array([True, False, None], dtype="boolean") + + [True, False, ] + Length: 3, dtype: boolean + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): + raise TypeError( + "values should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + self._dtype = BooleanDtype() + + @property + def dtype(self): + return self._dtype + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False): + if dtype: + assert dtype == "boolean" + values, mask = coerce_to_array(scalars, copy=copy) + return BooleanArray(values, mask) + + @classmethod + def _from_sequence_of_strings( + cls, strings: List[str], dtype=None, copy: bool = False + ): + def map_string(s): + if isna(s): + return s + elif s in ["True", "TRUE", "true"]: + return True + elif s in ["False", "FALSE", "false"]: + return False + else: + raise ValueError(f"{s} cannot be cast to bool") + + scalars = [map_string(x) for x in strings] + return cls._from_sequence(scalars, dtype, copy) + + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + data = self._data.astype("int8") + data[self._mask] = -1 + return data, -1 + + @classmethod + def _from_factorized(cls, values, original: "BooleanArray"): + return cls._from_sequence(values, dtype=original.dtype) + + _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For BooleanArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. 
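+ # A short sketch of the Kleene (three-valued) logic mentioned in the class docstring
+ # above (illustrative only; assumes pandas >= 1.0):
+ #
+ #   import pandas as pd
+ #   a = pd.array([True, False, pd.NA], dtype="boolean")
+ #   a | True    # -> [True, True, True]     (True | NA is True)
+ #   a & True    # -> [True, False, <NA>]
+ #   a | pd.NA   # -> [True, <NA>, <NA>]     (False | NA stays NA)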
+ raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (BooleanArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, BooleanArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. + + if is_bool_dtype(x.dtype): + m = mask.copy() + return BooleanArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def __setitem__(self, key, value): + _is_scalar = is_scalar(value) + if _is_scalar: + value = [value] + value, mask = coerce_to_array(value) + + if _is_scalar: + value = value[0] + mask = mask[0] + + self._data[key] = value + self._mask[key] = mask + + def astype(self, dtype, copy=True): + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray or ExtensionArray + NumPy ndarray, BooleanArray or IntergerArray with 'dtype' for its dtype. + + Raises + ------ + TypeError + if incompatible type with an BooleanDtype, equivalent of same_kind + casting + """ + dtype = pandas_dtype(dtype) + + if isinstance(dtype, BooleanDtype): + values, mask = coerce_to_array(self, copy=copy) + return BooleanArray(values, mask, copy=False) + + if is_bool_dtype(dtype): + # astype_nansafe converts np.nan to True + if self._hasna: + raise ValueError("cannot convert float NaN to bool") + else: + return self._data.astype(dtype, copy=copy) + if is_extension_array_dtype(dtype) and is_integer_dtype(dtype): + from pandas.core.arrays import IntegerArray + + return IntegerArray( + self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False + ) + # for integer, error if there are missing values + if is_integer_dtype(dtype): + if self._hasna: + raise ValueError("cannot convert NA to integer") + # for float dtype, ensure we use np.nan before casting (numpy cannot + # deal with pd.NA) + na_value = self._na_value + if is_float_dtype(dtype): + na_value = np.nan + # coerce + data = self.to_numpy(na_value=na_value) + return astype_nansafe(data, dtype, copy=False) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = -1 + return data + + def any(self, skipna: bool = True, **kwargs): + """ + Return whether any element is True. + + Returns False unless there is at least one element that is True. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. 
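+ # A sketch of the conversions implemented by astype above (illustrative only;
+ # assumes pandas >= 1.0):
+ #
+ #   import pandas as pd
+ #   a = pd.array([True, False, pd.NA], dtype="boolean")
+ #   a.astype("Int64")    # -> IntegerArray [1, 0, <NA>]
+ #   a.astype("float")    # -> ndarray [1.0, 0.0, nan]  (pd.NA becomes np.nan)
+ #   a.astype("bool")     # raises ValueError because the array holds a missing value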
+ + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is True, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.any : Numpy version of this method. + BooleanArray.all : Return whether all elements are True. + + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, False, True]).any() + True + >>> pd.array([True, False, pd.NA]).any() + True + >>> pd.array([False, False, pd.NA]).any() + False + >>> pd.array([], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="boolean").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA]).any(skipna=False) + True + >>> pd.array([False, False, pd.NA]).any(skipna=False) + + """ + kwargs.pop("axis", None) + nv.validate_any((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, False) + result = values.any() + if skipna: + return result + else: + if result or len(self) == 0: + return result + else: + return self.dtype.na_value + + def all(self, skipna: bool = True, **kwargs): + """ + Return whether all elements are True. + + Returns True unless there is at least one element that is False. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is False, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.all : Numpy version of this method. + BooleanArray.any : Return whether any element is True. 
+ + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA]).all() + True + >>> pd.array([True, False, pd.NA]).all() + False + >>> pd.array([], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="boolean").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA]).all(skipna=False) + + >>> pd.array([True, False, pd.NA]).all(skipna=False) + False + """ + kwargs.pop("axis", None) + nv.validate_all((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, True) + result = values.all() + + if skipna: + return result + else: + if not result or len(self) == 0: + return result + else: + return self.dtype.na_value + + @classmethod + def _create_logical_method(cls, op): + def logical_method(self, other): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} + other = lib.item_from_zerodim(other) + other_is_booleanarray = isinstance(other, BooleanArray) + other_is_scalar = lib.is_scalar(other) + mask = None + + if other_is_booleanarray: + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() + + if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + raise TypeError( + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." + ) + + if not other_is_scalar and len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if op.__name__ in {"or_", "ror_"}: + result, mask = ops.kleene_or(self._data, other, self._mask, mask) + elif op.__name__ in {"and_", "rand_"}: + result, mask = ops.kleene_and(self._data, other, self._mask, mask) + elif op.__name__ in {"xor", "rxor"}: + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) + + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(logical_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + from pandas.arrays import IntegerArray + + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): + # Rely on pandas to unbox and dispatch to us. 
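+ # A sketch of how the comparison methods defined below propagate the mask
+ # (illustrative only; assumes pandas >= 1.0):
+ #
+ #   import pandas as pd
+ #   a = pd.array([True, False, pd.NA], dtype="boolean")
+ #   a == True    # -> [True, False, <NA>]   (missing entries stay missing)
+ #   a == pd.NA   # -> [<NA>, <NA>, <NA>]    (comparison with NA is always NA)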
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + result = np.zeros_like(self._data) + mask = np.ones_like(self._data) + else: + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask, copy=False) + + name = f"__{op.__name__}" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + + if name in {"any", "all"}: + return getattr(self, name)(skipna=skipna, **kwargs) + + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + if np.isnan(result): + return libmissing.NA + + # if we have numeric op that would result in an int, coerce to int if possible + if name in ["sum", "prod"] and notna(result): + int_result = np.int64(result) + if int_result == result: + result = int_result + + elif name in ["min", "max"] and notna(result): + result = np.bool_(result) + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] = np.nan + return result + + if is_bool_dtype(result): + return BooleanArray(result, mask, copy=False) + + elif is_integer_dtype(result): + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + else: + result[mask] = np.nan + return result + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + def boolean_arithmetic_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
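+ # A sketch of how _reduce behaves when reached through a Series (illustrative only;
+ # assumes pandas >= 1.0):
+ #
+ #   import pandas as pd
+ #   s = pd.Series([True, False, pd.NA], dtype="boolean")
+ #   s.sum()                # -> 1      (missing values skipped by default)
+ #   s.sum(skipna=False)    # -> <NA>   (a NaN result is translated to pd.NA)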
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op_name}__" + return set_function_name(boolean_arithmetic_method, name, cls) + + +BooleanArray._add_logical_ops() +BooleanArray._add_comparison_ops() +BooleanArray._add_arithmetic_ops() diff --git a/venv/Lib/site-packages/pandas/core/arrays/categorical.py b/venv/Lib/site-packages/pandas/core/arrays/categorical.py new file mode 100644 index 0000000..105b14a --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/categorical.py @@ -0,0 +1,2688 @@ +import operator +from shutil import get_terminal_size +from typing import Dict, Hashable, List, Type, Union, cast +from warnings import warn + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import algos as libalgos, hashtable as htable +from pandas._typing import ArrayLike, Dtype, Ordered, Scalar +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, + deprecate_kwarg, +) +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs + +from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_object, + ensure_platform_int, + is_categorical_dtype, + is_datetime64_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import ops +from pandas.core.accessor import PandasDelegate, delegate_names +import pandas.core.algorithms as algorithms +from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d +from pandas.core.arrays.base import ( + ExtensionArray, + _extension_array_shared_docs, + try_cast_to_ea, +) +from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs +import pandas.core.common as com +from pandas.core.construction import array, extract_array, sanitize_array +from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing +from pandas.core.missing import interpolate_2d +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.sorting import nargsort + +from pandas.io.formats import console + + +def _cat_compare_op(op): + opname = f"__{op.__name__}__" + + @unpack_zerodim_and_defer(opname) + def func(self, other): + if is_list_like(other) and len(other) != len(self): + # TODO: Could this fail if the categories are 
listlike objects? + raise ValueError("Lengths must match.") + + if not self.ordered: + if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: + raise TypeError( + "Unordered Categoricals can only compare equality or not" + ) + if isinstance(other, Categorical): + # Two Categoricals can only be be compared if the categories are + # the same (maybe up to ordering, depending on ordered) + + msg = "Categoricals can only be compared if 'categories' are the same." + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif self.ordered and not (self.categories == other.categories).all(): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): + raise TypeError(msg) + + if not (self.ordered == other.ordered): + raise TypeError( + "Categoricals can only be compared if 'ordered' is the same" + ) + if not self.ordered and not self.categories.equals(other.categories): + # both unordered and different order + other_codes = _get_codes_for_values(other, self.categories) + else: + other_codes = other._codes + + f = getattr(self._codes, opname) + ret = f(other_codes) + mask = (self._codes == -1) | (other_codes == -1) + if mask.any(): + # In other series, the leads to False, so do that here too + ret[mask] = False + return ret + + if is_scalar(other): + if other in self.categories: + i = self.categories.get_loc(other) + ret = getattr(self._codes, opname)(i) + + if opname not in {"__eq__", "__ge__", "__gt__"}: + # check for NaN needed if we are not equal or larger + mask = self._codes == -1 + ret[mask] = False + return ret + else: + if opname == "__eq__": + return np.zeros(len(self), dtype=bool) + elif opname == "__ne__": + return np.ones(len(self), dtype=bool) + else: + raise TypeError( + f"Cannot compare a Categorical for op {opname} with a " + "scalar, which is not a category." + ) + else: + + # allow categorical vs object dtype array comparisons for equality + # these are only positional comparisons + if opname in ["__eq__", "__ne__"]: + return getattr(np.array(self), opname)(np.array(other)) + + raise TypeError( + f"Cannot compare a Categorical for op {opname} with " + f"type {type(other)}.\nIf you want to compare values, " + "use 'np.asarray(cat) other'." + ) + + func.__name__ = opname + + return func + + +def contains(cat, key, container): + """ + Helper for membership check for ``key`` in ``cat``. + + This is a helper method for :method:`__contains__` + and :class:`CategoricalIndex.__contains__`. + + Returns True if ``key`` is in ``cat.categories`` and the + location of ``key`` in ``categories`` is in ``container``. + + Parameters + ---------- + cat : :class:`Categorical`or :class:`categoricalIndex` + key : a hashable object + The key to check membership for. + container : Container (e.g. list-like or mapping) + The container to check for membership in. + + Returns + ------- + is_in : bool + True if ``key`` is in ``self.categories`` and location of + ``key`` in ``categories`` is in ``container``, else False. + + Notes + ----- + This method does not check for NaN values. Do that separately + before calling this method. + """ + hash(key) + + # get location of key in categories. + # If a KeyError, the key isn't in categories, so logically + # can't be in container either. + try: + loc = cat.categories.get_loc(key) + except (KeyError, TypeError): + return False + + # loc is the location of key in categories, but also the *value* + # for key in container. So, `key` may be in categories, + # but still not in `container`. 
Example ('b' in categories, + # but not in values): + # 'b' in Categorical(['a'], categories=['a', 'b']) # False + if is_scalar(loc): + return loc in container + else: + # if categories is an IntervalIndex, loc is an array. + return any(loc_ in container for loc_ in loc) + + +_codes_doc = """ +The category codes of this categorical. + +Level codes are an array if integer which are the positions of the real +values in the categories array. + +There is not setter, use the other categorical methods and the normal item +setter to change values in the categorical. +""" + + +class Categorical(ExtensionArray, PandasObject): + """ + Represent a categorical variable in classic R / S-plus fashion. + + `Categoricals` can only take on only a limited, and usually fixed, number + of possible values (`categories`). In contrast to statistical categorical + variables, a `Categorical` might have an order, but numerical operations + (additions, divisions, ...) are not possible. + + All values of the `Categorical` are either in `categories` or `np.nan`. + Assigning values outside of `categories` will raise a `ValueError`. Order + is defined by the order of the `categories`, not lexical order of the + values. + + Parameters + ---------- + values : list-like + The values of the categorical. If categories are given, values not in + categories will be replaced with NaN. + categories : Index-like (unique), optional + The unique categories for this categorical. If not given, the + categories are assumed to be the unique values of `values` (sorted, if + possible, otherwise in the order in which they appear). + ordered : bool, default False + Whether or not this categorical is treated as a ordered categorical. + If True, the resulting categorical will be ordered. + An ordered categorical respects, when sorted, the order of its + `categories` attribute (which in turn is the `categories` argument, if + provided). + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical. + + .. versionadded:: 0.21.0 + + Attributes + ---------- + categories : Index + The categories of this categorical + codes : ndarray + The codes (integer positions, which point to the categories) of this + categorical, read only. + ordered : bool + Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + .. versionadded:: 0.21.0 + + Methods + ------- + from_codes + __array__ + + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. + + See Also + -------- + CategoricalDtype : Type for categorical data. + CategoricalIndex : An Index with an underlying ``Categorical``. + + Notes + ----- + See the `user guide + `_ + for more. + + Examples + -------- + >>> pd.Categorical([1, 2, 3, 1, 2, 3]) + [1, 2, 3, 1, 2, 3] + Categories (3, int64): [1, 2, 3] + + >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + [a, b, c, a, b, c] + Categories (3, object): [a, b, c] + + Ordered `Categoricals` can be sorted according to the custom order + of the categories and can have a min and max value. + + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, + ... 
categories=['c', 'b', 'a']) + >>> c + [a, b, c, a, b, c] + Categories (3, object): [c < b < a] + >>> c.min() + 'c' + """ + + # For comparisons, so that numpy uses our implementation if the compare + # ops, which raise + __array_priority__ = 1000 + _dtype = CategoricalDtype(ordered=False) + # tolist is not actually deprecated, just suppressed in the __dir__ + _deprecations = PandasObject._deprecations | frozenset(["tolist"]) + _typ = "categorical" + + def __init__( + self, values, categories=None, ordered=None, dtype=None, fastpath=False + ): + + dtype = CategoricalDtype._from_values_or_dtype( + values, categories, ordered, dtype + ) + # At this point, dtype is always a CategoricalDtype, but + # we may have dtype.categories be None, and we need to + # infer categories in a factorization step further below + + if fastpath: + self._codes = coerce_indexer_dtype(values, dtype.categories) + self._dtype = self._dtype.update_dtype(dtype) + return + + # null_mask indicates missing values we want to exclude from inference. + # This means: only missing values in list-likes (not arrays/ndframes). + null_mask = np.array(False) + + # sanitize input + if is_categorical_dtype(values): + if dtype.categories is None: + dtype = CategoricalDtype(values.categories, dtype.ordered) + elif not isinstance(values, (ABCIndexClass, ABCSeries)): + # sanitize_array coerces np.nan to a string under certain versions + # of numpy + values = maybe_infer_to_datetimelike(values, convert_dates=True) + if not isinstance(values, np.ndarray): + values = _convert_to_list_like(values) + + # By convention, empty lists result in object dtype: + if len(values) == 0: + sanitize_dtype = "object" + else: + sanitize_dtype = None + null_mask = isna(values) + if null_mask.any(): + values = [values[idx] for idx in np.where(~null_mask)[0]] + values = sanitize_array(values, None, dtype=sanitize_dtype) + + if dtype.categories is None: + try: + codes, categories = factorize(values, sort=True) + except TypeError: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) + except ValueError: + + # FIXME + raise NotImplementedError( + "> 1 ndim Categorical are not supported at this time" + ) + + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) + + elif is_categorical_dtype(values): + old_codes = ( + values._values.codes if isinstance(values, ABCSeries) else values.codes + ) + codes = _recode_for_categories( + old_codes, values.dtype.categories, dtype.categories + ) + + else: + codes = _get_codes_for_values(values, dtype.categories) + + if null_mask.any(): + # Reinsert -1 placeholders for previously removed missing values + full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) + full_codes[~null_mask] = codes + codes = full_codes + + self._dtype = self._dtype.update_dtype(dtype) + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. 
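+ # A quick construction sketch matching the behaviour described in the class
+ # docstring above: values outside the given categories become NaN, and the codes
+ # use -1 as the missing sentinel (illustrative only; assumes pandas >= 1.0):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['a', 'b', 'a', 'd'], categories=['a', 'b', 'c'])
+ #   c             # -> [a, b, a, NaN], Categories (3, object): [a, b, c]
+ #   c.codes       # -> array([ 0,  1,  0, -1], dtype=int8)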
+ + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + self._dtype = new_dtype + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + """ + return self.dtype.ordered + + @property + def dtype(self) -> CategoricalDtype: + """ + The :class:`~pandas.api.types.CategoricalDtype` for this instance. + """ + return self._dtype + + @property + def _ndarray_values(self) -> np.ndarray: + return self.codes + + @property + def _constructor(self) -> Type["Categorical"]: + return Categorical + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return Categorical(scalars, dtype=dtype) + + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. + return None + + def copy(self) -> "Categorical": + """ + Copy constructor. + """ + return self._constructor( + values=self._codes.copy(), dtype=self.dtype, fastpath=True + ) + + def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + """ + if is_categorical_dtype(dtype): + dtype = cast(Union[str, CategoricalDtype], dtype) + + # GH 10696/18593 + dtype = self.dtype.update_dtype(dtype) + self = self.copy() if copy else self + if dtype == self.dtype: + return self + return self._set_dtype(dtype) + if is_extension_array_dtype(dtype): + return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 + if is_integer_dtype(dtype) and self.isna().any(): + raise ValueError("Cannot convert float NaN to integer") + return np.array(self, dtype=dtype, copy=copy) + + @cache_readonly + def size(self) -> int: + """ + Return the len of myself. + """ + return self._codes.size + + @cache_readonly + def itemsize(self) -> int: + """ + return the size of a single category + """ + return self.categories.itemsize + + def tolist(self) -> List[Scalar]: + """ + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + """ + return list(self) + + to_list = tolist + + @classmethod + def _from_inferred_categories( + cls, inferred_categories, inferred_codes, dtype, true_values=None + ): + """ + Construct a Categorical from inferred values. + + For inferred categories (`dtype` is None) the categories are sorted. + For explicit `dtype`, the `inferred_categories` are cast to the + appropriate type. 
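+ # A sketch of the categories setter defined above: assignment renames in place and
+ # the new categories must have the same length (illustrative only):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['a', 'b', 'a'])
+ #   c.categories = ['x', 'y']        # values become [x, y, x]
+ #   c.categories = ['x', 'y', 'z']   # would raise ValueError: lengths differ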
+ + Parameters + ---------- + inferred_categories : Index + inferred_codes : Index + dtype : CategoricalDtype or 'category' + true_values : list, optional + If none are provided, the default ones are + "True", "TRUE", and "true." + + Returns + ------- + Categorical + """ + from pandas import Index, to_numeric, to_datetime, to_timedelta + + cats = Index(inferred_categories) + known_categories = ( + isinstance(dtype, CategoricalDtype) and dtype.categories is not None + ) + + if known_categories: + # Convert to a specialized type with `dtype` if specified. + if dtype.categories.is_numeric(): + cats = to_numeric(inferred_categories, errors="coerce") + elif is_datetime64_dtype(dtype.categories): + cats = to_datetime(inferred_categories, errors="coerce") + elif is_timedelta64_dtype(dtype.categories): + cats = to_timedelta(inferred_categories, errors="coerce") + elif dtype.categories.is_boolean(): + if true_values is None: + true_values = ["True", "TRUE", "true"] + + cats = cats.isin(true_values) + + if known_categories: + # Recode from observation order to dtype.categories order. + categories = dtype.categories + codes = _recode_for_categories(inferred_codes, cats, categories) + elif not cats.is_monotonic_increasing: + # Sort categories and recode for unknown categories. + unsorted = cats.copy() + categories = cats.sort_values() + + codes = _recode_for_categories(inferred_codes, unsorted, categories) + dtype = CategoricalDtype(categories, ordered=False) + else: + dtype = CategoricalDtype(cats, ordered=False) + codes = inferred_codes + + return cls(codes, dtype=dtype, fastpath=True) + + @classmethod + def from_codes(cls, codes, categories=None, ordered=None, dtype=None): + """ + Make a Categorical type from codes and categories or dtype. + + This constructor is useful if you already have codes and + categories/dtype and so do not need the (computation intensive) + factorization step, which is usually done on the constructor. + + If your data does not follow this convention, please use the normal + constructor. + + Parameters + ---------- + codes : array-like of int + An integer array, where each integer points to a category in + categories or dtype.categories, or else is -1 for NaN. + categories : index-like, optional + The categories for the categorical. Items need to be unique. + If the categories are not given here, then they must be provided + in `dtype`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + + .. versionadded:: 0.24.0 + + When `dtype` is provided, neither `categories` nor `ordered` + should be provided. + + Returns + ------- + Categorical + + Examples + -------- + >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) + [a, b, a, b] + Categories (2, object): [a < b] + """ + dtype = CategoricalDtype._from_values_or_dtype( + categories=categories, ordered=ordered, dtype=dtype + ) + if dtype.categories is None: + msg = ( + "The categories must be provided in 'categories' or " + "'dtype'. Both were None." 
+ ) + raise ValueError(msg) + + codes = np.asarray(codes) # #21767 + if len(codes) and not is_integer_dtype(codes): + raise ValueError("codes need to be array-like integers") + + if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and len(categories)-1") + + return cls(codes, dtype=dtype, fastpath=True) + + def _get_codes(self): + """ + Get the codes. + + Returns + ------- + codes : integer array view + A non writable view of the `codes` array. + """ + v = self._codes.view() + v.flags.writeable = False + return v + + def _set_codes(self, codes): + """ + Not settable by the user directly + """ + raise ValueError("cannot set Categorical codes directly") + + codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) + + def _set_categories(self, categories, fastpath=False): + """ + Sets new categories inplace + + Parameters + ---------- + fastpath : bool, default False + Don't perform validation of the categories for uniqueness or nulls + + Examples + -------- + >>> c = pd.Categorical(['a', 'b']) + >>> c + [a, b] + Categories (2, object): [a, b] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + [a, c] + Categories (2, object): [a, c] + """ + + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if ( + not fastpath + and self.dtype.categories is not None + and len(new_dtype.categories) != len(self.dtype.categories) + ): + raise ValueError( + "new categories need to have the same number of " + "items than the old categories!" + ) + + self._dtype = new_dtype + + def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical": + """ + Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. + """ + codes = _recode_for_categories(self.codes, self.categories, dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) + + def set_ordered(self, value, inplace=False): + """ + Set the ordered attribute to the boolean value. + + Parameters + ---------- + value : bool + Set whether this categorical is ordered (True) or not (False). + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to the value. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + new_dtype = CategoricalDtype(self.categories, ordered=value) + cat = self if inplace else self.copy() + cat._dtype = new_dtype + if not inplace: + return cat + + def as_ordered(self, inplace=False): + """ + Set the Categorical to be ordered. + + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to True. + + Returns + ------- + Categorical + Ordered Categorical. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + return self.set_ordered(True, inplace=inplace) + + def as_unordered(self, inplace=False): + """ + Set the Categorical to be unordered. + + Parameters + ---------- + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to False. + + Returns + ------- + Categorical + Unordered Categorical. 
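+ # A sketch of set_ordered/as_ordered defined above: once ordered, the category
+ # order (not lexical order) drives min/max (illustrative only):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['b', 'a', 'c'], categories=['c', 'b', 'a'])
+ #   c = c.as_ordered()
+ #   c.min(), c.max()    # -> ('c', 'a')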
+ """ + inplace = validate_bool_kwarg(inplace, "inplace") + return self.set_ordered(False, inplace=inplace) + + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): + """ + Set the categories to the specified new_categories. + + `new_categories` can include new categories (which will result in + unused categories) or remove old categories (which results in values + set to NaN). If `rename==True`, the categories will simple be renamed + (less or more items than in old categories will result in values set to + NaN or in unused categories respectively). + + This method can be used to perform more than one action of adding, + removing, and reordering simultaneously and is therefore faster than + performing the individual steps via the more specialised methods. + + On the other hand this methods does not do checks (e.g., whether the + old categories are included in the new categories on a reorder), which + can result in surprising changes, for example when using special string + dtypes, which does not considers a S1 string equal to a single char + python string. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, default False + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + rename : bool, default False + Whether or not the new_categories should be considered as a rename + of the old categories or as reordered categories. + inplace : bool, default False + Whether or not to reorder the categories in-place or return a copy + of this categorical with reordered categories. + + Returns + ------- + Categorical with reordered categories or None if inplace. + + Raises + ------ + ValueError + If new_categories does not validate as categories + + See Also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + + cat = self if inplace else self.copy() + if rename: + if cat.dtype.categories is not None and len(new_dtype.categories) < len( + cat.dtype.categories + ): + # remove all _codes which are larger and set to -1/NaN + cat._codes[cat._codes >= len(new_dtype.categories)] = -1 + else: + codes = _recode_for_categories( + cat.codes, cat.categories, new_dtype.categories + ) + cat._codes = codes + cat._dtype = new_dtype + + if not inplace: + return cat + + def rename_categories(self, new_categories, inplace=False): + """ + Rename categories. + + Parameters + ---------- + new_categories : list-like, dict-like or callable + + New categories which will replace old categories. + + * list-like: all items must be unique and the number of items in + the new categories must match the existing number of categories. + + * dict-like: specifies a mapping from + old categories to new. Categories not contained in the mapping + are passed through and extra categories in the mapping are + ignored. + + .. versionadded:: 0.21.0. + + * callable : a callable that is called on all items in the old + categories and whose return values comprise the new categories. + + .. versionadded:: 0.23.0. + + inplace : bool, default False + Whether or not to rename the categories inplace or return a copy of + this categorical with renamed categories. 
+ + Returns + ------- + cat : Categorical or None + With ``inplace=False``, the new categorical is returned. + With ``inplace=True``, there is no return value. + + Raises + ------ + ValueError + If new categories are list-like and do not have the same number of + items than the current categories or do not validate as categories + + See Also + -------- + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + + Examples + -------- + >>> c = pd.Categorical(['a', 'a', 'b']) + >>> c.rename_categories([0, 1]) + [0, 0, 1] + Categories (2, int64): [0, 1] + + For dict-like ``new_categories``, extra keys are ignored and + categories not in the dictionary are passed through + + >>> c.rename_categories({'a': 'A', 'c': 'C'}) + [A, A, b] + Categories (2, object): [A, b] + + You may also provide a callable to create the new categories + + >>> c.rename_categories(lambda x: x.upper()) + [A, A, B] + Categories (2, object): [A, B] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + + if is_dict_like(new_categories): + cat.categories = [new_categories.get(item, item) for item in cat.categories] + elif callable(new_categories): + cat.categories = [new_categories(item) for item in cat.categories] + else: + cat.categories = new_categories + if not inplace: + return cat + + def reorder_categories(self, new_categories, ordered=None, inplace=False): + """ + Reorder categories as specified in new_categories. + + `new_categories` need to include all old categories and no new category + items. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + inplace : bool, default False + Whether or not to reorder the categories inplace or return a copy of + this categorical with reordered categories. + + Returns + ------- + cat : Categorical with reordered categories or None if inplace. + + Raises + ------ + ValueError + If the new categories do not contain all old category items or any + new ones + + See Also + -------- + rename_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if set(self.dtype.categories) != set(new_categories): + raise ValueError( + "items in new_categories are not the same as in old categories" + ) + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) + + def add_categories(self, new_categories, inplace=False): + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + + Parameters + ---------- + new_categories : category or list-like of category + The new categories to be included. + inplace : bool, default False + Whether or not to add the categories inplace or return a copy of + this categorical with added categories. + + Returns + ------- + cat : Categorical with new categories added or None if inplace. 
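+ # A sketch of add_categories described above: new categories are appended unused,
+ # and re-adding an existing category raises (illustrative only):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['a', 'b', 'a'])
+ #   c = c.add_categories(['c'])
+ #   c.categories           # -> Index(['a', 'b', 'c'], dtype='object')
+ #   c.add_categories('a')  # would raise ValueError: must not include old categories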
+ + Raises + ------ + ValueError + If the new categories include old categories or do not validate as + categories + + See Also + -------- + rename_categories + reorder_categories + remove_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self.dtype.categories) + if len(already_included) != 0: + raise ValueError( + f"new categories must not include old categories: {already_included}" + ) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + + cat = self if inplace else self.copy() + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) + if not inplace: + return cat + + def remove_categories(self, removals, inplace=False): + """ + Remove the specified categories. + + `removals` must be included in the old categories. Values which were in + the removed categories will be set to NaN + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + inplace : bool, default False + Whether or not to remove the categories inplace or return a copy of + this categorical with removed categories. + + Returns + ------- + cat : Categorical with removed categories or None if inplace. + + Raises + ------ + ValueError + If the removals are not contained in the categories + + See Also + -------- + rename_categories + reorder_categories + add_categories + remove_unused_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not is_list_like(removals): + removals = [removals] + + removal_set = set(removals) + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories if c not in removal_set] + + # GH 10156 + if any(isna(removals)): + not_included = {x for x in not_included if notna(x)} + new_categories = [x for x in new_categories if notna(x)] + + if len(not_included) != 0: + raise ValueError(f"removals must all be in old categories: {not_included}") + + return self.set_categories( + new_categories, ordered=self.ordered, rename=False, inplace=inplace + ) + + def remove_unused_categories(self, inplace=False): + """ + Remove categories which are not used. + + Parameters + ---------- + inplace : bool, default False + Whether or not to drop unused categories inplace or return a copy of + this categorical with unused categories dropped. + + Returns + ------- + cat : Categorical with unused categories dropped or None if inplace. + + See Also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + set_categories + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + idx, inv = np.unique(cat._codes, return_inverse=True) + + if idx.size != 0 and idx[0] == -1: # na sentinel + idx, inv = idx[1:], inv - 1 + + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath( + new_categories, ordered=self.ordered + ) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) + + if not inplace: + return cat + + def map(self, mapper): + """ + Map categories using input correspondence (dict, Series, or function). + + Maps the categories to new categories. 
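+ # A sketch of remove_categories / remove_unused_categories defined above
+ # (illustrative only):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c', 'd'])
+ #   c.remove_categories(['b'])                  # -> [a, NaN, a], categories [a, c, d]
+ #   c.remove_unused_categories().categories     # -> Index(['a', 'b'], dtype='object')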
If the mapping correspondence is + one-to-one the result is a :class:`~pandas.Categorical` which has the + same order property as the original, otherwise a :class:`~pandas.Index` + is returned. NaN values are unaffected. + + If a `dict` or :class:`~pandas.Series` is used any unmapped category is + mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` + will be returned. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + + Returns + ------- + pandas.Categorical or pandas.Index + Mapped categorical. + + See Also + -------- + CategoricalIndex.map : Apply a mapping correspondence on a + :class:`~pandas.CategoricalIndex`. + Index.map : Apply a mapping correspondence on an + :class:`~pandas.Index`. + Series.map : Apply a mapping correspondence on a + :class:`~pandas.Series`. + Series.apply : Apply more complex functions on a + :class:`~pandas.Series`. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'b', 'c']) + >>> cat + [a, b, c] + Categories (3, object): [a, b, c] + >>> cat.map(lambda x: x.upper()) + [A, B, C] + Categories (3, object): [A, B, C] + >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}) + [first, second, third] + Categories (3, object): [first, second, third] + + If the mapping is one-to-one the ordering of the categories is + preserved: + + >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) + >>> cat + [a, b, c] + Categories (3, object): [a < b < c] + >>> cat.map({'a': 3, 'b': 2, 'c': 1}) + [3, 2, 1] + Categories (3, int64): [3 < 2 < 1] + + If the mapping is not one-to-one an :class:`~pandas.Index` is returned: + + >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}) + Index(['first', 'second', 'first'], dtype='object') + + If a `dict` is used, all unmapped categories are mapped to `NaN` and + the result is an :class:`~pandas.Index`: + + >>> cat.map({'a': 'first', 'b': 'second'}) + Index(['first', 'second', nan], dtype='object') + """ + new_categories = self.categories.map(mapper) + try: + return self.from_codes( + self._codes.copy(), categories=new_categories, ordered=self.ordered + ) + except ValueError: + # NA values are represented in self._codes with -1 + # np.take causes NA values to take final element in new_categories + if np.any(self._codes == -1): + new_categories = new_categories.insert(len(new_categories), np.nan) + return np.take(new_categories, self._codes) + + __eq__ = _cat_compare_op(operator.eq) + __ne__ = _cat_compare_op(operator.ne) + __lt__ = _cat_compare_op(operator.lt) + __gt__ = _cat_compare_op(operator.gt) + __le__ = _cat_compare_op(operator.le) + __ge__ = _cat_compare_op(operator.ge) + + # for Series/ndarray like compat + @property + def shape(self): + """ + Shape of the Categorical. + + For internal compatibility with numpy arrays. + + Returns + ------- + shape : tuple + """ + + return tuple([len(self._codes)]) + + def shift(self, periods, fill_value=None): + """ + Shift Categorical by desired number of periods. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + fill_value : object, optional + The scalar value to use for newly introduced missing values. + + .. versionadded:: 0.24.0 + + Returns + ------- + shifted : Categorical + """ + # since categoricals always have ndim == 1, an axis parameter + # doesn't make any sense here. 
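+ # A sketch of this shift method: vacated positions are filled with NaN unless
+ # fill_value names an existing category (illustrative only):
+ #
+ #   import pandas as pd
+ #   c = pd.Categorical(['a', 'b', 'c'])
+ #   c.shift(1)                    # -> [NaN, a, b]
+ #   c.shift(1, fill_value='a')    # -> [a, a, b]
+ #   c.shift(1, fill_value='z')    # raises ValueError: 'z' is not a category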
+ codes = self.codes + if codes.ndim > 1: + raise NotImplementedError("Categorical with ndim > 1.") + if np.prod(codes.shape) and (periods != 0): + codes = np.roll(codes, ensure_platform_int(periods), axis=0) + if isna(fill_value): + fill_value = -1 + elif fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + raise ValueError( + f"'fill_value={fill_value}' is not present " + "in this Categorical's categories" + ) + if periods > 0: + codes[:periods] = fill_value + else: + codes[periods:] = fill_value + + return self.from_codes(codes, dtype=self.dtype) + + def __array__(self, dtype=None) -> np.ndarray: + """ + The numpy array interface. + + Returns + ------- + numpy.array + A numpy array of either the specified dtype or, + if dtype==None (default), the same dtype as + categorical.categories.dtype. + """ + ret = take_1d(self.categories.values, self._codes) + if dtype and not is_dtype_equal(dtype, self.categories.dtype): + return np.asarray(ret, dtype) + if is_extension_array_dtype(ret): + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ get's all the way to an + # ndarray. + ret = np.asarray(ret) + return ret + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # for all other cases, raise for now (similarly as what happens in + # Series.__array_prepare__) + raise TypeError( + f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}" + ) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if not isinstance(state, dict): + raise Exception("invalid pickle state") + + # compat with pre 0.21.0 CategoricalDtype change + if "_dtype" not in state: + state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) + + for k, v in state.items(): + setattr(self, k, v) + + @property + def T(self): + """ + Return transposed numpy array. + """ + return self + + @property + def nbytes(self): + return self._codes.nbytes + self.dtype.categories.values.nbytes + + def memory_usage(self, deep=False): + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) + + @Substitution(klass="Categorical") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + if is_scalar(value): + codes = self.categories.get_loc(value) + codes = self.codes.dtype.type(codes) + else: + locs = [self.categories.get_loc(x) for x in value] + codes = np.array(locs, dtype=self.codes.dtype) + return self.codes.searchsorted(codes, side=side, sorter=sorter) + + def isna(self): + """ + Detect missing values + + Missing values (-1 in .codes) are detected. + + Returns + ------- + a boolean array of whether my values are null + + See Also + -------- + isna : Top-level isna. + isnull : Alias of isna. 
+ Categorical.notna : Boolean inverse of Categorical.isna. + + """ + + ret = self._codes == -1 + return ret + + isnull = isna + + def notna(self): + """ + Inverse of isna + + Both missing values (-1 in .codes) and NA as a category are detected as + null. + + Returns + ------- + a boolean array of whether my values are not null + + See Also + -------- + notna : Top-level notna. + notnull : Alias of notna. + Categorical.isna : Boolean inverse of Categorical.notna. + + """ + return ~self.isna() + + notnull = notna + + def put(self, *args, **kwargs): + """ + Replace specific elements in the Categorical with given values. + """ + raise NotImplementedError(("'put' is not yet implemented for Categorical")) + + def dropna(self): + """ + Return the Categorical without null values. + + Missing values (-1 in .codes) are detected. + + Returns + ------- + valid : Categorical + """ + result = self[self.notna()] + + return result + + def value_counts(self, dropna=True): + """ + Return a Series containing counts of each category. + + Every category will have an entry, even those with a count of 0. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Series, CategoricalIndex + + code, cat = self._codes, self.categories + ncat, mask = len(cat), 0 <= code + ix, clean = np.arange(ncat), mask.all() + + if dropna or clean: + obs = code if clean else code[mask] + count = np.bincount(obs, minlength=ncat or 0) + else: + count = np.bincount(np.where(mask, code, ncat)) + ix = np.append(ix, -1) + + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) + + return Series(count, index=CategoricalIndex(ix), dtype="int64") + + def _internal_get_values(self): + """ + Return the values. + + For internal compatibility with pandas formatting. + + Returns + ------- + np.ndarray or Index + A numpy array of the same dtype as categorical.categories.dtype or + Index if datetime / periods. + """ + # if we are a datetime and period index, return Index to keep metadata + if needs_i8_conversion(self.categories): + return self.categories.take(self._codes, fill_value=np.nan) + elif is_integer_dtype(self.categories) and -1 in self._codes: + return self.categories.astype("object").take(self._codes, fill_value=np.nan) + return np.array(self) + + def check_for_ordered(self, op): + """ assert that we are ordered """ + if not self.ordered: + raise TypeError( + f"Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n" + ) + + def _values_for_argsort(self): + return self._codes.copy() + + def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): + """ + Return the indices that would sort the Categorical. + + .. versionchanged:: 0.25.0 + + Changed to sort missing values at the end. + + Parameters + ---------- + ascending : bool, default True + Whether the indices should result in an ascending + or descending sort. + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + Sorting algorithm. + *args, **kwargs: + passed through to :func:`numpy.argsort`. + + Returns + ------- + numpy.array + + See Also + -------- + numpy.ndarray.argsort + + Notes + ----- + While an ordering is applied to the category values, arg-sorting + in this context refers more to organizing and grouping together + based on matching category values. 
Thus, this function can be + called on an unordered Categorical instance unlike the functions + 'Categorical.min' and 'Categorical.max'. + + Examples + -------- + >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() + array([2, 0, 1, 3]) + + >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], + ... categories=['c', 'b', 'a'], + ... ordered=True) + >>> cat.argsort() + array([3, 0, 1, 2]) + + Missing values are placed at the end + + >>> cat = pd.Categorical([2, None, 1]) + >>> cat.argsort() + array([2, 0, 1]) + """ + return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) + + def sort_values(self, inplace=False, ascending=True, na_position="last"): + """ + Sort the Categorical by category value returning a new + Categorical by default. + + While an ordering is applied to the category values, sorting in this + context refers more to organizing and grouping together based on + matching category values. Thus, this function can be called on an + unordered Categorical instance unlike the functions 'Categorical.min' + and 'Categorical.max'. + + Parameters + ---------- + inplace : bool, default False + Do operation in place. + ascending : bool, default True + Order ascending. Passing False orders descending. The + ordering parameter provides the method by which the + category values are organized. + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + + Returns + ------- + Categorical or None + + See Also + -------- + Categorical.sort + Series.sort_values + + Examples + -------- + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + >>> c + [1, 2, 2, 1, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values() + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> c.sort_values(ascending=False) + [5, 2, 2, 1, 1] + Categories (3, int64): [1, 2, 5] + + Inplace sorting can be done as well: + + >>> c.sort_values(inplace=True) + >>> c + [1, 1, 2, 2, 5] + Categories (3, int64): [1, 2, 5] + >>> + >>> c = pd.Categorical([1, 2, 2, 1, 5]) + + 'sort_values' behaviour with NaNs. Note that 'na_position' + is independent of the 'ascending' parameter: + + >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) + >>> c + [NaN, 2.0, 2.0, NaN, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values() + [2.0, 2.0, 5.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False) + [5.0, 2.0, 2.0, NaN, NaN] + Categories (2, int64): [2, 5] + >>> c.sort_values(na_position='first') + [NaN, NaN, 2.0, 2.0, 5.0] + Categories (2, int64): [2, 5] + >>> c.sort_values(ascending=False, na_position='first') + [NaN, NaN, 5.0, 2.0, 2.0] + Categories (2, int64): [2, 5] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {repr(na_position)}") + + sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) + + if inplace: + self._codes = self._codes[sorted_idx] + else: + return self._constructor( + values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True + ) + + def _values_for_rank(self): + """ + For correctly ranking ordered categorical data. See GH#15420 + + Ordered categorical data should be ranked on the basis of + codes with -1 translated to NaN. 
+ + Returns + ------- + numpy.array + + """ + from pandas import Series + + if self.ordered: + values = self.codes + mask = values == -1 + if mask.any(): + values = values.astype("float64") + values[mask] = np.nan + elif self.categories.is_numeric(): + values = np.array(self) + else: + # reorder the categories (so rank can use the float codes) + # instead of passing an object array to rank + values = np.array( + self.rename_categories(Series(self.categories).rank().values) + ) + return values + + def view(self, dtype=None): + if dtype is not None: + raise NotImplementedError(dtype) + return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) + + def to_dense(self): + """ + Return my 'dense' representation + + For internal compatibility with numpy arrays. + + Returns + ------- + dense : array + """ + return np.asarray(self) + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should either be in the categories or should be + NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + limit : int, default None + (Not implemented yet for Categorical!) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. 
+ + Returns + ------- + filled : Categorical with NA/NaN filled + """ + value, method = validate_fillna_kwargs( + value, method, validate_scalar_dict_value=False + ) + + if value is None: + value = np.nan + if limit is not None: + raise NotImplementedError( + "specifying a limit for fillna has not been implemented yet" + ) + + codes = self._codes + + # pad / bfill + if method is not None: + + values = self.to_dense().reshape(-1, len(self)) + values = interpolate_2d(values, method, 0, None, value).astype( + self.categories.dtype + )[0] + codes = _get_codes_for_values(values, self.categories) + + else: + + # If value is a dict or a Series (a dict value has already + # been converted to a Series) + if isinstance(value, ABCSeries): + if not value[~value.isin(self.categories)].isna().all(): + raise ValueError("fill value must be in categories") + + values_codes = _get_codes_for_values(value, self.categories) + indexer = np.where(codes == -1) + codes[indexer] = values_codes[indexer] + + # If value is not a dict or Series it should be a scalar + elif is_hashable(value): + if not isna(value) and value not in self.categories: + raise ValueError("fill value must be in categories") + + mask = codes == -1 + if mask.any(): + codes = codes.copy() + if isna(value): + codes[mask] = -1 + else: + codes[mask] = self.categories.get_loc(value) + + else: + raise TypeError( + f"'value' parameter must be a scalar, dict " + f"or Series, but you passed a {type(value).__name__}" + ) + + return self._constructor(codes, dtype=self.dtype, fastpath=True) + + def take(self, indexer, allow_fill: bool = False, fill_value=None): + """ + Take elements from the Categorical. + + Parameters + ---------- + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default False + How to handle negative values in `indexer`. + + * False: negative values in `indices` indicate positional indices + from the right. This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate missing values + (the default). These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + .. versionchanged:: 1.0.0 + + Default value changed from ``True`` to ``False``. + + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + + Returns + ------- + Categorical + This Categorical will have the same categories and ordered as + `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + [a, a, b] + Categories (2, object): [a, b] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + [a, b, a] + Categories (2, object): [a, b] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. + + >>> cat.take([0, -1, -1], allow_fill=True) + [a, NaN, NaN] + Categories (2, object): [a, b] + + The fill value can be specified. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + [a, a, a] + Categories (3, object): [a, b] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``TypeError``. 
+ """ + indexer = np.asarray(indexer, dtype=np.intp) + + dtype = self.dtype + + if isna(fill_value): + fill_value = -1 + elif allow_fill: + # convert user-provided `fill_value` to codes + if fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + msg = ( + f"'fill_value' ('{fill_value}') is not in this " + "Categorical's categories." + ) + raise TypeError(msg) + + codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) + result = type(self).from_codes(codes, dtype=dtype) + return result + + def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): + # GH#27745 deprecate alias that other EAs dont have + warn( + "Categorical.take_nd is deprecated, use Categorical.take instead", + FutureWarning, + stacklevel=2, + ) + return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) + + def __len__(self) -> int: + """ + The length of this Categorical. + """ + return len(self._codes) + + def __iter__(self): + """ + Returns an Iterator over the values of this Categorical. + """ + return iter(self._internal_get_values().tolist()) + + def __contains__(self, key) -> bool: + """ + Returns True if `key` is in this Categorical. + """ + # if key is a NaN, check if any NaN is in self. + if is_scalar(key) and isna(key): + return self.isna().any() + + return contains(self, key, container=self._codes) + + def _tidy_repr(self, max_vals=10, footer=True) -> str: + """ a short repr displaying only max_vals and an optional (but default + footer) + """ + num = max_vals // 2 + head = self[:num]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) + + result = f"{head[:-1]}, ..., {tail[1:]}" + if footer: + result = f"{result}\n{self._repr_footer()}" + + return str(result) + + def _repr_categories(self): + """ + return the base repr for the categories + """ + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) + from pandas.io.formats import format as fmt + + if len(self.categories) > max_categories: + num = max_categories // 2 + head = fmt.format_array(self.categories[:num], None) + tail = fmt.format_array(self.categories[-num:], None) + category_strs = head + ["..."] + tail + else: + category_strs = fmt.format_array(self.categories, None) + + # Strip all leading spaces, which format_array adds for columns... + category_strs = [x.strip() for x in category_strs] + return category_strs + + def _repr_categories_info(self) -> str: + """ + Returns a string representation of the footer. + """ + + category_strs = self._repr_categories() + dtype = str(self.categories.dtype) + levheader = f"Categories ({len(self.categories)}, {dtype}): " + width, height = get_terminal_size() + max_width = get_option("display.width") or width + if console.in_ipython_frontend(): + # 0 = no breaks + max_width = 0 + levstring = "" + start = True + cur_col_len = len(levheader) # header + sep_len, sep = (3, " < ") if self.ordered else (2, ", ") + linesep = sep.rstrip() + "\n" # remove whitespace + for val in category_strs: + if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: + levstring += linesep + (" " * (len(levheader) + 1)) + cur_col_len = len(levheader) + 1 # header + a whitespace + elif not start: + levstring += sep + cur_col_len += len(val) + levstring += val + start = False + # replace to simple save space by + return levheader + "[" + levstring.replace(" < ... < ", " ... 
") + "]" + + def _repr_footer(self) -> str: + info = self._repr_categories_info() + return f"Length: {len(self)}\n{info}" + + def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str: + from pandas.io.formats import format as fmt + + formatter = fmt.CategoricalFormatter( + self, length=length, na_rep=na_rep, footer=footer + ) + result = formatter.to_string() + return str(result) + + def __repr__(self) -> str: + """ + String representation. + """ + _maxlen = 10 + if len(self._codes) > _maxlen: + result = self._tidy_repr(_maxlen) + elif len(self._codes) > 0: + result = self._get_repr(length=len(self) > _maxlen) + else: + msg = self._get_repr(length=False, footer=True).replace("\n", ", ") + result = f"[], {msg}" + + return result + + def _maybe_coerce_indexer(self, indexer): + """ + return an indexer coerced to the codes dtype + """ + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": + indexer = indexer.astype(self._codes.dtype) + return indexer + + def __getitem__(self, key): + """ + Return an item. + """ + if isinstance(key, (int, np.integer)): + i = self._codes[key] + if i == -1: + return np.nan + else: + return self.categories[i] + + key = check_array_indexer(self, key) + + result = self._codes[key] + if result.ndim > 1: + deprecate_ndim_indexing(result) + return result + return self._constructor(result, dtype=self.dtype, fastpath=True) + + def __setitem__(self, key, value): + """ + Item assignment. + + Raises + ------ + ValueError + If (one or more) Value is not in categories or if a assigned + `Categorical` does not have the same categories + """ + value = extract_array(value, extract_numpy=True) + + # require identical categories set + if isinstance(value, Categorical): + if not is_dtype_equal(self, value): + raise ValueError( + "Cannot set a Categorical with another, " + "without identical categories" + ) + if not self.categories.equals(value.categories): + new_codes = _recode_for_categories( + value.codes, value.categories, self.categories + ) + value = Categorical.from_codes(new_codes, dtype=self.dtype) + + rvalue = value if is_list_like(value) else [value] + + from pandas import Index + + to_add = Index(rvalue).difference(self.categories) + + # no assignments of values not in categories, but it's always ok to set + # something to np.nan + if len(to_add) and not isna(to_add).all(): + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) + + # set by position + if isinstance(key, (int, np.integer)): + pass + + # tuple of indexers (dataframe) + elif isinstance(key, tuple): + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) + if len(key) == 2: + if not com.is_null_slice(key[0]): + raise AssertionError("invalid slicing for a 1-ndim categorical") + key = key[1] + elif len(key) == 1: + key = key[0] + else: + raise AssertionError("invalid slicing for a 1-ndim categorical") + + # slicing in Series or Categorical + elif isinstance(key, slice): + pass + + # else: array of True/False in Series or Categorical + + lindexer = self.categories.get_indexer(rvalue) + lindexer = self._maybe_coerce_indexer(lindexer) + self._codes[key] = lindexer + + def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: + """ + Compute the inverse of a categorical, returning + a dict of categories -> indexers. 
+ + *This is an internal function* + + Returns + ------- + dict of categories -> indexers + + Examples + -------- + >>> c = pd.Categorical(list('aabca')) + >>> c + [a, a, b, c, a] + Categories (3, object): [a, b, c] + >>> c.categories + Index(['a', 'b', 'c'], dtype='object') + >>> c.codes + array([0, 0, 1, 2, 0], dtype=int8) + >>> c._reverse_indexer() + {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} + + """ + categories = self.categories + r, counts = libalgos.groupsort_indexer( + self.codes.astype("int64"), categories.size + ) + counts = counts.cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(categories, _result)) + return result + + # reduction ops # + def _reduce(self, name, axis=0, **kwargs): + func = getattr(self, name, None) + if func is None: + raise TypeError(f"Categorical cannot perform the operation {name}") + return func(**kwargs) + + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def min(self, skipna=True): + """ + The minimum value of the object. + + Only ordered `Categoricals` have a minimum! + + .. versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + min : the minimum of this `Categorical` + """ + self.check_for_ordered("min") + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].min() + else: + return np.nan + else: + pointer = self._codes.min() + return self.categories[pointer] + + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def max(self, skipna=True): + """ + The maximum value of the object. + + Only ordered `Categoricals` have a maximum! + + .. versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + max : the maximum of this `Categorical` + """ + self.check_for_ordered("max") + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].max() + else: + return np.nan + else: + pointer = self._codes.max() + return self.categories[pointer] + + def mode(self, dropna=True): + """ + Returns the mode(s) of the Categorical. + + Always returns `Categorical` even if only one value. + + Parameters + ---------- + dropna : bool, default True + Don't consider counts of NaN/NaT. + + .. versionadded:: 0.24.0 + + Returns + ------- + modes : `Categorical` (sorted) + """ + codes = self._codes + if dropna: + good = self._codes != -1 + codes = self._codes[good] + codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) + return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + + def unique(self): + """ + Return the ``Categorical`` which ``categories`` and ``codes`` are + unique. Unused categories are NOT returned. + + - unordered category: values and categories are sorted by appearance + order. + - ordered category: values are sorted by appearance order, categories + keeps existing order. + + Returns + ------- + unique values : ``Categorical`` + + Examples + -------- + An unordered Categorical will return categories in the + order of appearance. 
+ + >>> pd.Categorical(list('baabc')) + [b, a, c] + Categories (3, object): [b, a, c] + + >>> pd.Categorical(list('baabc'), categories=list('abc')) + [b, a, c] + Categories (3, object): [b, a, c] + + An ordered Categorical preserves the category ordering. + + >>> pd.Categorical(list('baabc'), + ... categories=list('abc'), + ... ordered=True) + [b, a, c] + Categories (3, object): [a < b < c] + + See Also + -------- + unique + CategoricalIndex.unique + Series.unique + + """ + + # unlike np.unique, unique1d does not sort + unique_codes = unique1d(self.codes) + cat = self.copy() + + # keep nan in codes + cat._codes = unique_codes + + # exclude nan from indexer for categories + take_codes = unique_codes[unique_codes != -1] + if self.ordered: + take_codes = np.sort(take_codes) + return cat.set_categories(cat.categories.take(take_codes)) + + def _values_for_factorize(self): + codes = self.codes.astype("int64") + return codes, -1 + + @classmethod + def _from_factorized(cls, uniques, original): + return original._constructor( + original.categories.take(uniques), dtype=original.dtype + ) + + def equals(self, other): + """ + Returns True if categorical arrays are equal. + + Parameters + ---------- + other : `Categorical` + + Returns + ------- + bool + """ + if self.is_dtype_equal(other): + if self.categories.equals(other.categories): + # fastpath to avoid re-coding + other_codes = other._codes + else: + other_codes = _recode_for_categories( + other.codes, other.categories, self.categories + ) + return np.array_equal(self._codes, other_codes) + return False + + def is_dtype_equal(self, other): + """ + Returns True if categoricals are the same dtype + same categories, and same ordered + + Parameters + ---------- + other : Categorical + + Returns + ------- + bool + """ + + try: + return hash(self.dtype) == hash(other.dtype) + except (AttributeError, TypeError): + return False + + def describe(self): + """ + Describes this Categorical + + Returns + ------- + description: `DataFrame` + A dataframe with frequency and counts by category. + """ + counts = self.value_counts(dropna=False) + freqs = counts / float(counts.sum()) + + from pandas.core.reshape.concat import concat + + result = concat([counts, freqs], axis=1) + result.columns = ["counts", "freqs"] + result.index.name = "categories" + + return result + + @Substitution(klass="Categorical") + @Appender(_extension_array_shared_docs["repeat"]) + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + codes = self._codes.repeat(repeats) + return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + + # Implement the ExtensionArray interface + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import concat_categorical + + return concat_categorical(to_concat) + + def isin(self, values): + """ + Check whether `values` are contained in Categorical. + + Return a boolean NumPy Array showing whether each element in + the Categorical matches an element in the passed sequence of + `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + isin : numpy.ndarray (bool dtype) + + Raises + ------ + TypeError + * If `values` is not a set or list-like + + See Also + -------- + pandas.Series.isin : Equivalent method on Series. 
+ + Examples + -------- + + >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo']) + >>> s.isin(['cow', 'lama']) + array([ True, True, True, False, True, False]) + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']) + array([ True, False, True, False, True, False]) + """ + if not is_list_like(values): + values_type = type(values).__name__ + raise TypeError( + "only list-like objects are allowed to be passed" + f" to isin(), you passed a [{values_type}]" + ) + values = sanitize_array(values, None, None) + null_mask = np.asarray(isna(values)) + code_values = self.categories.get_indexer(values) + code_values = code_values[null_mask | (code_values >= 0)] + return algorithms.isin(self.codes, code_values) + + def replace(self, to_replace, value, inplace: bool = False): + """ + Replaces all instances of one value with another + + Parameters + ---------- + to_replace: object + The value to be replaced + + value: object + The value to replace it with + + inplace: bool + Whether the operation is done in-place + + Returns + ------- + None if inplace is True, otherwise the new Categorical after replacement + + + Examples + -------- + >>> s = pd.Categorical([1, 2, 1, 3]) + >>> s.replace(1, 3) + [3, 3, 2, 3] + Categories (2, int64): [2, 3] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + if to_replace in cat.categories: + if isna(value): + cat.remove_categories(to_replace, inplace=True) + else: + categories = cat.categories.tolist() + index = categories.index(to_replace) + if value in cat.categories: + value_index = categories.index(value) + cat._codes[cat._codes == index] = value_index + cat.remove_categories(to_replace, inplace=True) + else: + categories[index] = value + cat.rename_categories(categories, inplace=True) + if not inplace: + return cat + + +# The Series.cat accessor + + +@delegate_names( + delegate=Categorical, accessors=["categories", "ordered"], typ="property" +) +@delegate_names( + delegate=Categorical, + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + ], + typ="method", +) +class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): + """ + Accessor object for categorical properties of the Series values. + + Be aware that assigning to `categories` is a inplace operation, while all + methods return new categorical data per default (but can be called with + `inplace=True`). 
+ + Parameters + ---------- + data : Series or CategoricalIndex + + Examples + -------- + >>> s.cat.categories + >>> s.cat.categories = list('abc') + >>> s.cat.rename_categories(list('cab')) + >>> s.cat.reorder_categories(list('cab')) + >>> s.cat.add_categories(['d','e']) + >>> s.cat.remove_categories(['d']) + >>> s.cat.remove_unused_categories() + >>> s.cat.set_categories(list('abcde')) + >>> s.cat.as_ordered() + >>> s.cat.as_unordered() + """ + + _deprecations = PandasObject._deprecations | frozenset( + ["categorical", "index", "name"] + ) + + def __init__(self, data): + self._validate(data) + self._parent = data.values + self._index = data.index + self._name = data.name + self._freeze() + + @staticmethod + def _validate(data): + if not is_categorical_dtype(data.dtype): + raise AttributeError("Can only use .cat accessor with a 'category' dtype") + + def _delegate_property_get(self, name): + return getattr(self._parent, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self._parent, name, new_values) + + @property + def codes(self): + """ + Return Series of codes as well as the index. + """ + from pandas import Series + + return Series(self._parent.codes, index=self._index) + + def _delegate_method(self, name, *args, **kwargs): + from pandas import Series + + method = getattr(self._parent, name) + res = method(*args, **kwargs) + if res is not None: + return Series(res, index=self._index, name=self._name) + + +# utility routines + + +def _get_codes_for_values(values, categories): + """ + utility routine to turn values into codes given the specified categories + """ + dtype_equal = is_dtype_equal(values.dtype, categories.dtype) + + if dtype_equal: + # To prevent erroneous dtype coercion in _get_data_algo, retrieve + # the underlying numpy array. gh-22702 + values = getattr(values, "_ndarray_values", values) + categories = getattr(categories, "_ndarray_values", categories) + elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): + # Support inferring the correct extension dtype from an array of + # scalar objects. e.g. 
+ # Categorical(array[Period, Period], categories=PeriodIndex(...)) + cls = categories.dtype.construct_array_type() + values = try_cast_to_ea(cls, values) + if not isinstance(values, cls): + # exception raised in _from_sequence + values = ensure_object(values) + categories = ensure_object(categories) + else: + values = ensure_object(values) + categories = ensure_object(categories) + + hash_klass, vals = _get_data_algo(values) + _, cats = _get_data_algo(categories) + t = hash_klass(len(cats)) + t.map_locations(cats) + return coerce_indexer_dtype(t.lookup(vals), cats) + + +def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): + """ + Convert a set of codes for to a new set of categories + + Parameters + ---------- + codes : np.ndarray + old_categories, new_categories : Index + + Returns + ------- + new_codes : np.ndarray[np.int64] + + Examples + -------- + >>> old_cat = pd.Index(['b', 'a', 'c']) + >>> new_cat = pd.Index(['a', 'b']) + >>> codes = np.array([0, 1, 1, 2]) + >>> _recode_for_categories(codes, old_cat, new_cat) + array([ 1, 0, 0, -1]) + """ + if len(old_categories) == 0: + # All null anyway, so just retain the nulls + return codes.copy() + elif new_categories.equals(old_categories): + # Same categories, so no need to actually recode + return codes.copy() + indexer = coerce_indexer_dtype( + new_categories.get_indexer(old_categories), new_categories + ) + new_codes = take_1d(indexer, codes.copy(), fill_value=-1) + return new_codes + + +def _convert_to_list_like(list_like): + if hasattr(list_like, "dtype"): + return list_like + if isinstance(list_like, list): + return list_like + if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like): + return list(list_like) + elif is_scalar(list_like): + return [list_like] + else: + # TODO: is this reached? + return [list_like] + + +def factorize_from_iterable(values): + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + *This is an internal function* + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : ndarray + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. + """ + if not is_list_like(values): + raise TypeError("Input must be list-like") + + if is_categorical_dtype(values): + values = extract_array(values) + # The Categorical we want to build has the same categories + # as values but its codes are by def [0, ..., len(n_categories) - 1] + cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) + categories = Categorical.from_codes(cat_codes, dtype=values.dtype) + codes = values.codes + else: + # The value of ordered is irrelevant since we don't use cat as such, + # but only the resulting categories, the order of which is independent + # from ordered. Set ordered to False as default. See GH #15457 + cat = Categorical(values, ordered=False) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def factorize_from_iterables(iterables): + """ + A higher-level wrapper over `factorize_from_iterable`. + + *This is an internal function* + + Parameters + ---------- + iterables : list-like of list-likes + + Returns + ------- + codes_list : list of ndarrays + categories_list : list of Indexes + + Notes + ----- + See `factorize_from_iterable` for more info. + """ + if len(iterables) == 0: + # For consistency, it should return a list of 2 lists. 
+ return [[], []] + return map(list, zip(*(factorize_from_iterable(it) for it in iterables))) diff --git a/venv/Lib/site-packages/pandas/core/arrays/datetimelike.py b/venv/Lib/site-packages/pandas/core/arrays/datetimelike.py new file mode 100644 index 0000000..82fa919 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/datetimelike.py @@ -0,0 +1,1659 @@ +from datetime import datetime, timedelta +import operator +from typing import Any, Sequence, Type, Union, cast +import warnings + +import numpy as np + +from pandas._libs import NaT, NaTType, Timestamp, algos, iNaT, lib +from pandas._libs.tslibs.c_timestamp import integer_op_not_supported +from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period +from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds +from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 +from pandas._typing import DatetimeLikeScalar +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning +from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna + +from pandas.core import missing, nanops, ops +from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_array_indexer +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.ops.invalid import invalid_comparison, make_invalid_op + +from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick + + +def _datetimelike_array_cmp(cls, op): + """ + Wrap comparison operations to convert Timestamp/Timedelta/Period-like to + boxed scalars/arrays. + """ + opname = f"__{op.__name__}__" + nat_result = opname == "__ne__" + + @unpack_zerodim_and_defer(opname) + def wrapper(self, other): + + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + return invalid_comparison(self, other, op) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + self._check_compatible_with(other) + + other_i8 = self._unbox_scalar(other) + + result = op(self.view("i8"), other_i8) + if isna(other): + result.fill(nat_result) + + elif not is_list_like(other): + return invalid_comparison(self, other, op) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + if isinstance(other, list): + # TODO: could use pd.Index to do inference? 
+ other = np.array(other) + + if not isinstance(other, (np.ndarray, type(self))): + return invalid_comparison(self, other, op) + + if is_object_dtype(other): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would fail to raise when + # comparing tz-aware and tz-naive + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY( + op, self.astype(object), other + ) + o_mask = isna(other) + + elif not type(self)._is_recognized_dtype(other.dtype): + return invalid_comparison(self, other, op) + + else: + # For PeriodDType this casting is unnecessary + other = type(self)._from_sequence(other) + self._check_compatible_with(other) + + result = op(self.view("i8"), other.view("i8")) + o_mask = other._isnan + + if o_mask.any(): + result[o_mask] = nat_result + + if self._hasnans: + result[self._isnan] = nat_result + + return result + + return set_function_name(wrapper, opname, cls) + + +class AttributesMixin: + _data: np.ndarray + + @classmethod + def _simple_new(cls, values, **kwargs): + raise AbstractMethodError(cls) + + @property + def _scalar_type(self) -> Type[DatetimeLikeScalar]: + """The scalar associated with this datelike + + * PeriodArray : Period + * DatetimeArray : Timestamp + * TimedeltaArray : Timedelta + """ + raise AbstractMethodError(self) + + def _scalar_from_string( + self, value: str + ) -> Union[Period, Timestamp, Timedelta, NaTType]: + """ + Construct a scalar type from a string. + + Parameters + ---------- + value : str + + Returns + ------- + Period, Timestamp, or Timedelta, or NaT + Whatever the type of ``self._scalar_type`` is. + + Notes + ----- + This should call ``self._check_compatible_with`` before + unboxing the result. + """ + raise AbstractMethodError(self) + + def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> int: + """ + Unbox the integer value of a scalar `value`. + + Parameters + ---------- + value : Union[Period, Timestamp, Timedelta] + + Returns + ------- + int + + Examples + -------- + >>> self._unbox_scalar(Timedelta('10s')) # DOCTEST: +SKIP + 10000000000 + """ + raise AbstractMethodError(self) + + def _check_compatible_with( + self, other: Union[Period, Timestamp, Timedelta, NaTType], setitem: bool = False + ) -> None: + """ + Verify that `self` and `other` are compatible. + + * DatetimeArray verifies that the timezones (if any) match + * PeriodArray verifies that the freq matches + * Timedelta has no verification + + In each case, NaT is considered compatible. + + Parameters + ---------- + other + setitem : bool, default False + For __setitem__ we may have stricter compatiblity resrictions than + for comparisons. + + Raises + ------ + Exception + """ + raise AbstractMethodError(self) + + +class DatelikeOps: + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) + def strftime(self, date_format): + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format + doc <%(URL)s>`__. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). + + Returns + ------- + ndarray + NumPy ndarray of formatted strings. + + See Also + -------- + to_datetime : Convert the given argument to datetime. 
+ DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%%B %%d, %%Y, %%r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) + + +class TimelikeOps: + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _round_doc = """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + .. versionadded:: 0.24.0 + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ +default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. 
+ + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """ + + _round_example = """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + + _floor_example = """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + + _ceil_example = """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + if is_datetime64tz_dtype(self): + # operate on naive timestamps, then convert back to aware + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + aware = result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + return aware + + values = self.view("i8") + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + return self._simple_new(result, dtype=self.dtype) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + +class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): + """ + Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray + + Assumes that __new__/__init__ defines: + _data + _freq + + and that the inheriting class has methods: + _generate_range + """ + + @property + def ndim(self) -> int: + return self._data.ndim + + @property + def shape(self): + return self._data.shape + + def reshape(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.reshape(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + + def ravel(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.ravel(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + + @property + def _box_func(self): + """ + box function to get object from internal representation + """ + raise AbstractMethodError(self) + + def _box_values(self, values): + """ + apply box func to passed values + """ + return lib.map_infer(values, self._box_func) + + def __iter__(self): + return (self._box_func(v) for v in self.asi8) + + @property + def asi8(self) -> np.ndarray: + """ + Integer representation of the values. 
+ + Returns + ------- + ndarray + An ndarray with int64 dtype. + """ + # do not cache or you'll create a memory leak + return self._data.view("i8") + + @property + def _ndarray_values(self): + return self._data + + # ---------------------------------------------------------------- + # Rendering Methods + + def _format_native_types(self, na_rep="NaT", date_format=None): + """ + Helper method for astype when converting to strings. + + Returns + ------- + ndarray[str] + """ + raise AbstractMethodError(self) + + def _formatter(self, boxed=False): + # TODO: Remove Datetime & DatetimeTZ formatters. + return "'{}'".format + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + @property + def nbytes(self): + return self._data.nbytes + + def __array__(self, dtype=None) -> np.ndarray: + # used for Timedelta/DatetimeArray, overwritten by PeriodArray + if is_object_dtype(dtype): + return np.array(list(self), dtype=object) + return self._data + + @property + def size(self) -> int: + """The number of elements in this array.""" + return np.prod(self.shape) + + def __len__(self) -> int: + return len(self._data) + + def __getitem__(self, key): + """ + This getitem defers to the underlying array, which by-definition can + only handle list-likes, slices, and integer scalars + """ + + is_int = lib.is_integer(key) + if lib.is_scalar(key) and not is_int: + raise IndexError( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean " + "arrays are valid indices" + ) + + getitem = self._data.__getitem__ + if is_int: + val = getitem(key) + if lib.is_scalar(val): + # i.e. self.ndim == 1 + return self._box_func(val) + return type(self)(val, dtype=self.dtype) + + if com.is_bool_indexer(key): + # first convert to boolean, because check_array_indexer doesn't + # allow object dtype + key = np.asarray(key, dtype=bool) + key = check_array_indexer(self, key) + if key.all(): + key = slice(0, None, None) + else: + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): + # see https://github.com/pandas-dev/pandas/issues/31299, need to allow + # this for now (would otherwise raise in check_array_indexer) + pass + else: + key = check_array_indexer(self, key) + + is_period = is_period_dtype(self) + if is_period: + freq = self.freq + else: + freq = None + if isinstance(key, slice): + if self.freq is not None and key.step is not None: + freq = key.step * self.freq + else: + freq = self.freq + elif key is Ellipsis: + # GH#21282 indexing with Ellipsis is similar to a full slice, + # should preserve `freq` attribute + freq = self.freq + + result = getitem(key) + if result.ndim > 1: + # To support MPL which performs slicing with 2 dim + # even though it only has 1 dim by definition + return result + + return self._simple_new(result, dtype=self.dtype, freq=freq) + + def __setitem__( + self, + key: Union[int, Sequence[int], Sequence[bool], slice], + value: Union[NaTType, Any, Sequence[Any]], + ) -> None: + # I'm fudging the types a bit here. "Any" above really depends + # on type(self). For PeriodArray, it's Period (or stuff coercible + # to a period in from_sequence). For DatetimeArray, it's Timestamp... + # I don't know if mypy can do that, possibly with Generics. 
+ # https://mypy.readthedocs.io/en/latest/generics.html + if lib.is_scalar(value) and not isna(value): + value = com.maybe_box_datetimelike(value) + + if is_list_like(value): + is_slice = isinstance(key, slice) + + if lib.is_scalar(key): + raise ValueError("setting an array element with a sequence.") + + if not is_slice: + key = cast(Sequence, key) + if len(key) != len(value) and not com.is_bool_indexer(key): + msg = ( + f"shape mismatch: value array of length '{len(key)}' " + "does not match indexing result of length " + f"'{len(value)}'." + ) + raise ValueError(msg) + elif not len(key): + return + + value = type(self)._from_sequence(value, dtype=self.dtype) + self._check_compatible_with(value, setitem=True) + value = value.asi8 + elif isinstance(value, self._scalar_type): + self._check_compatible_with(value, setitem=True) + value = self._unbox_scalar(value) + elif is_valid_nat_for_dtype(value, self.dtype): + value = iNaT + else: + msg = ( + f"'value' should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." + ) + raise TypeError(msg) + self._data[key] = value + self._maybe_clear_freq() + + def _maybe_clear_freq(self): + # inplace operations like __setitem__ may invalidate the freq of + # DatetimeArray and TimedeltaArray + pass + + def astype(self, dtype, copy=True): + # Some notes on cases we don't have to handle here in the base class: + # 1. PeriodArray.astype handles period -> period + # 2. DatetimeArray.astype handles conversion between tz. + # 3. DatetimeArray.astype handles datetime -> period + from pandas import Categorical + + dtype = pandas_dtype(dtype) + + if is_object_dtype(dtype): + return self._box_values(self.asi8) + elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): + return self._format_native_types() + elif is_integer_dtype(dtype): + # we deliberately ignore int32 vs. int64 here. + # See https://github.com/pandas-dev/pandas/issues/24381 for more. + values = self.asi8 + + if is_unsigned_integer_dtype(dtype): + # Again, we ignore int32 vs. int64 + values = values.view("uint64") + + if copy: + values = values.copy() + return values + elif ( + is_datetime_or_timedelta_dtype(dtype) + and not is_dtype_equal(self.dtype, dtype) + ) or is_float_dtype(dtype): + # disallow conversion between datetime/timedelta, + # and conversions for any datetimelike to float + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) + elif is_categorical_dtype(dtype): + return Categorical(self, dtype=dtype) + else: + return np.asarray(self, dtype=dtype) + + def view(self, dtype=None): + if dtype is None or dtype is self.dtype: + return type(self)(self._data, dtype=self.dtype) + return self._data.view(dtype=dtype) + + # ------------------------------------------------------------------ + # ExtensionArray Interface + + def unique(self): + result = unique1d(self.asi8) + return type(self)(result, dtype=self.dtype) + + def _validate_fill_value(self, fill_value): + """ + If a fill_value is passed to `take` convert it to an i8 representation, + raising ValueError if this is not possible. 
+ + Parameters + ---------- + fill_value : object + + Returns + ------- + fill_value : np.int64 + + Raises + ------ + ValueError + """ + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, self._recognized_scalars): + self._check_compatible_with(fill_value) + fill_value = self._scalar_type(fill_value) + fill_value = self._unbox_scalar(fill_value) + else: + raise ValueError( + f"'fill_value' should be a {self._scalar_type}. Got '{fill_value}'." + ) + return fill_value + + def take(self, indices, allow_fill=False, fill_value=None): + if allow_fill: + fill_value = self._validate_fill_value(fill_value) + + new_values = take( + self.asi8, indices, allow_fill=allow_fill, fill_value=fill_value + ) + + return type(self)(new_values, dtype=self.dtype) + + @classmethod + def _concat_same_type(cls, to_concat): + dtypes = {x.dtype for x in to_concat} + assert len(dtypes) == 1 + dtype = list(dtypes)[0] + + values = np.concatenate([x.asi8 for x in to_concat]) + return cls(values, dtype=dtype) + + def copy(self): + values = self.asi8.copy() + return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) + + def _values_for_factorize(self): + return self.asi8, iNaT + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, dtype=original.dtype) + + def _values_for_argsort(self): + return self._data + + # ------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. + + def searchsorted(self, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `value`. + """ + if isinstance(value, str): + value = self._scalar_from_string(value) + + if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): + raise ValueError(f"Unexpected type for 'value': {type(value)}") + + self._check_compatible_with(value) + if isinstance(value, type(self)): + value = value.asi8 + else: + value = self._unbox_scalar(value) + + return self.asi8.searchsorted(value, side=side, sorter=sorter) + + def repeat(self, repeats, *args, **kwargs): + """ + Repeat elements of an array. + + See Also + -------- + numpy.ndarray.repeat + """ + nv.validate_repeat(args, kwargs) + values = self._data.repeat(repeats) + return type(self)(values.view("i8"), dtype=self.dtype) + + def value_counts(self, dropna=False): + """ + Return a Series containing counts of unique values. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaT values. 
+ + Returns + ------- + Series + """ + from pandas import Series, Index + + if dropna: + values = self[~self.isna()]._data + else: + values = self._data + + cls = type(self) + + result = value_counts(values, sort=False, dropna=dropna) + index = Index( + cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name + ) + return Series(result.values, index=index, name=result.name) + + def map(self, mapper): + # TODO(GH-23179): Add ExtensionArray.map + # Need to figure out if we want ExtensionArray.map first. + # If so, then we can refactor IndexOpsMixin._map_values to + # a standalone function and call from here.. + # Else, just rewrite _map_infer_values to do the right thing. + from pandas import Index + + return Index(self).map(mapper).array + + # ------------------------------------------------------------------ + # Null Handling + + def isna(self): + return self._isnan + + @property # NB: override with cache_readonly in immutable subclasses + def _isnan(self): + """ + return if each value is nan + """ + return self.asi8 == iNaT + + @property # NB: override with cache_readonly in immutable subclasses + def _hasnans(self): + """ + return if I have any nans; enables various perf speedups + """ + return bool(self._isnan.any()) + + def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): + """ + Parameters + ---------- + result : a ndarray + fill_value : object, default iNaT + convert : str, dtype or None + + Returns + ------- + result : ndarray with values replace by the fill_value + + mask the result if needed, convert to the provided dtype if its not + None + + This is an internal routine. + """ + + if self._hasnans: + if convert: + result = result.astype(convert) + if fill_value is None: + fill_value = np.nan + result[self._isnan] = fill_value + return result + + def fillna(self, value=None, method=None, limit=None): + # TODO(GH-20300): remove this + # Just overriding to ensure that we avoid an astype(object). + # Either 20300 or a `_values_for_fillna` would avoid this duplication. + if isinstance(value, ABCSeries): + value = value.array + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + if method == "pad": + func = missing.pad_1d + else: + func = missing.backfill_1d + + values = self._data + if not is_period_dtype(self): + # For PeriodArray self._data is i8, which gets copied + # by `func`. Otherwise we need to make a copy manually + # to avoid modifying `self` in-place. + values = values.copy() + + new_values = func(values, limit=limit, mask=mask) + if is_datetime64tz_dtype(self): + # we need to pass int64 values to the constructor to avoid + # re-localizing incorrectly + new_values = new_values.view("i8") + new_values = type(self)(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + # ------------------------------------------------------------------ + # Frequency Properties/Methods + + @property + def freq(self): + """ + Return the frequency object if it is set, otherwise None. 
+ """ + return self._freq + + @freq.setter + def freq(self, value): + if value is not None: + value = frequencies.to_offset(value) + self._validate_frequency(self, value) + + self._freq = value + + @property + def freqstr(self): + """ + Return the frequency object as a string if its set, otherwise None + """ + if self.freq is None: + return None + return self.freq.freqstr + + @property # NB: override with cache_readonly in immutable subclasses + def inferred_freq(self): + """ + Tryies to return a string representing a frequency guess, + generated by infer_freq. Returns None if it can't autodetect the + frequency. + """ + if self.ndim != 1: + return None + try: + return frequencies.infer_freq(self) + except ValueError: + return None + + @property # NB: override with cache_readonly in immutable subclasses + def _resolution(self): + return frequencies.Resolution.get_reso_from_freq(self.freqstr) + + @property # NB: override with cache_readonly in immutable subclasses + def resolution(self): + """ + Returns day, hour, minute, second, millisecond or microsecond + """ + return frequencies.Resolution.get_str(self._resolution) + + @classmethod + def _validate_frequency(cls, index, freq, **kwargs): + """ + Validate that a frequency is compatible with the values of a given + Datetime Array/Index or Timedelta Array/Index + + Parameters + ---------- + index : DatetimeIndex or TimedeltaIndex + The index on which to determine if the given frequency is valid + freq : DateOffset + The frequency to validate + """ + if is_period_dtype(cls): + # Frequency validation is not meaningful for Period Array/Index + return None + + inferred = index.inferred_freq + if index.size == 0 or inferred == freq.freqstr: + return None + + try: + on_freq = cls._generate_range( + start=index[0], end=None, periods=len(index), freq=freq, **kwargs + ) + if not np.array_equal(index.asi8, on_freq.asi8): + raise ValueError + except ValueError as e: + if "non-fixed" in str(e): + # non-fixed frequencies are not meaningful for timedelta64; + # we retain that error message + raise e + # GH#11587 the main way this is reached is if the `np.array_equal` + # check above is False. This can also be reached if index[0] + # is `NaT`, in which case the call to `cls._generate_range` will + # raise a ValueError, which we re-raise with a more targeted + # message. 
+ raise ValueError( + f"Inferred frequency {inferred} from passed values " + f"does not conform to passed frequency {freq.freqstr}" + ) + + # monotonicity/uniqueness properties are called via frequencies.infer_freq, + # see GH#23789 + + @property + def _is_monotonic_increasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[0] + + @property + def _is_monotonic_decreasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[1] + + @property + def _is_unique(self): + return len(unique1d(self.asi8)) == len(self) + + # ------------------------------------------------------------------ + # Arithmetic Methods + _create_comparison_method = classmethod(_datetimelike_array_cmp) + + # pow is invalid for all three subclasses; TimedeltaArray will override + # the multiplication and division ops + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") + + def _add_datetimelike_scalar(self, other): + # Overridden by TimedeltaArray + raise TypeError(f"cannot add {type(self).__name__} and {type(other).__name__}") + + _add_datetime_arraylike = _add_datetimelike_scalar + + def _sub_datetimelike_scalar(self, other): + # Overridden by DatetimeArray + assert other is not NaT + raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}") + + _sub_datetime_arraylike = _sub_datetimelike_scalar + + def _sub_period(self, other): + # Overridden by PeriodArray + raise TypeError(f"cannot subtract Period from a {type(self).__name__}") + + def _add_offset(self, offset): + raise AbstractMethodError(self) + + def _add_delta(self, other): + """ + Add a timedelta-like, Tick or TimedeltaIndex-like object + to self, yielding an int64 numpy array + + Parameters + ---------- + delta : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} + + Returns + ------- + result : ndarray[int64] + + Notes + ----- + The result's name is set outside of _add_delta by the calling + method (__add__ or __sub__), if necessary (i.e. for Indexes). 
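On the user-facing side, `_validate_frequency` (defined above) is what rejects a `freq` that the supplied values do not conform to. A small sketch; the exact error wording may vary between versions:

```python
import pandas as pd

# A daily freq that matches the data is accepted as-is
ok = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"], freq="D")
print(ok.freq)

# A freq the values do not conform to raises during validation
try:
    pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-04"], freq="D")
except ValueError as err:
    print(err)
```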
+ """ + if isinstance(other, (Tick, timedelta, np.timedelta64)): + new_values = self._add_timedeltalike_scalar(other) + elif is_timedelta64_dtype(other): + # ndarray[timedelta64] or TimedeltaArray/index + new_values = self._add_delta_tdi(other) + + return new_values + + def _add_timedeltalike_scalar(self, other): + """ + Add a delta of a timedeltalike + return the i8 result view + """ + if isna(other): + # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds + new_values = np.empty(self.shape, dtype="i8") + new_values[:] = iNaT + return new_values + + inc = delta_to_nanoseconds(other) + new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( + "i8" + ) + new_values = self._maybe_mask_results(new_values) + return new_values.view("i8") + + def _add_delta_tdi(self, other): + """ + Add a delta of a TimedeltaIndex + return the i8 result view + """ + if len(self) != len(other): + raise ValueError("cannot add indices of unequal length") + + if isinstance(other, np.ndarray): + # ndarray[timedelta64]; wrap in TimedeltaIndex for op + from pandas.core.arrays import TimedeltaArray + + other = TimedeltaArray._from_sequence(other) + + self_i8 = self.asi8 + other_i8 = other.asi8 + new_values = checked_add_with_arr( + self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan + ) + if self._hasnans or other._hasnans: + mask = (self._isnan) | (other._isnan) + new_values[mask] = iNaT + return new_values.view("i8") + + def _add_nat(self): + """ + Add pd.NaT to self + """ + if is_period_dtype(self): + raise TypeError( + f"Cannot add {type(self).__name__} and {type(NaT).__name__}" + ) + + # GH#19124 pd.NaT is treated like a timedelta for both timedelta + # and datetime dtypes + result = np.zeros(self.shape, dtype=np.int64) + result.fill(iNaT) + return type(self)(result, dtype=self.dtype, freq=None) + + def _sub_nat(self): + """ + Subtract pd.NaT from self + """ + # GH#19124 Timedelta - datetime is not in general well-defined. + # We make an exception for pd.NaT, which in this case quacks + # like a timedelta. + # For datetime64 dtypes by convention we treat NaT as a datetime, so + # this subtraction returns a timedelta64 dtype. + # For period dtype, timedelta64 is a close-enough return dtype. + result = np.zeros(self.shape, dtype=np.int64) + result.fill(iNaT) + return result.view("timedelta64[ns]") + + def _sub_period_array(self, other): + """ + Subtract a Period Array/Index from self. This is only valid if self + is itself a Period Array/Index, raises otherwise. Both objects must + have the same frequency. + + Parameters + ---------- + other : PeriodIndex or PeriodArray + + Returns + ------- + result : np.ndarray[object] + Array of DateOffset objects; nulls represented by NaT. 
+ """ + if not is_period_dtype(self): + raise TypeError( + f"cannot subtract {other.dtype}-dtype from {type(self).__name__}" + ) + + if self.freq != other.freq: + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) + raise IncompatibleFrequency(msg) + + new_values = checked_add_with_arr( + self.asi8, -other.asi8, arr_mask=self._isnan, b_mask=other._isnan + ) + + new_values = np.array([self.freq.base * x for x in new_values]) + if self._hasnans or other._hasnans: + mask = (self._isnan) | (other._isnan) + new_values[mask] = NaT + return new_values + + def _addsub_object_array(self, other: np.ndarray, op): + """ + Add or subtract array-like of DateOffset objects + + Parameters + ---------- + other : np.ndarray[object] + op : {operator.add, operator.sub} + + Returns + ------- + result : same class as self + """ + assert op in [operator.add, operator.sub] + if len(other) == 1: + return op(self, other[0]) + + warnings.warn( + "Adding/subtracting array of DateOffsets to " + f"{type(self).__name__} not vectorized", + PerformanceWarning, + ) + + # For EA self.astype('O') returns a numpy array, not an Index + left = self.astype("O") + + res_values = op(left, np.array(other)) + kwargs = {} + if not is_period_dtype(self): + kwargs["freq"] = "infer" + try: + res = type(self)._from_sequence(res_values, **kwargs) + except ValueError: + # e.g. we've passed a Timestamp to TimedeltaArray + res = res_values + return res + + def _time_shift(self, periods, freq=None): + """ + Shift each value by `periods`. + + Note this is different from ExtensionArray.shift, which + shifts the *position* of each element, padding the end with + missing values. + + Parameters + ---------- + periods : int + Number of periods to shift by. + freq : pandas.DateOffset, pandas.Timedelta, or str + Frequency increment to shift by. + """ + if freq is not None and freq != self.freq: + if isinstance(freq, str): + freq = frequencies.to_offset(freq) + offset = periods * freq + result = self + offset + return result + + if periods == 0: + # immutable so OK + return self.copy() + + if self.freq is None: + raise NullFrequencyError("Cannot shift with no freq") + + start = self[0] + periods * self.freq + end = self[-1] + periods * self.freq + + # Note: in the DatetimeTZ case, _generate_range will infer the + # appropriate timezone from `start` and `end`, so tz does not need + # to be passed explicitly. + return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + + @unpack_zerodim_and_defer("__add__") + def __add__(self, other): + + # scalar others + if other is NaT: + result = self._add_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_delta(other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._add_datetimelike_scalar(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + if not is_period_dtype(self): + raise integer_op_not_supported(self) + result = self._time_shift(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_delta(other) + elif is_object_dtype(other): + # e.g. 
Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] + return self._add_datetime_arraylike(other) + elif is_integer_dtype(other): + if not is_period_dtype(self): + raise integer_op_not_supported(self) + result = self._addsub_int_array(other, operator.add) + else: + # Includes Categorical, other ExtensionArrays + # For PeriodDtype, if self is a TimedeltaArray and other is a + # PeriodArray with a timedelta-like (i.e. Tick) freq, this + # operation is valid. Defer to the PeriodArray implementation. + # In remaining cases, this will end up raising TypeError. + return NotImplemented + + if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray(result) + return result + + def __radd__(self, other): + # alias for __add__ + return self.__add__(other) + + @unpack_zerodim_and_defer("__sub__") + def __sub__(self, other): + + # scalar others + if other is NaT: + result = self._sub_nat() + elif isinstance(other, (Tick, timedelta, np.timedelta64)): + result = self._add_delta(-other) + elif isinstance(other, DateOffset): + # specifically _not_ a Tick + result = self._add_offset(-other) + elif isinstance(other, (datetime, np.datetime64)): + result = self._sub_datetimelike_scalar(other) + elif lib.is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + if not is_period_dtype(self): + raise integer_op_not_supported(self) + result = self._time_shift(-other) + + elif isinstance(other, Period): + result = self._sub_period(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] + result = self._add_delta(-other) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] + result = self._sub_datetime_arraylike(other) + elif is_period_dtype(other): + # PeriodIndex + result = self._sub_period_array(other) + elif is_integer_dtype(other): + if not is_period_dtype(self): + raise integer_op_not_supported(self) + result = self._addsub_int_array(other, operator.sub) + else: + # Includes ExtensionArrays, float_dtype + return NotImplemented + + if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray(result) + return result + + def __rsub__(self, other): + if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype): + # ndarray[datetime64] cannot be subtracted from self, so + # we need to wrap in DatetimeArray/Index and flip the operation + if lib.is_scalar(other): + # i.e. np.datetime64 object + return Timestamp(other) - self + if not isinstance(other, DatetimeLikeArrayMixin): + # Avoid down-casting DatetimeIndex + from pandas.core.arrays import DatetimeArray + + other = DatetimeArray(other) + return other - self + elif ( + is_datetime64_any_dtype(self.dtype) + and hasattr(other, "dtype") + and not is_datetime64_any_dtype(other.dtype) + ): + # GH#19959 datetime - datetime is well-defined as timedelta, + # but any other type - datetime is not well-defined. 
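Taken together, the `__add__`/`__sub__` dispatch shown above yields the familiar high-level behavior. A hedged sketch of what is accepted and what raises with the code as vendored here (pandas 1.0-era; on 0.25 the integer case only warned):

```python
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3, freq="D")

print(dti + pd.Timedelta("1D"))                   # timedelta-like scalar shifts every element
print(dti - dti)                                  # datetime - datetime gives a TimedeltaIndex
print(dti + pd.to_timedelta(["1D", "2D", "3D"]))  # elementwise with a timedelta array

try:
    dti + 1  # integers are only meaningful for Period dtype
except TypeError as err:
    print(err)
```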
+ raise TypeError( + f"cannot subtract {type(self).__name__} from {type(other).__name__}" + ) + elif is_period_dtype(self.dtype) and is_timedelta64_dtype(other): + # TODO: Can we simplify/generalize these cases at all? + raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") + elif is_timedelta64_dtype(self.dtype): + if lib.is_integer(other) or is_integer_dtype(other): + # need to subtract before negating, since that flips freq + # -self flips self.freq, messing up results + return -(self - other) + + return (-self) + other + + return -(self - other) + + def __iadd__(self, other): # type: ignore + result = self + other + self[:] = result[:] + + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self + + def __isub__(self, other): # type: ignore + result = self - other + self[:] = result[:] + + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self + + # -------------------------------------------------------------- + # Reductions + + def _reduce(self, name, axis=0, skipna=True, **kwargs): + op = getattr(self, name, None) + if op: + return op(skipna=skipna, **kwargs) + else: + return super()._reduce(name, skipna, **kwargs) + + def min(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the minimum value of the Array or minimum along + an axis. + + See Also + -------- + numpy.ndarray.min + Index.min : Return the minimum value in an Index. + Series.min : Return the minimum value in a Series. + """ + nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) + + result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) + if isna(result): + # Period._from_ordinal does not handle np.nan gracefully + return NaT + return self._box_func(result) + + def max(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the maximum value of the Array or maximum along + an axis. + + See Also + -------- + numpy.ndarray.max + Index.max : Return the maximum value in an Index. + Series.max : Return the maximum value in a Series. + """ + # TODO: skipna is broken with max. + # See https://github.com/pandas-dev/pandas/issues/24265 + nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) + + mask = self.isna() + if skipna: + values = self[~mask].asi8 + elif mask.any(): + return NaT + else: + values = self.asi8 + + if not len(values): + # short-circuit for empty max / min + return NaT + + result = nanops.nanmax(values, skipna=skipna) + # Don't have to worry about NA `result`, since no NA went in. + return self._box_func(result) + + def mean(self, skipna=True): + """ + Return the mean value of the Array. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + skipna : bool, default True + Whether to ignore any NaT elements. + + Returns + ------- + scalar + Timestamp or Timedelta. + + See Also + -------- + numpy.ndarray.mean : Returns the average of array elements along a given axis. + Series.mean : Return the mean value in a Series. + + Notes + ----- + mean is only defined for Datetime and Timedelta dtypes, not for Period. + """ + if is_period_dtype(self): + # See discussion in GH#24757 + raise TypeError( + f"mean is not implemented for {type(self).__name__} since the " + "meaning is ambiguous. 
An alternative is " + "obj.to_timestamp(how='start').mean()" + ) + + mask = self.isna() + if skipna: + values = self[~mask] + elif mask.any(): + return NaT + else: + values = self + + if not len(values): + # short-circuit for empty max / min + return NaT + + result = nanops.nanmean(values.view("i8"), skipna=skipna) + # Don't have to worry about NA `result`, since no NA went in. + return self._box_func(result) + + +DatetimeLikeArrayMixin._add_comparison_ops() + +# ------------------------------------------------------------------- +# Shared Constructor Helpers + + +def validate_periods(periods): + """ + If a `periods` argument is passed to the Datetime/Timedelta Array/Index + constructor, cast it to an integer. + + Parameters + ---------- + periods : None, float, int + + Returns + ------- + periods : None or int + + Raises + ------ + TypeError + if periods is None, float, or int + """ + if periods is not None: + if lib.is_float(periods): + periods = int(periods) + elif not lib.is_integer(periods): + raise TypeError(f"periods must be a number, got {periods}") + return periods + + +def validate_endpoints(closed): + """ + Check that the `closed` argument is among [None, "left", "right"] + + Parameters + ---------- + closed : {None, "left", "right"} + + Returns + ------- + left_closed : bool + right_closed : bool + + Raises + ------ + ValueError : if argument is not among valid values + """ + left_closed = False + right_closed = False + + if closed is None: + left_closed = True + right_closed = True + elif closed == "left": + left_closed = True + elif closed == "right": + right_closed = True + else: + raise ValueError("Closed has to be either 'left', 'right' or None") + + return left_closed, right_closed + + +def validate_inferred_freq(freq, inferred_freq, freq_infer): + """ + If the user passes a freq and another freq is inferred from passed data, + require that they match. + + Parameters + ---------- + freq : DateOffset or None + inferred_freq : DateOffset or None + freq_infer : bool + + Returns + ------- + freq : DateOffset or None + freq_infer : bool + + Notes + ----- + We assume at this point that `maybe_infer_freq` has been called, so + `freq` is either a DateOffset object or None. + """ + if inferred_freq is not None: + if freq is not None and freq != inferred_freq: + raise ValueError( + f"Inferred frequency {inferred_freq} from passed " + "values does not conform to passed frequency " + f"{freq.freqstr}" + ) + elif freq is None: + freq = inferred_freq + freq_infer = False + + return freq, freq_infer + + +def maybe_infer_freq(freq): + """ + Comparing a DateOffset to the string "infer" raises, so we need to + be careful about comparisons. Make a dummy variable `freq_infer` to + signify the case where the given freq is "infer" and set freq to None + to avoid comparison trouble later on. 
+ + Parameters + ---------- + freq : {DateOffset, None, str} + + Returns + ------- + freq : {DateOffset, None} + freq_infer : bool + """ + freq_infer = False + if not isinstance(freq, DateOffset): + # if a passed freq is None, don't infer automatically + if freq != "infer": + freq = frequencies.to_offset(freq) + else: + freq_infer = True + freq = None + return freq, freq_infer diff --git a/venv/Lib/site-packages/pandas/core/arrays/datetimes.py b/venv/Lib/site-packages/pandas/core/arrays/datetimes.py new file mode 100644 index 0000000..e42402b --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/datetimes.py @@ -0,0 +1,2154 @@ +from datetime import datetime, time, timedelta +from typing import Union +import warnings + +import numpy as np +from pytz import utc + +from pandas._libs import lib, tslib +from pandas._libs.tslibs import ( + NaT, + Timestamp, + ccalendar, + conversion, + fields, + iNaT, + normalize_date, + resolution as libresolution, + timezones, + tzconversion, +) +from pandas.errors import PerformanceWarning + +from pandas.core.dtypes.common import ( + _INT64_DTYPE, + _NS_DTYPE, + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_object_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core.algorithms import checked_add_with_arr +from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range +import pandas.core.common as com + +from pandas.tseries.frequencies import get_period_alias, to_offset +from pandas.tseries.offsets import Day, Tick + +_midnight = time(0, 0) + + +def tz_to_dtype(tz): + """ + Return a datetime64[ns] dtype appropriate for the given timezone. + + Parameters + ---------- + tz : tzinfo or None + + Returns + ------- + np.dtype or Datetime64TZDType + """ + if tz is None: + return _NS_DTYPE + else: + return DatetimeTZDtype(tz=tz) + + +def _field_accessor(name, field, docstring=None): + def f(self): + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + + if field in self._bool_ops: + if field.endswith(("start", "end")): + freq = self.freq + month_kw = 12 + if freq: + kwds = freq.kwds + month_kw = kwds.get("startingMonth", kwds.get("month", 12)) + + result = fields.get_start_end_field( + values, field, self.freqstr, month_kw + ) + else: + result = fields.get_date_field(values, field) + + # these return a boolean by-definition + return result + + if field in self._object_ops: + result = fields.get_date_name_field(values, field) + result = self._maybe_mask_results(result, fill_value=None) + + else: + result = fields.get_date_field(values, field) + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) + + return result + + f.__name__ = name + f.__doc__ = docstring + return property(f) + + +class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): + """ + Pandas ExtensionArray for tz-naive or tz-aware datetime data. + + .. versionadded:: 0.24.0 + + .. warning:: + + DatetimeArray is currently experimental, and its API may change + without warning. 
In particular, :attr:`DatetimeArray.dtype` is + expected to change to always be an instance of an ``ExtensionDtype`` + subclass. + + Parameters + ---------- + values : Series, Index, DatetimeArray, ndarray + The datetime data. + + For DatetimeArray `values` (or a Series or Index boxing one), + `dtype` and `freq` will be extracted from `values`. + + dtype : numpy.dtype or DatetimeTZDtype + Note that the only NumPy dtype allowed is 'datetime64[ns]'. + freq : str or Offset, optional + The frequency. + copy : bool, default False + Whether to copy the underlying array of values. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + _typ = "datetimearray" + _scalar_type = Timestamp + _recognized_scalars = (datetime, np.datetime64) + _is_recognized_dtype = is_datetime64_any_dtype + + # define my properties & methods for delegation + _bool_ops = [ + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + ] + _object_ops = ["freq", "tz"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "weekday", + "dayofweek", + "dayofyear", + "quarter", + "days_in_month", + "daysinmonth", + "microsecond", + "nanosecond", + ] + _other_ops = ["date", "time", "timetz"] + _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops + _datetimelike_methods = [ + "to_period", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "month_name", + "day_name", + ] + + # ndim is inherited from ExtensionArray, must exist to ensure + # Timestamp.__richcmp__(DateTimeArray) operates pointwise + + # ensure that operations with numpy arrays defer to our implementation + __array_priority__ = 1000 + + # ----------------------------------------------------------------- + # Constructors + + _dtype: Union[np.dtype, DatetimeTZDtype] + _freq = None + + def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): + if isinstance(values, (ABCSeries, ABCIndexClass)): + values = values._values + + inferred_freq = getattr(values, "_freq", None) + + if isinstance(values, type(self)): + # validation + dtz = getattr(dtype, "tz", None) + if dtz and values.tz is None: + dtype = DatetimeTZDtype(tz=dtype.tz) + elif dtz and values.tz: + if not timezones.tz_compare(dtz, values.tz): + msg = ( + "Timezone of the array and 'dtype' do not match. " + f"'{dtz}' != '{values.tz}'" + ) + raise TypeError(msg) + elif values.tz: + dtype = values.dtype + + if freq is None: + freq = values.freq + values = values._data + + if not isinstance(values, np.ndarray): + msg = ( + f"Unexpected type '{type(values).__name__}'. 'values' must be " + "a DatetimeArray ndarray, or Series or Index containing one of those." + ) + raise ValueError(msg) + if values.ndim not in [1, 2]: + raise ValueError("Only 1-dimensional input arrays are supported.") + + if values.dtype == "i8": + # for compat with datetime/timedelta/period shared methods, + # we can sometimes get here with int64 values. These represent + # nanosecond UTC (or tz-naive) unix timestamps + values = values.view(_NS_DTYPE) + + if values.dtype != _NS_DTYPE: + msg = ( + "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." + f" Got {values.dtype} instead." + ) + raise ValueError(msg) + + dtype = _validate_dt64_dtype(dtype) + + if freq == "infer": + msg = ( + "Frequency inference not allowed in DatetimeArray.__init__. " + "Use 'pd.array()' instead." 
+ ) + raise ValueError(msg) + + if copy: + values = values.copy() + if freq: + freq = to_offset(freq) + if getattr(dtype, "tz", None): + # https://github.com/pandas-dev/pandas/issues/18595 + # Ensure that we have a standard timezone for pytz objects. + # Without this, things like adding an array of timedeltas and + # a tz-aware Timestamp (with a tz specific to its datetime) will + # be incorrect(ish?) for the array as a whole + dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) + + self._data = values + self._dtype = dtype + self._freq = freq + + if inferred_freq is None and freq is not None: + type(self)._validate_frequency(self, freq) + + @classmethod + def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): + assert isinstance(values, np.ndarray) + if values.dtype == "i8": + values = values.view(_NS_DTYPE) + + result = object.__new__(cls) + result._data = values + result._freq = freq + result._dtype = dtype + return result + + @classmethod + def _from_sequence( + cls, + data, + dtype=None, + copy=False, + tz=None, + freq=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", + ): + + freq, freq_infer = dtl.maybe_infer_freq(freq) + + subarr, tz, inferred_freq = sequence_to_dt64ns( + data, + dtype=dtype, + copy=copy, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + ) + + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) + + dtype = tz_to_dtype(tz) + result = cls._simple_new(subarr, freq=freq, dtype=dtype) + + if inferred_freq is None and freq is not None: + # this condition precludes `freq_infer` + cls._validate_frequency(result, freq, ambiguous=ambiguous) + + elif freq_infer: + # Set _freq directly to bypass duplicative _validate_frequency + # check. + result._freq = to_offset(result.inferred_freq) + + return result + + @classmethod + def _generate_range( + cls, + start, + end, + periods, + freq, + tz=None, + normalize=False, + ambiguous="raise", + nonexistent="raise", + closed=None, + ): + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError("Must provide freq argument if no data is supplied") + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + freq = to_offset(freq) + + if start is not None: + start = Timestamp(start) + + if end is not None: + end = Timestamp(end) + + if start is None and end is None: + if closed is not None: + raise ValueError( + "Closed has to be None if not both of start and end are defined" + ) + if start is NaT or end is NaT: + raise ValueError("Neither `start` nor `end` can be NaT") + + left_closed, right_closed = dtl.validate_endpoints(closed) + + start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize) + + tz = _infer_tz_from_endpoints(start, end, tz) + + if tz is not None: + # Localize the start and end arguments + start = _maybe_localize_point( + start, + getattr(start, "tz", None), + start, + freq, + tz, + ambiguous, + nonexistent, + ) + end = _maybe_localize_point( + end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent + ) + if freq is not None: + # We break Day arithmetic (fixed 24 hour) here and opt for + # Day to mean calendar day (23/24/25 hour). 
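`_generate_range` is the engine behind `pd.date_range`: exactly three of `start`/`end`/`periods`/`freq` must be given, and `closed` (renamed `inclusive` in later pandas) trims an endpoint. A quick sketch:

```python
import pandas as pd

print(pd.date_range(start="2020-01-01", periods=4, freq="D", tz="Europe/Berlin"))
print(pd.date_range("2020-01-01", "2020-01-05", freq="D", closed="left"))

try:
    pd.date_range(start="2020-01-01", end="2020-01-05", periods=5, freq="D")
except ValueError as err:
    print(err)  # exactly three of the four parameters must be specified
```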
Therefore, strip + # tz info from start and day to avoid DST arithmetic + if isinstance(freq, Day): + if start is not None: + start = start.tz_localize(None) + if end is not None: + end = end.tz_localize(None) + # TODO: consider re-implementing _cached_range; GH#17914 + values, _tz = generate_regular_range(start, end, periods, freq) + index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz)) + + if tz is not None and index.tz is None: + arr = conversion.tz_localize_to_utc( + index.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + + index = cls(arr) + + # index is localized datetime64 array -> have to convert + # start/end as well to compare + if start is not None: + start = start.tz_localize(tz).asm8 + if end is not None: + end = end.tz_localize(tz).asm8 + else: + # Create a linearly spaced date_range in local time + # Nanosecond-granularity timestamps aren't always correctly + # representable with doubles, so we limit the range that we + # pass to np.linspace as much as possible + arr = ( + np.linspace(0, end.value - start.value, periods, dtype="int64") + + start.value + ) + dtype = tz_to_dtype(tz) + index = cls._simple_new( + arr.astype("M8[ns]", copy=False), freq=None, dtype=dtype + ) + + if not left_closed and len(index) and index[0] == start: + index = index[1:] + if not right_closed and len(index) and index[-1] == end: + index = index[:-1] + + dtype = tz_to_dtype(tz) + return cls._simple_new(index.asi8, freq=freq, dtype=dtype) + + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timestamp.") + if not isna(value): + self._check_compatible_with(value) + return value.value + + def _scalar_from_string(self, value): + return Timestamp(value, tz=self.tz) + + def _check_compatible_with(self, other, setitem: bool = False): + if other is NaT: + return + self._assert_tzawareness_compat(other) + if setitem: + # Stricter check for setitem vs comparison methods + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") + + def _maybe_clear_freq(self): + self._freq = None + + # ----------------------------------------------------------------- + # Descriptive Properties + + @property + def _box_func(self): + return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) + + @property + def dtype(self) -> Union[np.dtype, DatetimeTZDtype]: + """ + The dtype for the DatetimeArray. + + .. warning:: + + A future version of pandas will change dtype to never be a + ``numpy.dtype``. Instead, :attr:`DatetimeArray.dtype` will + always be an instance of an ``ExtensionDtype`` subclass. + + Returns + ------- + numpy.dtype or DatetimeTZDtype + If the values are tz-naive, then ``np.dtype('datetime64[ns]')`` + is returned. + + If the values are tz-aware, then the ``DatetimeTZDtype`` + is returned. + """ + return self._dtype + + @property + def tz(self): + """ + Return timezone, if any. + + Returns + ------- + datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None + Returns None when the array is tz-naive. + """ + # GH 18595 + return getattr(self.dtype, "tz", None) + + @tz.setter + def tz(self, value): + # GH 3746: Prevent localizing or converting the index by setting tz + raise AttributeError( + "Cannot directly set timezone. 
Use tz_localize() " + "or tz_convert() as appropriate" + ) + + @property + def tzinfo(self): + """ + Alias for tz attribute + """ + return self.tz + + @property # NB: override with cache_readonly in immutable subclasses + def _timezone(self): + """ + Comparable timezone both for pytz / dateutil + """ + return timezones.get_timezone(self.tzinfo) + + @property # NB: override with cache_readonly in immutable subclasses + def is_normalized(self): + """ + Returns True if all of the dates are at midnight ("no time") + """ + return conversion.is_date_array_normalized(self.asi8, self.tz) + + @property # NB: override with cache_readonly in immutable subclasses + def _resolution(self): + return libresolution.resolution(self.asi8, self.tz) + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + def __array__(self, dtype=None) -> np.ndarray: + if dtype is None and self.tz: + # The default for tz-aware is object, to preserve tz info + dtype = object + + return super().__array__(dtype=dtype) + + def __iter__(self): + """ + Return an iterator over the boxed values + + Yields + ------ + tstamp : Timestamp + """ + + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = tslib.ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) + for v in converted: + yield v + + def astype(self, dtype, copy=True): + # We handle + # --> datetime + # --> period + # DatetimeLikeArrayMixin Super handles the rest. + dtype = pandas_dtype(dtype) + + if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): + # GH#18951: datetime64_ns dtype but not equal means different tz + new_tz = getattr(dtype, "tz", None) + if getattr(self.dtype, "tz", None) is None: + return self.tz_localize(new_tz) + result = self.tz_convert(new_tz) + if new_tz is None: + # Do we want .astype('datetime64[ns]') to be an ndarray. + # The astype in Block._astype expects this to return an + # ndarray, but we could maybe work around it there. 
+ result = result._data + return result + elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, dtype): + if copy: + return self.copy() + return self + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) + return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) + + # ----------------------------------------------------------------- + # Rendering Methods + + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): + from pandas.io.formats.format import _get_format_datetime64_from_values + + fmt = _get_format_datetime64_from_values(self, date_format) + + return tslib.format_array_from_datetime( + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + ) + + # ----------------------------------------------------------------- + # Comparison Methods + + def _has_same_tz(self, other): + zzone = self._timezone + + # vzone shouldn't be None if value is non-datetime like + if isinstance(other, np.datetime64): + # convert to Timestamp as np.datetime64 doesn't have tz attr + other = Timestamp(other) + vzone = timezones.get_timezone(getattr(other, "tzinfo", "__no_tz__")) + return zzone == vzone + + def _assert_tzawareness_compat(self, other): + # adapted from _Timestamp._assert_tzawareness_compat + other_tz = getattr(other, "tzinfo", None) + if is_datetime64tz_dtype(other): + # Get tzinfo from Series dtype + other_tz = other.dtype.tz + if other is NaT: + # pd.NaT quacks both aware and naive + pass + elif self.tz is None: + if other_tz is not None: + raise TypeError( + "Cannot compare tz-naive and tz-aware datetime-like objects." + ) + elif other_tz is None: + raise TypeError( + "Cannot compare tz-naive and tz-aware datetime-like objects" + ) + + # ----------------------------------------------------------------- + # Arithmetic Methods + + def _sub_datetime_arraylike(self, other): + """subtract DatetimeArray/Index or ndarray[datetime64]""" + if len(self) != len(other): + raise ValueError("cannot add indices of unequal length") + + if isinstance(other, np.ndarray): + assert is_datetime64_dtype(other) + other = type(self)(other) + + if not self._has_same_tz(other): + # require tz compat + raise TypeError( + f"{type(self).__name__} subtraction must have the same " + "timezones or no timezones" + ) + + self_i8 = self.asi8 + other_i8 = other.asi8 + arr_mask = self._isnan | other._isnan + new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) + if self._hasnans or other._hasnans: + new_values[arr_mask] = iNaT + return new_values.view("timedelta64[ns]") + + def _add_offset(self, offset): + if self.ndim == 2: + return self.ravel()._add_offset(offset).reshape(self.shape) + + assert not isinstance(offset, Tick) + try: + if self.tz is not None: + values = self.tz_localize(None) + else: + values = self + result = offset.apply_index(values).tz_localize(self.tz) + + except NotImplementedError: + warnings.warn( + "Non-vectorized DateOffset being applied to Series or DatetimeIndex", + PerformanceWarning, + ) + result = self.astype("O") + offset + if not len(self): + # GH#30336 _from_sequence won't be able to infer self.tz + return type(self)._from_sequence(result).tz_localize(self.tz) + + return type(self)._from_sequence(result, freq="infer") + + def _sub_datetimelike_scalar(self, other): + # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] + assert isinstance(other, (datetime, np.datetime64)) + assert other is not NaT + other = Timestamp(other) + if other is NaT: + return self - NaT + + if not self._has_same_tz(other): + # require 
tz compat + raise TypeError( + "Timestamp subtraction must have the same timezones or no timezones" + ) + + i8 = self.asi8 + result = checked_add_with_arr(i8, -other.value, arr_mask=self._isnan) + result = self._maybe_mask_results(result) + return result.view("timedelta64[ns]") + + def _add_delta(self, delta): + """ + Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self, yielding a new DatetimeArray + + Parameters + ---------- + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} + + Returns + ------- + result : DatetimeArray + """ + new_values = super()._add_delta(delta) + return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") + + # ----------------------------------------------------------------- + # Timezone Conversion and Localization Methods + + def _local_timestamps(self): + """ + Convert to an i8 (unix-like nanosecond timestamp) representation + while keeping the local timezone and not using UTC. + This is used to calculate time-of-day information as if the timestamps + were timezone-naive. + """ + return tzconversion.tz_convert(self.asi8, utc, self.tz) + + def tz_convert(self, tz): + """ + Convert tz-aware Datetime Array/Index from one time zone to another. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time. Corresponding timestamps would be converted + to this time zone of the Datetime Array/Index. A `tz` of None will + convert to UTC and remove the timezone information. + + Returns + ------- + Array or Index + + Raises + ------ + TypeError + If Datetime Array/Index is tz-naive. + + See Also + -------- + DatetimeIndex.tz : A timezone that has a variable offset from UTC. + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + + Examples + -------- + With the `tz` parameter, we can change the DatetimeIndex + to other time zones: + + >>> dti = pd.date_range(start='2014-08-01 09:00', + ... freq='H', periods=3, tz='Europe/Berlin') + + >>> dti + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq='H') + + >>> dti.tz_convert('US/Central') + DatetimeIndex(['2014-08-01 02:00:00-05:00', + '2014-08-01 03:00:00-05:00', + '2014-08-01 04:00:00-05:00'], + dtype='datetime64[ns, US/Central]', freq='H') + + With the ``tz=None``, we can remove the timezone (after converting + to UTC if necessary): + + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H', + ... periods=3, tz='Europe/Berlin') + + >>> dti + DatetimeIndex(['2014-08-01 09:00:00+02:00', + '2014-08-01 10:00:00+02:00', + '2014-08-01 11:00:00+02:00'], + dtype='datetime64[ns, Europe/Berlin]', freq='H') + + >>> dti.tz_convert(None) + DatetimeIndex(['2014-08-01 07:00:00', + '2014-08-01 08:00:00', + '2014-08-01 09:00:00'], + dtype='datetime64[ns]', freq='H') + """ + tz = timezones.maybe_get_tz(tz) + + if self.tz is None: + # tz naive, use tz_localize + raise TypeError( + "Cannot convert tz-naive timestamps, use tz_localize to localize" + ) + + # No conversion since timestamps are all UTC to begin with + dtype = tz_to_dtype(tz) + return self._simple_new(self.asi8, dtype=dtype, freq=self.freq) + + def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): + """ + Localize tz-naive Datetime Array/Index to tz-aware + Datetime Array/Index. + + This method takes a time zone (tz) naive Datetime Array/Index object + and makes this time zone aware. 
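A compact sketch of how `tz_localize` and `tz_convert` divide the work (`tz_localize` attaches or removes a zone, `tz_convert` re-labels the same instants), including the `nonexistent` handling described below:

```python
import pandas as pd

naive = pd.date_range("2020-03-08 00:30", periods=3, freq="H")

# 02:30 does not exist on this date in US/Eastern (spring-forward), so shift it
eastern = naive.tz_localize("US/Eastern", nonexistent="shift_forward")
print(eastern)

print(eastern.tz_convert("UTC"))  # same instants, relabelled in UTC

try:
    eastern.tz_localize("UTC")    # already tz-aware: use tz_convert instead
except TypeError as err:
    print(err)
```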
It does not move the time to another + time zone. + Time zone localization helps to switch from time zone aware to time + zone unaware objects. + + Parameters + ---------- + tz : str, pytz.timezone, dateutil.tz.tzfile or None + Time zone to convert timestamps to. Passing ``None`` will + remove the time zone information preserving local time. + ambiguous : 'infer', 'NaT', bool array, default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ +default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0 + + Returns + ------- + Same type as self + Array/Index converted to the specified time zone. + + Raises + ------ + TypeError + If the Datetime Array/Index is tz-aware and tz is not None. + + See Also + -------- + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. + + Examples + -------- + >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3) + >>> tz_naive + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq='D') + + Localize DatetimeIndex in US/Eastern time zone: + + >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern') + >>> tz_aware + DatetimeIndex(['2018-03-01 09:00:00-05:00', + '2018-03-02 09:00:00-05:00', + '2018-03-03 09:00:00-05:00'], + dtype='datetime64[ns, US/Eastern]', freq='D') + + With the ``tz=None``, we can remove the time zone information + while keeping the local time (not converted to UTC): + + >>> tz_aware.tz_localize(None) + DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00', + '2018-03-03 09:00:00'], + dtype='datetime64[ns]', freq='D') + + Be careful with DST changes. When there is sequential data, pandas can + infer the DST time: + + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.dt.tz_localize('CET', ambiguous='infer') + 0 2018-10-28 01:30:00+02:00 + 1 2018-10-28 02:00:00+02:00 + 2 2018-10-28 02:30:00+02:00 + 3 2018-10-28 02:00:00+01:00 + 4 2018-10-28 02:30:00+01:00 + 5 2018-10-28 03:00:00+01:00 + 6 2018-10-28 03:30:00+01:00 + dtype: datetime64[ns, CET] + + In some cases, inferring the DST is impossible. 
In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) + >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False])) + 0 2015-03-29 03:00:00+02:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, Europe/Warsaw] + + If the DST transition causes nonexistent times, you can shift these + dates forward or backwards with a timedelta object or `'shift_forward'` + or `'shift_backwards'`. + + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', + ... '2015-03-29 03:30:00'])) + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + 0 2015-03-29 03:00:00+02:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, 'Europe/Warsaw'] + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + 0 2015-03-29 01:59:59.999999999+01:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, 'Europe/Warsaw'] + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + 0 2015-03-29 03:30:00+02:00 + 1 2015-03-29 03:30:00+02:00 + dtype: datetime64[ns, 'Europe/Warsaw'] + """ + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") + if nonexistent not in nonexistent_options and not isinstance( + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" + ) + + if self.tz is not None: + if tz is None: + new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) + else: + raise TypeError("Already tz-aware, use tz_convert to convert.") + else: + tz = timezones.maybe_get_tz(tz) + # Convert to UTC + + new_dates = conversion.tz_localize_to_utc( + self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + new_dates = new_dates.view(_NS_DTYPE) + dtype = tz_to_dtype(tz) + return self._simple_new(new_dates, dtype=dtype, freq=self.freq) + + # ---------------------------------------------------------------- + # Conversion Methods - Vectorized analogues of Timestamp methods + + def to_pydatetime(self): + """ + Return Datetime Array/Index as object ndarray of datetime.datetime + objects. + + Returns + ------- + datetimes : ndarray + """ + return tslib.ints_to_pydatetime(self.asi8, tz=self.tz) + + def normalize(self): + """ + Convert times to midnight. + + The time component of the date-time is converted to midnight i.e. + 00:00:00. This is useful in cases, when the time does not matter. + Length is unaltered. The timezones are unaffected. + + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on Datetime Array/Index. + + Returns + ------- + DatetimeArray, DatetimeIndex or Series + The same type as the original data. Series will have the same + name and index. DatetimeIndex will have the same name. + + See Also + -------- + floor : Floor the datetimes to the specified freq. + ceil : Ceil the datetimes to the specified freq. + round : Round the datetimes to the specified freq. + + Examples + -------- + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H', + ... 
periods=3, tz='Asia/Calcutta') + >>> idx + DatetimeIndex(['2014-08-01 10:00:00+05:30', + '2014-08-01 11:00:00+05:30', + '2014-08-01 12:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq='H') + >>> idx.normalize() + DatetimeIndex(['2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30', + '2014-08-01 00:00:00+05:30'], + dtype='datetime64[ns, Asia/Calcutta]', freq=None) + """ + if self.tz is None or timezones.is_utc(self.tz): + not_null = ~self.isna() + DAY_NS = ccalendar.DAY_SECONDS * 1_000_000_000 + new_values = self.asi8.copy() + adjustment = new_values[not_null] % DAY_NS + new_values[not_null] = new_values[not_null] - adjustment + else: + new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) + return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz) + + def to_period(self, freq=None): + """ + Cast to PeriodArray/Index at a particular frequency. + + Converts DatetimeArray/Index to PeriodArray/Index. + + Parameters + ---------- + freq : str or Offset, optional + One of pandas' :ref:`offset strings ` + or an Offset object. Will be inferred by default. + + Returns + ------- + PeriodArray/Index + + Raises + ------ + ValueError + When converting a DatetimeArray/Index with non-regular values, + so that a frequency cannot be inferred. + + See Also + -------- + PeriodIndex: Immutable ndarray holding ordinal values. + DatetimeIndex.to_pydatetime: Return DatetimeIndex as object. + + Examples + -------- + >>> df = pd.DataFrame({"y": [1, 2, 3]}, + ... index=pd.to_datetime(["2000-03-31 00:00:00", + ... "2000-05-31 00:00:00", + ... "2000-08-31 00:00:00"])) + >>> df.index.to_period("M") + PeriodIndex(['2000-03', '2000-05', '2000-08'], + dtype='period[M]', freq='M') + + Infer the daily frequency + + >>> idx = pd.date_range("2017-01-01", periods=2) + >>> idx.to_period() + PeriodIndex(['2017-01-01', '2017-01-02'], + dtype='period[D]', freq='D') + """ + from pandas.core.arrays import PeriodArray + + if self.tz is not None: + warnings.warn( + "Converting to PeriodArray/Index representation " + "will drop timezone information.", + UserWarning, + ) + + if freq is None: + freq = self.freqstr or self.inferred_freq + + if freq is None: + raise ValueError( + "You must pass a freq argument as current index has none." + ) + + freq = get_period_alias(freq) + + return PeriodArray._from_datetime64(self._data, freq, tz=self.tz) + + def to_perioddelta(self, freq): + """ + Calculate TimedeltaArray of difference between index + values and index converted to PeriodArray at specified + freq. Used for vectorized offsets. + + Parameters + ---------- + freq : Period frequency + + Returns + ------- + TimedeltaArray/Index + """ + # TODO: consider privatizing (discussion in GH#23113) + from pandas.core.arrays.timedeltas import TimedeltaArray + + i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 + m8delta = i8delta.view("m8[ns]") + return TimedeltaArray(m8delta) + + # ----------------------------------------------------------------- + # Properties - Vectorized Timestamp Properties/Methods + + def month_name(self, locale=None): + """ + Return the month names of the DateTimeIndex with specified locale. + + .. versionadded:: 0.23.0 + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the month name. + Default is English locale. + + Returns + ------- + Index + Index of month names. 
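`normalize`, `to_period` and the name accessors compose naturally; a brief sketch (for tz-aware input, `to_period` additionally warns that the timezone is dropped):

```python
import pandas as pd

idx = pd.date_range("2018-02-27 09:30", periods=3, freq="D")

print(idx.normalize())     # times snapped to midnight, dates unchanged
print(idx.to_period("M"))  # PeriodIndex(['2018-02', '2018-02', '2018-03'], ...)
print(idx.month_name())    # Index(['February', 'February', 'March'], dtype='object')
```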
+ + Examples + -------- + >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], + dtype='datetime64[ns]', freq='M') + >>> idx.month_name() + Index(['January', 'February', 'March'], dtype='object') + """ + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + else: + values = self.asi8 + + result = fields.get_date_name_field(values, "month_name", locale=locale) + result = self._maybe_mask_results(result, fill_value=None) + return result + + def day_name(self, locale=None): + """ + Return the day names of the DateTimeIndex with specified locale. + + .. versionadded:: 0.23.0 + + Parameters + ---------- + locale : str, optional + Locale determining the language in which to return the day name. + Default is English locale. + + Returns + ------- + Index + Index of day names. + + Examples + -------- + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], + dtype='datetime64[ns]', freq='D') + >>> idx.day_name() + Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') + """ + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + else: + values = self.asi8 + + result = fields.get_date_name_field(values, "day_name", locale=locale) + result = self._maybe_mask_results(result, fill_value=None) + return result + + @property + def time(self): + """ + Returns numpy array of datetime.time. The time part of the Timestamps. + """ + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if self.tz is not None and not timezones.is_utc(self.tz): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return tslib.ints_to_pydatetime(timestamps, box="time") + + @property + def timetz(self): + """ + Returns numpy array of datetime.time also containing timezone + information. The time part of the Timestamps. + """ + return tslib.ints_to_pydatetime(self.asi8, self.tz, box="time") + + @property + def date(self): + """ + Returns numpy array of python datetime.date objects (namely, the date + part of Timestamps without timezone information). + """ + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if self.tz is not None and not timezones.is_utc(self.tz): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return tslib.ints_to_pydatetime(timestamps, box="date") + + year = _field_accessor( + "year", + "Y", + """ + The year of the datetime. + """, + ) + month = _field_accessor( + "month", + "M", + """ + The month as January=1, December=12. + """, + ) + day = _field_accessor( + "day", + "D", + """ + The month as January=1, December=12. + """, + ) + hour = _field_accessor( + "hour", + "h", + """ + The hours of the datetime. + """, + ) + minute = _field_accessor( + "minute", + "m", + """ + The minutes of the datetime. + """, + ) + second = _field_accessor( + "second", + "s", + """ + The seconds of the datetime. + """, + ) + microsecond = _field_accessor( + "microsecond", + "us", + """ + The microseconds of the datetime. + """, + ) + nanosecond = _field_accessor( + "nanosecond", + "ns", + """ + The nanoseconds of the datetime. + """, + ) + weekofyear = _field_accessor( + "weekofyear", + "woy", + """ + The week ordinal of the year. 
+ """, + ) + week = weekofyear + _dayofweek_doc = """ + The day of the week with Monday=0, Sunday=6. + + Return the day of the week. It is assumed the week starts on + Monday, which is denoted by 0 and ends on Sunday which is denoted + by 6. This method is available on both Series with datetime + values (using the `dt` accessor) or DatetimeIndex. + + Returns + ------- + Series or Index + Containing integers indicating the day number. + + See Also + -------- + Series.dt.dayofweek : Alias. + Series.dt.weekday : Alias. + Series.dt.day_name : Returns the name of the day of the week. + + Examples + -------- + >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() + >>> s.dt.dayofweek + 2016-12-31 5 + 2017-01-01 6 + 2017-01-02 0 + 2017-01-03 1 + 2017-01-04 2 + 2017-01-05 3 + 2017-01-06 4 + 2017-01-07 5 + 2017-01-08 6 + Freq: D, dtype: int64 + """ + dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) + weekday = dayofweek + + dayofyear = _field_accessor( + "dayofyear", + "doy", + """ + The ordinal day of the year. + """, + ) + quarter = _field_accessor( + "quarter", + "q", + """ + The quarter of the date. + """, + ) + days_in_month = _field_accessor( + "days_in_month", + "dim", + """ + The number of days in the month. + """, + ) + daysinmonth = days_in_month + _is_month_doc = """ + Indicates whether the date is the {first_or_last} day of the month. + + Returns + ------- + Series or array + For Series, returns a Series with boolean values. + For DatetimeIndex, returns a boolean array. + + See Also + -------- + is_month_start : Return a boolean indicating whether the date + is the first day of the month. + is_month_end : Return a boolean indicating whether the date + is the last day of the month. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> s = pd.Series(pd.date_range("2018-02-27", periods=3)) + >>> s + 0 2018-02-27 + 1 2018-02-28 + 2 2018-03-01 + dtype: datetime64[ns] + >>> s.dt.is_month_start + 0 False + 1 False + 2 True + dtype: bool + >>> s.dt.is_month_end + 0 False + 1 True + 2 False + dtype: bool + + >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_start + array([False, False, True]) + >>> idx.is_month_end + array([False, True, False]) + """ + is_month_start = _field_accessor( + "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first") + ) + + is_month_end = _field_accessor( + "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last") + ) + + is_quarter_start = _field_accessor( + "is_quarter_start", + "is_quarter_start", + """ + Indicator for whether the date is the first day of a quarter. + + Returns + ------- + is_quarter_start : Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + quarter : Return the quarter of the date. + is_quarter_end : Similar property for indicating the quarter start. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30", + ... periods=4)}) + >>> df.assign(quarter=df.dates.dt.quarter, + ... 
is_quarter_start=df.dates.dt.is_quarter_start) + dates quarter is_quarter_start + 0 2017-03-30 1 False + 1 2017-03-31 1 False + 2 2017-04-01 2 True + 3 2017-04-02 2 False + + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_quarter_start + array([False, False, True, False]) + """, + ) + is_quarter_end = _field_accessor( + "is_quarter_end", + "is_quarter_end", + """ + Indicator for whether the date is the last day of a quarter. + + Returns + ------- + is_quarter_end : Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + quarter : Return the quarter of the date. + is_quarter_start : Similar property indicating the quarter start. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30", + ... periods=4)}) + >>> df.assign(quarter=df.dates.dt.quarter, + ... is_quarter_end=df.dates.dt.is_quarter_end) + dates quarter is_quarter_end + 0 2017-03-30 1 False + 1 2017-03-31 1 True + 2 2017-04-01 2 False + 3 2017-04-02 2 False + + >>> idx = pd.date_range('2017-03-30', periods=4) + >>> idx + DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_quarter_end + array([False, True, False, False]) + """, + ) + is_year_start = _field_accessor( + "is_year_start", + "is_year_start", + """ + Indicate whether the date is the first day of a year. + + Returns + ------- + Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + is_year_end : Similar property indicating the last day of the year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_start + 0 False + 1 False + 2 True + dtype: bool + + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_year_start + array([False, False, True]) + """, + ) + is_year_end = _field_accessor( + "is_year_end", + "is_year_end", + """ + Indicate whether the date is the last day of the year. + + Returns + ------- + Series or DatetimeIndex + The same type as the original data with boolean values. Series will + have the same name and index. DatetimeIndex will have the same + name. + + See Also + -------- + is_year_start : Similar property indicating the start of the year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. 
+ + >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + + >>> dates.dt.is_year_end + 0 False + 1 True + 2 False + dtype: bool + + >>> idx = pd.date_range("2017-12-30", periods=3) + >>> idx + DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + >>> idx.is_year_end + array([False, True, False]) + """, + ) + is_leap_year = _field_accessor( + "is_leap_year", + "is_leap_year", + """ + Boolean indicator if the date belongs to a leap year. + + A leap year is a year, which has 366 days (instead of 365) including + 29th of February as an intercalary day. + Leap years are years which are multiples of four with the exception + of years divisible by 100 but not by 400. + + Returns + ------- + Series or ndarray + Booleans indicating if dates belong to a leap year. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y") + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], + dtype='datetime64[ns]', freq='A-DEC') + >>> idx.is_leap_year + array([ True, False, False], dtype=bool) + + >>> dates_series = pd.Series(idx) + >>> dates_series + 0 2012-12-31 + 1 2013-12-31 + 2 2014-12-31 + dtype: datetime64[ns] + >>> dates_series.dt.is_leap_year + 0 True + 1 False + 2 False + dtype: bool + """, + ) + + def to_julian_date(self): + """ + Convert Datetime Array to float64 ndarray of Julian Dates. + 0 Julian date is noon January 1, 4713 BC. + http://en.wikipedia.org/wiki/Julian_day + """ + + # http://mysite.verizon.net/aesir_research/date/jdalg2.htm + year = np.asarray(self.year) + month = np.asarray(self.month) + day = np.asarray(self.day) + testarr = month < 3 + year[testarr] -= 1 + month[testarr] += 12 + return ( + day + + np.fix((153 * month - 457) / 5) + + 365 * year + + np.floor(year / 4) + - np.floor(year / 100) + + np.floor(year / 400) + + 1_721_118.5 + + ( + self.hour + + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e6 + + self.nanosecond / 3600.0 / 1e9 + ) + / 24.0 + ) + + +# ------------------------------------------------------------------- +# Constructor Helpers + + +def sequence_to_dt64ns( + data, + dtype=None, + copy=False, + tz=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", +): + """ + Parameters + ---------- + data : list-like + dtype : dtype, str, or None, default None + copy : bool, default False + tz : tzinfo, str, or None, default None + dayfirst : bool, default False + yearfirst : bool, default False + ambiguous : str, bool, or arraylike, default 'raise' + See pandas._libs.tslibs.conversion.tz_localize_to_utc. + + Returns + ------- + result : numpy.ndarray + The sequence converted to a numpy array with dtype ``datetime64[ns]``. + tz : tzinfo or None + Either the user-provided tzinfo or one inferred from the data. + inferred_freq : Tick or None + The inferred frequency of the sequence. + + Raises + ------ + TypeError : PeriodDType data is passed + """ + + inferred_freq = None + + dtype = _validate_dt64_dtype(dtype) + + if not hasattr(data, "dtype"): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. generator + data = list(data) + data = np.asarray(data) + copy = False + elif isinstance(data, ABCSeries): + data = data._values + if isinstance(data, ABCPandasArray): + data = data.to_numpy() + + if hasattr(data, "freq"): + # i.e. 
DatetimeArray/Index + inferred_freq = data.freq + + # if dtype has an embedded tz, capture it + tz = validate_tz_from_dtype(dtype, tz) + + if isinstance(data, ABCIndexClass): + if data.nlevels > 1: + # Without this check, data._data below is None + raise TypeError("Cannot create a DatetimeArray from a MultiIndex.") + data = data._data + + # By this point we are assured to have either a numpy array or Index + data, copy = maybe_convert_dtype(data, copy) + + if is_object_dtype(data) or is_string_dtype(data): + # TODO: We do not have tests specific to string-dtypes, + # also complex or categorical or other extension + copy = False + if lib.infer_dtype(data, skipna=False) == "integer": + data = data.astype(np.int64) + else: + # data comes back here as either i8 to denote UTC timestamps + # or M8[ns] to denote wall times + data, inferred_tz = objects_to_datetime64ns( + data, dayfirst=dayfirst, yearfirst=yearfirst + ) + tz = maybe_infer_tz(tz, inferred_tz) + + # `data` may have originally been a Categorical[datetime64[ns, tz]], + # so we need to handle these types. + if is_datetime64tz_dtype(data): + # DatetimeArray -> ndarray + tz = maybe_infer_tz(tz, data.tz) + result = data._data + + elif is_datetime64_dtype(data): + # tz-naive DatetimeArray or ndarray[datetime64] + data = getattr(data, "_data", data) + if data.dtype != _NS_DTYPE: + data = conversion.ensure_datetime64ns(data) + + if tz is not None: + # Convert tz-naive to UTC + tz = timezones.maybe_get_tz(tz) + data = conversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous + ) + data = data.view(_NS_DTYPE) + + assert data.dtype == _NS_DTYPE, data.dtype + result = data + + else: + # must be integer dtype otherwise + # assume this data are epoch timestamps + if tz: + tz = timezones.maybe_get_tz(tz) + + if data.dtype != _INT64_DTYPE: + data = data.astype(np.int64, copy=False) + result = data.view(_NS_DTYPE) + + if copy: + # TODO: should this be deepcopy? + result = result.copy() + + assert isinstance(result, np.ndarray), type(result) + assert result.dtype == "M8[ns]", result.dtype + + # We have to call this again after possibly inferring a tz above + validate_tz_from_dtype(dtype, tz) + + return result, tz, inferred_freq + + +def objects_to_datetime64ns( + data, + dayfirst, + yearfirst, + utc=False, + errors="raise", + require_iso8601=False, + allow_object=False, +): + """ + Convert data to array of timestamps. + + Parameters + ---------- + data : np.ndarray[object] + dayfirst : bool + yearfirst : bool + utc : bool, default False + Whether to convert timezone-aware timestamps to UTC. + errors : {'raise', 'ignore', 'coerce'} + allow_object : bool + Whether to return an object-dtype ndarray instead of raising if the + data contains more than one timezone. 
+ + Returns + ------- + result : ndarray + np.int64 dtype if returned values represent UTC timestamps + np.datetime64[ns] if returned values represent wall times + object if mixed timezones + inferred_tz : tzinfo or None + + Raises + ------ + ValueError : if data cannot be converted to datetimes + """ + assert errors in ["raise", "ignore", "coerce"] + + # if str-dtype, convert + data = np.array(data, copy=False, dtype=np.object_) + + try: + result, tz_parsed = tslib.array_to_datetime( + data, + errors=errors, + utc=utc, + dayfirst=dayfirst, + yearfirst=yearfirst, + require_iso8601=require_iso8601, + ) + except ValueError as e: + try: + values, tz_parsed = conversion.datetime_to_datetime64(data) + # If tzaware, these values represent unix timestamps, so we + # return them as i8 to distinguish from wall times + return values.view("i8"), tz_parsed + except (ValueError, TypeError): + raise e + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + # Return i8 values to denote unix timestamps + return result.view("i8"), tz_parsed + elif is_datetime64_dtype(result): + # returning M8[ns] denotes wall-times; since tz is None + # the distinction is a thin one + return result, tz_parsed + elif is_object_dtype(result): + # GH#23675 when called via `pd.to_datetime`, returning an object-dtype + # array is allowed. When called via `pd.DatetimeIndex`, we can + # only accept datetime64 dtype, so raise TypeError if object-dtype + # is returned, as that indicates the values can be recognized as + # datetimes but they have conflicting timezones/awareness + if allow_object: + return result, tz_parsed + raise TypeError(result) + else: # pragma: no cover + # GH#23675 this TypeError should never be hit, whereas the TypeError + # in the object-dtype branch above is reachable. + raise TypeError(result) + + +def maybe_convert_dtype(data, copy): + """ + Convert data based on dtype conventions, issuing deprecation warnings + or errors where appropriate. + + Parameters + ---------- + data : np.ndarray or pd.Index + copy : bool + + Returns + ------- + data : np.ndarray or pd.Index + copy : bool + + Raises + ------ + TypeError : PeriodDType data is passed + """ + if is_float_dtype(data): + # Note: we must cast to datetime64[ns] here in order to treat these + # as wall-times instead of UTC timestamps. + data = data.astype(_NS_DTYPE) + copy = False + # TODO: deprecate this behavior to instead treat symmetrically + # with integer dtypes. See discussion in GH#23675 + + elif is_timedelta64_dtype(data): + # GH#29794 enforcing deprecation introduced in GH#23539 + raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") + elif is_period_dtype(data): + # Note: without explicitly raising here, PeriodIndex + # test_setops.test_join_does_not_recur fails + raise TypeError( + "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" + ) + + elif is_categorical_dtype(data): + # GH#18664 preserve tz in going DTI->Categorical->DTI + # TODO: cases where we need to do another pass through this func, + # e.g. 
the categories are timedelta64s + data = data.categories.take(data.codes, fill_value=NaT)._values + copy = False + + elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data): + # Includes categorical + # TODO: We have no tests for these + data = np.array(data, dtype=np.object_) + copy = False + + return data, copy + + +# ------------------------------------------------------------------- +# Validation and Inference + + +def maybe_infer_tz(tz, inferred_tz): + """ + If a timezone is inferred from data, check that it is compatible with + the user-provided timezone, if any. + + Parameters + ---------- + tz : tzinfo or None + inferred_tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if both timezones are present but do not match + """ + if tz is None: + tz = inferred_tz + elif inferred_tz is None: + pass + elif not timezones.tz_compare(tz, inferred_tz): + raise TypeError( + f"data is already tz-aware {inferred_tz}, unable to " + f"set specified tz: {tz}" + ) + return tz + + +def _validate_dt64_dtype(dtype): + """ + Check that a dtype, if passed, represents either a numpy datetime64[ns] + dtype or a pandas DatetimeTZDtype. + + Parameters + ---------- + dtype : object + + Returns + ------- + dtype : None, numpy.dtype, or DatetimeTZDtype + + Raises + ------ + ValueError : invalid dtype + + Notes + ----- + Unlike validate_tz_from_dtype, this does _not_ allow non-existent + tz errors to go through + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + if is_dtype_equal(dtype, np.dtype("M8")): + # no precision, disallowed GH#24806 + msg = ( + "Passing in 'datetime64' dtype with no precision is not allowed. " + "Please pass in 'datetime64[ns]' instead." + ) + raise ValueError(msg) + + if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance( + dtype, (np.dtype, DatetimeTZDtype) + ): + raise ValueError( + f"Unexpected value for 'dtype': '{dtype}'. " + "Must be 'datetime64[ns]' or DatetimeTZDtype'." + ) + return dtype + + +def validate_tz_from_dtype(dtype, tz): + """ + If the given dtype is a DatetimeTZDtype, extract the implied + tzinfo object from it and check that it does not conflict with the given + tz. + + Parameters + ---------- + dtype : dtype, str + tz : None, tzinfo + + Returns + ------- + tz : consensus tzinfo + + Raises + ------ + ValueError : on tzinfo mismatch + """ + if dtype is not None: + if isinstance(dtype, str): + try: + dtype = DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + # Things like `datetime64[ns]`, which is OK for the + # constructors, but also nonsense, which should be validated + # but not by us. We *do* allow non-existent tz errors to + # go through + pass + dtz = getattr(dtype, "tz", None) + if dtz is not None: + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError("cannot supply both a tz and a dtype with a tz") + tz = dtz + + if tz is not None and is_datetime64_dtype(dtype): + # We also need to check for the case where the user passed a + # tz-naive dtype (i.e. datetime64[ns]) + if tz is not None and not timezones.tz_compare(tz, dtz): + raise ValueError( + "cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns])" + ) + + return tz + + +def _infer_tz_from_endpoints(start, end, tz): + """ + If a timezone is not explicitly given via `tz`, see if one can + be inferred from the `start` and `end` endpoints. If more than one + of these inputs provides a timezone, require that they all agree. 
+ + Parameters + ---------- + start : Timestamp + end : Timestamp + tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if start and end timezones do not agree + """ + try: + inferred_tz = timezones.infer_tzinfo(start, end) + except AssertionError: + # infer_tzinfo raises AssertionError if passed mismatched timezones + raise TypeError( + "Start and end cannot both be tz-aware with different timezones" + ) + + inferred_tz = timezones.maybe_get_tz(inferred_tz) + tz = timezones.maybe_get_tz(tz) + + if tz is not None and inferred_tz is not None: + if not timezones.tz_compare(inferred_tz, tz): + raise AssertionError("Inferred time zone not equal to passed time zone") + + elif inferred_tz is not None: + tz = inferred_tz + + return tz + + +def _maybe_normalize_endpoints(start, end, normalize): + _normalized = True + + if start is not None: + if normalize: + start = normalize_date(start) + _normalized = True + else: + _normalized = _normalized and start.time() == _midnight + + if end is not None: + if normalize: + end = normalize_date(end) + _normalized = True + else: + _normalized = _normalized and end.time() == _midnight + + return start, end, _normalized + + +def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): + """ + Localize a start or end Timestamp to the timezone of the corresponding + start or end Timestamp + + Parameters + ---------- + ts : start or end Timestamp to potentially localize + is_none : argument that should be None + is_not_none : argument that should not be None + freq : Tick, DateOffset, or None + tz : str, timezone object or None + ambiguous: str, localization behavior for ambiguous times + nonexistent: str, localization behavior for nonexistent times + + Returns + ------- + ts : Timestamp + """ + # Make sure start and end are timezone localized if: + # 1) freq = a Timedelta-like frequency (Tick) + # 2) freq = None i.e. 
generating a linspaced range + if is_none is None and is_not_none is not None: + # Note: We can't ambiguous='infer' a singular ambiguous time; however, + # we have historically defaulted ambiguous=False + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} + if isinstance(freq, Tick) or freq is None: + localize_args["tz"] = tz + ts = ts.tz_localize(**localize_args) + return ts diff --git a/venv/Lib/site-packages/pandas/core/arrays/integer.py b/venv/Lib/site-packages/pandas/core/arrays/integer.py new file mode 100644 index 0000000..022e6a7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/integer.py @@ -0,0 +1,805 @@ +import numbers +from typing import Any, Tuple, Type +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas.compat import set_function_name +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.core import nanops, ops +from pandas.core.ops import invalid_comparison +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.tools.numeric import to_numeric + +from .masked import BaseMaskedArray + + +class _IntegerDtype(ExtensionDtype): + """ + An ExtensionDtype to hold a single size & kind of integer dtype. + + These specific implementations are subclasses of the non-public + _IntegerDtype. For example we have Int8Dtype to represent signed int 8s. + + The attributes name & type are set when these subclasses are created. + """ + + name: str + base = None + type: Type + na_value = libmissing.NA + + def __repr__(self) -> str: + sign = "U" if self.is_unsigned_integer else "" + return f"{sign}Int{8 * self.itemsize}Dtype()" + + @cache_readonly + def is_signed_integer(self): + return self.kind == "i" + + @cache_readonly + def is_unsigned_integer(self): + return self.kind == "u" + + @property + def _is_numeric(self): + return True + + @cache_readonly + def numpy_dtype(self): + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self): + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self): + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return IntegerArray + + def __from_arrow__(self, array): + """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + int_arr = IntegerArray(data.copy(), ~mask, copy=False) + results.append(int_arr) + + return IntegerArray._concat_same_type(results) + + +def integer_array(values, dtype=None, copy=False): + """ + Infer and return an integer array of the values. 
+ + Parameters + ---------- + values : 1D list-like + dtype : dtype, optional + dtype to coerce + copy : bool, default False + + Returns + ------- + IntegerArray + + Raises + ------ + TypeError if incompatible types + """ + values, mask = coerce_to_array(values, dtype=dtype, copy=copy) + return IntegerArray(values, mask) + + +def safe_cast(values, dtype, copy): + """ + Safely cast the values to the dtype if they + are equivalent, meaning floats must be equivalent to the + ints. + + """ + + try: + return values.astype(dtype, casting="safe", copy=copy) + except TypeError: + + casted = values.astype(dtype, copy=copy) + if (casted == values).all(): + return casted + + raise TypeError( + f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}" + ) + + +def coerce_to_array(values, dtype, mask=None, copy=False): + """ + Coerce the input values array to numpy arrays with a mask + + Parameters + ---------- + values : 1D list-like + dtype : integer dtype + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + # if values is integer numpy array, preserve it's dtype + if dtype is None and hasattr(values, "dtype"): + if is_integer_dtype(values.dtype): + dtype = values.dtype + + if dtype is not None: + if isinstance(dtype, str) and ( + dtype.startswith("Int") or dtype.startswith("UInt") + ): + # Avoid DeprecationWarning from NumPy about np.dtype("Int64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), _IntegerDtype): + try: + dtype = _dtypes[str(np.dtype(dtype))] + except KeyError: + raise ValueError(f"invalid dtype specified {dtype}") + + if isinstance(values, IntegerArray): + values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + values = np.array(values, copy=copy) + if is_object_dtype(values): + inferred_type = lib.infer_dtype(values, skipna=True) + if inferred_type == "empty": + values = np.empty(len(values)) + values.fill(np.nan) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "integer-na", + "mixed-integer-float", + ]: + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + + elif is_bool_dtype(values) and is_integer_dtype(dtype): + values = np.array(values, dtype=int, copy=copy) + + elif not (is_integer_dtype(values) or is_float_dtype(values)): + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + + if mask is None: + mask = isna(values) + else: + assert len(mask) == len(values) + + if not values.ndim == 1: + raise TypeError("values must be a 1D list-like") + if not mask.ndim == 1: + raise TypeError("mask must be a 1D list-like") + + # infer dtype if needed + if dtype is None: + dtype = np.dtype("int64") + else: + dtype = dtype.type + + # if we are float, let's make sure that we can + # safely cast + + # we copy as need to coerce here + if mask.any(): + values = values.copy() + values[mask] = 1 + values = safe_cast(values, dtype, copy=False) + else: + values = safe_cast(values, dtype, copy=False) + + return values, mask + + +class IntegerArray(BaseMaskedArray): + """ + Array of integer (optional missing) values. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + + .. 
warning:: + + IntegerArray is currently experimental, and its API or internal + implementation may change without warning. + + We represent an IntegerArray with 2 numpy arrays: + + - data: contains a numpy integer array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct an IntegerArray from generic array-like input, use + :func:`pandas.array` with one of the integer dtypes (see examples). + + See :ref:`integer_na` for more. + + Parameters + ---------- + values : numpy.ndarray + A 1-d integer-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + IntegerArray + + Examples + -------- + Create an IntegerArray with :func:`pandas.array`. + + >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) + >>> int_array + + [1, , 3] + Length: 3, dtype: Int32 + + String aliases for the dtypes are also available. They are capitalized. + + >>> pd.array([1, None, 3], dtype='Int32') + + [1, , 3] + Length: 3, dtype: Int32 + + >>> pd.array([1, None, 3], dtype='UInt16') + + [1, , 3] + Length: 3, dtype: UInt16 + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 + + @cache_readonly + def dtype(self): + return _dtypes[str(self._data.dtype)] + + def __init__(self, values, mask, copy=False): + if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + raise TypeError( + "values should be integer numpy array. Use " + "the 'integer_array' function instead" + ) + if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'integer_array' function instead" + ) + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return integer_array(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype, copy) + + @classmethod + def _from_factorized(cls, values, original): + return integer_array(values, dtype=original.dtype) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For IntegerArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. + raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, IntegerArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. 
+ + if is_integer_dtype(x.dtype): + m = mask.copy() + return IntegerArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def __setitem__(self, key, value): + _is_scalar = is_scalar(value) + if _is_scalar: + value = [value] + value, mask = coerce_to_array(value, dtype=self.dtype) + + if _is_scalar: + value = value[0] + mask = mask[0] + + self._data[key] = value + self._mask[key] = mask + + def astype(self, dtype, copy=True): + """ + Cast to a NumPy array or IntegerArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray or IntegerArray + NumPy ndarray or IntergerArray with 'dtype' for its dtype. + + Raises + ------ + TypeError + if incompatible type with an IntegerDtype, equivalent of same_kind + casting + """ + from pandas.core.arrays.boolean import BooleanArray, BooleanDtype + + dtype = pandas_dtype(dtype) + + # if we are astyping to an existing IntegerDtype we can fastpath + if isinstance(dtype, _IntegerDtype): + result = self._data.astype(dtype.numpy_dtype, copy=False) + return type(self)(result, mask=self._mask, copy=False) + elif isinstance(dtype, BooleanDtype): + result = self._data.astype("bool", copy=False) + return BooleanArray(result, mask=self._mask, copy=False) + + # coerce + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) + + @property + def _ndarray_values(self) -> np.ndarray: + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return self._data + + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self.to_numpy(na_value=np.nan), np.nan + + def _values_for_argsort(self) -> np.ndarray: + """Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = data.min() - 1 + return data + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def cmp_method(self, other): + from pandas.arrays import BooleanArray + + mask = None + + if isinstance(other, (BooleanArray, IntegerArray)): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. 
+ result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + if np.isnan(result): + return libmissing.NA + + # if we have a boolean op, don't coerce + if name in ["any", "all"]: + pass + + # if we have a preservable numeric op, + # provide coercion back to an integer type if possible + elif name in ["sum", "min", "max", "prod"]: + int_result = int(result) + if int_result == result: + result = int_result + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] = np.nan + return result + + return type(self)(result, mask, copy=False) + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def integer_arithmetic_method(self, other): + + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, IntegerArray): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. 
+ if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. + mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op.__name__}__" + return set_function_name(integer_arithmetic_method, name, cls) + + +IntegerArray._add_arithmetic_ops() +IntegerArray._add_comparison_ops() + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} integer data. + +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype +Int8Dtype = register_extension_dtype( + type( + "Int8Dtype", + (_IntegerDtype,), + { + "type": np.int8, + "name": "Int8", + "__doc__": _dtype_docstring.format(dtype="int8"), + }, + ) +) + +Int16Dtype = register_extension_dtype( + type( + "Int16Dtype", + (_IntegerDtype,), + { + "type": np.int16, + "name": "Int16", + "__doc__": _dtype_docstring.format(dtype="int16"), + }, + ) +) + +Int32Dtype = register_extension_dtype( + type( + "Int32Dtype", + (_IntegerDtype,), + { + "type": np.int32, + "name": "Int32", + "__doc__": _dtype_docstring.format(dtype="int32"), + }, + ) +) + +Int64Dtype = register_extension_dtype( + type( + "Int64Dtype", + (_IntegerDtype,), + { + "type": np.int64, + "name": "Int64", + "__doc__": _dtype_docstring.format(dtype="int64"), + }, + ) +) + +UInt8Dtype = register_extension_dtype( + type( + "UInt8Dtype", + (_IntegerDtype,), + { + "type": np.uint8, + "name": "UInt8", + "__doc__": _dtype_docstring.format(dtype="uint8"), + }, + ) +) + +UInt16Dtype = register_extension_dtype( + type( + "UInt16Dtype", + (_IntegerDtype,), + { + "type": np.uint16, + "name": "UInt16", + "__doc__": _dtype_docstring.format(dtype="uint16"), + }, + ) +) + +UInt32Dtype = register_extension_dtype( + type( + "UInt32Dtype", + (_IntegerDtype,), + { + "type": np.uint32, + "name": "UInt32", + "__doc__": _dtype_docstring.format(dtype="uint32"), + }, + ) +) + +UInt64Dtype = register_extension_dtype( + type( + "UInt64Dtype", + (_IntegerDtype,), + { + "type": np.uint64, + "name": "UInt64", + "__doc__": _dtype_docstring.format(dtype="uint64"), + }, + ) +) + +_dtypes = { + "int8": Int8Dtype(), + "int16": Int16Dtype(), + "int32": Int32Dtype(), + "int64": Int64Dtype(), + "uint8": UInt8Dtype(), + "uint16": UInt16Dtype(), + "uint32": UInt32Dtype(), + "uint64": UInt64Dtype(), +} diff --git a/venv/Lib/site-packages/pandas/core/arrays/interval.py b/venv/Lib/site-packages/pandas/core/arrays/interval.py new file mode 100644 index 0000000..d890c0c --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/interval.py @@ -0,0 +1,1332 @@ +from operator import le, lt +import textwrap + +import numpy as np + +from pandas._config import get_option + +from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender + +from pandas.core.dtypes.cast import maybe_convert_platform +from pandas.core.dtypes.common import ( + is_categorical_dtype, + 
is_datetime64_any_dtype, + is_float_dtype, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, + ABCIndexClass, + ABCInterval, + ABCIntervalIndex, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.algorithms import take, value_counts +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs +from pandas.core.arrays.categorical import Categorical +import pandas.core.common as com +from pandas.core.construction import array +from pandas.core.indexers import check_array_indexer +from pandas.core.indexes.base import ensure_index + +_VALID_CLOSED = {"left", "right", "both", "neither"} +_interval_shared_docs = {} + +_shared_docs_kwargs = dict( + klass="IntervalArray", qualname="arrays.IntervalArray", name="" +) + + +_interval_shared_docs[ + "class" +] = """ +%(summary)s + +.. versionadded:: %(versionadded)s + +Parameters +---------- +data : array-like (1-dimensional) + Array-like containing Interval objects from which to build the + %(klass)s. +closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both or + neither. +dtype : dtype or None, default None + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 +copy : bool, default False + Copy the input data. +%(name)s\ +verify_integrity : bool, default True + Verify that the %(klass)s is valid. + +Attributes +---------- +left +right +closed +mid +length +is_empty +is_non_overlapping_monotonic +%(extra_attributes)s\ + +Methods +------- +from_arrays +from_tuples +from_breaks +contains +overlaps +set_closed +to_tuples +%(extra_methods)s\ + +See Also +-------- +Index : The base pandas Index type. +Interval : A bounded slice-like interval; the elements of an %(klass)s. +interval_range : Function to create a fixed frequency IntervalIndex. +cut : Bin values into discrete Intervals. +qcut : Bin values into equal-sized Intervals based on rank or sample quantiles. + +Notes +----- +See the `user guide +`_ +for more. + +%(examples)s\ +""" + + +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side.", + versionadded="0.24.0", + name="", + extra_attributes="", + extra_methods="", + examples=textwrap.dedent( + """\ + Examples + -------- + A new ``IntervalArray`` can be constructed directly from an array-like of + ``Interval`` objects: + + >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + + [(0, 1], (1, 5]] + Length: 2, closed: right, dtype: interval[int64] + + It may also be constructed using one of the constructor + methods: :meth:`IntervalArray.from_arrays`, + :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. + """ + ), + ) +) +class IntervalArray(IntervalMixin, ExtensionArray): + ndim = 1 + can_hold_na = True + _na_value = _fill_value = np.nan + + def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): + + if isinstance(data, ABCSeries) and is_interval_dtype(data): + data = data.values + + if isinstance(data, (cls, ABCIntervalIndex)): + left = data.left + right = data.right + closed = closed or data.closed + else: + + # don't allow scalars + if is_scalar(data): + msg = ( + f"{cls.__name__}(...) 
must be called with a collection " + f"of some kind, {data} was passed" + ) + raise TypeError(msg) + + # might need to convert empty or purely na data + data = maybe_convert_platform_interval(data) + left, right, infer_closed = intervals_to_interval_bounds( + data, validate_closed=closed is None + ) + closed = closed or infer_closed + + return cls._simple_new( + left, + right, + closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) + + @classmethod + def _simple_new( + cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True + ): + result = IntervalMixin.__new__(cls) + + closed = closed or "right" + left = ensure_index(left, copy=copy) + right = ensure_index(right, copy=copy) + + if dtype is not None: + # GH 19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) + elif dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" + ) + raise ValueError(msg) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = "Period dtypes are not supported, use a PeriodIndex instead" + raise ValueError(msg) + elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ( + "left and right must have the same time zone, got " + f"'{left.tz}' and '{right.tz}'" + ) + raise ValueError(msg) + + result._left = left + result._right = right + result._closed = closed + if verify_integrity: + result._validate() + return result + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_factorized(cls, values, original): + if len(values) == 0: + # An empty array returns object-dtype here. We can't create + # a new IA from an (empty) object-dtype array, so turn it into the + # correct dtype. + values = values.astype(original.dtype.subtype) + return cls(values, closed=original.closed) + + _interval_shared_docs["from_breaks"] = textwrap.dedent( + """ + Construct an %(klass)s from an array of splits. + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)s + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. 
+ + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_breaks"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): + breaks = maybe_convert_platform_interval(breaks) + + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) + + _interval_shared_docs["from_arrays"] = textwrap.dedent( + """ + Construct from two arrays defining the left and right bounds. + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype, optional + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)s + + Raises + ------ + ValueError + When a value is missing in only one of `left` or `right`. + When a value in `left` is greater than the corresponding value + in `right`. + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples. + + Notes + ----- + Each element of `left` must be less than or equal to the `right` + element at the same position. If an element is missing, it must be + missing in both `left` and `right`. A TypeError is raised when + using an unsupported type for `left` or `right`. At the moment, + 'category', 'object', and 'string' subtypes are not supported. + + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_arrays"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): + left = maybe_convert_platform_interval(left) + right = maybe_convert_platform_interval(right) + + return cls._simple_new( + left, right, closed, copy=copy, dtype=dtype, verify_integrity=True + ) + + _interval_shared_docs["from_tuples"] = textwrap.dedent( + """ + Construct an %(klass)s from an array-like of tuples. + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)s + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. 
+ + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_tuples"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + + [(0, 1], (1, 2]] + Length: 2, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def from_tuples(cls, data, closed="right", copy=False, dtype=None): + if len(data): + left, right = [], [] + else: + # ensure that empty data keeps input dtype + left = right = data + + for d in data: + if isna(d): + lhs = rhs = np.nan + else: + name = cls.__name__ + try: + # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] + lhs, rhs = d + except ValueError: + msg = f"{name}.from_tuples requires tuples of length 2, got {d}" + raise ValueError(msg) + except TypeError: + msg = f"{name}.from_tuples received an invalid item, {d}" + raise TypeError(msg) + left.append(lhs) + right.append(rhs) + + return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) + + def _validate(self): + """Verify that the IntervalArray is valid. + + Checks that + + * closed is valid + * left and right match lengths + * left and right have the same missing values + * left is always below right + """ + if self.closed not in _VALID_CLOSED: + msg = f"invalid option for 'closed': {self.closed}" + raise ValueError(msg) + if len(self.left) != len(self.right): + msg = "left and right must have the same length" + raise ValueError(msg) + left_mask = notna(self.left) + right_mask = notna(self.right) + if not (left_mask == right_mask).all(): + msg = ( + "missing values must be missing in the same " + "location both left and right sides" + ) + raise ValueError(msg) + if not (self.left[left_mask] <= self.right[left_mask]).all(): + msg = "left side of interval must be <= right side" + raise ValueError(msg) + + # --------- + # Interface + # --------- + def __iter__(self): + return iter(np.asarray(self)) + + def __len__(self) -> int: + return len(self.left) + + def __getitem__(self, value): + value = check_array_indexer(self, value) + left = self.left[value] + right = self.right[value] + + # scalar + if not isinstance(left, ABCIndexClass): + if is_scalar(left) and isna(left): + return self._fill_value + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") + return Interval(left, right, self.closed) + + return self._shallow_copy(left, right) + + def __setitem__(self, key, value): + # na value: need special casing to set directly on numpy arrays + needs_float_conversion = False + if is_scalar(value) and isna(value): + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + needs_float_conversion = True + elif is_datetime64_any_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.datetime64("NaT") + elif is_timedelta64_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.timedelta64("NaT") + value_left, value_right = value, value + + # scalar interval + elif is_interval_dtype(value) or isinstance(value, ABCInterval): + self._check_closed_matches(value, name="value") + value_left, value_right = value.left, value.right + + else: + # list-like of intervals + try: + array = IntervalArray(value) + value_left, value_right = array.left, array.right + except TypeError: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." 
+ raise TypeError(msg) + + # Need to ensure that left and right are updated atomically, so we're + # forced to copy, update the copy, and swap in the new values. + left = self.left.copy(deep=True) + if needs_float_conversion: + left = left.astype("float") + left.values[key] = value_left + self._left = left + + right = self.right.copy(deep=True) + if needs_float_conversion: + right = right.astype("float") + right.values[key] = value_right + self._right = right + + def __eq__(self, other): + # ensure pandas array for list-like and eliminate non-interval scalars + if is_list_like(other): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other = array(other) + elif not isinstance(other, Interval): + # non-interval scalar -> no matches + return np.zeros(len(self), dtype=bool) + + # determine the dtype of the elements we want to compare + if isinstance(other, Interval): + other_dtype = "interval" + elif not is_categorical_dtype(other): + other_dtype = other.dtype + else: + # for categorical defer to categories for dtype + other_dtype = other.categories.dtype + + # extract intervals if we have interval categories with matching closed + if is_interval_dtype(other_dtype): + if self.closed != other.categories.closed: + return np.zeros(len(self), dtype=bool) + other = other.categories.take(other.codes) + + # interval-like -> need same closed and matching endpoints + if is_interval_dtype(other_dtype): + if self.closed != other.closed: + return np.zeros(len(self), dtype=bool) + return (self.left == other.left) & (self.right == other.right) + + # non-interval/non-object dtype -> no matches + if not is_object_dtype(other_dtype): + return np.zeros(len(self), dtype=bool) + + # object dtype -> iteratively check for intervals + result = np.zeros(len(self), dtype=bool) + for i, obj in enumerate(other): + # need object to be an Interval with same closed and endpoints + if ( + isinstance(obj, Interval) + and self.closed == obj.closed + and self.left[i] == obj.left + and self.right[i] == obj.right + ): + result[i] = True + + return result + + def __ne__(self, other): + return ~self.__eq__(other) + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should be either Interval objects or NA/NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + (Not implemented yet for IntervalArray) + Method to use for filling holes in reindexed Series + limit : int, default None + (Not implemented yet for IntervalArray) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. 
+ + Returns + ------- + filled : IntervalArray with NA/NaN filled + """ + if method is not None: + raise TypeError("Filling by method is not supported for IntervalArray.") + if limit is not None: + raise TypeError("limit is not supported for IntervalArray.") + + if not isinstance(value, ABCInterval): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." + ) + raise TypeError(msg) + + value = getattr(value, "_values", value) + self._check_closed_matches(value, name="value") + + left = self.left.fillna(value=value.left) + right = self.right.fillna(value=value.right) + return self._shallow_copy(left, right) + + @property + def dtype(self): + return IntervalDtype(self.left.dtype) + + def astype(self, dtype, copy=True): + """ + Cast to an ExtensionArray or NumPy array with dtype 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ExtensionArray or ndarray + ExtensionArray or NumPy ndarray with 'dtype' for its dtype. + """ + dtype = pandas_dtype(dtype) + if is_interval_dtype(dtype): + if dtype == self.dtype: + return self.copy() if copy else self + + # need to cast to different subtype + try: + new_left = self.left.astype(dtype.subtype) + new_right = self.right.astype(dtype.subtype) + except TypeError: + msg = ( + f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" + ) + raise TypeError(msg) + return self._shallow_copy(new_left, new_right) + elif is_categorical_dtype(dtype): + return Categorical(np.asarray(self)) + # TODO: This try/except will be repeated. + try: + return np.asarray(self).astype(dtype, copy=copy) + except (TypeError, ValueError): + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) + + @classmethod + def _concat_same_type(cls, to_concat): + """ + Concatenate multiple IntervalArray + + Parameters + ---------- + to_concat : sequence of IntervalArray + + Returns + ------- + IntervalArray + """ + closed = {interval.closed for interval in to_concat} + if len(closed) != 1: + raise ValueError("Intervals must all be closed on the same side.") + closed = closed.pop() + + left = np.concatenate([interval.left for interval in to_concat]) + right = np.concatenate([interval.right for interval in to_concat]) + return cls._simple_new(left, right, closed=closed, copy=False) + + def _shallow_copy(self, left=None, right=None, closed=None): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : array-like + Values to be used for the left-side of the the intervals. + If None, the existing left and right values will be used. + + right : array-like + Values to be used for the right-side of the the intervals. + If None and left is IntervalArray-like, the left and right + of the IntervalArray-like will be used. + + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. If None, the existing closed will be used. 
+ """ + if left is None: + + # no values passed + left, right = self.left, self.right + + elif right is None: + + # only single value passed, could be an IntervalArray + # or array of Intervals + if not isinstance(left, (type(self), ABCIntervalIndex)): + left = type(self)(left) + + left, right = left.left, left.right + else: + + # both left and right are values + pass + + closed = closed or self.closed + return self._simple_new(left, right, closed=closed, verify_integrity=False) + + def copy(self): + """ + Return a copy of the array. + + Returns + ------- + IntervalArray + """ + left = self.left.copy(deep=True) + right = self.right.copy(deep=True) + closed = self.closed + # TODO: Could skip verify_integrity here. + return type(self).from_arrays(left, right, closed=closed) + + def isna(self): + return isna(self.left) + + @property + def nbytes(self) -> int: + return self.left.nbytes + self.right.nbytes + + @property + def size(self) -> int: + # Avoid materializing self.values + return self.left.size + + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): + """ + Take elements from the IntervalArray. + + Parameters + ---------- + indices : sequence of integers + Indices to be taken. + + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : Interval or NA, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + axis : any, default None + Present for compat with IntervalIndex; does nothing. + + Returns + ------- + IntervalArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + """ + nv.validate_take(tuple(), kwargs) + + fill_left = fill_right = fill_value + if allow_fill: + if fill_value is None: + fill_left = fill_right = self.left._na_value + elif is_interval(fill_value): + self._check_closed_matches(fill_value, name="fill_value") + fill_left, fill_right = fill_value.left, fill_value.right + elif not is_scalar(fill_value) and notna(fill_value): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. " + f"Got a '{type(fill_value).__name__}' instead." + ) + raise ValueError(msg) + + left_take = take( + self.left, indices, allow_fill=allow_fill, fill_value=fill_left + ) + right_take = take( + self.right, indices, allow_fill=allow_fill, fill_value=fill_right + ) + + return self._shallow_copy(left_take, right_take) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each interval. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + # TODO: implement this is a non-naive way! + return value_counts(np.asarray(self), dropna=dropna) + + # Formatting + + def _format_data(self): + + # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical + n = len(self) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) + + formatter = str + + if n == 0: + summary = "[]" + elif n == 1: + first = formatter(self[0]) + summary = f"[{first}]" + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = f"[{first}, {last}]" + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + head_str = ", ".join(head) + tail_str = ", ".join(tail) + summary = f"[{head_str} ... {tail_str}]" + else: + tail = [formatter(x) for x in self] + tail_str = ", ".join(tail) + summary = f"[{tail_str}]" + + return summary + + def __repr__(self) -> str: + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = self._format_data() + class_name = f"<{type(self).__name__}>\n" + + template = ( + f"{class_name}" + f"{data}\n" + f"Length: {len(self)}, closed: {self.closed}, dtype: {self.dtype}" + ) + return template + + def _format_space(self): + space = " " * (len(type(self).__name__) + 1) + return f"\n{space}" + + @property + def left(self): + """ + Return the left endpoints of each Interval in the IntervalArray as + an Index. + """ + return self._left + + @property + def right(self): + """ + Return the right endpoints of each Interval in the IntervalArray as + an Index. + """ + return self._right + + @property + def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither. + """ + return self._closed + + _interval_shared_docs["set_closed"] = textwrap.dedent( + """ + Return an %(klass)s identical to the current one, but closed on the + specified side. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + new_index : %(klass)s + + %(examples)s\ + """ + ) + + @Appender( + _interval_shared_docs["set_closed"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) + >>> index + + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + >>> index.set_closed('both') + + [[0, 1], [1, 2], [2, 3]] + Length: 3, closed: both, dtype: interval[int64] + """ + ), + ) + ) + def set_closed(self, closed): + if closed not in _VALID_CLOSED: + msg = f"invalid option for 'closed': {closed}" + raise ValueError(msg) + + return self._shallow_copy(closed=closed) + + @property + def length(self): + """ + Return an Index with entries denoting the length of each Interval in + the IntervalArray. + """ + try: + return self.right - self.left + except TypeError: + # length not defined for some types, e.g. string + msg = ( + "IntervalArray contains Intervals without defined length, " + "e.g. 
Intervals with string endpoints" + ) + raise TypeError(msg) + + @property + def mid(self): + """ + Return the midpoint of each Interval in the IntervalArray as an Index. + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + _interval_shared_docs[ + "is_non_overlapping_monotonic" + ] = """ + Return True if the %(klass)s is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False. + """ + # https://github.com/python/mypy/issues/1362 + # Mypy does not support decorated properties + @property # type: ignore + @Appender( + _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs + ) + def is_non_overlapping_monotonic(self): + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) + # we already require left <= right + + # strict inequality for closed == 'both'; equality implies overlapping + # at a point when both sides of intervals are included + if self.closed == "both": + return bool( + (self.right[:-1] < self.left[1:]).all() + or (self.left[:-1] > self.right[1:]).all() + ) + + # non-strict inequality when closed != 'both'; at least one side is + # not included in the intervals, so equality does not imply overlapping + return bool( + (self.right[:-1] <= self.left[1:]).all() + or (self.left[:-1] >= self.right[1:]).all() + ) + + # Conversion + def __array__(self, dtype=None) -> np.ndarray: + """ + Return the IntervalArray's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self.isna() + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError: + raise TypeError( + "Conversion to arrow with subtype '{}' " + "is not supported".format(self.dtype.subtype) + ) + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) " + "attributes".format( + self.dtype.subtype, type.subtype, self.closed, type.closed + ) + ) + else: + raise TypeError( + "Not supported to convert IntervalArray to '{0}' type".format(type) + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + + _interval_shared_docs[ + "to_tuples" + ] = """ + Return an %(return_type)s of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : boolean, default True + Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA + value itself if False, ``nan``. + + .. versionadded:: 0.23.0 + + Returns + ------- + tuples: %(return_type)s + %(examples)s\ + """ + + @Appender( + _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") + ) + def to_tuples(self, na_tuple=True): + tuples = com.asarray_tuplesafe(zip(self.left, self.right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self.isna(), tuples, np.nan) + return tuples + + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + left_repeat = self.left.repeat(repeats) + right_repeat = self.right.repeat(repeats) + return self._shallow_copy(left=left_repeat, right=right_repeat) + + _interval_shared_docs["contains"] = textwrap.dedent( + """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the %(klass)s. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + %(klass)s.overlaps : Check if an Interval overlaps the values in the + %(klass)s. 
+ + Examples + -------- + %(examples)s + >>> intervals.contains(0.5) + array([ True, False, False]) + """ + ) + + @Appender( + _interval_shared_docs["contains"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def contains(self, other): + if isinstance(other, Interval): + raise NotImplementedError("contains not implemented for two intervals") + + return (self.left < other if self.open_left else self.left <= other) & ( + other < self.right if self.open_right else other <= self.right + ) + + _interval_shared_docs["overlaps"] = textwrap.dedent( + """ + Check elementwise if an Interval overlaps the values in the %(klass)s. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + other : %(klass)s + Interval to check against for an overlap. + + Returns + ------- + ndarray + Boolean array positionally indicating where an overlap occurs. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + + Examples + -------- + %(examples)s + >>> intervals.overlaps(pd.Interval(0.5, 1.5)) + array([ True, True, False]) + + Intervals that share closed endpoints overlap: + + >>> intervals.overlaps(pd.Interval(1, 3, closed='left')) + array([ True, True, True]) + + Intervals that only have an open endpoint in common do not overlap: + + >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) + array([False, True, False]) + """ + ) + + @Appender( + _interval_shared_docs["overlaps"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> data = [(0, 1), (1, 3), (2, 4)] + >>> intervals = pd.arrays.IntervalArray.from_tuples(data) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def overlaps(self, other): + if isinstance(other, (IntervalArray, ABCIntervalIndex)): + raise NotImplementedError + elif not isinstance(other, Interval): + msg = f"`other` must be Interval-like, got {type(other).__name__}" + raise TypeError(msg) + + # equality is okay if both endpoints are closed (overlap at a point) + op1 = le if (self.closed_left and other.closed_right) else lt + op2 = le if (other.closed_left and self.closed_right) else lt + + # overlaps is equivalent negation of two interval being disjoint: + # disjoint = (A.left > B.right) or (B.left > A.right) + # (simplifying the negation allows this to be done in less operations) + return op1(self.left, other.right) & op2(other.left, self.right) + + +def maybe_convert_platform_interval(values): + """ + Try to do platform conversion, with special casing for IntervalArray. + Wrapper around maybe_convert_platform that alters the default return + dtype in certain cases to be compatible with IntervalArray. For example, + empty lists return with integer dtype instead of object dtype, which is + prohibited for IntervalArray. 
+ + Parameters + ---------- + values : array-like + + Returns + ------- + array + """ + if isinstance(values, (list, tuple)) and len(values) == 0: + # GH 19016 + # empty lists/tuples get object dtype by default, but this is + # prohibited for IntervalArray, so coerce to integer instead + return np.array([], dtype=np.int64) + elif is_categorical_dtype(values): + values = np.asarray(values) + + return maybe_convert_platform(values) diff --git a/venv/Lib/site-packages/pandas/core/arrays/masked.py b/venv/Lib/site-packages/pandas/core/arrays/masked.py new file mode 100644 index 0000000..aa71684 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/masked.py @@ -0,0 +1,248 @@ +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib, missing as libmissing + +from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.indexers import check_array_indexer + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): + """ + Base class for masked arrays (which use _data and _mask to store the data). + + numpy based + """ + + _data: np.ndarray + _mask: np.ndarray + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value: "Scalar" + + def __getitem__(self, item): + if is_integer(item): + if self._mask[item]: + return self.dtype.na_value + return self._data[item] + + item = check_array_indexer(self, item) + + return type(self)(self._data[item], self._mask[item]) + + def __iter__(self): + for i in range(len(self)): + if self._mask[i]: + yield self.dtype.na_value + else: + yield self._data[i] + + def __len__(self) -> int: + return len(self._data) + + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default, + ): + """ + Convert to a NumPy Array. + + By default converts to an object-dtype NumPy array. Specify the `dtype` and + `na_value` keywords to customize the conversion. + + Parameters + ---------- + dtype : dtype, default object + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + the array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is the equivalent numpy dtype. + na_value : scalar, optional + Scalar missing value indicator to use in numpy array. Defaults + to the native missing value indicator of this array (pd.NA). + + Returns + ------- + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a.to_numpy() + array([True, False, NA], dtype=object) + + When no missing values are present, an equivalent dtype can be used. + + >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool") + array([ True, False]) + >>> pd.array([1, 2], dtype="Int64").to_numpy("int64") + array([1, 2]) + + However, requesting such dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. 
+ + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + + [True, False, NA] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) + """ + if na_value is lib.no_default: + na_value = libmissing.NA + if dtype is None: + dtype = object + if self._hasna: + if ( + not (is_object_dtype(dtype) or is_string_dtype(dtype)) + and na_value is libmissing.NA + ): + raise ValueError( + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." + ) + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__(self, dtype=None) -> np.ndarray: + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + return self.to_numpy(dtype=dtype) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + + def isna(self): + return self._mask + + @property + def _na_value(self): + return self.dtype.na_value + + @property + def nbytes(self): + return self._data.nbytes + self._mask.nbytes + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) + return cls(data, mask) + + def take(self, indexer, allow_fill=False, fill_value=None): + # we always fill with 1 internally + # to avoid upcasting + data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) + + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask, copy=False) + + def copy(self): + data, mask = self._data, self._mask + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, copy=False) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + from pandas.arrays import IntegerArray + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(object) + + # if we want nans, count the mask + if dropna: + counts = value_counts.values + else: + counts = np.empty(len(value_counts) + 1, dtype="int64") + counts[:-1] = value_counts + counts[-1] = self._mask.sum() + + index = Index( + np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + dtype=object, + ) + + mask = np.zeros(len(counts), dtype="bool") + counts = IntegerArray(counts, mask) + + return Series(counts, index=index) diff --git a/venv/Lib/site-packages/pandas/core/arrays/numpy_.py b/venv/Lib/site-packages/pandas/core/arrays/numpy_.py new file mode 100644 index 0000000..8b1d1e5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/numpy_.py @@ -0,0 +1,465 @@ +import numbers +from typing import Union + +import numpy as np +from numpy.lib.mixins import NDArrayOperatorsMixin + +from pandas._libs import lib +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.missing import isna + +from pandas import compat +from pandas.core import nanops +from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer +from pandas.core.missing import backfill_1d, pad_1d + + +class PandasDtype(ExtensionDtype): + """ + A Pandas ExtensionDtype for NumPy dtypes. + + .. versionadded:: 0.24.0 + + This is mostly for internal compatibility, and is not especially + useful on its own. + + Parameters + ---------- + dtype : numpy.dtype + """ + + _metadata = ("_dtype",) + + def __init__(self, dtype): + dtype = np.dtype(dtype) + self._dtype = dtype + self._type = dtype.type + + def __repr__(self) -> str: + return f"PandasDtype({repr(self.name)})" + + @property + def numpy_dtype(self): + """The NumPy dtype this PandasDtype wraps.""" + return self._dtype + + @property + def name(self): + return self._dtype.name + + @property + def type(self): + return self._type + + @property + def _is_numeric(self): + # exclude object, str, unicode, void. + return self.kind in set("biufc") + + @property + def _is_boolean(self): + return self.kind == "b" + + @classmethod + def construct_from_string(cls, string): + try: + return cls(np.dtype(string)) + except TypeError as err: + raise TypeError( + f"Cannot construct a 'PandasDtype' from '{string}'" + ) from err + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return PandasArray + + @property + def kind(self): + return self._dtype.kind + + @property + def itemsize(self): + """The element size of this data-type object.""" + return self._dtype.itemsize + + +class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): + """ + A pandas ExtensionArray for NumPy data. + + .. versionadded:: 0.24.0 + + This is mostly for internal compatibility, and is not especially + useful on its own. + + Parameters + ---------- + values : ndarray + The NumPy ndarray to wrap. Must be 1-dimensional. + copy : bool, default False + Whether to copy `values`. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + # If you're wondering why pd.Series(cls) doesn't put the array in an + # ExtensionBlock, search for `ABCPandasArray`. We check for + # that _typ to ensure that that users don't unnecessarily use EAs inside + # pandas internals, which turns off things like block consolidation. + _typ = "npy_extension" + __array_priority__ = 1000 + _ndarray: np.ndarray + + # ------------------------------------------------------------------------ + # Constructors + + def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False): + if isinstance(values, type(self)): + values = values._ndarray + if not isinstance(values, np.ndarray): + raise ValueError( + f"'values' must be a NumPy array, not {type(values).__name__}" + ) + + if values.ndim != 1: + raise ValueError("PandasArray must be 1-dimensional.") + + if copy: + values = values.copy() + + self._ndarray = values + self._dtype = PandasDtype(values.dtype) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if isinstance(dtype, PandasDtype): + dtype = dtype._dtype + + result = np.asarray(scalars, dtype=dtype) + if copy and result is scalars: + result = result.copy() + return cls(result) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate(to_concat)) + + # ------------------------------------------------------------------------ + # Data + + @property + def dtype(self): + return self._dtype + + # ------------------------------------------------------------------------ + # NumPy Array Interface + + def __array__(self, dtype=None) -> np.ndarray: + return np.asarray(self._ndarray, dtype=dtype) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # Lightly modified version of + # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\ + # numpy.lib.mixins.NDArrayOperatorsMixin.html + # The primary modification is not boxing scalar return values + # in PandasArray, since pandas' ExtensionArrays are 1-d. + out = kwargs.get("out", ()) + for x in inputs + out: + # Only support operations with instances of _HANDLED_TYPES. + # Use PandasArray instead of type(self) for isinstance to + # allow subclasses that don't override __array_ufunc__ to + # handle PandasArray objects. + if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)): + return NotImplemented + + # Defer to the implementation of the ufunc on unwrapped values. 
+ inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs) + if out: + kwargs["out"] = tuple( + x._ndarray if isinstance(x, PandasArray) else x for x in out + ) + result = getattr(ufunc, method)(*inputs, **kwargs) + + if type(result) is tuple and len(result): + # multiple return values + if not lib.is_scalar(result[0]): + # re-box array-like results + return tuple(type(self)(x) for x in result) + else: + # but not scalar reductions + return result + elif method == "at": + # no return value + return None + else: + # one return value + if not lib.is_scalar(result): + # re-box array-like results, but not scalar reductions + result = type(self)(result) + return result + + # ------------------------------------------------------------------------ + # Pandas ExtensionArray Interface + + def __getitem__(self, item): + if isinstance(item, type(self)): + item = item._ndarray + + item = check_array_indexer(self, item) + + result = self._ndarray[item] + if not lib.is_scalar(item): + result = type(self)(result) + return result + + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + + if not scalar_key and scalar_value: + key = np.asarray(key) + + if not scalar_value: + value = np.asarray(value, dtype=self._ndarray.dtype) + + self._ndarray[key] = value + + def __len__(self) -> int: + return len(self._ndarray) + + @property + def nbytes(self) -> int: + return self._ndarray.nbytes + + def isna(self): + return isna(self._ndarray) + + def fillna(self, value=None, method=None, limit=None): + # TODO(_values_for_fillna): remove this + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. 
Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self._ndarray, limit=limit, mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def take(self, indices, allow_fill=False, fill_value=None): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value + result = take( + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value + ) + return type(self)(result) + + def copy(self): + return type(self)(self._ndarray.copy()) + + def _values_for_argsort(self): + return self._ndarray + + def _values_for_factorize(self): + return self._ndarray, -1 + + def unique(self): + return type(self)(unique(self._ndarray)) + + # ------------------------------------------------------------------------ + # Reductions + + def _reduce(self, name, skipna=True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = f"'{type(self).__name__}' does not implement reduction '{name}'" + raise TypeError(msg) + + def any(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_any((), dict(out=out, keepdims=keepdims)) + return nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + + def all(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_all((), dict(out=out, keepdims=keepdims)) + return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + + def min(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_min((), dict(out=out, keepdims=keepdims)) + return nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + + def max(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_max((), dict(out=out, keepdims=keepdims)) + return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + + def sum( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_sum( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nansum( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + + def prod( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_prod( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nanprod( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + + def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) + return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + + def median( + self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True + ): + nv.validate_median( + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + ) + return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + + def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + ) + return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), 
dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" + ) + return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" + ) + return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" + ) + return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + + def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" + ) + return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + + # ------------------------------------------------------------------------ + # Additional Methods + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + result = np.asarray(self._ndarray, dtype=dtype) + + if (copy or na_value is not lib.no_default) and result is self._ndarray: + result = result.copy() + + if na_value is not lib.no_default: + result[self.isna()] = na_value + + return result + + @Appender(ExtensionArray.searchsorted.__doc__) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) + + # ------------------------------------------------------------------------ + # Ops + + def __invert__(self): + return type(self)(~self._ndarray) + + @classmethod + def _create_arithmetic_method(cls, op): + def arithmetic_method(self, other): + if isinstance(other, (ABCIndexClass, ABCSeries)): + return NotImplemented + + elif isinstance(other, cls): + other = other._ndarray + + with np.errstate(all="ignore"): + result = op(self._ndarray, other) + + if op is divmod: + a, b = result + return cls(a), cls(b) + + return cls(result) + + return compat.set_function_name(arithmetic_method, f"__{op.__name__}__", cls) + + _create_comparison_method = _create_arithmetic_method + + +PandasArray._add_arithmetic_ops() +PandasArray._add_comparison_ops() diff --git a/venv/Lib/site-packages/pandas/core/arrays/period.py b/venv/Lib/site-packages/pandas/core/arrays/period.py new file mode 100644 index 0000000..8b49c21 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/period.py @@ -0,0 +1,1054 @@ +from datetime import timedelta +import operator +from typing import Any, Callable, List, Optional, Sequence, Union + +import numpy as np + +from pandas._libs.tslibs import ( + NaT, + NaTType, + frequencies as libfrequencies, + iNaT, + period as libperiod, +) +from pandas._libs.tslibs.fields import isleapyear_arr +from pandas._libs.tslibs.period import ( + DIFFERENT_FREQ, + IncompatibleFrequency, + Period, + get_period_field_arr, + period_asfreq_arr, +) +from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + _TD_DTYPE, + ensure_object, + is_datetime64_dtype, + is_float_dtype, + is_period_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import PeriodDtype +from pandas.core.dtypes.generic import ( + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna, notna + +import pandas.core.algorithms as algos +from pandas.core.arrays import datetimelike as dtl +import 
pandas.core.common as com + +from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick + + +def _field_accessor(name, alias, docstring=None): + def f(self): + base, mult = libfrequencies.get_freq_code(self.freq) + result = get_period_field_arr(alias, self.asi8, base) + return result + + f.__name__ = name + f.__doc__ = docstring + return property(f) + + +class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): + """ + Pandas ExtensionArray for storing Period data. + + Users should use :func:`period_array` to create new instances. + + Parameters + ---------- + values : Union[PeriodArray, Series[period], ndarray[int], PeriodIndex] + The data to store. These should be arrays that can be directly + converted to ordinals without inference or copy (PeriodArray, + ndarray[int64]), or a box around such an array (Series[period], + PeriodIndex). + freq : str or DateOffset + The `freq` to use for the array. Mostly applicable when `values` + is an ndarray of integers, when `freq` is required. When `values` + is a PeriodArray (or box around), it's checked that ``values.freq`` + matches `freq`. + dtype : PeriodDtype, optional + A PeriodDtype instance from which to extract a `freq`. If both + `freq` and `dtype` are specified, then the frequencies must match. + copy : bool, default False + Whether to copy the ordinals before storing. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + period_array : Create a new PeriodArray. + PeriodIndex : Immutable Index for period data. + + Notes + ----- + There are two components to a PeriodArray + + - ordinals : integer ndarray + - freq : pd.tseries.offsets.Offset + + The values are physically stored as a 1-D ndarray of integers. These are + called "ordinals" and represent some kind of offset from a base. + + The `freq` indicates the span covered by each element of the array. + All elements in the PeriodArray have the same `freq`. 
+ """ + + # array priority higher than numpy scalars + __array_priority__ = 1000 + _typ = "periodarray" # ABCPeriodArray + _scalar_type = Period + _recognized_scalars = (Period,) + _is_recognized_dtype = is_period_dtype + + # Names others delegate to us + _other_ops: List[str] = [] + _bool_ops = ["is_leap_year"] + _object_ops = ["start_time", "end_time", "freq"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "weekday", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + "daysinmonth", + ] + _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"] + + # -------------------------------------------------------------------- + # Constructors + + def __init__(self, values, freq=None, dtype=None, copy=False): + freq = validate_dtype_freq(dtype, freq) + + if freq is not None: + freq = Period._maybe_convert_freq(freq) + + if isinstance(values, ABCSeries): + values = values._values + if not isinstance(values, type(self)): + raise TypeError("Incorrect dtype") + + elif isinstance(values, ABCPeriodIndex): + values = values._values + + if isinstance(values, type(self)): + if freq is not None and freq != values.freq: + raise raise_on_incompatible(values, freq) + values, freq = values._data, values.freq + + values = np.array(values, dtype="int64", copy=copy) + self._data = values + if freq is None: + raise ValueError("freq is not specified and cannot be inferred") + self._dtype = PeriodDtype(freq) + + @classmethod + def _simple_new(cls, values, freq=None, **kwargs): + # alias for PeriodArray.__init__ + return cls(values, freq=freq, **kwargs) + + @classmethod + def _from_sequence( + cls, + scalars: Sequence[Optional[Period]], + dtype: Optional[PeriodDtype] = None, + copy: bool = False, + ) -> ABCPeriodArray: + if dtype: + freq = dtype.freq + else: + freq = None + + if isinstance(scalars, cls): + validate_dtype_freq(scalars.dtype, freq) + if copy: + scalars = scalars.copy() + return scalars + + periods = np.asarray(scalars, dtype=object) + if copy: + periods = periods.copy() + + freq = freq or libperiod.extract_freq(periods) + ordinals = libperiod.extract_ordinals(periods, freq) + return cls(ordinals, freq=freq) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype, copy) + + @classmethod + def _from_datetime64(cls, data, freq, tz=None): + """ + Construct a PeriodArray from a datetime64 array + + Parameters + ---------- + data : ndarray[datetime64[ns], datetime64[ns, tz]] + freq : str or Tick + tz : tzinfo, optional + + Returns + ------- + PeriodArray[freq] + """ + data, freq = dt64arr_to_periodarr(data, freq, tz) + return cls(data, freq=freq) + + @classmethod + def _generate_range(cls, start, end, periods, freq, fields): + periods = dtl.validate_periods(periods) + + if freq is not None: + freq = Period._maybe_convert_freq(freq) + + field_count = len(fields) + if start is not None or end is not None: + if field_count > 0: + raise ValueError( + "Can either instantiate from fields or endpoints, but not both" + ) + subarr, freq = _get_ordinal_range(start, end, periods, freq) + elif field_count > 0: + subarr, freq = _range_from_fields(freq=freq, **fields) + else: + raise ValueError("Not enough parameters to construct Period range") + + return subarr, freq + + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value: 
Union[Period, NaTType]) -> int: + if value is NaT: + return value.value + elif isinstance(value, self._scalar_type): + if not isna(value): + self._check_compatible_with(value) + return value.ordinal + else: + raise ValueError(f"'value' should be a Period. Got '{value}' instead.") + + def _scalar_from_string(self, value: str) -> Period: + return Period(value, freq=self.freq) + + def _check_compatible_with(self, other, setitem: bool = False): + if other is NaT: + return + if self.freqstr != other.freqstr: + raise raise_on_incompatible(self, other) + + # -------------------------------------------------------------------- + # Data / Attributes + + @cache_readonly + def dtype(self): + return self._dtype + + # error: Read-only property cannot override read-write property [misc] + @property # type: ignore + def freq(self): + """ + Return the frequency object for this PeriodArray. + """ + return self.dtype.freq + + def __array__(self, dtype=None) -> np.ndarray: + # overriding DatetimelikeArray + return np.array(list(self), dtype=object) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + if type is not None: + if pyarrow.types.is_integer(type): + return pyarrow.array(self._data, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different" + " 'freq' ({0} vs {1})".format(self.freqstr, type.freq) + ) + else: + raise TypeError( + "Not supported to convert PeriodArray to '{0}' type".format(type) + ) + + period_type = ArrowPeriodType(self.freqstr) + storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) + + # -------------------------------------------------------------------- + # Vectorized analogues of Period properties + + year = _field_accessor( + "year", + 0, + """ + The year of the period. + """, + ) + month = _field_accessor( + "month", + 3, + """ + The month as January=1, December=12. + """, + ) + day = _field_accessor( + "day", + 4, + """ + The days of the period. + """, + ) + hour = _field_accessor( + "hour", + 5, + """ + The hour of the period. + """, + ) + minute = _field_accessor( + "minute", + 6, + """ + The minute of the period. + """, + ) + second = _field_accessor( + "second", + 7, + """ + The second of the period. + """, + ) + weekofyear = _field_accessor( + "week", + 8, + """ + The week ordinal of the year. + """, + ) + week = weekofyear + dayofweek = _field_accessor( + "dayofweek", + 10, + """ + The day of the week with Monday=0, Sunday=6. + """, + ) + weekday = dayofweek + dayofyear = day_of_year = _field_accessor( + "dayofyear", + 9, + """ + The ordinal day of the year. + """, + ) + quarter = _field_accessor( + "quarter", + 2, + """ + The quarter of the date. + """, + ) + qyear = _field_accessor("qyear", 1) + days_in_month = _field_accessor( + "days_in_month", + 11, + """ + The number of days in the month. + """, + ) + daysinmonth = days_in_month + + @property + def is_leap_year(self): + """ + Logical indicating if the date belongs to a leap year. 
+ """ + return isleapyear_arr(np.asarray(self.year)) + + @property + def start_time(self): + return self.to_timestamp(how="start") + + @property + def end_time(self): + return self.to_timestamp(how="end") + + def to_timestamp(self, freq=None, how="start"): + """ + Cast to DatetimeArray/Index. + + Parameters + ---------- + freq : str or DateOffset, optional + Target frequency. The default is 'D' for week or longer, + 'S' otherwise. + how : {'s', 'e', 'start', 'end'} + Whether to use the start or end of the time period being converted. + + Returns + ------- + DatetimeArray/Index + """ + from pandas.core.arrays import DatetimeArray + + how = libperiod._validate_end_alias(how) + + end = how == "E" + if end: + if freq == "B": + # roll forward to ensure we land on B date + adjust = Timedelta(1, "D") - Timedelta(1, "ns") + return self.to_timestamp(how="start") + adjust + else: + adjust = Timedelta(1, "ns") + return (self + self.freq).to_timestamp(how="start") - adjust + + if freq is None: + base, mult = libfrequencies.get_freq_code(self.freq) + freq = libfrequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) + + base, mult = libfrequencies.get_freq_code(freq) + new_data = self.asfreq(freq, how=how) + + new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) + return DatetimeArray._from_sequence(new_data, freq="infer") + + # -------------------------------------------------------------------- + # Array-like / EA-Interface Methods + + def _values_for_argsort(self): + return self._data + + # -------------------------------------------------------------------- + + def _time_shift(self, periods, freq=None): + """ + Shift each value by `periods`. + + Note this is different from ExtensionArray.shift, which + shifts the *position* of each element, padding the end with + missing values. + + Parameters + ---------- + periods : int + Number of periods to shift by. + freq : pandas.DateOffset, pandas.Timedelta, or str + Frequency increment to shift by. + """ + if freq is not None: + raise TypeError( + "`freq` argument is not supported for " + f"{type(self).__name__}._time_shift" + ) + values = self.asi8 + periods * self.freq.n + if self._hasnans: + values[self._isnan] = iNaT + return type(self)(values, freq=self.freq) + + @property + def _box_func(self): + return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + + def asfreq(self, freq=None, how="E"): + """ + Convert the Period Array/Index to the specified frequency `freq`. + + Parameters + ---------- + freq : str + A frequency. + how : str {'E', 'S'} + Whether the elements should be aligned to the end + or start within pa period. + + * 'E', 'END', or 'FINISH' for end, + * 'S', 'START', or 'BEGIN' for start. + + January 31st ('END') vs. January 1st ('START') for example. + + Returns + ------- + Period Array/Index + Constructed with the new frequency. 
+ + Examples + -------- + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') + >>> pidx + PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], + dtype='period[A-DEC]', freq='A-DEC') + + >>> pidx.asfreq('M') + PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', + '2015-12'], dtype='period[M]', freq='M') + + >>> pidx.asfreq('M', how='S') + PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01', + '2015-01'], dtype='period[M]', freq='M') + """ + how = libperiod._validate_end_alias(how) + + freq = Period._maybe_convert_freq(freq) + + base1, mult1 = libfrequencies.get_freq_code(self.freq) + base2, mult2 = libfrequencies.get_freq_code(freq) + + asi8 = self.asi8 + # mult1 can't be negative or 0 + end = how == "E" + if end: + ordinal = asi8 + mult1 - 1 + else: + ordinal = asi8 + + new_data = period_asfreq_arr(ordinal, base1, base2, end) + + if self._hasnans: + new_data[self._isnan] = iNaT + + return type(self)(new_data, freq=freq) + + # ------------------------------------------------------------------ + # Rendering Methods + + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): + """ + actually format my specific types + """ + values = self.astype(object) + + if date_format: + formatter = lambda dt: dt.strftime(date_format) + else: + formatter = lambda dt: str(dt) + + if self._hasnans: + mask = self._isnan + values[mask] = na_rep + imask = ~mask + values[imask] = np.array([formatter(dt) for dt in values[imask]]) + else: + values = np.array([formatter(dt) for dt in values]) + return values + + # ------------------------------------------------------------------ + + def astype(self, dtype, copy=True): + # We handle Period[T] -> Period[U] + # Our parent handles everything else. + dtype = pandas_dtype(dtype) + + if is_period_dtype(dtype): + return self.asfreq(dtype.freq) + return super().astype(dtype, copy=copy) + + # ------------------------------------------------------------------ + # Arithmetic Methods + + def _sub_datelike(self, other): + assert other is not NaT + return NotImplemented + + def _sub_period(self, other): + # If the operation is well-defined, we return an object-Index + # of DateOffsets. Null entries are filled with pd.NaT + self._check_compatible_with(other) + asi8 = self.asi8 + new_data = asi8 - other.ordinal + new_data = np.array([self.freq * x for x in new_data]) + + if self._hasnans: + new_data[self._isnan] = NaT + + return new_data + + def _addsub_int_array( + self, other: np.ndarray, op: Callable[[Any, Any], Any], + ) -> "PeriodArray": + """ + Add or subtract array of integers; equivalent to applying + `_time_shift` pointwise. + + Parameters + ---------- + other : np.ndarray[integer-dtype] + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + + assert op in [operator.add, operator.sub] + if op is operator.sub: + other = -other + res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = res_values.view("i8") + res_values[self._isnan] = iNaT + return type(self)(res_values, freq=self.freq) + + def _add_offset(self, other): + assert not isinstance(other, Tick) + base = libfrequencies.get_base_alias(other.rule_code) + if base != self.freq.rule_code: + raise raise_on_incompatible(self, other) + + # Note: when calling parent class's _add_timedeltalike_scalar, + # it will call delta_to_nanoseconds(delta). 
Because delta here + # is an integer, delta_to_nanoseconds will return it unchanged. + result = super()._add_timedeltalike_scalar(other.n) + return type(self)(result, freq=self.freq) + + def _add_timedeltalike_scalar(self, other): + """ + Parameters + ---------- + other : timedelta, Tick, np.timedelta64 + + Returns + ------- + result : ndarray[int64] + """ + assert isinstance(self.freq, Tick) # checked by calling function + assert isinstance(other, (timedelta, np.timedelta64, Tick)) + + if notna(other): + # special handling for np.timedelta64("NaT"), avoid calling + # _check_timedeltalike_freq_compat as that would raise TypeError + other = self._check_timedeltalike_freq_compat(other) + + # Note: when calling parent class's _add_timedeltalike_scalar, + # it will call delta_to_nanoseconds(delta). Because delta here + # is an integer, delta_to_nanoseconds will return it unchanged. + ordinals = super()._add_timedeltalike_scalar(other) + return ordinals + + def _add_delta_tdi(self, other): + """ + Parameters + ---------- + other : TimedeltaArray or ndarray[timedelta64] + + Returns + ------- + result : ndarray[int64] + """ + assert isinstance(self.freq, Tick) # checked by calling function + + if not np.all(isna(other)): + delta = self._check_timedeltalike_freq_compat(other) + else: + # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT + return self + np.timedelta64("NaT") + + return self._addsub_int_array(delta, operator.add).asi8 + + def _add_delta(self, other): + """ + Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self, yielding a new PeriodArray + + Parameters + ---------- + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} + + Returns + ------- + result : PeriodArray + """ + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) + + new_ordinals = super()._add_delta(other) + return type(self)(new_ordinals, freq=self.freq) + + def _check_timedeltalike_freq_compat(self, other): + """ + Arithmetic operations with timedelta-like scalars or array `other` + are only valid if `other` is an integer multiple of `self.freq`. + If the operation is valid, find that integer multiple. Otherwise, + raise because the operation is invalid. + + Parameters + ---------- + other : timedelta, np.timedelta64, Tick, + ndarray[timedelta64], TimedeltaArray, TimedeltaIndex + + Returns + ------- + multiple : int or ndarray[int64] + + Raises + ------ + IncompatibleFrequency + """ + assert isinstance(self.freq, Tick) # checked by calling function + own_offset = frequencies.to_offset(self.freq.rule_code) + base_nanos = delta_to_nanoseconds(own_offset) + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + nanos = delta_to_nanoseconds(other) + + elif isinstance(other, np.ndarray): + # numpy timedelta64 array; all entries must be compatible + assert other.dtype.kind == "m" + if other.dtype != _TD_DTYPE: + # i.e. non-nano unit + # TODO: disallow unit-less timedelta64 + other = other.astype(_TD_DTYPE) + nanos = other.view("i8") + else: + # TimedeltaArray/Index + nanos = other.asi8 + + if np.all(nanos % base_nanos == 0): + # nanos being added is an integer multiple of the + # base-frequency to self.freq + delta = nanos // base_nanos + # delta is the integer (or integer-array) number of periods + # by which will be added to self. 
+ return delta + + raise raise_on_incompatible(self, other) + + +def raise_on_incompatible(left, right): + """ + Helper function to render a consistent error message when raising + IncompatibleFrequency. + + Parameters + ---------- + left : PeriodArray + right : None, DateOffset, Period, ndarray, or timedelta-like + + Returns + ------- + IncompatibleFrequency + Exception to be raised by the caller. + """ + # GH#24283 error message format depends on whether right is scalar + if isinstance(right, np.ndarray) or right is None: + other_freq = None + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)): + other_freq = right.freqstr + else: + other_freq = _delta_to_tick(Timedelta(right)).freqstr + + msg = DIFFERENT_FREQ.format( + cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + ) + return IncompatibleFrequency(msg) + + +# ------------------------------------------------------------------- +# Constructor Helpers + + +def period_array( + data: Sequence[Optional[Period]], + freq: Optional[Union[str, Tick]] = None, + copy: bool = False, +) -> PeriodArray: + """ + Construct a new PeriodArray from a sequence of Period scalars. + + Parameters + ---------- + data : Sequence of Period objects + A sequence of Period objects. These are required to all have + the same ``freq.`` Missing values can be indicated by ``None`` + or ``pandas.NaT``. + freq : str, Tick, or Offset + The frequency of every element of the array. This can be specified + to avoid inferring the `freq` from `data`. + copy : bool, default False + Whether to ensure a copy of the data is made. + + Returns + ------- + PeriodArray + + See Also + -------- + PeriodArray + pandas.PeriodIndex + + Examples + -------- + >>> period_array([pd.Period('2017', freq='A'), + ... pd.Period('2018', freq='A')]) + + ['2017', '2018'] + Length: 2, dtype: period[A-DEC] + + >>> period_array([pd.Period('2017', freq='A'), + ... pd.Period('2018', freq='A'), + ... pd.NaT]) + + ['2017', '2018', 'NaT'] + Length: 3, dtype: period[A-DEC] + + Integers that look like years are handled + + >>> period_array([2000, 2001, 2002], freq='D') + ['2000-01-01', '2001-01-01', '2002-01-01'] + Length: 3, dtype: period[D] + + Datetime-like strings may also be passed + + >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') + + ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] + Length: 4, dtype: period[Q-DEC] + """ + if is_datetime64_dtype(data): + return PeriodArray._from_datetime64(data, freq) + if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)): + return PeriodArray(data, freq) + + # other iterable of some kind + if not isinstance(data, (np.ndarray, list, tuple)): + data = list(data) + + data = np.asarray(data) + + dtype: Optional[PeriodDtype] + if freq: + dtype = PeriodDtype(freq) + else: + dtype = None + + if is_float_dtype(data) and len(data) > 0: + raise TypeError("PeriodIndex does not allow floating point in construction") + + data = ensure_object(data) + + return PeriodArray._from_sequence(data, dtype=dtype) + + +def validate_dtype_freq(dtype, freq): + """ + If both a dtype and a freq are available, ensure they match. If only + dtype is available, extract the implied freq. 
+ + Parameters + ---------- + dtype : dtype + freq : DateOffset or None + + Returns + ------- + freq : DateOffset + + Raises + ------ + ValueError : non-period dtype + IncompatibleFrequency : mismatch between dtype and freq + """ + if freq is not None: + freq = frequencies.to_offset(freq) + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError("dtype must be PeriodDtype") + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + raise IncompatibleFrequency("specified freq and dtype are different") + return freq + + +def dt64arr_to_periodarr(data, freq, tz=None): + """ + Convert an datetime-like array to values Period ordinals. + + Parameters + ---------- + data : Union[Series[datetime64[ns]], DatetimeIndex, ndarray[datetime64ns]] + freq : Optional[Union[str, Tick]] + Must match the `freq` on the `data` if `data` is a DatetimeIndex + or Series. + tz : Optional[tzinfo] + + Returns + ------- + ordinals : ndarray[int] + freq : Tick + The frequency extracted from the Series or DatetimeIndex if that's + used. + + """ + if data.dtype != np.dtype("M8[ns]"): + raise ValueError(f"Wrong dtype: {data.dtype}") + + if freq is None: + if isinstance(data, ABCIndexClass): + data, freq = data._values, data.freq + elif isinstance(data, ABCSeries): + data, freq = data._values, data.dt.freq + + freq = Period._maybe_convert_freq(freq) + + if isinstance(data, (ABCIndexClass, ABCSeries)): + data = data._values + + base, mult = libfrequencies.get_freq_code(freq) + return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq + + +def _get_ordinal_range(start, end, periods, freq, mult=1): + if com.count_not_none(start, end, periods) != 2: + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + + if freq is not None: + _, mult = libfrequencies.get_freq_code(freq) + + if start is not None: + start = Period(start, freq) + if end is not None: + end = Period(end, freq) + + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) + + if is_start_per and is_end_per and start.freq != end.freq: + raise ValueError("start and end must have same freq") + if start is NaT or end is NaT: + raise ValueError("start and end must not be NaT") + + if freq is None: + if is_start_per: + freq = start.freq + elif is_end_per: + freq = end.freq + else: # pragma: no cover + raise ValueError("Could not infer freq from start/end") + + if periods is not None: + periods = periods * mult + if start is None: + data = np.arange( + end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64 + ) + else: + data = np.arange( + start.ordinal, start.ordinal + periods, mult, dtype=np.int64 + ) + else: + data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) + + return data, freq + + +def _range_from_fields( + year=None, + month=None, + quarter=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, +): + if hour is None: + hour = 0 + if minute is None: + minute = 0 + if second is None: + second = 0 + if day is None: + day = 1 + + ordinals = [] + + if quarter is not None: + if freq is None: + freq = "Q" + base = libfrequencies.FreqGroup.FR_QTR + else: + base, mult = libfrequencies.get_freq_code(freq) + if base != libfrequencies.FreqGroup.FR_QTR: + raise AssertionError("base must equal FR_QTR") + + year, quarter = _make_field_arrays(year, quarter) + for y, q in zip(year, quarter): + y, m = libperiod.quarter_to_myear(y, q, freq) + val = libperiod.period_ordinal(y, m, 
1, 1, 1, 1, 0, 0, base) + ordinals.append(val) + else: + base, mult = libfrequencies.get_freq_code(freq) + arrays = _make_field_arrays(year, month, day, hour, minute, second) + for y, mth, d, h, mn, s in zip(*arrays): + ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) + + return np.array(ordinals, dtype=np.int64), freq + + +def _make_field_arrays(*fields): + length = None + for x in fields: + if isinstance(x, (list, np.ndarray, ABCSeries)): + if length is not None and len(x) != length: + raise ValueError("Mismatched Period array lengths") + elif length is None: + length = len(x) + + arrays = [ + np.asarray(x) + if isinstance(x, (np.ndarray, list, ABCSeries)) + else np.repeat(x, length) + for x in fields + ] + + return arrays diff --git a/venv/Lib/site-packages/pandas/core/arrays/sparse/__init__.py b/venv/Lib/site-packages/pandas/core/arrays/sparse/__init__.py new file mode 100644 index 0000000..e928db4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/sparse/__init__.py @@ -0,0 +1,10 @@ +# flake8: noqa: F401 + +from pandas.core.arrays.sparse.accessor import SparseAccessor, SparseFrameAccessor +from pandas.core.arrays.sparse.array import ( + BlockIndex, + IntIndex, + SparseArray, + _make_index, +) +from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/venv/Lib/site-packages/pandas/core/arrays/sparse/accessor.py b/venv/Lib/site-packages/pandas/core/arrays/sparse/accessor.py new file mode 100644 index 0000000..92c05f4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/sparse/accessor.py @@ -0,0 +1,328 @@ +"""Sparse accessor""" + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import find_common_type + +from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.arrays.sparse.array import SparseArray +from pandas.core.arrays.sparse.dtype import SparseDtype + + +class BaseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None): + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + +@delegate_names( + SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property" +) +class SparseAccessor(BaseAccessor, PandasDelegate): + """ + Accessor for SparseSparse from other sparse matrix data types. + """ + + def _validate(self, data): + if not isinstance(data.dtype, SparseDtype): + raise AttributeError(self._validation_msg) + + def _delegate_property_get(self, name, *args, **kwargs): + return getattr(self._parent.array, name) + + def _delegate_method(self, name, *args, **kwargs): + if name == "from_coo": + return self.from_coo(*args, **kwargs) + elif name == "to_coo": + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a Series with sparse values from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the + coords of the non-null entries of the original coo_matrix. + If True, the SparseSeries index consists of the full sorted + (row, col) coordinates of the coo_matrix. + + Returns + ------- + s : Series + A Series with sparse values. 
+ + Examples + -------- + >>> from scipy import sparse + >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 2.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> ss = pd.Series.sparse.from_coo(A) + >>> ss + 0 2 1 + 3 2 + 1 0 3 + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([3], dtype=int32) + """ + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series + from pandas import Series + + result = _coo_to_sparse_series(A, dense_index=dense_index) + result = Series(result.array, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a Series with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + >>> ss = s.astype("Sparse") + >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], + ... column_levels=['C', 'D'], + ... sort_labels=True) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 3.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo + + A, rows, columns = _sparse_series_to_coo( + self._parent, row_levels, column_levels, sort_labels=sort_labels + ) + return A, rows, columns + + def to_dense(self): + """ + Convert a Series from sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series: + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ + from pandas import Series + + return Series( + self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name, + ) + + +class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + .. versionadded:: 0.25.0 + """ + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None): + """ + Create a new DataFrame from a scipy sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. 
+ index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + Each column of the DataFrame is stored as a + :class:`arrays.SparseArray`. + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] + data = dict(enumerate(sparrays)) + result = DataFrame(data, index=index) + result.columns = columns + return result + + def to_dense(self): + """ + Convert a DataFrame with sparse values to dense. + + .. versionadded:: 0.25.0 + + Returns + ------- + DataFrame + A DataFrame with the same values stored as dense arrays. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() for k, v in self._parent.items()} + return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + .. versionadded:: 0.25.0 + + Returns + ------- + coo_matrix : scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + and uint64 will result in a float64 dtype. + """ + import_optional_dependency("scipy") + from scipy.sparse import coo_matrix + + dtype = find_common_type(self._parent.dtypes) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, datas = [], [], [] + for col, name in enumerate(self._parent): + s = self._parent[name] + row = s.array.sp_index.to_int_index().indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + datas.append(s.array.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + datas = np.concatenate(datas) + return coo_matrix((datas, (rows, cols)), shape=self._parent.shape) + + @property + def density(self) -> float: + """ + Ratio of non-sparse points to total (dense) data points. + """ + return np.mean([column.array.density for _, column in self._parent.items()]) + + @staticmethod + def _prep_index(data, index, columns): + import pandas.core.indexes.base as ibase + + N, K = data.shape + if index is None: + index = ibase.default_index(N) + if columns is None: + columns = ibase.default_index(K) + + if len(columns) != K: + raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}") + if len(index) != N: + raise ValueError(f"Index length mismatch: {len(index)} vs. 
{N}") + return index, columns diff --git a/venv/Lib/site-packages/pandas/core/arrays/sparse/array.py b/venv/Lib/site-packages/pandas/core/arrays/sparse/array.py new file mode 100644 index 0000000..b476a01 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/sparse/array.py @@ -0,0 +1,1552 @@ +""" +SparseArray data structure +""" +from collections import abc +import numbers +import operator +from typing import Any, Callable +import warnings + +import numpy as np + +from pandas._libs import index as libindex, lib +import pandas._libs.sparse as splib +from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex +from pandas._libs.tslibs import NaT +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.errors import PerformanceWarning + +from pandas.core.dtypes.cast import ( + astype_nansafe, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, +) +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_datetime64_any_dtype, + is_dtype_equal, + is_integer, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna + +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.sparse.dtype import SparseDtype +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.construction import sanitize_array +from pandas.core.indexers import check_array_indexer +from pandas.core.missing import interpolate_2d +import pandas.core.ops as ops +from pandas.core.ops.common import unpack_zerodim_and_defer + +import pandas.io.formats.printing as printing + +# ---------------------------------------------------------------------------- +# Array + + +_sparray_doc_kwargs = dict(klass="SparseArray") + + +def _get_fill(arr: ABCSparseArray) -> np.ndarray: + """ + Create a 0-dim ndarray containing the fill value + + Parameters + ---------- + arr : SparseArray + + Returns + ------- + fill_value : ndarray + 0-dim ndarray with just the fill value. + + Notes + ----- + coerce fill_value to arr dtype if possible + int64 SparseArray can have NaN as fill_value if there is no missing + """ + try: + return np.asarray(arr.fill_value, dtype=arr.dtype.subtype) + except ValueError: + return np.asarray(arr.fill_value) + + +def _sparse_array_op( + left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str +) -> Any: + """ + Perform a binary operation between two arrays. + + Parameters + ---------- + left : Union[SparseArray, ndarray] + right : Union[SparseArray, ndarray] + op : Callable + The binary operation to perform + name str + Name of the callable. + + Returns + ------- + SparseArray + """ + if name.startswith("__"): + # For lookups in _libs.sparse we need non-dunder op name + name = name[2:-2] + + # dtype used to find corresponding sparse method + ltype = left.dtype.subtype + rtype = right.dtype.subtype + + if not is_dtype_equal(ltype, rtype): + subtype = find_common_type([ltype, rtype]) + ltype = SparseDtype(subtype, left.fill_value) + rtype = SparseDtype(subtype, right.fill_value) + + # TODO(GH-23092): pass copy=False. 
Need to fix astype_nansafe + left = left.astype(ltype) + right = right.astype(rtype) + dtype = ltype.subtype + else: + dtype = ltype + + # dtype the result must have + result_dtype = None + + if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: + with np.errstate(all="ignore"): + result = op(left.to_dense(), right.to_dense()) + fill = op(_get_fill(left), _get_fill(right)) + + if left.sp_index.ngaps == 0: + index = left.sp_index + else: + index = right.sp_index + elif left.sp_index.equals(right.sp_index): + with np.errstate(all="ignore"): + result = op(left.sp_values, right.sp_values) + fill = op(_get_fill(left), _get_fill(right)) + index = left.sp_index + else: + if name[0] == "r": + left, right = right, left + name = name[1:] + + if name in ("and", "or", "xor") and dtype == "bool": + opname = f"sparse_{name}_uint8" + # to make template simple, cast here + left_sp_values = left.sp_values.view(np.uint8) + right_sp_values = right.sp_values.view(np.uint8) + result_dtype = np.bool + else: + opname = f"sparse_{name}_{dtype}" + left_sp_values = left.sp_values + right_sp_values = right.sp_values + + sparse_op = getattr(splib, opname) + + with np.errstate(all="ignore"): + result, index, fill = sparse_op( + left_sp_values, + left.sp_index, + left.fill_value, + right_sp_values, + right.sp_index, + right.fill_value, + ) + + if result_dtype is None: + result_dtype = result.dtype + + return _wrap_result(name, result, index, fill, dtype=result_dtype) + + +def _wrap_result(name, data, sparse_index, fill_value, dtype=None): + """ + wrap op result to have correct dtype + """ + if name.startswith("__"): + # e.g. __eq__ --> eq + name = name[2:-2] + + if name in ("eq", "ne", "lt", "gt", "le", "ge"): + dtype = np.bool + + fill_value = lib.item_from_zerodim(fill_value) + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) + return SparseArray( + data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype + ) + + +class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): + """ + An ExtensionArray for storing sparse data. + + .. versionchanged:: 0.24.0 + + Implements the ExtensionArray interface. + + Parameters + ---------- + data : array-like + A dense array of values to store in the SparseArray. This may contain + `fill_value`. + sparse_index : SparseIndex, optional + index : Index + fill_value : scalar, optional + Elements in `data` that are `fill_value` are not stored in the + SparseArray. For memory savings, this should be the most common value + in `data`. By default, `fill_value` depends on the dtype of `data`: + + =========== ========== + data.dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool False + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The fill value is potentially specified in three ways. In order of + precedence, these are + + 1. The `fill_value` argument + 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is + a ``SparseDtype`` + 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` + is not a ``SparseDtype`` and `data` is a ``SparseArray``. + + kind : {'integer', 'block'}, default 'integer' + The type of storage for sparse locations. + + * 'block': Stores a `block` and `block_length` for each + contiguous *span* of sparse values. This is best when + sparse data tends to be clumped together, with large + regions of ``fill-value`` values between sparse values. + * 'integer': uses an integer to store the location of + each sparse value. 
+ + dtype : np.dtype or SparseDtype, optional + The dtype to use for the SparseArray. For numpy dtypes, this + determines the dtype of ``self.sp_values``. For SparseDtype, + this determines ``self.sp_values`` and ``self.fill_value``. + copy : bool, default False + Whether to explicitly copy the incoming `data` array. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + _pandas_ftype = "sparse" + _subtyp = "sparse_array" # register ABCSparseArray + _deprecations = PandasObject._deprecations | frozenset(["get_values"]) + _sparse_index: SparseIndex + + def __init__( + self, + data, + sparse_index=None, + index=None, + fill_value=None, + kind="integer", + dtype=None, + copy=False, + ): + + if fill_value is None and isinstance(dtype, SparseDtype): + fill_value = dtype.fill_value + + if isinstance(data, type(self)): + # disable normal inference on dtype, sparse_index, & fill_value + if sparse_index is None: + sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + if dtype is None: + dtype = data.dtype + # TODO: make kind=None, and use data.kind? + data = data.sp_values + + # Handle use-provided dtype + if isinstance(dtype, str): + # Two options: dtype='int', regular numpy dtype + # or dtype='Sparse[int]', a sparse dtype + try: + dtype = SparseDtype.construct_from_string(dtype) + except TypeError: + dtype = pandas_dtype(dtype) + + if isinstance(dtype, SparseDtype): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + if index is not None and not is_scalar(data): + raise Exception("must only pass scalars with an index ") + + if is_scalar(data): + if index is not None: + if data is None: + data = np.nan + + if index is not None: + npoints = len(index) + elif sparse_index is None: + npoints = 1 + else: + npoints = sparse_index.length + + dtype = infer_dtype_from_scalar(data)[0] + data = construct_1d_arraylike_from_scalar(data, npoints, dtype) + + if dtype is not None: + dtype = pandas_dtype(dtype) + + # TODO: disentangle the fill_value dtype inference from + # dtype inference + if data is None: + # XXX: What should the empty dtype be? Object or float? + data = np.array([], dtype=dtype) + + if not is_array_like(data): + try: + # probably shared code in sanitize_series + + data = sanitize_array(data, index=None) + except ValueError: + # NumPy may raise a ValueError on data like [1, []] + # we retry with object dtype here. + if dtype is None: + dtype = object + data = np.atleast_1d(np.asarray(data, dtype=dtype)) + else: + raise + + if copy: + # TODO: avoid double copy when dtype forces cast. 
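# Editorial note (assumption, not in the pandas source): copy=True is
# honoured here, before `data` may be re-used below as the sp_values of
# the new array.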
+ data = data.copy() + + if fill_value is None: + fill_value_dtype = data.dtype if dtype is None else dtype + if fill_value_dtype is None: + fill_value = np.nan + else: + fill_value = na_value_for_dtype(fill_value_dtype) + + if isinstance(data, type(self)) and sparse_index is None: + sparse_index = data._sparse_index + sparse_values = np.asarray(data.sp_values, dtype=dtype) + elif sparse_index is None: + sparse_values, sparse_index, fill_value = make_sparse( + data, kind=kind, fill_value=fill_value, dtype=dtype + ) + else: + sparse_values = np.asarray(data, dtype=dtype) + if len(sparse_values) != sparse_index.npoints: + raise AssertionError( + f"Non array-like type {type(sparse_values)} must " + "have the same length as the index" + ) + self._sparse_index = sparse_index + self._sparse_values = sparse_values + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + + @classmethod + def _simple_new( + cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype + ) -> "SparseArray": + new = cls([]) + new._sparse_index = sparse_index + new._sparse_values = sparse_array + new._dtype = dtype + return new + + @classmethod + def from_spmatrix(cls, data): + """ + Create a SparseArray from a scipy.sparse matrix. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + data : scipy.sparse.sp_matrix + This should be a SciPy sparse matrix where the size + of the second dimension is 1. In other words, a + sparse matrix with a single column. + + Returns + ------- + SparseArray + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.coo_matrix((4, 1)) + >>> pd.arrays.SparseArray.from_spmatrix(mat) + [0.0, 0.0, 0.0, 0.0] + Fill: 0.0 + IntIndex + Indices: array([], dtype=int32) + """ + length, ncol = data.shape + + if ncol != 1: + raise ValueError(f"'data' must have a single column, not '{ncol}'") + + # our sparse index classes require that the positions be strictly + # increasing. So we need to sort loc, and arr accordingly. + arr = data.data + idx, _ = data.nonzero() + loc = np.argsort(idx) + arr = arr.take(loc) + idx.sort() + + zero = np.array(0, dtype=arr.dtype).item() + dtype = SparseDtype(arr.dtype, zero) + index = IntIndex(length, idx) + + return cls._simple_new(arr, index, dtype) + + def __array__(self, dtype=None, copy=True) -> np.ndarray: + fill_value = self.fill_value + + if self.sp_index.ngaps == 0: + # Compat for na dtype and int values. + return self.sp_values + if dtype is None: + # Can NumPy represent this type? + # If not, `np.result_type` will raise. We catch that + # and return object. + if is_datetime64_any_dtype(self.sp_values.dtype): + # However, we *do* special-case the common case of + # a datetime64 with pandas NaT. + if fill_value is NaT: + # Can't put pd.NaT in a datetime64[ns] + fill_value = np.datetime64("NaT") + try: + dtype = np.result_type(self.sp_values.dtype, type(fill_value)) + except TypeError: + dtype = object + + out = np.full(self.shape, fill_value, dtype=dtype) + out[self.sp_index.to_int_index().indices] = self.sp_values + return out + + def __setitem__(self, key, value): + # I suppose we could allow setting of non-fill_value elements. 
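# Editorial example (not in the pandas source): an assignment such as
#     arr = pd.arrays.SparseArray([0, 0, 1]); arr[0] = 2
# lands here and raises the TypeError below.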
+ # TODO(SparseArray.__setitem__): remove special cases in + # ExtensionBlock.where + msg = "SparseArray does not support item assignment via setitem" + raise TypeError(msg) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars, dtype=dtype) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, dtype=original.dtype) + + # ------------------------------------------------------------------------ + # Data + # ------------------------------------------------------------------------ + @property + def sp_index(self): + """ + The SparseIndex containing the location of non- ``fill_value`` points. + """ + return self._sparse_index + + @property + def sp_values(self): + """ + An ndarray containing the non- ``fill_value`` values. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0) + >>> s.sp_values + array([1, 2]) + """ + return self._sparse_values + + @property + def dtype(self): + return self._dtype + + @property + def fill_value(self): + """ + Elements in `data` that are `fill_value` are not stored. + + For memory savings, this should be the most common value in the array. + """ + return self.dtype.fill_value + + @fill_value.setter + def fill_value(self, value): + self._dtype = SparseDtype(self.dtype.subtype, value) + + @property + def kind(self) -> str: + """ + The kind of sparse index for this array. One of {'integer', 'block'}. + """ + if isinstance(self.sp_index, IntIndex): + return "integer" + else: + return "block" + + @property + def _valid_sp_values(self): + sp_vals = self.sp_values + mask = notna(sp_vals) + return sp_vals[mask] + + def __len__(self) -> int: + return self.sp_index.length + + @property + def _null_fill_value(self): + return self._dtype._is_na_fill_value + + def _fill_value_matches(self, fill_value): + if self._null_fill_value: + return isna(fill_value) + else: + return self.fill_value == fill_value + + @property + def nbytes(self) -> int: + return self.sp_values.nbytes + self.sp_index.nbytes + + @property + def density(self): + """ + The percent of non- ``fill_value`` points, as decimal. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.density + 0.6 + """ + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r + + @property + def npoints(self) -> int: + """ + The number of non- ``fill_value`` points. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.npoints + 3 + """ + return self.sp_index.npoints + + def isna(self): + # If null fill value, we want SparseDtype[bool, true] + # to preserve the same memory usage. + dtype = SparseDtype(bool, self._null_fill_value) + return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) + + def fillna(self, value=None, method=None, limit=None): + """ + Fill missing values with `value`. + + Parameters + ---------- + value : scalar, optional + method : str, optional + + .. warning:: + + Using 'method' will result in high memory use, + as all `fill_value` methods will be converted to + an in-memory ndarray + + limit : int, optional + + Returns + ------- + SparseArray + + Notes + ----- + When `value` is specified, the result's ``fill_value`` depends on + ``self.fill_value``. The goal is to maintain low-memory use. + + If ``self.fill_value`` is NA, the result dtype will be + ``SparseDtype(self.dtype, fill_value=value)``. This will preserve + amount of memory used before and after filling. 
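(Editorial illustration, not part of the pandas docstring: assuming a
NaN fill value, ``SparseArray([np.nan, 1.0]).fillna(0.0)`` still stores
a single value and ends up with dtype ``Sparse[float64, 0.0]``.)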
+ + When ``self.fill_value`` is not NA, the result dtype will be + ``self.dtype``. Again, this preserves the amount of memory used. + """ + if (method is None and value is None) or ( + method is not None and value is not None + ): + raise ValueError("Must specify one of 'method' or 'value'.") + + elif method is not None: + msg = "fillna with 'method' requires high memory usage." + warnings.warn(msg, PerformanceWarning) + filled = interpolate_2d(np.asarray(self), method=method, limit=limit) + return type(self)(filled, fill_value=self.fill_value) + + else: + new_values = np.where(isna(self.sp_values), value, self.sp_values) + + if self._null_fill_value: + # This is essentially just updating the dtype. + new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) + else: + new_dtype = self.dtype + + return self._simple_new(new_values, self._sparse_index, new_dtype) + + def shift(self, periods=1, fill_value=None): + + if not len(self) or periods == 0: + return self.copy() + + if isna(fill_value): + fill_value = self.dtype.na_value + + subtype = np.result_type(fill_value, self.dtype.subtype) + + if subtype != self.dtype.subtype: + # just coerce up front + arr = self.astype(SparseDtype(subtype, self.fill_value)) + else: + arr = self + + empty = self._from_sequence( + [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype + ) + + if periods > 0: + a = empty + b = arr[:-periods] + else: + a = arr[abs(periods) :] + b = empty + return arr._concat_same_type([a, b]) + + def _first_fill_value_loc(self): + """ + Get the location of the first missing value. + + Returns + ------- + int + """ + if len(self) == 0 or self.sp_index.npoints == len(self): + return -1 + + indices = self.sp_index.to_int_index().indices + if not len(indices) or indices[0] > 0: + return 0 + + diff = indices[1:] - indices[:-1] + return np.searchsorted(diff, 2) + 1 + + def unique(self): + uniques = list(algos.unique(self.sp_values)) + fill_loc = self._first_fill_value_loc() + if fill_loc >= 0: + uniques.insert(fill_loc, self.fill_value) + return type(self)._from_sequence(uniques, dtype=self.dtype) + + def _values_for_factorize(self): + # Still override this for hash_pandas_object + return np.asarray(self), self.fill_value + + def factorize(self, na_sentinel=-1): + # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] + # The sparsity on this is backwards from what Sparse would want. Want + # ExtensionArray.factorize -> Tuple[EA, EA] + # Given that we have to return a dense array of codes, why bother + # implementing an efficient factorize? + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + uniques = SparseArray(uniques, dtype=self.dtype) + return codes, uniques + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of unique values. + + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN, even if NaN is in sp_values. 
+ + Returns + ------- + counts : Series + """ + from pandas import Index, Series + + keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) + fcounts = self.sp_index.ngaps + if fcounts > 0: + if self._null_fill_value and dropna: + pass + else: + if self._null_fill_value: + mask = isna(keys) + else: + mask = keys == self.fill_value + + if mask.any(): + counts[mask] += fcounts + else: + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) + + if not isinstance(keys, ABCIndexClass): + keys = Index(keys) + result = Series(counts, index=keys) + return result + + # -------- + # Indexing + # -------- + + def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + + if isinstance(key, tuple): + if len(key) > 1: + raise IndexError("too many indices for array.") + key = key[0] + + if is_integer(key): + return self._get_val_at(key) + elif isinstance(key, tuple): + data_slice = self.to_dense()[key] + elif isinstance(key, slice): + # special case to preserve dtypes + if key == slice(None): + return self.copy() + # TODO: this logic is surely elsewhere + # TODO: this could be more efficient + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) + else: + # TODO: I think we can avoid densifying when masking a + # boolean SparseArray with another. Need to look at the + # key's fill_value for True / False, and then do an intersection + # on the indicies of the sp_values. + if isinstance(key, SparseArray): + if is_bool_dtype(key): + key = key.to_dense() + else: + key = np.asarray(key) + + key = check_array_indexer(self, key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + + return self.take(np.arange(len(key), dtype=np.int32)[key]) + elif hasattr(key, "__len__"): + return self.take(key) + else: + raise ValueError(f"Cannot slice with '{key}'") + + return type(self)(data_slice, kind=self.kind) + + def _get_val_at(self, loc): + n = len(self) + if loc < 0: + loc += n + + if loc >= n or loc < 0: + raise IndexError("Out of bounds access") + + sp_loc = self.sp_index.lookup(loc) + if sp_loc == -1: + return self.fill_value + else: + return libindex.get_value_at(self.sp_values, sp_loc) + + def take(self, indices, allow_fill=False, fill_value=None): + if is_scalar(indices): + raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") + indices = np.asarray(indices, dtype=np.int32) + + if indices.size == 0: + result = [] + kwargs = {"dtype": self.dtype} + elif allow_fill: + result = self._take_with_fill(indices, fill_value=fill_value) + kwargs = {} + else: + result = self._take_without_fill(indices) + kwargs = {"dtype": self.dtype} + + return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs) + + def _take_with_fill(self, indices, fill_value=None): + if fill_value is None: + fill_value = self.dtype.na_value + + if indices.min() < -1: + raise ValueError( + "Invalid value in 'indices'. Must be between -1 " + "and the length of the array." + ) + + if indices.max() >= len(self): + raise IndexError("out of bounds value in 'indices'.") + + if len(self) == 0: + # Empty... 
Allow taking only if all empty + if (indices == -1).all(): + dtype = np.result_type(self.sp_values, type(fill_value)) + taken = np.empty_like(indices, dtype=dtype) + taken.fill(fill_value) + return taken + else: + raise IndexError("cannot do a non-empty take from an empty axes.") + + sp_indexer = self.sp_index.lookup_array(indices) + + if self.sp_index.npoints == 0: + # Avoid taking from the empty self.sp_values + taken = np.full( + sp_indexer.shape, + fill_value=fill_value, + dtype=np.result_type(type(fill_value)), + ) + else: + taken = self.sp_values.take(sp_indexer) + + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices + + # Fill in two steps. + # Old fill values + # New fill values + # potentially coercing to a new dtype at each stage. + + m0 = sp_indexer[old_fill_indices] < 0 + m1 = sp_indexer[new_fill_indices] < 0 + + result_type = taken.dtype + + if m0.any(): + result_type = np.result_type(result_type, type(self.fill_value)) + taken = taken.astype(result_type) + taken[old_fill_indices] = self.fill_value + + if m1.any(): + result_type = np.result_type(result_type, type(fill_value)) + taken = taken.astype(result_type) + taken[new_fill_indices] = fill_value + + return taken + + def _take_without_fill(self, indices): + to_shift = indices < 0 + indices = indices.copy() + + n = len(self) + + if (indices.max() >= n) or (indices.min() < -n): + if n == 0: + raise IndexError("cannot do a non-empty take from an empty axes.") + else: + raise IndexError("out of bounds value in 'indices'.") + + if to_shift.any(): + indices[to_shift] += n + + if self.sp_index.npoints == 0: + # edge case in take... + # I think just return + out = np.full( + indices.shape, + self.fill_value, + dtype=np.result_type(type(self.fill_value)), + ) + arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) + return type(self)(arr, sparse_index=sp_index, fill_value=fill_value) + + sp_indexer = self.sp_index.lookup_array(indices) + taken = self.sp_values.take(sp_indexer) + fillable = sp_indexer < 0 + + if fillable.any(): + # TODO: may need to coerce array to fill value + result_type = np.result_type(taken, type(self.fill_value)) + taken = taken.astype(result_type) + taken[fillable] = self.fill_value + + return taken + + def searchsorted(self, v, side="left", sorter=None): + msg = "searchsorted requires high memory usage." + warnings.warn(msg, PerformanceWarning, stacklevel=2) + if not is_scalar(v): + v = np.asarray(v) + v = np.asarray(v) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) + + def copy(self): + values = self.sp_values.copy() + return self._simple_new(values, self.sp_index, self.dtype) + + @classmethod + def _concat_same_type(cls, to_concat): + fill_values = [x.fill_value for x in to_concat] + + fill_value = fill_values[0] + + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore tha all NA case too. + if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + warnings.warn( + "Concatenating sparse arrays with multiple fill " + f"values: '{fill_values}'. 
Picking the first and " + "converting the rest.", + PerformanceWarning, + stacklevel=6, + ) + keep = to_concat[0] + to_concat2 = [keep] + + for arr in to_concat[1:]: + to_concat2.append(cls(np.asarray(arr), fill_value=fill_value)) + + to_concat = to_concat2 + + values = [] + length = 0 + + if to_concat: + sp_kind = to_concat[0].kind + else: + sp_kind = "integer" + + if sp_kind == "integer": + indices = [] + + for arr in to_concat: + idx = arr.sp_index.to_int_index().indices.copy() + idx += length # TODO: wraparound + length += arr.sp_index.length + + values.append(arr.sp_values) + indices.append(idx) + + data = np.concatenate(values) + indices = np.concatenate(indices) + sp_index = IntIndex(length, indices) + + else: + # when concatenating block indices, we don't claim that you'll + # get an identical index as concating the values and then + # creating a new index. We don't want to spend the time trying + # to merge blocks across arrays in `to_concat`, so the resulting + # BlockIndex may have more blocs. + blengths = [] + blocs = [] + + for arr in to_concat: + idx = arr.sp_index.to_block_index() + + values.append(arr.sp_values) + blocs.append(idx.blocs.copy() + length) + blengths.append(idx.blengths) + length += arr.sp_index.length + + data = np.concatenate(values) + blocs = np.concatenate(blocs) + blengths = np.concatenate(blengths) + + sp_index = BlockIndex(length, blocs, blengths) + + return cls(data, sparse_index=sp_index, fill_value=fill_value) + + def astype(self, dtype=None, copy=True): + """ + Change the dtype of a SparseArray. + + The output will always be a SparseArray. To convert to a dense + ndarray with a certain dtype, use :meth:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + For SparseDtype, this changes the dtype of + ``self.sp_values`` and the ``self.fill_value``. + + For other dtypes, this only changes the dtype of + ``self.sp_values``. + + copy : bool, default True + Whether to ensure a copy is made, even if not necessary. + + Returns + ------- + SparseArray + + Examples + -------- + >>> arr = SparseArray([0, 0, 1, 2]) + >>> arr + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + >>> arr.astype(np.dtype('int32')) + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + Using a NumPy dtype with a different kind (e.g. float) will coerce + just ``self.sp_values``. + + >>> arr.astype(np.dtype('float64')) + ... # doctest: +NORMALIZE_WHITESPACE + [0, 0, 1.0, 2.0] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + Use a SparseDtype if you wish to be change the fill value as well. + + >>> arr.astype(SparseDtype("float64", fill_value=np.nan)) + ... # doctest: +NORMALIZE_WHITESPACE + [nan, nan, 1.0, 2.0] + Fill: nan + IntIndex + Indices: array([2, 3], dtype=int32) + """ + dtype = self.dtype.update_dtype(dtype) + subtype = dtype._subtype_with_str + sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) + if sp_values is self.sp_values and copy: + sp_values = sp_values.copy() + + return self._simple_new(sp_values, self.sp_index, dtype) + + def map(self, mapper): + """ + Map categories using input correspondence (dict, Series, or function). + + Parameters + ---------- + mapper : dict, Series, callable + The correspondence from old values to new. + + Returns + ------- + SparseArray + The output array will have the same density as the input. 
+ The output fill value will be the result of applying the + mapping to ``self.fill_value`` + + Examples + -------- + >>> arr = pd.arrays.SparseArray([0, 1, 2]) + >>> arr.apply(lambda x: x + 10) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.apply({0: 10, 1: 11, 2: 12}) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.apply(pd.Series([10, 11, 12], index=[0, 1, 2])) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + """ + # this is used in apply. + # We get hit since we're an "is_extension_type" but regular extension + # types are not hit. This may be worth adding to the interface. + if isinstance(mapper, ABCSeries): + mapper = mapper.to_dict() + + if isinstance(mapper, abc.Mapping): + fill_value = mapper.get(self.fill_value, self.fill_value) + sp_values = [mapper.get(x, None) for x in self.sp_values] + else: + fill_value = mapper(self.fill_value) + sp_values = [mapper(x) for x in self.sp_values] + + return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value) + + def to_dense(self): + """ + Convert SparseArray to a NumPy array. + + Returns + ------- + arr : NumPy array + """ + return np.asarray(self, dtype=self.sp_values.dtype) + + _internal_get_values = to_dense + + # ------------------------------------------------------------------------ + # IO + # ------------------------------------------------------------------------ + def __setstate__(self, state): + """Necessary for making this object picklable""" + if isinstance(state, tuple): + # Compat for pandas < 0.24.0 + nd_state, (fill_value, sp_index) = state + sparse_values = np.array([]) + sparse_values.__setstate__(nd_state) + + self._sparse_values = sparse_values + self._sparse_index = sp_index + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + else: + self.__dict__.update(state) + + def nonzero(self): + if self.fill_value == 0: + return (self.sp_index.to_int_index().indices,) + else: + return (self.sp_index.to_int_index().indices[self.sp_values != 0],) + + # ------------------------------------------------------------------------ + # Reductions + # ------------------------------------------------------------------------ + + def _reduce(self, name, skipna=True, **kwargs): + method = getattr(self, name, None) + + if method is None: + raise TypeError(f"cannot perform {name} with type {self.dtype}") + + if skipna: + arr = self + else: + arr = self.dropna() + + # we don't support these kwargs. + # They should only be present when called via pandas, so do it here. 
+ # instead of in `any` / `all` (which will raise if they're present, + # thanks to nv.validate + kwargs.pop("filter_type", None) + kwargs.pop("numeric_only", None) + kwargs.pop("op", None) + return getattr(arr, name)(**kwargs) + + def all(self, axis=None, *args, **kwargs): + """ + Tests whether all elements evaluate True + + Returns + ------- + all : bool + + See Also + -------- + numpy.all + """ + nv.validate_all(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and not np.all(self.fill_value): + return False + + return values.all() + + def any(self, axis=0, *args, **kwargs): + """ + Tests whether at least one of elements evaluate True + + Returns + ------- + any : bool + + See Also + -------- + numpy.any + """ + nv.validate_any(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and np.any(self.fill_value): + return True + + return values.any().item() + + def sum(self, axis=0, *args, **kwargs): + """ + Sum of non-NA/null values + + Returns + ------- + sum : float + """ + nv.validate_sum(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + if self._null_fill_value: + return sp_sum + else: + nsparse = self.sp_index.ngaps + return sp_sum + self.fill_value * nsparse + + def cumsum(self, axis=0, *args, **kwargs): + """ + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any non-NA/null values will + be skipped. The resulting SparseArray will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : int or None + Axis over which to perform the cumulative summation. If None, + perform cumulative summation over flattened array. + + Returns + ------- + cumsum : SparseArray + """ + nv.validate_cumsum(args, kwargs) + + if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. + raise ValueError(f"axis(={axis}) out of bounds") + + if not self._null_fill_value: + return SparseArray(self.to_dense()).cumsum() + + return SparseArray( + self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value, + ) + + def mean(self, axis=0, *args, **kwargs): + """ + Mean of non-NA/null values + + Returns + ------- + mean : float + """ + nv.validate_mean(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + ct = len(valid_vals) + + if self._null_fill_value: + return sp_sum / ct + else: + nsparse = self.sp_index.ngaps + return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + + def transpose(self, *axes): + """ + Returns the SparseArray. + """ + return self + + @property + def T(self): + """ + Returns the SparseArray. + """ + return self + + # ------------------------------------------------------------------------ + # Ufuncs + # ------------------------------------------------------------------------ + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if len(inputs) == 1: + # No alignment necessary. 
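# Editorial example (not in the pandas source): a unary call such as
# np.abs(SparseArray([-1, 0, 2], fill_value=0)) takes this branch; the
# ufunc is applied separately to sp_values and to the scalar fill_value.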
+ sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) + fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) + + if isinstance(sp_values, tuple): + # multiple outputs. e.g. modf + arrays = tuple( + self._simple_new( + sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) + ) + for sp_value, fv in zip(sp_values, fill_value) + ) + return arrays + elif is_scalar(sp_values): + # e.g. reductions + return sp_values + + return self._simple_new( + sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) + ) + + result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs) + if out: + if len(out) == 1: + out = out[0] + return out + + if type(result) is tuple: + return tuple(type(self)(x) for x in result) + elif method == "at": + # no return value + return None + else: + return type(self)(result) + + def __abs__(self): + return np.abs(self) + + # ------------------------------------------------------------------------ + # Ops + # ------------------------------------------------------------------------ + + @classmethod + def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]: + def sparse_unary_method(self) -> "SparseArray": + fill_value = op(np.array(self.fill_value)).item() + values = op(self.sp_values) + dtype = SparseDtype(values.dtype, fill_value) + return cls._simple_new(values, self.sp_index, dtype) + + name = f"__{op.__name__}__" + return compat.set_function_name(sparse_unary_method, name, cls) + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op_name) + def sparse_arithmetic_method(self, other): + + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) + + elif is_scalar(other): + with np.errstate(all="ignore"): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) + + if op_name == "divmod": + left, right = result + lfill, rfill = fill + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) + + return _wrap_result(op_name, result, self.sp_index, fill) + + else: + other = np.asarray(other) + with np.errstate(all="ignore"): + # TODO: look into _wrap_result + if len(self) != len(other): + raise AssertionError( + (f"length mismatch: {len(self)} vs. {len(other)}") + ) + if not isinstance(other, SparseArray): + dtype = getattr(other, "dtype", None) + other = SparseArray( + other, fill_value=self.fill_value, dtype=dtype + ) + return _sparse_array_op(self, other, op, op_name) + + name = f"__{op.__name__}__" + return compat.set_function_name(sparse_arithmetic_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + if op_name in {"and_", "or_"}: + op_name = op_name[:-1] + + @unpack_zerodim_and_defer(op_name) + def cmp_method(self, other): + + if not is_scalar(other) and not isinstance(other, type(self)): + # convert list-like to ndarray + other = np.asarray(other) + + if isinstance(other, np.ndarray): + # TODO: make this more flexible than just ndarray... + if len(self) != len(other): + raise AssertionError( + f"length mismatch: {len(self)} vs. 
{len(other)}" + ) + other = SparseArray(other, fill_value=self.fill_value) + + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) + else: + with np.errstate(all="ignore"): + fill_value = op(self.fill_value, other) + result = op(self.sp_values, other) + + return type(self)( + result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_, + ) + + name = f"__{op.__name__}__" + return compat.set_function_name(cmp_method, name, cls) + + @classmethod + def _add_unary_ops(cls): + cls.__pos__ = cls._create_unary_method(operator.pos) + cls.__neg__ = cls._create_unary_method(operator.neg) + cls.__invert__ = cls._create_unary_method(operator.invert) + + @classmethod + def _add_comparison_ops(cls): + cls.__and__ = cls._create_comparison_method(operator.and_) + cls.__or__ = cls._create_comparison_method(operator.or_) + cls.__xor__ = cls._create_arithmetic_method(operator.xor) + super()._add_comparison_ops() + + # ---------- + # Formatting + # ----------- + def __repr__(self) -> str: + pp_str = printing.pprint_thing(self) + pp_fill = printing.pprint_thing(self.fill_value) + pp_index = printing.pprint_thing(self.sp_index) + return f"{pp_str}\nFill: {pp_fill}\n{pp_index}" + + def _formatter(self, boxed=False): + # Defer to the formatter from the GenericArrayFormatter calling us. + # This will infer the correct formatter from the dtype of the values. + return None + + +SparseArray._add_arithmetic_ops() +SparseArray._add_comparison_ops() +SparseArray._add_unary_ops() + + +def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): + """ + Convert ndarray to sparse format + + Parameters + ---------- + arr : ndarray + kind : {'block', 'integer'} + fill_value : NaN or another value + dtype : np.dtype, optional + copy : bool, default False + + Returns + ------- + (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) + """ + + arr = com.values_from_object(arr) + + if arr.ndim > 1: + raise TypeError("expected dimension <= 1 data") + + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + + if isna(fill_value): + mask = notna(arr) + else: + # cast to object comparison to be safe + if is_string_dtype(arr): + arr = arr.astype(object) + + if is_object_dtype(arr.dtype): + # element-wise equality check method in numpy doesn't treat + # each element type, eg. 0, 0.0, and False are treated as + # same. So we have to check the both of its type and value. 
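# Editorial example (assumption, not in the pandas source): with
# fill_value == 0, an object array like np.array([0, 0.0, False, 1],
# dtype=object) should keep 0.0 and False as stored values, since only
# the int 0 matches the fill value in both type and value.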
+ mask = splib.make_mask_object_ndarray(arr, fill_value) + else: + mask = arr != fill_value + + length = len(arr) + if length != len(mask): + # the arr is a SparseArray + indices = mask.sp_index.indices + else: + indices = mask.nonzero()[0].astype(np.int32) + + index = _make_index(length, indices, kind) + sparsified_values = arr[mask] + if dtype is not None: + sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) + # TODO: copy + return sparsified_values, index, fill_value + + +def _make_index(length, indices, kind): + + if kind == "block" or isinstance(kind, BlockIndex): + locs, lens = splib.get_blocks(indices) + index = BlockIndex(length, locs, lens) + elif kind == "integer" or isinstance(kind, IntIndex): + index = IntIndex(length, indices) + else: # pragma: no cover + raise ValueError("must be block or integer type") + return index diff --git a/venv/Lib/site-packages/pandas/core/arrays/sparse/dtype.py b/venv/Lib/site-packages/pandas/core/arrays/sparse/dtype.py new file mode 100644 index 0000000..6f15681 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/sparse/dtype.py @@ -0,0 +1,348 @@ +"""Sparse Dtype""" + +import re +from typing import Any, Tuple + +import numpy as np + +from pandas._typing import Dtype + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna, na_value_for_dtype + + +@register_extension_dtype +class SparseDtype(ExtensionDtype): + """ + Dtype for data stored in :class:`SparseArray`. + + This dtype implements the pandas ExtensionDtype interface. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 + The dtype of the underlying array storing the non-fill value values. + fill_value : scalar, optional + The scalar value not stored in the SparseArray. By default, this + depends on `dtype`. + + =========== ========== + dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The default value may be overridden by specifying a `fill_value`. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + # We include `_is_na_fill_value` in the metadata to avoid hash collisions + # between SparseDtype(float, 0.0) and SparseDtype(float, nan). + # Without is_na_fill_value in the comparison, those would be equal since + # hash(nan) is (sometimes?) 0. + _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") + + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): + + if isinstance(dtype, type(self)): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype): + dtype = np.dtype("object") + + if fill_value is None: + fill_value = na_value_for_dtype(dtype) + + if not is_scalar(fill_value): + raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead") + self._dtype = dtype + self._fill_value = fill_value + + def __hash__(self): + # Python3 doesn't inherit __hash__ when a base class overrides + # __eq__, so we explicitly do it here. + return super().__hash__() + + def __eq__(self, other: Any) -> bool: + # We have to override __eq__ to handle NA values in _metadata. 
+ # The base class does simple == checks, which fail for NA. + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + + if isinstance(other, type(self)): + subtype = self.subtype == other.subtype + if self._is_na_fill_value: + # this case is complicated by two things: + # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) + # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) + # i.e. we want to treat any floating-point NaN as equal, but + # not a floating-point NaN and a datetime NaT. + fill_value = ( + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) + ) + else: + fill_value = self.fill_value == other.fill_value + + return subtype and fill_value + return False + + @property + def fill_value(self): + """ + The fill value of the array. + + Converting the SparseArray to a dense ndarray will fill the + array with this value. + + .. warning:: + + It's possible to end up with a SparseArray that has ``fill_value`` + values in ``sp_values``. This can occur, for example, when setting + ``SparseArray.fill_value`` directly. + """ + return self._fill_value + + @property + def _is_na_fill_value(self): + return isna(self.fill_value) + + @property + def _is_numeric(self): + return not is_object_dtype(self.subtype) + + @property + def _is_boolean(self): + return is_bool_dtype(self.subtype) + + @property + def kind(self): + """ + The sparse kind. Either 'integer', or 'block'. + """ + return self.subtype.kind + + @property + def type(self): + return self.subtype.type + + @property + def subtype(self): + return self._dtype + + @property + def name(self): + return f"Sparse[{self.subtype.name}, {self.fill_value}]" + + def __repr__(self) -> str: + return self.name + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.sparse.array import SparseArray + + return SparseArray + + @classmethod + def construct_from_string(cls, string): + """ + Construct a SparseDtype from a string form. + + Parameters + ---------- + string : str + Can take the following forms. + + string dtype + ================ ============================ + 'int' SparseDtype[np.int64, 0] + 'Sparse' SparseDtype[np.float64, nan] + 'Sparse[int]' SparseDtype[np.int64, 0] + 'Sparse[int, 0]' SparseDtype[np.int64, 0] + ================ ============================ + + It is not possible to specify non-default fill values + with a string. An argument like ``'Sparse[int, 1]'`` + will raise a ``TypeError`` because the default fill value + for integers is 0. + + Returns + ------- + SparseDtype + """ + msg = f"Cannot construct a 'SparseDtype' from '{string}'" + if string.startswith("Sparse"): + try: + sub_type, has_fill_value = cls._parse_subtype(string) + except ValueError: + raise TypeError(msg) + else: + result = SparseDtype(sub_type) + msg = ( + f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead." 
+ ) + if has_fill_value and str(result) != string: + raise TypeError(msg) + return result + else: + raise TypeError(msg) + + @staticmethod + def _parse_subtype(dtype: str) -> Tuple[str, bool]: + """ + Parse a string to get the subtype + + Parameters + ---------- + dtype : str + A string like + + * Sparse[subtype] + * Sparse[subtype, fill_value] + + Returns + ------- + subtype : str + + Raises + ------ + ValueError + When the subtype cannot be extracted. + """ + xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") + m = xpr.match(dtype) + has_fill_value = False + if m: + subtype = m.groupdict()["subtype"] + has_fill_value = bool(m.groupdict()["fill_value"]) + elif dtype == "Sparse": + subtype = "float64" + else: + raise ValueError(f"Cannot parse {dtype}") + return subtype, has_fill_value + + @classmethod + def is_dtype(cls, dtype): + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str) and dtype.startswith("Sparse"): + sub_type, _ = cls._parse_subtype(dtype) + dtype = np.dtype(sub_type) + elif isinstance(dtype, cls): + return True + return isinstance(dtype, np.dtype) or dtype == "Sparse" + + def update_dtype(self, dtype): + """ + Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the correct `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype). + + + Examples + -------- + >>> SparseDtype(int, 0).update_dtype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. in ``.astype``, we need to + be more specific, we need the actual underlying type. + + Returns + ------- + + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + str + """ + if isinstance(self.fill_value, str): + return type(self.fill_value) + return self.subtype diff --git a/venv/Lib/site-packages/pandas/core/arrays/sparse/scipy_sparse.py b/venv/Lib/site-packages/pandas/core/arrays/sparse/scipy_sparse.py new file mode 100644 index 0000000..88d6307 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/sparse/scipy_sparse.py @@ -0,0 +1,146 @@ +""" +Interaction with scipy.sparse matrices. + +Currently only includes to_coo helpers. 
+""" +from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.series import Series + + +def _check_is_partition(parts, whole): + whole = set(whole) + parts = [set(x) for x in parts] + if set.intersection(*parts) != set(): + raise ValueError("Is not a partition because intersection is not null.") + if set.union(*parts) != whole: + raise ValueError("Is not a partition because union is not the whole.") + + +def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): + """ For arbitrary (MultiIndexed) SparseSeries return + (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for + passing to scipy.sparse.coo constructor. """ + # index and column levels must be a partition of the index + _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) + + # from the SparseSeries: get the labels and data for non-null entries + values = ss._data.internal_values()._valid_sp_values + + nonnull_labels = ss.dropna() + + def get_indexers(levels): + """ Return sparse coords and dense labels for subset levels """ + + # TODO: how to do this better? cleanly slice nonnull_labels given the + # coord + values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] + if len(levels) == 1: + values_ilabels = [x[0] for x in values_ilabels] + + # # performance issues with groupby ################################### + # TODO: these two lines can replace the code below but + # groupby is too slow (in some cases at least) + # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first() + # labels_to_i[:] = np.arange(labels_to_i.shape[0]) + + def _get_label_to_i_dict(labels, sort_labels=False): + """ Return dict of unique labels to number. + Optionally sort by label. + """ + labels = Index(map(tuple, labels)).unique().tolist() # squish + if sort_labels: + labels = sorted(labels) + return {k: i for i, k in enumerate(labels)} + + def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): + ilabels = list(zip(*[index._get_level_values(i) for i in subset])) + labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) + labels_to_i = Series(labels_to_i) + if len(subset) > 1: + labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) + labels_to_i.index.names = [index.names[i] for i in subset] + else: + labels_to_i.index = Index(x[0] for x in labels_to_i.index) + labels_to_i.index.name = index.names[subset[0]] + + labels_to_i.name = "value" + return labels_to_i + + labels_to_i = _get_index_subset_to_coord_dict( + ss.index, levels, sort_labels=sort_labels + ) + # ##################################################################### + # ##################################################################### + + i_coord = labels_to_i[values_ilabels].tolist() + i_labels = labels_to_i.index.tolist() + + return i_coord, i_labels + + i_coord, i_labels = get_indexers(row_levels) + j_coord, j_labels = get_indexers(column_levels) + + return values, i_coord, j_coord, i_labels, j_labels + + +def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): + """ + Convert a SparseSeries to a scipy.sparse.coo_matrix using index + levels row_levels, column_levels as the row and column + labels respectively. Returns the sparse_matrix, row and column labels. + """ + + import scipy.sparse + + if ss.index.nlevels < 2: + raise ValueError("to_coo requires MultiIndex with nlevels > 2") + if not ss.index.is_unique: + raise ValueError( + "Duplicate index entries are not allowed in to_coo transformation." 
+ ) + + # to keep things simple, only rely on integer indexing (not labels) + row_levels = [ss.index._get_level_number(x) for x in row_levels] + column_levels = [ss.index._get_level_number(x) for x in column_levels] + + v, i, j, rows, columns = _to_ijv( + ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels + ) + sparse_matrix = scipy.sparse.coo_matrix( + (v, (i, j)), shape=(len(rows), len(columns)) + ) + return sparse_matrix, rows, columns + + +def _coo_to_sparse_series(A, dense_index: bool = False): + """ + Convert a scipy.sparse.coo_matrix to a SparseSeries. + + Parameters + ---------- + A : scipy.sparse.coo.coo_matrix + dense_index : bool, default False + + Returns + ------- + Series + + Raises + ------ + TypeError if A is not a coo_matrix + """ + from pandas import SparseDtype + + try: + s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) + except AttributeError: + raise TypeError(f"Expected coo_matrix. Got {type(A).__name__} instead.") + s = s.sort_index() + s = s.astype(SparseDtype(s.dtype)) + if dense_index: + # is there a better constructor method to use here? + i = range(A.shape[0]) + j = range(A.shape[1]) + ind = MultiIndex.from_product([i, j]) + s = s.reindex(ind) + return s diff --git a/venv/Lib/site-packages/pandas/core/arrays/string_.py b/venv/Lib/site-packages/pandas/core/arrays/string_.py new file mode 100644 index 0000000..c485d1f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/string_.py @@ -0,0 +1,321 @@ +import operator +from typing import Type + +import numpy as np + +from pandas._libs import lib, missing as libmissing + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like + +from pandas import compat +from pandas.core import ops +from pandas.core.arrays import PandasArray +from pandas.core.construction import extract_array +from pandas.core.missing import isna + + +@register_extension_dtype +class StringDtype(ExtensionDtype): + """ + Extension dtype for string data. + + .. versionadded:: 1.0.0 + + .. warning:: + + StringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + In particular, StringDtype.na_value may change to no longer be + ``numpy.nan``. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.StringDtype() + StringDtype + """ + + name = "string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type: + return str + + @classmethod + def construct_array_type(cls) -> "Type[StringArray]": + return StringArray + + def __repr__(self) -> str: + return "StringDtype" + + def __from_arrow__(self, array): + """Construct StringArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is converted to NA + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) + + return StringArray._concat_same_type(results) + + +class StringArray(PandasArray): + """ + Extension array for string data. + + .. versionadded:: 1.0.0 + + .. warning:: + + StringArray is considered experimental. 
The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : array-like + The array of data. + + .. warning:: + + Currently, this expects an object-dtype ndarray + where the elements are Python strings or :attr:`pandas.NA`. + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + + copy : bool, default False + Whether to copy the array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a StringArray. + Series.str + The string methods are available on Series backed by + a StringArray. + + Notes + ----- + StringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: string + + Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string + values. + + >>> pd.array(['1', 1], dtype="string") + Traceback (most recent call last): + ... + ValueError: StringArray requires an object-dtype ndarray of strings. + + For comparison methods, this returns a :class:`pandas.BooleanArray` + + >>> pd.array(["a", None, "c"], dtype="string") == "a" + + [True, , False] + Length: 3, dtype: boolean + """ + + # undo the PandasArray hack + _typ = "extension" + + def __init__(self, values, copy=False): + values = extract_array(values) + skip_validation = isinstance(values, type(self)) + + super().__init__(values, copy=copy) + self._dtype = StringDtype() + if not skip_validation: + self._validate() + + def _validate(self): + """Validate that we only store NA or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") + if self._ndarray.dtype != "object": + raise ValueError( + "StringArray requires a sequence of strings or pandas.NA. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if dtype: + assert dtype == "string" + + result = np.asarray(scalars, dtype="object") + if copy and result is scalars: + result = result.copy() + + # Standardize all missing-like values to NA + # TODO: it would be nice to do this in _validate / lib.is_string_array + # We are already doing a scan over the values there. + na_values = isna(result) + if na_values.any(): + if result is scalars: + # force a copy now, if we haven't already + result = result.copy() + result[na_values] = StringDtype.na_value + + return cls(result) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow as pa + + if type is None: + type = pa.string() + + values = self._ndarray.copy() + values[self.isna()] = None + return pa.array(values, type=type, from_pandas=True) + + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + if isinstance(value, type(self)): + # extract_array doesn't extract PandasArray subclasses + value = value._ndarray + + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items + if scalar_value: + if isna(value): + value = StringDtype.na_value + elif not isinstance(value, str): + raise ValueError( + f"Cannot set non-string value '{value}' into a StringArray." + ) + else: + if not is_array_like(value): + value = np.asarray(value, dtype=object) + if len(value) and not lib.is_string_array(value, skipna=True): + raise ValueError("Must provide strings.") + + super().__setitem__(key, value) + + def fillna(self, value=None, method=None, limit=None): + # TODO: validate dtype + return super().fillna(value, method, limit) + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if isinstance(dtype, StringDtype): + if copy: + return self.copy() + return self + return super().astype(dtype, copy) + + def _reduce(self, name, skipna=True, **kwargs): + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + def value_counts(self, dropna=False): + from pandas import value_counts + + return value_counts(self._ndarray, dropna=dropna).astype("Int64") + + # Overrride parent because we have different return types. + @classmethod + def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. 
+ def method(self, other): + from pandas.arrays import BooleanArray + + assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS + + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): + return NotImplemented + + elif isinstance(other, cls): + other = other._ndarray + + mask = isna(self) | isna(other) + valid = ~mask + + if not lib.is_scalar(other): + if len(other) != len(self): + # prevent improper broadcasting when other is 2D + raise ValueError( + f"Lengths of operands do not match: {len(self)} != {len(other)}" + ) + + other = np.asarray(other) + other = other[valid] + + if op.__name__ in ops.ARITHMETIC_BINOPS: + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = StringDtype.na_value + result[valid] = op(self._ndarray[valid], other) + return StringArray(result) + else: + # logical + result = np.zeros(len(self._ndarray), dtype="bool") + result[valid] = op(self._ndarray[valid], other) + return BooleanArray(result, mask) + + return compat.set_function_name(method, f"__{op.__name__}__", cls) + + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) + + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) + + _create_comparison_method = _create_arithmetic_method + + +StringArray._add_arithmetic_ops() +StringArray._add_comparison_ops() diff --git a/venv/Lib/site-packages/pandas/core/arrays/timedeltas.py b/venv/Lib/site-packages/pandas/core/arrays/timedeltas.py new file mode 100644 index 0000000..c34d14f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/arrays/timedeltas.py @@ -0,0 +1,1090 @@ +from datetime import timedelta +from typing import List + +import numpy as np + +from pandas._libs import lib, tslibs +from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT +from pandas._libs.tslibs.fields import get_timedelta_field +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, + parse_timedelta_unit, + precision_from_unit, +) +from pandas.compat.numpy import function as nv + +from pandas.core.dtypes.common import ( + _NS_DTYPE, + _TD_DTYPE, + is_dtype_equal, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCSeries, + ABCTimedeltaIndex, +) +from pandas.core.dtypes.missing import isna + +from pandas.core import nanops +from pandas.core.algorithms import checked_add_with_arr +from pandas.core.arrays import datetimelike as dtl +import pandas.core.common as com + +from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import Tick + +_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" + + +def _is_convertible_to_td(key): + return isinstance(key, (Tick, timedelta, np.timedelta64, str)) + + +def _field_accessor(name, alias, docstring=None): + def f(self): + values = self.asi8 + result = get_timedelta_field(values, alias) + if self._hasnans: + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) + + return result + + f.__name__ = name + f.__doc__ = f"\n{docstring}\n" + return property(f) + + +class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): + """ + Pandas ExtensionArray for timedelta data. + + .. versionadded:: 0.24.0 + + .. 
warning:: + + TimedeltaArray is currently experimental, and its API may change + without warning. In particular, :attr:`TimedeltaArray.dtype` is + expected to change to be an instance of an ``ExtensionDtype`` + subclass. + + Parameters + ---------- + values : array-like + The timedelta data. + + dtype : numpy.dtype + Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted. + freq : Offset, optional + copy : bool, default False + Whether to copy the underlying array of data. + + Attributes + ---------- + None + + Methods + ------- + None + """ + + _typ = "timedeltaarray" + _scalar_type = Timedelta + _recognized_scalars = (timedelta, np.timedelta64, Tick) + _is_recognized_dtype = is_timedelta64_dtype + + __array_priority__ = 1000 + # define my properties & methods for delegation + _other_ops: List[str] = [] + _bool_ops: List[str] = [] + _object_ops = ["freq"] + _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] + _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _datetimelike_methods = [ + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] + + # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) + # operates pointwise. + + @property + def _box_func(self): + return lambda x: Timedelta(x, unit="ns") + + @property + def dtype(self): + """ + The dtype for the TimedeltaArray. + + .. warning:: + + A future version of pandas will change dtype to be an instance + of a :class:`pandas.api.extensions.ExtensionDtype` subclass, + not a ``numpy.dtype``. + + Returns + ------- + numpy.dtype + """ + return _TD_DTYPE + + # ---------------------------------------------------------------- + # Constructors + + def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): + if isinstance(values, (ABCSeries, ABCIndexClass)): + values = values._values + + inferred_freq = getattr(values, "_freq", None) + + if isinstance(values, type(self)): + if freq is None: + freq = values.freq + elif freq and values.freq: + freq = to_offset(freq) + freq, _ = dtl.validate_inferred_freq(freq, values.freq, False) + values = values._data + + if not isinstance(values, np.ndarray): + msg = ( + f"Unexpected type '{type(values).__name__}'. 'values' must be a " + "TimedeltaArray ndarray, or Series or Index containing one of those." + ) + raise ValueError(msg) + if values.ndim not in [1, 2]: + raise ValueError("Only 1-dimensional input arrays are supported.") + + if values.dtype == "i8": + # for compat with datetime/timedelta/period shared methods, + # we can sometimes get here with int64 values. These represent + # nanosecond UTC (or tz-naive) unix timestamps + values = values.view(_TD_DTYPE) + + _validate_td64_dtype(values.dtype) + dtype = _validate_td64_dtype(dtype) + + if freq == "infer": + msg = ( + "Frequency inference not allowed in TimedeltaArray.__init__. " + "Use 'pd.array()' instead." 
+ ) + raise ValueError(msg) + + if copy: + values = values.copy() + if freq: + freq = to_offset(freq) + + self._data = values + self._dtype = dtype + self._freq = freq + + if inferred_freq is None and freq is not None: + type(self)._validate_frequency(self, freq) + + @classmethod + def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): + assert dtype == _TD_DTYPE, dtype + assert isinstance(values, np.ndarray), type(values) + + result = object.__new__(cls) + result._data = values.view(_TD_DTYPE) + result._freq = to_offset(freq) + result._dtype = _TD_DTYPE + return result + + @classmethod + def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, freq=None, unit=None): + if dtype: + _validate_td64_dtype(dtype) + freq, freq_infer = dtl.maybe_infer_freq(freq) + + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) + + result = cls._simple_new(data, freq=freq) + + if inferred_freq is None and freq is not None: + # this condition precludes `freq_infer` + cls._validate_frequency(result, freq) + + elif freq_infer: + # Set _freq directly to bypass duplicative _validate_frequency + # check. + result._freq = to_offset(result.inferred_freq) + + return result + + @classmethod + def _generate_range(cls, start, end, periods, freq, closed=None): + + periods = dtl.validate_periods(periods) + if freq is None and any(x is None for x in [periods, start, end]): + raise ValueError("Must provide freq argument if no data is supplied") + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + + if start is not None: + start = Timedelta(start) + + if end is not None: + end = Timedelta(end) + + if start is None and end is None: + if closed is not None: + raise ValueError( + "Closed has to be None if not both of startand end are defined" + ) + + left_closed, right_closed = dtl.validate_endpoints(closed) + + if freq is not None: + index = _generate_regular_range(start, end, periods, freq) + else: + index = np.linspace(start.value, end.value, periods).astype("i8") + + if not left_closed: + index = index[1:] + if not right_closed: + index = index[:-1] + + return cls._simple_new(index, freq=freq) + + # ---------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timedelta.") + self._check_compatible_with(value) + return value.value + + def _scalar_from_string(self, value): + return Timedelta(value) + + def _check_compatible_with(self, other, setitem: bool = False): + # we don't have anything to validate. + pass + + def _maybe_clear_freq(self): + self._freq = None + + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + def astype(self, dtype, copy=True): + # We handle + # --> timedelta64[ns] + # --> timedelta64 + # DatetimeLikeArrayMixin super call handles other cases + dtype = pandas_dtype(dtype) + + if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): + # by pandas convention, converting to non-nano timedelta64 + # returns an int64-dtyped array with ints representing multiples + # of the desired timedelta unit. 
This is essentially division + if self._hasnans: + # avoid double-copying + result = self._data.astype(dtype, copy=False) + values = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) + return values + result = self._data.astype(dtype, copy=copy) + return result.astype("i8") + elif is_timedelta64_ns_dtype(dtype): + if copy: + return self.copy() + return self + return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + + # ---------------------------------------------------------------- + # Reductions + + def sum( + self, + axis=None, + dtype=None, + out=None, + keepdims: bool = False, + initial=None, + skipna: bool = True, + min_count: int = 0, + ): + nv.validate_sum( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + if not len(self): + return NaT + if not skipna and self._hasnans: + return NaT + + result = nanops.nansum( + self._data, axis=axis, skipna=skipna, min_count=min_count + ) + return Timedelta(result) + + def std( + self, + axis=None, + dtype=None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + ) + if not len(self): + return NaT + if not skipna and self._hasnans: + return NaT + + result = nanops.nanstd(self._data, axis=axis, skipna=skipna, ddof=ddof) + return Timedelta(result) + + def median( + self, + axis=None, + out=None, + overwrite_input: bool = False, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_median( + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + ) + return nanops.nanmedian(self._data, axis=axis, skipna=skipna) + + # ---------------------------------------------------------------- + # Rendering Methods + + def _formatter(self, boxed=False): + from pandas.io.formats.format import _get_format_timedelta64 + + return _get_format_timedelta64(self, box=True) + + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): + from pandas.io.formats.format import _get_format_timedelta64 + + formatter = _get_format_timedelta64(self._data, na_rep) + return np.array([formatter(x) for x in self._data]) + + # ---------------------------------------------------------------- + # Arithmetic Methods + + def _add_offset(self, other): + assert not isinstance(other, Tick) + raise TypeError( + f"cannot add the type {type(other).__name__} to a {type(self).__name__}" + ) + + def _add_delta(self, delta): + """ + Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self, yielding a new TimedeltaArray. + + Parameters + ---------- + other : {timedelta, np.timedelta64, Tick, + TimedeltaIndex, ndarray[timedelta64]} + + Returns + ------- + result : TimedeltaArray + """ + new_values = super()._add_delta(delta) + return type(self)._from_sequence(new_values, freq="infer") + + def _add_datetime_arraylike(self, other): + """ + Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. 
+ """ + if isinstance(other, np.ndarray): + # At this point we have already checked that dtype is datetime64 + from pandas.core.arrays import DatetimeArray + + other = DatetimeArray(other) + + # defer to implementation in DatetimeArray + return other + self + + def _add_datetimelike_scalar(self, other): + # adding a timedeltaindex to a datetimelike + from pandas.core.arrays import DatetimeArray + + assert other is not NaT + other = Timestamp(other) + if other is NaT: + # In this case we specifically interpret NaT as a datetime, not + # the timedelta interpretation we would get by returning self + NaT + result = self.asi8.view("m8[ms]") + NaT.to_datetime64() + return DatetimeArray(result) + + i8 = self.asi8 + result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) + result = self._maybe_mask_results(result) + dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE + return DatetimeArray(result, dtype=dtype, freq=self.freq) + + def _addsub_object_array(self, other, op): + # Add or subtract Array-like of objects + try: + # TimedeltaIndex can only operate with a subset of DateOffset + # subclasses. Incompatible classes will raise AttributeError, + # which we re-raise as TypeError + return super()._addsub_object_array(other, op) + except AttributeError: + raise TypeError( + f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" + ) + + def __mul__(self, other): + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + return NotImplemented + + if is_scalar(other): + # numpy will accept float and int, raise TypeError for others + result = self._data * other + freq = None + if self.freq is not None and not isna(other): + freq = self.freq * other + return type(self)(result, freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self) and not is_timedelta64_dtype(other): + # Exclude timedelta64 here so we correctly raise TypeError + # for that instead of ValueError + raise ValueError("Cannot multiply with unequal lengths") + + if is_object_dtype(other.dtype): + # this multiplication will succeed only if all elements of other + # are int or float scalars, so we will end up with + # timedelta64[ns]-dtyped result + result = [self[n] * other[n] for n in range(len(self))] + result = np.array(result) + return type(self)(result) + + # numpy will accept float or int dtype, raise TypeError for others + result = self._data * other + return type(self)(result) + + __rmul__ = __mul__ + + def __truediv__(self, other): + # timedelta / X is well-defined for timedelta-like or numeric X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return self._data / other + + elif lib.is_scalar(other): + # assume it is numeric + result = self._data / other + freq = None + if self.freq is not None: + # Tick division is not implemented, so operate on Timedelta + freq = self.freq.delta / other + return type(self)(result, freq=freq) + + if not hasattr(other, "dtype"): + # e.g. 
list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other.dtype): + # let numpy handle it + return self._data / other + + elif is_object_dtype(other.dtype): + # Note: we do not do type inference on the result, so either + # an object array or numeric-dtyped (if numpy does inference) + # will be returned. GH#23829 + result = [self[n] / other[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + result = self._data / other + return type(self)(result) + + def __rtruediv__(self, other): + # X / timedelta is defined only for timedelta-like X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return other / self._data + + elif lib.is_scalar(other): + raise TypeError( + f"Cannot divide {type(other).__name__} by {type(self).__name__}" + ) + + if not hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other.dtype): + # let numpy handle it + return other / self._data + + elif is_object_dtype(other.dtype): + # Note: unlike in __truediv__, we do not _need_ to do type + # inference on the result. It does not raise, a numeric array + # is returned. GH#23829 + result = [other[n] / self[n] for n in range(len(self))] + return np.array(result) + + else: + raise TypeError( + f"Cannot divide {other.dtype} data by {type(self).__name__}" + ) + + def __floordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__rfloordiv__(self._data) + return result + + # at this point we should only have numeric scalars; anything + # else will raise + result = self.asi8 // other + result[self._isnan] = iNaT + freq = None + if self.freq is not None: + # Note: freq gets division, not floor-division + freq = self.freq / other + return type(self)(result.view("m8[ns]"), freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other.dtype): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = self.asi8 // other.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other.dtype): + result = [self[n] // other[n] for n in range(len(self))] + result = np.array(result) + if lib.infer_dtype(result, skipna=False) == "timedelta": + result, _ = sequence_to_td64ns(result) + return type(self)(result) + return result + + elif is_integer_dtype(other.dtype) or 
is_float_dtype(other.dtype): + result = self._data // other + return type(self)(result) + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}") + + def __rfloordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__floordiv__(self._data) + return result + + raise TypeError( + f"Cannot divide {type(other).__name__} by {type(self).__name__}" + ) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other.dtype): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = other.asi8 // self.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other.dtype): + result = [other[n] // self[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}") + + def __mod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return self - (self // other) * other + + def __rmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return other - (other // self) * self + + def __divmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = self // other + res2 = self - res1 * other + return res1, res2 + + def __rdivmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = other // self + res2 = other - res1 * self + return res1, res2 + + def __neg__(self): + if self.freq is not None: + return type(self)(-self._data, freq=-self.freq) + return type(self)(-self._data) + + def __pos__(self): + return type(self)(self._data, freq=self.freq) + + def __abs__(self): + # Note: freq is not preserved + return type(self)(np.abs(self._data)) + + # ---------------------------------------------------------------- + # Conversion Methods - Vectorized analogues of Timedelta methods + + def 
total_seconds(self): + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaArray, TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + seconds : [ndarray, Float64Index, Series] + When the calling object is a TimedeltaArray, the return type + is ndarray. When the calling object is a TimedeltaIndex, + the return type is a Float64Index. When the calling object + is a Series, the return type is Series of type `float64` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. + + Examples + -------- + **Series** + + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d')) + >>> s + 0 0 days + 1 1 days + 2 2 days + 3 3 days + 4 4 days + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = pd.to_timedelta(np.arange(5), unit='d') + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Float64Index([0.0, 86400.0, 172800.0, 259200.00000000003, 345600.0], + dtype='float64') + """ + return self._maybe_mask_results(1e-9 * self.asi8, fill_value=None) + + def to_pytimedelta(self): + """ + Return Timedelta Array/Index as object ndarray of datetime.timedelta + objects. + + Returns + ------- + datetimes : ndarray + """ + return tslibs.ints_to_pytimedelta(self.asi8) + + days = _field_accessor("days", "days", "Number of days for each element.") + seconds = _field_accessor( + "seconds", + "seconds", + "Number of seconds (>= 0 and less than 1 day) for each element.", + ) + microseconds = _field_accessor( + "microseconds", + "microseconds", + "Number of microseconds (>= 0 and less than 1 second) for each element.", + ) + nanoseconds = _field_accessor( + "nanoseconds", + "nanoseconds", + "Number of nanoseconds (>= 0 and less than 1 microsecond) for each element.", + ) + + @property + def components(self): + """ + Return a dataframe of the components (days, hours, minutes, + seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. + + Returns + ------- + a DataFrame + """ + from pandas import DataFrame + + columns = [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + hasnans = self._hasnans + if hasnans: + + def f(x): + if isna(x): + return [np.nan] * len(columns) + return x.components + + else: + + def f(x): + return x.components + + result = DataFrame([f(x) for x in self], columns=columns) + if not hasnans: + result = result.astype("int64") + return result + + +# --------------------------------------------------------------------- +# Constructor Helpers + + +def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): + """ + Parameters + ---------- + array : list-like + copy : bool, default False + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. + + Returns + ------- + converted : numpy.ndarray + The sequence converted to a numpy array with dtype ``timedelta64[ns]``. + inferred_freq : Tick or None + The inferred frequency of the sequence. 
+ + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + inferred_freq = None + unit = parse_timedelta_unit(unit) + + # Unwrap whatever we have into a np.ndarray + if not hasattr(data, "dtype"): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. generator + data = list(data) + data = np.array(data, copy=False) + elif isinstance(data, ABCSeries): + data = data._values + elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): + inferred_freq = data.freq + data = data._data + + # Convert whatever we have into timedelta64[ns] dtype + if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): + # no need to make a copy, need to convert if string-dtyped + data = objects_to_td64ns(data, unit=unit, errors=errors) + copy = False + + elif is_integer_dtype(data.dtype): + # treat as multiples of the given unit + data, copy_made = ints_to_td64ns(data, unit=unit) + copy = copy and not copy_made + + elif is_float_dtype(data.dtype): + # cast the unit, multiply base/frace separately + # to avoid precision issues from float -> int + mask = np.isnan(data) + m, p = precision_from_unit(unit) + base = data.astype(np.int64) + frac = data - base + if p: + frac = np.round(frac, p) + data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + data[mask] = iNaT + copy = False + + elif is_timedelta64_dtype(data.dtype): + if data.dtype != _TD_DTYPE: + # non-nano unit + # TODO: watch out for overflows + data = data.astype(_TD_DTYPE) + copy = False + + else: + # This includes datetime64-dtype, see GH#23539, GH#29794 + raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") + + data = np.array(data, copy=copy) + + assert data.dtype == "m8[ns]", data + return data, inferred_freq + + +def ints_to_td64ns(data, unit="ns"): + """ + Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating + the integers as multiples of the given timedelta unit. + + Parameters + ---------- + data : numpy.ndarray with integer-dtype + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + bool : whether a copy was made + """ + copy_made = False + unit = unit if unit is not None else "ns" + + if data.dtype != np.int64: + # converting to int64 makes a copy, so we can avoid + # re-copying later + data = data.astype(np.int64) + copy_made = True + + if unit != "ns": + dtype_str = f"timedelta64[{unit}]" + data = data.view(dtype_str) + + # TODO: watch out for overflows when converting from lower-resolution + data = data.astype("timedelta64[ns]") + # the astype conversion makes a copy, so we can avoid re-copying later + copy_made = True + + else: + data = data.view("timedelta64[ns]") + + return data, copy_made + + +def objects_to_td64ns(data, unit="ns", errors="raise"): + """ + Convert a object-dtyped or string-dtyped array into an + timedelta64[ns]-dtyped array. + + Parameters + ---------- + data : ndarray or Index + unit : str, default "ns" + The timedelta unit to treat integers as multiples of. + errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. 
+ + Returns + ------- + numpy.ndarray : timedelta64[ns] array converted from data + + Raises + ------ + ValueError : Data cannot be converted to timedelta64[ns]. + + Notes + ----- + Unlike `pandas.to_timedelta`, if setting `errors=ignore` will not cause + errors to be ignored; they are caught and subsequently ignored at a + higher level. + """ + # coerce Index to np.ndarray, converting string-dtype if necessary + values = np.array(data, dtype=np.object_, copy=False) + + result = array_to_timedelta64(values, unit=unit, errors=errors) + return result.view("timedelta64[ns]") + + +def _validate_td64_dtype(dtype): + dtype = pandas_dtype(dtype) + if is_dtype_equal(dtype, np.dtype("timedelta64")): + # no precision disallowed GH#24806 + msg = ( + "Passing in 'timedelta' dtype with no precision is not allowed. " + "Please pass in 'timedelta64[ns]' instead." + ) + raise ValueError(msg) + + if not is_dtype_equal(dtype, _TD_DTYPE): + raise ValueError(_BAD_DTYPE.format(dtype=dtype)) + + return dtype + + +def _generate_regular_range(start, end, periods, offset): + stride = offset.nanos + if periods is None: + b = Timedelta(start).value + e = Timedelta(end).value + e += stride - e % stride + elif start is not None: + b = Timedelta(start).value + e = b + periods * stride + elif end is not None: + e = Timedelta(end).value + stride + b = e - periods * stride + else: + raise ValueError( + "at least 'start' or 'end' should be specified if a 'period' is given." + ) + + data = np.arange(b, e, stride, dtype=np.int64) + return data diff --git a/venv/Lib/site-packages/pandas/core/base.py b/venv/Lib/site-packages/pandas/core/base.py new file mode 100644 index 0000000..5acf1a2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/base.py @@ -0,0 +1,1507 @@ +""" +Base and utility classes for pandas objects. +""" +import builtins +import textwrap +from typing import Dict, FrozenSet, List, Optional + +import numpy as np + +import pandas._libs.lib as lib +from pandas._typing import T +from pandas.compat import PYPY +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import is_nested_object +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_object_dtype, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms, common as com +from pandas.core.accessor import DirNamesMixin +from pandas.core.algorithms import duplicated, unique1d, value_counts +from pandas.core.arrays import ExtensionArray +from pandas.core.construction import create_series_with_explicit_dtype +import pandas.core.nanops as nanops + +_shared_docs: Dict[str, str] = dict() +_indexops_doc_kwargs = dict( + klass="IndexOpsMixin", + inplace="", + unique="IndexOpsMixin", + duplicated="IndexOpsMixin", +) + + +class PandasObject(DirNamesMixin): + """baseclass for various pandas objects""" + + @property + def _constructor(self): + """class constructor (for this class it's just `__class__`""" + return type(self) + + def __repr__(self) -> str: + """ + Return a string representation for a particular object. 
+ """ + # Should be overwritten by base classes + return object.__repr__(self) + + def _reset_cache(self, key=None): + """ + Reset cached properties. If ``key`` is passed, only clears that key. + """ + if getattr(self, "_cache", None) is None: + return + if key is None: + self._cache.clear() + else: + self._cache.pop(key, None) + + def __sizeof__(self): + """ + Generates the total memory usage for an object that returns + either a value or Series of values + """ + if hasattr(self, "memory_usage"): + mem = self.memory_usage(deep=True) + if not is_scalar(mem): + mem = mem.sum() + return int(mem) + + # no memory_usage attribute, so fall back to + # object's 'sizeof' + return super().__sizeof__() + + def _ensure_type(self: T, obj) -> T: + """Ensure that an object has same type as self. + + Used by type checkers. + """ + assert isinstance(obj, type(self)), type(obj) + return obj + + +class NoNewAttributesMixin: + """Mixin which prevents adding new attributes. + + Prevents additional attributes via xxx.attribute = "something" after a + call to `self.__freeze()`. Mainly used to prevent the user from using + wrong attributes on an accessor (`Series.cat/.str/.dt`). + + If you really want to add a new attribute at a later time, you need to use + `object.__setattr__(self, key, value)`. + """ + + def _freeze(self): + """Prevents setting additional attributes""" + object.__setattr__(self, "__frozen", True) + + # prevent adding any attribute via s.xxx.new_attribute = ... + def __setattr__(self, key, value): + # _cache is used by a decorator + # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key) + # because + # 1.) getattr is false for attributes that raise errors + # 2.) cls.__dict__ doesn't traverse into base classes + if getattr(self, "__frozen", False) and not ( + key == "_cache" + or key in type(self).__dict__ + or getattr(self, key, None) is not None + ): + raise AttributeError(f"You cannot add any new attribute '{key}'") + object.__setattr__(self, key, value) + + +class GroupByError(Exception): + pass + + +class DataError(GroupByError): + pass + + +class SpecificationError(GroupByError): + pass + + +class SelectionMixin: + """ + mixin implementing the selection & aggregation interface on a group-like + object sub-classes need to define: obj, exclusions + """ + + _selection = None + _internal_names = ["_cache", "__setstate__"] + _internal_names_set = set(_internal_names) + + _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min} + + _cython_table = { + builtins.sum: "sum", + builtins.max: "max", + builtins.min: "min", + np.all: "all", + np.any: "any", + np.sum: "sum", + np.nansum: "sum", + np.mean: "mean", + np.nanmean: "mean", + np.prod: "prod", + np.nanprod: "prod", + np.std: "std", + np.nanstd: "std", + np.var: "var", + np.nanvar: "var", + np.median: "median", + np.nanmedian: "median", + np.max: "max", + np.nanmax: "max", + np.min: "min", + np.nanmin: "min", + np.cumprod: "cumprod", + np.nancumprod: "cumprod", + np.cumsum: "cumsum", + np.nancumsum: "cumsum", + } + + @property + def _selection_name(self): + """ + return a name for myself; this would ideally be called + the 'name' property, but we cannot conflict with the + Series.name property which can be set + """ + if self._selection is None: + return None # 'result' + else: + return self._selection + + @property + def _selection_list(self): + if not isinstance( + self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray) + ): + return [self._selection] + return self._selection + + 
@cache_readonly + def _selected_obj(self): + + if self._selection is None or isinstance(self.obj, ABCSeries): + return self.obj + else: + return self.obj[self._selection] + + @cache_readonly + def ndim(self) -> int: + return self._selected_obj.ndim + + @cache_readonly + def _obj_with_exclusions(self): + if self._selection is not None and isinstance(self.obj, ABCDataFrame): + return self.obj.reindex(columns=self._selection_list) + + if len(self.exclusions) > 0: + return self.obj.drop(self.exclusions, axis=1) + else: + return self.obj + + def __getitem__(self, key): + if self._selection is not None: + raise IndexError(f"Column(s) {self._selection} already selected") + + if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): + if len(self.obj.columns.intersection(key)) != len(key): + bad_keys = list(set(key).difference(self.obj.columns)) + raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") + return self._gotitem(list(key), ndim=2) + + elif not getattr(self, "as_index", False): + if key not in self.obj.columns: + raise KeyError(f"Column not found: {key}") + return self._gotitem(key, ndim=2) + + else: + if key not in self.obj: + raise KeyError(f"Column not found: {key}") + return self._gotitem(key, ndim=1) + + def _gotitem(self, key, ndim, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + + """ + raise AbstractMethodError(self) + + def aggregate(self, func, *args, **kwargs): + raise AbstractMethodError(self) + + agg = aggregate + + def _try_aggregate_string_function(self, arg: str, *args, **kwargs): + """ + if arg is a string, then try to operate on it: + - try to find a function (or attribute) on ourselves + - try to find a numpy function + - raise + + """ + assert isinstance(arg, str) + + f = getattr(self, arg, None) + if f is not None: + if callable(f): + return f(*args, **kwargs) + + # people may try to aggregate on a non-callable attribute + # but don't let them think they can pass args to it + assert len(args) == 0 + assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 + return f + + f = getattr(np, arg, None) + if f is not None: + if hasattr(self, "__array__"): + # in particular exclude Window + return f(self, *args, **kwargs) + + raise AttributeError( + f"'{arg}' is not a valid function for '{type(self).__name__}' object" + ) + + def _aggregate(self, arg, *args, **kwargs): + """ + provide an implementation for the aggregators + + Parameters + ---------- + arg : string, dict, function + *args : args to pass on to the function + **kwargs : kwargs to pass on to the function + + Returns + ------- + tuple of result, how + + Notes + ----- + how can be a string describe the required post-processing, or + None if not required + """ + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + _axis = kwargs.pop("_axis", None) + if _axis is None: + _axis = getattr(self, "axis", 0) + + if isinstance(arg, str): + return self._try_aggregate_string_function(arg, *args, **kwargs), None + + if isinstance(arg, dict): + # aggregate based on the passed dict + if _axis != 0: # pragma: no cover + raise ValueError("Can only pass dict with axis=0") + + obj = self._selected_obj + + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + if any(is_aggregator(x) for x in arg.values()): + new_arg = {} + for k, v in arg.items(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + + # the keys must be in the columns + # for ndim=2, or renamers for ndim=1 + + # ok for now, but deprecated + # {'A': { 'ra': 'mean' }} + # {'A': { 'ra': ['mean'] }} + # {'ra': ['mean']} + + # not ok + # {'ra' : { 'A' : 'mean' }} + if isinstance(v, dict): + raise SpecificationError("nested renamer is not supported") + elif isinstance(obj, ABCSeries): + raise SpecificationError("nested renamer is not supported") + elif isinstance(obj, ABCDataFrame) and k not in obj.columns: + raise KeyError(f"Column '{k}' does not exist!") + + arg = new_arg + + else: + # deprecation of renaming keys + # GH 15931 + keys = list(arg.keys()) + if isinstance(obj, ABCDataFrame) and len( + obj.columns.intersection(keys) + ) != len(keys): + raise SpecificationError("nested renamer is not supported") + + from pandas.core.reshape.concat import concat + + def _agg_1dim(name, how, subset=None): + """ + aggregate a 1-dim with how + """ + colg = self._gotitem(name, ndim=1, subset=subset) + if colg.ndim != 1: + raise SpecificationError( + "nested dictionary is ambiguous in aggregation" + ) + return colg.aggregate(how) + + def _agg_2dim(name, how): + """ + aggregate a 2-dim with how + """ + colg = self._gotitem(self._selection, ndim=2, subset=obj) + return colg.aggregate(how) + + def _agg(arg, func): + """ + run the aggregations over the arg with func + return a dict + """ + result = {} + for fname, agg_how in arg.items(): + result[fname] = func(fname, agg_how) + return result + + # set the final keys + keys = list(arg.keys()) + result = {} + + if self._selection is not None: + + sl = set(self._selection_list) + + # we are a Series like object, + # but may have multiple aggregations + if len(sl) == 1: + + result = _agg( + arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) + ) + + # we are selecting the same set as we are aggregating + elif not len(sl - set(keys)): + + result = _agg(arg, _agg_1dim) + + # we are a DataFrame, with possibly multiple aggregations + else: + + result = _agg(arg, _agg_2dim) + + # no selection + else: + + try: + result = _agg(arg, _agg_1dim) + except SpecificationError: + + # we are aggregating expecting all 1d-returns + # but we have 2d + result = _agg(arg, _agg_2dim) + + # combine results + + def is_any_series() -> bool: + # return a boolean if we have *any* nested series + return any(isinstance(r, ABCSeries) for r in result.values()) + + def is_any_frame() -> bool: + # return a boolean if we have *any* nested series + return any(isinstance(r, ABCDataFrame) for r in result.values()) + + if isinstance(result, list): + return concat(result, keys=keys, axis=1, sort=True), True + + elif is_any_frame(): + # we have a dict of DataFrames + # return a MI DataFrame + + return concat([result[k] for k in keys], keys=keys, axis=1), True + + elif isinstance(self, ABCSeries) and is_any_series(): + + # we have a dict of Series + # return a MI Series + try: + result = concat(result) + except TypeError: + # we want to give a nice error here if + # we have non-same sized objects, so + # we don't automatically broadcast + + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) + + return result, True + + # fall thru + from pandas import DataFrame, Series + + try: + result = DataFrame(result) + except ValueError: 
+ + # we have a dict of scalars + result = Series(result, name=getattr(self, "name", None)) + + return result, True + elif is_list_like(arg): + # we require a list, but not an 'str' + return self._aggregate_multiple_funcs(arg, _axis=_axis), None + else: + result = None + + f = self._get_cython_func(arg) + if f and not args and not kwargs: + return getattr(self, f)(), None + + # caller can react + return result, True + + def _aggregate_multiple_funcs(self, arg, _axis): + from pandas.core.reshape.concat import concat + + if _axis != 0: + raise NotImplementedError("axis other than 0 is not supported") + + if self._selected_obj.ndim == 1: + obj = self._selected_obj + else: + obj = self._obj_with_exclusions + + results = [] + keys = [] + + # degenerate case + if obj.ndim == 1: + for a in arg: + colg = self._gotitem(obj.name, ndim=1, subset=obj) + try: + new_res = colg.aggregate(a) + + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + for index, col in enumerate(obj): + colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) + try: + new_res = colg.aggregate(arg) + except (TypeError, DataError): + pass + except ValueError as err: + # cannot aggregate + if "Must produce aggregated value" in str(err): + # raised directly in _aggregate_named + pass + elif "no results" in str(err): + # raised direcly in _aggregate_multiple_funcs + pass + else: + raise + else: + results.append(new_res) + keys.append(col) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1, sort=False) + except TypeError: + + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + + from pandas import Series + + result = Series(results, index=keys, name=self.name) + if is_nested_object(result): + raise ValueError("cannot combine transform and aggregation operations") + return result + + def _get_cython_func(self, arg: str) -> Optional[str]: + """ + if we define an internal function for this argument, return it + """ + return self._cython_table.get(arg) + + def _is_builtin_func(self, arg): + """ + if we define an builtin function for this argument, return it, + otherwise return the arg + """ + return self._builtin_table.get(arg, arg) + + +class ShallowMixin: + _attributes: List[str] = [] + + def _shallow_copy(self, obj=None, **kwargs): + """ + return a new object with the replacement attributes + """ + if obj is None: + obj = self._selected_obj.copy() + + if isinstance(obj, self._constructor): + obj = obj.obj + for attr in self._attributes: + if attr not in kwargs: + kwargs[attr] = getattr(self, attr) + return self._constructor(obj, **kwargs) + + +class IndexOpsMixin: + """ + Common ops mixin to support a unified interface / docs for Series / Index + """ + + # ndarray compatibility + __array_priority__ = 1000 + _deprecations: FrozenSet[str] = frozenset( + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ + ) + + def transpose(self, *args, **kwargs): + """ + Return the transpose, which is by definition self. + + Returns + ------- + %(klass)s + """ + nv.validate_transpose(args, kwargs) + return self + + T = property( + transpose, + doc=""" + Return the transpose, which is by definition self. + """, + ) + + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. 
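+
+        Examples
+        --------
+        >>> pd.Series([1, 2, 3]).shape
+        (3,)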
+ """ + return self._values.shape + + @property + def ndim(self) -> int: + """ + Number of dimensions of the underlying data, by definition 1. + """ + return 1 + + def item(self): + """ + Return the first element of the underlying data as a python scalar. + + Returns + ------- + scalar + The first element of %(klass)s. + + Raises + ------ + ValueError + If the data is not length-1. + """ + if not ( + is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype) + ): + # numpy returns ints instead of datetime64/timedelta64 objects, + # which we need to wrap in Timestamp/Timedelta/Period regardless. + return self.values.item() + + if len(self) == 1: + return next(iter(self)) + else: + raise ValueError("can only convert an array of size 1 to a Python scalar") + + @property + def nbytes(self): + """ + Return the number of bytes in the underlying data. + """ + return self._values.nbytes + + @property + def size(self): + """ + Return the number of elements in the underlying data. + """ + return len(self._values) + + @property + def array(self) -> ExtensionArray: + """ + The ExtensionArray of the data backing this Series or Index. + + .. versionadded:: 0.24.0 + + Returns + ------- + ExtensionArray + An ExtensionArray of the values stored within. For extension + types, this is the actual array. For NumPy native types, this + is a thin (no copy) wrapper around :class:`numpy.ndarray`. + + ``.array`` differs ``.values`` which may require converting the + data to a different form. + + See Also + -------- + Index.to_numpy : Similar method that always returns a NumPy array. + Series.to_numpy : Similar method that always returns a NumPy array. + + Notes + ----- + This table lays out the different array types for each extension + dtype within pandas. + + ================== ============================= + dtype array type + ================== ============================= + category Categorical + period PeriodArray + interval IntervalArray + IntegerNA IntegerArray + string StringArray + boolean BooleanArray + datetime64[ns, tz] DatetimeArray + ================== ============================= + + For any 3rd-party extension types, the array type will be an + ExtensionArray. + + For all remaining dtypes ``.array`` will be a + :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray + stored within. If you absolutely need a NumPy array (possibly with + copying / coercing data), then use :meth:`Series.to_numpy` instead. + + Examples + -------- + + For regular NumPy types like int, and float, a PandasArray + is returned. + + >>> pd.Series([1, 2, 3]).array + + [1, 2, 3] + Length: 3, dtype: int64 + + For extension types, like Categorical, the actual ExtensionArray + is returned + + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.array + [a, b, a] + Categories (2, object): [a, b] + """ + raise AbstractMethodError(self) + + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): + """ + A NumPy ndarray representing the values in this Series or Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. 
The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array : Get the actual data stored within. + Index.array : Get the actual data stored within. + DataFrame.to_numpy : Similar method for DataFrame. + + Notes + ----- + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + For NumPy dtypes, this will be a reference to the actual data stored + in this Series or Index (assuming ``copy=False``). Modifying the result + in place will modify the data stored in the Series or Index (not that + we recommend doing that). + + For extension types, ``to_numpy()`` *may* require copying data and + coercing the result to a NumPy type (possibly object), which may be + expensive. When you need a no-copy reference to the underlying data, + :attr:`Series.array` should be used instead. + + This table lays out the different dtypes and default return types of + ``to_numpy()`` for various dtypes within pandas. + + ================== ================================ + dtype array type + ================== ================================ + category[T] ndarray[T] (same dtype as input) + period ndarray[object] (Periods) + interval ndarray[object] (Intervals) + IntegerNA ndarray[object] + datetime64[ns] datetime64[ns] + datetime64[ns, tz] ndarray[object] (Timestamps) + ================== ================================ + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.to_numpy() + array(['a', 'b', 'a'], dtype=object) + + Specify the `dtype` to control how datetime-aware data is represented. + Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` + objects, each with the correct ``tz``. + + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser.to_numpy(dtype=object) + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], + dtype=object) + + Or ``dtype='datetime64[ns]'`` to return an ndarray of native + datetime64 values. The values are converted to UTC and the timezone + info is dropped. + + >>> ser.to_numpy(dtype="datetime64[ns]") + ... # doctest: +ELLIPSIS + array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], + dtype='datetime64[ns]') + """ + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) + + result = np.asarray(self._values, dtype=dtype) + # TODO(GH-24345): Avoid potential double copy + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + + @property + def _ndarray_values(self) -> np.ndarray: + """ + The data as an ndarray, possibly losing information. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. 
+ + - categorical -> codes + """ + if is_extension_array_dtype(self): + return self.array._ndarray_values + # As a mixin, we depend on the mixing class having values. + # Special mixin syntax may be developed in the future: + # https://github.com/python/typing/issues/246 + return self.values # type: ignore + + @property + def empty(self): + return not self.size + + def max(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the maximum value of the Index. + + Parameters + ---------- + axis : int, optional + For compatibility with NumPy. Only 0 or None are allowed. + skipna : bool, default True + + Returns + ------- + scalar + Maximum value. + + See Also + -------- + Index.min : Return the minimum value in an Index. + Series.max : Return the maximum value in a Series. + DataFrame.max : Return the maximum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.max() + 3 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.max() + 'c' + + For a MultiIndex, the maximum is determined lexicographically. + + >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.max() + ('b', 2) + """ + nv.validate_minmax_axis(axis) + nv.validate_max(args, kwargs) + return nanops.nanmax(self._values, skipna=skipna) + + def argmax(self, axis=None, skipna=True, *args, **kwargs): + """ + Return an ndarray of the maximum argument indexer. + + Parameters + ---------- + axis : {None} + Dummy argument for consistency with Series. + skipna : bool, default True + + Returns + ------- + numpy.ndarray + Indices of the maximum values. + + See Also + -------- + numpy.ndarray.argmax + """ + nv.validate_minmax_axis(axis) + nv.validate_argmax_with_skipna(skipna, args, kwargs) + return nanops.nanargmax(self._values, skipna=skipna) + + def min(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the minimum value of the Index. + + Parameters + ---------- + axis : {None} + Dummy argument for consistency with Series. + skipna : bool, default True + + Returns + ------- + scalar + Minimum value. + + See Also + -------- + Index.max : Return the maximum value of the object. + Series.min : Return the minimum value in a Series. + DataFrame.min : Return the minimum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.min() + 1 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.min() + 'a' + + For a MultiIndex, the minimum is determined lexicographically. + + >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.min() + ('a', 1) + """ + nv.validate_minmax_axis(axis) + nv.validate_min(args, kwargs) + return nanops.nanmin(self._values, skipna=skipna) + + def argmin(self, axis=None, skipna=True, *args, **kwargs): + """ + Return a ndarray of the minimum argument indexer. + + Parameters + ---------- + axis : {None} + Dummy argument for consistency with Series. + skipna : bool, default True + + Returns + ------- + numpy.ndarray + + See Also + -------- + numpy.ndarray.argmin + """ + nv.validate_minmax_axis(axis) + nv.validate_argmax_with_skipna(skipna, args, kwargs) + return nanops.nanargmin(self._values, skipna=skipna) + + def tolist(self): + """ + Return a list of the values. 
+ + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + Returns + ------- + list + + See Also + -------- + numpy.ndarray.tolist + """ + if self.dtype.kind in ["m", "M"]: + return [com.maybe_box_datetimelike(x) for x in self._values] + elif is_extension_array_dtype(self._values): + return list(self._values) + else: + return self._values.tolist() + + to_list = tolist + + def __iter__(self): + """ + Return an iterator of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + Returns + ------- + iterator + """ + # We are explicitly making element iterators. + if self.dtype.kind in ["m", "M"]: + return map(com.maybe_box_datetimelike, self._values) + elif is_extension_array_dtype(self._values): + return iter(self._values) + else: + return map(self._values.item, range(self._values.size)) + + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + return bool(isna(self).any()) + + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): + """ perform the reduction type operation if we can """ + func = getattr(self, name, None) + if func is None: + raise TypeError( + f"{type(self).__name__} cannot perform the operation {name}" + ) + return func(skipna=skipna, **kwds) + + def _map_values(self, mapper, na_action=None): + """ + An internal function that maps values using the input + correspondence (which can be a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + The input correspondence object + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping function + + Returns + ------- + Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + + """ + + # we can fastpath dict/Series to an efficient map + # as we know that we are not going to have to yield + # python types + if is_dict_like(mapper): + if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): + # If a dictionary subclass defines a default value method, + # convert mapper to a lookup function (GH #15999). + dict_with_default = mapper + mapper = lambda x: dict_with_default[x] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to an Series for efficiency. + # we specify the keys here to handle the + # possibility that they are tuples + + # The return value of mapping with an empty mapper is + # expected to be pd.Series(np.nan, ...). 
As np.nan is + # of dtype float64 the return value of this method should + # be float64 as well + mapper = create_series_with_explicit_dtype( + mapper, dtype_if_empty=np.float64 + ) + + if isinstance(mapper, ABCSeries): + # Since values were input this means we came from either + # a dict or a series and mapper should be an index + if is_categorical_dtype(self._values): + # use the built in categorical series mapper which saves + # time by mapping the categories instead of all values + return self._values.map(mapper) + if is_extension_array_dtype(self.dtype): + values = self._values + else: + values = self.values + + indexer = mapper.index.get_indexer(values) + new_values = algorithms.take_1d(mapper._values, indexer) + + return new_values + + # we must convert to python types + if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): + # GH#23179 some EAs do not have `map` + values = self._values + if na_action is not None: + raise NotImplementedError + map_f = lambda values, f: values.map(f) + else: + values = self.astype(object) + values = getattr(values, "values", values) + if na_action == "ignore": + + def map_f(values, f): + return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) + + else: + map_f = lib.map_infer + + # mapper is a function + new_values = map_f(values, mapper) + + return new_values + + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): + """ + Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : bool, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + bins : int, optional + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with numeric data. + dropna : bool, default True + Don't include counts of NaN. + + Returns + ------- + Series + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. + + Examples + -------- + >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) + >>> index.value_counts() + 3.0 2 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + + With `normalize` set to `True`, returns the relative frequency by + dividing all values by the sum of values. + + >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) + >>> s.value_counts(normalize=True) + 3.0 0.4 + 4.0 0.2 + 2.0 0.2 + 1.0 0.2 + dtype: float64 + + **bins** + + Bins can be useful for going from a continuous variable to a + categorical variable; instead of counting unique + apparitions of values, divide the index in the specified + number of half-open bins. + + >>> s.value_counts(bins=3) + (2.0, 3.0] 2 + (0.996, 2.0] 2 + (3.0, 4.0] 1 + dtype: int64 + + **dropna** + + With `dropna` set to `False` we can also see NaN index values. 
+ + >>> s.value_counts(dropna=False) + 3.0 2 + NaN 1 + 4.0 1 + 2.0 1 + 1.0 1 + dtype: int64 + """ + result = value_counts( + self, + sort=sort, + ascending=ascending, + normalize=normalize, + bins=bins, + dropna=dropna, + ) + return result + + def unique(self): + values = self._values + + if hasattr(values, "unique"): + + result = values.unique() + else: + result = unique1d(values) + + return result + + def nunique(self, dropna=True): + """ + Return number of unique elements in the object. + + Excludes NA values by default. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the count. + + Returns + ------- + int + + See Also + -------- + DataFrame.nunique: Method nunique for DataFrame. + Series.count: Count non-NA/null observations in the Series. + + Examples + -------- + >>> s = pd.Series([1, 3, 5, 7, 7]) + >>> s + 0 1 + 1 3 + 2 5 + 3 7 + 4 7 + dtype: int64 + + >>> s.nunique() + 4 + """ + uniqs = self.unique() + n = len(uniqs) + if dropna and isna(uniqs).any(): + n -= 1 + return n + + @property + def is_unique(self): + """ + Return boolean if values in the object are unique. + + Returns + ------- + bool + """ + return self.nunique(dropna=False) == len(self) + + @property + def is_monotonic(self): + """ + Return boolean if values in the object are + monotonic_increasing. + + Returns + ------- + bool + """ + from pandas import Index + + return Index(self).is_monotonic + + is_monotonic_increasing = is_monotonic + + @property + def is_monotonic_decreasing(self) -> bool: + """ + Return boolean if values in the object are + monotonic_decreasing. + + Returns + ------- + bool + """ + from pandas import Index + + return Index(self).is_monotonic_decreasing + + def memory_usage(self, deep=False): + """ + Memory usage of the values. + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption. + + Returns + ------- + bytes used + + See Also + -------- + numpy.ndarray.nbytes + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False or if used on PyPy + """ + if hasattr(self.array, "memory_usage"): + return self.array.memory_usage(deep=deep) + + v = self.array.nbytes + if deep and is_object_dtype(self) and not PYPY: + v += lib.memory_usage_of_objects(self.array) + return v + + @Substitution( + values="", + order="", + size_hint="", + sort=textwrap.dedent( + """\ + sort : bool, default False + Sort `uniques` and shuffle `codes` to maintain the + relationship. + """ + ), + ) + @Appender(algorithms._shared_docs["factorize"]) + def factorize(self, sort=False, na_sentinel=-1): + return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) + + _shared_docs[ + "searchsorted" + ] = """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted %(klass)s `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + .. note:: + + The %(klass)s *must* be monotonically sorted, otherwise + wrong locations will likely be returned. Pandas does *not* + check this for you. + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). 
+ sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + int or array of int + A scalar or array of insertion points with the + same shape as `value`. + + .. versionchanged:: 0.24.0 + If `value` is a scalar, an int is now always returned. + Previously, scalar inputs returned an 1-item array for + :class:`Series` and :class:`Categorical`. + + See Also + -------- + sort_values + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. + + Examples + -------- + + >>> x = pd.Series([1, 2, 3]) + >>> x + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> x.searchsorted(4) + 3 + + >>> x.searchsorted([0, 4]) + array([0, 3]) + + >>> x.searchsorted([1, 3], side='left') + array([0, 2]) + + >>> x.searchsorted([1, 3], side='right') + array([1, 3]) + + >>> x = pd.Categorical(['apple', 'bread', 'bread', + 'cheese', 'milk'], ordered=True) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + + >>> x.searchsorted('bread') + 1 + + >>> x.searchsorted(['bread'], side='right') + array([3]) + + If the values are not monotonically sorted, wrong locations + may be returned: + + >>> x = pd.Series([2, 1, 3]) + >>> x.searchsorted(1) + 0 # wrong result, correct would be 1 + """ + + @Substitution(klass="Index") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) + + def drop_duplicates(self, keep="first", inplace=False): + inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(self, ABCIndexClass): + if self.is_unique: + return self._shallow_copy() + + duplicated = self.duplicated(keep=keep) + result = self[np.logical_not(duplicated)] + if inplace: + return self._update_inplace(result) + else: + return result + + def duplicated(self, keep="first"): + if isinstance(self, ABCIndexClass): + if self.is_unique: + return np.zeros(len(self), dtype=np.bool) + return duplicated(self, keep=keep) + else: + return self._constructor( + duplicated(self, keep=keep), index=self.index + ).__finalize__(self) + + # ---------------------------------------------------------------------- + # abstracts + + def _update_inplace(self, result, verify_is_copy=True, **kwargs): + raise AbstractMethodError(self) diff --git a/venv/Lib/site-packages/pandas/core/common.py b/venv/Lib/site-packages/pandas/core/common.py new file mode 100644 index 0000000..d8b082e --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/common.py @@ -0,0 +1,485 @@ +""" +Misc tools for implementing data structures + +Note: pandas.core.common is *not* part of the public API. 
+""" + +import collections +from collections import abc +from datetime import datetime, timedelta +from functools import partial +import inspect +from typing import Any, Collection, Iterable, Union + +import numpy as np + +from pandas._libs import lib, tslibs +from pandas._typing import T + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_extension_array_dtype, + is_integer, +) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import _iterable_not_string +from pandas.core.dtypes.missing import isna, isnull, notnull # noqa + + +class SettingWithCopyError(ValueError): + pass + + +class SettingWithCopyWarning(Warning): + pass + + +def flatten(l): + """ + Flatten an arbitrarily nested sequence. + + Parameters + ---------- + l : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. + + Returns + ------- + flattened : generator + """ + for el in l: + if _iterable_not_string(el): + for s in flatten(el): + yield s + else: + yield el + + +def consensus_name_attr(objs): + name = objs[0].name + for obj in objs[1:]: + try: + if obj.name != name: + name = None + except ValueError: + name = None + return name + + +def maybe_box(indexer, values, obj, key): + + # if we have multiples coming back, box em + if isinstance(values, np.ndarray): + return obj[indexer.get_loc(key)] + + # return the value + return values + + +def maybe_box_datetimelike(value): + # turn a datetime like into a Timestamp/timedelta as needed + + if isinstance(value, (np.datetime64, datetime)): + value = tslibs.Timestamp(value) + elif isinstance(value, (np.timedelta64, timedelta)): + value = tslibs.Timedelta(value) + + return value + + +values_from_object = lib.values_from_object + + +def is_bool_indexer(key: Any) -> bool: + """ + Check whether `key` is a valid boolean indexer. + + Parameters + ---------- + key : Any + Only list-likes may be considered boolean indexers. + All other types are not considered a boolean indexer. + For array-like input, boolean ndarrays or ExtensionArrays + with ``_is_boolean`` set are considered boolean indexers. + + Returns + ------- + bool + Whether `key` is a valid boolean indexer. + + Raises + ------ + ValueError + When the array is an object-dtype ndarray or ExtensionArray + and contains missing values. + + See Also + -------- + check_array_indexer : Check that `key` is a valid array to index, + and convert to an ndarray. + """ + na_msg = "cannot mask with array containing NA / NaN values" + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + is_array_like(key) and is_extension_array_dtype(key.dtype) + ): + if key.dtype == np.object_: + key = np.asarray(values_from_object(key)) + + if not lib.is_bool_array(key): + if isna(key).any(): + raise ValueError(na_msg) + return False + return True + elif is_bool_dtype(key.dtype): + # an ndarray with bool-dtype by definition has no missing values. + # So we only need to check for NAs in ExtensionArrays + if is_extension_array_dtype(key.dtype): + if np.any(key.isna()): + raise ValueError(na_msg) + return True + elif isinstance(key, list): + try: + arr = np.asarray(key) + return arr.dtype == np.bool_ and len(arr) == len(key) + except TypeError: # pragma: no cover + return False + + return False + + +def cast_scalar_indexer(val): + """ + To avoid numpy DeprecationWarnings, cast float to integer where valid. 
+ + Parameters + ---------- + val : scalar + + Returns + ------- + outval : scalar + """ + # assumes lib.is_scalar(val) + if lib.is_float(val) and val == int(val): + return int(val) + return val + + +def not_none(*args): + """ + Returns a generator consisting of the arguments that are not None. + """ + return (arg for arg in args if arg is not None) + + +def any_none(*args): + """ + Returns a boolean indicating if any argument is None. + """ + return any(arg is None for arg in args) + + +def all_none(*args): + """ + Returns a boolean indicating if all arguments are None. + """ + return all(arg is None for arg in args) + + +def any_not_none(*args): + """ + Returns a boolean indicating if any argument is not None. + """ + return any(arg is not None for arg in args) + + +def all_not_none(*args): + """ + Returns a boolean indicating if all arguments are not None. + """ + return all(arg is not None for arg in args) + + +def count_not_none(*args): + """ + Returns the count of arguments that are not None. + """ + return sum(x is not None for x in args) + + +def try_sort(iterable): + listed = list(iterable) + try: + return sorted(listed) + except TypeError: + return listed + + +def asarray_tuplesafe(values, dtype=None): + + if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): + values = list(values) + elif isinstance(values, ABCIndexClass): + return values.values + + if isinstance(values, list) and dtype in [np.object_, object]: + return construct_1d_object_array_from_listlike(values) + + result = np.asarray(values, dtype=dtype) + + if issubclass(result.dtype.type, str): + result = np.asarray(values, dtype=object) + + if result.ndim == 2: + # Avoid building an array of arrays: + values = [tuple(x) for x in values] + result = construct_1d_object_array_from_listlike(values) + + return result + + +def index_labels_to_array(labels, dtype=None): + """ + Transform label or iterable of labels to array, for use in Index. + + Parameters + ---------- + dtype : dtype + If specified, use as dtype of the resulting array, otherwise infer. + + Returns + ------- + array + """ + if isinstance(labels, (str, tuple)): + labels = [labels] + + if not isinstance(labels, (list, np.ndarray)): + try: + labels = list(labels) + except TypeError: # non-iterable + labels = [labels] + + labels = asarray_tuplesafe(labels, dtype=dtype) + + return labels + + +def maybe_make_list(obj): + if obj is not None and not isinstance(obj, (tuple, list)): + return [obj] + return obj + + +def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T]: + """ + If obj is Iterable but not list-like, consume into list. + """ + if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): + return list(obj) + return obj + + +def is_null_slice(obj): + """ + We have a null slice. + """ + return ( + isinstance(obj, slice) + and obj.start is None + and obj.stop is None + and obj.step is None + ) + + +def is_true_slices(l): + """ + Find non-trivial slices in "l": return a list of booleans with same length. + """ + return [isinstance(k, slice) and not is_null_slice(k) for k in l] + + +# TODO: used only once in indexing; belongs elsewhere? +def is_full_slice(obj, l): + """ + We have a full length slice. 
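+    That is, ``obj`` equals ``slice(0, l, None)``.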
+ """ + return ( + isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None + ) + + +def get_callable_name(obj): + # typical case has name + if hasattr(obj, "__name__"): + return getattr(obj, "__name__") + # some objects don't; could recurse + if isinstance(obj, partial): + return get_callable_name(obj.func) + # fall back to class name + if hasattr(obj, "__call__"): + return type(obj).__name__ + # everything failed (probably because the argument + # wasn't actually callable); we return None + # instead of the empty string in this case to allow + # distinguishing between no name and a name of '' + return None + + +def apply_if_callable(maybe_callable, obj, **kwargs): + """ + Evaluate possibly callable input using obj and kwargs if it is callable, + otherwise return as it is. + + Parameters + ---------- + maybe_callable : possibly a callable + obj : NDFrame + **kwargs + """ + + if callable(maybe_callable): + return maybe_callable(obj, **kwargs) + + return maybe_callable + + +def dict_compat(d): + """ + Helper function to convert datetimelike-keyed dicts + to Timestamp-keyed dict. + + Parameters + ---------- + d: dict like object + + Returns + ------- + dict + + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + +def standardize_mapping(into): + """ + Helper function to standardize a supplied mapping. + + .. versionadded:: 0.21.0 + + Parameters + ---------- + into : instance or subclass of collections.abc.Mapping + Must be a class, an initialized collections.defaultdict, + or an instance of a collections.abc.Mapping subclass. + + Returns + ------- + mapping : a collections.abc.Mapping subclass or other constructor + a callable object that can accept an iterator to create + the desired Mapping. + + See Also + -------- + DataFrame.to_dict + Series.to_dict + """ + if not inspect.isclass(into): + if isinstance(into, collections.defaultdict): + return partial(collections.defaultdict, into.default_factory) + into = type(into) + if not issubclass(into, abc.Mapping): + raise TypeError(f"unsupported type: {into}") + elif into == collections.defaultdict: + raise TypeError("to_dict() only accepts initialized defaultdicts") + return into + + +def random_state(state=None): + """ + Helper function for processing random_state arguments. + + Parameters + ---------- + state : int, np.random.RandomState, None. + If receives an int, passes to np.random.RandomState() as seed. + If receives an np.random.RandomState object, just returns object. + If receives `None`, returns np.random. + If receives anything else, raises an informative ValueError. + Default None. + + Returns + ------- + np.random.RandomState + """ + + if is_integer(state): + return np.random.RandomState(state) + elif isinstance(state, np.random.RandomState): + return state + elif state is None: + return np.random + else: + raise ValueError( + "random_state must be an integer, a numpy RandomState, or None" + ) + + +def pipe(obj, func, *args, **kwargs): + """ + Apply a function ``func`` to object ``obj`` either by passing obj as the + first argument to the function or, in the case that the func is a tuple, + interpret the first element of the tuple as a function and pass the obj to + that function as a keyword argument whose key is the value of the second + element of the tuple. 
+ + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this object or, alternatively, a + ``(callable, data_keyword)`` tuple where ``data_keyword`` is a + string indicating the keyword of `callable`` that expects the + object. + *args : iterable, optional + Positional arguments passed into ``func``. + **kwargs : dict, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + """ + if isinstance(func, tuple): + func, target = func + if target in kwargs: + msg = f"{target} is both the pipe target and a keyword argument" + raise ValueError(msg) + kwargs[target] = obj + return func(*args, **kwargs) + else: + return func(obj, *args, **kwargs) + + +def get_rename_function(mapper): + """ + Returns a function that will map names/labels, dependent if mapper + is a dict, Series or just a function. + """ + if isinstance(mapper, (abc.Mapping, ABCSeries)): + + def f(x): + if x in mapper: + return mapper[x] + else: + return x + + else: + f = mapper + + return f diff --git a/venv/Lib/site-packages/pandas/core/computation/__init__.py b/venv/Lib/site-packages/pandas/core/computation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/computation/align.py b/venv/Lib/site-packages/pandas/core/computation/align.py new file mode 100644 index 0000000..a1b1cff --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/align.py @@ -0,0 +1,192 @@ +"""Core eval alignment algorithms +""" + +from functools import partial, wraps +from typing import Dict, Optional, Sequence, Tuple, Type, Union +import warnings + +import numpy as np + +from pandas._typing import FrameOrSeries +from pandas.errors import PerformanceWarning + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.computation.common import result_type_many + + +def _align_core_single_unary_op( + term, +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: + + typ: Union[partial, Type[FrameOrSeries]] + axes: Optional[Dict[str, int]] = None + + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) + + return typ, axes + + +def _zip_axes_from_type( + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()} + return axes + + +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. 
+ """ + return any(isinstance(term.value, PandasObject) for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + + # we don't have any pandas objects + if not _any_pandas_objects(terms): + return result_type_many(*term_values), None + + return f(terms) + + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] + term_dims = [terms[i].value.ndim for i in term_index] + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + gt_than_one_axis = naxes > 1 + + for value in (terms[i].value for i in term_index): + is_series = isinstance(value, ABCSeries) + is_series_and_gt_one_axis = is_series and gt_than_one_axis + + for axis, items in enumerate(value.axes): + if is_series_and_gt_one_axis: + ax, itm = naxes - 1, value.index + else: + ax, itm = axis, items + + if not axes[ax].is_(itm): + axes[ax] = axes[ax].join(itm, how="outer") + + for i, ndim in ndims.items(): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, "reindex"): + transpose = isinstance(ti, ABCSeries) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) + if ordm >= 1 and reindexer_size >= 10000: + w = ( + f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer" + ) + warnings.warn(w, category=PerformanceWarning, stacklevel=6) + + f = partial(ti.reindex, reindexer, axis=axis, copy=False) + + terms[i].update(f()) + + terms[i].update(terms[i].value.values) + + return typ, _zip_axes_from_type(typ, axes) + + +def align_terms(terms): + """ + Align a set of terms. + """ + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if isinstance(terms.value, (ABCSeries, ABCDataFrame)): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.is_scalar for term in terms): + return result_type_many(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def reconstruct_object(typ, obj, axes, dtype): + """ + Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. 
+ """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if not isinstance(typ, partial) and issubclass(typ, PandasObject): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + # The condition is to distinguish 0-dim array (returned in case of + # scalar) and 1 element array + # e.g. np.array(0) and np.array([0]) + if len(obj.shape) == 1 and len(obj) == 1: + if not isinstance(ret_value, np.ndarray): + ret_value = np.array([ret_value]).astype(res_t) + + return ret_value diff --git a/venv/Lib/site-packages/pandas/core/computation/api.py b/venv/Lib/site-packages/pandas/core/computation/api.py new file mode 100644 index 0000000..31e8a48 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/api.py @@ -0,0 +1,3 @@ +# flake8: noqa + +from pandas.core.computation.eval import eval diff --git a/venv/Lib/site-packages/pandas/core/computation/check.py b/venv/Lib/site-packages/pandas/core/computation/check.py new file mode 100644 index 0000000..4d20590 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/check.py @@ -0,0 +1,10 @@ +from pandas.compat._optional import import_optional_dependency + +ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") +_NUMEXPR_INSTALLED = ne is not None +if _NUMEXPR_INSTALLED: + _NUMEXPR_VERSION = ne.__version__ +else: + _NUMEXPR_VERSION = None + +__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] diff --git a/venv/Lib/site-packages/pandas/core/computation/common.py b/venv/Lib/site-packages/pandas/core/computation/common.py new file mode 100644 index 0000000..19a8898 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/common.py @@ -0,0 +1,30 @@ +from functools import reduce + +import numpy as np + +from pandas._config import get_option + + +def _ensure_decoded(s): + """ + If we have bytes, decode them to unicode. + """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(get_option("display.encoding")) + return s + + +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. + """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + + +class NameResolutionError(NameError): + pass diff --git a/venv/Lib/site-packages/pandas/core/computation/engines.py b/venv/Lib/site-packages/pandas/core/computation/engines.py new file mode 100644 index 0000000..9c5388f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/engines.py @@ -0,0 +1,136 @@ +""" +Engine classes for :func:`~pandas.eval` +""" + +import abc +from typing import Dict, Type + +from pandas.core.computation.align import align_terms, reconstruct_object +from pandas.core.computation.ops import _mathops, _reductions + +import pandas.io.formats.printing as printing + +_ne_builtins = frozenset(_mathops + _reductions) + + +class NumExprClobberingError(NameError): + pass + + +def _check_ne_builtin_clash(expr): + """ + Attempt to prevent foot-shooting in a helpful way. 
+ + Parameters + ---------- + terms : Term + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ", ".join(repr(x) for x in overlap) + raise NumExprClobberingError( + f'Variables in expression "{expr}" overlap with builtins: ({s})' + ) + + +class AbstractEngine(metaclass=abc.ABCMeta): + """Object serving as a base class for all engines.""" + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self) -> str: + """ + Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return printing.pprint_thing(self.expr) + + def evaluate(self) -> object: + """ + Run the engine on the expression. + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = align_terms(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) + + @property + def _is_aligned(self) -> bool: + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """ + Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. + """ + pass + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + + has_neg_frac = True + + def _evaluate(self): + import numexpr as ne + + # convert the expression to a valid numexpr expression + s = self.convert() + + env = self.expr.env + scope = env.full_scope + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope) + + +class PythonEngine(AbstractEngine): + """ + Evaluate an expression in Python space. + + Mostly for testing purposes. + """ + + has_neg_frac = False + + def evaluate(self): + return self.expr() + + def _evaluate(self) -> None: + pass + + +_engines: Dict[str, Type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/venv/Lib/site-packages/pandas/core/computation/eval.py b/venv/Lib/site-packages/pandas/core/computation/eval.py new file mode 100644 index 0000000..51892b8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/eval.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python + +""" +Top level ``eval`` module. +""" + +import tokenize +from typing import Optional +import warnings + +from pandas._libs.lib import no_default +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.computation.engines import _engines +from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.parsing import tokenize_string +from pandas.core.computation.scope import ensure_scope + +from pandas.io.formats.printing import pprint_thing + + +def _check_engine(engine: Optional[str]) -> str: + """ + Make sure a valid engine is passed. 
+ + Parameters + ---------- + engine : str + + Raises + ------ + KeyError + * If an invalid engine is passed + ImportError + * If numexpr was requested but doesn't exist + + Returns + ------- + string engine + """ + from pandas.core.computation.check import _NUMEXPR_INSTALLED + + if engine is None: + if _NUMEXPR_INSTALLED: + engine = "numexpr" + else: + engine = "python" + + if engine not in _engines: + valid = list(_engines.keys()) + raise KeyError( + f"Invalid engine {repr(engine)} passed, valid engines are {valid}" + ) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation + if engine == "numexpr": + if not _NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an " + "unsupported version. Cannot use " + "engine='numexpr' for query/eval " + "if 'numexpr' is not installed" + ) + + return engine + + +def _check_parser(parser: str): + """ + Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + + if parser not in _parsers: + raise KeyError( + f"Invalid parser {repr(parser)} passed, " + f"valid parsers are {_parsers.keys()}" + ) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, "__getitem__"): + name = type(resolver).__name__ + raise TypeError( + f"Resolver of type {repr(name)} does not " + f"implement the __getitem__ method" + ) + + +def _check_expression(expr): + """ + Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr) -> str: + """ + Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + str + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr: str, stack_level: int, parser: str): + + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != "pandas" + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ( + "The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix" + ) + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == "@": + raise SyntaxError(msg) + + +def eval( + expr, + parser="pandas", + engine: Optional[str] = None, + truediv=no_default, + local_dict=None, + global_dict=None, + resolvers=(), + level=0, + target=None, + inplace=False, +): + """ + Evaluate a Python expression as a string using various backends. 
+ + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser : {'pandas', 'python'}, default 'pandas' + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : {'python', 'numexpr'}, default 'numexpr' + + The engine used to evaluate the expression. Supported engines are + + - None : tries to use ``numexpr``, falls back to ``python`` + - ``'numexpr'``: This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions + with large frames. + - ``'python'``: Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + + truediv : bool, optional + Whether to use true division, like in Python >= 3. + deprecated:: 1.0.0 + + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : object, optional, default None + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace : bool, default False + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series + + Raises + ------ + ValueError + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. 
+ - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + See Also + -------- + DataFrame.query + DataFrame.eval + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. + """ + + inplace = validate_bool_kwarg(inplace, "inplace") + + if truediv is not no_default: + warnings.warn( + "The `truediv` parameter in pd.eval is deprecated and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) + + if isinstance(expr, str): + _check_expression(expr) + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] + else: + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + ret = None + first_expr = True + target_modified = False + + for expr in exprs: + expr = _convert_expression(expr) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) + + # construct the engine and evaluate the parsed expression + eng = _engines[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None: + if multi_line: + raise ValueError( + "Multi-line expressions are only valid " + "if all expressions contain an assignment" + ) + elif inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + + # assign if needed + assigner = parsed_expr.assigner + if env.target is not None and assigner is not None: + target_modified = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + try: + target = env.target.copy() + except AttributeError: + raise ValueError("Cannot return a copy of the target") + else: + target = env.target + + # TypeError is most commonly raised (e.g. int, list), but you + # get IndexError if you try to do this assignment on np.ndarray. + # we will ignore numpy warnings here; e.g. if trying + # to use a non-numeric indexer + try: + with warnings.catch_warnings(record=True): + # TODO: Filter the warnings we actually care about here. + target[assigner] = ret + except (TypeError, IndexError): + raise ValueError("Cannot assign expression output to target") + + if not resolvers: + resolvers = ({assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if assigner in resolver: + resolver[assigner] = ret + break + else: + resolvers += ({assigner: ret},) + + ret = None + first_expr = False + + # We want to exclude `inplace=None` as being False. 
+ if inplace is False: + return target if target_modified else ret diff --git a/venv/Lib/site-packages/pandas/core/computation/expr.py b/venv/Lib/site-packages/pandas/core/computation/expr.py new file mode 100644 index 0000000..1350587 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/expr.py @@ -0,0 +1,791 @@ +""":func:`~pandas.eval` parsers +""" + +import ast +from functools import partial, reduce +from keyword import iskeyword +import tokenize +from typing import Optional, Type + +import numpy as np + +import pandas.core.common as com +from pandas.core.computation.ops import ( + _LOCAL_TAG, + BinOp, + Constant, + Div, + FuncNode, + Op, + Term, + UnaryOp, + UndefinedVariableError, + _arith_ops_syms, + _bool_ops_syms, + _cmp_ops_syms, + _mathops, + _reductions, + _unary_ops_syms, + is_term, +) +from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string +from pandas.core.computation.scope import Scope + +import pandas.io.formats.printing as printing + + +def _rewrite_assign(tok): + """Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, "==" if tokval == "=" else tokval + + +def _replace_booleans(tok): + """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok): + """Replace local variables with a syntactically valid name. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == "@": + return tokenize.OP, _LOCAL_TAG + return toknum, tokval + + +def _compose2(f, g): + """Compose 2 callables""" + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def _compose(*funcs): + """Compose 2 or more callables""" + assert len(funcs) > 1, "At least 2 callables must be passed to compose" + return reduce(_compose2, funcs) + + +def _preparse( + source: str, + f=_compose( + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks + ), +): + """Compose a collection of tokenization functions + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. 
+ + Returns + ------- + s : str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), "f must be callable" + return tokenize.untokenize((f(x) for x in tokenize_string(source))) + + +def _is_type(t): + """Factory for a type checking function of type ``t`` or tuple of types.""" + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(str) + + +# partition all AST nodes +_all_nodes = frozenset( + filter( + lambda x: isinstance(x, type) and issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)), + ) +) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """Filter out AST nodes that are subclasses of ``superclass``.""" + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes)) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_slice_nodes = _filter_nodes(ast.slice) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg + + +def _node_not_implemented(node_name, cls): + """Return a function that raises a NotImplementedError with a passed node + name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError(f"{repr(node_name)} nodes are not implemented") + + return f + + +def disallow(nodes): + """Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + disallowed : callable + """ + + def disallowed(cls): + cls.unsupported_nodes = () + for node in nodes: + new_method = _node_not_implemented(node, cls) + name = f"visit_{node}" + cls.unsupported_nodes += (name,) + setattr(cls, name, new_method) + return cls + + return disallowed + + +def _op_maker(op_class, op_symbol): + """Return a function to create an op class with its symbol already passed. 
+ + Returns + ------- + f : callable + """ + + def f(self, node, *args, **kwargs): + """Return a partial function with an Op subclass with an operator + already passed. + + Returns + ------- + f : callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + + return f + + +_op_classes = {"binary": BinOp, "unary": UnaryOp} + + +def add_ops(op_classes): + """Decorator to add default implementation of ops.""" + + def f(cls): + for op_attr_name, op_class in op_classes.items(): + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, f"visit_{op_node}", made_op) + return cls + + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """ + Custom ast walker. Parsers of other engines should subclass this class + if necessary. + + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + + const_type: Type[Term] = Constant + term_type = Term + + binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = _unary_ops_syms + unary_op_nodes = "UAdd", "USub", "Invert", "Not" + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn, + } + + def __init__(self, env, engine, parser, preparser=_preparse): + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, str): + clean = self.preparser(node) + try: + node = ast.fix_missing_locations(ast.parse(clean)) + except SyntaxError as e: + if any(iskeyword(x) for x in clean.split()): + e.msg = "Python keyword not valid identifier in numexpr query" + raise e + + method = "visit_" + type(node).__name__ + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError("only a single expression is allowed") + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def 
_maybe_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side="left") + if right is None: + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) + return op, op_class, left, right + + def _maybe_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if ( + left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + + def _maybe_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError( + f"unsupported operand type(s) for {res.op}:" + f" '{lhs.type}' and '{rhs.type}'" + ) + + if self.engine != "pytables": + if ( + res.op in _cmp_ops_syms + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._maybe_eval(res, eval_in_python) + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._maybe_transform_eq_ne(node) + left, right = self._maybe_downcast_constants(left, right) + return self._maybe_evaluate_binop(op, op_class, left, right) + + def visit_Div(self, node, **kwargs): + return lambda lhs, rhs: Div(lhs, rhs) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs): + return self.term_type(node.id, self.env, **kwargs) + + def visit_NameConstant(self, node, **kwargs): + return self.const_type(node.value, self.env) + + def visit_Num(self, node, **kwargs): + return self.const_type(node.n, self.env) + + def visit_Constant(self, node, **kwargs): + return self.const_type(node.n, self.env) + + def visit_Str(self, node, **kwargs): + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs): + name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + 
+ def visit_Index(self, node, **kwargs): + """ df.index[4] """ + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs): + import pandas as pd + + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd.eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd.eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs): + """ df.index[slice(4,6)] """ + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + + if len(node.targets) != 1: + raise SyntaxError("can only assign a single expression") + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError("left hand side of an assignment must be a single name") + if self.env.target is None: + raise ValueError("cannot assign without a target object") + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, "name", assigner) + if self.assigner is None: + raise SyntaxError( + "left hand side of an assignment must be a single resolvable name" + ) + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError(f"Invalid Attribute context {ctx.__name__}") + + def visit_Call(self, node, side=None, **kwargs): + + if isinstance(node.func, ast.Attribute): + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + try: + res = self.visit(node.func) + except UndefinedVariableError: + # Check if this is a supported function name + try: + res = FuncNode(node.func.id) + except ValueError: + # Raise original error + raise + + if res is None: + raise ValueError(f"Invalid function call {node.func.id}") + if hasattr(res, "value"): + res = res.value + + if isinstance(res, FuncNode): + + new_args = [self.visit(arg) for arg in node.args] + + if node.keywords: + raise TypeError( + f'Function "{res.name}" does not support keyword arguments' + ) + + return res(*new_args, **kwargs) + + else: + + new_args = [self.visit(arg).value for arg in node.args] + + for key in node.keywords: + if not isinstance(key, ast.keyword): + raise ValueError(f"keyword error in function call '{node.func.id}'") + + if key.arg: + kwargs[key.arg] = self.visit(key.value).value + + return self.const_type(res(*new_args, **kwargs), self.env) + + def translate_In(self, op): + return op + + def 
visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) + return self._maybe_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) +_numexpr_supported_calls = frozenset(_reductions + _mathops) + + +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) +class PandasExprVisitor(BaseExprVisitor): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), + ), + ): + super().__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) +class PythonExprVisitor(BaseExprVisitor): + def __init__(self, env, engine, parser, preparser=lambda x: x): + super().__init__(env, engine, parser, preparser=preparser) + + +class Expr: + """ + Object encapsulating an expression. 
+ + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + level : int, optional, default 2 + """ + + env: Scope + engine: str + parser: str + + def __init__( + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Optional[Scope] = None, + level: int = 0, + ): + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, "assigner", None) + + def __call__(self): + return self.terms(self.env) + + def __repr__(self) -> str: + return printing.pprint_thing(self.terms) + + def __len__(self) -> int: + return len(self.expr) + + def parse(self): + """Parse an expression""" + return self._visitor.visit(self.expr) + + @property + def names(self): + """Get the names in an expression""" + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/venv/Lib/site-packages/pandas/core/computation/expressions.py b/venv/Lib/site-packages/pandas/core/computation/expressions.py new file mode 100644 index 0000000..7e95988 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/expressions.py @@ -0,0 +1,252 @@ +""" +Expressions +----------- + +Offer fast expression evaluation through numexpr + +""" + +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs.lib import values_from_object + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas.core.computation.check import _NUMEXPR_INSTALLED + +if _NUMEXPR_INSTALLED: + import numexpr as ne + +_TEST_MODE = None +_TEST_RESULT = None +_USE_NUMEXPR = _NUMEXPR_INSTALLED +_evaluate = None +_where = None + +# the set of dtypes that we will allow pass to numexpr +_ALLOWED_DTYPES = { + "evaluate": {"int64", "int32", "float64", "float32", "bool"}, + "where": {"int64", "float64", "bool"}, +} + +# the minimum prod shape that we will use numexpr +_MIN_ELEMENTS = 10000 + + +def set_use_numexpr(v=True): + # set/unset to use numexpr + global _USE_NUMEXPR + if _NUMEXPR_INSTALLED: + _USE_NUMEXPR = v + + # choose what we are going to do + global _evaluate, _where + if not _USE_NUMEXPR: + _evaluate = _evaluate_standard + _where = _where_standard + else: + _evaluate = _evaluate_numexpr + _where = _where_numexpr + + +def set_numexpr_threads(n=None): + # if we are using numexpr, set the threads to n + # otherwise reset + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) + + +def _evaluate_standard(op, op_str, a, b): + """ standard evaluation """ + if _TEST_MODE: + _store_test_result(False) + with np.errstate(all="ignore"): + return op(a, b) + + +def _can_use_numexpr(op, op_str, a, b, dtype_check): + """ return a boolean if we WILL be using numexpr """ + if op_str is not None: + + # required min elements (otherwise we are adding overhead) + if np.prod(a.shape) > _MIN_ELEMENTS: + # check for dtype compatibility + dtypes = set() + for o in [a, b]: + # Series implements dtypes, check for dimension count as well + if hasattr(o, "dtypes") and o.ndim > 1: + s = o.dtypes.value_counts() + if len(s) > 1: + return False + dtypes |= set(s.index.astype(str)) + # ndarray and 
Series Case + elif hasattr(o, "dtype"): + dtypes |= {o.dtype.name} + + # allowed are a superset + if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes: + return True + + return False + + +def _evaluate_numexpr(op, op_str, a, b): + result = None + + if _can_use_numexpr(op, op_str, a, b, "evaluate"): + is_reversed = op.__name__.strip("_").startswith("r") + if is_reversed: + # we were originally called by a reversed op method + a, b = b, a + + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + + result = ne.evaluate( + f"a_value {op_str} b_value", + local_dict={"a_value": a_value, "b_value": b_value}, + casting="safe", + ) + + if _TEST_MODE: + _store_test_result(result is not None) + + if result is None: + result = _evaluate_standard(op, op_str, a, b) + + return result + + +def _where_standard(cond, a, b): + return np.where( + values_from_object(cond), values_from_object(a), values_from_object(b) + ) + + +def _where_numexpr(cond, a, b): + result = None + + if _can_use_numexpr(None, "where", a, b, "where"): + cond_value = getattr(cond, "values", cond) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + + result = ne.evaluate( + "where(cond_value, a_value, b_value)", + local_dict={ + "cond_value": cond_value, + "a_value": a_value, + "b_value": b_value, + }, + casting="safe", + ) + + if result is None: + result = _where_standard(cond, a, b) + + return result + + +# turn myself on +set_use_numexpr(get_option("compute.use_numexpr")) + + +def _has_bool_dtype(x): + if isinstance(x, ABCDataFrame): + return "bool" in x.dtypes + try: + return x.dtype == bool + except AttributeError: + return isinstance(x, (bool, np.bool_)) + + +def _bool_arith_check( + op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None +): + if unsupported is None: + unsupported = {"+": "|", "*": "&", "-": "^"} + + if _has_bool_dtype(a) and _has_bool_dtype(b): + if op_str in unsupported: + warnings.warn( + f"evaluating in Python space because the {repr(op_str)} " + f"operator is not supported by numexpr for " + f"the bool dtype, use {repr(unsupported[op_str])} instead" + ) + return False + + if op_str in not_allowed: + raise NotImplementedError( + f"operator {repr(op_str)} not implemented for bool dtypes" + ) + return True + + +def evaluate(op, op_str, a, b, use_numexpr=True): + """ + Evaluate and return the expression of the op on a and b. + + Parameters + ---------- + op : the actual operand + op_str : str + The string version of the op. + a : left operand + b : right operand + use_numexpr : bool, default True + Whether to try to use numexpr. + """ + + use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) + if use_numexpr: + return _evaluate(op, op_str, a, b) + return _evaluate_standard(op, op_str, a, b) + + +def where(cond, a, b, use_numexpr=True): + """ + Evaluate the where condition cond on a and b. + + Parameters + ---------- + cond : np.ndarray[bool] + a : return if cond is True + b : return if cond is False + use_numexpr : bool, default True + Whether to try to use numexpr. + """ + + if use_numexpr: + return _where(cond, a, b) + return _where_standard(cond, a, b) + + +def set_test_mode(v=True): + """ + Keeps track of whether numexpr was used. 
Stores an additional ``True`` + for every successful use of evaluate with numexpr since the last + ``get_test_result`` + """ + global _TEST_MODE, _TEST_RESULT + _TEST_MODE = v + _TEST_RESULT = [] + + +def _store_test_result(used_numexpr): + global _TEST_RESULT + if used_numexpr: + _TEST_RESULT.append(used_numexpr) + + +def get_test_result(): + """get test result and reset test_results""" + global _TEST_RESULT + res = _TEST_RESULT + _TEST_RESULT = [] + return res diff --git a/venv/Lib/site-packages/pandas/core/computation/ops.py b/venv/Lib/site-packages/pandas/core/computation/ops.py new file mode 100644 index 0000000..cb166ba --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/ops.py @@ -0,0 +1,601 @@ +"""Operator classes for eval. +""" + +from datetime import datetime +from distutils.version import LooseVersion +from functools import partial +import operator + +import numpy as np + +from pandas._libs.tslibs import Timestamp + +from pandas.core.dtypes.common import is_list_like, is_scalar + +import pandas.core.common as com +from pandas.core.computation.common import _ensure_decoded, result_type_many +from pandas.core.computation.scope import _DEFAULT_GLOBALS + +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + +_reductions = ("sum", "prod") + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) + +_mathops = _unary_math_ops + _binary_math_ops + + +_LOCAL_TAG = "__pd_eval_local_" + + +class UndefinedVariableError(NameError): + """ + NameError subclass for local variables. + """ + + def __init__(self, name, is_local: bool): + base_msg = f"{repr(name)} is not defined" + if is_local: + msg = f"local variable {base_msg}" + else: + msg = f"name {base_msg}" + super().__init__(msg) + + +class Term: + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + supr_new = super(Term, klass).__new__ + return supr_new(klass) + + is_local: bool + + def __init__(self, name, env, side=None, encoding=None): + # name is a str for Term, but may be something else for subclasses + self._name = name + self.env = env + self.side = side + tname = str(name) + self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self) -> str: + return self.name.replace(_LOCAL_TAG, "") + + def __repr__(self) -> str: + return pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs): + return self + + def _resolve_name(self): + res = self.env.resolve(self.local_name, is_local=self.is_local) + self.update(res) + + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2, are not supported with eval" + ) + return res + + def update(self, value): + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, str): + self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def is_scalar(self) -> bool: + return is_scalar(self._value) + + 
@property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + + @property + def is_datetime(self) -> bool: + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value): + self._value = new_value + + @property + def name(self): + return self._name + + @property + def ndim(self) -> int: + return self._value.ndim + + +class Constant(Term): + def __init__(self, value, env, side=None, encoding=None): + super().__init__(value, env, side=side, encoding=encoding) + + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + def __repr__(self) -> str: + # in python 2 str() of float + # can truncate shorter than repr() + return repr(self.name) + + +_bool_op_map = {"not": "~", "and": "&", "or": "|"} + + +class Op: + """ + Hold an operator of arbitrary arity. + """ + + op: str + + def __init__(self, op: str, operands, *args, **kwargs): + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = kwargs.get("encoding", None) + + def __iter__(self): + return iter(self.operands) + + def __repr__(self) -> str: + """ + Print a generic n-ary operator and its operands using infix notation. + """ + # recurse over the operands + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (_cmp_ops_syms + _bool_ops_syms): + return np.bool_ + return result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self) -> bool: + types = self.operand_types + obj_dtype_set = frozenset([np.dtype("object")]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def is_scalar(self) -> bool: + return all(operand.is_scalar for operand in self.operands) + + @property + def is_datetime(self) -> bool: + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. 
+ """ + try: + return ~x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +_cmp_ops_syms = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_funcs = ( + operator.gt, + operator.lt, + operator.ge, + operator.le, + operator.eq, + operator.ne, + _in, + _not_in, +) +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + +_bool_ops_syms = ("&", "|", "and", "or") +_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_funcs = ( + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.pow, + operator.floordiv, + operator.mod, +) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_special_case_arith_ops_syms = ("**", "//", "%") +_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) +_special_case_arith_ops_dict = dict( + zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) +) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, acceptable_dtypes, dtype): + """ + Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + acceptable_dtypes : list of acceptable numpy.dtype + Will not cast if term's dtype in this list. + dtype : str or numpy.dtype + The dtype to cast to. + """ + dt = np.dtype(dtype) + for term in terms: + if term.type in acceptable_dtypes: + continue + + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj) -> bool: + return isinstance(obj, Term) + + +class BinOp(Op): + """ + Hold a binary operator and its operands. + + Parameters + ---------- + op : str + left : Term or Op + right : Term or Op + """ + + def __init__(self, op: str, lhs, rhs, **kwargs): + super().__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError( + f"Invalid binary operator {repr(op)}, valid operators are {keys}" + ) + + def __call__(self, env): + """ + Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): + """ + Evaluate a binary operation *before* being passed to the engine. 
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == "python": + res = self(env) + else: + # recurse over the left/right nodes + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + from pandas.core.computation.eval import eval + + res = eval(self, local_dict=env, engine=engine, parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self): + """Convert datetimes to a comparable value in an expression. + """ + + def stringify(value): + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(_ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(_ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + if ( + (self.lhs.is_scalar or self.rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(self.rhs.return_type, (bool, np.bool_)) + and issubclass(self.lhs.return_type, (bool, np.bool_)) + ) + ) + ): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype) -> bool: + return issubclass(np.dtype(dtype).type, np.number) + + +class Div(BinOp): + """ + Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + """ + + def __init__(self, lhs, rhs, **kwargs): + super().__init__("/", lhs, rhs, **kwargs) + + if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + raise TypeError( + f"unsupported operand type(s) for {self.op}: " + f"'{lhs.return_type}' and '{rhs.return_type}'" + ) + + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float_] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) + + +_unary_ops_syms = ("+", "-", "~", "not") +_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + """ + Hold a unary operator and its operands. + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. 
+ """ + + def __init__(self, op: str, operand): + super().__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise ValueError( + f"Invalid unary operator {repr(op)}, " + f"valid operators are {_unary_ops_syms}" + ) + + def __call__(self, env): + operand = self.operand(env) + return self.func(operand) + + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") + + @property + def return_type(self) -> np.dtype: + operand = self.operand + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") + + +class MathCall(Op): + def __init__(self, func, args): + super().__init__(func.name, args) + self.func = func + + def __call__(self, env): + operands = [op(env) for op in self.operands] + with np.errstate(all="ignore"): + return self.func.func(*operands) + + def __repr__(self) -> str: + operands = map(str, self.operands) + return pprint_thing(f"{self.op}({','.join(operands)})") + + +class FuncNode: + def __init__(self, name: str): + from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + + if name not in _mathops or ( + _NUMEXPR_INSTALLED + and _NUMEXPR_VERSION < LooseVersion("2.6.9") + and name in ("floor", "ceil") + ): + raise ValueError(f'"{name}" is not a supported function') + + self.name = name + self.func = getattr(np, name) + + def __call__(self, *args): + return MathCall(self, args) diff --git a/venv/Lib/site-packages/pandas/core/computation/parsing.py b/venv/Lib/site-packages/pandas/core/computation/parsing.py new file mode 100644 index 0000000..ce213c8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/parsing.py @@ -0,0 +1,190 @@ +""":func:`~pandas.eval` source string parsing functions +""" + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import Iterator, Tuple + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + than terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # toke.tok_name contains a readable description of the replacement string. + special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + # The ignore here is because of a bug in mypy that is resolved in 0.740 + for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. 
Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join(special_characters_replacements.get(char, char) for char in name) + name = "BACKTICK_QUOTED_STRING_" + name + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: str) -> str: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed a Python code + inside a backtick quoted string and than being cleaned + (removed of any special characters). + + Parameters + ---------- + name : str + Name to be cleaned. + + Returns + ------- + name : str + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not catched and propogates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> Tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). 
+ """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") + else: + yield toknum, tokval diff --git a/venv/Lib/site-packages/pandas/core/computation/pytables.py b/venv/Lib/site-packages/pandas/core/computation/pytables.py new file mode 100644 index 0000000..be652ca --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/pytables.py @@ -0,0 +1,624 @@ +""" manage PyTables query interface via Expressions """ + +import ast +from functools import partial +from typing import Any, Dict, Optional, Tuple + +import numpy as np + +from pandas._libs.tslibs import Timedelta, Timestamp +from pandas.compat.chainmap import DeepChainMap + +from pandas.core.dtypes.common import is_list_like + +import pandas as pd +import pandas.core.common as com +from pandas.core.computation import expr, ops, scope as _scope +from pandas.core.computation.common import _ensure_decoded +from pandas.core.computation.expr import BaseExprVisitor +from pandas.core.computation.ops import UndefinedVariableError, is_term + +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + + +class PyTablesScope(_scope.Scope): + __slots__ = ("queryables",) + + queryables: Dict[str, Any] + + def __init__( + self, + level: int, + global_dict=None, + local_dict=None, + queryables: Optional[Dict[str, Any]] = None, + ): + super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) + self.queryables = queryables or dict() + + +class Term(ops.Term): + env: PyTablesScope + + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + return object.__new__(klass) + + def __init__(self, name, env: PyTablesScope, side=None, encoding=None): + super().__init__(name, env, side=side, encoding=encoding) + + def _resolve_name(self): + # must be a queryables + if self.side == "left": + # Note: The behavior of __new__ ensures that self.name is a str here + if self.name not in self.env.queryables: + raise NameError(f"name {repr(self.name)} is not defined") + return self.name + + # resolve the rhs (and allow it to be None) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name + + # read-only property overwriting read/write property + @property # type: ignore + def value(self): + return self._value + + +class Constant(Term): + def __init__(self, value, env: PyTablesScope, side=None, encoding=None): + assert isinstance(env, PyTablesScope), type(env) + super().__init__(value, env, side=side, encoding=encoding) + + def _resolve_name(self): + return self._name + + +class BinOp(ops.BinOp): + + _max_selectors = 31 + + op: str + queryables: Dict[str, Any] + + def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): + super().__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.condition = None + + def _disallow_scalar_only_bool_ops(self): + pass + + def prune(self, klass): + def pr(left, right): + """ create and return a new specialized BinOp from myself """ + + if left is None: + return right + elif right is None: + 
return left + + k = klass + if isinstance(left, ConditionBinOp): + if isinstance(right, ConditionBinOp): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if isinstance(right, FilterBinOp): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + return k( + self.op, left, right, queryables=self.queryables, encoding=self.encoding + ).evaluate() + + left, right = self.lhs, self.rhs + + if is_term(left) and is_term(right): + res = pr(left.value, right.value) + elif not is_term(left) and is_term(right): + res = pr(left.prune(klass), right.value) + elif is_term(left) and not is_term(right): + res = pr(left.value, right.prune(klass)) + elif not (is_term(left) or is_term(right)): + res = pr(left.prune(klass), right.prune(klass)) + + return res + + def conform(self, rhs): + """ inplace conform rhs """ + if not is_list_like(rhs): + rhs = [rhs] + if isinstance(rhs, np.ndarray): + rhs = rhs.ravel() + return rhs + + @property + def is_valid(self) -> bool: + """ return True if this is a valid field """ + return self.lhs in self.queryables + + @property + def is_in_table(self) -> bool: + """ return True if this is a valid column name for generation (e.g. an + actual column in the table) """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """ the kind of my field """ + return getattr(self.queryables.get(self.lhs), "kind", None) + + @property + def meta(self): + """ the meta of my field """ + return getattr(self.queryables.get(self.lhs), "meta", None) + + @property + def metadata(self): + """ the metadata of my field """ + return getattr(self.queryables.get(self.lhs), "metadata", None) + + def generate(self, v) -> str: + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return f"({self.lhs} {self.op} {val})" + + def convert_value(self, v) -> "TermValue": + """ convert the expression that is in the term to something that is + accepted by pytables """ + + def stringify(value): + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + kind = _ensure_decoded(self.kind) + meta = _ensure_decoded(self.meta) + if kind == "datetime64" or kind == "datetime": + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = Timestamp(v) + if v.tz is not None: + v = v.tz_convert("UTC") + return TermValue(v, v.value, kind) + elif kind == "timedelta64" or kind == "timedelta": + v = Timedelta(v, unit="s").value + return TermValue(int(v), v, kind) + elif meta == "category": + metadata = com.values_from_object(self.metadata) + result = metadata.searchsorted(v, side="left") + + # result returns 0 if v is first element or if v is not in metadata + # check that metadata contains v + if not result and v not in metadata: + result = -1 + return TermValue(result, result, "integer") + elif kind == "integer": + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == "float": + v = float(v) + return TermValue(v, v, kind) + elif kind == "bool": + if isinstance(v, str): + v = not v.strip().lower() in [ + "false", + "f", + "no", + "n", + "none", + "0", + "[]", + "{}", + "", + ] + else: + v = bool(v) + return TermValue(v, v, kind) + elif isinstance(v, str): + # string quoting + return TermValue(v, stringify(v), "string") + else: + raise TypeError(f"Cannot compare {v} of 
type {type(v)} to {kind} column") + + def convert_values(self): + pass + + +class FilterBinOp(BinOp): + filter: Optional[Tuple[Any, Any, pd.Index]] = None + + def __repr__(self) -> str: + if self.filter is None: + return "Filter: Not Initialized" + return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") + + def invert(self): + """ invert the filter """ + if self.filter is not None: + f = list(self.filter) + f[1] = self.generate_filter_op(invert=True) + self.filter = tuple(f) + return self + + def format(self): + """ return the actual filter format """ + return [self.filter] + + def evaluate(self): + + if not self.is_valid: + raise ValueError(f"query term is not valid [{self}]") + + rhs = self.conform(self.rhs) + values = list(rhs) + + if self.is_in_table: + + # if too many values to create the expression, use a filter instead + if self.op in ["==", "!="] and len(values) > self._max_selectors: + + filter_op = self.generate_filter_op() + self.filter = (self.lhs, filter_op, pd.Index(values)) + + return self + return None + + # equality conditions + if self.op in ["==", "!="]: + + filter_op = self.generate_filter_op() + self.filter = (self.lhs, filter_op, pd.Index(values)) + + else: + raise TypeError( + f"passing a filterable condition to a non-table indexer [{self}]" + ) + + return self + + def generate_filter_op(self, invert: bool = False): + if (self.op == "!=" and not invert) or (self.op == "==" and invert): + return lambda axis, vals: ~axis.isin(vals) + else: + return lambda axis, vals: axis.isin(vals) + + +class JointFilterBinOp(FilterBinOp): + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + + def evaluate(self): + return self + + +class ConditionBinOp(BinOp): + def __repr__(self) -> str: + return pprint_thing(f"[Condition : [{self.condition}]]") + + def invert(self): + """ invert the condition """ + # if self.condition is not None: + # self.condition = "~(%s)" % self.condition + # return self + raise NotImplementedError( + "cannot use an invert condition when passing to numexpr" + ) + + def format(self): + """ return the actual ne format """ + return self.condition + + def evaluate(self): + + if not self.is_valid: + raise ValueError(f"query term is not valid [{self}]") + + # convert values if we are in the table + if not self.is_in_table: + return None + + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] + + # equality conditions + if self.op in ["==", "!="]: + + # too many values to create the expression? 
+ if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = f"({' | '.join(vs)})" + + # use a filter after reading + else: + return None + else: + self.condition = self.generate(values[0]) + + return self + + +class JointConditionBinOp(ConditionBinOp): + def evaluate(self): + self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})" + return self + + +class UnaryOp(ops.UnaryOp): + def prune(self, klass): + + if self.op != "~": + raise NotImplementedError("UnaryOp only support invert type ops") + + operand = self.operand + operand = operand.prune(klass) + + if operand is not None: + if issubclass(klass, ConditionBinOp): + if operand.condition is not None: + return operand.invert() + elif issubclass(klass, FilterBinOp): + if operand.filter is not None: + return operand.invert() + + return None + + +class PyTablesExprVisitor(BaseExprVisitor): + const_type = Constant + term_type = Term + + def __init__(self, env, engine, parser, **kwargs): + super().__init__(env, engine, parser) + for bin_op in self.binary_ops: + bin_node = self.binary_op_nodes_map[bin_op] + setattr( + self, + f"visit_{bin_node}", + lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), + ) + + def visit_UnaryOp(self, node, **kwargs): + if isinstance(node.op, (ast.Not, ast.Invert)): + return UnaryOp("~", self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return self.const_type(-self.visit(node.operand).value, self.env) + elif isinstance(node.op, ast.UAdd): + raise NotImplementedError("Unary addition not supported") + + def visit_Index(self, node, **kwargs): + return self.visit(node.value).value + + def visit_Assign(self, node, **kwargs): + cmpr = ast.Compare( + ops=[ast.Eq()], left=node.targets[0], comparators=[node.value] + ) + return self.visit(cmpr) + + def visit_Subscript(self, node, **kwargs): + # only allow simple subscripts + + value = self.visit(node.value) + slobj = self.visit(node.slice) + try: + value = value.value + except AttributeError: + pass + + try: + return self.const_type(value[slobj], self.env) + except TypeError: + raise ValueError(f"cannot subscript {repr(value)} with {repr(slobj)}") + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = type(node.ctx) + if ctx == ast.Load: + # resolve the value + resolved = self.visit(value) + + # try to get the value to see if we are another expression + try: + resolved = resolved.value + except (AttributeError): + pass + + try: + return self.term_type(getattr(resolved, attr), self.env) + except AttributeError: + + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError(f"Invalid Attribute context {ctx.__name__}") + + def translate_In(self, op): + return ast.Eq() if isinstance(op, ast.In) else op + + def _rewrite_membership_op(self, node, left, right): + return self.visit(node.op), node.op, left, right + + +def _validate_where(w): + """ + Validate that the where statement is of the right type. + + The type may either be String, Expr, or list-like of Exprs. + + Parameters + ---------- + w : String term expression, Expr, or list-like of Exprs. + + Returns + ------- + where : The original where clause if the check was successful. + + Raises + ------ + TypeError : An invalid data type was passed in for w (e.g. dict). 
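# Illustrative usage sketch (separate from the vendored pandas sources above, and a
# hedged example rather than part of the patch): ConditionBinOp/FilterBinOp and
# _validate_where govern how a `where` clause passed to HDFStore.select() is split into
# conditions evaluated inside PyTables and filters applied after the read. Requires the
# optional `tables` (PyTables) dependency; the file name below is arbitrary.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(10), "B": list("abcdefghij")})
with pd.HDFStore("example_store.h5", mode="w") as store:
    store.put("df", df, format="table", data_columns=True)
    # a simple comparison compiles into a numexpr condition evaluated by PyTables
    hits = store.select("df", where="A > 5")
    # a membership test on a data column becomes OR-ed equality conditions; with more
    # than _max_selectors (31) values it falls back to an isin() filter after the read
    rows = store.select("df", where="B == ['a', 'c', 'e']")
    # a list of term strings is also a valid `where`; the terms are AND-ed together
    both = store.select("df", where=["A > 2", "B == ['d', 'e', 'f']"])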
+ """ + + if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)): + raise TypeError( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) + + return w + + +class PyTablesExpr(expr.Expr): + """ + Hold a pytables-like expression, comprised of possibly multiple 'terms'. + + Parameters + ---------- + where : string term expression, PyTablesExpr, or list-like of PyTablesExprs + queryables : a "kinds" map (dict of column name -> kind), or None if column + is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + a PyTablesExpr object + + Examples + -------- + + 'index>=date' + "columns=['A', 'D']" + 'columns=A' + 'columns==A' + "~(columns=['A','B'])" + 'index>df.index[3] & string="bar"' + '(index>df.index[3] & index<=df.index[6]) | string="bar"' + "ts>=Timestamp('2012-02-01')" + "major_axis>=20130101" + """ + + _visitor: Optional[PyTablesExprVisitor] + env: PyTablesScope + + def __init__( + self, + where, + queryables: Optional[Dict[str, Any]] = None, + encoding=None, + scope_level: int = 0, + ): + + where = _validate_where(where) + + self.encoding = encoding + self.condition = None + self.filter = None + self.terms = None + self._visitor = None + + # capture the environment if needed + local_dict: DeepChainMap[Any, Any] = DeepChainMap() + + if isinstance(where, PyTablesExpr): + local_dict = where.env.scope + _where = where.expr + + elif isinstance(where, (list, tuple)): + where = list(where) + for idx, w in enumerate(where): + if isinstance(w, PyTablesExpr): + local_dict = w.env.scope + else: + w = _validate_where(w) + where[idx] = w + _where = " & ".join((f"({w})" for w in com.flatten(where))) + else: + _where = where + + self.expr = _where + self.env = PyTablesScope(scope_level + 1, local_dict=local_dict) + + if queryables is not None and isinstance(self.expr, str): + self.env.queryables.update(queryables) + self._visitor = PyTablesExprVisitor( + self.env, + queryables=queryables, + parser="pytables", + engine="pytables", + encoding=encoding, + ) + self.terms = self.parse() + + def __repr__(self) -> str: + if self.terms is not None: + return pprint_thing(self.terms) + return pprint_thing(self.expr) + + def evaluate(self): + """ create and return the numexpr condition and filter """ + + try: + self.condition = self.terms.prune(ConditionBinOp) + except AttributeError: + raise ValueError( + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid condition" + ) + try: + self.filter = self.terms.prune(FilterBinOp) + except AttributeError: + raise ValueError( + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid filter" + ) + + return self.condition, self.filter + + +class TermValue: + """ hold a term value the we use to construct a condition/filter """ + + def __init__(self, value, converted, kind: str): + assert isinstance(kind, str), kind + self.value = value + self.converted = converted + self.kind = kind + + def tostring(self, encoding) -> str: + """ quote the string if not encoded + else encode and return """ + if self.kind == "string": + if encoding is not None: + return str(self.converted) + return f'"{self.converted}"' + elif self.kind == "float": + # python 2 str(float) is not always + # round-trippable so use repr() + return repr(self.converted) + return str(self.converted) + + +def maybe_expression(s) -> bool: + """ loose checking if s is a pytables-acceptable expression """ + if not isinstance(s, str): + return False + ops = PyTablesExprVisitor.binary_ops + 
PyTablesExprVisitor.unary_ops + ("=",) + + # make sure we have an op at least + return any(op in s for op in ops) diff --git a/venv/Lib/site-packages/pandas/core/computation/scope.py b/venv/Lib/site-packages/pandas/core/computation/scope.py new file mode 100644 index 0000000..70dcf4d --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/computation/scope.py @@ -0,0 +1,314 @@ +""" +Module for scope operations +""" + +import datetime +import inspect +from io import StringIO +import itertools +import pprint +import struct +import sys +from typing import List + +import numpy as np + +from pandas._libs.tslibs import Timestamp +from pandas.compat.chainmap import DeepChainMap + + +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs +) -> "Scope": + """Ensure that we are grabbing the correct scope.""" + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + +def _replacer(x) -> str: + """Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. + """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj) -> str: + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack("@P", id(obj)) + return "".join(_replacer(x) for x in packed) + + +_DEFAULT_GLOBALS = { + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, +} + + +def _get_pretty_string(obj) -> str: + """ + Return a prettier version of obj. + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. 
+ + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + + __slots__ = ["level", "scope", "target", "resolvers", "temps"] + + def __init__( + self, level, global_dict=None, local_dict=None, resolvers=(), target=None + ): + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self._update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + self.scope = self.scope.new_child((global_dict or frame.f_globals).copy()) + if not isinstance(local_dict, Scope): + self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __repr__(self) -> str: + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + unicode_str = f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + return unicode_str + + @property + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. + + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key: str, is_local: bool): + """ + Resolve a variable name in a possibly local context. + + Parameters + ---------- + key : str + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError: + # runtime import because ops imports from scope + from pandas.core.computation.ops import UndefinedVariableError + + raise UndefinedVariableError(key, is_local) + + def swapkey(self, old_key: str, new_key: str, new_value=None): + """ + Replace a variable name, with a potentially new value. 
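# Illustrative usage sketch (not part of the vendored pandas sources above): Scope.resolve()
# is what lets DataFrame.query()/eval() see names from the calling frame. '@'-prefixed names
# are looked up as locals, bare names go through resolvers (e.g. DataFrame columns) and then
# the captured globals/locals. Column and variable names below are invented for illustration.
import pandas as pd

df = pd.DataFrame({"price": [9.5, 12.0, 20.0]})
threshold = 10
cheap = df.query("price < @threshold")   # 'price' -> column resolver, '@threshold' -> local variable
doubled = df.eval("price * 2")           # bare column names resolve the same way in eval()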
+ + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes: List[str]): + """ + Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, "f_" + scope) + self.scope = self.scope.new_child(d) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def _update(self, level: int): + """ + Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=["locals"]) + finally: + del stack[:], stack + + def add_tmp(self, value) -> str: + """ + Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + str + The name of the temporary variable created. + """ + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self) -> int: + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self): + """ + Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. + """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/venv/Lib/site-packages/pandas/core/config_init.py b/venv/Lib/site-packages/pandas/core/config_init.py new file mode 100644 index 0000000..eb15873 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/config_init.py @@ -0,0 +1,635 @@ +""" +This module is imported from the pandas package __init__.py file +in order to ensure that the core.config options registered here will +be available as soon as the user loads the package. if register_option +is invoked inside specific modules, they will not be registered until that +module is imported, which may or may not be a problem. + +If you need to make sure options are available even before a certain +module is imported, register them here rather then in the module. 
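# Illustrative usage sketch (not part of the vendored pandas sources above): because
# config_init.py runs during `import pandas`, every option registered in it is available
# immediately through the public option API. The compute.* options used here are the ones
# defined just below in this module.
import pandas as pd

pd.describe_option("compute.use_numexpr")           # print the docstring registered for the option
current = pd.get_option("compute.use_bottleneck")   # read the current value
pd.set_option("compute.use_numexpr", False)         # triggers the use_numexpr_cb callback
pd.reset_option("compute.use_numexpr")              # restore the registered default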
+ +""" +import pandas._config.config as cf +from pandas._config.config import ( + is_bool, + is_callable, + is_instance_factory, + is_int, + is_nonnegative_int, + is_one_of_factory, + is_text, +) + +# compute + +use_bottleneck_doc = """ +: bool + Use the bottleneck library to accelerate if it is installed, + the default is True + Valid values: False,True +""" + + +def use_bottleneck_cb(key): + from pandas.core import nanops + + nanops.set_use_bottleneck(cf.get_option(key)) + + +use_numexpr_doc = """ +: bool + Use the numexpr library to accelerate computation if it is installed, + the default is True + Valid values: False,True +""" + + +def use_numexpr_cb(key): + from pandas.core.computation import expressions + + expressions.set_use_numexpr(cf.get_option(key)) + + +with cf.config_prefix("compute"): + cf.register_option( + "use_bottleneck", + True, + use_bottleneck_doc, + validator=is_bool, + cb=use_bottleneck_cb, + ) + cf.register_option( + "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb + ) +# +# options from the "display" namespace + +pc_precision_doc = """ +: int + Floating point output precision (number of significant digits). This is + only a suggestion +""" + +pc_colspace_doc = """ +: int + Default space for DataFrame columns. +""" + +pc_max_rows_doc = """ +: int + If max_rows is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 and pandas will auto-detect + the height of the terminal and print a truncated object which fits + the screen height. The IPython notebook, IPython qtconsole, or + IDLE do not run in a terminal and hence it is not possible to do + correct auto-detection. +""" + +pc_min_rows_doc = """ +: int + The numbers of rows to show in a truncated view (when `max_rows` is + exceeded). Ignored when `max_rows` is set to None or 0. When set to + None, follows the value of `max_rows`. +""" + +pc_max_cols_doc = """ +: int + If max_cols is exceeded, switch to truncate view. Depending on + `large_repr`, objects are either centrally truncated or printed as + a summary view. 'None' value means unlimited. + + In case python/IPython is running in a terminal and `large_repr` + equals 'truncate' this can be set to 0 and pandas will auto-detect + the width of the terminal and print a truncated object which fits + the screen width. The IPython notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence it is not possible to do + correct auto-detection. +""" + +pc_max_categories_doc = """ +: int + This sets the maximum number of categories pandas should output when + printing out a `Categorical` or a Series of dtype "category". +""" + +pc_max_info_cols_doc = """ +: int + max_info_columns is used in DataFrame.info method to decide if + per column information will be printed. +""" + +pc_nb_repr_h_doc = """ +: boolean + When True, IPython notebook will use html representation for + pandas objects (if it is available). +""" + +pc_pprint_nest_depth = """ +: int + Controls the number of nested levels to process when pretty-printing +""" + +pc_multi_sparse_doc = """ +: boolean + "sparsify" MultiIndex display (don't display repeated + elements in outer levels within groups) +""" + +float_format_doc = """ +: callable + The callable should accept a floating point number and return + a string with the desired format of the number. 
This is used + in some places like SeriesFormatter. + See formats.format.EngFormatter for an example. +""" + +max_colwidth_doc = """ +: int or None + The maximum width in characters of a column in the repr of + a pandas data structure. When the column overflows, a "..." + placeholder is embedded in the output. A 'None' value means unlimited. +""" + +colheader_justify_doc = """ +: 'left'/'right' + Controls the justification of column headers. used by DataFrameFormatter. +""" + +pc_expand_repr_doc = """ +: boolean + Whether to print out the full DataFrame repr for wide DataFrames across + multiple lines, `max_columns` is still respected, but the output will + wrap-around across multiple "pages" if its width exceeds `display.width`. +""" + +pc_show_dimensions_doc = """ +: boolean or 'truncate' + Whether to print out dimensions at the end of DataFrame repr. + If 'truncate' is specified, only print out the dimensions if the + frame is truncated (e.g. not display all rows and/or columns) +""" + +pc_east_asian_width_doc = """ +: boolean + Whether to use the Unicode East Asian Width to calculate the display text + width. + Enabling this may affect to the performance (default: False) +""" + +pc_ambiguous_as_wide_doc = """ +: boolean + Whether to handle Unicode characters belong to Ambiguous as Wide (width=2) + (default: False) +""" + +pc_latex_repr_doc = """ +: boolean + Whether to produce a latex DataFrame representation for jupyter + environments that support it. + (default: False) +""" + +pc_table_schema_doc = """ +: boolean + Whether to publish a Table Schema representation for frontends + that support it. + (default: False) +""" + +pc_html_border_doc = """ +: int + A ``border=value`` attribute is inserted in the ```` tag + for the DataFrame HTML repr. +""" + +pc_html_use_mathjax_doc = """\ +: boolean + When True, Jupyter notebook will process table contents using MathJax, + rendering mathematical expressions enclosed by the dollar symbol. + (default: True) +""" + +pc_width_doc = """ +: int + Width of the display in characters. In case python/IPython is running in + a terminal this can be set to None and pandas will correctly auto-detect + the width. + Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible to correctly detect the width. +""" + +pc_chop_threshold_doc = """ +: float or None + if set to a float value, all float values smaller then the given threshold + will be displayed as exactly 0 by repr and friends. +""" + +pc_max_seq_items = """ +: int or None + when pretty-printing a long sequence, no more then `max_seq_items` + will be printed. If items are omitted, they will be denoted by the + addition of "..." to the resulting string. + + If set to None, the number of items to be printed is unlimited. +""" + +pc_max_info_rows_doc = """ +: int or None + df.info() will usually show null-counts for each column. + For large frames this can be quite slow. max_info_rows and max_info_cols + limit this null check only to frames with smaller dimensions than + specified. +""" + +pc_large_repr_doc = """ +: 'truncate'/'info' + For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can + show a truncated table (the default from 0.13), or switch to the view from + df.info() (the behaviour in earlier versions of pandas). +""" + +pc_memory_usage_doc = """ +: bool, string or None + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. 
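# Illustrative usage sketch (not part of the vendored pandas sources above): the display.*
# options documented here control repr truncation. A hedged example of scoping them
# temporarily with option_context; the sample frame is invented for illustration.
import pandas as pd

df = pd.DataFrame({"text": ["x" * 80] * 100, "value": range(100)})
with pd.option_context("display.max_rows", 6, "display.min_rows", 6,
                       "display.max_colwidth", 20):
    print(df)   # centrally truncated rows, long strings shortened with "..."
print(pd.get_option("display.max_rows"))  # back to the default outside the context manager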
Valid values True,False,'deep' +""" + +pc_latex_escape = """ +: bool + This specifies if the to_latex method of a Dataframe uses escapes special + characters. + Valid values: False,True +""" + +pc_latex_longtable = """ +:bool + This specifies if the to_latex method of a Dataframe uses the longtable + format. + Valid values: False,True +""" + +pc_latex_multicolumn = """ +: bool + This specifies if the to_latex method of a Dataframe uses multicolumns + to pretty-print MultiIndex columns. + Valid values: False,True +""" + +pc_latex_multicolumn_format = """ +: string + This specifies the format for multicolumn headers. + Can be surrounded with '|'. + Valid values: 'l', 'c', 'r', 'p{}' +""" + +pc_latex_multirow = """ +: bool + This specifies if the to_latex method of a Dataframe uses multirows + to pretty-print MultiIndex rows. + Valid values: False,True +""" + + +def table_schema_cb(key): + from pandas.io.formats.printing import _enable_data_resource_formatter + + _enable_data_resource_formatter(cf.get_option(key)) + + +def is_terminal() -> bool: + """ + Detect if Python is running in a terminal. + + Returns True if Python is running in a terminal or False if not. + """ + try: + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore + except NameError: # assume standard Python interpreter in a terminal + return True + else: + if hasattr(ip, "kernel"): # IPython as a Jupyter kernel + return False + else: # IPython in a terminal + return True + + +with cf.config_prefix("display"): + cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int) + cf.register_option( + "float_format", + None, + float_format_doc, + validator=is_one_of_factory([None, is_callable]), + ) + cf.register_option("column_space", 12, validator=is_int) + cf.register_option( + "max_info_rows", + 1690785, + pc_max_info_rows_doc, + validator=is_instance_factory((int, type(None))), + ) + cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) + cf.register_option( + "min_rows", + 10, + pc_min_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int) + cf.register_option( + "max_colwidth", 50, max_colwidth_doc, validator=is_nonnegative_int + ) + if is_terminal(): + max_cols = 0 # automatically determine optimal number of columns + else: + max_cols = 20 # cannot determine optimal number of columns + cf.register_option( + "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int + ) + cf.register_option( + "large_repr", + "truncate", + pc_large_repr_doc, + validator=is_one_of_factory(["truncate", "info"]), + ) + cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int) + cf.register_option( + "colheader_justify", "right", colheader_justify_doc, validator=is_text + ) + cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool) + cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int) + cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool) + cf.register_option("expand_frame_repr", True, pc_expand_repr_doc) + cf.register_option( + "show_dimensions", + "truncate", + pc_show_dimensions_doc, + validator=is_one_of_factory([True, False, "truncate"]), + ) + cf.register_option("chop_threshold", None, pc_chop_threshold_doc) + cf.register_option("max_seq_items", 100, pc_max_seq_items) + cf.register_option( + "width", 80, pc_width_doc, 
validator=is_instance_factory([type(None), int]) + ) + cf.register_option( + "memory_usage", + True, + pc_memory_usage_doc, + validator=is_one_of_factory([None, True, False, "deep"]), + ) + cf.register_option( + "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option( + "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option("latex.repr", False, pc_latex_repr_doc, validator=is_bool) + cf.register_option("latex.escape", True, pc_latex_escape, validator=is_bool) + cf.register_option("latex.longtable", False, pc_latex_longtable, validator=is_bool) + cf.register_option( + "latex.multicolumn", True, pc_latex_multicolumn, validator=is_bool + ) + cf.register_option( + "latex.multicolumn_format", "l", pc_latex_multicolumn, validator=is_text + ) + cf.register_option("latex.multirow", False, pc_latex_multirow, validator=is_bool) + cf.register_option( + "html.table_schema", + False, + pc_table_schema_doc, + validator=is_bool, + cb=table_schema_cb, + ) + cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int) + cf.register_option( + "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool + ) + +tc_sim_interactive_doc = """ +: boolean + Whether to simulate interactive mode for purposes of testing +""" + +with cf.config_prefix("mode"): + cf.register_option("sim_interactive", False, tc_sim_interactive_doc) + +use_inf_as_null_doc = """ +: boolean + use_inf_as_null had been deprecated and will be removed in a future + version. Use `use_inf_as_na` instead. +""" + +use_inf_as_na_doc = """ +: boolean + True means treat None, NaN, INF, -INF as NA (old way), + False means None and NaN are null, but INF, -INF are not NA + (new way). +""" + +# We don't want to start importing everything at the global context level +# or we'll hit circular deps. + + +def use_inf_as_na_cb(key): + from pandas.core.dtypes.missing import _use_inf_as_na + + _use_inf_as_na(key) + + +with cf.config_prefix("mode"): + cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb) + cf.register_option( + "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb + ) + +cf.deprecate_option( + "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na" +) + + +# user warnings +chained_assignment = """ +: string + Raise an exception, warn, or no action if trying to use chained assignment, + The default is warn +""" + +with cf.config_prefix("mode"): + cf.register_option( + "chained_assignment", + "warn", + chained_assignment, + validator=is_one_of_factory([None, "warn", "raise"]), + ) + + +# Set up the io.excel specific reader configuration. +reader_engine_doc = """ +: string + The default Excel reader engine for '{ext}' files. Available options: + auto, {others}. 
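# Illustrative usage sketch (not part of the vendored pandas sources above): the mode.*
# options registered here change NA semantics and chained-assignment handling globally.
import numpy as np
import pandas as pd

pd.set_option("mode.use_inf_as_na", True)       # inf/-inf now count as missing values
print(pd.Series([1.0, np.inf]).isna())          # -> [False, True]
pd.reset_option("mode.use_inf_as_na")

pd.set_option("mode.chained_assignment", "raise")  # escalate chained-assignment warnings to errors
pd.reset_option("mode.chained_assignment")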
+""" + +_xls_options = ["xlrd"] +_xlsm_options = ["xlrd", "openpyxl"] +_xlsx_options = ["xlrd", "openpyxl"] +_ods_options = ["odf"] +_xlsb_options = ["pyxlsb"] + + +with cf.config_prefix("io.excel.xls"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) + +with cf.config_prefix("io.excel.xlsm"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) + + +with cf.config_prefix("io.excel.xlsx"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) + + +with cf.config_prefix("io.excel.ods"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) + +with cf.config_prefix("io.excel.xlsb"): + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)), + validator=str, + ) + +# Set up the io.excel specific writer configuration. +writer_engine_doc = """ +: string + The default Excel writer engine for '{ext}' files. Available options: + auto, {others}. +""" + +_xls_options = ["xlwt"] +_xlsm_options = ["openpyxl"] +_xlsx_options = ["openpyxl", "xlsxwriter"] + + +with cf.config_prefix("io.excel.xls"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) + +with cf.config_prefix("io.excel.xlsm"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) + + +with cf.config_prefix("io.excel.xlsx"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) + + +# Set up the io.parquet specific configuration. +parquet_engine_doc = """ +: string + The default parquet reader/writer engine. Available options: + 'auto', 'pyarrow', 'fastparquet', the default is 'auto' +""" + +with cf.config_prefix("io.parquet"): + cf.register_option( + "engine", + "auto", + parquet_engine_doc, + validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), + ) + +# -------- +# Plotting +# --------- + +plotting_backend_doc = """ +: str + The plotting backend to use. The default value is "matplotlib", the + backend provided with pandas. Other backends can be specified by + prodiving the name of the module that implements the backend. +""" + + +def register_plotting_backend_cb(key): + if key == "matplotlib": + # We defer matplotlib validation, since it's the default + return + from pandas.plotting._core import _get_plot_backend + + _get_plot_backend(key) + + +with cf.config_prefix("plotting"): + cf.register_option( + "backend", + defval="matplotlib", + doc=plotting_backend_doc, + validator=register_plotting_backend_cb, + ) + + +register_converter_doc = """ +: bool or 'auto'. + Whether to register converters with matplotlib's units registry for + dates, times, datetimes, and Periods. Toggling to False will remove + the converters, restoring any converters that pandas overwrote. 
+""" + + +def register_converter_cb(key): + from pandas.plotting import register_matplotlib_converters + from pandas.plotting import deregister_matplotlib_converters + + if cf.get_option(key): + register_matplotlib_converters() + else: + deregister_matplotlib_converters() + + +with cf.config_prefix("plotting.matplotlib"): + cf.register_option( + "register_converters", + "auto", + register_converter_doc, + validator=is_one_of_factory(["auto", True, False]), + cb=register_converter_cb, + ) diff --git a/venv/Lib/site-packages/pandas/core/construction.py b/venv/Lib/site-packages/pandas/core/construction.py new file mode 100644 index 0000000..203ef3e --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/construction.py @@ -0,0 +1,626 @@ +""" +Constructor functions intended to be shared by pd.array, Series.__init__, +and Index.__new__. + +These should not depend on core.internals. +""" +from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast + +import numpy as np +import numpy.ma as ma + +from pandas._libs import lib +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime +from pandas._typing import ArrayLike, Dtype + +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, + infer_dtype_from_scalar, + maybe_cast_to_datetime, + maybe_cast_to_integer_array, + maybe_castable, + maybe_convert_platform, + maybe_upcast, +) +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64_ns_dtype, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype, registry +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndexClass, + ABCPandasArray, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +import pandas.core.common as com + +if TYPE_CHECKING: + from pandas.core.series import Series # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + + +def array( + data: Sequence[object], + dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, + copy: bool = True, +) -> ABCExtensionArray: + """ + Create an array. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + data : Sequence of objects + The scalars inside `data` should be instances of the + scalar type for `dtype`. It's expected that `data` + represents a 1-dimensional array of data. + + When `data` is an Index or Series, the underlying array + will be extracted from `data`. + + dtype : str, np.dtype, or ExtensionDtype, optional + The dtype to use for the array. This may be a NumPy + dtype or an extension type registered with pandas using + :meth:`pandas.api.extensions.register_extension_dtype`. + + If not specified, there are two possibilities: + + 1. When `data` is a :class:`Series`, :class:`Index`, or + :class:`ExtensionArray`, the `dtype` will be taken + from the data. + 2. Otherwise, pandas will attempt to infer the `dtype` + from the data. + + Note that when `data` is a NumPy array, ``data.dtype`` is + *not* used for inferring the array type. This is because + NumPy cannot represent all the types of data that can be + held in extension arrays. 
+ + Currently, pandas will infer an extension dtype for sequences of + + ============================== ===================================== + Scalar Type Array Type + ============================== ===================================== + :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` + :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` + :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` + :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`str` :class:`pandas.arrays.StringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` + ============================== ===================================== + + For all other cases, NumPy's usual inference rules will be used. + + .. versionchanged:: 1.0.0 + + Pandas infers nullable-integer dtype for integer data, + string dtype for string data, and nullable-boolean dtype + for boolean data. + + copy : bool, default True + Whether to copy the data, even if not necessary. Depending + on the type of `data`, creating the new array may require + copying data, even if ``copy=False``. + + Returns + ------- + ExtensionArray + The newly created array. + + Raises + ------ + ValueError + When `data` is not 1-dimensional. + + See Also + -------- + numpy.array : Construct a NumPy array. + Series : Construct a pandas Series. + Index : Construct a pandas Index. + arrays.PandasArray : ExtensionArray wrapping a NumPy array. + Series.array : Extract the array stored within a Series. + + Notes + ----- + Omitting the `dtype` argument means pandas will attempt to infer the + best array type from the values in the data. As new array types are + added by pandas and 3rd party libraries, the "best" array type may + change. We recommend specifying `dtype` to ensure that + + 1. the correct array type for the data is returned + 2. the returned array type doesn't change as new extension types + are added by pandas and third-party libraries + + Additionally, if the underlying memory representation of the returned + array matters, we recommend specifying the `dtype` as a concrete object + rather than a string alias or allowing it to be inferred. For example, + a future version of pandas or a 3rd-party library may include a + dedicated ExtensionArray for string data. In this event, the following + would no longer return a :class:`arrays.PandasArray` backed by a NumPy + array. + + >>> pd.array(['a', 'b'], dtype=str) + + ['a', 'b'] + Length: 2, dtype: str32 + + This would instead return the new ExtensionArray dedicated for string + data. If you really need the new array to be backed by a NumPy array, + specify that in the dtype. + + >>> pd.array(['a', 'b'], dtype=np.dtype(" + ['a', 'b'] + Length: 2, dtype: str32 + + Finally, Pandas has arrays that mostly overlap with NumPy + + * :class:`arrays.DatetimeArray` + * :class:`arrays.TimedeltaArray` + + When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is + passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` + rather than a ``PandasArray``. This is for symmetry with the case of + timezone-aware data, which NumPy does not natively support. 
+ + >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') + + ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] + Length: 2, dtype: datetime64[ns] + + >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') + + ['01:00:00', '02:00:00'] + Length: 2, dtype: timedelta64[ns] + + Examples + -------- + If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers for. + + >>> pd.array([1, 2]) + + [1, 2] + Length: 2, dtype: Int64 + + >>> pd.array([1, 2, np.nan]) + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> pd.array(["a", None, "c"]) + + ['a', nan, 'c'] + Length: 3, dtype: string + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] + + You can use the string alias for `dtype` + + >>> pd.array(['a', 'b', 'a'], dtype='category') + [a, b, a] + Categories (2, object): [a, b] + + Or specify the actual dtype + + >>> pd.array(['a', 'b', 'a'], + ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) + [a, b, a] + Categories (3, object): [a < b < c] + + If pandas does not infer a dedicated extension type a + :class:`arrays.PandasArray` is returned. + + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: float64 + + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` + as a NumPy dtype if you need to ensure there's no future change in + behavior. + + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 + + `data` must be 1-dimensional. A ValueError is raised when the input + has the wrong dimensionality. + + >>> pd.array(1) + Traceback (most recent call last): + ... + ValueError: Cannot pass scalar '1' to 'pandas.array'. + """ + from pandas.core.arrays import ( + period_array, + BooleanArray, + IntegerArray, + IntervalArray, + PandasArray, + DatetimeArray, + TimedeltaArray, + StringArray, + ) + + if lib.is_scalar(data): + msg = f"Cannot pass scalar '{data}' to 'pandas.array'." + raise ValueError(msg) + + if dtype is None and isinstance( + data, (ABCSeries, ABCIndexClass, ABCExtensionArray) + ): + dtype = data.dtype + + data = extract_array(data, extract_numpy=True) + + # this returns None for not-found dtypes. + if isinstance(dtype, str): + dtype = registry.find(dtype) or dtype + + if is_extension_array_dtype(dtype): + cls = cast(ExtensionDtype, dtype).construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) + + if dtype is None: + inferred_dtype = lib.infer_dtype(data, skipna=True) + if inferred_dtype == "period": + try: + return period_array(data, copy=copy) + except IncompatibleFrequency: + # We may have a mixture of frequencies. + # We choose to return an ndarray, rather than raising. + pass + elif inferred_dtype == "interval": + try: + return IntervalArray(data, copy=copy) + except ValueError: + # We may have a mixture of `closed` here. + # We choose to return an ndarray, rather than raising. 
+ pass + + elif inferred_dtype.startswith("datetime"): + # datetime, datetime64 + try: + return DatetimeArray._from_sequence(data, copy=copy) + except ValueError: + # Mixture of timezones, fall back to PandasArray + pass + + elif inferred_dtype.startswith("timedelta"): + # timedelta, timedelta64 + return TimedeltaArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "string": + return StringArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "integer": + return IntegerArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, copy=copy) + + # Pandas overrides NumPy for + # 1. datetime64[ns] + # 2. timedelta64[ns] + # so that a DatetimeArray is returned. + if is_datetime64_ns_dtype(dtype): + return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) + elif is_timedelta64_ns_dtype(dtype): + return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) + + result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) + return result + + +def extract_array(obj, extract_numpy=False): + """ + Extract the ndarray or ExtensionArray from a Series or Index. + + For all other types, `obj` is just returned as is. + + Parameters + ---------- + obj : object + For Series / Index, the underlying ExtensionArray is unboxed. + For Numpy-backed ExtensionArrays, the ndarray is extracted. + + extract_numpy : bool, default False + Whether to extract the ndarray from a PandasArray + + Returns + ------- + arr : object + + Examples + -------- + >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + [a, b, c] + Categories (3, object): [a, b, c] + + Other objects like lists, arrays, and DataFrames are just passed through. + + >>> extract_array([1, 2, 3]) + [1, 2, 3] + + For an ndarray-backed Series / Index a PandasArray is returned. + + >>> extract_array(pd.Series([1, 2, 3])) + + [1, 2, 3] + Length: 3, dtype: int64 + + To extract all the way down to the ndarray, pass ``extract_numpy=True``. + + >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) + array([1, 2, 3]) + """ + if isinstance(obj, (ABCIndexClass, ABCSeries)): + obj = obj.array + + if extract_numpy and isinstance(obj, ABCPandasArray): + obj = obj.to_numpy() + + return obj + + +def sanitize_array( + data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False +): + """ + Sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified. 
+ """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + + # extract ndarray or ExtensionArray, ensure we have no PandasArray + data = extract_array(data, extract_numpy=True) + + # GH#846 + if isinstance(data, np.ndarray): + + if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): + # possibility of nan -> garbage + try: + subarr = _try_cast(data, dtype, copy, True) + except ValueError: + if copy: + subarr = data.copy() + else: + subarr = np.array(data, copy=False) + else: + # we will try to copy be-definition here + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + + elif isinstance(data, ABCExtensionArray): + # it is already ensured above this is not a PandasArray + subarr = data + + if dtype is not None: + subarr = subarr.astype(dtype, copy=copy) + elif copy: + subarr = subarr.copy() + return subarr + + elif isinstance(data, (list, tuple)) and len(data) > 0: + if dtype is not None: + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + else: + subarr = maybe_convert_platform(data) + + subarr = maybe_cast_to_datetime(subarr, dtype) + + elif isinstance(data, range): + # GH#16804 + arr = np.arange(data.start, data.stop, data.step, dtype="int64") + subarr = _try_cast(arr, dtype, copy, raise_cast_failure) + else: + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + + # scalar like, GH + if getattr(subarr, "ndim", 0) == 0: + if isinstance(data, list): # pragma: no cover + subarr = np.array(data, dtype=object) + elif index is not None: + value = data + + # figure out the dtype from the value (upcast if necessary) + if dtype is None: + dtype, value = infer_dtype_from_scalar(value) + else: + # need to possibly convert the value here + value = maybe_cast_to_datetime(value, dtype) + + subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) + + else: + return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype + ) + + elif subarr.ndim > 1: + if isinstance(data, np.ndarray): + raise Exception("Data must be 1-dimensional") + else: + subarr = com.asarray_tuplesafe(data, dtype=dtype) + + if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. + if issubclass(subarr.dtype.type, str): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, subarr has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.array(data, dtype=dtype, copy=False) + subarr = np.array(data, dtype=object, copy=copy) + + if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): + inferred = lib.infer_dtype(subarr, skipna=False) + if inferred in {"interval", "period"}: + subarr = array(subarr) + + return subarr + + +def _try_cast( + arr, + dtype: Optional[Union[np.dtype, "ExtensionDtype"]], + copy: bool, + raise_cast_failure: bool, +): + """ + Convert input to numpy ndarray and optionally cast to a given dtype. 
+ + Parameters + ---------- + arr : ndarray, list, tuple, iterator (catchall) + Excludes: ExtensionArray, Series, Index. + dtype : np.dtype, ExtensionDtype or None + copy : bool + If False, don't copy the data if not needed. + raise_cast_failure : bool + If True, and if a dtype is specified, raise errors during casting. + Otherwise an object array is returned. + """ + # perf shortcut as this is the most common case + if isinstance(arr, np.ndarray): + if maybe_castable(arr) and not copy and dtype is None: + return arr + + try: + # GH#15832: Check if we are requesting a numeric dype and + # that we can convert the data to the requested dtype. + if is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + + subarr = maybe_cast_to_datetime(arr, dtype) + # Take care in creating object arrays (but iterators are not + # supported): + if is_object_dtype(dtype) and ( + is_list_like(subarr) + and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) + ): + subarr = construct_1d_object_array_from_listlike(subarr) + elif not is_extension_array_dtype(subarr): + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) + except OutOfBoundsDatetime: + # in case of out of bound datetime64 -> always raise + raise + except (ValueError, TypeError): + if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. + dtype = cast(CategoricalDtype, dtype) + subarr = dtype.construct_array_type()( + arr, dtype.categories, ordered=dtype.ordered + ) + elif is_extension_array_dtype(dtype): + # create an extension array from its dtype + dtype = cast(ExtensionDtype, dtype) + array_type = dtype.construct_array_type()._from_sequence + subarr = array_type(arr, dtype=dtype, copy=copy) + elif dtype is not None and raise_cast_failure: + raise + else: + subarr = np.array(arr, dtype=object, copy=copy) + return subarr + + +def is_empty_data(data: Any) -> bool: + """ + Utility to check if a Series is instantiated with empty data, + which does not contain dtype information. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. + + Returns + ------- + bool + """ + is_none = data is None + is_list_like_without_dtype = is_list_like(data) and not hasattr(data, "dtype") + is_simple_empty = is_list_like_without_dtype and not data + return is_none or is_simple_empty + + +def create_series_with_explicit_dtype( + data: Any = None, + index: Optional[Union[ArrayLike, "Index"]] = None, + dtype: Optional[Dtype] = None, + name: Optional[str] = None, + copy: bool = False, + fastpath: bool = False, + dtype_if_empty: Dtype = object, +) -> "Series": + """ + Helper to pass an explicit dtype when instantiating an empty Series. + + This silences a DeprecationWarning described in GitHub-17261. + + Parameters + ---------- + data : Mirrored from Series.__init__ + index : Mirrored from Series.__init__ + dtype : Mirrored from Series.__init__ + name : Mirrored from Series.__init__ + copy : Mirrored from Series.__init__ + fastpath : Mirrored from Series.__init__ + dtype_if_empty : str, numpy.dtype, or ExtensionDtype + This dtype will be passed explicitly if an empty Series will + be instantiated. 
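# Illustrative usage sketch (not part of the vendored pandas sources above): the
# DeprecationWarning that create_series_with_explicit_dtype silences internally is the one
# users see when constructing an empty Series without a dtype in pandas 1.0.
import pandas as pd

s = pd.Series()                  # DeprecationWarning: the default dtype of an empty Series will change
s = pd.Series(dtype="float64")   # passing an explicit dtype silences the warning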
+ + Returns + ------- + Series + """ + from pandas.core.series import Series + + if is_empty_data(data) and dtype is None: + dtype = dtype_if_empty + return Series( + data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath + ) diff --git a/venv/Lib/site-packages/pandas/core/dtypes/__init__.py b/venv/Lib/site-packages/pandas/core/dtypes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/dtypes/api.py b/venv/Lib/site-packages/pandas/core/dtypes/api.py new file mode 100644 index 0000000..051affd --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/api.py @@ -0,0 +1,45 @@ +# flake8: noqa + +from pandas.core.dtypes.common import ( + is_array_like, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_file_like, + is_float, + is_float_dtype, + is_hashable, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_iterator, + is_list_like, + is_named_tuple, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_re, + is_re_compilable, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) diff --git a/venv/Lib/site-packages/pandas/core/dtypes/base.py b/venv/Lib/site-packages/pandas/core/dtypes/base.py new file mode 100644 index 0000000..1b4e706 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/base.py @@ -0,0 +1,317 @@ +"""Extend pandas with custom array types""" +from typing import Any, List, Optional, Tuple, Type + +import numpy as np + +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +class ExtensionDtype: + """ + A custom data type, to be paired with an ExtensionArray. + + .. versionadded:: 0.23.0 + + See Also + -------- + extensions.register_extension_dtype + extensions.ExtensionArray + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + The following attributes influence the behavior of the dtype in + pandas operations + + * _is_numeric + * _is_boolean + + Optionally one can override construct_array_type for construction + with the name of this dtype via the Registry. See + :meth:`extensions.register_extension_dtype`. + + * construct_array_type + + The `na_value` class attribute can be used to set the default NA value + for this type. :attr:`numpy.nan` is used by default. + + ExtensionDtypes are required to be hashable. The base class provides + a default implementation, which relies on the ``_metadata`` class + attribute. ``_metadata`` should be a tuple containing the strings + that define your data type. For example, with ``PeriodDtype`` that's + the ``freq`` attribute. + + **If you have a parametrized dtype you should set the ``_metadata`` + class property**. + + Ideally, the attributes in ``_metadata`` will match the + parameters to your ``ExtensionDtype.__init__`` (if any). If any of + the attributes in ``_metadata`` don't implement the standard + ``__eq__`` or ``__hash__``, the default implementations here will not + work. + + .. 
versionchanged:: 0.24.0 + + Added ``_metadata``, ``__hash__``, and changed the default definition + of ``__eq__``. + + For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method + can be implemented: this method receives a pyarrow Array or ChunkedArray + as only argument and is expected to return the appropriate pandas + ExtensionArray for this dtype and the passed values:: + + class ExtensionDtype: + + def __from_arrow__( + self, array: pyarrow.Array/ChunkedArray + ) -> ExtensionArray: + ... + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + """ + + _metadata: Tuple[str, ...] = () + + def __str__(self) -> str: + return self.name + + def __eq__(self, other: Any) -> bool: + """ + Check whether 'other' is equal to self. + + By default, 'other' is considered equal if either + + * it's a string matching 'self.name'. + * it's an instance of this type and all of the + the attributes in ``self._metadata`` are equal between + `self` and `other`. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return all( + getattr(self, attr) == getattr(other, attr) for attr in self._metadata + ) + return False + + def __hash__(self) -> int: + return hash(tuple(getattr(self, attr) for attr in self._metadata)) + + def __ne__(self, other) -> bool: + return not self.__eq__(other) + + @property + def na_value(self): + """ + Default NA value to use for this type. + + This is used in e.g. ExtensionArray.take. This should be the + user-facing "boxed" version of the NA value, not the physical NA value + for storage. e.g. for JSONArray, this is an empty dictionary. + """ + return np.nan + + @property + def type(self) -> Type: + """ + The scalar type for the array, e.g. ``int`` + + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``, assuming + that value is valid (not NA). NA values do not need to be + instances of `type`. + """ + raise AbstractMethodError(self) + + @property + def kind(self) -> str: + """ + A character code (one of 'biufcmMOSUV'), default 'O' + + This should match the NumPy dtype used when the array is + converted to an ndarray, which is probably 'O' for object if + the extension type cannot be represented as a built-in NumPy + type. + + See Also + -------- + numpy.dtype.kind + """ + return "O" + + @property + def name(self) -> str: + """ + A string identifying the data type. + + Will be used for display in, e.g. ``Series.dtype`` + """ + raise AbstractMethodError(self) + + @property + def names(self) -> Optional[List[str]]: + """ + Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise NotImplementedError + + @classmethod + def construct_from_string(cls, string: str): + r""" + Construct this type from a string. + + This is useful mainly for data types that accept parameters. + For example, a period dtype accepts a frequency parameter that + can be set as ``period[H]`` (where H means hourly frequency). 
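# Illustrative sketch (not part of the vendored pandas source): a minimal parametrized
# ExtensionDtype relying on the default ``_metadata``-based ``__eq__``/``__hash__``
# described above. The class name and its "unit" parameter are invented for the example.
from pandas.api.extensions import ExtensionDtype

class UnitDtype(ExtensionDtype):
    _metadata = ("unit",)              # attributes that define this dtype's identity

    def __init__(self, unit: str = "m"):
        self.unit = unit

    @property
    def type(self):                    # scalar type held by the paired array
        return float

    @property
    def name(self) -> str:
        return f"unit[{self.unit}]"

assert UnitDtype("m") == UnitDtype("m")              # equality driven by _metadata
assert UnitDtype("m") != UnitDtype("s")
assert hash(UnitDtype("m")) == hash(UnitDtype("m"))  # hash uses the same attributes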
+ + By default, in the abstract class, just the name of the type is + expected. But subclasses can overwrite this method to accept + parameters. + + Parameters + ---------- + string : str + The name of the type, for example ``category``. + + Returns + ------- + ExtensionDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a class cannot be constructed from this 'string'. + + Examples + -------- + For extension dtypes with arguments the following may be an + adequate implementation. + + >>> @classmethod + ... def construct_from_string(cls, string): + ... pattern = re.compile(r"^my_type\[(?P.+)\]$") + ... match = pattern.match(string) + ... if match: + ... return cls(**match.groupdict()) + ... else: + ... raise TypeError(f"Cannot construct a '{cls.__name__}' from + ... " "'{string}'") + """ + if not isinstance(string, str): + raise TypeError(f"Expects a string, got {type(string).__name__}") + + # error: Non-overlapping equality check (left operand type: "str", right + # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap] + assert isinstance(cls.name, str), (cls, type(cls.name)) + if string != cls.name: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + return cls() + + @classmethod + def is_dtype(cls, dtype) -> bool: + """ + Check if we match 'dtype'. + + Parameters + ---------- + dtype : object + The object to check. + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. + """ + dtype = getattr(dtype, "dtype", dtype) + + if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)): + # https://github.com/pandas-dev/pandas/issues/22960 + # avoid passing data to `construct_from_string`. This could + # cause a FutureWarning from numpy about failing elementwise + # comparison from, e.g., comparing DataFrame == 'category'. + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False + + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + + By default ExtensionDtypes are assumed to be non-numeric. + They'll be excluded from operations that exclude non-numeric + columns, like (groupby) reductions, plotting, etc. + """ + return False + + @property + def _is_boolean(self) -> bool: + """ + Whether this dtype should be considered boolean. + + By default, ExtensionDtypes are assumed to be non-numeric. + Setting this to True will affect the behavior of several places, + e.g. 
+ + * is_bool + * boolean indexing + + Returns + ------- + bool + """ + return False diff --git a/venv/Lib/site-packages/pandas/core/dtypes/cast.py b/venv/Lib/site-packages/pandas/core/dtypes/cast.py new file mode 100644 index 0000000..fa80e5c --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/cast.py @@ -0,0 +1,1601 @@ +""" routings for casting """ + +from datetime import datetime, timedelta + +import numpy as np + +from pandas._libs import lib, tslib, tslibs +from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs.timezones import tz_compare +from pandas._typing import Dtype +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.common import ( + _INT64_DTYPE, + _NS_DTYPE, + _POSSIBLY_CAST_DTYPES, + _TD_DTYPE, + ensure_int8, + ensure_int16, + ensure_int32, + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_complex, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeArray, + ABCDatetimeIndex, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.missing import isna, notna + +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max +_int64_max = np.iinfo(np.int64).max + + +def maybe_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list, tuple, range)): + values = construct_1d_object_array_from_listlike(values) + if getattr(values, "dtype", None) == np.object_: + if hasattr(values, "_values"): + values = values._values + values = lib.maybe_convert_objects(values) + + return values + + +def is_nested_object(obj) -> bool: + """ + return a boolean if we have a nested object, e.g. a Series with 1 or + more Series elements + + This may not be necessarily be performant. + + """ + + if isinstance(obj, ABCSeries) and is_object_dtype(obj): + + if any(isinstance(v, ABCSeries) for v in obj.values): + return True + + return False + + +def maybe_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. 
convert back to bool/int + or could be an astype of float64->float32 + """ + do_round = False + + if is_scalar(result): + return result + elif isinstance(result, ABCDataFrame): + # occurs in pivot_table doctest + return result + + if isinstance(dtype, str): + if dtype == "infer": + inferred_type = lib.infer_dtype(ensure_object(result.ravel()), skipna=False) + if inferred_type == "boolean": + dtype = "bool" + elif inferred_type == "integer": + dtype = "int64" + elif inferred_type == "datetime64": + dtype = "datetime64[ns]" + elif inferred_type == "timedelta64": + dtype = "timedelta64[ns]" + + # try to upcast here + elif inferred_type == "floating": + dtype = "int64" + if issubclass(result.dtype.type, np.number): + do_round = True + + else: + dtype = "object" + + dtype = np.dtype(dtype) + + converted = maybe_downcast_numeric(result, dtype, do_round) + if converted is not result: + return converted + + # a datetimelike + # GH12821, iNaT is casted to float + if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: + if hasattr(dtype, "tz"): + # not a numpy dtype + if dtype.tz: + # convert to datetime and change timezone + from pandas import to_datetime + + result = to_datetime(result).tz_localize("utc") + result = result.tz_convert(dtype.tz) + else: + result = result.astype(dtype) + + elif dtype.type is Period: + # TODO(DatetimeArray): merge with previous elif + from pandas.core.arrays import PeriodArray + + try: + return PeriodArray(result, freq=dtype.freq) + except TypeError: + # e.g. TypeError: int() argument must be a string, a + # bytes-like object or a number, not 'Period + pass + + return result + + +def maybe_downcast_numeric(result, dtype, do_round: bool = False): + """ + Subset of maybe_downcast_to_dtype restricted to numeric dtypes. + + Parameters + ---------- + result : ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + do_round : bool + + Returns + ------- + ndarray or ExtensionArray + """ + if not isinstance(dtype, np.dtype): + # e.g. SparseDtype has no itemsize attr + return result + + if isinstance(result, list): + # reached via groupoby.agg _ohlc; really this should be handled + # earlier + result = np.array(result) + + def trans(x): + if do_round: + return x.round() + return x + + if dtype.kind == result.dtype.kind: + # don't allow upcasts here (except if empty) + if result.dtype.itemsize <= dtype.itemsize and result.size: + return result + + if is_bool_dtype(dtype) or is_integer_dtype(dtype): + + if not result.size: + # if we don't have any elements, just astype it + return trans(result).astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + + if isna(arr).any(): + # if we have any nulls, then we are done + return result + + elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, float, bool)): + # a comparable, e.g. 
a Decimal may slip in here + return result + + if ( + issubclass(result.dtype.type, (np.object_, np.number)) + and notna(result).all() + ): + new_result = trans(result).astype(dtype) + if new_result.dtype.kind == "O" or result.dtype.kind == "O": + # np.allclose may raise TypeError on object-dtype + if (new_result == result).all(): + return new_result + else: + if np.allclose(new_result, result, rtol=0): + return new_result + + elif ( + issubclass(dtype.type, np.floating) + and not is_bool_dtype(result.dtype) + and not is_string_dtype(result.dtype) + ): + return result.astype(dtype) + + return result + + +def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): + """ + A safe version of putmask that potentially upcasts the result. + The result is replaced with the first N elements of other, + where N is the number of True values in mask. + If the length of other is shorter than N, other will be repeated. + + Parameters + ---------- + result : ndarray + The destination array. This will be mutated in-place if no upcasting is + necessary. + mask : boolean ndarray + other : scalar + The source value. + + Returns + ------- + result : ndarray + changed : bool + Set to true if the result array was upcasted. + + Examples + -------- + >>> result, _ = maybe_upcast_putmask(np.arange(1,6), + np.array([False, True, False, True, True]), np.arange(21,23)) + >>> result + array([1, 21, 3, 22, 21]) + """ + + if not isinstance(result, np.ndarray): + raise ValueError("The result input must be a ndarray.") + if not is_scalar(other): + # We _could_ support non-scalar other, but until we have a compelling + # use case, we assume away the possibility. + raise ValueError("other must be a scalar") + + if mask.any(): + # Two conversions for date-like dtypes that can't be done automatically + # in np.place: + # NaN -> NaT + # integer or integer array -> date-like array + if result.dtype.kind in ["m", "M"]: + if is_scalar(other): + if isna(other): + other = result.dtype.type("nat") + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): + other = np.array(other, dtype=result.dtype) + + def changeit(): + + # try to directly set by expanding our array to full + # length of the boolean + try: + om = other[mask] + except (IndexError, TypeError): + # IndexError occurs in test_upcast when we have a boolean + # mask of the wrong shape + # TypeError occurs in test_upcast when `other` is a bool + pass + else: + om_at = om.astype(result.dtype) + if (om == om_at).all(): + new_result = result.values.copy() + new_result[mask] = om_at + result[:] = new_result + return result, False + + # we are forced to change the dtype of the result as the input + # isn't compatible + r, _ = maybe_upcast(result, fill_value=other, copy=True) + np.place(r, mask, other) + + return r, True + + # we want to decide whether place will work + # if we have nans in the False portion of our mask then we need to + # upcast (possibly), otherwise we DON't want to upcast (e.g. 
if we + # have values, say integers, in the success portion then it's ok to not + # upcast) + new_dtype, _ = maybe_promote(result.dtype, other) + if new_dtype != result.dtype: + + # we have a scalar or len 0 ndarray + # and its nan and we are changing some values + if is_scalar(other) or (isinstance(other, np.ndarray) and other.ndim < 1): + if isna(other): + return changeit() + + # we have an ndarray and the masking has nans in it + else: + + if isna(other).any(): + return changeit() + + try: + np.place(result, mask, other) + except TypeError: + # e.g. int-dtype result and float-dtype other + return changeit() + + return result, False + + +def maybe_promote(dtype, fill_value=np.nan): + """ + Find the minimal dtype that can hold both the given dtype and fill_value. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + fill_value : scalar, default np.nan + + Returns + ------- + dtype + Upcasted from dtype argument if necessary. + fill_value + Upcasted from fill_value argument if necessary. + """ + if not is_scalar(fill_value) and not is_object_dtype(dtype): + # with object dtype there is nothing to promote, and the user can + # pass pretty much any weird fill_value they like + raise ValueError("fill_value must be a scalar") + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value, np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): + fill_value = fill_value.dtype.type("NaT", "ns") + else: + + # we need to change to object type as our + # fill_value is of object type + if fill_value.dtype == np.object_: + dtype = np.dtype(np.object_) + fill_value = np.nan + + if dtype == np.object_ or dtype.kind in ["U", "S"]: + # We treat string-like dtypes as object, and _always_ fill + # with np.nan + fill_value = np.nan + dtype = np.dtype(np.object_) + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, np.datetime64): + if isinstance(fill_value, datetime) and fill_value.tzinfo is not None: + # Trying to insert tzaware into tznaive, have to cast to object + dtype = np.dtype(np.object_) + elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)): + dtype = np.dtype(np.object_) + else: + try: + fill_value = tslibs.Timestamp(fill_value).to_datetime64() + except (TypeError, ValueError): + dtype = np.dtype(np.object_) + elif issubclass(dtype.type, np.timedelta64): + if ( + is_integer(fill_value) + or (is_float(fill_value) and not np.isnan(fill_value)) + or isinstance(fill_value, str) + ): + # TODO: What about str that can be a timedelta? + dtype = np.dtype(np.object_) + else: + try: + fv = tslibs.Timedelta(fill_value) + except ValueError: + dtype = np.dtype(np.object_) + else: + if fv is NaT: + # NaT has no `to_timedelta64` method + fill_value = np.timedelta64("NaT", "ns") + else: + fill_value = fv.to_timedelta64() + elif is_datetime64tz_dtype(dtype): + if isna(fill_value): + fill_value = NaT + elif not isinstance(fill_value, datetime): + dtype = np.dtype(np.object_) + elif fill_value.tzinfo is None: + dtype = np.dtype(np.object_) + elif not tz_compare(fill_value.tzinfo, dtype.tz): + # TODO: sure we want to cast here? 
+ dtype = np.dtype(np.object_) + + elif is_extension_array_dtype(dtype) and isna(fill_value): + fill_value = dtype.na_value + + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, np.integer): + dtype = np.dtype(np.float64) + + elif dtype.kind == "f": + mst = np.min_scalar_type(fill_value) + if mst > dtype: + # e.g. mst is np.float64 and dtype is np.float32 + dtype = mst + + elif dtype.kind == "c": + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, np.integer): + if not np.can_cast(fill_value, dtype): + # upcast to prevent overflow + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + if dtype.kind == "f": + # Case where we disagree with numpy + dtype = np.dtype(np.object_) + + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, (np.integer, np.floating)): + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + + elif dtype.kind == "c": + mst = np.min_scalar_type(fill_value) + if mst > dtype: + # e.g. mst is np.complex128 and dtype is np.complex64 + dtype = mst + + elif fill_value is None: + if is_float_dtype(dtype) or is_complex_dtype(dtype): + fill_value = np.nan + elif is_integer_dtype(dtype): + dtype = np.float64 + fill_value = np.nan + elif is_datetime_or_timedelta_dtype(dtype): + fill_value = dtype.type("NaT", "ns") + else: + dtype = np.dtype(np.object_) + fill_value = np.nan + else: + dtype = np.dtype(np.object_) + + # in case we have a string that looked like a number + if is_extension_array_dtype(dtype): + pass + elif issubclass(np.dtype(dtype).type, (bytes, str)): + dtype = np.dtype(np.object_) + + fill_value = _ensure_dtype_type(fill_value, dtype) + return dtype, fill_value + + +def _ensure_dtype_type(value, dtype): + """ + Ensure that the given value is an instance of the given dtype. + + e.g. if out dtype is np.complex64, we should have an instance of that + as opposed to a python complex object. + + Parameters + ---------- + value : object + dtype : np.dtype or ExtensionDtype + + Returns + ------- + object + """ + + # Start with exceptions in which we do _not_ cast to numpy types + if is_extension_array_dtype(dtype): + return value + elif dtype == np.object_: + return value + elif isna(value): + # e.g. keep np.nan rather than try to cast to np.float32(np.nan) + return value + + return dtype.type(value) + + +def infer_dtype_from(val, pandas_dtype: bool = False): + """ + Interpret the dtype from a scalar or array. + + Parameters + ---------- + val : object + pandas_dtype : bool, default False + whether to infer dtype including pandas extension types. + If False, scalar/array belongs to pandas extension types is inferred as + object + """ + if is_scalar(val): + return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype) + return infer_dtype_from_array(val, pandas_dtype=pandas_dtype) + + +def infer_dtype_from_scalar(val, pandas_dtype: bool = False): + """ + Interpret the dtype from a scalar. + + Parameters + ---------- + pandas_dtype : bool, default False + whether to infer dtype including pandas extension types. 
+ If False, scalar belongs to pandas extension types is inferred as + object + """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, np.ndarray): + msg = "invalid ndarray passed to infer_dtype_from_scalar" + if val.ndim != 0: + raise ValueError(msg) + + dtype = val.dtype + val = val.item() + + elif isinstance(val, str): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = np.object_ + + elif isinstance(val, (np.datetime64, datetime)): + val = tslibs.Timestamp(val) + if val is tslibs.NaT or val.tz is None: + dtype = np.dtype("M8[ns]") + else: + if pandas_dtype: + dtype = DatetimeTZDtype(unit="ns", tz=val.tz) + else: + # return datetimetz as object + return np.object_, val + val = val.value + + elif isinstance(val, (np.timedelta64, timedelta)): + val = tslibs.Timedelta(val).value + dtype = np.dtype("m8[ns]") + + elif is_bool(val): + dtype = np.bool_ + + elif is_integer(val): + if isinstance(val, np.integer): + dtype = type(val) + else: + dtype = np.int64 + + elif is_float(val): + if isinstance(val, np.floating): + dtype = type(val) + else: + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + elif pandas_dtype: + if lib.is_period(val): + dtype = PeriodDtype(freq=val.freq) + val = val.ordinal + elif lib.is_interval(val): + subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] + dtype = IntervalDtype(subtype=subtype) + + return dtype, val + + +def infer_dtype_from_array(arr, pandas_dtype: bool = False): + """ + Infer the dtype from an array. + + Parameters + ---------- + arr : array + pandas_dtype : bool, default False + whether to infer dtype including pandas extension types. + If False, array belongs to pandas extension types + is inferred as object + + Returns + ------- + tuple (numpy-compat/pandas-compat dtype, array) + + Notes + ----- + if pandas_dtype=False. these infer to numpy dtypes + exactly with the exception that mixed / object dtypes + are not coerced by stringifying or conversion + + if pandas_dtype=True. datetime64tz-aware/categorical + types will retain there character. + + Examples + -------- + >>> np.asarray([1, '1']) + array(['1', '1'], dtype='>> infer_dtype_from_array([1, '1']) + (numpy.object_, [1, '1']) + """ + + if isinstance(arr, np.ndarray): + return arr.dtype, arr + + if not is_list_like(arr): + arr = [arr] + + if pandas_dtype and is_extension_array_dtype(arr): + return arr.dtype, arr + + elif isinstance(arr, ABCSeries): + return arr.dtype, np.asarray(arr) + + # don't force numpy coerce with nan's + inferred = lib.infer_dtype(arr, skipna=False) + if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]: + return (np.object_, arr) + + arr = np.asarray(arr) + return arr.dtype, arr + + +def maybe_infer_dtype_type(element): + """ + Try to infer an object's dtype, for use in arithmetic ops. + + Uses `element.dtype` if that's available. + Objects implementing the iterator protocol are cast to a NumPy array, + and from there the array's type is used. + + Parameters + ---------- + element : object + Possibly has a `.dtype` attribute, and possibly the iterator + protocol. 
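# Illustrative sketch (not part of the vendored pandas source; import path as shown in
# this diff, pandas/core/dtypes/cast.py): what the scalar-inference and promotion
# helpers above return for a few representative inputs.
import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import infer_dtype_from_scalar, maybe_promote

print(infer_dtype_from_scalar(1))      # (<class 'numpy.int64'>, 1)
print(infer_dtype_from_scalar("a"))    # (<class 'numpy.object_'>, 'a')
ts = pd.Timestamp("2020-01-31", tz="UTC")
print(infer_dtype_from_scalar(ts, pandas_dtype=True))  # (datetime64[ns, UTC], <int ns>)

# maybe_promote: the minimal dtype that can also hold the fill value
print(maybe_promote(np.dtype("int64"), np.nan))   # (dtype('float64'), nan)
print(maybe_promote(np.dtype("bool"), 1))         # (dtype('O'), 1) -- bool + int falls back to object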
+ + Returns + ------- + tipo : type + + Examples + -------- + >>> from collections import namedtuple + >>> Foo = namedtuple("Foo", "dtype") + >>> maybe_infer_dtype_type(Foo(np.dtype("i8"))) + numpy.int64 + """ + tipo = None + if hasattr(element, "dtype"): + tipo = element.dtype + elif is_list_like(element): + element = np.asarray(element) + tipo = element.dtype + return tipo + + +def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): + """ + Provide explicit type promotion and coercion. + + Parameters + ---------- + values : ndarray or ExtensionArray + The array that we want to maybe upcast. + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : bool, default True + If True always make a copy even if no upcast is required. + """ + if not is_scalar(fill_value) and not is_object_dtype(values.dtype): + # We allow arbitrary fill values for object dtype + raise ValueError("fill_value must be a scalar") + + if is_extension_array_dtype(values): + if copy: + values = values.copy() + else: + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = maybe_promote(dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + + return values, fill_value + + +def invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ + non_string_dtypes = dtype_set - {np.dtype("S").type, np.dtype(" 1 and coerce: + raise ValueError( + "Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True." + ) + + if not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + + return to_datetime(values, errors="coerce").to_numpy() + elif timedelta: + from pandas import to_timedelta + + return to_timedelta(values, errors="coerce").to_numpy() + elif numeric: + from pandas import to_numeric + + return to_numeric(values, errors="coerce") + + # Soft conversions + if datetime: + # GH 20380, when datetime is beyond year 2262, hence outside + # bound of nanosecond-resolution 64-bit integers. + try: + values = lib.maybe_convert_objects(values, convert_datetime=True) + except OutOfBoundsDatetime: + pass + + if timedelta and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not convert + values = lib.maybe_convert_objects(values, convert_timedelta=True) + + if numeric and is_object_dtype(values.dtype): + try: + converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) + except (ValueError, TypeError): + pass + else: + # If all NaNs, then do not-alter + values = converted if not isna(converted).all() else values + values = values.copy() if copy else values + + return values + + +def convert_dtypes( + input_array, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, +) -> Dtype: + """ + Convert objects to best possible type, and optionally, + to types supporting ``pd.NA``. + + Parameters + ---------- + input_array : ExtensionArray or PandasArray + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. 
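# Illustrative sketch (not part of the vendored pandas source): the public counterpart
# of the helper documented here. Series/DataFrame.convert_dtypes (new in pandas 1.0)
# uses it to pick pd.NA-aware extension dtypes.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None], "c": [True, False, None]})
print(df.dtypes)                   # a: int64, b: object, c: object
print(df.convert_dtypes().dtypes)  # a: Int64, b: string, c: boolean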
+ convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + + Returns + ------- + dtype + new dtype + """ + + if convert_string or convert_integer or convert_boolean: + try: + inferred_dtype = lib.infer_dtype(input_array) + except ValueError: + # Required to catch due to Period. Can remove once GH 23553 is fixed + inferred_dtype = input_array.dtype + + if not convert_string and is_string_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_integer: + target_int_dtype = "Int64" + + if isinstance(inferred_dtype, str) and ( + inferred_dtype == "mixed-integer" + or inferred_dtype == "mixed-integer-float" + ): + inferred_dtype = target_int_dtype + if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + from pandas.core.arrays.integer import _dtypes + + inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) + if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( + input_array.dtype + ): + inferred_dtype = target_int_dtype + + else: + if is_integer_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + + if convert_boolean: + if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( + input_array.dtype + ): + inferred_dtype = "boolean" + else: + if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = input_array.dtype + + else: + inferred_dtype = input_array.dtype + + return inferred_dtype + + +def maybe_castable(arr) -> bool: + # return False to force a non-fastpath + + # check datetime64[ns]/timedelta64[ns] are valid + # otherwise try to coerce + kind = arr.dtype.kind + if kind == "M": + return is_datetime64_ns_dtype(arr.dtype) + elif kind == "m": + return is_timedelta64_ns_dtype(arr.dtype) + + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + + +def maybe_infer_to_datetimelike(value, convert_dates: bool = False): + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes + + Parameters + ---------- + value : np.array / Series / Index / list-like + convert_dates : bool, default False + if True try really hard to convert dates (such as datetime.date), other + leave inferred dtype 'date' alone + + """ + + # TODO: why not timedelta? 
+ if isinstance( + value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) + ): + return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values + + v = value + + if not is_list_like(v): + v = [v] + v = np.array(v, copy=False) + + # we only care about object dtypes + if not is_object_dtype(v): + return value + + shape = v.shape + if not v.ndim == 1: + v = v.ravel() + + if not len(v): + return value + + def try_datetime(v): + # safe coerce to datetime64 + try: + # GH19671 + v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0] + except ValueError: + + # we might have a sequence of the same-datetimes with tz's + # if so coerce to a DatetimeIndex; if they are not the same, + # then these stay as object dtype, xref GH19671 + from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex + + try: + + values, tz = conversion.datetime_to_datetime64(v) + return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) + except (ValueError, TypeError): + pass + + except Exception: + pass + + return v.reshape(shape) + + def try_timedelta(v): + # safe coerce to timedelta64 + + # will try first with a string & object conversion + from pandas import to_timedelta + + try: + return to_timedelta(v)._ndarray_values.reshape(shape) + except ValueError: + return v.reshape(shape) + + inferred_type = lib.infer_datetimelike_array(ensure_object(v)) + + if inferred_type == "date" and convert_dates: + value = try_datetime(v) + elif inferred_type == "datetime": + value = try_datetime(v) + elif inferred_type == "timedelta": + value = try_timedelta(v) + elif inferred_type == "nat": + + # if all NaT, return as datetime + if isna(v).all(): + value = try_datetime(v) + else: + + # We have at least a NaT and a string + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but technically is also a datetime + value = try_timedelta(v) + if lib.infer_dtype(value, skipna=False) in ["mixed"]: + # cannot skip missing values, as NaT implies that the string + # is actually a datetime + value = try_datetime(v) + + return value + + +def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): + """ try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + """ + from pandas.core.tools.timedeltas import to_timedelta + from pandas.core.tools.datetimes import to_datetime + + if dtype is not None: + if isinstance(dtype, str): + dtype = np.dtype(dtype) + + is_datetime64 = is_datetime64_dtype(dtype) + is_datetime64tz = is_datetime64tz_dtype(dtype) + is_timedelta64 = is_timedelta64_dtype(dtype) + + if is_datetime64 or is_datetime64tz or is_timedelta64: + + # Force the dtype if needed. + msg = ( + f"The '{dtype.name}' dtype has no unit. " + f"Please pass in '{dtype.name}[ns]' instead." 
+ ) + + if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): + + # pandas supports dtype whose granularity is less than [ns] + # e.g., [ps], [fs], [as] + if dtype <= np.dtype("M8[ns]"): + if dtype.name == "datetime64": + raise ValueError(msg) + dtype = _NS_DTYPE + else: + raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") + elif is_datetime64tz: + + # our NaT doesn't support tz's + # this will coerce to DatetimeIndex with + # a matching dtype below + if is_scalar(value) and isna(value): + value = [value] + + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): + + # pandas supports dtype whose granularity is less than [ns] + # e.g., [ps], [fs], [as] + if dtype <= np.dtype("m8[ns]"): + if dtype.name == "timedelta64": + raise ValueError(msg) + dtype = _TD_DTYPE + else: + raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]") + + if is_scalar(value): + if value == iNaT or isna(value): + value = iNaT + else: + value = np.array(value, copy=False) + + # have a scalar array-like (e.g. NaT) + if value.ndim == 0: + value = iNaT + + # we have an array of datetime or timedeltas & nulls + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): + try: + if is_datetime64: + value = to_datetime(value, errors=errors) + # GH 25843: Remove tz information since the dtype + # didn't specify one + if value.tz is not None: + value = value.tz_localize(None) + value = value._values + elif is_datetime64tz: + # The string check can be removed once issue #13712 + # is solved. String data that is passed with a + # datetime64tz is assumed to be naive which should + # be localized to the timezone. + is_dt_string = is_string_dtype(value) + value = to_datetime(value, errors=errors).array + if is_dt_string: + # Strings here are naive, so directly localize + value = value.tz_localize(dtype.tz) + else: + # Numeric values are UTC at this point, + # so localize and convert + value = value.tz_localize("UTC").tz_convert(dtype.tz) + elif is_timedelta64: + value = to_timedelta(value, errors=errors)._values + except OutOfBoundsDatetime: + raise + except (AttributeError, ValueError, TypeError): + pass + + # coerce datetimelike to object + elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): + if is_object_dtype(dtype): + if value.dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + ints = np.asarray(value).view("i8") + return tslib.ints_to_pydatetime(ints) + + # we have a non-castable dtype that was passed + raise TypeError(f"Cannot cast datetime64 to {dtype}") + + else: + + is_array = isinstance(value, np.ndarray) + + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified + if is_array and value.dtype.kind in ["M", "m"]: + dtype = value.dtype + + if dtype.kind == "M" and dtype != _NS_DTYPE: + value = tslibs.conversion.ensure_datetime64ns(value) + + elif dtype.kind == "m" and dtype != _TD_DTYPE: + value = to_timedelta(value) + + # only do this if we have an array and the dtype of the array is not + # setup already we are not an integer/object, so don't bother with this + # conversion + elif not ( + is_array + and not ( + issubclass(value.dtype.type, np.integer) or value.dtype == np.object_ + ) + ): + value = maybe_infer_to_datetimelike(value) + + return value + + +def find_common_type(types): + """ + Find a common data type among the given dtypes. 
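# Illustrative sketch (not part of the vendored pandas source): what find_common_type
# resolves for a few dtype combinations, per the rules this function implements.
import numpy as np
from pandas.core.dtypes.cast import find_common_type

print(find_common_type([np.dtype("int64"), np.dtype("float32")]))   # float64
print(find_common_type([np.dtype("datetime64[ns]")] * 2))           # datetime64[ns]
print(find_common_type([np.dtype("bool"), np.dtype("int64")]))      # object: bool never merges with numeric here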
+ + Parameters + ---------- + types : list of dtypes + + Returns + ------- + pandas extension or numpy dtype + + See Also + -------- + numpy.find_common_type + + """ + + if len(types) == 0: + raise ValueError("no types given") + + first = types[0] + + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) + # => object + if all(is_dtype_equal(first, t) for t in types[1:]): + return first + + if any(isinstance(t, ExtensionDtype) for t in types): + return np.object + + # take lowest unit + if all(is_datetime64_dtype(t) for t in types): + return np.dtype("datetime64[ns]") + if all(is_timedelta64_dtype(t) for t in types): + return np.dtype("timedelta64[ns]") + + # don't mix bool / int or float or complex + # this is different from numpy, which casts bool with float/int as int + has_bools = any(is_bool_dtype(t) for t in types) + if has_bools: + for t in types: + if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): + return np.object + + return np.find_common_type(types, []) + + +def cast_scalar_to_array(shape, value, dtype=None): + """ + Create np.ndarray of specified shape and dtype, filled with values. + + Parameters + ---------- + shape : tuple + value : scalar value + dtype : np.dtype, optional + dtype to coerce + + Returns + ------- + ndarray of shape, filled with value, of specified / inferred dtype + + """ + + if dtype is None: + dtype, fill_value = infer_dtype_from_scalar(value) + else: + fill_value = value + + values = np.empty(shape, dtype=dtype) + values.fill(fill_value) + + return values + + +def construct_1d_arraylike_from_scalar(value, length: int, dtype): + """ + create a np.ndarray / pandas type of specified shape and dtype + filled with values + + Parameters + ---------- + value : scalar value + length : int + dtype : pandas_dtype / np.dtype + + Returns + ------- + np.ndarray / pandas type of length, filled with value + + """ + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + subarr = cls._from_sequence([value] * length, dtype=dtype) + + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + + if length and is_integer_dtype(dtype) and isna(value): + # coerce if we have nan for an integer dtype + dtype = np.dtype("float64") + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): + # we need to coerce to object dtype to avoid + # to allow numpy to take our string as a scalar value + dtype = object + if not isna(value): + value = ensure_str(value) + + subarr = np.empty(length, dtype=dtype) + subarr.fill(value) + + return subarr + + +def construct_1d_object_array_from_listlike(values): + """ + Transform any list-like object in a 1-dimensional numpy array of object + dtype. + + Parameters + ---------- + values : any iterable which has a len() + + Raises + ------ + TypeError + * If `values` does not have a len() + + Returns + ------- + 1-dimensional numpy array of dtype object + """ + # numpy will try to interpret nested lists as further dimensions, hence + # making a 1D array that contains list-likes is a bit tricky: + result = np.empty(len(values), dtype="object") + result[:] = values + return result + + +def construct_1d_ndarray_preserving_na(values, dtype=None, copy: bool = False): + """ + Construct a new ndarray, coercing `values` to `dtype`, preserving NA. + + Parameters + ---------- + values : Sequence + dtype : numpy.dtype, optional + copy : bool, default False + Note that copies may still be made with ``copy=False`` if casting + is required. 
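# Illustrative sketch (not part of the vendored pandas source): why the dedicated
# object-array constructor above exists -- plain np.array() would nest list-likes
# into extra dimensions instead of keeping one element per list.
import numpy as np
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

arr = construct_1d_object_array_from_listlike([[1, 2], [3, 4, 5]])
print(arr.shape, arr.dtype)               # (2,) object -- each element stays a Python list
print(np.array([[1, 2], [3, 4]]).shape)   # (2, 2) -- equal-length lists become a 2-D array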
+ + Returns + ------- + arr : ndarray[dtype] + + Examples + -------- + >>> np.array([1.0, 2.0, None], dtype='str') + array(['1.0', '2.0', 'None'], dtype='>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) + array(['1.0', '2.0', None], dtype=object) + """ + subarr = np.array(values, dtype=dtype, copy=copy) + + if dtype is not None and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. Se we have to go + # string -> object -> update NA, which requires an + # additional pass over the data. + na_values = isna(values) + subarr2 = subarr.astype(object) + subarr2[na_values] = np.asarray(values, dtype=object)[na_values] + subarr = subarr2 + + return subarr + + +def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): + """ + Takes any dtype and returns the casted version, raising for when data is + incompatible with integer/unsigned integer dtypes. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + arr : array-like + The array to cast. + dtype : str, np.dtype + The integer dtype to cast the array to. + copy: bool, default False + Whether to make a copy of the array before returning. + + Returns + ------- + int_arr : ndarray + An array of integer or unsigned integer dtype + + Raises + ------ + OverflowError : the dtype is incompatible with the data + ValueError : loss of precision has occurred during casting + + Examples + -------- + If you try to coerce negative values to unsigned integers, it raises: + + >>> Series([-1], dtype="uint64") + Traceback (most recent call last): + ... + OverflowError: Trying to coerce negative values to unsigned integers + + Also, if you try to coerce float values to integers, it raises: + + >>> Series([1, 2, 3.5], dtype="int64") + Traceback (most recent call last): + ... + ValueError: Trying to coerce float values to integers + """ + + try: + if not hasattr(arr, "astype"): + casted = np.array(arr, dtype=dtype, copy=copy) + else: + casted = arr.astype(dtype, copy=copy) + except OverflowError: + raise OverflowError( + "The elements provided in the data cannot all be " + f"casted to the dtype {dtype}" + ) + + if np.array_equal(arr, casted): + return casted + + # We do this casting to allow for proper + # data and dtype checking. + # + # We didn't do this earlier because NumPy + # doesn't handle `uint64` correctly. 
+ arr = np.asarray(arr) + + if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): + raise OverflowError("Trying to coerce negative values to unsigned integers") + + if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): + raise ValueError("Trying to coerce float values to integers") diff --git a/venv/Lib/site-packages/pandas/core/dtypes/common.py b/venv/Lib/site-packages/pandas/core/dtypes/common.py new file mode 100644 index 0000000..5a007f2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/common.py @@ -0,0 +1,1891 @@ +""" common type operations """ +from typing import Any, Callable, Union +import warnings + +import numpy as np + +from pandas._libs import algos, lib +from pandas._libs.tslibs import conversion +from pandas._typing import ArrayLike + +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, + registry, +) +from pandas.core.dtypes.generic import ( + ABCCategorical, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.inference import ( # noqa:F401 + is_array_like, + is_bool, + is_complex, + is_decimal, + is_dict_like, + is_file_like, + is_float, + is_hashable, + is_integer, + is_interval, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_number, + is_re, + is_re_compilable, + is_scalar, + is_sequence, +) + +_POSSIBLY_CAST_DTYPES = { + np.dtype(t).name + for t in [ + "O", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + ] +} + +_NS_DTYPE = conversion.NS_DTYPE +_TD_DTYPE = conversion.TD_DTYPE +_INT64_DTYPE = np.dtype(np.int64) + +# oh the troubles to reduce import time +_is_scipy_sparse = None + +ensure_float64 = algos.ensure_float64 +ensure_float32 = algos.ensure_float32 + +_ensure_datetime64ns = conversion.ensure_datetime64ns +_ensure_timedelta64ns = conversion.ensure_timedelta64ns + + +def ensure_float(arr): + """ + Ensure that an array object has a float dtype if possible. + + Parameters + ---------- + arr : array-like + The array whose data type we want to enforce as float. + + Returns + ------- + float_arr : The original array cast to the float dtype if + possible. Otherwise, the original array is returned. + """ + + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.astype(float) + return arr + + +ensure_uint64 = algos.ensure_uint64 +ensure_int64 = algos.ensure_int64 +ensure_int32 = algos.ensure_int32 +ensure_int16 = algos.ensure_int16 +ensure_int8 = algos.ensure_int8 +ensure_platform_int = algos.ensure_platform_int +ensure_object = algos.ensure_object + + +def ensure_str(value: Union[bytes, Any]) -> str: + """ + Ensure that bytes and non-strings get converted into ``str`` objects. + """ + if isinstance(value, bytes): + value = value.decode("utf-8") + elif not isinstance(value, str): + value = str(value) + return value + + +def ensure_categorical(arr): + """ + Ensure that an array-like object is a Categorical (if not already). + + Parameters + ---------- + arr : array-like + The array that we want to convert into a Categorical. + + Returns + ------- + cat_arr : The original array cast as a Categorical. If it already + is a Categorical, we return as is. 
+ """ + + if not is_categorical(arr): + from pandas import Categorical + + arr = Categorical(arr) + return arr + + +def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: + """ + Ensure that an dtype array of some integer dtype + has an int64 dtype if possible. + If it's not possible, potentially because of overflow, + convert the array to float64 instead. + + Parameters + ---------- + arr : array-like + The array whose data type we want to enforce. + copy: bool + Whether to copy the original array or reuse + it in place, if possible. + + Returns + ------- + out_arr : The input array cast as int64 if + possible without overflow. + Otherwise the input array cast to float64. + + Notes + ----- + If the array is explicitly of type uint64 the type + will remain unchanged. + """ + # TODO: GH27506 potential bug with ExtensionArrays + try: + return arr.astype("int64", copy=copy, casting="safe") # type: ignore + except TypeError: + pass + try: + return arr.astype("uint64", copy=copy, casting="safe") # type: ignore + except TypeError: + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) + return arr.astype("float64", copy=copy) + + +def ensure_python_int(value: Union[int, np.integer]) -> int: + """ + Ensure that a value is a python int. + + Parameters + ---------- + value: int or numpy.integer + + Returns + ------- + int + + Raises + ------ + TypeError: if the value isn't an int or can't be converted to one. + """ + if not is_scalar(value): + raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") + msg = "Wrong type {} for value {}" + try: + new_value = int(value) + assert new_value == value + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(type(value), value)) + return new_value + + +def classes(*klasses) -> Callable: + """ evaluate if the tipo is a subclass of the klasses """ + return lambda tipo: issubclass(tipo, klasses) + + +def classes_and_not_datetimelike(*klasses) -> Callable: + """ + evaluate if the tipo is a subclass of the klasses + and not a datetimelike + """ + return lambda tipo: ( + issubclass(tipo, klasses) + and not issubclass(tipo, (np.datetime64, np.timedelta64)) + ) + + +def is_object_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the object dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the object dtype. + + Examples + -------- + >>> is_object_dtype(object) + True + >>> is_object_dtype(int) + False + >>> is_object_dtype(np.array([], dtype=object)) + True + >>> is_object_dtype(np.array([], dtype=int)) + False + >>> is_object_dtype([1, 2, 3]) + False + """ + return _is_dtype_type(arr_or_dtype, classes(np.object_)) + + +def is_sparse(arr) -> bool: + """ + Check whether an array-like is a 1-D pandas sparse array. + + Check that the one-dimensional array-like is a pandas sparse array. + Returns True if it is a pandas sparse array, not another type of + sparse array. + + Parameters + ---------- + arr : array-like + Array-like to check. + + Returns + ------- + bool + Whether or not the array-like is a pandas sparse array. + + Examples + -------- + Returns `True` if the parameter is a 1-D pandas sparse array. + + >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0])) + True + >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0]))) + True + + Returns `False` if the parameter is not sparse. 
+ + >>> is_sparse(np.array([0, 0, 1, 0])) + False + >>> is_sparse(pd.Series([0, 1, 0, 0])) + False + + Returns `False` if the parameter is not a pandas sparse array. + + >>> from scipy.sparse import bsr_matrix + >>> is_sparse(bsr_matrix([0, 1, 0, 0])) + False + + Returns `False` if the parameter has more than one dimension. + """ + from pandas.core.arrays.sparse import SparseDtype + + dtype = getattr(arr, "dtype", arr) + return isinstance(dtype, SparseDtype) + + +def is_scipy_sparse(arr) -> bool: + """ + Check whether an array-like is a scipy.sparse.spmatrix instance. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is a scipy.sparse.spmatrix instance. + + Notes + ----- + If scipy is not installed, this function will always return False. + + Examples + -------- + >>> from scipy.sparse import bsr_matrix + >>> is_scipy_sparse(bsr_matrix([1, 2, 3])) + True + >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3])) + False + """ + + global _is_scipy_sparse + + if _is_scipy_sparse is None: + try: + from scipy.sparse import issparse as _is_scipy_sparse + except ImportError: + _is_scipy_sparse = lambda _: False + + assert _is_scipy_sparse is not None + return _is_scipy_sparse(arr) + + +def is_categorical(arr) -> bool: + """ + Check whether an array-like is a Categorical instance. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is of a Categorical instance. + + Examples + -------- + >>> is_categorical([1, 2, 3]) + False + + Categoricals, Series Categoricals, and CategoricalIndex will return True. + + >>> cat = pd.Categorical([1, 2, 3]) + >>> is_categorical(cat) + True + >>> is_categorical(pd.Series(cat)) + True + >>> is_categorical(pd.CategoricalIndex([1, 2, 3])) + True + """ + + return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) + + +def is_datetime64_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the datetime64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the datetime64 dtype. + + Examples + -------- + >>> is_datetime64_dtype(object) + False + >>> is_datetime64_dtype(np.datetime64) + True + >>> is_datetime64_dtype(np.array([], dtype=int)) + False + >>> is_datetime64_dtype(np.array([], dtype=np.datetime64)) + True + >>> is_datetime64_dtype([1, 2, 3]) + False + """ + + return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) + + +def is_datetime64tz_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. 
+ + Examples + -------- + >>> is_datetime64tz_dtype(object) + False + >>> is_datetime64tz_dtype([1, 2, 3]) + False + >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) # tz-naive + False + >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + True + + >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") + >>> s = pd.Series([], dtype=dtype) + >>> is_datetime64tz_dtype(dtype) + True + >>> is_datetime64tz_dtype(s) + True + """ + + if arr_or_dtype is None: + return False + return DatetimeTZDtype.is_dtype(arr_or_dtype) + + +def is_timedelta64_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the timedelta64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the timedelta64 dtype. + + Examples + -------- + >>> is_timedelta64_dtype(object) + False + >>> is_timedelta64_dtype(np.timedelta64) + True + >>> is_timedelta64_dtype([1, 2, 3]) + False + >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) + True + >>> is_timedelta64_dtype('0 days') + False + """ + + return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) + + +def is_period_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Period dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Period dtype. + + Examples + -------- + >>> is_period_dtype(object) + False + >>> is_period_dtype(PeriodDtype(freq="D")) + True + >>> is_period_dtype([1, 2, 3]) + False + >>> is_period_dtype(pd.Period("2017-01-01")) + False + >>> is_period_dtype(pd.PeriodIndex([], freq="A")) + True + """ + + # TODO: Consider making Period an instance of PeriodDtype + if arr_or_dtype is None: + return False + return PeriodDtype.is_dtype(arr_or_dtype) + + +def is_interval_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Interval dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Interval dtype. + + Examples + -------- + >>> is_interval_dtype(object) + False + >>> is_interval_dtype(IntervalDtype()) + True + >>> is_interval_dtype([1, 2, 3]) + False + >>> + >>> interval = pd.Interval(1, 2, closed="right") + >>> is_interval_dtype(interval) + False + >>> is_interval_dtype(pd.IntervalIndex([interval])) + True + """ + + # TODO: Consider making Interval an instance of IntervalDtype + if arr_or_dtype is None: + return False + return IntervalDtype.is_dtype(arr_or_dtype) + + +def is_categorical_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Categorical dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Categorical dtype. 
+ + Examples + -------- + >>> is_categorical_dtype(object) + False + >>> is_categorical_dtype(CategoricalDtype()) + True + >>> is_categorical_dtype([1, 2, 3]) + False + >>> is_categorical_dtype(pd.Categorical([1, 2, 3])) + True + >>> is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) + True + """ + + if arr_or_dtype is None: + return False + return CategoricalDtype.is_dtype(arr_or_dtype) + + +def is_string_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the string dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the string dtype. + + Examples + -------- + >>> is_string_dtype(str) + True + >>> is_string_dtype(object) + True + >>> is_string_dtype(int) + False + >>> + >>> is_string_dtype(np.array(['a', 'b'])) + True + >>> is_string_dtype(pd.Series([1, 2])) + False + """ + + # TODO: gh-15585: consider making the checks stricter. + def condition(dtype) -> bool: + return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + + def is_excluded_dtype(dtype) -> bool: + """ + These have kind = "O" but aren't string dtypes so need to be explicitly excluded + """ + is_excluded_checks = (is_period_dtype, is_interval_dtype) + return any(is_excluded(dtype) for is_excluded in is_excluded_checks) + + return _is_dtype(arr_or_dtype, condition) + + +def is_period_arraylike(arr) -> bool: + """ + Check whether an array-like is a periodical array-like or PeriodIndex. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is a periodical array-like or + PeriodIndex instance. + + Examples + -------- + >>> is_period_arraylike([1, 2, 3]) + False + >>> is_period_arraylike(pd.Index([1, 2, 3])) + False + >>> is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) + True + """ + + if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return is_period_dtype(arr.dtype) + return getattr(arr, "inferred_type", None) == "period" + + +def is_datetime_arraylike(arr) -> bool: + """ + Check whether an array-like is a datetime array-like or DatetimeIndex. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is a datetime array-like or + DatetimeIndex. + + Examples + -------- + >>> is_datetime_arraylike([1, 2, 3]) + False + >>> is_datetime_arraylike(pd.Index([1, 2, 3])) + False + >>> is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) + True + """ + + if isinstance(arr, ABCDatetimeIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return ( + is_object_dtype(arr.dtype) + and lib.infer_dtype(arr, skipna=False) == "datetime" + ) + return getattr(arr, "inferred_type", None) == "datetime" + + +def is_dtype_equal(source, target) -> bool: + """ + Check if two dtypes are equal. + + Parameters + ---------- + source : The first dtype to compare + target : The second dtype to compare + + Returns + ------- + boolean + Whether or not the two dtypes are equal. 
+ + Examples + -------- + >>> is_dtype_equal(int, float) + False + >>> is_dtype_equal("int", int) + True + >>> is_dtype_equal(object, "category") + False + >>> is_dtype_equal(CategoricalDtype(), "category") + True + >>> is_dtype_equal(DatetimeTZDtype(), "datetime64") + False + """ + + try: + source = _get_dtype(source) + target = _get_dtype(target) + return source == target + except (TypeError, AttributeError): + + # invalid comparison + # object == category will hit this + return False + + +def is_any_int_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of an integer dtype. + + In this function, timedelta64 instances are also considered "any-integer" + type objects and will return True. + + This function is internal and should not be exposed in the public API. + + .. versionchanged:: 0.24.0 + + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of an integer dtype. + + Examples + -------- + >>> is_any_int_dtype(str) + False + >>> is_any_int_dtype(int) + True + >>> is_any_int_dtype(float) + False + >>> is_any_int_dtype(np.uint64) + True + >>> is_any_int_dtype(np.datetime64) + False + >>> is_any_int_dtype(np.timedelta64) + True + >>> is_any_int_dtype(np.array(['a', 'b'])) + False + >>> is_any_int_dtype(pd.Series([1, 2])) + True + >>> is_any_int_dtype(np.array([], dtype=np.timedelta64)) + True + >>> is_any_int_dtype(pd.Index([1, 2.])) # float + False + """ + + return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) + + +def is_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of an integer dtype. + + Unlike in `in_any_int_dtype`, timedelta64 instances will return False. + + .. versionchanged:: 0.24.0 + + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of an integer dtype and + not an instance of timedelta64. + + Examples + -------- + >>> is_integer_dtype(str) + False + >>> is_integer_dtype(int) + True + >>> is_integer_dtype(float) + False + >>> is_integer_dtype(np.uint64) + True + >>> is_integer_dtype('int8') + True + >>> is_integer_dtype('Int8') + True + >>> is_integer_dtype(pd.Int8Dtype) + True + >>> is_integer_dtype(np.datetime64) + False + >>> is_integer_dtype(np.timedelta64) + False + >>> is_integer_dtype(np.array(['a', 'b'])) + False + >>> is_integer_dtype(pd.Series([1, 2])) + True + >>> is_integer_dtype(np.array([], dtype=np.timedelta64)) + False + >>> is_integer_dtype(pd.Index([1, 2.])) # float + False + """ + + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) + + +def is_signed_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a signed integer dtype. + + Unlike in `in_any_int_dtype`, timedelta64 instances will return False. + + .. versionchanged:: 0.24.0 + + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a signed integer dtype + and not an instance of timedelta64. 
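As the versionchanged notes above state, the nullable integer extension dtypes also count as integers for these checks. A small sketch, assuming a pandas version (0.24+) that provides pd.array and the "Int64" dtype:
>>> is_any_int_dtype(pd.array([1, None], dtype="Int64"))
True
>>> is_integer_dtype(pd.array([1, None], dtype="Int64"))
True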
+ + Examples + -------- + >>> is_signed_integer_dtype(str) + False + >>> is_signed_integer_dtype(int) + True + >>> is_signed_integer_dtype(float) + False + >>> is_signed_integer_dtype(np.uint64) # unsigned + False + >>> is_signed_integer_dtype('int8') + True + >>> is_signed_integer_dtype('Int8') + True + >>> is_signed_dtype(pd.Int8Dtype) + True + >>> is_signed_integer_dtype(np.datetime64) + False + >>> is_signed_integer_dtype(np.timedelta64) + False + >>> is_signed_integer_dtype(np.array(['a', 'b'])) + False + >>> is_signed_integer_dtype(pd.Series([1, 2])) + True + >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64)) + False + >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float + False + >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned + False + """ + + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) + + +def is_unsigned_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of an unsigned integer dtype. + + .. versionchanged:: 0.24.0 + + The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also + considered as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of an unsigned integer dtype. + + Examples + -------- + >>> is_unsigned_integer_dtype(str) + False + >>> is_unsigned_integer_dtype(int) # signed + False + >>> is_unsigned_integer_dtype(float) + False + >>> is_unsigned_integer_dtype(np.uint64) + True + >>> is_unsigned_integer_dtype('uint8') + True + >>> is_unsigned_integer_dtype('UInt8') + True + >>> is_unsigned_integer_dtype(pd.UInt8Dtype) + True + >>> is_unsigned_integer_dtype(np.array(['a', 'b'])) + False + >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed + False + >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float + False + >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32)) + True + """ + return _is_dtype_type( + arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger) + ) + + +def is_int64_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the int64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the int64 dtype. + + Notes + ----- + Depending on system architecture, the return value of `is_int64_dtype( + int)` will be True if the OS uses 64-bit integers and False if the OS + uses 32-bit integers. + + Examples + -------- + >>> is_int64_dtype(str) + False + >>> is_int64_dtype(np.int32) + False + >>> is_int64_dtype(np.int64) + True + >>> is_int64_dtype('int8') + False + >>> is_int64_dtype('Int8') + False + >>> is_int64_dtype(pd.Int64Dtype) + True + >>> is_int64_dtype(float) + False + >>> is_int64_dtype(np.uint64) # unsigned + False + >>> is_int64_dtype(np.array(['a', 'b'])) + False + >>> is_int64_dtype(np.array([1, 2], dtype=np.int64)) + True + >>> is_int64_dtype(pd.Index([1, 2.])) # float + False + >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned + False + """ + + return _is_dtype_type(arr_or_dtype, classes(np.int64)) + + +def is_datetime64_any_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the datetime64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the datetime64 dtype. 
+ + Examples + -------- + >>> is_datetime64_any_dtype(str) + False + >>> is_datetime64_any_dtype(int) + False + >>> is_datetime64_any_dtype(np.datetime64) # can be tz-naive + True + >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) + True + >>> is_datetime64_any_dtype(np.array(['a', 'b'])) + False + >>> is_datetime64_any_dtype(np.array([1, 2])) + False + >>> is_datetime64_any_dtype(np.array([], dtype=np.datetime64)) + True + >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3], + dtype=np.datetime64)) + True + """ + + if arr_or_dtype is None: + return False + return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) + + +def is_datetime64_ns_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the datetime64[ns] dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the datetime64[ns] dtype. + + Examples + -------- + >>> is_datetime64_ns_dtype(str) + False + >>> is_datetime64_ns_dtype(int) + False + >>> is_datetime64_ns_dtype(np.datetime64) # no unit + False + >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) + True + >>> is_datetime64_ns_dtype(np.array(['a', 'b'])) + False + >>> is_datetime64_ns_dtype(np.array([1, 2])) + False + >>> is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) # no unit + False + >>> is_datetime64_ns_dtype(np.array([], + dtype="datetime64[ps]")) # wrong unit + False + >>> is_datetime64_ns_dtype(pd.DatetimeIndex([1, 2, 3], + dtype=np.datetime64)) # has 'ns' unit + True + """ + + if arr_or_dtype is None: + return False + try: + tipo = _get_dtype(arr_or_dtype) + except TypeError: + if is_datetime64tz_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype.dtype) + else: + return False + return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE + + +def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the timedelta64[ns] dtype. + + This is a very specific dtype, so generic ones like `np.timedelta64` + will return False if passed into this function. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the timedelta64[ns] dtype. + + Examples + -------- + >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]')) + True + >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency + False + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + True + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + False + """ + return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE) + + +def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of + a timedelta64 or datetime64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a timedelta64, + or datetime64 dtype. 
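Note that this combined check only matches the numpy datetime64/timedelta64 types, so tz-aware data (DatetimeTZDtype) is not covered by it; a hedged one-line sketch:
>>> is_datetime_or_timedelta_dtype(DatetimeTZDtype("ns", "US/Eastern"))  # tz-aware
False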
+ + Examples + -------- + >>> is_datetime_or_timedelta_dtype(str) + False + >>> is_datetime_or_timedelta_dtype(int) + False + >>> is_datetime_or_timedelta_dtype(np.datetime64) + True + >>> is_datetime_or_timedelta_dtype(np.timedelta64) + True + >>> is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) + False + >>> is_datetime_or_timedelta_dtype(pd.Series([1, 2])) + False + >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64)) + True + >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64)) + True + """ + + return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64)) + + +def _is_unorderable_exception(e: TypeError) -> bool: + """ + Check if the exception raised is an unorderable exception. + + Parameters + ---------- + e : Exception or sub-class + The exception object to check. + + Returns + ------- + bool + Whether or not the exception raised is an unorderable exception. + """ + return "'>' not supported between instances of" in str(e) + + +# This exists to silence numpy deprecation warnings, see GH#29553 +def is_numeric_v_string_like(a, b): + """ + Check if we are comparing a string-like object to a numeric ndarray. + NumPy doesn't like to compare such objects, especially numeric arrays + and scalar string-likes. + + Parameters + ---------- + a : array-like, scalar + The first object to check. + b : array-like, scalar + The second object to check. + + Returns + ------- + boolean + Whether we return a comparing a string-like object to a numeric array. + + Examples + -------- + >>> is_numeric_v_string_like(1, 1) + False + >>> is_numeric_v_string_like("foo", "foo") + False + >>> is_numeric_v_string_like(1, "foo") # non-array numeric + False + >>> is_numeric_v_string_like(np.array([1]), "foo") + True + >>> is_numeric_v_string_like("foo", np.array([1])) # symmetric check + True + >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) + True + >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) + True + >>> is_numeric_v_string_like(np.array([1]), np.array([2])) + False + >>> is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) + False + """ + + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and is_numeric_dtype(a) + is_b_numeric_array = is_b_array and is_numeric_dtype(b) + is_a_string_array = is_a_array and is_string_like_dtype(a) + is_b_string_array = is_b_array and is_string_like_dtype(b) + + is_a_scalar_string_like = not is_a_array and isinstance(a, str) + is_b_scalar_string_like = not is_b_array and isinstance(b, str) + + return ( + (is_a_numeric_array and is_b_scalar_string_like) + or (is_b_numeric_array and is_a_scalar_string_like) + or (is_a_numeric_array and is_b_string_array) + or (is_b_numeric_array and is_a_string_array) + ) + + +# This exists to silence numpy deprecation warnings, see GH#29553 +def is_datetimelike_v_numeric(a, b): + """ + Check if we are comparing a datetime-like object to a numeric object. + By "numeric," we mean an object that is either of an int or float dtype. + + Parameters + ---------- + a : array-like, scalar + The first object to check. + b : array-like, scalar + The second object to check. + + Returns + ------- + boolean + Whether we return a comparing a datetime-like to a numeric object. 
+ + Examples + -------- + >>> dt = np.datetime64(pd.datetime(2017, 1, 1)) + >>> + >>> is_datetimelike_v_numeric(1, 1) + False + >>> is_datetimelike_v_numeric(dt, dt) + False + >>> is_datetimelike_v_numeric(1, dt) + True + >>> is_datetimelike_v_numeric(dt, 1) # symmetric check + True + >>> is_datetimelike_v_numeric(np.array([dt]), 1) + True + >>> is_datetimelike_v_numeric(np.array([1]), dt) + True + >>> is_datetimelike_v_numeric(np.array([dt]), np.array([1])) + True + >>> is_datetimelike_v_numeric(np.array([1]), np.array([2])) + False + >>> is_datetimelike_v_numeric(np.array([dt]), np.array([dt])) + False + """ + + if not hasattr(a, "dtype"): + a = np.asarray(a) + if not hasattr(b, "dtype"): + b = np.asarray(b) + + def is_numeric(x): + """ + Check if an object has a numeric dtype (i.e. integer or float). + """ + return is_integer_dtype(x) or is_float_dtype(x) + + return (needs_i8_conversion(a) and is_numeric(b)) or ( + needs_i8_conversion(b) and is_numeric(a) + ) + + +def needs_i8_conversion(arr_or_dtype) -> bool: + """ + Check whether the array or dtype should be converted to int64. + + An array-like or dtype "needs" such a conversion if the array-like + or dtype is of a datetime-like dtype + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype should be converted to int64. + + Examples + -------- + >>> needs_i8_conversion(str) + False + >>> needs_i8_conversion(np.int64) + False + >>> needs_i8_conversion(np.datetime64) + True + >>> needs_i8_conversion(np.array(['a', 'b'])) + False + >>> needs_i8_conversion(pd.Series([1, 2])) + False + >>> needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) + True + >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + True + """ + + if arr_or_dtype is None: + return False + return ( + is_datetime_or_timedelta_dtype(arr_or_dtype) + or is_datetime64tz_dtype(arr_or_dtype) + or is_period_dtype(arr_or_dtype) + ) + + +def is_numeric_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a numeric dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a numeric dtype. + + Examples + -------- + >>> is_numeric_dtype(str) + False + >>> is_numeric_dtype(int) + True + >>> is_numeric_dtype(float) + True + >>> is_numeric_dtype(np.uint64) + True + >>> is_numeric_dtype(np.datetime64) + False + >>> is_numeric_dtype(np.timedelta64) + False + >>> is_numeric_dtype(np.array(['a', 'b'])) + False + >>> is_numeric_dtype(pd.Series([1, 2])) + True + >>> is_numeric_dtype(pd.Index([1, 2.])) + True + >>> is_numeric_dtype(np.array([], dtype=np.timedelta64)) + False + """ + + return _is_dtype_type( + arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_) + ) + + +def is_string_like_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a string-like dtype. + + Unlike `is_string_dtype`, the object dtype is excluded because it + is a mixed dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the string dtype. 
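Since needs_i8_conversion above also ORs in is_period_dtype, Period-backed data takes the i8 path as well; a short sketch, assuming pandas is imported as pd:
>>> needs_i8_conversion(pd.PeriodIndex([], freq="A"))
True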
+ + Examples + -------- + >>> is_string_like_dtype(str) + True + >>> is_string_like_dtype(object) + False + >>> is_string_like_dtype(np.array(['a', 'b'])) + True + >>> is_string_like_dtype(pd.Series([1, 2])) + False + """ + + return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) + + +def is_float_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a float dtype. + + This function is internal and should not be exposed in the public API. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a float dtype. + + Examples + -------- + >>> is_float_dtype(str) + False + >>> is_float_dtype(int) + False + >>> is_float_dtype(float) + True + >>> is_float_dtype(np.array(['a', 'b'])) + False + >>> is_float_dtype(pd.Series([1, 2])) + False + >>> is_float_dtype(pd.Index([1, 2.])) + True + """ + return _is_dtype_type(arr_or_dtype, classes(np.floating)) + + +def is_bool_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a boolean dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a boolean dtype. + + Notes + ----- + An ExtensionArray is considered boolean when the ``_is_boolean`` + attribute is set to True. + + Examples + -------- + >>> is_bool_dtype(str) + False + >>> is_bool_dtype(int) + False + >>> is_bool_dtype(bool) + True + >>> is_bool_dtype(np.bool) + True + >>> is_bool_dtype(np.array(['a', 'b'])) + False + >>> is_bool_dtype(pd.Series([1, 2])) + False + >>> is_bool_dtype(np.array([True, False])) + True + >>> is_bool_dtype(pd.Categorical([True, False])) + True + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) + True + """ + if arr_or_dtype is None: + return False + try: + dtype = _get_dtype(arr_or_dtype) + except TypeError: + return False + + if isinstance(arr_or_dtype, CategoricalDtype): + arr_or_dtype = arr_or_dtype.categories + # now we use the special definition for Index + + if isinstance(arr_or_dtype, ABCIndexClass): + + # TODO(jreback) + # we don't have a boolean Index class + # so its object, we need to infer to + # guess this + return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" + elif is_extension_array_dtype(arr_or_dtype): + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + return dtype._is_boolean + + return issubclass(dtype.type, np.bool_) + + +def is_extension_type(arr) -> bool: + """ + Check whether an array-like is of a pandas extension class instance. + + .. deprecated:: 1.0.0 + Use ``is_extension_array_dtype`` instead. + + Extension classes include categoricals, pandas sparse objects (i.e. + classes represented within the pandas library and not ones external + to it like scipy sparse matrices), and datetime-like arrays. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is of a pandas extension class instance. 
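The ExtensionArray branch noted above (the ``_is_boolean`` flag) also covers the nullable boolean dtype; a sketch, assuming a pandas version (1.0+) that ships BooleanDtype:
>>> is_bool_dtype(pd.array([True, None], dtype="boolean"))
True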
+ + Examples + -------- + >>> is_extension_type([1, 2, 3]) + False + >>> is_extension_type(np.array([1, 2, 3])) + False + >>> + >>> cat = pd.Categorical([1, 2, 3]) + >>> + >>> is_extension_type(cat) + True + >>> is_extension_type(pd.Series(cat)) + True + >>> is_extension_type(pd.arrays.SparseArray([1, 2, 3])) + True + >>> from scipy.sparse import bsr_matrix + >>> is_extension_type(bsr_matrix([1, 2, 3])) + False + >>> is_extension_type(pd.DatetimeIndex([1, 2, 3])) + False + >>> is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + True + >>> + >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") + >>> s = pd.Series([], dtype=dtype) + >>> is_extension_type(s) + True + """ + warnings.warn( + "'is_extension_type' is deprecated and will be removed in a future " + "version. Use 'is_extension_array_dtype' instead.", + FutureWarning, + stacklevel=2, + ) + + if is_categorical(arr): + return True + elif is_sparse(arr): + return True + elif is_datetime64tz_dtype(arr): + return True + return False + + +def is_extension_array_dtype(arr_or_dtype) -> bool: + """ + Check if an object is a pandas extension array type. + + See the :ref:`Use Guide ` for more. + + Parameters + ---------- + arr_or_dtype : object + For array-like input, the ``.dtype`` attribute will + be extracted. + + Returns + ------- + bool + Whether the `arr_or_dtype` is an extension array type. + + Notes + ----- + This checks whether an object implements the pandas extension + array interface. In pandas, this includes: + + * Categorical + * Sparse + * Interval + * Period + * DatetimeArray + * TimedeltaArray + + Third-party libraries may implement arrays or types satisfying + this interface as well. + + Examples + -------- + >>> from pandas.api.types import is_extension_array_dtype + >>> arr = pd.Categorical(['a', 'b']) + >>> is_extension_array_dtype(arr) + True + >>> is_extension_array_dtype(arr.dtype) + True + + >>> arr = np.array(['a', 'b']) + >>> is_extension_array_dtype(arr.dtype) + False + """ + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None + + +def is_complex_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a complex dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a complex dtype. + + Examples + -------- + >>> is_complex_dtype(str) + False + >>> is_complex_dtype(int) + False + >>> is_complex_dtype(np.complex) + True + >>> is_complex_dtype(np.array(['a', 'b'])) + False + >>> is_complex_dtype(pd.Series([1, 2])) + False + >>> is_complex_dtype(np.array([1 + 1j, 5])) + True + """ + + return _is_dtype_type(arr_or_dtype, classes(np.complexfloating)) + + +def _is_dtype(arr_or_dtype, condition) -> bool: + """ + Return a boolean if the condition is satisfied for the arr_or_dtype. + + Parameters + ---------- + arr_or_dtype : array-like, str, np.dtype, or ExtensionArrayType + The array-like or dtype object whose dtype we want to extract. + condition : callable[Union[np.dtype, ExtensionDtype]] + + Returns + ------- + bool + + """ + + if arr_or_dtype is None: + return False + try: + dtype = _get_dtype(arr_or_dtype) + except (TypeError, ValueError, UnicodeEncodeError): + return False + return condition(dtype) + + +def _get_dtype(arr_or_dtype): + """ + Get the dtype instance associated with an array + or dtype object. 
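The private helper _is_dtype simply resolves its input to a dtype and applies the given predicate, returning False for None or unresolvable inputs. A minimal sketch of that contract, assuming pandas is imported as pd:
>>> _is_dtype(pd.Series(["a", "b"]), lambda dtype: dtype.kind == "O")
True
>>> _is_dtype(None, lambda dtype: True)  # None short-circuits to False
False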
+ + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype object whose dtype we want to extract. + + Returns + ------- + obj_dtype : The extract dtype instance from the + passed in array or dtype object. + + Raises + ------ + TypeError : The passed in object is None. + """ + + if arr_or_dtype is None: + raise TypeError("Cannot deduce dtype from null object") + + # fastpath + elif isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + + # if we have an array-like + elif hasattr(arr_or_dtype, "dtype"): + arr_or_dtype = arr_or_dtype.dtype + + return pandas_dtype(arr_or_dtype) + + +def _is_dtype_type(arr_or_dtype, condition) -> bool: + """ + Return a boolean if the condition is satisfied for the arr_or_dtype. + + Parameters + ---------- + arr_or_dtype : array-like + The array-like or dtype object whose dtype we want to extract. + condition : callable[Union[np.dtype, ExtensionDtypeType]] + + Returns + ------- + bool : if the condition is satisfied for the arr_or_dtype + """ + + if arr_or_dtype is None: + return condition(type(None)) + + # fastpath + if isinstance(arr_or_dtype, np.dtype): + return condition(arr_or_dtype.type) + elif isinstance(arr_or_dtype, type): + if issubclass(arr_or_dtype, ExtensionDtype): + arr_or_dtype = arr_or_dtype.type + return condition(np.dtype(arr_or_dtype).type) + + # if we have an array-like + if hasattr(arr_or_dtype, "dtype"): + arr_or_dtype = arr_or_dtype.dtype + + # we are not possibly a dtype + elif is_list_like(arr_or_dtype): + return condition(type(None)) + + try: + tipo = pandas_dtype(arr_or_dtype).type + except (TypeError, ValueError, UnicodeEncodeError): + if is_scalar(arr_or_dtype): + return condition(type(None)) + + return False + + return condition(tipo) + + +def infer_dtype_from_object(dtype): + """ + Get a numpy dtype.type-style object for a dtype object. + + This methods also includes handling of the datetime64[ns] and + datetime64[ns, TZ] objects. + + If no dtype can be found, we return ``object``. + + Parameters + ---------- + dtype : dtype, type + The dtype object whose numpy dtype.type-style + object we want to extract. + + Returns + ------- + dtype_object : The extracted numpy dtype.type-style object. + """ + + if isinstance(dtype, type) and issubclass(dtype, np.generic): + # Type object from a dtype + return dtype + elif isinstance(dtype, (np.dtype, ExtensionDtype)): + # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # Should still pass if we don't have a date-like + pass + return dtype.type + + try: + dtype = pandas_dtype(dtype) + except TypeError: + pass + + if is_extension_array_dtype(dtype): + return dtype.type + elif isinstance(dtype, str): + + # TODO(jreback) + # should deprecate these + if dtype in ["datetimetz", "datetime64tz"]: + return DatetimeTZDtype.type + elif dtype in ["period"]: + raise NotImplementedError + + if dtype == "datetime" or dtype == "timedelta": + dtype += "64" + try: + return infer_dtype_from_object(getattr(np, dtype)) + except (AttributeError, TypeError): + # Handles cases like _get_dtype(int) i.e., + # Python objects that are valid dtypes + # (unlike user-defined types, in general) + # + # TypeError handles the float16 type code of 'e' + # further handle internal types + pass + + return infer_dtype_from_object(np.dtype(dtype)) + + +def _validate_date_like_dtype(dtype) -> None: + """ + Check whether the dtype is a date-like dtype. Raises an error if invalid. 
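A quick sketch of infer_dtype_from_object defined above: numpy scalar types pass straight through, while bare string aliases such as "datetime" get the "64" suffix appended before the numpy lookup:
>>> infer_dtype_from_object(np.float64)
<class 'numpy.float64'>
>>> infer_dtype_from_object("datetime")
<class 'numpy.datetime64'>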
+ + Parameters + ---------- + dtype : dtype, type + The dtype to check. + + Raises + ------ + TypeError : The dtype could not be casted to a date-like dtype. + ValueError : The dtype is an illegal date-like dtype (e.g. the + the frequency provided is too specific) + """ + + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError(e) + if typ != "generic" and typ != "ns": + raise ValueError( + f"{repr(dtype.name)} is too specific of a frequency, " + f"try passing {repr(dtype.type.__name__)}" + ) + + +def pandas_dtype(dtype): + """ + Convert input into a pandas only dtype object or a numpy dtype object. + + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + + Raises + ------ + TypeError if not a dtype + """ + # short-circuit + if isinstance(dtype, np.ndarray): + return dtype.dtype + elif isinstance(dtype, (np.dtype, ExtensionDtype)): + return dtype + + # registered extension types + result = registry.find(dtype) + if result is not None: + return result + + # try a numpy dtype + # raise a consistent TypeError if failed + try: + npdtype = np.dtype(dtype) + except SyntaxError: + # np.dtype uses `eval` which can raise SyntaxError + raise TypeError(f"data type '{dtype}' not understood") + + # Any invalid dtype (such as pd.Timestamp) should raise an error. + # np.dtype(invalid_type).kind = 0 for such objects. However, this will + # also catch some valid dtypes such as object, np.object_ and 'object' + # which we safeguard against by catching them earlier and returning + # np.dtype(valid_dtype) before this condition is evaluated. + if is_hashable(dtype) and dtype in [object, np.object_, "object", "O"]: + # check hashability to avoid errors/DeprecationWarning when we get + # here and `dtype` is an array + return npdtype + elif npdtype.kind == "O": + raise TypeError(f"dtype '{dtype}' not understood") + + return npdtype diff --git a/venv/Lib/site-packages/pandas/core/dtypes/concat.py b/venv/Lib/site-packages/pandas/core/dtypes/concat.py new file mode 100644 index 0000000..cd4b5af --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/concat.py @@ -0,0 +1,481 @@ +""" +Utility functions related to concat +""" + +import numpy as np + +from pandas._libs import tslib, tslibs + +from pandas.core.dtypes.common import ( + _NS_DTYPE, + _TD_DTYPE, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_object_dtype, + is_sparse, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, + ABCDatetimeArray, + ABCIndexClass, + ABCRangeIndex, + ABCSeries, +) + + +def get_dtype_kinds(l): + """ + Parameters + ---------- + l : list of arrays + + Returns + ------- + a set of kinds that exist in this list of arrays + """ + + typs = set() + for arr in l: + + dtype = arr.dtype + if is_categorical_dtype(dtype): + typ = "category" + elif is_sparse(arr): + typ = "sparse" + elif isinstance(arr, ABCRangeIndex): + typ = "range" + elif is_datetime64tz_dtype(arr): + # if to_concat contains different tz, + # the result must be object dtype + typ = str(arr.dtype) + elif is_datetime64_dtype(dtype): + typ = "datetime" + elif is_timedelta64_dtype(dtype): + typ = "timedelta" + elif is_object_dtype(dtype): + typ = "object" + elif is_bool_dtype(dtype): + typ = "bool" + elif is_extension_array_dtype(dtype): + typ = str(arr.dtype) + else: + typ = dtype.kind + typs.add(typ) + return typs + + +def concat_compat(to_concat, axis: int = 
0): + """ + provide concatenation of an array of arrays each of which is a single + 'normalized' dtypes (in that for example, if it's object, then it is a + non-datetimelike and provide a combined dtype for the resulting array that + preserves the overall dtype if possible) + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide concatenation + + Returns + ------- + a single array, preserving the combined dtypes + """ + + # filter empty arrays + # 1-d dtypes always are included here + def is_nonempty(x) -> bool: + if x.ndim <= axis: + return True + return x.shape[axis] > 0 + + # If all arrays are empty, there's nothing to convert, just short-cut to + # the concatenation, #3121. + # + # Creating an empty array directly is tempting, but the winnings would be + # marginal given that it would still require shape & dtype calculation and + # np.concatenate which has them both implemented is compiled. + + typs = get_dtype_kinds(to_concat) + _contains_datetime = any(typ.startswith("datetime") for typ in typs) + _contains_period = any(typ.startswith("period") for typ in typs) + + if "category" in typs: + # this must be prior to concat_datetime, + # to support Categorical + datetime-like + return concat_categorical(to_concat, axis=axis) + + elif _contains_datetime or "timedelta" in typs or _contains_period: + return concat_datetime(to_concat, axis=axis, typs=typs) + + # these are mandated to handle empties as well + elif "sparse" in typs: + return _concat_sparse(to_concat, axis=axis, typs=typs) + + all_empty = all(not is_nonempty(x) for x in to_concat) + if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1: + to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] + + if all_empty: + # we have all empties, but may need to coerce the result dtype to + # object if we have non-numeric type operands (numpy would otherwise + # cast this to float) + typs = get_dtype_kinds(to_concat) + if len(typs) != 1: + + if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): + # let numpy coerce + pass + else: + # coerce to object + to_concat = [x.astype("object") for x in to_concat] + + return np.concatenate(to_concat, axis=axis) + + +def concat_categorical(to_concat, axis: int = 0): + """Concatenate an object/categorical array of arrays, each of which is a + single dtype + + Parameters + ---------- + to_concat : array of arrays + axis : int + Axis to provide concatenation in the current implementation this is + always 0, e.g. 
we only have 1D categoricals + + Returns + ------- + Categorical + A single array, preserving the combined dtypes + """ + + # we could have object blocks and categoricals here + # if we only have a single categoricals then combine everything + # else its a non-compat categorical + categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] + + # validate the categories + if len(categoricals) != len(to_concat): + pass + else: + # when all categories are identical + first = to_concat[0] + if all(first.is_dtype_equal(other) for other in to_concat[1:]): + return union_categoricals(categoricals) + + # extract the categoricals & coerce to object if needed + to_concat = [ + x._internal_get_values() + if is_categorical_dtype(x.dtype) + else np.asarray(x).ravel() + if not is_datetime64tz_dtype(x) + else np.asarray(x.astype(object)) + for x in to_concat + ] + result = concat_compat(to_concat) + if axis == 1: + result = result.reshape(1, len(result)) + return result + + +def union_categoricals( + to_union, sort_categories: bool = False, ignore_order: bool = False +): + """ + Combine list-like of Categorical-like, unioning categories. + + All categories must have the same dtype. + + Parameters + ---------- + to_union : list-like + Categorical, CategoricalIndex, or Series with dtype='category'. + sort_categories : bool, default False + If true, resulting categories will be lexsorted, otherwise + they will be ordered as they appear in the data. + ignore_order : bool, default False + If true, the ordered attribute of the Categoricals will be ignored. + Results in an unordered categorical. + + Returns + ------- + Categorical + + Raises + ------ + TypeError + - all inputs do not have the same dtype + - all inputs do not have the same ordered property + - all inputs are ordered and their categories are not identical + - sort_categories=True and Categoricals are ordered + ValueError + Empty list of categoricals passed + + Notes + ----- + + To learn more about categories, see `link + `__ + + Examples + -------- + + >>> from pandas.api.types import union_categoricals + + If you want to combine categoricals that do not necessarily have + the same categories, `union_categoricals` will combine a list-like + of categoricals. The new categories will be the union of the + categories being combined. + + >>> a = pd.Categorical(["b", "c"]) + >>> b = pd.Categorical(["a", "b"]) + >>> union_categoricals([a, b]) + [b, c, a, b] + Categories (3, object): [b, c, a] + + By default, the resulting categories will be ordered as they appear + in the `categories` of the data. If you want the categories to be + lexsorted, use `sort_categories=True` argument. + + >>> union_categoricals([a, b], sort_categories=True) + [b, c, a, b] + Categories (3, object): [a, b, c] + + `union_categoricals` also works with the case of combining two + categoricals of the same categories and order information (e.g. what + you could also `append` for). + + >>> a = pd.Categorical(["a", "b"], ordered=True) + >>> b = pd.Categorical(["a", "b", "a"], ordered=True) + >>> union_categoricals([a, b]) + [a, b, a, b, a] + Categories (2, object): [a < b] + + Raises `TypeError` because the categories are ordered and not identical. 
+ + >>> a = pd.Categorical(["a", "b"], ordered=True) + >>> b = pd.Categorical(["a", "b", "c"], ordered=True) + >>> union_categoricals([a, b]) + TypeError: to union ordered Categoricals, all categories must be the same + + New in version 0.20.0 + + Ordered categoricals with different categories or orderings can be + combined by using the `ignore_ordered=True` argument. + + >>> a = pd.Categorical(["a", "b", "c"], ordered=True) + >>> b = pd.Categorical(["c", "b", "a"], ordered=True) + >>> union_categoricals([a, b], ignore_order=True) + [a, b, c, c, b, a] + Categories (3, object): [a, b, c] + + `union_categoricals` also works with a `CategoricalIndex`, or `Series` + containing categorical data, but note that the resulting array will + always be a plain `Categorical` + + >>> a = pd.Series(["b", "c"], dtype='category') + >>> b = pd.Series(["a", "b"], dtype='category') + >>> union_categoricals([a, b]) + [b, c, a, b] + Categories (3, object): [b, c, a] + """ + from pandas import Index, Categorical + from pandas.core.arrays.categorical import _recode_for_categories + + if len(to_union) == 0: + raise ValueError("No Categoricals to union") + + def _maybe_unwrap(x): + if isinstance(x, (ABCCategoricalIndex, ABCSeries)): + return x.values + elif isinstance(x, Categorical): + return x + else: + raise TypeError("all components to combine must be Categorical") + + to_union = [_maybe_unwrap(x) for x in to_union] + first = to_union[0] + + if not all( + is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:] + ): + raise TypeError("dtype of categories must be the same") + + ordered = False + if all(first.is_dtype_equal(other) for other in to_union[1:]): + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + + if all(first.categories.equals(other.categories) for other in to_union[1:]): + new_codes = np.concatenate([c.codes for c in to_union]) + else: + codes = [first.codes] + [ + _recode_for_categories(other.codes, other.categories, first.categories) + for other in to_union[1:] + ] + new_codes = np.concatenate(codes) + + if sort_categories and not ignore_order and ordered: + raise TypeError("Cannot use sort_categories=True with ordered Categoricals") + + if sort_categories and not categories.is_monotonic_increasing: + categories = categories.sort_values() + indexer = categories.get_indexer(first.categories) + + from pandas.core.algorithms import take_1d + + new_codes = take_1d(indexer, new_codes, fill_value=-1) + elif ignore_order or all(not c.ordered for c in to_union): + # different categories - union and recode + cats = first.categories.append([c.categories for c in to_union[1:]]) + categories = Index(cats.unique()) + if sort_categories: + categories = categories.sort_values() + + new_codes = [ + _recode_for_categories(c.codes, c.categories, categories) for c in to_union + ] + new_codes = np.concatenate(new_codes) + else: + # ordered - to show a proper error message + if all(c.ordered for c in to_union): + msg = "to union ordered Categoricals, all categories must be the same" + raise TypeError(msg) + else: + raise TypeError("Categorical.ordered must be the same") + + if ignore_order: + ordered = False + + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) + + +def _concatenate_2d(to_concat, axis: int): + # coerce to 2d if needed & concatenate + if axis == 1: + to_concat = [np.atleast_2d(x) for x in to_concat] + return np.concatenate(to_concat, axis=axis) + + +def concat_datetime(to_concat, axis=0, 
typs=None): + """ + provide concatenation of an datetimelike array of arrays each of which is a + single M8[ns], datetimet64[ns, tz] or m8[ns] dtype + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide concatenation + typs : set of to_concat dtypes + + Returns + ------- + a single array, preserving the combined dtypes + """ + + if typs is None: + typs = get_dtype_kinds(to_concat) + + # multiple types, need to coerce to object + if len(typs) != 1: + return _concatenate_2d( + [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis + ) + + # must be single dtype + if any(typ.startswith("datetime") for typ in typs): + + if "datetime" in typs: + to_concat = [x.astype(np.int64, copy=False) for x in to_concat] + return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE) + else: + # when to_concat has different tz, len(typs) > 1. + # thus no need to care + return _concat_datetimetz(to_concat) + + elif "timedelta" in typs: + return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view( + _TD_DTYPE + ) + + elif any(typ.startswith("period") for typ in typs): + assert len(typs) == 1 + cls = to_concat[0] + new_values = cls._concat_same_type(to_concat) + return new_values + + +def _convert_datetimelike_to_object(x): + # coerce datetimelike array to object dtype + + # if dtype is of datetimetz or timezone + if x.dtype.kind == _NS_DTYPE.kind: + if getattr(x, "tz", None) is not None: + x = np.asarray(x.astype(object)) + else: + shape = x.shape + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp") + x = x.reshape(shape) + + elif x.dtype == _TD_DTYPE: + shape = x.shape + x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) + x = x.reshape(shape) + + return x + + +def _concat_datetimetz(to_concat, name=None): + """ + concat DatetimeIndex with the same tz + all inputs must be DatetimeIndex + it is used in DatetimeIndex.append also + """ + # Right now, internals will pass a List[DatetimeArray] here + # for reductions like quantile. I would like to disentangle + # all this before we get here. + sample = to_concat[0] + + if isinstance(sample, ABCIndexClass): + return sample._concat_same_dtype(to_concat, name=name) + elif isinstance(sample, ABCDatetimeArray): + return sample._concat_same_type(to_concat) + + +def _concat_sparse(to_concat, axis=0, typs=None): + """ + provide concatenation of an sparse/dense array of arrays each of which is a + single dtype + + Parameters + ---------- + to_concat : array of arrays + axis : axis to provide concatenation + typs : set of to_concat dtypes + + Returns + ------- + a single array, preserving the combined dtypes + """ + + from pandas.core.arrays import SparseArray + + fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] + fill_value = fill_values[0] + + # TODO: Fix join unit generation so we aren't passed this. 
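A hedged sketch of the kind-tagging and object coercion implemented by get_dtype_kinds and concat_compat above (sorted() is used here only to make the set output deterministic), assuming numpy is imported as np:
>>> sorted(get_dtype_kinds([np.array([1, 2]), np.array(["a"], dtype=object)]))
['i', 'object']
>>> concat_compat([np.array([1, 2]), np.array(["a"], dtype=object)])
array([1, 2, 'a'], dtype=object)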
+ to_concat = [ + x + if isinstance(x, SparseArray) + else SparseArray(x.squeeze(), fill_value=fill_value) + for x in to_concat + ] + + return SparseArray._concat_same_type(to_concat) diff --git a/venv/Lib/site-packages/pandas/core/dtypes/dtypes.py b/venv/Lib/site-packages/pandas/core/dtypes/dtypes.py new file mode 100644 index 0000000..466ed81 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/dtypes.py @@ -0,0 +1,1161 @@ +""" define extension dtypes """ +import re +from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast + +import numpy as np +import pytz + +from pandas._libs.interval import Interval +from pandas._libs.tslibs import NaT, Period, Timestamp, timezones +from pandas._typing import Ordered + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass +from pandas.core.dtypes.inference import is_bool, is_list_like + +str_type = str + + +def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: + """ + Register an ExtensionType with pandas as class decorator. + + .. versionadded:: 0.24.0 + + This enables operations like ``.astype(name)`` for the name + of the ExtensionDtype. + + Returns + ------- + callable + A class decorator. + + Examples + -------- + >>> from pandas.api.extensions import register_extension_dtype + >>> from pandas.api.extensions import ExtensionDtype + >>> @register_extension_dtype + ... class MyExtensionDtype(ExtensionDtype): + ... pass + """ + registry.register(cls) + return cls + + +class Registry: + """ + Registry for dtype inference. + + The registry allows one to map a string repr of a extension + dtype to an extension dtype. The string alias can be used in several + places, including + + * Series and Index constructors + * :meth:`pandas.array` + * :meth:`pandas.Series.astype` + + Multiple extension types can be registered. + These are tried in order. + """ + + def __init__(self): + self.dtypes: List[Type[ExtensionDtype]] = [] + + def register(self, dtype: Type[ExtensionDtype]) -> None: + """ + Parameters + ---------- + dtype : ExtensionDtype + """ + if not issubclass(dtype, ExtensionDtype): + raise ValueError("can only register pandas extension dtypes") + + self.dtypes.append(dtype) + + def find( + self, dtype: Union[Type[ExtensionDtype], str] + ) -> Optional[Type[ExtensionDtype]]: + """ + Parameters + ---------- + dtype : Type[ExtensionDtype] or str + + Returns + ------- + return the first matching dtype, otherwise return None + """ + if not isinstance(dtype, str): + dtype_type = dtype + if not isinstance(dtype, type): + dtype_type = type(dtype) + if issubclass(dtype_type, ExtensionDtype): + return dtype + + return None + + for dtype_type in self.dtypes: + try: + return dtype_type.construct_from_string(dtype) + except TypeError: + pass + + return None + + +registry = Registry() + + +class PandasExtensionDtype(ExtensionDtype): + """ + A np.dtype duck-typed class, suitable for holding a custom dtype. + + THIS IS NOT A REAL NUMPY DTYPE + """ + + type: Any + kind: Any + # The Any type annotations above are here only because mypy seems to have a + # problem dealing with with multiple inheritance from PandasExtensionDtype + # and ExtensionDtype's @properties in the subclasses below. The kind and + # type variables in those subclasses are explicitly typed below. + subdtype = None + str: Optional[str_type] = None + num = 100 + shape: Tuple[int, ...] 
= tuple() + itemsize = 8 + base = None + isbuiltin = 0 + isnative = 0 + _cache: Dict[str_type, "PandasExtensionDtype"] = {} + + def __str__(self) -> str_type: + """ + Return a string representation for a particular Object + """ + return self.name + + def __repr__(self) -> str_type: + """ + Return a string representation for a particular object. + """ + return str(self) + + def __hash__(self) -> int: + raise NotImplementedError("sub-classes should implement an __hash__ method") + + def __getstate__(self) -> Dict[str_type, Any]: + # pickle support; we don't want to pickle the cache + return {k: getattr(self, k, None) for k in self._metadata} + + @classmethod + def reset_cache(cls) -> None: + """ clear the cache """ + cls._cache = {} + + +class CategoricalDtypeType(type): + """ + the type of CategoricalDtype, this metaclass determines subclass ability + """ + + pass + + +@register_extension_dtype +class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): + """ + Type for categorical data with the categories and orderedness. + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool or None, default False + Whether or not this categorical is treated as a ordered categorical. + None can be used to maintain the ordered value of existing categoricals when + used in operations that combine categoricals, e.g. astype, and will resolve to + False if there is no existing ordered to maintain. + + Attributes + ---------- + categories + ordered + + Methods + ------- + None + + See Also + -------- + Categorical + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + """ + + # TODO: Document public vs. private API + name = "category" + type: Type[CategoricalDtypeType] = CategoricalDtypeType + kind: str_type = "O" + str = "|O08" + base = np.dtype("O") + _metadata = ("categories", "ordered") + _cache: Dict[str_type, PandasExtensionDtype] = {} + + def __init__(self, categories=None, ordered: Ordered = False): + self._finalize(categories, ordered, fastpath=False) + + @classmethod + def _from_fastpath( + cls, categories=None, ordered: Optional[bool] = None + ) -> "CategoricalDtype": + self = cls.__new__(cls) + self._finalize(categories, ordered, fastpath=True) + return self + + @classmethod + def _from_categorical_dtype( + cls, dtype: "CategoricalDtype", categories=None, ordered: Ordered = None + ) -> "CategoricalDtype": + if categories is ordered is None: + return dtype + if categories is None: + categories = dtype.categories + if ordered is None: + ordered = dtype.ordered + return cls(categories, ordered) + + @classmethod + def _from_values_or_dtype( + cls, + values=None, + categories=None, + ordered: Optional[bool] = None, + dtype: Optional["CategoricalDtype"] = None, + ) -> "CategoricalDtype": + """ + Construct dtype from the input parameters used in :class:`Categorical`. + + This constructor method specifically does not do the factorization + step, if that is needed to find the categories. This constructor may + therefore return ``CategoricalDtype(categories=None, ordered=None)``, + which may not be useful. 
Additional steps may therefore have to be + taken to create the final dtype. + + The return dtype is specified from the inputs in this prioritized + order: + 1. if dtype is a CategoricalDtype, return dtype + 2. if dtype is the string 'category', create a CategoricalDtype from + the supplied categories and ordered parameters, and return that. + 3. if values is a categorical, use value.dtype, but override it with + categories and ordered if either/both of those are not None. + 4. if dtype is None and values is not a categorical, construct the + dtype from categories and ordered, even if either of those is None. + + Parameters + ---------- + values : list-like, optional + The list-like must be 1-dimensional. + categories : list-like, optional + Categories for the CategoricalDtype. + ordered : bool, optional + Designating if the categories are ordered. + dtype : CategoricalDtype or the string "category", optional + If ``CategoricalDtype``, cannot be used together with + `categories` or `ordered`. + + Returns + ------- + CategoricalDtype + + Examples + -------- + >>> CategoricalDtype._from_values_or_dtype() + CategoricalDtype(categories=None, ordered=None) + >>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'], + ... ordered=True) + CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False) + >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True) + >>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True, + ... dtype=dtype2) + ValueError: Cannot specify `categories` or `ordered` together with + `dtype`. + + The supplied dtype takes precedence over values' dtype: + + >>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) + CategoricalDtype(['x', 'y'], ordered=False) + """ + from pandas.core.dtypes.common import is_categorical + + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, str): + if dtype == "category": + dtype = CategoricalDtype(categories, ordered) + else: + raise ValueError(f"Unknown dtype {repr(dtype)}") + elif categories is not None or ordered is not None: + raise ValueError( + "Cannot specify `categories` or `ordered` together with `dtype`." + ) + elif is_categorical(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype( + values.dtype, categories, ordered + ) + else: + # If dtype=None and values is not categorical, create a new dtype. + # Note: This could potentially have categories=None and + # ordered=None. + dtype = CategoricalDtype(categories, ordered) + + return dtype + + @classmethod + def construct_from_string(cls, string: str_type) -> "CategoricalDtype": + """ + Construct a CategoricalDtype from a string. + + Parameters + ---------- + string : str + Must be the string "category" in order to be successfully constructed. + + Returns + ------- + CategoricalDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a CategoricalDtype cannot be constructed from the input. 
+ """ + if not isinstance(string, str): + raise TypeError(f"Expects a string, got {type(string)}") + if string != cls.name: + raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'") + + # need ordered=None to ensure that operations specifying dtype="category" don't + # override the ordered value for existing categoricals + return cls(ordered=None) + + def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: + + if ordered is not None: + self.validate_ordered(ordered) + + if categories is not None: + categories = self.validate_categories(categories, fastpath=fastpath) + + self._categories = categories + self._ordered = ordered + + def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: + # for pickle compat. __get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._categories = state.pop("categories", None) + self._ordered = state.pop("ordered", False) + + def __hash__(self) -> int: + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) + + def __eq__(self, other: Any) -> bool: + """ + Rules for CDT equality: + 1) Any CDT is equal to the string 'category' + 2) Any CDT is equal to itself + 3) Any CDT is equal to a CDT with categories=None regardless of ordered + 4) A CDT with ordered=True is only equal to another CDT with + ordered=True and identical categories in the same order + 5) A CDT with ordered={False, None} is only equal to another CDT with + ordered={False, None} and identical categories, but same order is + not required. There is no distinction between False/None. + 6) Any other comparison returns False + """ + if isinstance(other, str): + return other == self.name + elif other is self: + return True + elif not (hasattr(other, "ordered") and hasattr(other, "categories")): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). + return True + elif self.ordered or other.ordered: + # At least one has ordered=True; equal if both have ordered=True + # and the same values for categories in the same order. + return (self.ordered == other.ordered) and self.categories.equals( + other.categories + ) + else: + # Neither has ordered=True; equal if both have the same categories, + # but same order is not necessary. There is no distinction between + # ordered=False and ordered=None: CDT(., False) and CDT(., None) + # will be equal if they have the same categories. 
+ if ( + self.categories.dtype == other.categories.dtype + and self.categories.equals(other.categories) + ): + # Check and see if they happen to be identical categories + return True + return hash(self) == hash(other) + + def __repr__(self) -> str_type: + tpl = "CategoricalDtype(categories={data}ordered={ordered})" + if self.categories is None: + data = "None, " + else: + data = self.categories._format_data(name=type(self).__name__) + return tpl.format(data=data, ordered=self.ordered) + + @staticmethod + def _hash_categories(categories, ordered: Ordered = True) -> int: + from pandas.core.util.hashing import ( + hash_array, + _combine_hash_arrays, + hash_tuples, + ) + from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == "O": + if len({type(x) for x in categories}) != 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + hashed = hash((tuple(categories), ordered)) + return hashed + + if is_datetime64tz_dtype(categories.dtype): + # Avoid future warning. + categories = categories.astype(_NS_DTYPE) + + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack( + [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] + ) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + return np.bitwise_xor.reduce(hashed) + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas import Categorical + + return Categorical + + @staticmethod + def validate_ordered(ordered: Ordered) -> None: + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. 
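A doctest-style sketch of the equality rules listed above: any CategoricalDtype equals the string 'category', unordered dtypes compare by category set regardless of order, and ordered dtypes also require the same category order:
>>> CategoricalDtype(["a", "b"]) == "category"
True
>>> CategoricalDtype(["a", "b"]) == CategoricalDtype(["b", "a"])
True
>>> CategoricalDtype(["a", "b"], ordered=True) == CategoricalDtype(["b", "a"], ordered=True)
False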
+ """ + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def validate_categories(categories, fastpath: bool = False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas.core.indexes.base import Index + + if not fastpath and not is_list_like(categories): + raise TypeError( + f"Parameter 'categories' must be list-like, was {repr(categories)}" + ) + elif not isinstance(categories, ABCIndexClass): + categories = Index(categories, tupleize_cols=False) + + if not fastpath: + + if categories.hasnans: + raise ValueError("Categorial categories cannot be null") + + if not categories.is_unique: + raise ValueError("Categorical categories must be unique") + + if isinstance(categories, ABCCategoricalIndex): + categories = categories.categories + + return categories + + def update_dtype( + self, dtype: Union[str_type, "CategoricalDtype"] + ) -> "CategoricalDtype": + """ + Returns a CategoricalDtype with categories and ordered taken from dtype + if specified, otherwise falling back to self if unspecified + + Parameters + ---------- + dtype : CategoricalDtype + + Returns + ------- + new_dtype : CategoricalDtype + """ + if isinstance(dtype, str) and dtype == "category": + # dtype='category' should not change anything + return self + elif not self.is_dtype(dtype): + raise ValueError( + f"a CategoricalDtype must be passed to perform an update, " + f"got {repr(dtype)}" + ) + else: + # from here on, dtype is a CategoricalDtype + dtype = cast(CategoricalDtype, dtype) + + # update categories/ordered unless they've been explicitly passed as None + new_categories = ( + dtype.categories if dtype.categories is not None else self.categories + ) + new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered + + return CategoricalDtype(new_categories, new_ordered) + + @property + def categories(self): + """ + An ``Index`` containing the unique categories allowed. + """ + return self._categories + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + """ + return self._ordered + + @property + def _is_boolean(self) -> bool: + from pandas.core.dtypes.common import is_bool_dtype + + return is_bool_dtype(self.categories) + + +@register_extension_dtype +class DatetimeTZDtype(PandasExtensionDtype): + """ + An ExtensionDtype for timezone-aware datetime data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + unit : str, default "ns" + The precision of the datetime data. Currently limited + to ``"ns"``. + tz : str, int, or datetime.tzinfo + The timezone. + + Attributes + ---------- + unit + tz + + Methods + ------- + None + + Raises + ------ + pytz.UnknownTimeZoneError + When the requested timezone cannot be found. 
+ + Examples + -------- + >>> pd.DatetimeTZDtype(tz='UTC') + datetime64[ns, UTC] + + >>> pd.DatetimeTZDtype(tz='dateutil/US/Central') + datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] + """ + + type: Type[Timestamp] = Timestamp + kind: str_type = "M" + str = "|M8[ns]" + num = 101 + base = np.dtype("M8[ns]") + na_value = NaT + _metadata = ("unit", "tz") + _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") + _cache: Dict[str_type, PandasExtensionDtype] = {} + + def __init__(self, unit="ns", tz=None): + if isinstance(unit, DatetimeTZDtype): + unit, tz = unit.unit, unit.tz + + if unit != "ns": + if isinstance(unit, str) and tz is None: + # maybe a string like datetime64[ns, tz], which we support for + # now. + result = type(self).construct_from_string(unit) + unit = result.unit + tz = result.tz + msg = ( + f"Passing a dtype alias like 'datetime64[ns, {tz}]' " + "to DatetimeTZDtype is no longer supported. Use " + "'DatetimeTZDtype.construct_from_string()' instead." + ) + raise ValueError(msg) + else: + raise ValueError("DatetimeTZDtype only supports ns units") + + if tz: + tz = timezones.maybe_get_tz(tz) + tz = timezones.tz_standardize(tz) + elif tz is not None: + raise pytz.UnknownTimeZoneError(tz) + if tz is None: + raise TypeError("A 'tz' is required.") + + self._unit = unit + self._tz = tz + + @property + def unit(self): + """ + The precision of the datetime data. + """ + return self._unit + + @property + def tz(self): + """ + The timezone. + """ + return self._tz + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import DatetimeArray + + return DatetimeArray + + @classmethod + def construct_from_string(cls, string: str_type): + """ + Construct a DatetimeTZDtype from a string. + + Parameters + ---------- + string : str + The string alias for this DatetimeTZDtype. + Should be formatted like ``datetime64[ns, ]``, + where ```` is the timezone name. + + Examples + -------- + >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]') + datetime64[ns, UTC] + """ + if isinstance(string, str): + msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'" + match = cls._match.match(string) + if match: + d = match.groupdict() + try: + return cls(unit=d["unit"], tz=d["tz"]) + except (KeyError, TypeError, ValueError) as err: + # KeyError if maybe_get_tz tries and fails to get a + # pytz timezone (actually pytz.UnknownTimeZoneError). + # TypeError if we pass a nonsense tz; + # ValueError if we pass a unit other than "ns" + raise TypeError(msg) from err + raise TypeError(msg) + + raise TypeError("Cannot construct a 'DatetimeTZDtype'") + + def __str__(self) -> str_type: + return f"datetime64[{self.unit}, {self.tz}]" + + @property + def name(self) -> str_type: + """A string representation of the dtype.""" + return str(self) + + def __hash__(self) -> int: + # make myself hashable + # TODO: update this. + return hash(str(self)) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str): + return other == self.name + + return ( + isinstance(other, DatetimeTZDtype) + and self.unit == other.unit + and str(self.tz) == str(other.tz) + ) + + def __setstate__(self, state): + # for pickle compat. 
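# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# DatetimeTZDtype as defined above: only "ns" precision is accepted, the tz is normalized,
# and the "datetime64[ns, <tz>]" string alias round-trips through construct_from_string.
from pandas import DatetimeTZDtype

dtype = DatetimeTZDtype(tz="UTC")
assert dtype.unit == "ns"
assert str(dtype) == "datetime64[ns, UTC]"
assert dtype == "datetime64[ns, UTC]"                        # __eq__ accepts the string alias
assert DatetimeTZDtype.construct_from_string(str(dtype)) == dtype

try:
    DatetimeTZDtype(unit="us", tz="UTC")
except ValueError as err:
    print(err)  # "DatetimeTZDtype only supports ns units"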
__get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._tz = state["tz"] + self._unit = state["unit"] + + +@register_extension_dtype +class PeriodDtype(PandasExtensionDtype): + """ + An ExtensionDtype for Period data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + freq : str or DateOffset + The frequency of this PeriodDtype. + + Attributes + ---------- + freq + + Methods + ------- + None + + Examples + -------- + >>> pd.PeriodDtype(freq='D') + period[D] + + >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd()) + period[M] + """ + + type: Type[Period] = Period + kind: str_type = "O" + str = "|O08" + base = np.dtype("O") + num = 102 + _metadata = ("freq",) + _match = re.compile(r"(P|p)eriod\[(?P.+)\]") + _cache: Dict[str_type, PandasExtensionDtype] = {} + + def __new__(cls, freq=None): + """ + Parameters + ---------- + freq : frequency + """ + + if isinstance(freq, PeriodDtype): + return freq + + elif freq is None: + # empty constructor for pickle compat + u = object.__new__(cls) + u._freq = None + return u + + if not isinstance(freq, ABCDateOffset): + freq = cls._parse_dtype_strict(freq) + + try: + return cls._cache[freq.freqstr] + except KeyError: + u = object.__new__(cls) + u._freq = freq + cls._cache[freq.freqstr] = u + return u + + @property + def freq(self): + """ + The frequency object of this PeriodDtype. + """ + return self._freq + + @classmethod + def _parse_dtype_strict(cls, freq): + if isinstance(freq, str): + if freq.startswith("period[") or freq.startswith("Period["): + m = cls._match.search(freq) + if m is not None: + freq = m.group("freq") + from pandas.tseries.frequencies import to_offset + + freq = to_offset(freq) + if freq is not None: + return freq + + raise ValueError("could not construct PeriodDtype") + + @classmethod + def construct_from_string(cls, string): + """ + Strict construction from a string, raise a TypeError if not + possible + """ + if ( + isinstance(string, str) + and (string.startswith("period[") or string.startswith("Period[")) + or isinstance(string, ABCDateOffset) + ): + # do not parse string like U as period[U] + # avoid tuple to be regarded as freq + try: + return cls(freq=string) + except ValueError: + pass + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) + + def __str__(self) -> str_type: + return self.name + + @property + def name(self) -> str_type: + return f"period[{self.freq.freqstr}]" + + @property + def na_value(self): + return NaT + + def __hash__(self) -> int: + # make myself hashable + return hash(str(self)) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str): + return other == self.name or other == self.name.title() + + return isinstance(other, PeriodDtype) and self.freq == other.freq + + def __setstate__(self, state): + # for pickle compat. 
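# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# PeriodDtype as defined above: instances are cached per frequency string, the freq may be
# given as an offset or parsed from "period[...]", and equality also accepts the alias.
import pandas as pd
from pandas import PeriodDtype

daily = PeriodDtype(freq="D")
assert daily is PeriodDtype("period[D]")                     # same cached instance per freqstr
assert daily.freq == pd.offsets.Day()
assert daily == "period[D]" and daily == "Period[D]"         # title-cased alias is accepted too
assert str(daily) == "period[D]"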
__get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._freq = state["freq"] + + @classmethod + def is_dtype(cls, dtype) -> bool: + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + + if isinstance(dtype, str): + # PeriodDtype can be instantiated from freq string like "U", + # but doesn't regard freq str like "U" as dtype. + if dtype.startswith("period[") or dtype.startswith("Period["): + try: + if cls._parse_dtype_strict(dtype) is not None: + return True + else: + return False + except ValueError: + return False + else: + return False + return super().is_dtype(dtype) + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import PeriodArray + + return PeriodArray + + def __from_arrow__(self, array): + """Construct PeriodArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import PeriodArray + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64") + parr = PeriodArray(data.copy(), freq=self.freq, copy=False) + parr[~mask] = NaT + results.append(parr) + + return PeriodArray._concat_same_type(results) + + +@register_extension_dtype +class IntervalDtype(PandasExtensionDtype): + """ + An ExtensionDtype for Interval data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + subtype : str, np.dtype + The dtype of the Interval bounds. + + Attributes + ---------- + subtype + + Methods + ------- + None + + Examples + -------- + >>> pd.IntervalDtype(subtype='int64') + interval[int64] + """ + + name = "interval" + kind: str_type = "O" + str = "|O08" + base = np.dtype("O") + num = 103 + _metadata = ("subtype",) + _match = re.compile(r"(I|i)nterval\[(?P.+)\]") + _cache: Dict[str_type, PandasExtensionDtype] = {} + + def __new__(cls, subtype=None): + from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_string_dtype, + pandas_dtype, + ) + + if isinstance(subtype, IntervalDtype): + return subtype + elif subtype is None: + # we are called as an empty constructor + # generally for pickle compat + u = object.__new__(cls) + u._subtype = None + return u + elif isinstance(subtype, str) and subtype.lower() == "interval": + subtype = None + else: + if isinstance(subtype, str): + m = cls._match.search(subtype) + if m is not None: + subtype = m.group("subtype") + + try: + subtype = pandas_dtype(subtype) + except TypeError: + raise TypeError("could not construct IntervalDtype") + + if is_categorical_dtype(subtype) or is_string_dtype(subtype): + # GH 19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalDtype" + ) + raise TypeError(msg) + + try: + return cls._cache[str(subtype)] + except KeyError: + u = object.__new__(cls) + u._subtype = subtype + cls._cache[str(subtype)] = u + return u + + @property + def subtype(self): + """ + The dtype of the Interval bounds. + """ + return self._subtype + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + from pandas.core.arrays import IntervalArray + + return IntervalArray + + @classmethod + def construct_from_string(cls, string): + """ + attempt to construct this type from a string, raise a TypeError + if its not possible + """ + if not isinstance(string, str): + raise TypeError(f"a string needs to be passed, got type {type(string)}") + + if string.lower() == "interval" or cls._match.search(string) is not None: + return cls(string) + + msg = ( + f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n" + "Incorrectly formatted string passed to constructor. " + "Valid formats include Interval or Interval[dtype] " + "where dtype is numeric, datetime, or timedelta" + ) + raise TypeError(msg) + + @property + def type(self): + return Interval + + def __str__(self) -> str_type: + if self.subtype is None: + return "interval" + return f"interval[{self.subtype}]" + + def __hash__(self) -> int: + # make myself hashable + return hash(str(self)) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str): + return other.lower() in (self.name.lower(), str(self).lower()) + elif not isinstance(other, IntervalDtype): + return False + elif self.subtype is None or other.subtype is None: + # None should match any subtype + return True + else: + from pandas.core.dtypes.common import is_dtype_equal + + return is_dtype_equal(self.subtype, other.subtype) + + def __setstate__(self, state): + # for pickle compat. __get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._subtype = state["subtype"] + + @classmethod + def is_dtype(cls, dtype) -> bool: + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + + if isinstance(dtype, str): + if dtype.lower().startswith("interval"): + try: + if cls.construct_from_string(dtype) is not None: + return True + else: + return False + except (ValueError, TypeError): + return False + else: + return False + return super().is_dtype(dtype) + + def __from_arrow__(self, array): + """Construct IntervalArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + left = np.asarray(arr.storage.field("left"), dtype=self.subtype) + right = np.asarray(arr.storage.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) + results.append(iarr) + + return IntervalArray._concat_same_type(results) diff --git a/venv/Lib/site-packages/pandas/core/dtypes/generic.py b/venv/Lib/site-packages/pandas/core/dtypes/generic.py new file mode 100644 index 0000000..4c3f8b7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/generic.py @@ -0,0 +1,84 @@ +""" define generic base classes for pandas objects """ + + +# define abstract base classes to enable isinstance type checking on our +# objects +def create_pandas_abc_type(name, attr, comp): + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore + def _check(cls, inst) -> bool: + return getattr(inst, attr, "_typ") in comp + + dct = dict(__instancecheck__=_check, __subclasscheck__=_check) + meta = type("ABCBase", (type,), dct) + return meta(name, tuple(), dct) + + +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", 
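# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# IntervalDtype from dtypes.py above: the subtype is parsed from "interval[...]",
# category/object/str subtypes are rejected, and the bare "interval" alias matches any subtype.
from pandas import IntervalDtype

i64 = IntervalDtype("int64")
assert str(i64) == "interval[int64]"
assert i64 == IntervalDtype("interval[int64]")               # same cached subtype
assert i64 == "interval"                                     # bare alias / subtype=None matches anything
assert IntervalDtype.is_dtype("interval[float64]")

try:
    IntervalDtype("object")
except TypeError as err:
    print(err)  # object/str/category subtypes are not supported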
("index",)) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type( + "ABCDatetimeIndex", "_typ", ("datetimeindex",) +) +ABCTimedeltaIndex = create_pandas_abc_type( + "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) +) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type( + "ABCCategoricalIndex", "_typ", ("categoricalindex",) +) +ABCIntervalIndex = create_pandas_abc_type( + "ABCIntervalIndex", "_typ", ("intervalindex",) +) +ABCIndexClass = create_pandas_abc_type( + "ABCIndexClass", + "_typ", + ( + "index", + "int64index", + "rangeindex", + "float64index", + "uint64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex", + "intervalindex", + ), +) + +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) + +ABCSparseArray = create_pandas_abc_type( + "ABCSparseArray", "_subtyp", ("sparse_array", "sparse_series") +) +ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) +ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) +ABCTimedeltaArray = create_pandas_abc_type( + "ABCTimedeltaArray", "_typ", ("timedeltaarray") +) +ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)) +ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) +ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) +ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",)) +ABCExtensionArray = create_pandas_abc_type( + "ABCExtensionArray", + "_typ", + ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), +) +ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) + + +class _ABCGeneric(type): + def __instancecheck__(cls, inst) -> bool: + return hasattr(inst, "_data") + + +ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) diff --git a/venv/Lib/site-packages/pandas/core/dtypes/inference.py b/venv/Lib/site-packages/pandas/core/dtypes/inference.py new file mode 100644 index 0000000..9e92780 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/inference.py @@ -0,0 +1,424 @@ +""" basic inference routines """ + +from collections import abc +from numbers import Number +import re +from typing import Pattern + +import numpy as np + +from pandas._libs import lib + +is_bool = lib.is_bool + +is_integer = lib.is_integer + +is_float = lib.is_float + +is_complex = lib.is_complex + +is_scalar = lib.is_scalar + +is_decimal = lib.is_decimal + +is_interval = lib.is_interval + +is_list_like = lib.is_list_like + + +def is_number(obj) -> bool: + """ + Check if the object is a number. + + Returns True when the object is a number, and False if is not. + + Parameters + ---------- + obj : any type + The object to check if is a number. + + Returns + ------- + is_number : bool + Whether `obj` is a number or not. + + See Also + -------- + api.types.is_integer: Checks a subgroup of numbers. 
+ + Examples + -------- + >>> pd.api.types.is_number(1) + True + >>> pd.api.types.is_number(7.15) + True + + Booleans are valid because they are int subclass. + + >>> pd.api.types.is_number(False) + True + + >>> pd.api.types.is_number("foo") + False + >>> pd.api.types.is_number("5") + False + """ + + return isinstance(obj, (Number, np.number)) + + +def _iterable_not_string(obj) -> bool: + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> _iterable_not_string([1, 2, 3]) + True + >>> _iterable_not_string("foo") + False + >>> _iterable_not_string(1) + False + """ + + return isinstance(obj, abc.Iterable) and not isinstance(obj, str) + + +def is_iterator(obj) -> bool: + """ + Check if the object is an iterator. + + For example, lists are considered iterators + but not strings or datetime objects. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_iter : bool + Whether `obj` is an iterator. + + Examples + -------- + >>> is_iterator([1, 2, 3]) + True + >>> is_iterator(datetime(2017, 1, 1)) + False + >>> is_iterator("foo") + False + >>> is_iterator(1) + False + """ + + if not hasattr(obj, "__iter__"): + return False + + return hasattr(obj, "__next__") + + +def is_file_like(obj) -> bool: + """ + Check if the object is a file-like object. + + For objects to be considered file-like, they must + be an iterator AND have either a `read` and/or `write` + method as an attribute. + + Note: file-like objects must be iterable, but + iterable objects need not be file-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_file_like : bool + Whether `obj` has file-like properties. + + Examples + -------- + >>> buffer(StringIO("data")) + >>> is_file_like(buffer) + True + >>> is_file_like([1, 2, 3]) + False + """ + + if not (hasattr(obj, "read") or hasattr(obj, "write")): + return False + + if not hasattr(obj, "__iter__"): + return False + + return True + + +def is_re(obj) -> bool: + """ + Check if the object is a regex pattern instance. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_regex : bool + Whether `obj` is a regex pattern. + + Examples + -------- + >>> is_re(re.compile(".*")) + True + >>> is_re("foo") + False + """ + return isinstance(obj, Pattern) + + +def is_re_compilable(obj) -> bool: + """ + Check if the object can be compiled into a regex pattern instance. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_regex_compilable : bool + Whether `obj` can be compiled as a regex pattern. + + Examples + -------- + >>> is_re_compilable(".*") + True + >>> is_re_compilable(1) + False + """ + + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_array_like(obj) -> bool: + """ + Check if the object is array-like. + + For an object to be considered array-like, it must be list-like and + have a `dtype` attribute. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_array_like : bool + Whether `obj` has array-like properties. 
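# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# A few of the scalar / file-like inference helpers above, exercised on ordinary Python objects.
import re
from io import StringIO
from pandas.api.types import is_number, is_re, is_re_compilable, is_file_like

assert is_number(3.14) and is_number(False)        # bool counts: it is an int subclass
assert not is_number("5")                          # numeric-looking strings are not numbers
assert is_file_like(StringIO("data"))              # has read/write and __iter__
assert not is_file_like([1, 2, 3])
assert is_re(re.compile(r"\d+")) and not is_re(r"\d+")
assert is_re_compilable(".*") and not is_re_compilable(1)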
+ + Examples + -------- + >>> is_array_like(np.array([1, 2, 3])) + True + >>> is_array_like(pd.Series(["a", "b"])) + True + >>> is_array_like(pd.Index(["2016-01-01"])) + True + >>> is_array_like([1, 2, 3]) + False + >>> is_array_like(("a", "b")) + False + """ + + return is_list_like(obj) and hasattr(obj, "dtype") + + +def is_nested_list_like(obj) -> bool: + """ + Check if the object is list-like, and that all of its elements + are also list-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_nested_list_like([[1, 2, 3]]) + True + >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}]) + True + >>> is_nested_list_like(["foo"]) + False + >>> is_nested_list_like([]) + False + >>> is_nested_list_like([[1, 2, 3], 1]) + False + + Notes + ----- + This won't reliably detect whether a consumable iterator (e. g. + a generator) is a nested-list-like without consuming the iterator. + To avoid consuming it, we always return False if the outer container + doesn't define `__len__`. + + See Also + -------- + is_list_like + """ + return ( + is_list_like(obj) + and hasattr(obj, "__len__") + and len(obj) > 0 + and all(is_list_like(item) for item in obj) + ) + + +def is_dict_like(obj) -> bool: + """ + Check if the object is dict-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_dict_like : bool + Whether `obj` has dict-like properties. + + Examples + -------- + >>> is_dict_like({1: 2}) + True + >>> is_dict_like([1, 2, 3]) + False + >>> is_dict_like(dict) + False + >>> is_dict_like(dict()) + True + """ + dict_like_attrs = ("__getitem__", "keys", "__contains__") + return ( + all(hasattr(obj, attr) for attr in dict_like_attrs) + # [GH 25196] exclude classes + and not isinstance(obj, type) + ) + + +def is_named_tuple(obj) -> bool: + """ + Check if the object is a named tuple. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_named_tuple : bool + Whether `obj` is a named tuple. + + Examples + -------- + >>> Point = namedtuple("Point", ["x", "y"]) + >>> p = Point(1, 2) + >>> + >>> is_named_tuple(p) + True + >>> is_named_tuple((1, 2)) + False + """ + + return isinstance(obj, tuple) and hasattr(obj, "_fields") + + +def is_hashable(obj) -> bool: + """ + Return True if hash(obj) will succeed, False otherwise. + + Some types will pass a test against collections.abc.Hashable but fail when + they are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. + + Returns + ------- + bool + + Examples + -------- + >>> a = ([],) + >>> isinstance(a, collections.abc.Hashable) + True + >>> is_hashable(a) + False + """ + # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable), + # which can be faster than calling hash. That is because numpy scalars + # fail this test. + + # Reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + try: + hash(obj) + except TypeError: + return False + else: + return True + + +def is_sequence(obj) -> bool: + """ + Check if the object is a sequence of objects. + String types are not included as sequences here. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_sequence : bool + Whether `obj` is a sequence of objects. 
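# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# The container-shape helpers above: dict-like objects, named tuples, hashability
# (hash() is actually attempted) and sequences vs. consumable iterators.
from collections import namedtuple
from pandas.api.types import is_dict_like, is_named_tuple, is_hashable
from pandas.core.dtypes.inference import is_nested_list_like, is_sequence

Point = namedtuple("Point", ["x", "y"])

assert is_dict_like({"a": 1}) and not is_dict_like(dict)        # the class itself is excluded
assert is_named_tuple(Point(1, 2)) and not is_named_tuple((1, 2))
assert is_hashable("x") and not is_hashable(([],))              # tuple holding a list fails hash()
assert is_nested_list_like([[1, 2], [3]]) and not is_nested_list_like([[1, 2], 3])
assert is_sequence([1, 2]) and not is_sequence(iter([1, 2]))    # iterators have no len()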
+ + Examples + -------- + >>> l = [1, 2, 3] + >>> + >>> is_sequence(l) + True + >>> is_sequence(iter(l)) + False + """ + + try: + iter(obj) # Can iterate over it. + len(obj) # Has a length associated with it. + return not isinstance(obj, (str, bytes)) + except (TypeError, AttributeError): + return False diff --git a/venv/Lib/site-packages/pandas/core/dtypes/missing.py b/venv/Lib/site-packages/pandas/core/dtypes/missing.py new file mode 100644 index 0000000..fb579f2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/dtypes/missing.py @@ -0,0 +1,609 @@ +""" +missing types & inference +""" +import numpy as np + +from pandas._config import get_option + +from pandas._libs import lib +import pandas._libs.missing as libmissing +from pandas._libs.tslibs import NaT, iNaT + +from pandas.core.dtypes.common import ( + _NS_DTYPE, + _TD_DTYPE, + ensure_object, + is_bool_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike_v_numeric, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_string_dtype, + is_string_like_dtype, + is_timedelta64_dtype, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.generic import ( + ABCDatetimeArray, + ABCExtensionArray, + ABCGeneric, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCTimedeltaArray, +) +from pandas.core.dtypes.inference import is_list_like + +isposinf_scalar = libmissing.isposinf_scalar +isneginf_scalar = libmissing.isneginf_scalar + + +def isna(obj): + """ + Detect missing values for an array-like object. + + This function takes a scalar or array-like object and indicates + whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN`` + in object arrays, ``NaT`` in datetimelike). + + Parameters + ---------- + obj : scalar or array-like + Object to check for null or missing values. + + Returns + ------- + bool or array-like of bool + For scalar input, returns a scalar boolean. + For array input, returns an array of boolean indicating whether each + corresponding element is missing. + + See Also + -------- + notna : Boolean inverse of pandas.isna. + Series.isna : Detect missing values in a Series. + DataFrame.isna : Detect missing values in a DataFrame. + Index.isna : Detect missing values in an Index. + + Examples + -------- + Scalar arguments (including strings) result in a scalar boolean. + + >>> pd.isna('dog') + False + + >>> pd.isna(pd.NA) + True + + >>> pd.isna(np.nan) + True + + ndarrays result in an ndarray of booleans. + + >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]]) + >>> array + array([[ 1., nan, 3.], + [ 4., 5., nan]]) + >>> pd.isna(array) + array([[False, True, False], + [False, False, True]]) + + For indexes, an ndarray of booleans is returned. + + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, + ... "2017-07-08"]) + >>> index + DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], + dtype='datetime64[ns]', freq=None) + >>> pd.isna(index) + array([False, False, True, False]) + + For Series and DataFrame, the same type is returned, containing booleans. 
+ + >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df + 0 1 2 + 0 ant bee cat + 1 dog None fly + >>> pd.isna(df) + 0 1 2 + 0 False False False + 1 False True False + + >>> pd.isna(df[1]) + 0 False + 1 True + Name: 1, dtype: bool + """ + return _isna(obj) + + +isnull = isna + + +def _isna_new(obj): + + if is_scalar(obj): + return libmissing.checknull(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, type): + return False + elif isinstance( + obj, + ( + ABCSeries, + np.ndarray, + ABCIndexClass, + ABCExtensionArray, + ABCDatetimeArray, + ABCTimedeltaArray, + ), + ): + return _isna_ndarraylike(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isna(func=isna)) + elif isinstance(obj, list): + return _isna_ndarraylike(np.asarray(obj, dtype=object)) + elif hasattr(obj, "__array__"): + return _isna_ndarraylike(np.asarray(obj)) + else: + return obj is None + + +def _isna_old(obj): + """ + Detect missing values, treating None, NaN, INF, -INF as null. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + """ + if is_scalar(obj): + return libmissing.checknull_old(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, type): + return False + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)): + return _isna_ndarraylike_old(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isna(func=_isna_old)) + elif isinstance(obj, list): + return _isna_ndarraylike_old(np.asarray(obj, dtype=object)) + elif hasattr(obj, "__array__"): + return _isna_ndarraylike_old(np.asarray(obj)) + else: + return obj is None + + +_isna = _isna_new + + +def _use_inf_as_na(key): + """ + Option change callback for na/inf behaviour. + + Choose which replacement for numpy.isnan / -numpy.isfinite is used. + + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * https://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + flag = get_option(key) + if flag: + globals()["_isna"] = _isna_old + else: + globals()["_isna"] = _isna_new + + +def _isna_ndarraylike(obj): + is_extension = is_extension_array_dtype(obj) + + if not is_extension: + # Avoid accessing `.values` on things like + # PeriodIndex, which may be expensive. + values = getattr(obj, "values", obj) + else: + values = obj + + dtype = values.dtype + + if is_extension: + if isinstance(obj, (ABCIndexClass, ABCSeries)): + values = obj._values + else: + values = obj + result = values.isna() + elif isinstance(obj, ABCDatetimeArray): + return obj.isna() + elif is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + # object array of strings + result = np.zeros(values.shape, dtype=bool) + else: + # object array of non-strings + result = np.empty(shape, dtype=bool) + vec = libmissing.isnaobj(values.ravel()) + result[...] 
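# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# isna() dispatches on the input type as shown above; the "mode.use_inf_as_na" option
# swaps the module-level _isna implementation so that +/-inf are also reported as missing.
import numpy as np
import pandas as pd

assert pd.isna(np.nan) and pd.isna(None) and not pd.isna("dog")
print(pd.isna(pd.Series([1.0, np.nan, np.inf])).tolist())    # [False, True, False]

pd.set_option("mode.use_inf_as_na", True)                    # switches _isna to _isna_old
print(pd.isna(pd.Series([1.0, np.nan, np.inf])).tolist())    # [False, True, True]
pd.set_option("mode.use_inf_as_na", False)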
= vec.reshape(shape) + + elif needs_i8_conversion(dtype): + # this is the NaT pattern + result = values.view("i8") == iNaT + else: + result = np.isnan(values) + + # box + if isinstance(obj, ABCSeries): + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def _isna_ndarraylike_old(obj): + values = getattr(obj, "values", obj) + dtype = values.dtype + + if is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = libmissing.isnaobj_old(values.ravel()) + result[:] = vec.reshape(shape) + + elif is_datetime64_dtype(dtype): + # this is the NaT pattern + result = values.view("i8") == iNaT + else: + result = ~np.isfinite(values) + + # box + if isinstance(obj, ABCSeries): + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def notna(obj): + """ + Detect non-missing values for an array-like object. + + This function takes a scalar or array-like object and indicates + whether values are valid (not missing, which is ``NaN`` in numeric + arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike). + + Parameters + ---------- + obj : array-like or object value + Object to check for *not* null or *non*-missing values. + + Returns + ------- + bool or array-like of bool + For scalar input, returns a scalar boolean. + For array input, returns an array of boolean indicating whether each + corresponding element is valid. + + See Also + -------- + isna : Boolean inverse of pandas.notna. + Series.notna : Detect valid values in a Series. + DataFrame.notna : Detect valid values in a DataFrame. + Index.notna : Detect valid values in an Index. + + Examples + -------- + Scalar arguments (including strings) result in a scalar boolean. + + >>> pd.notna('dog') + True + + >>> pd.notna(pd.NA) + False + + >>> pd.notna(np.nan) + False + + ndarrays result in an ndarray of booleans. + + >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]]) + >>> array + array([[ 1., nan, 3.], + [ 4., 5., nan]]) + >>> pd.notna(array) + array([[ True, False, True], + [ True, True, False]]) + + For indexes, an ndarray of booleans is returned. + + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, + ... "2017-07-08"]) + >>> index + DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], + dtype='datetime64[ns]', freq=None) + >>> pd.notna(index) + array([ True, True, False, True]) + + For Series and DataFrame, the same type is returned, containing booleans. + + >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df + 0 1 2 + 0 ant bee cat + 1 dog None fly + >>> pd.notna(df) + 0 1 2 + 0 True True True + 1 True False True + + >>> pd.notna(df[1]) + 0 True + 1 False + Name: 1, dtype: bool + """ + res = isna(obj) + if is_scalar(res): + return not res + return ~res + + +notnull = notna + + +def _isna_compat(arr, fill_value=np.nan) -> bool: + """ + Parameters + ---------- + arr: a numpy array + fill_value: fill value, default to np.nan + + Returns + ------- + True if we can fill using this fill_value + """ + dtype = arr.dtype + if isna(fill_value): + return not (is_bool_dtype(dtype) or is_integer_dtype(dtype)) + return True + + +def array_equivalent(left, right, strict_nan: bool = False) -> bool: + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. 
It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent( + ... np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) + True + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) + False + """ + + left, right = np.asarray(left), np.asarray(right) + + # shape compat + if left.shape != right.shape: + return False + + # Object arrays can contain None, NaN and NaT. + # string dtypes must be come to this path for NumPy 1.7.1 compat + if is_string_dtype(left) or is_string_dtype(right): + + if not strict_nan: + # isna considers NaN and None to be equivalent. + return lib.array_equivalent_object( + ensure_object(left.ravel()), ensure_object(right.ravel()) + ) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if not isinstance(right_value, float) or not np.isnan(right_value): + return False + else: + try: + if np.any(np.asarray(left_value != right_value)): + return False + except TypeError as err: + if "Cannot compare tz-naive" in str(err): + # tzawareness compat failure, see GH#28507 + return False + elif "boolean value of NA is ambiguous" in str(err): + return False + raise + return True + + # NaNs can occur in float and complex arrays. 
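# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# array_equivalent (an internal helper from missing.py above) treats NaNs in matching
# positions as equal; with strict_nan=True the object-dtype path additionally
# distinguishes None from np.nan.
import numpy as np
from pandas.core.dtypes.missing import array_equivalent

a = np.array([1.0, np.nan, 3.0])
b = np.array([1.0, np.nan, 3.0])
assert array_equivalent(a, b)                                # NaN positions line up
assert not array_equivalent(a, np.array([1.0, 3.0, np.nan]))

obj_a = np.array([1, None], dtype=object)
obj_b = np.array([1, np.nan], dtype=object)
assert array_equivalent(obj_a, obj_b)                        # default: None and NaN are interchangeable
assert not array_equivalent(obj_a, obj_b, strict_nan=True)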
+ if is_float_dtype(left) or is_complex_dtype(left): + + # empty + if not (np.prod(left.shape) and np.prod(right.shape)): + return True + return ((left == right) | (isna(left) & isna(right))).all() + + elif is_datetimelike_v_numeric(left, right): + # GH#29553 avoid numpy deprecation warning + return False + + elif needs_i8_conversion(left) or needs_i8_conversion(right): + # datetime64, timedelta64, Period + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.view("i8") + right = right.view("i8") + + # if we have structured dtypes, compare first + if left.dtype.type is np.void or right.dtype.type is np.void: + if left.dtype != right.dtype: + return False + + return np.array_equal(left, right) + + +def _infer_fill_value(val): + """ + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction + """ + + if not is_list_like(val): + val = [val] + val = np.array(val, copy=False) + if needs_i8_conversion(val): + return np.array("NaT", dtype=val.dtype) + elif is_object_dtype(val.dtype): + dtype = lib.infer_dtype(ensure_object(val), skipna=False) + if dtype in ["datetime", "datetime64"]: + return np.array("NaT", dtype=_NS_DTYPE) + elif dtype in ["timedelta", "timedelta64"]: + return np.array("NaT", dtype=_TD_DTYPE) + return np.nan + + +def _maybe_fill(arr, fill_value=np.nan): + """ + if we have a compatible fill_value and arr dtype, then fill + """ + if _isna_compat(arr, fill_value): + arr.fill(fill_value) + return arr + + +def na_value_for_dtype(dtype, compat: bool = True): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + compat : bool, default True + + Returns + ------- + np.dtype or a pandas dtype + + Examples + -------- + >>> na_value_for_dtype(np.dtype('int64')) + 0 + >>> na_value_for_dtype(np.dtype('int64'), compat=False) + nan + >>> na_value_for_dtype(np.dtype('float64')) + nan + >>> na_value_for_dtype(np.dtype('bool')) + False + >>> na_value_for_dtype(np.dtype('datetime64[ns]')) + NaT + """ + dtype = pandas_dtype(dtype) + + if is_extension_array_dtype(dtype): + return dtype.na_value + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_period_dtype(dtype) + ): + return NaT + elif is_float_dtype(dtype): + return np.nan + elif is_integer_dtype(dtype): + if compat: + return 0 + return np.nan + elif is_bool_dtype(dtype): + return False + return np.nan + + +def remove_na_arraylike(arr): + """ + Return array-like containing only true/non-NaN values, possibly empty. 
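# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# na_value_for_dtype picks the NA marker appropriate for a dtype; compat=True keeps
# integer dtypes representable by returning 0 instead of NaN.
import numpy as np
import pandas as pd
from pandas.core.dtypes.missing import na_value_for_dtype

assert np.isnan(na_value_for_dtype(np.dtype("float64")))
assert na_value_for_dtype(np.dtype("int64")) == 0
assert np.isnan(na_value_for_dtype(np.dtype("int64"), compat=False))
assert na_value_for_dtype(np.dtype("datetime64[ns]")) is pd.NaT
assert na_value_for_dtype(pd.DatetimeTZDtype(tz="UTC")) is pd.NaT   # extension dtypes use dtype.na_value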
+ """ + if is_extension_array_dtype(arr): + return arr[notna(arr)] + else: + return arr[notna(lib.values_from_object(arr))] + + +def is_valid_nat_for_dtype(obj, dtype) -> bool: + """ + isna check that excludes incompatible dtypes + + Parameters + ---------- + obj : object + dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype + + Returns + ------- + bool + """ + if not lib.is_scalar(obj) or not isna(obj): + return False + if dtype.kind == "M": + return not isinstance(obj, np.timedelta64) + if dtype.kind == "m": + return not isinstance(obj, np.datetime64) + + # must be PeriodDType + return not isinstance(obj, (np.datetime64, np.timedelta64)) diff --git a/venv/Lib/site-packages/pandas/core/frame.py b/venv/Lib/site-packages/pandas/core/frame.py new file mode 100644 index 0000000..cfd37ac --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/frame.py @@ -0,0 +1,8473 @@ +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" +import collections +from collections import abc +from io import StringIO +import itertools +import sys +from textwrap import dedent +from typing import ( + IO, + TYPE_CHECKING, + Any, + FrozenSet, + Hashable, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, + cast, +) +import warnings + +import numpy as np +import numpy.ma as ma + +from pandas._config import get_option + +from pandas._libs import algos as libalgos, lib +from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer +from pandas.compat import PY37 +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + Appender, + Substitution, + deprecate_kwarg, + rewrite_axis_style_signature, +) +from pandas.util._validators import ( + validate_axis_style_args, + validate_bool_kwarg, + validate_percentile, +) + +from pandas.core.dtypes.cast import ( + cast_scalar_to_array, + coerce_to_dtypes, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_cast_to_datetime, + maybe_convert_platform, + maybe_downcast_to_dtype, + maybe_infer_to_datetimelike, + maybe_upcast, + maybe_upcast_putmask, +) +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_platform_int, + infer_dtype_from_object, + is_bool_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_named_tuple, + is_object_dtype, + is_scalar, + is_sequence, + needs_i8_conversion, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import algorithms, common as com, nanops, ops +from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import Categorical, ExtensionArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray +from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import generic as groupby_generic +from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences 
+from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.multi import maybe_droplevels +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable +from pandas.core.internals import BlockManager +from pandas.core.internals.construction import ( + arrays_to_mgr, + get_names_from_index, + init_dict, + init_ndarray, + masked_rec_array_to_mgr, + reorder_arrays, + sanitize_index, + to_arrays, +) +from pandas.core.ops.missing import dispatch_fill_zeros +from pandas.core.series import Series + +from pandas.io.common import get_filepath_or_buffer +from pandas.io.formats import console, format as fmt +from pandas.io.formats.printing import pprint_thing +import pandas.plotting + +if TYPE_CHECKING: + from pandas.io.formats.style import Styler + +# --------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = dict( + axes="index, columns", + klass="DataFrame", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row.""", + optional_by=""" + by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels. + + .. versionchanged:: 0.23.0 + + Allow specifying index or column level names.""", + versionadded_to_excel="", + optional_labels="""labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to.""", + optional_axis="""axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1).""", +) + +_numeric_only_doc = """numeric_only : boolean, default None + Include only float, int, boolean data. If None, will attempt to use + everything, then use only numeric data +""" + +_merge_doc = """ +Merge DataFrame or named Series objects with a database-style join. + +The join is done on columns or indexes. If joining columns on +columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. + +Parameters +----------%s +right : DataFrame or named Series + Object to merge with. +how : {'left', 'right', 'outer', 'inner'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. +on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. +left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. +right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. 
Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. +left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. +right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. +sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). +suffixes : tuple of (str, str), default ('_x', '_y') + Suffix to apply to overlapping column names in the left and right + side, respectively. To raise an exception on overlapping columns use + (False, False). +copy : bool, default True + If False, avoid copy if possible. +indicator : bool or str, default False + If True, adds a column to output DataFrame called "_merge" with + information on the source of each row. + If string, column with information on source of each row will be added to + output DataFrame, and column will be named value of string. + Information column is Categorical-type and takes on a value of "left_only" + for observations whose merge key only appears in 'left' DataFrame, + "right_only" for observations whose merge key only appears in 'right' + DataFrame, and "both" if the observation's merge key is found in both. + +validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + .. versionadded:: 0.21.0 + +Returns +------- +DataFrame + A DataFrame of the two merged objects. + +See Also +-------- +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. + +Notes +----- +Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.23.0 +Support for merging named Series objects was added in version 0.24.0 + +Examples +-------- + +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [1, 2, 3, 5]}) +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [5, 6, 7, 8]}) +>>> df1 + lkey value +0 foo 1 +1 bar 2 +2 baz 3 +3 foo 5 +>>> df2 + rkey value +0 foo 5 +1 bar 6 +2 baz 7 +3 foo 8 + +Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 1 foo 8 +2 foo 5 foo 5 +3 foo 5 foo 8 +4 bar 2 bar 6 +5 baz 3 baz 7 + +Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', +... suffixes=('_left', '_right')) + lkey value_left rkey value_right +0 foo 1 foo 5 +1 foo 1 foo 8 +2 foo 5 foo 5 +3 foo 5 foo 8 +4 bar 2 bar 6 +5 baz 3 baz 7 + +Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) +Traceback (most recent call last): +... 
+ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') +""" + + +# ----------------------------------------------------------------------- +# DataFrame class + + +class DataFrame(NDFrame): + """ + Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Data structure also contains labeled axes (rows and columns). + Arithmetic operations align on both row and column labels. Can be + thought of as a dict-like container for Series objects. The primary + pandas data structure. + + Parameters + ---------- + data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame + Dict can contain Series, arrays, constants, or list-like objects. + + .. versionchanged:: 0.23.0 + If data is a dict, column order follows insertion-order for + Python 3.6 and later. + + .. versionchanged:: 0.25.0 + If data is a list of dicts, column order follows insertion-order + for Python 3.6 and later. + + index : Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + no indexing information part of input data and no index provided. + columns : Index or array-like + Column labels to use for resulting frame. Will default to + RangeIndex (0, 1, 2, ..., n) if no column labels are provided. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input. + + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + read_csv + read_table + read_clipboard + + Examples + -------- + Constructing DataFrame from a dictionary. + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ... 
columns=['a', 'b', 'c']) + >>> df2 + a b c + 0 1 2 3 + 1 4 5 6 + 2 7 8 9 + """ + + _typ = "dataframe" + + @property + def _constructor(self) -> Type["DataFrame"]: + return DataFrame + + _constructor_sliced: Type[Series] = Series + _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _accessors: Set[str] = {"sparse"} + + @property + def _constructor_expanddim(self): + raise NotImplementedError("Not supported for DataFrames!") + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index: Optional[Axes] = None, + columns: Optional[Axes] = None, + dtype: Optional[Dtype] = None, + copy: bool = False, + ): + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._data + + if isinstance(data, BlockManager): + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) + elif isinstance(data, dict): + mgr = init_dict(data, index, columns, dtype=dtype) + elif isinstance(data, ma.MaskedArray): + import numpy.ma.mrecords as mrecords + + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) + + # a masked array + else: + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + + elif isinstance(data, (np.ndarray, Series, Index)): + if data.dtype.names: + data_columns = list(data.dtype.names) + data = {k: data[k] for k in data_columns} + if columns is None: + columns = data_columns + mgr = init_dict(data, index, columns, dtype=dtype) + elif getattr(data, "name", None) is not None: + mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + else: + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + + # For data is list-like, or Iterable (will consume into list) + elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): + if not isinstance(data, (abc.Sequence, ExtensionArray)): + data = list(data) + if len(data) > 0: + if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: + if is_named_tuple(data[0]) and columns is None: + columns = data[0]._fields + arrays, columns = to_arrays(data, columns, dtype=dtype) + columns = ensure_index(columns) + + # set the index + if index is None: + if isinstance(data[0], Series): + index = get_names_from_index(data) + elif isinstance(data[0], Categorical): + index = ibase.default_index(len(data[0])) + else: + index = ibase.default_index(len(data)) + + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + else: + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + else: + mgr = init_dict({}, index, columns, dtype=dtype) + else: + try: + arr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError) as e: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {e}" + ) + raise exc from e + + if arr.ndim == 0 and index is not None and columns is not None: + values = cast_scalar_to_array( + (len(index), len(columns)), data, dtype=dtype + ) + mgr = init_ndarray( + values, index, columns, dtype=values.dtype, copy=False + ) + else: + raise ValueError("DataFrame constructor not properly called!") + + NDFrame.__init__(self, mgr, 
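# --- Illustrative usage sketch (editor's note; not part of the vendored pandas file) ---
# The DataFrame.__init__ above dispatches on the input type: dicts go through init_dict,
# structured ndarrays are unpacked field-by-field, and a list of namedtuples supplies the
# column labels from _fields.
from collections import namedtuple
import numpy as np
import pandas as pd

# dict of equal-length columns
print(pd.DataFrame({"confirm": [41, 45], "suspect": [0, 62]}))

# structured ndarray: data.dtype.names becomes the column labels
rec = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "i8"), ("b", "f8")])
print(pd.DataFrame(rec).columns.tolist())                    # ['a', 'b']

# list of namedtuples: columns default to the tuple's _fields
Row = namedtuple("Row", ["date", "confirm"])
print(pd.DataFrame([Row("01.20", 291), Row("01.21", 440)]).columns.tolist())  # ['date', 'confirm']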
fastpath=True) + + # ---------------------------------------------------------------------- + + @property + def axes(self) -> List[Index]: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def shape(self) -> Tuple[int, int]: + """ + Return a tuple representing the dimensionality of the DataFrame. + + See Also + -------- + ndarray.shape + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.shape + (2, 2) + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], + ... 'col3': [5, 6]}) + >>> df.shape + (2, 3) + """ + return len(self.index), len(self.columns) + + @property + def _is_homogeneous_type(self) -> bool: + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + See Also + -------- + Index._is_homogeneous_type : Whether the object has a single + dtype. + MultiIndex._is_homogeneous_type : Whether all the levels of a + MultiIndex have the same dtype. + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + False + + Items with the same type but different sizes are considered + different types. + + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type + False + """ + if self._data.any_extension_types: + return len({block.dtype for block in self._data.blocks}) == 1 + else: + return not self._data.is_mixed_type + + # ---------------------------------------------------------------------- + # Rendering Methods + + def _repr_fits_vertical_(self) -> bool: + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. + + In case off non-interactive session, no boundaries apply. + + `ignore_width` is here so ipnb+HTML output can behave the way + users expect. display.max_columns remains in effect. 
+ GH3541, GH3573 + """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + (not ignore_width) and width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if ignore_width or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if not (max_rows is None): # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(l) for l in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + buf = StringIO("") + if self._info_repr(): + self.info(buf=buf) + return buf.getvalue() + + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + max_colwidth = get_option("display.max_colwidth") + show_dimensions = get_option("display.show_dimensions") + if get_option("display.expand_frame_repr"): + width, _ = console.get_console_size() + else: + width = None + self.to_string( + buf=buf, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + line_width=width, + max_colwidth=max_colwidth, + show_dimensions=show_dimensions, + ) + + return buf.getvalue() + + def _repr_html_(self) -> Optional[str]: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO("") + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return "
" + val + "
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + table_id=None, + render_links=False, + ) + return formatter.to_html(notebook=True) + else: + return None + + @Substitution( + header_type="bool or sequence", + header="Write out the column names. If a list of strings " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int", + col_space="The minimum width of each column", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + columns: Optional[Sequence[str]] = None, + col_space: Optional[int] = None, + header: Union[bool, Sequence[str]] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: Optional[fmt.formatters_type] = None, + float_format: Optional[fmt.float_format_type] = None, + sparsify: Optional[bool] = None, + index_names: bool = True, + justify: Optional[str] = None, + max_rows: Optional[int] = None, + min_rows: Optional[int] = None, + max_cols: Optional[int] = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: Optional[int] = None, + max_colwidth: Optional[int] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. + max_colwidth : int, optional + Max width to truncate each column in characters. By default, no limit. + + .. versionadded:: 1.0.0 + encoding : str, default "utf-8" + Set character encoding. + + .. versionadded:: 1.0 + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. + + Examples + -------- + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + + from pandas import option_context + + with option_context("display.max_colwidth", max_colwidth): + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + ) + return formatter.to_string(buf=buf, encoding=encoding) + + # ---------------------------------------------------------------------- + + @property + def style(self) -> "Styler": + """ + Returns a Styler object. + + Contains methods for building a styled HTML representation of the DataFrame. + a styled HTML representation fo the DataFrame. + + See Also + -------- + io.formats.style.Styler + """ + from pandas.io.formats.style import Styler + + return Styler(self) + + _shared_docs[ + "items" + ] = r""" + Iterate over (column name, Series) pairs. 
+ + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print('label:', label) + ... print('content:', content, sep='\n') + ... + label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ + + @Appender(_shared_docs["items"]) + def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + if self.columns.is_unique and hasattr(self, "_item_cache"): + for k in self.columns: + yield k, self._get_item_cache(k) + else: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) + + @Appender(_shared_docs["items"]) + def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + yield from self.items() + + def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + it : generator + A generator that iterates over the rows of the frame. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row['int'].dtype) + float64 + >>> print(df['int'].dtype) + int64 + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + """ + columns = self.columns + klass = self._constructor_sliced + for k, v in zip(self.index, self.values): + s = klass(v, index=columns, name=k) + yield k, s + + def itertuples(self, index=True, name="Pandas"): + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. 
+ + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, + ... index=['dog', 'hawk']) + >>> df + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + ... + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + ... + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name='Animal'): + ... print(row) + ... + Animal(Index='dog', num_legs=4, num_wings=0) + Animal(Index='hawk', num_legs=2, num_wings=2) + """ + arrays = [] + fields = list(self.columns) + if index: + arrays.append(self.index) + fields.insert(0, "Index") + + # use integer indexing because of possible duplicate column names + arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) + + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: + itertuple = collections.namedtuple(name, fields, rename=True) + return map(itertuple._make, zip(*arrays)) + + # fallback to regular tuples + return zip(*arrays) + + def __len__(self) -> int: + """ + Returns length of info axis, but here we use the index. + """ + return len(self.index) + + def dot(self, other): + """ + Compute the matrix multiplication between the DataFrame and other. + + This method computes the matrix product between the DataFrame and the + values of an other Series, DataFrame or a numpy array. + + It can also be called using ``self @ other`` in Python >= 3.5. + + Parameters + ---------- + other : Series, DataFrame or array-like + The other object to compute the matrix product with. + + Returns + ------- + Series or DataFrame + If other is a Series, return the matrix product between self and + other as a Serie. If other is a DataFrame or a numpy.array, return + the matrix product of self and other in a DataFrame of a np.array. + + See Also + -------- + Series.dot: Similar method for Series. + + Notes + ----- + The dimensions of DataFrame and other must be compatible in order to + compute the matrix multiplication. In addition, the column names of + DataFrame and the index of other must contain the same values, as they + will be aligned prior to the multiplication. + + The dot method for Series computes the inner product, instead of the + matrix product here. + + Examples + -------- + Here we multiply a DataFrame with a Series. 
+ + >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) + >>> s = pd.Series([1, 1, 2, 1]) + >>> df.dot(s) + 0 -4 + 1 5 + dtype: int64 + + Here we multiply a DataFrame with another DataFrame. + + >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(other) + 0 1 + 0 1 4 + 1 2 2 + + Note that the dot method give the same result as @ + + >>> df @ other + 0 1 + 0 1 4 + 1 2 2 + + The dot method works also if other is an np.array. + + >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) + >>> df.dot(arr) + 0 1 + 0 1 4 + 1 2 2 + + Note how shuffling of the objects does not change the result. + + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 + """ + if isinstance(other, (Series, DataFrame)): + common = self.columns.union(other.index) + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(columns=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right.values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[1] != rvals.shape[0]: + raise ValueError( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, DataFrame): + return self._constructor( + np.dot(lvals, rvals), index=left.index, columns=other.columns + ) + elif isinstance(other, Series): + return Series(np.dot(lvals, rvals), index=left.index) + elif isinstance(rvals, (np.ndarray, Index)): + result = np.dot(lvals, rvals) + if result.ndim == 2: + return self._constructor(result, index=left.index) + else: + return Series(result, index=left.index) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + def __matmul__(self, other): + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + return self.dot(other) + + def __rmatmul__(self, other): + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + return self.T.dot(np.transpose(other)).T + + # ---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + dtype : dtype, default None + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'``. + + .. versionadded:: 0.23.0 + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from ndarray (structured + dtype), list of tuples, dict, or DataFrame. + DataFrame : DataFrame object creation using constructor. 
+ + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 a b c d + + When using the 'index' orientation, the column names can be + specified manually: + + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 a b c d + """ + index = None + orient = orient.lower() + if orient == "index": + if len(data) > 0: + # TODO speed up Series case + if isinstance(list(data.values())[0], (Series, dict)): + data = _from_nested_dict(data) + else: + data, index = list(data.values()), list(data.keys()) + elif orient == "columns": + if columns is not None: + raise ValueError("cannot use columns parameter with orient='columns'") + else: # pragma: no cover + raise ValueError("only recognize index or columns for orient") + + return cls(data, index=index, columns=columns, dtype=dtype) + + def to_numpy(self, dtype=None, copy=False) -> np.ndarray: + """ + Convert the DataFrame to a NumPy array. + + .. versionadded:: 0.24.0 + + By default, the dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, if the dtypes are + ``float16`` and ``float32``, the results dtype will be ``float32``. + This may require copying data and coercing values, which may be + expensive. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + result = np.array(self.values, dtype=dtype, copy=copy) + return result + + def to_dict(self, orient="dict", into=dict): + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'records' : list like + [{column -> value}, ... 
, {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + Abbreviations are allowed. `s` indicates `series` and `sp` + indicates `split`. + + into : class, default dict + The collections.abc.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + .. versionadded:: 0.21.0 + + Returns + ------- + dict, list or collections.abc.Mapping + Return a collections.abc.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. + + See Also + -------- + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. + + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) + >>> df + col1 col2 + row1 1 0.50 + row2 2 0.75 + >>> df.to_dict() + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} + + You can specify the return orientation. + + >>> df.to_dict('series') + {'col1': row1 1 + row2 2 + Name: col1, dtype: int64, + 'col2': row1 0.50 + row2 0.75 + Name: col2, dtype: float64} + + >>> df.to_dict('split') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]]} + + >>> df.to_dict('records') + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] + + >>> df.to_dict('index') + {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} + + You can also specify the mapping type. + + >>> from collections import OrderedDict, defaultdict + >>> df.to_dict(into=OrderedDict) + OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), + ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + + If you want a `defaultdict`, you need to initialize it: + + >>> dd = defaultdict(list) + >>> df.to_dict('records', into=dd) + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] + """ + if not self.columns.is_unique: + warnings.warn( + "DataFrame columns are not unique, some columns will be omitted.", + UserWarning, + stacklevel=2, + ) + # GH16122 + into_c = com.standardize_mapping(into) + if orient.lower().startswith("d"): + return into_c((k, v.to_dict(into)) for k, v in self.items()) + elif orient.lower().startswith("l"): + return into_c((k, v.tolist()) for k, v in self.items()) + elif orient.lower().startswith("sp"): + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ( + "data", + [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ) + ) + elif orient.lower().startswith("s"): + return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) + elif orient.lower().startswith("r"): + columns = self.columns.tolist() + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) + return [ + into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) + for row in rows + ] + elif orient.lower().startswith("i"): + if not self.index.is_unique: + raise ValueError("DataFrame index must be unique for orient='index'.") + return into_c( + (t[0], dict(zip(self.columns, t[1:]))) + for t in self.itertuples(name=None) + ) + else: + raise ValueError(f"orient '{orient}' not understood") + + def to_gbq( + self, + destination_table, + project_id=None, + chunksize=None, + reauth=False, + if_exists="fail", + auth_local_webserver=False, + 
table_schema=None, + location=None, + progress_bar=True, + credentials=None, + ) -> None: + """ + Write a DataFrame to a Google BigQuery table. + + This function requires the `pandas-gbq package + `__. + + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. + + Parameters + ---------- + destination_table : str + Name of table to be written, in the form ``dataset.tablename``. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. + chunksize : int, optional + Number of rows to be inserted in each chunk from the dataframe. + Set to ``None`` to load the whole dataframe at once. + reauth : bool, default False + Force Google BigQuery to re-authenticate the user. This is useful + if multiple accounts are used. + if_exists : str, default 'fail' + Behavior when the destination table exists. Value can be one of: + + ``'fail'`` + If table exists raise pandas_gbq.gbq.TableCreationError. + ``'replace'`` + If table exists, drop it, recreate it, and insert data. + ``'append'`` + If table exists, insert data. Create if does not exist. + auth_local_webserver : bool, default False + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. _console flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. + table_schema : list of dicts, optional + List of BigQuery table fields to which according DataFrame + columns conform to, e.g. ``[{'name': 'col1', 'type': + 'STRING'},...]``. If schema is not provided, it will be + generated according to dtypes of DataFrame columns. See + BigQuery API documentation on available names of a field. + + *New in version 0.3.1 of pandas-gbq*. + location : str, optional + Location where the load job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of the + target dataset. + + *New in version 0.5.0 of pandas-gbq*. + progress_bar : bool, default True + Use the library `tqdm` to show the progress bar for the upload, + chunk by chunk. + + *New in version 0.5.0 of pandas-gbq*. + credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to + override default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service + Account :class:`google.oauth2.service_account.Credentials` + directly. + + *New in version 0.8.0 of pandas-gbq*. + + .. versionadded:: 0.24.0 + + See Also + -------- + pandas_gbq.to_gbq : This function in the pandas-gbq library. + read_gbq : Read a DataFrame from Google BigQuery. + """ + from pandas.io import gbq + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + ) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + ) -> "DataFrame": + """ + Convert structured or record ndarray to DataFrame. 
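Each record (a row of a structured ndarray, a tuple, or a dict) becomes one row of the resulting DataFrame, with field names supplying the column labels unless ``columns`` overrides them. For example, ``pd.DataFrame.from_records([(1, 'a'), (2, 'b')], columns=['x', 'y'])`` builds a two-row frame with columns ``x`` and ``y``.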
+ + Parameters + ---------- + data : ndarray (structured dtype), list of tuples, dict, or DataFrame + index : str, list of fields, array-like + Field of array to use as the index, alternately a specific set of + input labels to use. + exclude : sequence, default None + Columns or fields to exclude. + columns : sequence, default None + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns). + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + nrows : int, default None + Number of rows to read if data is an iterator. + + Returns + ------- + DataFrame + """ + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = ensure_index(columns) + + if is_iterator(data): + if nrows == 0: + return cls() + + try: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, "dtype") and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns = [] + for k, v in data.items(): + if k in columns: + arr_columns.append(k) + arrays.append(v) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) + + elif isinstance(data, (np.ndarray, DataFrame)): + arrays, columns = to_arrays(data, columns) + if columns is not None: + columns = ensure_index(columns) + arr_columns = columns + else: + arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float) + + arr_columns = ensure_index(arr_columns) + if columns is not None: + columns = ensure_index(columns) + else: + columns = arr_columns + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + result_index = None + if index is not None: + if isinstance(index, str) or not hasattr(index, "__iter__"): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + index_data = [arrays[arr_columns.get_loc(field)] for field in index] + except (KeyError, TypeError): + # raised by get_loc, see GH#29258 + result_index = index + else: + result_index = ensure_index_from_sequences(index_data, names=index) + exclude.update(index) + + if any(exclude): + arr_exclude = [x for x in exclude if x in arr_columns] + to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arrays = [v for i, v in enumerate(arrays) if i not in to_remove] + + arr_columns = arr_columns.drop(arr_exclude) + columns = columns.drop(exclude) + + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + + return cls(mgr) + + def to_records( + self, index=True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: + """ + Convert DataFrame to a NumPy record array. + + Index will be included as the first field of the record array if + requested. 
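Column dtypes are carried over into the record array's compound dtype; for example, ``pd.DataFrame({'A': [1, 2]}, index=['a', 'b']).to_records()`` yields a recarray with fields ``index`` and ``A``.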
+ + Parameters + ---------- + index : bool, default True + Include index in resulting record array, stored in 'index' + field or using the index label, if set. + column_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 + + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. + index_dtypes : str, type, dict, default None + .. versionadded:: 0.24.0 + + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + + This mapping is applied only if `index=True`. + + Returns + ------- + numpy.recarray + NumPy ndarray with the DataFrame labels as fields and each row + of the DataFrame as entries. + + See Also + -------- + DataFrame.from_records: Convert structured or record ndarray + to DataFrame. + numpy.recarray: An ndarray that allows field access using + attributes, analogous to typed columns in a + spreadsheet. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, + ... index=['a', 'b']) + >>> df + A B + a 1 0.50 + b 2 0.75 + >>> df.to_records() + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('index', 'O'), ('A', '>> df.index = df.index.rename("I") + >>> df.to_records() + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '>> df.to_records(index=False) + rec.array([(1, 0.5 ), (2, 0.75)], + dtype=[('A', '>> df.to_records(column_dtypes={"A": "int32"}) + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('I', 'O'), ('A', '>> df.to_records(index_dtypes=">> index_dtypes = f">> df.to_records(index_dtypes=index_dtypes) + rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], + dtype=[('I', 'S1'), ('A', ' "DataFrame": + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + return cls(mgr) + + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_stata( + self, + path, + convert_dates=None, + write_index=True, + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + version=114, + convert_strl=None, + ): + """ + Export DataFrame object to Stata dta format. + + Writes the DataFrame to a Stata dataset file. + "dta" files contain a Stata dataset. + + Parameters + ---------- + path : str, buffer or path object + String, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + data has been written. + + .. versionchanged:: 1.0.0 + + Previously this was "fname" + + convert_dates : dict + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information. + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder`. + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str, optional + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. 
+ version : {114, 117, 118, 119, None}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Added support for formats 118 and 119. + + convert_strl : list, optional + List of column names to convert to string columns to Stata StrL + format. Only available if version is 117. Storing strings in the + StrL format can produce smaller dta files if strings have more than + 8 characters and values are repeated. + + .. versionadded:: 0.23.0 + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + See Also + -------- + read_stata : Import Stata data files. + io.stata.StataWriter : Low-level writer for Stata data files. + io.stata.StataWriter117 : Low-level writer for version 117 files. + + Examples + -------- + >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', + ... 'parrot'], + ... 'speed': [350, 18, 361, 15]}) + >>> df.to_stata('animals.dta') # doctest: +SKIP + """ + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") + if version == 114: + if convert_strl is not None: + raise ValueError("strl is not supported in format 114") + from pandas.io.stata import StataWriter as statawriter + elif version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: # versions 118 and 119 + from pandas.io.stata import StataWriterUTF8 as statawriter + + kwargs = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 + kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version + + writer = statawriter( + path, + self, + convert_dates=convert_dates, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + write_index=write_index, + variable_labels=variable_labels, + **kwargs, + ) + writer.write_file() + + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_feather(self, path) -> None: + """ + Write out the binary feather-format for DataFrames. + + Parameters + ---------- + path : str + String file path. + """ + from pandas.io.feather_format import to_feather + + to_feather(self, path) + + @Appender( + """ + Examples + -------- + >>> df = pd.DataFrame( + ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} + ... 
) + >>> print(df.to_markdown()) + | | animal_1 | animal_2 | + |---:|:-----------|:-----------| + | 0 | elk | dog | + | 1 | pig | quetzal | + """ + ) + @Substitution(klass="DataFrame") + @Appender(_shared_docs["to_markdown"]) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: + kwargs.setdefault("headers", "keys") + kwargs.setdefault("tablefmt", "pipe") + tabulate = import_optional_dependency("tabulate") + result = tabulate.tabulate(self, **kwargs) + if buf is None: + return result + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + assert buf is not None # Help mypy. + buf.writelines(result) + return None + + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_parquet( + self, + path, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs, + ) -> None: + """ + Write a DataFrame to the binary parquet format. + + .. versionadded:: 0.21.0 + + This function writes the dataframe as a `parquet file + `_. You can choose different parquet + backends, and have the option of compression. See + :ref:`the user guide ` for more details. + + Parameters + ---------- + path : str + File path or Root Directory path. Will be used as Root Directory + path while writing a partitioned dataset. + + .. versionchanged:: 1.0.0 + + Previously this was "fname" + + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + + .. versionadded:: 0.24.0 + + partition_cols : list, optional, default None + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + + .. versionadded:: 0.24.0 + + **kwargs + Additional arguments passed to the parquet library. See + :ref:`pandas io ` for more details. + + See Also + -------- + read_parquet : Read a parquet file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + + Notes + ----- + This function requires either the `fastparquet + `_ or `pyarrow + `_ library. + + Examples + -------- + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + """ + from pandas.io.parquet import to_parquet + + to_parquet( + self, + path, + engine, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs, + ) + + @Substitution( + header_type="bool", + header="Whether to print column labels, default True", + col_space_type="str or int", + col_space="The minimum width of each column in CSS length " + "units. An int is assumed to be px units.\n\n" + " .. 
versionadded:: 0.25.0\n" + " Ability to use str", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_html( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + bold_rows=True, + classes=None, + escape=True, + notebook=False, + border=None, + table_id=None, + render_links=False, + encoding=None, + ): + """ + Render a DataFrame as an HTML table. + %(shared_params)s + bold_rows : bool, default True + Make the row labels bold in the output. + classes : str or list or tuple, default None + CSS class(es) to apply to the resulting html table. + escape : bool, default True + Convert the characters <, >, and & to HTML-safe sequences. + notebook : {True, False}, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + `
<table>` tag. Default ``pd.options.display.html.border``. + encoding : str, default "utf-8" + Set character encoding. + + .. versionadded:: 1.0 + + table_id : str, optional + A css id is included in the opening `<table>
` tag if specified. + + .. versionadded:: 0.23.0 + + render_links : bool, default False + Convert URLs to HTML links. + + .. versionadded:: 0.24.0 + %(returns)s + See Also + -------- + to_string : Convert DataFrame to a string. + """ + + if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: + raise ValueError("Invalid value for justify parameter") + + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + bold_rows=bold_rows, + escape=escape, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + table_id=table_id, + render_links=render_links, + ) + # TODO: a generic formatter wld b in DataFrameFormatter + return formatter.to_html( + buf=buf, + classes=classes, + notebook=notebook, + border=border, + encoding=encoding, + ) + + # ---------------------------------------------------------------------- + + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ) -> None: + """ + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and column dtypes, non-null values and memory usage. + + Parameters + ---------- + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + memory_usage : bool, str, optional + Specifies whether total memory usage of the DataFrame + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the frame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a DataFrame and returns None. + + See Also + -------- + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. + DataFrame.memory_usage: Memory usage of DataFrame columns. + + Examples + -------- + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... 
"float_col": float_values}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + Prints information of all columns: + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Prints a summary of columns count and its dtypes but not per column + information: + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... }) + >>> df.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 188.8 MB + """ + + if buf is None: # pragma: no cover + buf = sys.stdout + + lines = [] + + lines.append(str(type(self))) + lines.append(self.index._summary()) + + if len(self.columns) == 0: + lines.append(f"Empty {type(self).__name__}") + fmt.buffer_put_lines(buf, lines) + return + + cols = self.columns + col_count = len(self.columns) + + # hack + if max_cols is None: + max_cols = get_option("display.max_info_columns", len(self.columns) + 1) + + max_rows = get_option("display.max_info_rows", len(self) + 1) + + if null_counts is None: + show_counts = (col_count <= max_cols) and (len(self) < max_rows) + else: + show_counts = null_counts + exceeds_info_cols = col_count > max_cols + + def _verbose_repr(): + lines.append(f"Data columns (total {len(self.columns)} columns):") + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space + counts = None + + header = _put_str(id_head, space_num) + _put_str(column_head, space) + if show_counts: + counts = self.count() + if len(cols) != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({len(cols)} != 
{len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) + + for i, col in enumerate(self.columns): + dtype = self.dtypes.iloc[i] + col = pprint_thing(col) + + line_no = _put_str(" {num}".format(num=i), space_num) + count = "" + if show_counts: + count = counts.iloc[i] + + lines.append( + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) + + def _non_verbose_repr(): + lines.append(self.columns._summary(name="Columns")) + + def _sizeof_fmt(num, size_qualifier): + # returns size in human readable format + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + if verbose: + _verbose_repr() + elif verbose is False: # specifically set to False, not nesc None + _non_verbose_repr() + else: + if exceeds_info_cols: + _non_verbose_repr() + else: + _verbose_repr() + + counts = self._data.get_dtype_counts() + dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(dtypes)}") + + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + if memory_usage: + # append memory usage of df to display + size_qualifier = "" + if memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self.memory_usage(index=True, deep=deep).sum() + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(buf, lines) + + def memory_usage(self, index=True, deep=False) -> Series: + """ + Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True``, the memory usage of + the index is the first item in the output. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. + + Returns + ------- + Series + A Series whose index is the original column names and whose values + is the memory usage of each column in bytes. 
+ + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + Categorical : Memory-efficient array for string values with + many repeated values. + DataFrame.info : Concise summary of a DataFrame. + + Examples + -------- + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, np.ones(shape=5000).astype(t)) + ... for t in dtypes]) + >>> df = pd.DataFrame(data) + >>> df.head() + int64 float64 complex128 object bool + 0 1 1.0 1.000000+0.000000j 1 True + 1 1 1.0 1.000000+0.000000j 1 True + 2 1 1.0 1.000000+0.000000j 1 True + 3 1 1.0 1.000000+0.000000j 1 True + 4 1 1.0 1.000000+0.000000j 1 True + + >>> df.memory_usage() + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + >>> df.memory_usage(index=False) + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + The memory footprint of `object` dtype columns is ignored by default: + + >>> df.memory_usage(deep=True) + Index 128 + int64 40000 + float64 40000 + complex128 80000 + object 160000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True) + 5216 + """ + result = Series( + [c.memory_usage(index=False, deep=deep) for col, c in self.items()], + index=self.columns, + ) + if index: + result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append( + result + ) + return result + + def transpose(self, *args, copy: bool = False) -> "DataFrame": + """ + Transpose index and columns. + + Reflect the DataFrame over its main diagonal by writing rows as columns + and vice-versa. The property :attr:`.T` is an accessor to the method + :meth:`transpose`. + + Parameters + ---------- + *args : tuple, optional + Accepted for compatibility with NumPy. + copy : bool, default False + Whether to copy the data after transposing, even for DataFrames + with a single dtype. + + Note that a copy is always required for mixed dtype DataFrames, + or for DataFrames with any extension types. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + numpy.transpose : Permute the dimensions of a given array. + + Notes + ----- + Transposing a DataFrame with mixed dtypes will result in a homogeneous + DataFrame with the `object` dtype. In such a case, a copy of the data + is always made. + + Examples + -------- + **Square DataFrame with homogeneous dtype** + + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = pd.DataFrame(data=d1) + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + >>> df1_transposed = df1.T # or df1.transpose() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 int64 + col2 int64 + dtype: object + >>> df1_transposed.dtypes + 0 int64 + 1 int64 + dtype: object + + **Non-square DataFrame with mixed dtypes** + + >>> d2 = {'name': ['Alice', 'Bob'], + ... 'score': [9.5, 8], + ... 'employed': [False, True], + ... 
'kids': [0, 0]} + >>> df2 = pd.DataFrame(data=d2) + >>> df2 + name score employed kids + 0 Alice 9.5 False 0 + 1 Bob 8.0 True 0 + + >>> df2_transposed = df2.T # or df2.transpose() + >>> df2_transposed + 0 1 + name Alice Bob + score 9.5 8 + employed False True + kids 0 0 + + When the DataFrame has mixed dtypes, we get a transposed DataFrame with + the `object` dtype: + + >>> df2.dtypes + name object + score float64 + employed bool + kids int64 + dtype: object + >>> df2_transposed.dtypes + 0 object + 1 object + dtype: object + """ + nv.validate_transpose(args, dict()) + # construct the args + + dtypes = list(self.dtypes) + if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): + # We have EAs with the same dtype. We can preserve that dtype in transpose. + dtype = dtypes[0] + arr_type = dtype.construct_array_type() + values = self.values + + new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + result = self._constructor( + dict(zip(self.index, new_values)), index=self.columns + ) + + else: + new_values = self.values.T + if copy: + new_values = new_values.copy() + result = self._constructor( + new_values, index=self.columns, columns=self.index + ) + + return result.__finalize__(self) + + T = property(transpose) + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _ixs(self, i: int, axis: int = 0): + """ + Parameters + ---------- + i : int + axis : int + + Notes + ----- + If slice passed, the resulting data will be a view. + """ + # irow + if axis == 0: + new_values = self._data.fast_xs(i) + + # if we are a copy, mark as such + copy = isinstance(new_values, np.ndarray) and new_values.base is None + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype, + ) + result._set_is_copy(self, copy=copy) + return result + + # icol + else: + label = self.columns[i] + + # if the values returned are not the same length + # as the index (iow a not found value), iget returns + # a 0-len ndarray. This is effectively catching + # a numpy error (as numpy should really raise) + values = self._data.iget(i) + + if len(self.index) and not len(values): + values = np.array([np.nan] * len(self.index), dtype=object) + result = self._box_col_values(values, label) + + # this is a cached value, mark it so + result._set_as_cached(label, self) + + return result + + def __getitem__(self, key): + key = lib.item_from_zerodim(key) + key = com.apply_if_callable(key, self) + + if is_hashable(key): + # shortcut if the key is in columns + if self.columns.is_unique and key in self.columns: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + return self._get_item_cache(key) + + # Do we have a slicer (on rows)? + indexer = convert_to_index_sliceable(self, key) + if indexer is not None: + # either we have a slice or we have a string that can be converted + # to a slice for partial-string date indexing + return self._slice(indexer, axis=0) + + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): + return self.where(key) + + # Do we have a (boolean) 1d indexer? 
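The `transpose` implementation above keeps a single extension dtype intact by rebuilding each row with `_from_sequence`, but otherwise falls back to `self.values.T`, which collapses mixed dtypes to `object`. A small check of both branches, under the assumption that nullable `Int64` behaves like any other extension dtype here; column names are made up.

```python
import pandas as pd

# Homogeneous nullable-integer frame: the extension-array branch rebuilds
# each row with the same dtype, so the transpose stays Int64.
ea = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")
print(ea.T.dtypes)      # both transposed columns Int64

# Mixed dtypes take the ndarray branch: values.T is an object array.
mixed = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
print(mixed.T.dtypes)   # both transposed columns object
```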
+ if com.is_bool_indexer(key): + return self._getitem_bool_array(key) + + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + is_single_key = isinstance(key, tuple) or not is_list_like(key) + + if is_single_key: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] + else: + if is_iterator(key): + key = list(key) + indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1] + + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] + + data = self._take_with_is_copy(indexer, axis=1) + + if is_single_key: + # What does looking for a single key in a non-unique index return? + # The behavior is inconsistent. It returns a Series, except when + # - the key itself is repeated (test on data.shape, #9519), or + # - we have a MultiIndex on columns (test on self.columns, #21309) + if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): + data = data[key] + + return data + + def _getitem_bool_array(self, key): + # also raises Exception if object array with NA values + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + UserWarning, + stacklevel=3, + ) + elif len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}." + ) + + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self._take_with_is_copy(indexer, axis=0) + + def _getitem_multilevel(self, key): + # self.columns is a MultiIndex + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, Series, np.ndarray, Index)): + new_columns = self.columns[loc] + result_columns = maybe_droplevels(new_columns, key) + if self._is_mixed_type: + result = self.reindex(columns=new_columns) + result.columns = result_columns + else: + new_values = self.values[:, loc] + result = self._constructor( + new_values, index=self.index, columns=result_columns + ) + result = result.__finalize__(self) + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. If the result is a Series, exclude the + # implied empty string from its name. + if len(result.columns) == 1: + top = result.columns[0] + if isinstance(top, tuple): + top = top[0] + if top == "": + result = result[""] + if isinstance(result, Series): + result = self._constructor_sliced( + result, index=self.index, name=key + ) + + result._set_is_copy(self) + return result + else: + return self._get_item_cache(key) + + def _get_value(self, index, col, takeable: bool = False): + """ + Quickly retrieve single value at passed column and index. 
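Two behaviours implemented above are easy to miss when reading the code: a boolean `Series` key is reindexed to the frame's index (with a `UserWarning`) rather than applied positionally, and selecting the first level of a `MultiIndex` column returns a sub-frame with that level dropped. A short illustration with arbitrary labels.

```python
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])

# Boolean Series in a different order: it is aligned to df.index first,
# so rows "a" and "c" are kept regardless of the key's ordering.
mask = pd.Series([True, True, False], index=["c", "a", "b"])
print(df[mask])

# MultiIndex columns: df["first"] returns a DataFrame with the matched
# top level dropped from the remaining column labels.
cols = pd.MultiIndex.from_tuples([("first", "x"), ("first", "y"), ("second", "z")])
mdf = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
print(mdf["first"])
```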
+ + Parameters + ---------- + index : row label + col : column label + takeable : interpret the index/col as indexers, default False + + Returns + ------- + scalar + """ + if takeable: + series = self._iget_item_cache(col) + return com.maybe_box_datetimelike(series._values[index]) + + series = self._get_item_cache(col) + engine = self.index._engine + + try: + return engine.get_value(series._values, index) + except KeyError: + # GH 20629 + if self.index.nlevels > 1: + # partial indexing forbidden + raise + except (TypeError, ValueError): + pass + + # we cannot handle direct indexing + # use positional + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) + + def __setitem__(self, key, value): + key = com.apply_if_callable(key, self) + + # see if we can slice the rows + indexer = convert_to_index_sliceable(self, key) + if indexer is not None: + # either we have a slice or we have a string that can be converted + # to a slice for partial-string date indexing + return self._setitem_slice(indexer, value) + + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: + self._setitem_frame(key, value) + elif isinstance(key, (Series, np.ndarray, list, Index)): + self._setitem_array(key, value) + else: + # set column + self._set_item(key, value) + + def _setitem_slice(self, key, value): + self._check_setitem_copy() + self.loc[key] = value + + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com.is_bool_indexer(key): + if len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}!" + ) + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self._check_setitem_copy() + self.loc._setitem_with_indexer(indexer, value) + else: + if isinstance(value, DataFrame): + if len(value.columns) != len(key): + raise ValueError("Columns must be same length as key") + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + else: + indexer = self.loc._get_listlike_indexer( + key, axis=1, raise_missing=False + )[1] + self._check_setitem_copy() + self.loc._setitem_with_indexer((slice(None), indexer), value) + + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + key = self._constructor(key, **self._construct_axes_dict()) + + if key.values.size and not is_bool_dtype(key.values): + raise TypeError( + "Must pass DataFrame or 2-d ndarray with boolean values only" + ) + + self._check_inplace_setting(value) + self._check_setitem_copy() + self._where(-key, value, inplace=True) + + def _set_item(self, key, value): + """ + Add series to DataFrame in specified column. + + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrames index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrames index to + ensure homogeneity. + """ + + self._ensure_valid_index(value) + value = self._sanitize_column(key, value) + NDFrame._set_item(self, key, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + + def _set_value(self, index, col, value, takeable: bool = False): + """ + Put single value at passed column and index. 
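The setter paths above split on the key type: a boolean `DataFrame` key routes through `_setitem_frame` (a masked `where`), a list of labels through `_setitem_array`, and a plain label through `_set_item`. A compact sketch of all three, using throwaway data.

```python
import pandas as pd

df = pd.DataFrame({"a": [1, -2, 3], "b": [-4, 5, -6]})

# Boolean-frame key: every position where the mask is True is replaced.
df[df < 0] = 0

# List-of-labels key with a DataFrame value: columns are paired up
# positionally, so the number of columns must match the key length.
df[["a", "b"]] = pd.DataFrame({"x": [10, 20, 30], "y": [40, 50, 60]})

# Single label: a new column; a scalar is broadcast to the index length.
df["c"] = 99
print(df)
```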
+ + Parameters + ---------- + index : row label + col : column label + value : scalar + takeable : interpret the index/col as indexers, default False + + Returns + ------- + DataFrame + If label pair is contained, will be reference to calling DataFrame, + otherwise a new object. + """ + try: + if takeable is True: + series = self._iget_item_cache(col) + return series._set_value(index, value, takeable=True) + + series = self._get_item_cache(col) + engine = self.index._engine + engine.set_value(series._values, index, value) + return self + except (KeyError, TypeError): + + # set using a non-recursive method & reset the cache + if takeable: + self.iloc[index, col] = value + else: + self.loc[index, col] = value + self._item_cache.pop(col, None) + + return self + + def _ensure_valid_index(self, value): + """ + Ensure that if we don't have an index, that we can create one from the + passed value. + """ + # GH5632, make sure that we are a Series convertible + if not len(self.index) and is_list_like(value) and len(value): + try: + value = Series(value) + except (ValueError, NotImplementedError, TypeError): + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a " + "Series" + ) + + self._data = self._data.reindex_axis( + value.index.copy(), axis=1, fill_value=np.nan + ) + + def _box_item_values(self, key, values): + items = self.columns[self.columns.get_loc(key)] + if values.ndim == 2: + return self._constructor(values.T, columns=items, index=self.index) + else: + return self._box_col_values(values, items) + + def _box_col_values(self, values, items): + """ + Provide boxed values for a column. + """ + klass = self._constructor_sliced + return klass(values, index=self.index, name=items, fastpath=True) + + # ---------------------------------------------------------------------- + # Unsorted + + def query(self, expr, inplace=False, **kwargs): + """ + Query the columns of a DataFrame with a boolean expression. + + Parameters + ---------- + expr : str + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + .. versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Expanding functionality of backtick quoting for more than only spaces. + + inplace : bool + Whether the query should modify the data in place or return + a modified copy. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + DataFrame + DataFrame resulting from the provided query expression. + + See Also + -------- + eval : Evaluate a string describing operations on + DataFrame columns. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. 
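`_get_value` and `_set_value` above are the fast single-cell paths that the public `.at` (label-based) and `.iat` (positional) accessors lean on, skipping the general indexing machinery. A minimal sketch of the public counterparts; the labels are illustrative.

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["r1", "r2"])

# Label-based scalar access and assignment (the takeable=False path).
print(df.at["r1", "b"])   # 3
df.at["r1", "b"] = 30

# Positional scalar access and assignment (the takeable=True path).
print(df.iat[1, 0])       # 2
df.iat[1, 0] = 20

print(df)
```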
+ + This method uses the top-level :func:`eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. + + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. Please note that + Python keywords may not be used as identifiers. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query('A > B') + A B C C + 4 5 2 6 + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. 
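The notes above mention a few `query` features that the examples further down do not show: referring to local variables with `@`, addressing the frame's index as `index`, and switching the evaluation backend. A small sketch under the usual `pd` import; the threshold variable is invented for the example.

```python
import pandas as pd

df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})

# '@name' pulls a variable from the calling scope into the expression.
threshold = 3
print(df.query("A > @threshold"))

# The frame's index is addressable as 'index' inside the expression.
print(df.query("index >= 2"))

# engine='python' evaluates with plain Python instead of numexpr; with the
# default parser, '&' keeps the low (boolean-style) precedence described above.
print(df.query("A > 2 & B > 4", engine="python"))
```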
+ + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + res = self.eval(expr, **kwargs) + + try: + new_data = self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + new_data = self[res] + + if inplace: + self._update_inplace(new_data) + else: + return new_data + + def eval(self, expr, inplace=False, **kwargs): + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + See the documentation for :func:`eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ndarray, scalar, or pandas object + The result of the evaluation. + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.assign : Can evaluate an expression or function to create new + values for a column. + eval : Evaluate a Python expression as a string using various + backends. + + Notes + ----- + For more details see the API documentation for :func:`~eval`. + For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + """ + from pandas.core.computation.eval import eval as _eval + + inplace = validate_bool_kwarg(inplace, "inplace") + resolvers = kwargs.pop("resolvers", None) + kwargs["level"] = kwargs.pop("level", 0) + 1 + if resolvers is None: + index_resolvers = self._get_index_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() + resolvers = column_resolvers, index_resolvers + if "target" not in kwargs: + kwargs["target"] = self + kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) + + return _eval(expr, inplace=inplace, **kwargs) + + def select_dtypes(self, include=None, exclude=None) -> "DataFrame": + """ + Return a subset of the DataFrame's columns based on the column dtypes. + + Parameters + ---------- + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. 
+ + Returns + ------- + DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + + Notes + ----- + * To select all *numeric* types, use ``np.number`` or ``'number'`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + * To select datetimes, use ``np.datetime64``, ``'datetime'`` or + ``'datetime64'`` + * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or + ``'timedelta64'`` + * To select Pandas categorical dtypes, use ``'category'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in + 0.20.0) or ``'datetime64[ns, tz]'`` + + Examples + -------- + >>> df = pd.DataFrame({'a': [1, 2] * 3, + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df + a b c + 0 1 True 1.0 + 1 2 False 2.0 + 2 1 True 1.0 + 3 2 False 2.0 + 4 1 True 1.0 + 5 2 False 2.0 + + >>> df.select_dtypes(include='bool') + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + + >>> df.select_dtypes(include=['float64']) + c + 0 1.0 + 1 2.0 + 2 1.0 + 3 2.0 + 4 1.0 + 5 2.0 + + >>> df.select_dtypes(exclude=['int']) + b c + 0 True 1.0 + 1 False 2.0 + 2 True 1.0 + 3 False 2.0 + 4 True 1.0 + 5 False 2.0 + """ + + if not is_list_like(include): + include = (include,) if include is not None else () + if not is_list_like(exclude): + exclude = (exclude,) if exclude is not None else () + + selection = (frozenset(include), frozenset(exclude)) + + if not any(selection): + raise ValueError("at least one of include or exclude must be nonempty") + + # convert the myriad valid dtypes object to a single representation + include = frozenset(infer_dtype_from_object(x) for x in include) + exclude = frozenset(infer_dtype_from_object(x) for x in exclude) + for dtypes in (include, exclude): + invalidate_string_dtypes(dtypes) + + # can't both include AND exclude! + if not include.isdisjoint(exclude): + raise ValueError(f"include and exclude overlap on {(include & exclude)}") + + # We raise when both include and exclude are empty + # Hence, we can just shrink the columns we want to keep + keep_these = np.full(self.shape[1], True) + + def extract_unique_dtypes_from_dtypes_set( + dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray + ) -> List[Dtype]: + extracted_dtypes = [ + unique_dtype + for unique_dtype in unique_dtypes + if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore + ] + return extracted_dtypes + + unique_dtypes = self.dtypes.unique() + + if include: + included_dtypes = extract_unique_dtypes_from_dtypes_set( + include, unique_dtypes + ) + keep_these &= self.dtypes.isin(included_dtypes) + + if exclude: + excluded_dtypes = extract_unique_dtypes_from_dtypes_set( + exclude, unique_dtypes + ) + keep_these &= ~self.dtypes.isin(excluded_dtypes) + + return self.iloc[:, keep_these.values] + + def insert(self, loc, column, value, allow_duplicates=False) -> None: + """ + Insert column into DataFrame at specified location. + + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + column : str, number, or hashable object + Label of the inserted column. 
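`select_dtypes` above works by comparing each unique column dtype against the include/exclude sets, so generic selectors such as `'number'` or `'datetime'` pick up every matching concrete dtype. A brief sketch with made-up columns.

```python
import pandas as pd

df = pd.DataFrame({
    "i": [1, 2],
    "f": [1.5, 2.5],
    "s": ["x", "y"],
    "t": pd.to_datetime(["2020-01-01", "2020-01-02"]),
})

# 'number' expands to every numeric dtype (here int64 and float64).
print(df.select_dtypes(include="number").columns.tolist())            # ['i', 'f']

# Selectors can be combined; overlapping include/exclude raises ValueError.
print(df.select_dtypes(include=["datetime", "object"]).columns.tolist())  # ['s', 't']
```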
+ value : int, Series, or array-like + allow_duplicates : bool, optional + """ + self._ensure_valid_index(value) + value = self._sanitize_column(column, value, broadcast=False) + self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) + + def assign(self, **kwargs) -> "DataFrame": + r""" + Assign new columns to a DataFrame. + + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + .. versionchanged:: 0.23.0 + + Keyword argument order is maintained. + + Examples + -------- + >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, + ... index=['Portland', 'Berkeley']) + >>> df + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + data = self.copy() + + for k, v in kwargs.items(): + data[k] = com.apply_if_callable(v, data) + return data + + def _sanitize_column(self, key, value, broadcast=True): + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied and converted into an array. + + Parameters + ---------- + key : object + value : scalar, Series, or array-like + broadcast : bool, default True + If ``key`` matches multiple duplicate column names in the + DataFrame, this parameter indicates whether ``value`` should be + tiled so that the returned array contains a (duplicated) column for + each occurrence of the key. If False, ``value`` will not be tiled. 
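`insert` places a column at an explicit position and, unlike `__setitem__`, refuses duplicate labels unless `allow_duplicates=True`; `assign` returns a copy and lets later keyword arguments see columns created by earlier ones. A short sketch with invented column names.

```python
import pandas as pd

df = pd.DataFrame({"b": [1, 2], "d": [3, 4]})

# Insert 'a' as the first column and 'c' between 'b' and 'd'.
df.insert(0, "a", [10, 20])
df.insert(2, "c", df["b"] * 100)
print(df.columns.tolist())    # ['a', 'b', 'c', 'd']

# assign() works on a copy; 'half' can already refer to 'total'
# because keyword arguments are applied left to right.
out = df.assign(total=lambda x: x.sum(axis=1),
                half=lambda x: x["total"] / 2)
print(out)
```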
+ + Returns + ------- + numpy.ndarray + """ + + def reindexer(value): + # reindex if necessary + + if value.index.equals(self.index) or not len(self.index): + value = value._values.copy() + else: + + # GH 4107 + try: + value = value.reindex(self.index)._values + except ValueError as err: + # raised in MultiIndex.from_tuples, see test_insert_error_msmgs + if not value.index.is_unique: + # duplicate axis + raise err + + # other + raise TypeError( + "incompatible index of inserted column with frame index" + ) + return value + + if isinstance(value, Series): + value = reindexer(value) + + elif isinstance(value, DataFrame): + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, ABCMultiIndex) and key in self.columns: + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, Series, np.ndarray, Index)): + cols = maybe_droplevels(self.columns[loc], key) + if len(cols) and not cols.equals(value.columns): + value = value.reindex(cols, axis=1) + # now align rows + value = reindexer(value).T + + elif isinstance(value, ExtensionArray): + # Explicitly copy here, instead of in sanitize_index, + # as sanitize_index won't copy an EA, even with copy=True + value = value.copy() + value = sanitize_index(value, self.index, copy=False) + + elif isinstance(value, Index) or is_sequence(value): + + # turn me into an ndarray + value = sanitize_index(value, self.index, copy=False) + if not isinstance(value, (np.ndarray, Index)): + if isinstance(value, list) and len(value) > 0: + value = maybe_convert_platform(value) + else: + value = com.asarray_tuplesafe(value) + elif value.ndim == 2: + value = value.copy().T + elif isinstance(value, Index): + value = value.copy(deep=True) + else: + value = value.copy() + + # possibly infer to datetimelike + if is_object_dtype(value.dtype): + value = maybe_infer_to_datetimelike(value) + + else: + # cast ignores pandas dtypes. so save the dtype first + infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + + # upcast + value = cast_scalar_to_array(len(self.index), value) + value = maybe_cast_to_datetime(value, infer_dtype) + + # return internal types directly + if is_extension_array_dtype(value): + return value + + # broadcast across multiple columns if necessary + if broadcast and key in self.columns and value.ndim == 1: + if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): + existing_piece = self[key] + if isinstance(existing_piece, DataFrame): + value = np.tile(value, (len(existing_piece.columns), 1)) + + return np.atleast_2d(np.asarray(value)) + + @property + def _series(self): + return { + item: Series(self._data.iget(idx), index=self.index, name=item) + for idx, item in enumerate(self.columns) + } + + def lookup(self, row_labels, col_labels) -> np.ndarray: + """ + Label-based "fancy indexing" function for DataFrame. + + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair. + + Parameters + ---------- + row_labels : sequence + The row labels to use for lookup. + col_labels : sequence + The column labels to use for lookup. 
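The `reindexer` helper above is why assigning a `Series` to a column aligns on the index rather than on position: labels missing from the frame become `NaN`, and extra labels in the value are dropped. A tiny demonstration with arbitrary labels.

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])

# The Series is reindexed to df.index: 'z' has no match (NaN) and 'w'
# is silently discarded.
df["b"] = pd.Series([10, 20, 30], index=["y", "x", "w"])
print(df)
#    a     b
# x  1  20.0
# y  2  10.0
# z  3   NaN
```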
+ + Returns + ------- + numpy.ndarray + + Examples + -------- + values : ndarray + The found values + """ + n = len(row_labels) + if n != len(col_labels): + raise ValueError("Row labels must have same size as column labels") + + thresh = 1000 + if not self._is_mixed_type or n > thresh: + values = self.values + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise KeyError("One or more row labels was not found") + if (cidx == -1).any(): + raise KeyError("One or more column labels was not found") + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] + else: + result = np.empty(n, dtype="O") + for i, (r, c) in enumerate(zip(row_labels, col_labels)): + result[i] = self._get_value(r, c) + + if is_object_dtype(result): + result = lib.maybe_convert_objects(result) + + return result + + # ---------------------------------------------------------------------- + # Reindexing and alignment + + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + frame = self + + columns = axes["columns"] + if columns is not None: + frame = frame._reindex_columns( + columns, method, copy, level, fill_value, limit, tolerance + ) + + index = axes["index"] + if index is not None: + frame = frame._reindex_index( + index, method, copy, level, fill_value, limit, tolerance + ) + + return frame + + def _reindex_index( + self, + new_index, + method, + copy, + level, + fill_value=np.nan, + limit=None, + tolerance=None, + ): + new_index, indexer = self.index.reindex( + new_index, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {0: [new_index, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_columns( + self, + new_columns, + method, + copy, + level, + fill_value=None, + limit=None, + tolerance=None, + ): + new_columns, indexer = self.columns.reindex( + new_columns, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {1: [new_columns, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": + """ + We are guaranteed non-Nones in the axes. 
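`lookup` above vectorises "one value per row, possibly from a different column each time", which is exactly the flat-index trick the fast path uses. A minimal usage sketch, with the equivalent spelled out via positional indexers for comparison; the data is illustrative.

```python
import pandas as pd

df = pd.DataFrame({"jan": [1, 2, 3], "feb": [4, 5, 6]},
                  index=["a", "b", "c"])
rows = ["a", "b", "c"]
cols = ["feb", "jan", "feb"]

# One looked-up value per (row, column) pair.
print(df.lookup(rows, cols))        # [4 2 6]

# Equivalent hand-rolled version using positional indexers.
ridx = df.index.get_indexer(rows)
cidx = df.columns.get_indexer(cols)
print(df.to_numpy()[ridx, cidx])    # [4 2 6]
```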
+ """ + + new_index, row_indexer = self.index.reindex(axes["index"]) + new_columns, col_indexer = self.columns.reindex(axes["columns"]) + + if row_indexer is not None and col_indexer is not None: + indexer = row_indexer, col_indexer + new_values = algorithms.take_2d_multi( + self.values, indexer, fill_value=fill_value + ) + return self._constructor(new_values, index=new_index, columns=new_columns) + else: + return self._reindex_with_indexers( + {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value, + ) + + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ) -> "DataFrame": + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.reindex.__doc__) + @rewrite_axis_style_signature( + "labels", + [ + ("method", None), + ("copy", True), + ("level", None), + ("fill_value", np.nan), + ("limit", None), + ("tolerance", None), + ], + ) + def reindex(self, *args, **kwargs) -> "DataFrame": + axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") + kwargs.update(axes) + # Pop these, since the values are in `kwargs` under different names + kwargs.pop("axis", None) + kwargs.pop("labels", None) + return self._ensure_type(super().reindex(**kwargs)) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """ + Drop specified labels from rows or columns. + + Remove rows or columns by specifying label names and corresponding + axis, or by specifying directly index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + + .. versionadded:: 0.21.0 + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + + .. versionadded:: 0.21.0 + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + DataFrame + DataFrame without the removed index or column labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns. + Series.drop : Return Series with specified index labels removed. + + Examples + -------- + >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), + ... 
columns=['A', 'B', 'C', 'D']) + >>> df + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + Drop a row by index + + >>> df.drop([0, 1]) + A B C D + 2 8 9 10 11 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... [1, 0.8], [0.3, 0.2]]) + >>> df + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + + >>> df.drop(index='cow', columns='small') + big + lama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + + >>> df.drop(index='length', level=1) + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + @rewrite_axis_style_signature( + "mapper", + [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], + ) + def rename( + self, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional["DataFrame"]: + + """ + Alter axes labels. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + See the :ref:`user guide ` for more. + + Parameters + ---------- + mapper : dict-like or function + Dict-like or functions transformations to apply to + that axis' values. Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` and + ``columns``. + index : dict-like or function + Alternative to specifying axis (``mapper, axis=0`` + is equivalent to ``index=mapper``). + columns : dict-like or function + Alternative to specifying axis (``mapper, axis=1`` + is equivalent to ``columns=mapper``). + axis : int or str + Axis to target with ``mapper``. Can be either the axis name + ('index', 'columns') or number (0, 1). The default is 'index'. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Whether to return a new DataFrame. If True then value of copy is + ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + DataFrame + DataFrame with the renamed axis labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + DataFrame.rename_axis : Set the name of the axis. 
+ + Examples + -------- + + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Rename columns using a mapping: + + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: "x", 1: "y", 2: "z"}) + A B + x 1 4 + y 2 5 + z 3 6 + + Cast index labels to a different type: + + >>> df.index + RangeIndex(start=0, stop=3, step=1) + >>> df.rename(index=str).index + Index(['0', '1', '2'], dtype='object') + + >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") + Traceback (most recent call last): + KeyError: ['C'] not found in axis + + Using axis-style parameters + + >>> df.rename(str.lower, axis='columns') + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index') + A B + 0 1 4 + 2 2 5 + 4 3 6 + """ + return super().rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) + + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.fillna.__doc__) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Optional["DataFrame"]: + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + return self._ensure_type( + super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + ) + + def set_index( + self, keys, drop=True, append=False, inplace=False, verify_integrity=False + ): + """ + Set the DataFrame index using existing columns. + + Set the DataFrame index (row labels) using one or more existing + columns or arrays (of the correct length). The index can replace the + existing index or expand on it. + + Parameters + ---------- + keys : label or array-like or list of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and + instances of :class:`~collections.abc.Iterator`. + drop : bool, default True + Delete columns to be used as the new index. + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + verify_integrity : bool, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. Setting to False will improve the performance of this + method. + + Returns + ------- + DataFrame + Changed row labels. + + See Also + -------- + DataFrame.reset_index : Opposite of set_index. + DataFrame.reindex : Change to new indices or expand indices. 
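Several of the thin wrappers above (`fillna`, `replace`, `shift`) only forward to the shared `NDFrame` implementations, so their frame-level behaviour is easiest to see from a quick usage sketch; the data here is illustrative.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})

# Column-wise fill values via a dict of column -> fill value.
print(df.fillna({"a": 0, "b": -1}))

# replace() matches values (here exact scalars), not labels.
print(df.replace({3.0: 30.0, 6.0: 60.0}))

# shift() moves data along the index; fill_value plugs the hole that
# would otherwise become NaN.
print(df.shift(1, fill_value=0))
```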
+ DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 + + Set the index to become the 'month' column: + + >>> df.set_index('month') + year sale + month + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + Create a MultiIndex using columns 'year' and 'month': + + >>> df.set_index(['year', 'month']) + sale + year month + 2012 1 55 + 2014 4 40 + 2013 7 84 + 2014 10 31 + + Create a MultiIndex using an Index and a column: + + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + month sale + year + 1 2012 1 55 + 2 2014 4 40 + 3 2013 7 84 + 4 2014 10 31 + + Create a MultiIndex using two Series: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not isinstance(keys, list): + keys = [keys] + + err_msg = ( + 'The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays." + ) + + missing: List[Optional[Hashable]] = [] + for col in keys: + if isinstance( + col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) + ): + # arrays are fine as long as they are one-dimensional + # iterators get converted to list below + if getattr(col, "ndim", 1) != 1: + raise ValueError(err_msg) + else: + # everything else gets tried as a key; see GH 24969 + try: + found = col in self.columns + except TypeError: + raise TypeError(f"{err_msg}. Received column of type {type(col)}") + else: + if not found: + missing.append(col) + + if missing: + raise KeyError(f"None of {missing} are in the columns") + + if inplace: + frame = self + else: + frame = self.copy() + + arrays = [] + names = [] + if append: + names = list(self.index.names) + if isinstance(self.index, ABCMultiIndex): + for i in range(self.index.nlevels): + arrays.append(self.index._get_level_values(i)) + else: + arrays.append(self.index) + + to_remove: List[Optional[Hashable]] = [] + for col in keys: + if isinstance(col, ABCMultiIndex): + for n in range(col.nlevels): + arrays.append(col._get_level_values(n)) + names.extend(col.names) + elif isinstance(col, (ABCIndexClass, ABCSeries)): + # if Index then not MultiIndex (treated above) + arrays.append(col) + names.append(col.name) + elif isinstance(col, (list, np.ndarray)): + arrays.append(col) + names.append(None) + elif isinstance(col, abc.Iterator): + arrays.append(list(col)) + names.append(None) + # from here, col can only be a column label + else: + arrays.append(frame[col]._values) + names.append(col) + if drop: + to_remove.append(col) + + if len(arrays[-1]) != len(self): + # check newest element against length of calling frame, since + # ensure_index_from_sequences would not raise for append=False. 
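The loop above accepts a mix of column labels, `Series`/`Index` objects, plain arrays, and iterators as `keys`, and `append=True` keeps the existing index levels in front. A short sketch of those combinations, plus the duplicate check that `verify_integrity=True` switches on; the extra `region` array is invented for the example.

```python
import pandas as pd

df = pd.DataFrame({"month": [1, 4, 7, 10],
                   "year": [2012, 2014, 2013, 2014],
                   "sale": [55, 40, 84, 31]})

# Mix a plain column label with an external list; append=True keeps the
# original RangeIndex as the first level.
region = ["north", "north", "south", "south"]
print(df.set_index(["year", region], append=True).index.names)

# verify_integrity=True raises if the resulting index has duplicates.
try:
    df.set_index("year", verify_integrity=True)
except ValueError as err:
    print(err)    # Index has duplicate keys: ...
```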
+ raise ValueError( + f"Length mismatch: Expected {len(self)} rows, " + f"received array of length {len(arrays[-1])}" + ) + + index = ensure_index_from_sequences(arrays, names) + + if verify_integrity and not index.is_unique: + duplicates = index[index.duplicated()].unique() + raise ValueError(f"Index has duplicate keys: {duplicates}") + + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): + del frame[c] + + # clear up memory usage + index._cleanup() + + frame.index = index + + if not inplace: + return frame + + def reset_index( + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = None, + drop: bool = False, + inplace: bool = False, + col_level: Hashable = 0, + col_fill: Optional[Hashable] = "", + ) -> Optional["DataFrame"]: + """ + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default. + drop : bool, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + + Returns + ------- + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), + ... ('mammal', np.nan)], + ... index=['falcon', 'parrot', 'lion', 'monkey'], + ... columns=('class', 'max_speed')) + >>> df + class max_speed + falcon bird 389.0 + parrot bird 24.0 + lion mammal 80.5 + monkey mammal NaN + + When we reset the index, the old index is added as a column, and a + new sequential index is used: + + >>> df.reset_index() + index class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + We can use the `drop` parameter to avoid the old index being added as + a column: + + >>> df.reset_index(drop=True) + class max_speed + 0 bird 389.0 + 1 bird 24.0 + 2 mammal 80.5 + 3 mammal NaN + + You can also use `reset_index` with `MultiIndex`. + + >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), + ... ('bird', 'parrot'), + ... ('mammal', 'lion'), + ... ('mammal', 'monkey')], + ... names=['class', 'name']) + >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), + ... ('species', 'type')]) + >>> df = pd.DataFrame([(389.0, 'fly'), + ... ( 24.0, 'fly'), + ... ( 80.5, 'run'), + ... (np.nan, 'jump')], + ... index=index, + ... 
columns=columns) + >>> df + speed species + max type + class name + bird falcon 389.0 fly + parrot 24.0 fly + mammal lion 80.5 run + monkey NaN jump + + If the index has multiple levels, we can reset a subset of them: + + >>> df.reset_index(level='class') + class speed species + max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we are not dropping the index, by default, it is placed in the top + level. We can place it in another level: + + >>> df.reset_index(level='class', col_level=1) + speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + When the index is inserted under another level, we can specify under + which one with the parameter `col_fill`: + + >>> df.reset_index(level='class', col_level=1, col_fill='species') + species speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + + If we specify a nonexistent level for `col_fill`, it is created: + + >>> df.reset_index(level='class', col_level=1, col_fill='genus') + genus speed species + class max type + name + falcon bird 389.0 fly + parrot bird 24.0 fly + lion mammal 80.5 run + monkey mammal NaN jump + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + new_obj = self + else: + new_obj = self.copy() + + def _maybe_casted_values(index, labels=None): + values = index._values + if not isinstance(index, (PeriodIndex, DatetimeIndex)): + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + + # if we have the labels, extract the values with a mask + if labels is not None: + mask = labels == -1 + + # we can have situations where the whole mask is -1, + # meaning there is nothing found in labels, so make all nan's + if mask.all(): + values = np.empty(len(mask)) + values.fill(np.nan) + else: + values = values.take(labels) + + # TODO(https://github.com/pandas-dev/pandas/issues/24206) + # Push this into maybe_upcast_putmask? + # We can't pass EAs there right now. Looks a bit + # complicated. + # So we unbox the ndarray_values, op, re-box. 
+ values_type = type(values) + values_dtype = values.dtype + + if issubclass(values_type, DatetimeLikeArray): + values = values._data + + if mask.any(): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + + if issubclass(values_type, DatetimeLikeArray): + values = values_type(values, dtype=values_dtype) + + return values + + new_index = ibase.default_index(len(new_obj)) + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) + + if not drop: + to_insert: Iterable[Tuple[Any, Optional[Any]]] + if isinstance(self.index, ABCMultiIndex): + names = [ + (n if n is not None else f"level_{i}") + for i, n in enumerate(self.index.names) + ] + to_insert = zip(self.index.levels, self.index.codes) + else: + default = "index" if "index" not in self else "level_0" + names = [default] if self.index.name is None else [self.index.name] + to_insert = ((self.index, None),) + + multi_col = isinstance(self.columns, ABCMultiIndex) + for i, (lev, lab) in reversed(list(enumerate(to_insert))): + if not (level is None or i in level): + continue + name = names[i] + if multi_col: + col_name = list(name) if isinstance(name, tuple) else [name] + if col_fill is None: + if len(col_name) not in (1, self.columns.nlevels): + raise ValueError( + "col_fill=None is incompatible " + f"with incomplete column name {name}" + ) + col_fill = col_name[0] + + lev_num = self.columns._get_level_number(col_level) + name_lst = [col_fill] * lev_num + col_name + missing = self.columns.nlevels - len(name_lst) + name_lst += [col_fill] * missing + name = tuple(name_lst) + # to ndarray and maybe infer different dtype + level_values = _maybe_casted_values(lev, lab) + new_obj.insert(0, name, level_values) + + new_obj.index = new_index + if not inplace: + return new_obj + + return None + + # ---------------------------------------------------------------------- + # Reindex-based selection methods + + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + def isna(self) -> "DataFrame": + return super().isna() + + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + def isnull(self) -> "DataFrame": + return super().isnull() + + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + def notna(self) -> "DataFrame": + return super().notna() + + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + def notnull(self) -> "DataFrame": + return super().notnull() + + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): + """ + Remove missing values. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing value. + + .. versionchanged:: 1.0.0 + + Pass tuple or list to drop on multiple axes. + Only a single axis is allowed. + + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + + thresh : int, optional + Require that many non-NA values. + subset : array-like, optional + Labels along other axis to consider, e.g. 
if you are dropping rows + these would be a list of columns to include. + inplace : bool, default False + If True, do operation inplace and return None. + + Returns + ------- + DataFrame + DataFrame with NA entries dropped from it. + + See Also + -------- + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), + ... pd.NaT]}) + >>> df + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Drop the rows where at least one element is missing. + + >>> df.dropna() + name toy born + 1 Batman Batmobile 1940-04-25 + + Drop the columns where at least one element is missing. + + >>> df.dropna(axis='columns') + name + 0 Alfred + 1 Batman + 2 Catwoman + + Drop the rows where all elements are missing. + + >>> df.dropna(how='all') + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'born']) + name toy born + 1 Batman Batmobile 1940-04-25 + + Keep the DataFrame with valid entries in the same variable. + + >>> df.dropna(inplace=True) + >>> df + name toy born + 1 Batman Batmobile 1940-04-25 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(axis, (tuple, list)): + # GH20987 + raise TypeError("supplying multiple axes to axis is no longer supported.") + + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + ax = self._get_axis(agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + agg_obj = self.take(indices, axis=agg_axis) + + count = agg_obj.count(axis=agg_axis) + + if thresh is not None: + mask = count >= thresh + elif how == "any": + mask = count == len(agg_obj._get_axis(agg_axis)) + elif how == "all": + mask = count > 0 + else: + if how is not None: + raise ValueError(f"invalid how option: {how}") + else: + raise TypeError("must specify how or thresh") + + result = self.loc(axis=axis)[mask] + + if inplace: + self._update_inplace(result) + else: + return result + + def drop_duplicates( + self, + subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + keep: Union[str, bool] = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> Optional["DataFrame"]: + """ + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time indexes + are ignored. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : bool, default False + Whether to drop duplicates in place or to return a copy. 
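In the implementation above, `how` and `thresh` both reduce to a single count threshold: `how='any'` keeps rows whose non-NA count equals the full width, `how='all'` keeps rows with at least one non-NA value, and an explicit `thresh` overrides `how`. A small check of that equivalence, using a throwaway frame.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, np.nan, np.nan],
                   "b": [2, 3, np.nan],
                   "c": [np.nan, 4, np.nan]})

# how='any' is the same as requiring a full row of non-NA values...
assert df.dropna(how="any").equals(df.dropna(thresh=df.shape[1]))

# ...and how='all' the same as requiring at least one non-NA value.
assert df.dropna(how="all").equals(df.dropna(thresh=1))

# subset restricts which columns are counted.
print(df.dropna(subset=["a", "b"]))
```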
+ ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + DataFrame + DataFrame with duplicates removed or None if ``inplace=True``. + """ + if self.empty: + return self.copy() + + inplace = validate_bool_kwarg(inplace, "inplace") + duplicated = self.duplicated(subset, keep=keep) + + if inplace: + (inds,) = (-duplicated)._ndarray_values.nonzero() + new_data = self._data.take(inds) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) + self._update_inplace(new_data) + else: + result = self[-duplicated] + + if ignore_index: + result.index = ibase.default_index(len(result)) + return result + + return None + + def duplicated( + self, + subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + keep: Union[str, bool] = "first", + ) -> "Series": + """ + Return boolean Series denoting duplicate rows. + + Considering certain columns is optional. + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + Series + """ + from pandas.core.sorting import get_group_index + from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + + if self.empty: + return Series(dtype=bool) + + def f(vals): + labels, shape = algorithms.factorize( + vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + ) + return labels.astype("i8", copy=False), len(shape) + + if subset is None: + subset = self.columns + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) + + # needed for mypy since can't narrow types using np.iterable + subset = cast(Iterable, subset) + + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. 
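+        # e.g. df.duplicated(subset=["c"]) on a frame whose columns are ["a", "b"]
+        # raises a KeyError listing the missing label(s)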
+ diff = Index(subset).difference(self.columns) + if not diff.empty: + raise KeyError(diff) + + vals = (col.values for name, col in self.items() if name in subset) + labels, shape = map(list, zip(*map(f, vals))) + + ids = get_group_index(labels, shape, sort=False, xnull=False) + return Series(duplicated_int64(ids, keep), index=self.index) + + # ---------------------------------------------------------------------- + # Sorting + + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.sort_values.__doc__) + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + + if not isinstance(by, list): + by = [by] + if is_sequence(ascending) and len(by) != len(ascending): + raise ValueError( + f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" + ) + if len(by) > 1: + from pandas.core.sorting import lexsort_indexer + + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) + indexer = ensure_platform_int(indexer) + else: + from pandas.core.sorting import nargsort + + by = by[0] + k = self._get_label_or_level_values(by, axis=axis) + + if isinstance(ascending, (tuple, list)): + ascending = ascending[0] + + indexer = nargsort( + k, kind=kind, ascending=ascending, na_position=na_position + ) + + new_data = self._data.take( + indexer, axis=self._get_block_manager_axis(axis), verify=False + ) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + + if inplace: + return self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + @Substitution(**_shared_doc_kwargs) + @Appender(NDFrame.sort_index.__doc__) + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ignore_index: bool = False, + ): + + # TODO: this can be combined with Series.sort_index impl as + # almost identical + + inplace = validate_bool_kwarg(inplace, "inplace") + + axis = self._get_axis_number(axis) + labels = self._get_axis(axis) + + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + labels = labels._sort_levels_monotonic() + if level is not None: + + new_axis, indexer = labels.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + + elif isinstance(labels, ABCMultiIndex): + from pandas.core.sorting import lexsort_indexer + + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) + else: + from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if (ascending and labels.is_monotonic_increasing) or ( + not ascending and labels.is_monotonic_decreasing + ): + if inplace: + return + else: + return self.copy() + + indexer = nargsort( + labels, kind=kind, ascending=ascending, na_position=na_position + ) + + baxis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=baxis, verify=False) + + # reconstruct axis if needed + new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + + if inplace: + return self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + def 
nlargest(self, n, columns, keep="first") -> "DataFrame": + """ + Return the first `n` rows ordered by `columns` in descending order. + + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - `first` : prioritize the first occurrence(s) + - `last` : prioritize the last occurrence(s) + - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + .. versionadded:: 0.24.0 + + Returns + ------- + DataFrame + The first `n` rows ordered by the given columns in descending + order. + + See Also + -------- + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "population". + + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + + When using ``keep='all'``, all duplicate items are maintained: + + >>> df.nlargest(3, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. + + >>> df.nlargest(3, ['population', 'GDP']) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + """ + return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() + + def nsmallest(self, n, columns, keep="first") -> "DataFrame": + """ + Return the first `n` rows ordered by `columns` in ascending order. + + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. 
+ + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of items to retrieve. + columns : list or str + Column name or names to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. + - ``all`` : do not drop any duplicates, even it means + selecting more than `n` items. + + .. versionadded:: 0.24.0 + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Examples + -------- + >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... "Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nsmallest`` to select the + three rows having the smallest values in column "a". + + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 11300 182 NR + + When using ``keep='all'``, all duplicate items are maintained: + + >>> df.nsmallest(3, 'population', keep='all') + population GDP alpha-2 + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + To order by the largest values in column "a" and then "c", we can + specify multiple columns like in the next example. + + >>> df.nsmallest(3, ['population', 'GDP']) + population GDP alpha-2 + Tuvalu 11300 38 TV + Nauru 11300 182 NR + Anguilla 11300 311 AI + """ + return algorithms.SelectNFrame( + self, n=n, keep=keep, columns=columns + ).nsmallest() + + def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": + """ + Swap levels i and j in a MultiIndex on a particular axis. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + + Returns + ------- + DataFrame + """ + result = self.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + result.index = result.index.swaplevel(i, j) + else: + result.columns = result.columns.swaplevel(i, j) + return result + + def reorder_levels(self, order, axis=0) -> "DataFrame": + """ + Rearrange index levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). + axis : int + Where to reorder levels. 
+ + Returns + ------- + DataFrame + """ + axis = self._get_axis_number(axis) + if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover + raise TypeError("Can only reorder levels on a hierarchical axis.") + + result = self.copy() + + if axis == 0: + result.index = result.index.reorder_levels(order) + else: + result.columns = result.columns.reorder_levels(order) + return result + + # ---------------------------------------------------------------------- + # Arithmetic / combination related + + def _combine_frame(self, other, func, fill_value=None, level=None): + # at this point we have `self._indexed_same(other)` + + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) + + if ops.should_series_dispatch(self, other, func): + # iterate over columns + new_data = ops.dispatch_to_series(self, other, _arith_op) + else: + with np.errstate(all="ignore"): + res_values = _arith_op(self.values, other.values) + new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) + + return new_data + + def _combine_match_index(self, other, func): + # at this point we have `self.index.equals(other.index)` + + if ops.should_series_dispatch(self, other, func): + # operate column-wise; avoid costly object-casting in `.values` + new_data = ops.dispatch_to_series(self, other, func) + else: + # fastpath --> operate directly on values + with np.errstate(all="ignore"): + new_data = func(self.values.T, other.values).T + return new_data + + def _construct_result(self, result) -> "DataFrame": + """ + Wrap the result of an arithmetic, comparison, or logical operation. + + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + """ + out = self._constructor(result, index=self.index, copy=False) + # Pin columns instead of passing to constructor for compat with + # non-unique columns case + out.columns = self.columns + return out + + def combine( + self, other: "DataFrame", func, fill_value=None, overwrite=True + ) -> "DataFrame": + """ + Perform column-wise combine with another DataFrame. + + Combines a DataFrame with `other` DataFrame using `func` + to element-wise combine columns. The row and column indexes of the + resulting DataFrame will be the union of the two. + + Parameters + ---------- + other : DataFrame + The DataFrame to merge column-wise. + func : function + Function that takes two series as inputs and return a Series or a + scalar. Used to merge the two dataframes column by columns. + fill_value : scalar value, default None + The value to fill NaNs with prior to passing any column to the + merge func. + overwrite : bool, default True + If True, columns in `self` that do not exist in `other` will be + overwritten with NaNs. + + Returns + ------- + DataFrame + Combination of the provided DataFrames. + + See Also + -------- + DataFrame.combine_first : Combine two DataFrame objects and default to + non-null values in frame calling the method. + + Examples + -------- + Combine using a simple function that chooses the smaller column. 
+ + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + Example using a true element-wise combine function. + + >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, np.minimum) + A B + 0 1 2 + 1 0 3 + + Using `fill_value` fills Nones prior to passing the column to the + merge function. + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, take_smaller, fill_value=-5) + A B + 0 0 -5.0 + 1 0 4.0 + + However, if the same element in both dataframes is None, that None + is preserved + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) + >>> df1.combine(df2, take_smaller, fill_value=-5) + A B + 0 0 -5.0 + 1 0 3.0 + + Example that demonstrates the use of `overwrite` and behavior when + the axis differ between the dataframes. + + >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) + >>> df1.combine(df2, take_smaller) + A B C + 0 NaN NaN NaN + 1 NaN 3.0 -10.0 + 2 NaN 3.0 1.0 + + >>> df1.combine(df2, take_smaller, overwrite=False) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 -10.0 + 2 NaN 3.0 1.0 + + Demonstrating the preference of the passed in dataframe. + + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) + >>> df2.combine(df1, take_smaller) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 NaN + 2 NaN 3.0 NaN + + >>> df2.combine(df1, take_smaller, overwrite=False) + A B C + 0 0.0 NaN NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + other_idxlen = len(other.index) # save for compare + + this, other = self.align(other, copy=False) + new_index = this.index + + if other.empty and len(new_index) == len(self.index): + return self.copy() + + if self.empty and len(other) == other_idxlen: + return other.copy() + + # sorts if possible + new_columns = this.columns.union(other.columns) + do_fill = fill_value is not None + result = {} + for col in new_columns: + series = this[col] + otherSeries = other[col] + + this_dtype = series.dtype + other_dtype = otherSeries.dtype + + this_mask = isna(series) + other_mask = isna(otherSeries) + + # don't overwrite columns unnecessarily + # DO propagate if this column is not in the intersection + if not overwrite and other_mask.all(): + result[col] = this[col].copy() + continue + + if do_fill: + series = series.copy() + otherSeries = otherSeries.copy() + series[this_mask] = fill_value + otherSeries[other_mask] = fill_value + + if col not in self.columns: + # If self DataFrame does not have col in other DataFrame, + # try to promote series, which is all NaN, as other_dtype. + new_dtype = other_dtype + try: + series = series.astype(new_dtype, copy=False) + except ValueError: + # e.g. 
new_dtype is integer types + pass + else: + # if we have different dtypes, possibly promote + new_dtype = find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) + + arr = func(series, otherSeries) + arr = maybe_downcast_to_dtype(arr, this_dtype) + + result[col] = arr + + # convert_objects just in case + return self._constructor(result, index=new_index, columns=new_columns) + + def combine_first(self, other: "DataFrame") -> "DataFrame": + """ + Update null elements with value in the same location in `other`. + + Combine two DataFrame objects by filling null values in one DataFrame + with non-null values from other DataFrame. The row and column indexes + of the resulting DataFrame will be the union of the two. + + Parameters + ---------- + other : DataFrame + Provided DataFrame to use to fill null values. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.combine : Perform series-wise operation on two DataFrames + using a given function. + + Examples + -------- + + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + Null values still persist if the location of that null value + does not exist in `other` + + >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) + >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) + >>> df1.combine_first(df2) + A B C + 0 NaN 4.0 NaN + 1 0.0 3.0 1.0 + 2 NaN 3.0 1.0 + """ + import pandas.core.computation.expressions as expressions + + def extract_values(arr): + # Does two things: + # 1. maybe gets the values from the Series / Index + # 2. convert datelike to i8 + if isinstance(arr, (ABCIndexClass, ABCSeries)): + arr = arr._values + + if needs_i8_conversion(arr): + if is_extension_array_dtype(arr.dtype): + arr = arr.asi8 + else: + arr = arr.view("i8") + return arr + + def combiner(x, y): + mask = isna(x) + if isinstance(mask, (ABCIndexClass, ABCSeries)): + mask = mask._values + + x_values = extract_values(x) + y_values = extract_values(y) + + # If the column y in other DataFrame is not in first DataFrame, + # just return y_values. + if y.name not in self.columns: + return y_values + + return expressions.where(mask, y_values, x_values) + + return self.combine(other, combiner, overwrite=False) + + def update( + self, other, join="left", overwrite=True, filter_func=None, errors="ignore" + ) -> None: + """ + Modify in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. + + Parameters + ---------- + other : DataFrame, or object coercible into a DataFrame + Should have at least one matching index/column label + with the original DataFrame. If a Series is passed, + its name attribute must be set, and that will be + used as the column name to align with the original DataFrame. + join : {'left'}, default 'left' + Only left join is implemented, keeping the index and columns of the + original object. + overwrite : bool, default True + How to handle non-NA values for overlapping keys: + + * True: overwrite original DataFrame's values + with values from `other`. + * False: only update values that are NA in + the original DataFrame. + + filter_func : callable(1d-array) -> bool 1d-array, optional + Can choose to replace values other than NA. Return True for values + that should be updated. 
+ errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and `other` + both contain non-NA data in the same place. + + .. versionchanged:: 0.24.0 + Changed from `raise_conflict=False|True` + to `errors='ignore'|'raise'`. + + Returns + ------- + None : method directly changes calling object + + Raises + ------ + ValueError + * When `errors='raise'` and there's overlapping non-NA data. + * When `errors` is not either `'ignore'` or `'raise'` + NotImplementedError + * If `join != 'left'` + + See Also + -------- + dict.update : Similar method for dictionaries. + DataFrame.merge : For column(s)-on-columns(s) operations. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + The DataFrame's length does not increase as a result of the update, + only values at matching index/column labels are updated. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) + >>> df.update(new_df) + >>> df + A B + 0 a d + 1 b e + 2 c f + + For Series, it's name attribute must be set. + + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) + >>> df.update(new_column) + >>> df + A B + 0 a d + 1 b y + 2 c e + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], + ... 'B': ['x', 'y', 'z']}) + >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) + >>> df.update(new_df) + >>> df + A B + 0 a x + 1 b d + 2 c e + + If `other` contains NaNs the corresponding values are not updated + in the original dataframe. + + >>> df = pd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4.0 + 1 2 500.0 + 2 3 6.0 + """ + import pandas.core.computation.expressions as expressions + + # TODO: Support other joins + if join != "left": # pragma: no cover + raise NotImplementedError("Only left join is supported") + if errors not in ["ignore", "raise"]: + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + + if not isinstance(other, DataFrame): + other = DataFrame(other) + + other = other.reindex_like(self) + + for col in self.columns: + this = self[col]._values + that = other[col]._values + if filter_func is not None: + with np.errstate(all="ignore"): + mask = ~filter_func(this) | isna(that) + else: + if errors == "raise": + mask_this = notna(that) + mask_that = notna(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isna(that) + else: + mask = notna(this) + + # don't overwrite columns unnecessarily + if mask.all(): + continue + + self[col] = expressions.where(mask, this, that) + + # ---------------------------------------------------------------------- + # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... 
['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + + _shared_docs[ + "pivot" + ] = """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation, multiple values will result in a MultiIndex in the + columns. See the :ref:`User Guide ` for more on reshaping. + + Parameters + ----------%s + index : str or object, optional + Column to use to make new frame's index. If None, uses + existing index. + columns : str or object + Column to use to make new frame's columns. + values : str, object or a list of the previous, optional + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + .. versionchanged:: 0.23.0 + Also accept list of column names. + + Returns + ------- + DataFrame + Returns reshaped DataFrame. + + Raises + ------ + ValueError: + When there are any `index`, `columns` combinations with multiple + values. `DataFrame.pivot_table` when you need to aggregate. + + See Also + -------- + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods. + + Examples + -------- + >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', + ... 'two'], + ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], + ... 'baz': [1, 2, 3, 4, 5, 6], + ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + >>> df + foo bar baz zoo + 0 one A 1 x + 1 one B 2 y + 2 one C 3 z + 3 two A 4 q + 4 two B 5 w + 5 two C 6 t + + >>> df.pivot(index='foo', columns='bar', values='baz') + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar')['baz'] + bar A B C + foo + one 1 2 3 + two 4 5 6 + + >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) + baz zoo + bar A B C A B C + foo + one 1 2 3 x y z + two 4 5 6 q w t + + A ValueError is raised if there are any duplicates. + + >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], + ... "bar": ['A', 'A', 'B', 'C'], + ... 
"baz": [1, 2, 3, 4]}) + >>> df + foo bar baz + 0 one A 1 + 1 one A 2 + 2 two B 3 + 3 two C 4 + + Notice that the first two rows are the same for our `index` + and `columns` arguments. + + >>> df.pivot(index='foo', columns='bar', values='baz') + Traceback (most recent call last): + ... + ValueError: Index contains duplicate entries, cannot reshape + """ + + @Substitution("") + @Appender(_shared_docs["pivot"]) + def pivot(self, index=None, columns=None, values=None) -> "DataFrame": + from pandas.core.reshape.pivot import pivot + + return pivot(self, index=index, columns=columns, values=values) + + _shared_docs[ + "pivot_table" + ] = """ + Create a spreadsheet-style pivot table as a DataFrame. + + The levels in the pivot table will be stored in MultiIndex objects + (hierarchical indexes) on the index and columns of the result DataFrame. + + Parameters + ----------%s + values : column to aggregate, optional + index : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. The + list can contain any of the other types (except list). + Keys to group by on the pivot table index. If an array is passed, + it is being used as the same manner as column values. + columns : column, Grouper, array, or list of the previous + If an array is passed, it must be the same length as the data. The + list can contain any of the other types (except list). + Keys to group by on the pivot table column. If an array is passed, + it is being used as the same manner as column values. + aggfunc : function, list of functions, dict, default numpy.mean + If list of functions passed, the resulting pivot table will have + hierarchical columns whose top level are the function names + (inferred from the function objects themselves) + If dict is passed, the key is column to aggregate and value + is function or list of functions. + fill_value : scalar, default None + Value to replace missing values with. + margins : bool, default False + Add all row / columns (e.g. for subtotal / grand totals). + dropna : bool, default True + Do not include columns whose entries are all NaN. + margins_name : str, default 'All' + Name of the row / column that will contain the totals + when margins is True. + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionchanged:: 0.25.0 + + Returns + ------- + DataFrame + An Excel style pivot table. + + See Also + -------- + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + ... "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", + ... "one", "one", "two", "two"], + ... "C": ["small", "large", "large", "small", + ... "small", "large", "small", "small", + ... "large"], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) + >>> df + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... 
columns=['C'], aggfunc=np.sum) + >>> table + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum, fill_value=0) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. + + >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': [min, max, np.mean]}) + >>> table + D E + mean max mean min + A C + bar large 5.500000 9.0 7.500000 6.0 + small 5.500000 9.0 8.500000 8.0 + foo large 2.000000 5.0 4.500000 4.0 + small 2.333333 6.0 4.333333 2.0 + """ + + @Substitution("") + @Appender(_shared_docs["pivot_table"]) + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, + ) -> "DataFrame": + from pandas.core.reshape.pivot import pivot_table + + return pivot_table( + self, + values=values, + index=index, + columns=columns, + aggfunc=aggfunc, + fill_value=fill_value, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) + + def stack(self, level=-1, dropna=True): + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + The new index levels are sorted. + + Parameters + ---------- + level : int, str, list, default -1 + Level(s) to stack from the column axis onto the index + axis, defined as one index or label, or a list of indices + or labels. + dropna : bool, default True + Whether to drop rows in the resulting Frame/Series with + missing values. Stacking a column level onto the index + axis can create combinations of index and column values + that are missing from the original dataframe. See Examples + section. + + Returns + ------- + DataFrame or Series + Stacked dataframe or series. + + See Also + -------- + DataFrame.unstack : Unstack prescribed level(s) from index axis + onto column axis. + DataFrame.pivot : Reshape dataframe from long format to wide + format. + DataFrame.pivot_table : Create a spreadsheet-style pivot table + as a DataFrame. + + Notes + ----- + The function is named by analogy with a collection of books + being reorganized from being side by side on a horizontal + position (the columns of the dataframe) to being stacked + vertically on top of each other (in the index of the + dataframe). + + Examples + -------- + **Single level columns** + + >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], + ... index=['cat', 'dog'], + ... 
columns=['weight', 'height']) + + Stacking a dataframe with a single level column axis returns a Series: + + >>> df_single_level_cols + weight height + cat 0 1 + dog 2 3 + >>> df_single_level_cols.stack() + cat weight 0 + height 1 + dog weight 2 + height 3 + dtype: int64 + + **Multi level columns: simple case** + + >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('weight', 'pounds')]) + >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], + ... index=['cat', 'dog'], + ... columns=multicol1) + + Stacking a dataframe with a multi-level column axis: + + >>> df_multi_level_cols1 + weight + kg pounds + cat 1 2 + dog 2 4 + >>> df_multi_level_cols1.stack() + weight + cat kg 1 + pounds 2 + dog kg 2 + pounds 4 + + **Missing values** + + >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), + ... ('height', 'm')]) + >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + It is common to have missing values when stacking a dataframe + with multi-level columns, as the stacked dataframe typically + has more values than the original dataframe. Missing values + are filled with NaNs: + + >>> df_multi_level_cols2 + weight height + kg m + cat 1.0 2.0 + dog 3.0 4.0 + >>> df_multi_level_cols2.stack() + height weight + cat kg NaN 1.0 + m 2.0 NaN + dog kg NaN 3.0 + m 4.0 NaN + + **Prescribing the level(s) to be stacked** + + The first parameter controls which level or levels are stacked: + + >>> df_multi_level_cols2.stack(0) + kg m + cat height NaN 2.0 + weight 1.0 NaN + dog height NaN 4.0 + weight 3.0 NaN + >>> df_multi_level_cols2.stack([0, 1]) + cat height m 2.0 + weight kg 1.0 + dog height m 4.0 + weight kg 3.0 + dtype: float64 + + **Dropping missing values** + + >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], + ... index=['cat', 'dog'], + ... columns=multicol2) + + Note that rows where all values are missing are dropped by + default but this behaviour can be controlled via the dropna + keyword parameter: + + >>> df_multi_level_cols3 + weight height + kg m + cat NaN 1.0 + dog 2.0 3.0 + >>> df_multi_level_cols3.stack(dropna=False) + height weight + cat kg NaN NaN + m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + >>> df_multi_level_cols3.stack(dropna=True) + height weight + cat m 1.0 NaN + dog kg NaN 2.0 + m 3.0 NaN + """ + from pandas.core.reshape.reshape import stack, stack_multiple + + if isinstance(level, (tuple, list)): + return stack_multiple(self, level, dropna=dropna) + else: + return stack(self, level, dropna=dropna) + + def explode(self, column: Union[str, Tuple]) -> "DataFrame": + """ + Transform each element of a list-like to a row, replicating index values. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + column : str or tuple + Column to explode. + + Returns + ------- + DataFrame + Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + + Raises + ------ + ValueError : + if columns of the frame are not unique. + + See Also + -------- + DataFrame.unstack : Pivot a level of the (necessarily hierarchical) + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + Series.explode : Explode a DataFrame from list-like columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df + A B + 0 [1, 2, 3] 1 + 1 foo 1 + 2 [] 1 + 3 [3, 4] 1 + + >>> df.explode('A') + A B + 0 1 1 + 0 2 1 + 0 3 1 + 1 foo 1 + 2 NaN 1 + 3 3 1 + 3 4 1 + """ + + if not (is_scalar(column) or isinstance(column, tuple)): + raise ValueError("column must be a scalar") + if not self.columns.is_unique: + raise ValueError("columns must be unique") + + df = self.reset_index(drop=True) + # TODO: use overload to refine return type of reset_index + assert df is not None # needed for mypy + result = df[column].explode() + result = df.drop([column], axis=1).join(result) + result.index = self.index.take(result.index) + result = result.reindex(columns=self.columns, copy=False) + + return result + + def unstack(self, level=-1, fill_value=None): + """ + Pivot a level of the (necessarily hierarchical) index labels. + + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + + The level involved will automatically get sorted. + + Parameters + ---------- + level : int, str, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name. + fill_value : int, str or dict + Replace NaN with this value if the unstack produces missing values. + + Returns + ------- + Series or DataFrame + + See Also + -------- + DataFrame.pivot : Pivot a table based on column values. + DataFrame.stack : Pivot a level of the column labels (inverse operation + from `unstack`). + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... ('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + + >>> s.unstack(level=-1) + a b + one 1.0 2.0 + two 3.0 4.0 + + >>> s.unstack(level=0) + one two + a 1.0 3.0 + b 2.0 4.0 + + >>> df = s.unstack(level=0) + >>> df.unstack() + one a 1.0 + b 2.0 + two a 3.0 + b 4.0 + dtype: float64 + """ + from pandas.core.reshape.reshape import unstack + + return unstack(self, level, fill_value) + + _shared_docs[ + "melt" + ] = """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + %(versionadded)s + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + %(other)s + pivot_table + DataFrame.pivot + Series.explode + + Examples + -------- + >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 
'C': {0: 2, 1: 4, 2: 6}}) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> %(caller)sid_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname') + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + If you have multi-index columns: + + >>> df.columns = [list('ABC'), list('DEF')] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ + + @Appender( + _shared_docs["melt"] + % dict( + caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt" + ) + ) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ) -> "DataFrame": + from pandas.core.reshape.melt import melt + + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) + + # ---------------------------------------------------------------------- + # Time series-related + + def diff(self, periods=1, axis=0) -> "DataFrame": + """ + First discrete difference of element. + + Calculates the difference of a DataFrame element compared with another + element in the DataFrame (default is the element in the same column + of the previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Take difference over rows (0) or columns (1). + + Returns + ------- + DataFrame + + See Also + -------- + Series.diff: First discrete difference for a Series. + DataFrame.pct_change: Percent change over given number of periods. + DataFrame.shift: Shift index by desired number of periods with an + optional time freq. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + Difference with previous row + + >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 
'c': [1, 4, 9, 16, 25, 36]}) + >>> df + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1) + a b c + 0 NaN 0.0 0.0 + 1 NaN -1.0 3.0 + 2 NaN -1.0 7.0 + 3 NaN -1.0 13.0 + 4 NaN 0.0 20.0 + 5 NaN 2.0 28.0 + + Difference with 3rd previous row + + >>> df.diff(periods=3) + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1) + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN + """ + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.diff(n=periods, axis=bm_axis) + return self._constructor(new_data) + + # ---------------------------------------------------------------------- + # Function application + + def _gotitem( + self, + key: Union[str, List[str]], + ndim: int, + subset: Optional[Union[Series, ABCDataFrame]] = None, + ) -> Union[Series, ABCDataFrame]: + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + if subset is None: + subset = self + elif subset.ndim == 1: # is Series + return subset + + # TODO: _shallow_copy(subset)? + return subset[key] + + _agg_summary_and_see_also_doc = dedent( + """ + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. + + See Also + -------- + DataFrame.apply : Perform any type of operations. + DataFrame.transform : Perform transformation type operations. + core.groupby.GroupBy : Perform operations over groups. + core.resample.Resampler : Perform operations over resampled bins. + core.window.Rolling : Perform operations over rolling window. + core.window.Expanding : Perform operations over expanding window. + core.window.EWM : Perform operation over exponential weighted + window. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], + ... [4, 5, 6], + ... [7, 8, 9], + ... [np.nan, np.nan, np.nan]], + ... columns=['A', 'B', 'C']) + + Aggregate these functions over the rows. + + >>> df.agg(['sum', 'min']) + A B C + sum 12.0 15.0 18.0 + min 1.0 2.0 3.0 + + Different aggregations per column. + + >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) + A B + max NaN 8.0 + min 1.0 2.0 + sum 12.0 NaN + + Aggregate over the columns. + + >>> df.agg("mean", axis="columns") + 0 2.0 + 1 5.0 + 2 8.0 + 3 NaN + dtype: float64 + """ + ) + + @Substitution( + see_also=_agg_summary_and_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. 
versionadded:: 0.20.0\n", + **_shared_doc_kwargs, + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, axis=0, *args, **kwargs): + axis = self._get_axis_number(axis) + + result = None + try: + result, how = self._aggregate(func, axis=axis, *args, **kwargs) + except TypeError: + pass + if result is None: + return self.apply(func, axis=axis, args=args, **kwargs) + return result + + def _aggregate(self, arg, axis=0, *args, **kwargs): + if axis == 1: + # NDFrame.aggregate returns a tuple, and we need to transpose + # only result + result, how = self.T._aggregate(arg, *args, **kwargs) + result = result.T if result is not None else result + return result, how + return super()._aggregate(arg, *args, **kwargs) + + agg = aggregate + + @Appender(_shared_docs["transform"] % _shared_doc_kwargs) + def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": + axis = self._get_axis_number(axis) + if axis == 1: + return self.T.transform(func, *args, **kwargs).T + return super().transform(func, *args, **kwargs) + + def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): + """ + Apply a function along an axis of the DataFrame. + + Objects passed to the function are Series objects whose index is + either the DataFrame's index (``axis=0``) or the DataFrame's columns + (``axis=1``). By default (``result_type=None``), the final return type + is inferred from the return type of the applied function. Otherwise, + it depends on the `result_type` argument. + + Parameters + ---------- + func : function + Function to apply to each column or row. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + raw : bool, default False + Determines if row or column is passed as a Series or ndarray object: + + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray objects + instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + result_type : {'expand', 'reduce', 'broadcast', None}, default None + These only act when ``axis=1`` (columns): + + * 'expand' : list-like results will be turned into columns. + * 'reduce' : returns a Series if possible rather than expanding + list-like results. This is the opposite of 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the DataFrame, the original index and columns will be + retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. + + .. versionadded:: 0.23.0 + + args : tuple + Positional arguments to pass to `func` in addition to the + array/series. + **kwds + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns + ------- + Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. + + See Also + -------- + DataFrame.applymap: For elementwise operations. + DataFrame.aggregate: Only perform aggregating type operations. + DataFrame.transform: Only perform transforming type operations. 
+ + Examples + -------- + + >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) + >>> df + A B + 0 4 9 + 1 4 9 + 2 4 9 + + Using a numpy universal function (in this case the same as + ``np.sqrt(df)``): + + >>> df.apply(np.sqrt) + A B + 0 2.0 3.0 + 1 2.0 3.0 + 2 2.0 3.0 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 12 + B 27 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 13 + 1 13 + 2 13 + dtype: int64 + + Returning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + dtype: object + + Passing result_type='expand' will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + A B + 0 1 2 + 1 1 2 + 2 1 2 + """ + from pandas.core.apply import frame_apply + + op = frame_apply( + self, + func=func, + axis=axis, + raw=raw, + result_type=result_type, + args=args, + kwds=kwds, + ) + return op.get_result() + + def applymap(self, func) -> "DataFrame": + """ + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. + + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + + Returns + ------- + DataFrame + Transformed DataFrame. + + See Also + -------- + DataFrame.apply : Apply a function along input axis of DataFrame. + + Notes + ----- + In the current implementation applymap calls `func` twice on the + first column/row to decide whether it can take a fast or slow + code path. This can lead to unexpected behavior if `func` has + side-effects, as they will take effect twice for the first + column/row. + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.applymap(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + + Note that a vectorized version of `func` often exists, which will + be much faster. You could square each number elementwise. + + >>> df.applymap(lambda x: x**2) + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + + But it's better to avoid applymap in that case. + + >>> df ** 2 + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + """ + + # if we have a dtype == 'M8[ns]', provide boxed values + def infer(x): + if x.empty: + return lib.map_infer(x, func) + return lib.map_infer(x.astype(object).values, func) + + return self.apply(infer) + + # ---------------------------------------------------------------------- + # Merging / joining methods + + def append( + self, other, ignore_index=False, verify_integrity=False, sort=False + ) -> "DataFrame": + """ + Append rows of `other` to the end of caller, returning a new object. + + Columns in `other` that are not in the caller are added as new columns. + + Parameters + ---------- + other : DataFrame or Series/dict-like object, or list of these + The data to append. 
+ ignore_index : bool, default False + If True, do not use the index labels. + verify_integrity : bool, default False + If True, raise ValueError on creating index with duplicates. + sort : bool, default False + Sort columns if the columns of `self` and `other` are not aligned. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Changed to not sort by default. + + Returns + ------- + DataFrame + + See Also + -------- + concat : General function to concatenate DataFrame or Series objects. + + Notes + ----- + If a list of dict/series is passed and the keys are all contained in + the DataFrame's index, the order of the columns in the resulting + DataFrame will be unchanged. + + Iteratively appending rows to a DataFrame can be more computationally + intensive than a single concatenate. A better solution is to append + those rows to a list and then concatenate the list with the original + DataFrame all at once. + + Examples + -------- + + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) + >>> df + A B + 0 1 2 + 1 3 4 + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) + >>> df.append(df2) + A B + 0 1 2 + 1 3 4 + 0 5 6 + 1 7 8 + + With `ignore_index` set to True: + + >>> df.append(df2, ignore_index=True) + A B + 0 1 2 + 1 3 4 + 2 5 6 + 3 7 8 + + The following, while not recommended methods for generating DataFrames, + show two ways to generate a DataFrame from multiple data sources. + + Less efficient: + + >>> df = pd.DataFrame(columns=['A']) + >>> for i in range(5): + ... df = df.append({'A': i}, ignore_index=True) + >>> df + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + + More efficient: + + >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)], + ... ignore_index=True) + A + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + """ + if isinstance(other, (Series, dict)): + if isinstance(other, dict): + other = Series(other) + if other.name is None and not ignore_index: + raise TypeError( + "Can only append a Series if ignore_index=True " + "or if the Series has a name" + ) + + index = Index([other.name], name=self.index.name) + idx_diff = other.index.difference(self.columns) + try: + combined_columns = self.columns.append(idx_diff) + except TypeError: + combined_columns = self.columns.astype(object).append(idx_diff) + other = ( + other.reindex(combined_columns, copy=False) + .to_frame() + .T.infer_objects() + .rename_axis(index.names, copy=False) + ) + if not self.columns.equals(combined_columns): + self = self.reindex(columns=combined_columns) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.reindex(columns=self.columns) + + from pandas.core.reshape.concat import concat + + if isinstance(other, (list, tuple)): + to_concat = [self, *other] + else: + to_concat = [self, other] + return concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + + def join( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ) -> "DataFrame": + """ + Join columns of another DataFrame. + + Join columns with `other` DataFrame either on index or on a key + column. Efficiently join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series, or list of DataFrame + Index should be similar to one of the columns in this one. 
If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame. + on : str, list of str, or array-like, optional + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. + how : {'left', 'right', 'outer', 'inner'}, default 'left' + How to handle the operation of the two objects. + + * left: use calling frame's index (or column if on is specified) + * right: use `other`'s index. + * outer: form union of calling frame's index (or column if on is + specified) with `other`'s index, and sort it. + lexicographically. + * inner: form intersection of calling frame's index (or column if + on is specified) with `other`'s index, preserving the order + of the calling's one. + lsuffix : str, default '' + Suffix to use from left frame's overlapping columns. + rsuffix : str, default '' + Suffix to use from right frame's overlapping columns. + sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, + the order of the join key depends on the join type (how keyword). + + Returns + ------- + DataFrame + A dataframe containing columns from both the caller and `other`. + + See Also + -------- + DataFrame.merge : For column(s)-on-columns(s) operations. + + Notes + ----- + Parameters `on`, `lsuffix`, and `rsuffix` are not supported when + passing a list of `DataFrame` objects. + + Support for specifying index levels as the `on` parameter was added + in version 0.23.0. + + Examples + -------- + >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], + ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + + >>> df + key A + 0 K0 A0 + 1 K1 A1 + 2 K2 A2 + 3 K3 A3 + 4 K4 A4 + 5 K5 A5 + + >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], + ... 'B': ['B0', 'B1', 'B2']}) + + >>> other + key B + 0 K0 B0 + 1 K1 B1 + 2 K2 B2 + + Join DataFrames using their indexes. + + >>> df.join(other, lsuffix='_caller', rsuffix='_other') + key_caller A key_other B + 0 K0 A0 K0 B0 + 1 K1 A1 K1 B1 + 2 K2 A2 K2 B2 + 3 K3 A3 NaN NaN + 4 K4 A4 NaN NaN + 5 K5 A5 NaN NaN + + If we want to join using the key columns, we need to set key to be + the index in both `df` and `other`. The joined DataFrame will have + key as its index. + + >>> df.set_index('key').join(other.set_index('key')) + A B + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + K3 A3 NaN + K4 A4 NaN + K5 A5 NaN + + Another option to join using the key columns is to use the `on` + parameter. DataFrame.join always uses `other`'s index but we can use + any column in `df`. This method preserves the original DataFrame's + index in the result. 
+ + >>> df.join(other.set_index('key'), on='key') + key A B + 0 K0 A0 B0 + 1 K1 A1 B1 + 2 K2 A2 B2 + 3 K3 A3 NaN + 4 K4 A4 NaN + 5 K5 A5 NaN + """ + return self._join_compat( + other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort + ) + + def _join_compat( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ): + from pandas.core.reshape.merge import merge + from pandas.core.reshape.concat import concat + + if isinstance(other, Series): + if other.name is None: + raise ValueError("Other Series must have a name") + other = DataFrame({other.name: other}) + + if isinstance(other, DataFrame): + return merge( + self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) + else: + if on is not None: + raise ValueError( + "Joining multiple DataFrames only supported for joining on index" + ) + + frames = [self] + list(other) + + can_concat = all(df.index.is_unique for df in frames) + + # join indexes only using concat + if can_concat: + if how == "left": + res = concat( + frames, axis=1, join="outer", verify_integrity=True, sort=sort + ) + return res.reindex(self.index, copy=False) + else: + return concat( + frames, axis=1, join=how, verify_integrity=True, sort=sort + ) + + joined = frames[0] + + for frame in frames[1:]: + joined = merge( + joined, frame, how=how, left_index=True, right_index=True + ) + + return joined + + @Substitution("") + @Appender(_merge_doc, indents=2) + def merge( + self, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ) -> "DataFrame": + from pandas.core.reshape.merge import merge + + return merge( + self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) + + def round(self, decimals=0, *args, **kwargs) -> "DataFrame": + """ + Round a DataFrame to a variable number of decimal places. + + Parameters + ---------- + decimals : int, dict, Series + Number of decimal places to round each column to. If an int is + given, round each column to the same number of places. + Otherwise dict and Series round to variable numbers of places. + Column names should be in the keys if `decimals` is a + dict-like, or in the index if `decimals` is a Series. Any + columns not included in `decimals` will be left as is. Elements + of `decimals` which are not columns of the input will be + ignored. + *args + Additional keywords have no effect but might be accepted for + compatibility with numpy. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + DataFrame + A DataFrame with the affected columns rounded to the specified + number of decimal places. + + See Also + -------- + numpy.around : Round a numpy array to the given number of decimals. + Series.round : Round a Series to the given number of decimals. + + Examples + -------- + >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], + ... 
columns=['dogs', 'cats']) + >>> df + dogs cats + 0 0.21 0.32 + 1 0.01 0.67 + 2 0.66 0.03 + 3 0.21 0.18 + + By providing an integer each column is rounded to the same number + of decimal places + + >>> df.round(1) + dogs cats + 0 0.2 0.3 + 1 0.0 0.7 + 2 0.7 0.0 + 3 0.2 0.2 + + With a dict, the number of places for specific columns can be + specified with the column names as key and the number of decimal + places as value + + >>> df.round({'dogs': 1, 'cats': 0}) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + + Using a Series, the number of places for specific columns can be + specified with the column names as index and the number of + decimal places as value + + >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) + >>> df.round(decimals) + dogs cats + 0 0.2 0.0 + 1 0.0 1.0 + 2 0.7 0.0 + 3 0.2 0.0 + """ + from pandas.core.reshape.concat import concat + + def _dict_round(df, decimals): + for col, vals in df.items(): + try: + yield _series_round(vals, decimals[col]) + except KeyError: + yield vals + + def _series_round(s, decimals): + if is_integer_dtype(s) or is_float_dtype(s): + return s.round(decimals) + return s + + nv.validate_round(args, kwargs) + + if isinstance(decimals, (dict, Series)): + if isinstance(decimals, Series): + if not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + new_cols = list(_dict_round(self, decimals)) + elif is_integer(decimals): + # Dispatch to Series.round + new_cols = [_series_round(v, decimals) for _, v in self.items()] + else: + raise TypeError("decimals must be an integer, a dict-like or a Series") + + if len(new_cols) > 0: + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + ) + else: + return self + + # ---------------------------------------------------------------------- + # Statistical methods, etc. + + def corr(self, method="pearson", min_periods=1) -> "DataFrame": + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + + .. versionadded:: 0.24.0 + + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. + + Returns + ------- + DataFrame + Correlation matrix. + + See Also + -------- + DataFrame.corrwith + Series.corr + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], + ... 
columns=['dogs', 'cats']) + >>> df.corr(method=histogram_intersection) + dogs cats + dogs 1.0 0.3 + cats 0.3 1.0 + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + idx = cols.copy() + mat = numeric_df.values + + if method == "pearson": + correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) + elif method == "spearman": + correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) + elif method == "kendall" or callable(method): + if min_periods is None: + min_periods = 1 + mat = ensure_float64(mat).T + corrf = nanops.get_corr_func(method) + K = len(cols) + correl = np.empty((K, K), dtype=float) + mask = np.isfinite(mat) + for i, ac in enumerate(mat): + for j, bc in enumerate(mat): + if i > j: + continue + + valid = mask[i] & mask[j] + if valid.sum() < min_periods: + c = np.nan + elif i == j: + c = 1.0 + elif not valid.all(): + c = corrf(ac[valid], bc[valid]) + else: + c = corrf(ac, bc) + correl[i, j] = c + correl[j, i] = c + else: + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied" + ) + + return self._constructor(correl, index=idx, columns=cols) + + def cov(self, min_periods=None) -> "DataFrame": + """ + Compute pairwise covariance of columns, excluding NA/null values. + + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + `__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. + + Parameters + ---------- + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ------- + DataFrame + The covariance matrix of the series of the DataFrame. + + See Also + -------- + Series.cov : Compute covariance with another Series. + core.window.EWM.cov: Exponential weighted sample covariance. + core.window.Expanding.cov : Expanding sample covariance. + core.window.Rolling.cov : Rolling sample covariance. + + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-1. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + `__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimate covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimate correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + `__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... 
columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + idx = cols.copy() + mat = numeric_df.values + + if notna(mat).all(): + if min_periods is not None and min_periods > len(mat): + baseCov = np.empty((mat.shape[1], mat.shape[1])) + baseCov.fill(np.nan) + else: + baseCov = np.cov(mat.T) + baseCov = baseCov.reshape((len(cols), len(cols))) + else: + baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) + + return self._constructor(baseCov, index=idx, columns=cols) + + def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. + + Parameters + ---------- + other : DataFrame, Series + Object with which to compute correlations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for + row-wise. + drop : bool, default False + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. + + .. versionadded:: 0.24.0 + + Returns + ------- + Series + Pairwise correlations. 
+ + See Also + -------- + DataFrame.corr + """ + axis = self._get_axis_number(axis) + this = self._get_numeric_data() + + if isinstance(other, Series): + return this.apply(lambda x: other.corr(x, method=method), axis=axis) + + other = other._get_numeric_data() + left, right = this.align(other, join="inner", copy=False) + + if axis == 1: + left = left.T + right = right.T + + if method == "pearson": + # mask missing values + left = left + right * 0 + right = right + left * 0 + + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() + + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() + + correl = num / dom + + elif method in ["kendall", "spearman"] or callable(method): + + def c(x): + return nanops.nancorr(x[0], x[1], method=method) + + correl = Series( + map(c, zip(left.values.T, right.values.T)), index=left.columns + ) + + else: + raise ValueError( + f"Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable" + ) + + if not drop: + # Find non-matching labels along the given axis + # and append missing correlations (GH 22375) + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) + idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff)) + + return correl + + # ---------------------------------------------------------------------- + # ndarray-like stats methods + + def count(self, axis=0, level=None, numeric_only=False): + """ + Count non-NA cells for each column or row. + + The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending + on `pandas.options.mode.use_inf_as_na`) are considered NA. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index' counts are generated for each column. + If 1 or 'columns' counts are generated for each **row**. + level : int or str, optional + If the axis is a `MultiIndex` (hierarchical), count along a + particular `level`, collapsing into a `DataFrame`. + A `str` specifies the level name. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + Returns + ------- + Series or DataFrame + For each column/row the number of non-NA/null entries. + If `level` is specified returns a `DataFrame`. + + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.shape: Number of DataFrame rows and columns (including NA + elements). + DataFrame.isna: Boolean same-sized DataFrame showing places of NA + elements. + + Examples + -------- + Constructing DataFrame from a dictionary: + + >>> df = pd.DataFrame({"Person": + ... ["John", "Myla", "Lewis", "John", "Myla"], + ... "Age": [24., np.nan, 21., 33, 26], + ... 
"Single": [False, True, True, True, False]}) + >>> df + Person Age Single + 0 John 24.0 False + 1 Myla NaN True + 2 Lewis 21.0 True + 3 John 33.0 True + 4 Myla 26.0 False + + Notice the uncounted NA values: + + >>> df.count() + Person 5 + Age 4 + Single 5 + dtype: int64 + + Counts for each **row**: + + >>> df.count(axis='columns') + 0 3 + 1 2 + 2 3 + 3 3 + 4 3 + dtype: int64 + + Counts for one level of a `MultiIndex`: + + >>> df.set_index(["Person", "Single"]).count(level="Person") + Age + Person + John 2 + Lewis 1 + Myla 1 + """ + axis = self._get_axis_number(axis) + if level is not None: + return self._count_level(level, axis=axis, numeric_only=numeric_only) + + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + # GH #423 + if len(frame._get_axis(axis)) == 0: + result = Series(0, index=frame._get_agg_axis(axis)) + else: + if frame._is_mixed_type or frame._data.any_extension_types: + # the or any_extension_types is really only hit for single- + # column frames with an extension array + result = notna(frame).sum(axis=axis) + else: + # GH13407 + series_counts = notna(frame).sum(axis=axis) + counts = series_counts.values + result = Series(counts, index=frame._get_agg_axis(axis)) + + return result.astype("int64") + + def _count_level(self, level, axis=0, numeric_only=False): + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + count_axis = frame._get_axis(axis) + agg_axis = frame._get_agg_axis(axis) + + if not isinstance(count_axis, ABCMultiIndex): + raise TypeError( + f"Can only count levels on hierarchical {self._get_axis_name(axis)}." + ) + + if frame._is_mixed_type: + # Since we have mixed types, calling notna(frame.values) might + # upcast everything to object + mask = notna(frame).values + else: + # But use the speedup when we have homogeneous dtypes + mask = notna(frame.values) + + if axis == 1: + # We're transposing the mask rather than frame to avoid potential + # upcasts to object, which induces a ~20x slowdown + mask = mask.T + + if isinstance(level, str): + level = count_axis._get_level_number(level) + + level_name = count_axis._names[level] + level_index = count_axis.levels[level]._shallow_copy(name=level_name) + level_codes = ensure_int64(count_axis.codes[level]) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) + + result = DataFrame(counts, index=level_index, columns=agg_axis) + + if axis == 1: + # Undo our earlier transpose + return result.T + else: + return result + + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): + if axis is None and filter_type == "bool": + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor + + def f(x): + return op(x, axis=axis, skipna=skipna, **kwds) + + def _get_data(axis_matters): + if filter_type is None or filter_type == "numeric": + data = self._get_numeric_data() + elif filter_type == "bool": + if axis_matters: + # GH#25101, GH#24434 + data = self._get_bool_data() if axis == 0 else self + else: + data = self._get_bool_data() + else: # pragma: no cover + msg = ( + f"Generating numeric_only data with filter_type {filter_type} " + "not supported." 
+ ) + raise NotImplementedError(msg) + return data + + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + return out + + if numeric_only is None: + values = self.values + try: + result = f(values) + + if filter_type == "bool" and is_object_dtype(values) and axis is None: + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) + except TypeError: + # e.g. in nanops trying to convert strs to float + + # try by-column first + if filter_type is None and axis == 0: + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0] + return result + + # TODO: why doesnt axis matter here? + data = _get_data(axis_matters=False) + with np.errstate(all="ignore"): + result = f(data.values) + labels = data._get_agg_axis(axis) + else: + if numeric_only: + data = _get_data(axis_matters=True) + + values = data.values + labels = data._get_agg_axis(axis) + else: + values = self.values + result = f(values) + + if hasattr(result, "dtype") and is_object_dtype(result.dtype): + try: + if filter_type is None or filter_type == "numeric": + result = result.astype(np.float64) + elif filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + except (ValueError, TypeError): + + # try to coerce to the original dtypes item by item if we can + if axis == 0: + result = coerce_to_dtypes(result, self.dtypes) + + if constructor is not None: + result = Series(result, index=labels) + return result + + def nunique(self, axis=0, dropna=True) -> Series: + """ + Count distinct observations over requested axis. + + Return Series with number of distinct observations. Can ignore NaN + values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + See Also + -------- + Series.nunique: Method nunique for Series. + DataFrame.count: Count non-NA cells for each column or row. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique() + A 3 + B 1 + dtype: int64 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 + dtype: int64 + """ + return self.apply(Series.nunique, axis=axis, dropna=dropna) + + def idxmin(self, axis=0, skipna=True) -> Series: + """ + Return index of first occurrence of minimum over requested axis. 
+ + NA/null values are excluded. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + """ + axis = self._get_axis_number(axis) + indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return Series(result, index=self._get_agg_axis(axis)) + + def idxmax(self, axis=0, skipna=True) -> Series: + """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + """ + axis = self._get_axis_number(axis) + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return Series(result, index=self._get_agg_axis(axis)) + + def _get_agg_axis(self, axis_num): + """ + Let's be explicit about this. + """ + if axis_num == 0: + return self.columns + elif axis_num == 1: + return self.index + else: + raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") + + def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": + """ + Get the mode(s) of each element along the selected axis. + + The mode of a set of values is the value that appears most often. + It can be multiple values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to iterate over while searching for the mode: + + * 0 or 'index' : get mode of each column + * 1 or 'columns' : get mode of each row. + + numeric_only : bool, default False + If True, only apply to numeric columns. + dropna : bool, default True + Don't consider counts of NaN/NaT. + + .. versionadded:: 0.24.0 + + Returns + ------- + DataFrame + The modes of each column or row. + + See Also + -------- + Series.mode : Return the highest frequency value in a Series. + Series.value_counts : Return the counts of values in a Series. + + Examples + -------- + >>> df = pd.DataFrame([('bird', 2, 2), + ... ('mammal', 4, np.nan), + ... ('arthropod', 8, 0), + ... ('bird', 2, np.nan)], + ... index=('falcon', 'horse', 'spider', 'ostrich'), + ... columns=('species', 'legs', 'wings')) + >>> df + species legs wings + falcon bird 2 2.0 + horse mammal 4 NaN + spider arthropod 8 0.0 + ostrich bird 2 NaN + + By default, missing values are not considered, and the mode of wings + are both 0 and 2. The second row of species and legs contains ``NaN``, + because they have only one mode, but the DataFrame has two rows. 
+ + >>> df.mode() + species legs wings + 0 bird 2.0 0.0 + 1 NaN NaN 2.0 + + Setting ``dropna=False`` ``NaN`` values are considered and they can be + the mode (like for wings). + + >>> df.mode(dropna=False) + species legs wings + 0 bird 2 NaN + + Setting ``numeric_only=True``, only the mode of numeric columns is + computed, and columns of other types are ignored. + + >>> df.mode(numeric_only=True) + legs wings + 0 2.0 0.0 + 1 NaN 2.0 + + To compute the mode over columns and not rows, use the axis parameter: + + >>> df.mode(axis='columns', numeric_only=True) + 0 1 + falcon 2.0 NaN + horse 4.0 NaN + spider 0.0 8.0 + ostrich 2.0 NaN + """ + data = self if not numeric_only else self._get_numeric_data() + + def f(s): + return s.mode(dropna=dropna) + + return data.apply(f, axis=axis) + + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): + """ + Return values at the given quantile over requested axis. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value between 0 <= q <= 1, the quantile(s) to compute. + axis : {0, 1, 'index', 'columns'} (default 0) + Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + numeric_only : bool, default True + If False, the quantile of datetime and timedelta data will be + computed as well. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + Series or DataFrame + + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + See Also + -------- + core.window.Rolling.quantile: Rolling quantile. + numpy.percentile: Numpy function to compute the percentile. + + Examples + -------- + >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. + + >>> df = pd.DataFrame({'A': [1, 2], + ... 'B': [pd.Timestamp('2010'), + ... pd.Timestamp('2011')], + ... 'C': [pd.Timedelta('1 days'), + ... 
pd.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False) + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + """ + validate_percentile(q) + + data = self._get_numeric_data() if numeric_only else self + axis = self._get_axis_number(axis) + is_transposed = axis == 1 + + if is_transposed: + data = data.T + + if len(data.columns) == 0: + # GH#23925 _get_numeric_data may have dropped all columns + cols = Index([], name=self.columns.name) + if is_list_like(q): + return self._constructor([], index=q, columns=cols) + return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) + + result = data._data.quantile( + qs=q, axis=1, interpolation=interpolation, transposed=is_transposed + ) + + if result.ndim == 2: + result = self._constructor(result) + else: + result = self._constructor_sliced(result, name=q) + + if is_transposed: + result = result.T + + return result + + def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": + """ + Cast to DatetimeIndex of timestamps, at *beginning* of period. + + Parameters + ---------- + freq : str, default frequency of PeriodIndex + Desired frequency. + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + Returns + ------- + DataFrame with DatetimeIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how)) + elif axis == 1: + new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) + else: # pragma: no cover + raise AssertionError(f"Axis must be 0 or 1. Got {axis}") + + return self._constructor(new_data) + + def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": + """ + Convert DataFrame from DatetimeIndex to PeriodIndex. + + Convert DataFrame from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed). + + Parameters + ---------- + freq : str, default + Frequency of the PeriodIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. + + Returns + ------- + TimeSeries with PeriodIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + new_data.set_axis(1, self.index.to_period(freq=freq)) + elif axis == 1: + new_data.set_axis(0, self.columns.to_period(freq=freq)) + else: # pragma: no cover + raise AssertionError(f"Axis must be 0 or 1. Got {axis}") + + return self._constructor(new_data) + + def isin(self, values) -> "DataFrame": + """ + Whether each element in the DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dict + The result will only be true at a location if all the + labels match. If `values` is a Series, that's the index. If + `values` is a dict, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + DataFrame + DataFrame of booleans showing whether each element in the DataFrame + is contained in values. + + See Also + -------- + DataFrame.eq: Equality test for DataFrame. 
+ Series.isin: Equivalent method on Series. + Series.str.contains: Test if pattern or regex is contained within a + string of a Series or Index. + + Examples + -------- + + >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, + ... index=['falcon', 'dog']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + + When ``values`` is a list check whether every value in the DataFrame + is present in the list (which animals have 0 or 2 legs or wings) + + >>> df.isin([0, 2]) + num_legs num_wings + falcon True True + dog False True + + When ``values`` is a dict, we can pass values to check for each + column separately: + + >>> df.isin({'num_wings': [0, 3]}) + num_legs num_wings + falcon False False + dog False True + + When ``values`` is a Series or DataFrame the index and column must + match. Note that 'falcon' does not match based on the number of legs + in df2. + + >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, + ... index=['spider', 'falcon']) + >>> df.isin(other) + num_legs num_wings + falcon True True + dog False False + """ + if isinstance(values, dict): + from pandas.core.reshape.concat import concat + + values = collections.defaultdict(list, values) + return self._ensure_type( + concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) + ) + elif isinstance(values, Series): + if not values.index.is_unique: + raise ValueError("cannot compute isin with a duplicate axis.") + return self.eq(values.reindex_like(self), axis="index") + elif isinstance(values, DataFrame): + if not (values.columns.is_unique and values.index.is_unique): + raise ValueError("cannot compute isin with a duplicate axis.") + return self.eq(values.reindex_like(self)) + else: + if not is_list_like(values): + raise TypeError( + "only list-like or dict-like objects are allowed " + "to be passed to DataFrame.isin(), " + f"you passed a {repr(type(values).__name__)}" + ) + return DataFrame( + algorithms.isin(self.values.ravel(), values).reshape(self.shape), + self.index, + self.columns, + ) + + # ---------------------------------------------------------------------- + # Add plotting methods to DataFrame + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + hist = pandas.plotting.hist_frame + boxplot = pandas.plotting.boxplot_frame + sparse = CachedAccessor("sparse", SparseFrameAccessor) + + +DataFrame._setup_axes( + ["index", "columns"], + docs={ + "index": "The index (row labels) of the DataFrame.", + "columns": "The column labels of the DataFrame.", + }, +) +DataFrame._add_numeric_operations() +DataFrame._add_series_or_dataframe_operations() + +ops.add_flex_arithmetic_methods(DataFrame) +ops.add_special_arithmetic_methods(DataFrame) + + +def _from_nested_dict(data): + # TODO: this should be seriously cythonized + new_data = {} + for index, s in data.items(): + for col, v in s.items(): + new_data[col] = new_data.get(col, {}) + new_data[col][index] = v + return new_data + + +def _put_str(s, space): + return str(s)[:space].ljust(space) diff --git a/venv/Lib/site-packages/pandas/core/generic.py b/venv/Lib/site-packages/pandas/core/generic.py new file mode 100644 index 0000000..32ea476 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/generic.py @@ -0,0 +1,11369 @@ +import collections +from datetime import timedelta +import functools +import gc +import json +import operator +import pickle +import re +from textwrap import dedent +from typing import ( + Any, + Callable, + Dict, + FrozenSet, + Hashable, + List, + Mapping, + Optional, + 
Sequence, + Set, + Tuple, + Type, + Union, +) +import warnings +import weakref + +import numpy as np + +from pandas._config import config + +from pandas._libs import Timestamp, iNaT, lib, properties +from pandas._typing import ( + Axis, + Dtype, + FilePathOrBuffer, + FrameOrSeries, + JSONSerializable, + Level, + Renamer, +) +from pandas.compat import set_function_name +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, + validate_percentile, +) + +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_extension_array_dtype, + is_float, + is_integer, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period_arraylike, + is_re_compilable, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import isna, notna + +import pandas as pd +from pandas.core import missing, nanops +import pandas.core.algorithms as algos +from pandas.core.base import PandasObject, SelectionMixin +import pandas.core.common as com +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.indexes.api import ( + Index, + InvalidIndexError, + MultiIndex, + RangeIndex, + ensure_index, +) +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.period import Period, PeriodIndex +import pandas.core.indexing as indexing +from pandas.core.internals import BlockManager +from pandas.core.missing import find_valid_index +from pandas.core.ops import _align_method_FRAME + +from pandas.io.formats import format as fmt +from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.printing import pprint_thing +from pandas.tseries.frequencies import to_offset + +# goal is to be able to define the docs close to function, while still being +# able to share +_shared_docs: Dict[str, str] = dict() +_shared_doc_kwargs = dict( + axes="keywords for axes", + klass="Series/DataFrame", + axes_single_arg="int or labels for object", + args_transpose="axes to permute (int or label for object)", + optional_by=""" + by : str or list of str + Name or list of names to sort by""", +) + + +def _single_replace(self, to_replace, method, inplace, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ + if self.ndim != 1: + raise TypeError( + f"cannot replace {to_replace} with method {method} on a " + f"{type(self).__name__}" + ) + + orig_dtype = self.dtype + result = self if inplace else self.copy() + fill_f = missing.get_fill_func(method) + + mask = missing.mask_missing(result.values, to_replace) + values = fill_f(result.values, limit=limit, mask=mask) + + if values.dtype == orig_dtype and inplace: + return + + result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) + + if inplace: + self._update_inplace(result._data) + return + + return result + + +bool_t = bool # Need alias because NDFrame has def bool: + + +class NDFrame(PandasObject, SelectionMixin, 
indexing.IndexingMixin): + """ + N-dimensional analogue of DataFrame. Store multi-dimensional in a + size-mutable, labeled data structure + + Parameters + ---------- + data : BlockManager + axes : list + copy : bool, default False + """ + + _internal_names: List[str] = [ + "_data", + "_cacher", + "_item_cache", + "_cache", + "_is_copy", + "_subtyp", + "_name", + "_index", + "_default_kind", + "_default_fill_value", + "_metadata", + "__array_struct__", + "__array_interface__", + ] + _internal_names_set: Set[str] = set(_internal_names) + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"]) + _metadata: List[str] = [] + _is_copy = None + _data: BlockManager + _attrs: Dict[Optional[Hashable], Any] + _typ: str + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data: BlockManager, + axes: Optional[List[Index]] = None, + copy: bool = False, + dtype: Optional[Dtype] = None, + attrs: Optional[Mapping[Optional[Hashable], Any]] = None, + fastpath: bool = False, + ): + + if not fastpath: + if dtype is not None: + data = data.astype(dtype) + elif copy: + data = data.copy() + + if axes is not None: + for i, ax in enumerate(axes): + data = data.reindex_axis(ax, axis=i) + + object.__setattr__(self, "_is_copy", None) + object.__setattr__(self, "_data", data) + object.__setattr__(self, "_item_cache", {}) + if attrs is None: + attrs = {} + else: + attrs = dict(attrs) + object.__setattr__(self, "_attrs", attrs) + + def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): + """ passed a manager and a axes dict """ + for a, axe in axes.items(): + if axe is not None: + mgr = mgr.reindex_axis( + axe, axis=self._get_block_manager_axis(a), copy=False + ) + + # make a copy if explicitly requested + if copy: + mgr = mgr.copy() + if dtype is not None: + # avoid further copies if we can + if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + mgr = mgr.astype(dtype=dtype) + return mgr + + # ---------------------------------------------------------------------- + + @property + def attrs(self) -> Dict[Optional[Hashable], Any]: + """ + Dictionary of global attributes on this object. + + .. warning:: + + attrs is experimental and may change without warning. + """ + if self._attrs is None: + self._attrs = {} + return self._attrs + + @attrs.setter + def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: + self._attrs = dict(value) + + def _validate_dtype(self, dtype): + """ validate the passed dtype """ + + if dtype is not None: + dtype = pandas_dtype(dtype) + + # a compound dtype + if dtype.kind == "V": + raise NotImplementedError( + "compound dtypes are not implemented" + f" in the {type(self).__name__} constructor" + ) + + return dtype + + # ---------------------------------------------------------------------- + # Construction + + @property + def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: + """Used when a manipulation result has the same dimensions as the + original. + """ + raise AbstractMethodError(self) + + @property + def _constructor_sliced(self): + """Used when a manipulation result has one lower dimension(s) as the + original, such as DataFrame single columns slicing. 
+ """ + raise AbstractMethodError(self) + + @property + def _constructor_expanddim(self): + """Used when a manipulation result has one higher dimension as the + original, such as Series.to_frame() + """ + raise NotImplementedError + + # ---------------------------------------------------------------------- + # Axis + _AXIS_ALIASES = {"rows": 0} + _AXIS_IALIASES = {0: "rows"} + _stat_axis_number = 0 + _stat_axis_name = "index" + _ix = None + _AXIS_ORDERS: List[str] + _AXIS_NUMBERS: Dict[str, int] + _AXIS_NAMES: Dict[int, str] + _AXIS_REVERSED: bool + _info_axis_number: int + _info_axis_name: str + _AXIS_LEN: int + + @classmethod + def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None: + """ + Provide axes setup for the major PandasObjects. + + Parameters + ---------- + axes : the names of the axes in order (lowest to highest) + docs : docstrings for the axis properties + """ + info_axis = len(axes) - 1 + axes_are_reversed = len(axes) > 1 + + cls._AXIS_ORDERS = axes + cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)} + cls._AXIS_LEN = len(axes) + cls._AXIS_NAMES = dict(enumerate(axes)) + cls._AXIS_REVERSED = axes_are_reversed + + cls._info_axis_number = info_axis + cls._info_axis_name = axes[info_axis] + + # setup the actual axis + def set_axis(a, i): + setattr(cls, a, properties.AxisProperty(i, docs.get(a, a))) + cls._internal_names_set.add(a) + + if axes_are_reversed: + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, 1 - i) + else: + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, i) + + def _construct_axes_dict(self, axes=None, **kwargs): + """Return an axes dictionary for myself.""" + d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} + d.update(kwargs) + return d + + @staticmethod + def _construct_axes_dict_from(self, axes, **kwargs): + """Return an axes dictionary for the passed axes.""" + d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)} + d.update(kwargs) + return d + + def _construct_axes_from_arguments( + self, args, kwargs, require_all: bool = False, sentinel=None + ): + """Construct and returns axes if supplied in args/kwargs. + + If require_all, raise if all axis arguments are not supplied + return a tuple of (axes, kwargs). + + sentinel specifies the default parameter when an axis is not + supplied; useful to distinguish when a user explicitly passes None + in scenarios where None has special meaning. 
+ """ + + # construct the args + args = list(args) + for a in self._AXIS_ORDERS: + + # look for a argument by position + if a not in kwargs: + try: + kwargs[a] = args.pop(0) + except IndexError: + if require_all: + raise TypeError("not enough/duplicate arguments specified!") + + axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} + return axes, kwargs + + @classmethod + def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries: + # for construction from BlockManager + if isinstance(data, BlockManager): + return cls(data, **kwargs) + else: + if cls._AXIS_REVERSED: + axes = axes[::-1] + d = cls._construct_axes_dict_from(cls, axes, copy=False) + d.update(kwargs) + return cls(data, **d) + + @classmethod + def _get_axis_number(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) + if is_integer(axis): + if axis in cls._AXIS_NAMES: + return axis + else: + try: + return cls._AXIS_NUMBERS[axis] + except KeyError: + pass + raise ValueError(f"No axis named {axis} for object type {cls}") + + @classmethod + def _get_axis_name(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) + if isinstance(axis, str): + if axis in cls._AXIS_NUMBERS: + return axis + else: + try: + return cls._AXIS_NAMES[axis] + except KeyError: + pass + raise ValueError(f"No axis named {axis} for object type {cls}") + + def _get_axis(self, axis): + name = self._get_axis_name(axis) + return getattr(self, name) + + @classmethod + def _get_block_manager_axis(cls, axis): + """Map the axis to the block_manager axis.""" + axis = cls._get_axis_number(axis) + if cls._AXIS_REVERSED: + m = cls._AXIS_LEN - 1 + return m - axis + return axis + + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: + # index or columns + axis_index = getattr(self, axis) + d = dict() + prefix = axis[0] + + for i, name in enumerate(axis_index.names): + if name is not None: + key = level = name + else: + # prefix with 'i' or 'c' depending on the input axis + # e.g., you must do ilevel_0 for the 0th level of an unnamed + # multiiindex + key = f"{prefix}level_{i}" + level = i + + level_values = axis_index.get_level_values(level) + s = level_values.to_series() + s.index = axis_index + d[key] = s + + # put the index/columns itself in the dict + if isinstance(axis_index, MultiIndex): + dindex = axis_index + else: + dindex = axis_index.to_series() + + d[axis] = dindex + return d + + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + from pandas.core.computation.parsing import clean_column_name + + d: Dict[str, ABCSeries] = {} + for axis_name in self._AXIS_ORDERS: + d.update(self._get_axis_resolvers(axis_name)) + + return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} + + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. + Used in :meth:`DataFrame.eval`. 
+ """ + from pandas.core.computation.parsing import clean_column_name + + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} + + return { + clean_column_name(k): v for k, v in self.items() if not isinstance(k, int) + } + + @property + def _info_axis(self): + return getattr(self, self._info_axis_name) + + @property + def _stat_axis(self): + return getattr(self, self._stat_axis_name) + + @property + def shape(self) -> Tuple[int, ...]: + """ + Return a tuple of axis dimensions + """ + return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) + + @property + def axes(self) -> List[Index]: + """ + Return index label(s) of the internal NDFrame + """ + # we do it this way because if we have reversed axes, then + # the block manager shows then reversed + return [self._get_axis(a) for a in self._AXIS_ORDERS] + + @property + def ndim(self) -> int: + """ + Return an int representing the number of axes / array dimensions. + + Return 1 if Series. Otherwise return 2 if DataFrame. + + See Also + -------- + ndarray.ndim : Number of array dimensions. + + Examples + -------- + >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.ndim + 1 + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.ndim + 2 + """ + return self._data.ndim + + @property + def size(self): + """ + Return an int representing the number of elements in this object. + + Return the number of rows if Series. Otherwise return the number of + rows times number of columns if DataFrame. + + See Also + -------- + ndarray.size : Number of elements in the array. + + Examples + -------- + >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.size + 3 + + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.size + 4 + """ + return np.prod(self.shape) + + @property + def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: + """ internal compat with SelectionMixin """ + return self + + @property + def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: + """ internal compat with SelectionMixin """ + return self + + def set_axis(self, labels, axis=0, inplace=False): + """ + Assign desired index to given axis. + + Indexes for column or row labels can be changed by assigning + a list-like or Index. + + .. versionchanged:: 0.21.0 + + The signature is now `labels` and `axis`, consistent with + the rest of pandas API. Previously, the `axis` and `labels` + arguments were respectively the first and second positional + arguments. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to update. The value 0 identifies the rows, and 1 + identifies the columns. + + inplace : bool, default False + Whether to return a new %(klass)s instance. + + Returns + ------- + renamed : %(klass)s or None + An object of same type as caller if inplace=False, None otherwise. + + See Also + -------- + DataFrame.rename_axis : Alter the name of the index or columns. + + Examples + -------- + **Series** + + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0) + a 1 + b 2 + c 3 + dtype: int64 + + **DataFrame** + + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. 
+ + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + if inplace: + setattr(self, self._get_axis_name(axis), labels) + else: + obj = self.copy() + obj.set_axis(labels, axis=axis, inplace=True) + return obj + + def _set_axis(self, axis, labels) -> None: + self._data.set_axis(axis, labels) + self._clear_item_cache() + + def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: + """ + Interchange axes and swap values axes appropriately. + + Returns + ------- + y : same as input + """ + i = self._get_axis_number(axis1) + j = self._get_axis_number(axis2) + + if i == j: + if copy: + return self.copy() + return self + + mapping = {i: j, j: i} + + new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)) + new_values = self.values.swapaxes(i, j) + if copy: + new_values = new_values.copy() + + return self._constructor(new_values, *new_axes).__finalize__(self) + + def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: + """ + Return DataFrame with requested index / column level(s) removed. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + level : int, str, or list-like + If a string is given, must be the name of a level + If list-like, elements must be names or positional indexes + of levels. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + + Returns + ------- + DataFrame + DataFrame with requested index / column level(s) removed. + + Examples + -------- + >>> df = pd.DataFrame([ + ... [1, 2, 3, 4], + ... [5, 6, 7, 8], + ... [9, 10, 11, 12] + ... ]).set_index([0, 1]).rename_axis(['a', 'b']) + + >>> df.columns = pd.MultiIndex.from_tuples([ + ... ('c', 'e'), ('d', 'f') + ... ], names=['level_1', 'level_2']) + + >>> df + level_1 c d + level_2 e f + a b + 1 2 3 4 + 5 6 7 8 + 9 10 11 12 + + >>> df.droplevel('a') + level_1 c d + level_2 e f + b + 2 3 4 + 6 7 8 + 10 11 12 + + >>> df.droplevel('level2', axis=1) + level_1 c d + a b + 1 2 3 4 + 5 6 7 8 + 9 10 11 12 + """ + labels = self._get_axis(axis) + new_labels = labels.droplevel(level) + result = self.set_axis(new_labels, axis=axis, inplace=False) + return result + + def pop(self: FrameOrSeries, item) -> FrameOrSeries: + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : str + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=('name', 'class', 'max_speed')) + >>> df + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class') + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + result = self[item] + del self[item] + try: + result._reset_cacher() + except AttributeError: + pass + + return result + + def squeeze(self, axis=None): + """ + Squeeze 1 dimensional axis objects into scalars. + + Series or DataFrames with a single element are squeezed to a scalar. + DataFrames with a single column or a single row are squeezed to a + Series. Otherwise the object is unchanged. + + This method is most useful when you don't know if your + object is a Series or DataFrame, but you do know it has just a single + column. In that case you can safely call `squeeze` to ensure you have a + Series. 
+ + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default None + A specific axis to squeeze. By default, all length-1 axes are + squeezed. + + Returns + ------- + DataFrame, Series, or scalar + The projection after squeezing `axis` or all the axes. + + See Also + -------- + Series.iloc : Integer-location based indexing for selecting scalars. + DataFrame.iloc : Integer-location based indexing for selecting Series. + Series.to_frame : Inverse of DataFrame.squeeze for a + single-column DataFrame. + + Examples + -------- + >>> primes = pd.Series([2, 3, 5, 7]) + + Slicing might produce a Series with a single value: + + >>> even_primes = primes[primes % 2 == 0] + >>> even_primes + 0 2 + dtype: int64 + + >>> even_primes.squeeze() + 2 + + Squeezing objects with more than one value in every axis does nothing: + + >>> odd_primes = primes[primes % 2 == 1] + >>> odd_primes + 1 3 + 2 5 + 3 7 + dtype: int64 + + >>> odd_primes.squeeze() + 1 3 + 2 5 + 3 7 + dtype: int64 + + Squeezing is even more effective when used with DataFrames. + + >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + >>> df + a b + 0 1 2 + 1 3 4 + + Slicing a single column will produce a DataFrame with the columns + having only one value: + + >>> df_a = df[['a']] + >>> df_a + a + 0 1 + 1 3 + + So the columns can be squeezed down, resulting in a Series: + + >>> df_a.squeeze('columns') + 0 1 + 1 3 + Name: a, dtype: int64 + + Slicing a single row from a single column will produce a single + scalar DataFrame: + + >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a + a + 0 1 + + Squeezing the rows produces a single scalar Series: + + >>> df_0a.squeeze('rows') + a 1 + Name: 0, dtype: int64 + + Squeezing all axes will project directly into a scalar: + + >>> df_0a.squeeze() + 1 + """ + axis = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),) + return self.iloc[ + tuple( + 0 if i in axis and len(a) == 1 else slice(None) + for i, a in enumerate(self.axes) + ) + ] + + def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries: + """ + Swap levels i and j in a MultiIndex on a particular axis + + Parameters + ---------- + i, j : int, str (can be mixed) + Level of index to be swapped. Can pass level name as string. + + Returns + ------- + swapped : same type as caller (new object) + """ + axis = self._get_axis_number(axis) + result = self.copy() + labels = result._data.axes[axis] + result._data.set_axis(axis, labels.swaplevel(i, j)) + return result + + # ---------------------------------------------------------------------- + # Rename + + def rename( + self: FrameOrSeries, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional[FrameOrSeries]: + """ + Alter axes input function or functions. Function / dict values must be + unique (1-to-1). Labels not contained in a dict / Series will be left + as-is. Extra labels listed don't throw an error. Alternatively, change + ``Series.name`` with a scalar value (Series only). + + Parameters + ---------- + %(axes)s : scalar, list-like, dict-like or function, optional + Scalar or list-like will alter the ``Series.name`` attribute, + and raise on DataFrame. + dict-like or functions are transformations to apply to + that axis' values + copy : bool, default True + Also copy underlying data. 
+ inplace : bool, default False + Whether to return a new %(klass)s. If True then value of copy is + ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. + + Returns + ------- + renamed : %(klass)s (new object) + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". + + See Also + -------- + NDFrame.rename_axis + + Examples + -------- + + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.rename("my_name") # scalar, changes Series.name + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: int64 + >>> s.rename(lambda x: x ** 2) # function, changes labels + 0 1 + 1 2 + 4 3 + dtype: int64 + >>> s.rename({1: 3, 2: 5}) # mapping, changes labels + 0 1 + 3 2 + 5 3 + dtype: int64 + + Since ``DataFrame`` doesn't have a ``.name`` attribute, + only mapping-type arguments are allowed. + + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(2) + Traceback (most recent call last): + ... + TypeError: 'int' object is not callable + + ``DataFrame.rename`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + >>> df.rename(index=str, columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename(index=str, columns={"A": "a", "C": "c"}) + a B + 0 1 4 + 1 2 5 + 2 3 6 + + Using axis-style parameters + + >>> df.rename(str.lower, axis='columns') + a b + 0 1 4 + 1 2 5 + 2 3 6 + + >>> df.rename({1: 2, 2: 4}, axis='index') + A B + 0 1 4 + 2 2 5 + 4 3 6 + + See the :ref:`user guide ` for more. 
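+
+ A minimal sketch of ``errors='raise'`` with the same ``df``; the
+ message text mirrors the missing-label check in the method body and
+ may vary by version:
+
+ >>> df.rename(columns={"A": "a", "C": "c"}, errors="raise")
+ Traceback (most recent call last):
+ ...
+ KeyError: "['C'] not found in axis"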
+ """ + if mapper is None and index is None and columns is None: + raise TypeError("must pass an index to rename") + + if index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + elif mapper is not None: + raise TypeError( + "Cannot specify both 'mapper' and any of 'index' or 'columns'" + ) + else: + # use the mapper argument + if axis and self._get_axis_number(axis) == 1: + columns = mapper + else: + index = mapper + + result = self if inplace else self.copy(deep=copy) + + for axis_no, replacements in enumerate((index, columns)): + if replacements is None: + continue + + ax = self._get_axis(axis_no) + baxis = self._get_block_manager_axis(axis_no) + f = com.get_rename_function(replacements) + + if level is not None: + level = ax._get_level_number(level) + + # GH 13473 + if not callable(replacements): + indexer = ax.get_indexer_for(replacements) + if errors == "raise" and len(indexer[indexer == -1]): + missing_labels = [ + label + for index, label in enumerate(replacements) + if indexer[index] == -1 + ] + raise KeyError(f"{missing_labels} not found in axis") + + result._data = result._data.rename_axis( + f, axis=baxis, copy=copy, level=level + ) + result._clear_item_cache() + + if inplace: + self._update_inplace(result._data) + return None + else: + return result.__finalize__(self) + + @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) + def rename_axis(self, mapper=lib.no_default, **kwargs): + """ + Set the name of the axis for the index or columns. + + Parameters + ---------- + mapper : scalar, list-like, optional + Value to set the axis name attribute. + index, columns : scalar, list-like, dict-like or function, optional + A scalar, list-like, dict-like or functions transformations to + apply to that axis' values. + + Use either ``mapper`` and ``axis`` to + specify the axis to target with ``mapper``, or ``index`` + and/or ``columns``. + + .. versionchanged:: 0.24.0 + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to rename. + copy : bool, default True + Also copy underlying data. + inplace : bool, default False + Modifies the object directly, instead of creating a new Series + or DataFrame. + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or None if `inplace` is True. + + See Also + -------- + Series.rename : Alter Series index labels or name. + DataFrame.rename : Alter DataFrame index labels or name. + Index.rename : Set new names on index. + + Notes + ----- + ``DataFrame.rename_axis`` supports two calling conventions + + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` + + The first calling convention will only modify the names of + the index and/or the names of the Index object that is the columns. + In this case, the parameter ``copy`` is ignored. + + The second calling convention will modify the names of the + the corresponding index if mapper is a list or a scalar. + However, if mapper is dict-like or a function, it will use the + deprecated behavior of modifying the axis *labels*. + + We *highly* recommend using keyword arguments to clarify your + intent. + + Examples + -------- + **Series** + + >>> s = pd.Series(["dog", "cat", "monkey"]) + >>> s + 0 dog + 1 cat + 2 monkey + dtype: object + >>> s.rename_axis("animal") + animal + 0 dog + 1 cat + 2 monkey + dtype: object + + **DataFrame** + + >>> df = pd.DataFrame({"num_legs": [4, 4, 2], + ... 
"num_arms": [0, 0, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs num_arms + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("animal") + >>> df + num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + >>> df = df.rename_axis("limbs", axis="columns") + >>> df + limbs num_legs num_arms + animal + dog 4 0 + cat 4 0 + monkey 2 2 + + **MultiIndex** + + >>> df.index = pd.MultiIndex.from_product([['mammal'], + ... ['dog', 'cat', 'monkey']], + ... names=['type', 'name']) + >>> df + limbs num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(index={'type': 'class'}) + limbs num_legs num_arms + class name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + + >>> df.rename_axis(columns=str.upper) + LIMBS num_legs num_arms + type name + mammal dog 4 0 + cat 4 0 + monkey 2 2 + """ + axes, kwargs = self._construct_axes_from_arguments( + (), kwargs, sentinel=lib.no_default + ) + copy = kwargs.pop("copy", True) + inplace = kwargs.pop("inplace", False) + axis = kwargs.pop("axis", 0) + if axis is not None: + axis = self._get_axis_number(axis) + + if kwargs: + raise TypeError( + "rename_axis() got an unexpected keyword " + f'argument "{list(kwargs.keys())[0]}"' + ) + + inplace = validate_bool_kwarg(inplace, "inplace") + + if mapper is not lib.no_default: + # Use v0.23 behavior if a scalar or list + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) + if non_mapper: + return self._set_axis_name(mapper, axis=axis, inplace=inplace) + else: + raise ValueError("Use `.rename` to alter labels with a mapper.") + else: + # Use new behavior. Means that index and/or columns + # is specified + result = self if inplace else self.copy(deep=copy) + + for axis in range(self._AXIS_LEN): + v = axes.get(self._AXIS_NAMES[axis]) + if v is lib.no_default: + continue + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) + if non_mapper: + newnames = v + else: + f = com.get_rename_function(v) + curnames = self._get_axis(axis).names + newnames = [f(name) for name in curnames] + result._set_axis_name(newnames, axis=axis, inplace=True) + if not inplace: + return result + + def _set_axis_name(self, name, axis=0, inplace=False): + """ + Set the name(s) of the axis. + + Parameters + ---------- + name : str or list of str + Name(s) to set. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to set the label. The value 0 or 'index' specifies index, + and the value 1 or 'columns' specifies columns. + inplace : bool, default False + If `True`, do operation inplace and return None. + + .. versionadded:: 0.21.0 + + Returns + ------- + Series, DataFrame, or None + The same type as the caller or `None` if `inplace` is `True`. + + See Also + -------- + DataFrame.rename : Alter the axis labels of :class:`DataFrame`. + Series.rename : Alter the index labels or set the index name + of :class:`Series`. + Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`. + + Examples + -------- + >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, + ... ["dog", "cat", "monkey"]) + >>> df + num_legs + dog 4 + cat 4 + monkey 2 + >>> df._set_axis_name("animal") + num_legs + animal + dog 4 + cat 4 + monkey 2 + >>> df.index = pd.MultiIndex.from_product( + ... 
[["mammal"], ['dog', 'cat', 'monkey']]) + >>> df._set_axis_name(["type", "name"]) + legs + type name + mammal dog 4 + cat 4 + monkey 2 + """ + axis = self._get_axis_number(axis) + idx = self._get_axis(axis).set_names(name) + + inplace = validate_bool_kwarg(inplace, "inplace") + renamed = self if inplace else self.copy() + renamed.set_axis(idx, axis=axis, inplace=True) + if not inplace: + return renamed + + # ---------------------------------------------------------------------- + # Comparison Methods + + def _indexed_same(self, other) -> bool: + return all( + self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS + ) + + def equals(self, other): + """ + Test whether two objects contain the same elements. + + This function allows two Series or DataFrames to be compared against + each other to see if they have the same shape and elements. NaNs in + the same location are considered equal. The column headers do not + need to have the same type, but the elements within the columns must + be the same dtype. + + Parameters + ---------- + other : Series or DataFrame + The other Series or DataFrame to be compared with the first. + + Returns + ------- + bool + True if all elements are the same in both objects, False + otherwise. + + See Also + -------- + Series.eq : Compare two Series objects of the same length + and return a Series where each element is True if the element + in each Series is equal, False otherwise. + DataFrame.eq : Compare two DataFrame objects of the same shape and + return a DataFrame where each element is True if the respective + element in each DataFrame is equal, False otherwise. + testing.assert_series_equal : Raises an AssertionError if left and + right are not equal. Provides an easy interface to ignore + inequality in dtypes, indexes and precision among others. + testing.assert_frame_equal : Like assert_series_equal, but targets + DataFrames. + numpy.array_equal : Return True if two arrays have the same shape + and elements, False otherwise. + + Notes + ----- + This function requires that the elements have the same dtype as their + respective elements in the other Series or DataFrame. However, the + column labels do not need to have the same type, as long as they are + still considered equal. + + Examples + -------- + >>> df = pd.DataFrame({1: [10], 2: [20]}) + >>> df + 1 2 + 0 10 20 + + DataFrames df and exactly_equal have the same types and values for + their elements and column labels, which will return True. + + >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]}) + >>> exactly_equal + 1 2 + 0 10 20 + >>> df.equals(exactly_equal) + True + + DataFrames df and different_column_type have the same element + types and values, but have different types for the column labels, + which will still return True. + + >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]}) + >>> different_column_type + 1.0 2.0 + 0 10 20 + >>> df.equals(different_column_type) + True + + DataFrames df and different_data_type have different types for the + same values for their elements, and will return False even though + their column labels are the same values and types. 
+ + >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]}) + >>> different_data_type + 1 2 + 0 10.0 20.0 + >>> df.equals(different_data_type) + False + """ + if not isinstance(other, self._constructor): + return False + return self._data.equals(other._data) + + # ------------------------------------------------------------------------- + # Unary Methods + + def __neg__(self): + values = com.values_from_object(self) + if is_bool_dtype(values): + arr = operator.inv(values) + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): + arr = operator.neg(values) + else: + raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}") + return self.__array_wrap__(arr) + + def __pos__(self): + values = com.values_from_object(self) + if is_bool_dtype(values) or is_period_arraylike(values): + arr = values + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): + arr = operator.pos(values) + else: + raise TypeError(f"Unary plus expects numeric dtype, not {values.dtype}") + return self.__array_wrap__(arr) + + def __invert__(self): + if not self.size: + # inv fails with 0 len + return self + + arr = operator.inv(com.values_from_object(self)) + return self.__array_wrap__(arr) + + def __nonzero__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + __bool__ = __nonzero__ + + def bool(self): + """ + Return the bool of a single element PandasObject. + + This must be a boolean scalar value, either True or False. Raise a + ValueError if the PandasObject does not have exactly 1 element, or that + element is not boolean + + Returns + ------- + bool + Same single boolean value converted to bool type. + """ + v = self.squeeze() + if isinstance(v, (bool, np.bool_)): + return bool(v) + elif is_scalar(v): + raise ValueError( + "bool cannot act on a non-boolean single element " + f"{type(self).__name__}" + ) + + self.__nonzero__() + + def __abs__(self: FrameOrSeries) -> FrameOrSeries: + return self.abs() + + def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: + return self.round(decimals) + + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. + + def _is_level_reference(self, key, axis=0): + """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key : str + Potential level name for the given axis + axis : int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level : bool + """ + axis = self._get_axis_number(axis) + + return ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and not self._is_label_reference(key, axis=axis) + ) + + def _is_label_reference(self, key, axis=0) -> bool_t: + """ + Test whether a key is a label reference for a given axis. 
+ + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key: str + Potential label name + axis: int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool + """ + axis = self._get_axis_number(axis) + other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) + + return ( + key is not None + and is_hashable(key) + and any(key in self.axes[ax] for ax in other_axes) + ) + + def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: + """ + Test whether a key is a label or level reference for a given axis. + + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key: str + Potential label or level name + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_label_or_level: bool + """ + return self._is_level_reference(key, axis=axis) or self._is_label_reference( + key, axis=axis + ) + + def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: + """ + Check whether `key` is ambiguous. + + By ambiguous, we mean that it matches both a level of the input + `axis` and a label of the other axis. + + Parameters + ---------- + key: str or object + Label or level name. + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns). + + Raises + ------ + ValueError: `key` is ambiguous + """ + axis = self._get_axis_number(axis) + other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) + + if ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and any(key in self.axes[ax] for ax in other_axes) + ): + + # Build an informative and grammatical warning + level_article, level_type = ( + ("an", "index") if axis == 0 else ("a", "column") + ) + + label_article, label_type = ( + ("a", "column") if axis == 0 else ("an", "index") + ) + + msg = ( + f"'{key}' is both {level_article} {level_type} level and " + f"{label_article} {label_type} label, which is ambiguous." + ) + raise ValueError(msg) + + def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: + """ + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key: str + Label or level name. + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + + if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + multi_message = ( + "\n" + "For a multi-index, the label must be a " + "tuple with elements corresponding to " + "each level." + ) + else: + multi_message = "" + + label_axis_name = "column" if axis == 0 else "index" + raise ValueError( + ( + f"The {label_axis_name} label '{key}' " + f"is not unique.{multi_message}" + ) + ) + + return values + + def _drop_labels_or_levels(self, keys, axis: int = 0): + """ + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. + + Parameters + ---------- + keys: str or list of str + labels or levels to drop + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + dropped: DataFrame + + Raises + ------ + ValueError + if any `keys` match neither a label nor a level + """ + axis = self._get_axis_number(axis) + + # Validate keys + keys = com.maybe_make_list(keys) + invalid_keys = [ + k for k in keys if not self._is_label_or_level_reference(k, axis=axis) + ] + + if invalid_keys: + raise ValueError( + ( + "The following keys are not valid labels or " + f"levels for axis {axis}: {invalid_keys}" + ) + ) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)] + + labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy() + + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping columns labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) + + return dropped + + # ---------------------------------------------------------------------- + # Iteration + + def __hash__(self): + raise TypeError( + f"{repr(type(self).__name__)} objects are mutable, " + f"thus they cannot be hashed" + ) + + def __iter__(self): + """ + Iterate over info axis. + + Returns + ------- + iterator + Info axis as iterator. + """ + return iter(self._info_axis) + + # can we get a better explanation of this? 
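+ # A brief illustrative sketch of the "info axis" returned by ``keys`` below:
+ # it is the index for a Series and the columns for a DataFrame, e.g.
+ #   pd.Series([1, 2]).keys()          # RangeIndex(start=0, stop=2, step=1)
+ #   pd.DataFrame({"a": [1]}).keys()   # Index(['a'], dtype='object')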
+ def keys(self): + """ + Get the 'info axis' (see Indexing for more). + + This is index for Series, columns for DataFrame. + + Returns + ------- + Index + Info axis. + """ + return self._info_axis + + def items(self): + """Iterate over (label, values) on info axis + + This is index for Series and columns for DataFrame. + + Returns + ------- + Generator + """ + for h in self._info_axis: + yield h, self[h] + + @Appender(items.__doc__) + def iteritems(self): + return self.items() + + def __len__(self) -> int: + """Returns length of info axis""" + return len(self._info_axis) + + def __contains__(self, key) -> bool_t: + """True if the key is in the info axis""" + return key in self._info_axis + + @property + def empty(self) -> bool_t: + """ + Indicator whether DataFrame is empty. + + True if DataFrame is entirely empty (no items), meaning any of the + axes are of length 0. + + Returns + ------- + bool + If DataFrame is empty, return True, if not return False. + + See Also + -------- + Series.dropna + DataFrame.dropna + + Notes + ----- + If DataFrame contains only NaNs, it is still not considered empty. See + the example below. + + Examples + -------- + An example of an actual empty DataFrame. Notice the index is empty: + + >>> df_empty = pd.DataFrame({'A' : []}) + >>> df_empty + Empty DataFrame + Columns: [A] + Index: [] + >>> df_empty.empty + True + + If we only have NaNs in our DataFrame, it is not considered empty! We + will need to drop the NaNs to make the DataFrame empty: + + >>> df = pd.DataFrame({'A' : [np.nan]}) + >>> df + A + 0 NaN + >>> df.empty + False + >>> df.dropna().empty + True + """ + return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS) + + # ---------------------------------------------------------------------- + # Array Interface + + # This is also set in IndexOpsMixin + # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented + __array_priority__ = 1000 + + def __array__(self, dtype=None) -> np.ndarray: + return com.values_from_object(self) + + def __array_wrap__(self, result, context=None): + result = lib.item_from_zerodim(result) + if is_scalar(result): + # e.g. we get here with np.ptp(series) + # ptp also requires the item_from_zerodim + return result + d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) + return self._constructor(result, **d).__finalize__(self) + + # ideally we would define this to avoid the getattr checks, but + # is slower + # @property + # def __array_interface__(self): + # """ provide numpy array interface method """ + # values = self.values + # return dict(typestr=values.dtype.str,shape=values.shape,data=values) + + # ---------------------------------------------------------------------- + # Picklability + + def __getstate__(self) -> Dict[str, Any]: + meta = {k: getattr(self, k, None) for k in self._metadata} + return dict( + _data=self._data, + _typ=self._typ, + _metadata=self._metadata, + attrs=self.attrs, + **meta, + ) + + def __setstate__(self, state): + + if isinstance(state, BlockManager): + self._data = state + elif isinstance(state, dict): + typ = state.get("_typ") + if typ is not None: + attrs = state.get("_attrs", {}) + object.__setattr__(self, "_attrs", attrs) + + # set in the order of internal names + # to avoid definitional recursion + # e.g. 
say fill_value needing _data to be + # defined + meta = set(self._internal_names + self._metadata) + for k in list(meta): + if k in state: + v = state[k] + object.__setattr__(self, k, v) + + for k, v in state.items(): + if k not in meta: + object.__setattr__(self, k, v) + + else: + self._unpickle_series_compat(state) + elif len(state) == 2: + self._unpickle_series_compat(state) + + self._item_cache = {} + + # ---------------------------------------------------------------------- + # Rendering Methods + + def __repr__(self) -> str: + # string representation based upon iterating over self + # (since, by definition, `PandasContainers` are iterable) + prepr = f"[{','.join(map(pprint_thing, self))}]" + return f"{type(self).__name__}({prepr})" + + def _repr_latex_(self): + """ + Returns a LaTeX representation for a particular object. + Mainly for use with nbconvert (jupyter notebook conversion to pdf). + """ + if config.get_option("display.latex.repr"): + return self.to_latex() + else: + return None + + def _repr_data_resource_(self): + """ + Not a real Jupyter special repr method, but we use the same + naming convention. + """ + if config.get_option("display.html.table_schema"): + data = self.head(config.get_option("display.max_rows")) + payload = json.loads( + data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict + ) + return payload + + # ---------------------------------------------------------------------- + # I/O Methods + + _shared_docs[ + "to_markdown" + ] = """ + Print %(klass)s in Markdown-friendly format. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + mode : str, optional + Mode in which file is opened. + **kwargs + These parameters will be passed to `tabulate`. + + Returns + ------- + str + %(klass)s in Markdown-friendly format. + """ + + _shared_docs[ + "to_excel" + ] = """ + Write %(klass)s to an Excel sheet. + + To write a single %(klass)s to an Excel .xlsx file it is only necessary to + specify a target file name. To write to multiple sheets it is necessary to + create an `ExcelWriter` object with a target file name, and specify a sheet + in the file to write to. + + Multiple sheets may be written to by specifying unique `sheet_name`. + With all data written to the file it is necessary to save the changes. + Note that creating an `ExcelWriter` object with a file name that already + exists will result in the contents of the existing file being erased. + + Parameters + ---------- + excel_writer : str or ExcelWriter object + File path or existing ExcelWriter. + sheet_name : str, default 'Sheet1' + Name of sheet which will contain DataFrame. + na_rep : str, default '' + Missing data representation. + float_format : str, optional + Format string for floating point numbers. For example + ``float_format="%%.2f"`` will format 0.1234 to 0.12. + columns : sequence or list of str, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of string is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, optional + Column label for index column(s) if desired. If not specified, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. 
+ startrow : int, default 0 + Upper left cell row to dump data frame. + startcol : int, default 0 + Upper left cell column to dump data frame. + engine : str, optional + Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this + via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and + ``io.excel.xlsm.writer``. + merge_cells : bool, default True + Write MultiIndex and Hierarchical Rows as merged cells. + encoding : str, optional + Encoding of the resulting excel file. Only necessary for xlwt, + other writers support unicode natively. + inf_rep : str, default 'inf' + Representation for infinity (there is no native representation for + infinity in Excel). + verbose : bool, default True + Display more information in the error logs. + freeze_panes : tuple of int (length 2), optional + Specifies the one-based bottommost row and rightmost column that + is to be frozen. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. + ExcelWriter : Class for writing DataFrame objects into excel sheets. + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Notes + ----- + For compatibility with :meth:`~DataFrame.to_csv`, + to_excel serializes lists and dicts to strings before writing. + + Once a workbook has been saved it is not possible write further data + without rewriting the whole workbook. + + Examples + -------- + + Create, write to and save a workbook: + + >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df1.to_excel("output.xlsx") # doctest: +SKIP + + To specify the sheet name: + + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP + + If you wish to write to more than one sheet in the workbook, it is + necessary to specify an ExcelWriter object: + + >>> df2 = df1.copy() + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name='Sheet_name_1') + ... df2.to_excel(writer, sheet_name='Sheet_name_2') + + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... 
df.to_excel(writer, sheet_name='Sheet_name_3') + + To set the library that is used to write the Excel file, + you can pass the `engine` keyword (the default engine is + automatically chosen depending on the file extension): + + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + """ + + @Appender(_shared_docs["to_excel"] % dict(klass="object")) + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ) -> None: + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + from pandas.io.formats.excel import ExcelFormatter + + formatter = ExcelFormatter( + df, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) + + def to_json( + self, + path_or_buf: Optional[FilePathOrBuffer] = None, + orient: Optional[str] = None, + date_format: Optional[str] = None, + double_precision: int = 10, + force_ascii: bool_t = True, + date_unit: str = "ms", + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + lines: bool_t = False, + compression: Optional[str] = "infer", + index: bool_t = True, + indent: Optional[int] = None, + ) -> Optional[str]: + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + path_or_buf : str or file handle, optional + File path or object. If not specified, the result is returned as + a string. + orient : str + Indication of expected JSON string format. + + * Series: + + - default is 'index' + - allowed values are: {'split','records','index','table'}. + + * DataFrame: + + - default is 'columns' + - allowed values are: {'split', 'records', 'index', 'columns', + 'values', 'table'}. + + * The format of the JSON string: + + - 'split' : dict like {'index' -> [index], 'columns' -> [columns], + 'data' -> [values]} + - 'records' : list like [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + - 'columns' : dict like {column -> {index -> value}} + - 'values' : just the values array + - 'table' : dict like {'schema': {schema}, 'data': {data}} + + Describing the data, where data component is like ``orient='records'``. + + .. versionchanged:: 0.20.0 + + date_format : {None, 'epoch', 'iso'} + Type of date conversion. 'epoch' = epoch milliseconds, + 'iso' = ISO8601. The default depends on the `orient`. For + ``orient='table'``, the default is 'iso'. For all other orients, + the default is 'epoch'. + double_precision : int, default 10 + The number of decimal places to use when encoding + floating point values. + force_ascii : bool, default True + Force encoded string to be ASCII. + date_unit : str, default 'ms' (milliseconds) + The time unit to encode to, governs timestamp and ISO8601 + precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, + microsecond, and nanosecond respectively. + default_handler : callable, default None + Handler to call if object cannot otherwise be converted to a + suitable format for JSON. 
Should receive a single argument which is + the object to convert and return a serialisable object. + lines : bool, default False + If 'orient' is 'records' write out line delimited json format. Will + throw ValueError if incorrect 'orient' since others are not list + like. + + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + + A string representing the compression to use in the output file, + only used when the first argument is a filename. By default, the + compression is inferred from the filename. + + .. versionadded:: 0.21.0 + .. versionchanged:: 0.24.0 + 'infer' option added and set to default + index : bool, default True + Whether to include the index values in the JSON string. Not + including the index (``index=False``) is only supported when + orient is 'split' or 'table'. + + .. versionadded:: 0.23.0 + + indent : int, optional + Length of whitespace used to indent each record. + + .. versionadded:: 1.0.0 + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. + + See Also + -------- + read_json + + Notes + ----- + The behavior of ``indent=0`` varies from the stdlib, which does not + indent the output but does insert newlines. Currently, ``indent=0`` + and the default ``indent=None`` are equivalent in pandas, though this + may change in a future release. + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + + Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: + + >>> df.to_json(orient='columns') + '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}' + + Encoding/decoding a Dataframe using ``'values'`` formatted JSON: + + >>> df.to_json(orient='values') + '[["a","b"],["c","d"]]' + + Encoding with Table Schema + + >>> df.to_json(orient='table') + '{"schema": {"fields": [{"name": "index", "type": "string"}, + {"name": "col 1", "type": "string"}, + {"name": "col 2", "type": "string"}], + "primaryKey": "index", + "pandas_version": "0.20.0"}, + "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, + {"index": "row 2", "col 1": "c", "col 2": "d"}]}' + """ + + from pandas.io import json + + if date_format is None and orient == "table": + date_format = "iso" + elif date_format is None: + date_format = "epoch" + + config.is_nonnegative_int(indent) + indent = indent or 0 + + return json.to_json( + path_or_buf=path_or_buf, + obj=self, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, + indent=indent, + ) + + def to_hdf( + self, + path_or_buf, + key: str, + mode: str = "a", + complevel: Optional[int] = None, + complib: Optional[str] = None, + append: bool_t = False, + format: Optional[str] = None, + index: bool_t = True, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + 
nan_rep=None, + dropna: Optional[bool_t] = None, + data_columns: Optional[List[str]] = None, + errors: str = "strict", + encoding: str = "UTF-8", + ) -> None: + """ + Write the contained data to an HDF5 file using HDFStore. + + Hierarchical Data Format (HDF) is self-describing, allowing an + application to interpret the structure and contents of a file with + no outside information. One HDF file can hold a mix of related objects + which can be accessed as a group or as individual objects. + + In order to add another DataFrame or Series to an existing HDF file + please use append mode and a different a key. + + For more information see the :ref:`user guide `. + + Parameters + ---------- + path_or_buf : str or pandas.HDFStore + File path or HDFStore object. + key : str + Identifier for the group in the store. + mode : {'a', 'w', 'r+'}, default 'a' + Mode to open file: + + - 'w': write, a new file is created (an existing file with + the same name would be deleted). + - 'a': append, an existing file is opened for reading and + writing, and if the file does not exist it is created. + - 'r+': similar to 'a', but the file must already exist. + complevel : {0-9}, optional + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. + append : bool, default False + For Table formats, append the input data to the existing. + format : {'fixed', 'table', None}, default 'fixed' + Possible values: + + - 'fixed': Fixed format. Fast writing/reading. Not-appendable, + nor searchable. + - 'table': Table format. Write as a PyTables Table structure + which may perform worse but allow more flexible operations + like searching / selecting subsets of the data. + - If None, pd.get_option('io.hdf.default_format') is checked, + followed by fallback to "fixed" + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + encoding : str, default "UTF-8" + min_itemsize : dict or int, optional + Map column names to minimum string sizes for columns. + nan_rep : Any, optional + How to represent null values as str. + Not allowed with append=True. + data_columns : list of columns or True, optional + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See :ref:`io.hdf5-query-data-columns`. + Applicable only to format='table'. + + See Also + -------- + DataFrame.read_hdf : Read from HDF file. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_feather : Write out feather-format for DataFrames. + DataFrame.to_csv : Write out to a csv file. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + ... 
index=['a', 'b', 'c']) + >>> df.to_hdf('data.h5', key='df', mode='w') + + We can add another object to the same file: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.to_hdf('data.h5', key='s') + + Reading from HDF file: + + >>> pd.read_hdf('data.h5', 'df') + A B + a 1 4 + b 2 5 + c 3 6 + >>> pd.read_hdf('data.h5', 's') + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + Deleting file with data: + + >>> import os + >>> os.remove('data.h5') + """ + from pandas.io import pytables + + pytables.to_hdf( + path_or_buf, + key, + self, + mode=mode, + complevel=complevel, + complib=complib, + append=append, + format=format, + index=index, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + dropna=dropna, + data_columns=data_columns, + errors=errors, + encoding=encoding, + ) + + def to_sql( + self, + name: str, + con, + schema=None, + if_exists: str = "fail", + index: bool_t = True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ) -> None: + """ + Write records stored in a DataFrame to a SQL database. + + Databases supported by SQLAlchemy [1]_ are supported. Tables can be + newly created, appended to, or overwritten. + + Parameters + ---------- + name : str + Name of SQL table. + con : sqlalchemy.engine.Engine or sqlite3.Connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here \ + `_ + + schema : str, optional + Specify the schema (if database flavor supports this). If None, use + default schema. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + How to behave if the table already exists. + + * fail: Raise a ValueError. + * replace: Drop the table before inserting new values. + * append: Insert new values to the existing table. + + index : bool, default True + Write DataFrame index as a column. Uses `index_label` as the column + name in the table. + index_label : str or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + + Raises + ------ + ValueError + When the table already exists and `if_exists` is 'fail' (the + default). + + See Also + -------- + read_sql : Read a DataFrame from a table. + + Notes + ----- + Timezone aware datetime columns will be written as + ``Timestamp with timezone`` type with SQLAlchemy if supported by the + database. Otherwise, the datetimes will be stored as timezone unaware + timestamps local to the original timezone. + + .. 
versionadded:: 0.24.0 + + References + ---------- + .. [1] http://docs.sqlalchemy.org + .. [2] https://www.python.org/dev/peps/pep-0249/ + + Examples + -------- + + Create an in-memory SQLite database. + + >>> from sqlalchemy import create_engine + >>> engine = create_engine('sqlite://', echo=False) + + Create a table from scratch with 3 rows. + + >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']}) + >>> df + name + 0 User 1 + 1 User 2 + 2 User 3 + + >>> df.to_sql('users', con=engine) + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] + + >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) + >>> df1.to_sql('users', con=engine, if_exists='append') + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), + (0, 'User 4'), (1, 'User 5')] + + Overwrite the table with just ``df1``. + + >>> df1.to_sql('users', con=engine, if_exists='replace', + ... index_label='id') + >>> engine.execute("SELECT * FROM users").fetchall() + [(0, 'User 4'), (1, 'User 5')] + + Specify the dtype (especially useful for integers with missing values). + Notice that while pandas is forced to store the data as floating point, + the database supports nullable integers. When fetching the data with + Python, we get back integer scalars. + + >>> df = pd.DataFrame({"A": [1, None, 2]}) + >>> df + A + 0 1.0 + 1 NaN + 2 2.0 + + >>> from sqlalchemy.types import Integer + >>> df.to_sql('integers', con=engine, index=False, + ... dtype={"A": Integer()}) + + >>> engine.execute("SELECT * FROM integers").fetchall() + [(1,), (None,), (2,)] + """ + from pandas.io import sql + + sql.to_sql( + self, + name, + con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + def to_pickle( + self, + path, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, + ) -> None: + """ + Pickle (serialize) object to file. + + Parameters + ---------- + path : str + File path where the pickled object will be stored. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ + default 'infer' + A string representing the compression to use in the output file. By + default, infers from the file extension in specified path. + protocol : int + Int which indicates which protocol should be used by the pickler, + default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible + values are 0, 1, 2, 3, 4. A negative value for the protocol + parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + + .. [1] https://docs.python.org/3/library/pickle.html. + .. versionadded:: 0.21.0. + + See Also + -------- + read_pickle : Load pickled pandas object (or any object) from file. + DataFrame.to_hdf : Write DataFrame to an HDF5 file. + DataFrame.to_sql : Write DataFrame to a SQL database. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. 
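+
+ Notes
+ -----
+ With ``compression='infer'`` (the default) the compression is deduced
+ from the path extension, such as ``'.gz'``, ``'.bz2'``, ``'.zip'`` or
+ ``'.xz'``. A minimal sketch with a hypothetical file name:
+
+ >>> df = pd.DataFrame({"foo": range(3)})
+ >>> df.to_pickle("./dummy.pkl.gz")  # gzip inferred from the ".gz" suffix
+ >>> import os
+ >>> os.remove("./dummy.pkl.gz")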
+ + Examples + -------- + >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) + >>> original_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> original_df.to_pickle("./dummy.pkl") + + >>> unpickled_df = pd.read_pickle("./dummy.pkl") + >>> unpickled_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + + >>> import os + >>> os.remove("./dummy.pkl") + """ + from pandas.io.pickle import to_pickle + + to_pickle(self, path, compression=compression, protocol=protocol) + + def to_clipboard( + self, excel: bool_t = True, sep: Optional[str] = None, **kwargs + ) -> None: + r""" + Copy object to the system clipboard. + + Write a text representation of object to the system clipboard. + This can be pasted into Excel, for example. + + Parameters + ---------- + excel : bool, default True + Produce output in a csv format for easy pasting into excel. + + - True, use the provided separator for csv pasting. + - False, write a string representation of the object to the clipboard. + + sep : str, default ``'\t'`` + Field delimiter. + **kwargs + These parameters will be passed to DataFrame.to_csv. + + See Also + -------- + DataFrame.to_csv : Write a DataFrame to a comma-separated values + (csv) file. + read_clipboard : Read text from clipboard and pass to read_table. + + Notes + ----- + Requirements for your platform. + + - Linux : `xclip`, or `xsel` (with `PyQt4` modules) + - Windows : none + - OS X : none + + Examples + -------- + Copy the contents of a DataFrame to the clipboard. + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) + >>> df.to_clipboard(sep=',') + ... # Wrote the following to the system clipboard: + ... # ,A,B,C + ... # 0,1,2,3 + ... # 1,4,5,6 + + We can omit the the index by passing the keyword `index` and setting + it to false. + + >>> df.to_clipboard(sep=',', index=False) + ... # Wrote the following to the system clipboard: + ... # A,B,C + ... # 1,2,3 + ... # 4,5,6 + """ + from pandas.io import clipboards + + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) + + def to_xarray(self): + """ + Return an xarray object from the pandas object. + + Returns + ------- + xarray.DataArray or xarray.Dataset + Data in the pandas structure converted to Dataset if the object is + a DataFrame, or a DataArray if the object is a Series. + + See Also + -------- + DataFrame.to_hdf : Write DataFrame to an HDF5 file. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + + Notes + ----- + See the `xarray docs `__ + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), + ... ('parrot', 'bird', 24.0, 2), + ... ('lion', 'mammal', 80.5, 4), + ... ('monkey', 'mammal', np.nan, 4)], + ... columns=['name', 'class', 'max_speed', + ... 'num_legs']) + >>> df + name class max_speed num_legs + 0 falcon bird 389.0 2 + 1 parrot bird 24.0 2 + 2 lion mammal 80.5 4 + 3 monkey mammal NaN 4 + + >>> df.to_xarray() + + Dimensions: (index: 4) + Coordinates: + * index (index) int64 0 1 2 3 + Data variables: + name (index) object 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 389.0 24.0 80.5 nan + num_legs (index) int64 2 2 4 4 + + >>> df['max_speed'].to_xarray() + + array([389. , 24. , 80.5, nan]) + Coordinates: + * index (index) int64 0 1 2 3 + + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01', + ... '2018-01-02', '2018-01-02']) + >>> df_multiindex = pd.DataFrame({'date': dates, + ... 'animal': ['falcon', 'parrot', + ... 'falcon', 'parrot'], + ... 
'speed': [350, 18, 361, 15]}) + >>> df_multiindex = df_multiindex.set_index(['date', 'animal']) + + >>> df_multiindex + speed + date animal + 2018-01-01 falcon 350 + parrot 18 + 2018-01-02 falcon 361 + parrot 15 + + >>> df_multiindex.to_xarray() + + Dimensions: (animal: 2, date: 2) + Coordinates: + * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * animal (animal) object 'falcon' 'parrot' + Data variables: + speed (date, animal) int64 350 18 361 15 + """ + xarray = import_optional_dependency("xarray") + + if self.ndim == 1: + return xarray.DataArray.from_series(self) + else: + return xarray.Dataset.from_dataframe(self) + + @Substitution(returns=fmt.return_docstring) + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + caption=None, + label=None, + ): + r""" + Render object to a LaTeX tabular, longtable, or nested table/tabular. + + Requires ``\usepackage{booktabs}``. The output can be copy/pasted + into a main LaTeX document or read from an external file + with ``\input{table.tex}``. + + .. versionchanged:: 0.20.2 + Added to Series. + + .. versionchanged:: 1.0.0 + Added caption and label arguments. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + columns : list of label, optional + The subset of columns to write. Writes all columns by default. + col_space : int, optional + The minimum width of each column. + header : bool or list of str, default True + Write out the column names. If a list of strings is given, + it is assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + na_rep : str, default 'NaN' + Missing data representation. + formatters : list of functions or dict of {str: function}, optional + Formatter functions to apply to columns' elements by position or + name. The result of each function must be a unicode string. + List must be of length equal to the number of columns. + float_format : one-parameter function or str, optional, default None + Formatter for floating point numbers. For example + ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will + both result in 0.1234 being formatted as 0.12. + sparsify : bool, optional + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. By default, the value will be + read from the config module. + index_names : bool, default True + Prints the names of the indexes. + bold_rows : bool, default False + Make the row labels bold in the output. + column_format : str, optional + The columns format as specified in `LaTeX table format + `__ e.g. 'rcl' for 3 + columns. By default, 'l' will be used for all columns except + columns of numbers, which default to 'r'. + longtable : bool, optional + By default, the value will be read from the pandas config + module. Use a longtable environment instead of tabular. Requires + adding a \usepackage{longtable} to your LaTeX preamble. + escape : bool, optional + By default, the value will be read from the pandas config + module. When set to False prevents from escaping latex special + characters in column names. 
+ encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + multicolumn : bool, default True + Use \multicolumn to enhance MultiIndex columns. + The default will be read from the config module. + multicolumn_format : str, default 'l' + The alignment for multicolumns, similar to `column_format` + The default will be read from the config module. + multirow : bool, default False + Use \multirow to enhance MultiIndex rows. Requires adding a + \usepackage{multirow} to your LaTeX preamble. Will print + centered labels (instead of top-aligned) across the contained + rows, separating groups via clines. The default will be read + from the pandas config module. + caption : str, optional + The LaTeX caption to be placed inside ``\caption{}`` in the output. + + .. versionadded:: 1.0.0 + + label : str, optional + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. + + .. versionadded:: 1.0.0 + %(returns)s + See Also + -------- + DataFrame.to_string : Render a DataFrame to a console-friendly + tabular output. + DataFrame.to_html : Render a DataFrame as an HTML table. + + Examples + -------- + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}) + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + \begin{tabular}{lll} + \toprule + name & mask & weapon \\ + \midrule + Raphael & red & sai \\ + Donatello & purple & bo staff \\ + \bottomrule + \end{tabular} + """ + # Get defaults from the pandas config + if self.ndim == 1: + self = self.to_frame() + if longtable is None: + longtable = config.get_option("display.latex.longtable") + if escape is None: + escape = config.get_option("display.latex.escape") + if multicolumn is None: + multicolumn = config.get_option("display.latex.multicolumn") + if multicolumn_format is None: + multicolumn_format = config.get_option("display.latex.multicolumn_format") + if multirow is None: + multirow = config.get_option("display.latex.multirow") + + formatter = DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape, + decimal=decimal, + ) + return formatter.to_latex( + buf=buf, + column_format=column_format, + longtable=longtable, + encoding=encoding, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + ) + + def to_csv( + self, + path_or_buf: Optional[FilePathOrBuffer] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Optional[Hashable]]] = None, + header: Union[bool_t, List[str]] = True, + index: bool_t = True, + index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Optional[Union[str, Mapping[str, str]]] = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool_t = True, + escapechar: Optional[str] = None, + decimal: Optional[str] = ".", + ) -> Optional[str]: + r""" + Write object to a comma-separated 
values (csv) file. + + .. versionchanged:: 0.24.0 + The order of arguments for Series was changed. + + Parameters + ---------- + path_or_buf : str or file handle, default None + File path or object, if None is provided the result is returned as + a string. If a file object is passed it should be opened with + `newline=''`, disabling universal newlines. + + .. versionchanged:: 0.24.0 + + Was previously named "path" for Series. + + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, default None + Format string for floating point numbers. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + + .. versionchanged:: 0.24.0 + + Previously defaulted to False for Series. + + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : str + Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'utf-8'. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other entries as additional compression options if + compression mode is 'zip'. + + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + line_terminator : str, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\n' for linux, '\r\n' for Windows, i.e.). + + .. versionchanged:: 0.24.0 + chunksize : int or None + Rows to write at a time. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. + escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + to_excel : Write DataFrame to an Excel file. 
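The quoting note above is easy to misread, so here is a minimal sketch of how float_format interacts with csv.QUOTE_NONNUMERIC; only the standard-library csv module and pandas are used, and the frame and values are illustrative:

import csv
import pandas as pd

df = pd.DataFrame({"price": [1.5, 2.25]})

# The values stay numeric, so QUOTE_NONNUMERIC quotes only the header string.
print(df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC))

# float_format renders the floats to strings first, so the same quoting
# option now wraps the values in quotes as well.
print(df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC, float_format="%.2f"))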
+ + Examples + -------- + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}) + >>> df.to_csv(index=False) + 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... compression=compression_opts) # doctest: +SKIP + """ + + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + from pandas.io.formats.csvs import CSVFormatter + + formatter = CSVFormatter( + df, + path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + compression=compression, + quoting=quoting, + na_rep=na_rep, + float_format=float_format, + cols=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + decimal=decimal, + ) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + + return None + + # ---------------------------------------------------------------------- + # Fancy Indexing + + @classmethod + def _create_indexer(cls, name: str, indexer) -> None: + """Create an indexer like _name in the class. + + Kept for compatibility with geopandas. To be removed in the future. See GH27258 + """ + if getattr(cls, name, None) is None: + _indexer = functools.partial(indexer, name) + setattr(cls, name, property(_indexer, doc=indexer.__doc__)) + + # ---------------------------------------------------------------------- + # Lookup Caching + + def _set_as_cached(self, item, cacher) -> None: + """Set the _cacher attribute on the calling object with a weakref to + cacher. + """ + self._cacher = (item, weakref.ref(cacher)) + + def _reset_cacher(self) -> None: + """Reset the cacher.""" + if hasattr(self, "_cacher"): + del self._cacher + + def _maybe_cache_changed(self, item, value) -> None: + """The object has called back to us saying maybe it has changed. + """ + self._data.set(item, value) + + @property + def _is_cached(self) -> bool_t: + """Return boolean indicating if self is cached or not.""" + return getattr(self, "_cacher", None) is not None + + def _get_cacher(self): + """return my cacher or None""" + cacher = getattr(self, "_cacher", None) + if cacher is not None: + cacher = cacher[1]() + return cacher + + def _maybe_update_cacher( + self, clear: bool_t = False, verify_is_copy: bool_t = True + ) -> None: + """ + See if we need to update our parent cacher if clear, then clear our + cache. + + Parameters + ---------- + clear : bool, default False + Clear the item cache. + verify_is_copy : bool, default True + Provide is_copy checks. + """ + + cacher = getattr(self, "_cacher", None) + if cacher is not None: + ref = cacher[1]() + + # we are trying to reference a dead referant, hence + # a copy + if ref is None: + del self._cacher + else: + # Note: we need to call ref._maybe_cache_changed even in the + # case where it will raise. 
(Uh, not clear why) + try: + ref._maybe_cache_changed(cacher[0], self) + except AssertionError: + # ref._data.setitem can raise + # AssertionError because of shape mismatch + pass + + if verify_is_copy: + self._check_setitem_copy(stacklevel=5, t="referant") + + if clear: + self._clear_item_cache() + + def _clear_item_cache(self) -> None: + self._item_cache.clear() + + # ---------------------------------------------------------------------- + # Indexing Methods + + def take( + self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs + ) -> FrameOrSeries: + """ + Return the elements in the given *positional* indices along an axis. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + is_copy : bool + Before pandas 1.0, ``is_copy=False`` can be specified to ensure + that the return value is an actual copy. Starting with pandas 1.0, + ``take`` always returns a copy, and the keyword is therefore + deprecated. + + .. deprecated:: 1.0.0 + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + taken : same type as caller + An array-like containing the elements taken from the object. + + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... columns=['name', 'class', 'max_speed'], + ... index=[0, 2, 3, 1]) + >>> df + name class max_speed + 0 falcon bird 389.0 + 2 parrot bird 24.0 + 3 lion mammal 80.5 + 1 monkey mammal NaN + + Take elements at positions 0 and 3 along the axis 0 (default). + + Note how the actual indices selected (0 and 1) do not correspond to + our selected indices 0 and 3. That's because we are selecting the 0th + and 3rd rows, not rows whose indices equal 0 and 3. + + >>> df.take([0, 3]) + name class max_speed + 0 falcon bird 389.0 + 1 monkey mammal NaN + + Take elements at indices 1 and 2 along the axis 1 (column selection). + + >>> df.take([1, 2], axis=1) + class max_speed + 0 bird 389.0 + 2 bird 24.0 + 3 mammal 80.5 + 1 mammal NaN + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> df.take([-1, -2]) + name class max_speed + 1 monkey mammal NaN + 3 lion mammal 80.5 + """ + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. 
" + "'take' always returns a copy, so there is no need to specify this.", + FutureWarning, + stacklevel=2, + ) + + nv.validate_take(tuple(), kwargs) + + self._consolidate_inplace() + + new_data = self._data.take( + indices, axis=self._get_block_manager_axis(axis), verify=True + ) + return self._constructor(new_data).__finalize__(self) + + def _take_with_is_copy( + self: FrameOrSeries, indices, axis=0, **kwargs + ) -> FrameOrSeries: + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). + + See the docstring of `take` for full explanation of the parameters. + """ + result = self.take(indices=indices, axis=axis, **kwargs) + # Maybe set copy if we didn't actually change the index. + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + return result + + def xs(self, key, axis=0, level=None, drop_level: bool_t = True): + """ + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. + + Parameters + ---------- + key : label or tuple of label + Label contained in the index, or partially in a MultiIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to retrieve cross-section on. + level : object, defaults to first n levels (n=1 or len(key)) + In case of a key partially contained in a MultiIndex, indicate + which levels are used. Levels can be referred by label or position. + drop_level : bool, default True + If False, returns object with same levels as self. + + Returns + ------- + Series or DataFrame + Cross-section from the original Series or DataFrame + corresponding to the selected index levels. + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + DataFrame.iloc : Purely integer-location based indexing + for selection by position. + + Notes + ----- + `xs` can not be used to set values. + + MultiIndex Slicers is a generic way to get/set values on + any level or levels. + It is a superset of `xs` functionality, see + :ref:`MultiIndex Slicers `. + + Examples + -------- + >>> d = {'num_legs': [4, 4, 2, 2], + ... 'num_wings': [0, 0, 2, 2], + ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], + ... 'animal': ['cat', 'dog', 'bat', 'penguin'], + ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> df = pd.DataFrame(data=d) + >>> df = df.set_index(['class', 'animal', 'locomotion']) + >>> df + num_legs num_wings + class animal locomotion + mammal cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + bird penguin walks 2 2 + + Get values at specified index + + >>> df.xs('mammal') + num_legs num_wings + animal locomotion + cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + + Get values at several indexes + + >>> df.xs(('mammal', 'dog')) + num_legs num_wings + locomotion + walks 4 0 + + Get values at specified index and level + + >>> df.xs('cat', level=1) + num_legs num_wings + class locomotion + mammal walks 4 0 + + Get values at several indexes and levels + + >>> df.xs(('bird', 'walks'), + ... 
level=[0, 'locomotion']) + num_legs num_wings + animal + penguin 2 2 + + Get values at specified column and axis + + >>> df.xs('num_wings', axis=1) + class animal locomotion + mammal cat walks 0 + dog walks 0 + bat flies 2 + bird penguin walks 2 + Name: num_wings, dtype: int64 + """ + axis = self._get_axis_number(axis) + labels = self._get_axis(axis) + if level is not None: + loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) + + # create the tuple of the indexer + _indexer = [slice(None)] * self.ndim + _indexer[axis] = loc + indexer = tuple(_indexer) + + result = self.iloc[indexer] + setattr(result, result._get_axis_name(axis), new_ax) + return result + + if axis == 1: + return self[key] + + self._consolidate_inplace() + + index = self.index + if isinstance(index, MultiIndex): + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + else: + loc = self.index.get_loc(key) + + if isinstance(loc, np.ndarray): + if loc.dtype == np.bool_: + (inds,) = loc.nonzero() + return self._take_with_is_copy(inds, axis=axis) + else: + return self._take_with_is_copy(loc, axis=axis) + + if not is_scalar(loc): + new_index = self.index[loc] + + if is_scalar(loc): + new_values = self._data.fast_xs(loc) + + # may need to box a datelike-scalar + # + # if we encounter an array-like and we only have 1 dim + # that means that their are list/ndarrays inside the Series! + # so just return them (GH 6394) + if not is_list_like(new_values) or self.ndim == 1: + return com.maybe_box_datetimelike(new_values) + + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[loc], + dtype=new_values.dtype, + ) + + else: + result = self.iloc[loc] + result.index = new_index + + # this could be a view + # but only in a single-dtyped view sliceable case + result._set_is_copy(self, copy=not result._is_view) + return result + + _xs: Callable = xs + + def __getitem__(self, item): + raise AbstractMethodError(self) + + def _get_item_cache(self, item): + """Return the cached item, item represents a label indexer.""" + cache = self._item_cache + res = cache.get(item) + if res is None: + values = self._data.get(item) + res = self._box_item_values(item, values) + cache[item] = res + res._set_as_cached(item, self) + + # for a chain + res._is_copy = self._is_copy + return res + + def _iget_item_cache(self, item): + """Return the cached item, item represents a positional indexer.""" + ax = self._info_axis + if ax.is_unique: + lower = self._get_item_cache(ax[item]) + else: + lower = self._take_with_is_copy(item, axis=self._info_axis_number) + return lower + + def _box_item_values(self, key, values): + raise AbstractMethodError(self) + + def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries: + """ + Construct a slice of this container. + + kind parameter is maintained for compatibility with Series slicing. 
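As the xs notes above point out, xs only retrieves a cross-section; writing values at a MultiIndex level goes through label-based indexing with a slicer instead. A small sketch under that assumption, with made-up data:

import pandas as pd

idx = pd.MultiIndex.from_product(
    [["2018-01-01", "2018-01-02"], ["falcon", "parrot"]], names=["date", "animal"]
)
df = pd.DataFrame({"speed": [350, 18, 361, 15]}, index=idx)

# Read-only cross-section: every 'falcon' row, with the selected level dropped.
print(df.xs("falcon", level="animal"))

# Setting values at that level uses .loc with a MultiIndex slicer instead of xs.
df.loc[pd.IndexSlice[:, "falcon"], "speed"] = 400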
+ """ + axis = self._get_block_manager_axis(axis) + result = self._constructor(self._data.get_slice(slobj, axis=axis)) + result = result.__finalize__(self) + + # this could be a view + # but only in a single-dtyped view sliceable case + is_copy = axis != 0 or result._is_view + result._set_is_copy(self, copy=is_copy) + return result + + def _set_item(self, key, value) -> None: + self._data.set(key, value) + self._clear_item_cache() + + def _set_is_copy(self, ref=None, copy: bool_t = True) -> None: + if not copy: + self._is_copy = None + else: + if ref is not None: + self._is_copy = weakref.ref(ref) + else: + self._is_copy = None + + def _check_is_chained_assignment_possible(self) -> bool_t: + """ + Check if we are a view, have a cacher, and are of mixed type. + If so, then force a setitem_copy check. + + Should be called just near setting a value + + Will return a boolean if it we are a view and are cached, but a + single-dtype meaning that the cacher should be updated following + setting. + """ + if self._is_view and self._is_cached: + ref = self._get_cacher() + if ref is not None and ref._is_mixed_type: + self._check_setitem_copy(stacklevel=4, t="referant", force=True) + return True + elif self._is_copy: + self._check_setitem_copy(stacklevel=4, t="referant") + return False + + def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): + """ + + Parameters + ---------- + stacklevel : int, default 4 + the level to show of the stack when the error is output + t : str, the type of setting error + force : bool, default False + If True, then force showing an error. + + validate if we are doing a setitem on a chained copy. + + If you call this function, be sure to set the stacklevel such that the + user will see the error *at the level of setting* + + It is technically possible to figure out that we are setting on + a copy even WITH a multi-dtyped pandas object. In other words, some + blocks may be views while other are not. Currently _is_view will ALWAYS + return False for multi-blocks to avoid having to handle this case. + + df = DataFrame(np.arange(0,9), columns=['count']) + df['group'] = 'b' + + # This technically need not raise SettingWithCopy if both are view + # (which is not # generally guaranteed but is usually True. However, + # this is in general not a good practice and we recommend using .loc. 
+ df.iloc[0:5]['group'] = 'a' + + """ + + # return early if the check is not needed + if not (force or self._is_copy): + return + + value = config.get_option("mode.chained_assignment") + if value is None: + return + + # see if the copy is not actually referred; if so, then dissolve + # the copy weakref + if self._is_copy is not None and not isinstance(self._is_copy, str): + r = self._is_copy() + if not gc.get_referents(r) or r.shape == self.shape: + self._is_copy = None + return + + # a custom message + if isinstance(self._is_copy, str): + t = self._is_copy + + elif t == "referant": + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame\n\n" + "See the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + else: + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value " + "instead\n\nSee the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + if value == "raise": + raise com.SettingWithCopyError(t) + elif value == "warn": + warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) + + def __delitem__(self, key) -> None: + """ + Delete item + """ + deleted = False + + maybe_shortcut = False + if self.ndim == 2 and isinstance(self.columns, MultiIndex): + try: + maybe_shortcut = key not in self.columns._engine + except TypeError: + pass + + if maybe_shortcut: + # Allow shorthand to delete all columns whose first len(key) + # elements match key: + if not isinstance(key, tuple): + key = (key,) + for col in self.columns: + if isinstance(col, tuple) and col[: len(key)] == key: + del self[col] + deleted = True + if not deleted: + # If the above loop ran and didn't delete anything because + # there was no match, this call should raise the appropriate + # exception: + self._data.delete(key) + + # delete from the caches + try: + del self._item_cache[key] + except KeyError: + pass + + # ---------------------------------------------------------------------- + # Unsorted + + def get(self, key, default=None): + """ + Get item from object for given key (ex: DataFrame column). + + Returns default value if not found. + + Parameters + ---------- + key : object + + Returns + ------- + value : same type as items contained in object + """ + try: + return self[key] + except (KeyError, ValueError, IndexError): + return default + + @property + def _is_view(self): + """Return boolean indicating if self is view of another array """ + return self._data.is_view + + def reindex_like( + self: FrameOrSeries, + other, + method: Optional[str] = None, + copy: bool_t = True, + limit=None, + tolerance=None, + ) -> FrameOrSeries: + """ + Return an object with matching indices as other object. + + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. + + Parameters + ---------- + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. 
+ + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + limit : int, default None + Maximum number of consecutive labels to fill for inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + .. versionadded:: 0.21.0 (list-like tolerance) + + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + + Notes + ----- + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. + + Examples + -------- + >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', + ... 'windspeed'], + ... index=pd.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1 + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + >>> df2 = pd.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2 + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1) + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium + """ + d = other._construct_axes_dict( + axes=self._AXIS_ORDERS, + method=method, + copy=copy, + limit=limit, + tolerance=tolerance, + ) + + return self.reindex(**d) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace: bool_t = False, + errors: str = "raise", + ): + + inplace = validate_bool_kwarg(inplace, "inplace") + + if labels is not None: + if index is not None or columns is not None: + raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") + axis_name = self._get_axis_name(axis) + axes = {axis_name: labels} + elif index is not None or columns is not None: + axes, _ = self._construct_axes_from_arguments((index, columns), {}) + else: + raise ValueError( + "Need to specify at least one of 'labels', 'index' or 'columns'" + ) + + obj = self + + for axis, labels in axes.items(): + if labels is not None: + obj = obj._drop_axis(labels, axis, level=level, errors=errors) + + if inplace: + self._update_inplace(obj) + else: + return obj + + def _drop_axis( + self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" + ) -> FrameOrSeries: + """ + Drop labels from specified axis. 
Used in the ``drop`` method + internally. + + Parameters + ---------- + labels : single label or list-like + axis : int or axis name + level : int or level name, default None + For MultiIndex + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + + """ + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + axis = self._get_axis(axis) + + if axis.is_unique: + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError("axis must be a MultiIndex") + new_axis = axis.drop(labels, level=level, errors=errors) + else: + new_axis = axis.drop(labels, errors=errors) + result = self.reindex(**{axis_name: new_axis}) + + # Case for non-unique axis + else: + labels = ensure_object(com.index_labels_to_array(labels)) + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError("axis must be a MultiIndex") + indexer = ~axis.get_level_values(level).isin(labels) + + # GH 18561 MultiIndex.drop should raise if label is absent + if errors == "raise" and indexer.all(): + raise KeyError(f"{labels} not found in axis") + else: + indexer = ~axis.isin(labels) + # Check if label doesn't exist along axis + labels_missing = (axis.get_indexer_for(labels) == -1).any() + if errors == "raise" and labels_missing: + raise KeyError(f"{labels} not found in axis") + + slicer = [slice(None)] * self.ndim + slicer[self._get_axis_number(axis_name)] = indexer + + result = self.loc[tuple(slicer)] + + return result + + def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: + """ + Replace self internals with result. + + Parameters + ---------- + verify_is_copy : bool, default True + Provide is_copy checks. + """ + # NOTE: This does *not* call __finalize__ and that's an explicit + # decision that we may revisit in the future. + + self._reset_cache() + self._clear_item_cache() + self._data = getattr(result, "_data", result) + self._maybe_update_cacher(verify_is_copy=verify_is_copy) + + def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: + """ + Prefix labels with string `prefix`. + + For Series, the row labels are prefixed. + For DataFrame, the column labels are prefixed. + + Parameters + ---------- + prefix : str + The string to add before each label. + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_suffix: Suffix row labels with string `suffix`. + DataFrame.add_suffix: Suffix column labels with string `suffix`. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_prefix('item_') + item_0 1 + item_1 2 + item_2 3 + item_3 4 + dtype: int64 + + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_prefix('col_') + col_A col_B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + f = functools.partial("{prefix}{}".format, prefix=prefix) + + mapper = {self._info_axis_name: f} + return self.rename(**mapper) # type: ignore + + def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: + """ + Suffix labels with string `suffix`. + + For Series, the row labels are suffixed. + For DataFrame, the column labels are suffixed. + + Parameters + ---------- + suffix : str + The string to add after each label. + + Returns + ------- + Series or DataFrame + New Series or DataFrame with updated labels. + + See Also + -------- + Series.add_prefix: Prefix row labels with string `prefix`. 
+ DataFrame.add_prefix: Prefix column labels with string `prefix`. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.add_suffix('_item') + 0_item 1 + 1_item 2 + 2_item 3 + 3_item 4 + dtype: int64 + + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df + A B + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + + >>> df.add_suffix('_col') + A_col B_col + 0 1 3 + 1 2 4 + 2 3 5 + 3 4 6 + """ + f = functools.partial("{}{suffix}".format, suffix=suffix) + + mapper = {self._info_axis_name: f} + return self.rename(**mapper) # type: ignore + + def sort_values( + self, + by=None, + axis=0, + ascending=True, + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool_t = False, + ): + """ + Sort by the values along either axis. + + Parameters + ----------%(optional_by)s + axis : %(axes_single_arg)s, default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted values if inplace=False, None otherwise. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], + ... 'col2': [2, 1, 9, 8, 7, 4], + ... 'col3': [0, 1, 9, 4, 2, 3], + ... }) + >>> df + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + + Sort by col1 + + >>> df.sort_values(by=['col1']) + col1 col2 col3 + 0 A 2 0 + 1 A 1 1 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort by multiple columns + + >>> df.sort_values(by=['col1', 'col2']) + col1 col2 col3 + 1 A 1 1 + 0 A 2 0 + 2 B 9 9 + 5 C 4 3 + 4 D 7 2 + 3 NaN 8 4 + + Sort Descending + + >>> df.sort_values(by='col1', ascending=False) + col1 col2 col3 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + 3 NaN 8 4 + + Putting NAs first + + >>> df.sort_values(by='col1', ascending=False, na_position='first') + col1 col2 col3 + 3 NaN 8 4 + 4 D 7 2 + 5 C 4 3 + 2 B 9 9 + 0 A 2 0 + 1 A 1 1 + """ + raise AbstractMethodError(self) + + def sort_index( + self, + axis=0, + level=None, + ascending: bool_t = True, + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool_t = True, + ignore_index: bool_t = False, + ): + """ + Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. 
+ kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted index if inplace=False, None otherwise. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + labels = self._get_axis(axis) + + if level is not None: + raise NotImplementedError("level is not implemented") + if inplace: + raise NotImplementedError("inplace is not implemented") + + sort_index = labels.argsort() + if not ascending: + sort_index = sort_index[::-1] + + new_axis = labels.take(sort_index) + return self.reindex(**{axis_name: new_axis}) + + def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: + """ + Conform %(klass)s to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Parameters + ---------- + %(optional_labels)s + %(axes)s : array-like, optional + New labels / index to conform to, should be specified using + keywords. Preferably an Index object to avoid duplicating data. + %(optional_axis)s + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + .. versionadded:: 0.21.0 (list-like tolerance) + + Returns + ------- + %(klass)s with changed index. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. 
+ DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + + ``DataFrame.reindex`` supports two calling conventions + + * ``(index=index_labels, columns=column_labels, ...)`` + * ``(labels, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) + >>> df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default + values in the new index that do not have corresponding + records in the dataframe are assigned ``NaN``. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + >>> df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to + the keyword ``fill_value``. Because the index is not monotonically + increasing or decreasing, we cannot use arguments to the keyword + ``method`` to fill the ``NaN`` values. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing') + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use "axis-style" keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in + ``reindex``, we will create a dataframe with a + monotonically increasing index (for example, a sequence + of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + >>> df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider + date range. + + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame + (for example, '2009-12-29') are by default filled with ``NaN``. + If desired, we can fill in the missing values using one of several + options. + + For example, to back-propagate the last valid value to fill the ``NaN`` + values, pass ``bfill`` as an argument to the ``method`` keyword. 
+ + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the ``NaN`` value present in the original dataframe + (at index value 2010-01-03) will not be filled by any of the + value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and + desired indexes. If you do want to fill in the ``NaN`` values present + in the original dataframe, use the ``fillna()`` method. + + See the :ref:`user guide ` for more. + """ + # TODO: Decide if we care about having different examples for different + # kinds + + # construct the args + axes, kwargs = self._construct_axes_from_arguments(args, kwargs) + method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) + + # Series.reindex doesn't use / need the axis kwarg + # We pop and ignore it here, to make writing Series/Frame generic code + # easier + kwargs.pop("axis", None) + + if kwargs: + raise TypeError( + "reindex() got an unexpected keyword " + f'argument "{list(kwargs.keys())[0]}"' + ) + + self._consolidate_inplace() + + # if all axes that are requested to reindex are equal, then only copy + # if indicated must have index names equal here as well as values + if all( + self._get_axis(axis).identical(ax) + for axis, ax in axes.items() + if ax is not None + ): + if copy: + return self.copy() + return self + + # check if we are a multi reindex + if self._needs_reindex_multi(axes, method, level): + return self._reindex_multi(axes, copy, fill_value) + + # perform the reindex on the axes + return self._reindex_axes( + axes, level, limit, tolerance, method, fill_value, copy + ).__finalize__(self) + + def _reindex_axes( + self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy + ) -> FrameOrSeries: + """Perform the reindex for all the axes.""" + obj = self + for a in self._AXIS_ORDERS: + labels = axes[a] + if labels is None: + continue + + ax = self._get_axis(a) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, tolerance=tolerance, method=method + ) + + axis = self._get_axis_number(a) + obj = obj._reindex_with_indexers( + {axis: [new_index, indexer]}, + fill_value=fill_value, + copy=copy, + allow_dups=False, + ) + + return obj + + def _needs_reindex_multi(self, axes, method, level) -> bool_t: + """Check if we do need a multi reindex.""" + return ( + (com.count_not_none(*axes.values()) == self._AXIS_LEN) + and method is None + and level is None + and not self._is_mixed_type + ) + + def _reindex_multi(self, axes, copy, fill_value): + raise AbstractMethodError(self) + + def _reindex_with_indexers( + self: FrameOrSeries, + reindexers, + fill_value=None, + copy: bool_t = False, + allow_dups: bool_t = False, + ) -> FrameOrSeries: + """allow_dups indicates an internal call here """ + + # reindex doing multiple operations on different axes if indicated + new_data = self._data + for axis in sorted(reindexers.keys()): + index, indexer = reindexers[axis] + baxis = self._get_block_manager_axis(axis) + + if index is None: + continue + + index = ensure_index(index) + if indexer is not None: + indexer = ensure_int64(indexer) + + # TODO: speed up on 
homogeneous DataFrame objects + new_data = new_data.reindex_indexer( + index, + indexer, + axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy, + ) + + if copy and new_data is self._data: + new_data = new_data.copy() + + return self._constructor(new_data).__finalize__(self) + + def filter( + self: FrameOrSeries, + items=None, + like: Optional[str] = None, + regex: Optional[str] = None, + axis=None, + ) -> FrameOrSeries: + """ + Subset the dataframe rows or columns according to the specified index labels. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. + + Parameters + ---------- + items : list-like + Keep labels from axis which are in items. + like : str + Keep labels from axis for which "like in label == True". + regex : str (regular expression) + Keep labels from axis for which re.search(regex, label) == True. + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + The axis to filter on, expressed either as an index (int) + or axis name (str). By default this is the info axis, + 'index' for Series, 'columns' for DataFrame. + + Returns + ------- + same type as input object + + See Also + -------- + DataFrame.loc + + Notes + ----- + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. + + Examples + -------- + >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), + ... index=['mouse', 'rabbit'], + ... columns=['one', 'two', 'three']) + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + """ + nkw = com.count_not_none(items, like, regex) + if nkw > 1: + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` " + "are mutually exclusive" + ) + + if axis is None: + axis = self._info_axis_name + labels = self._get_axis(axis) + + if items is not None: + name = self._get_axis_name(axis) + return self.reindex(**{name: [r for r in items if r in labels]}) + elif like: + + def f(x): + return like in ensure_str(x) + + values = labels.map(f) + return self.loc(axis=axis)[values] + elif regex: + + def f(x): + return matcher.search(ensure_str(x)) is not None + + matcher = re.compile(regex) + values = labels.map(f) + return self.loc(axis=axis)[values] + else: + raise TypeError("Must pass either `items`, `like`, or `regex`") + + def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: + """ + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + same type as caller + The first `n` rows of the caller object. + + See Also + -------- + DataFrame.tail: Returns the last `n` rows. + + Examples + -------- + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 
'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the first 5 lines + + >>> df.head() + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + + Viewing the first `n` lines (three in this case) + + >>> df.head(3) + animal + 0 alligator + 1 bee + 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + """ + + return self.iloc[:n] + + def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: + """ + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. + + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + type of caller + The last `n` rows of the caller object. + + See Also + -------- + DataFrame.head : The first `n` rows of the caller object. + + Examples + -------- + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', + ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) + >>> df + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last 5 lines + + >>> df.tail() + animal + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + + Viewing the last `n` lines (three in this case) + + >>> df.tail(3) + animal + 6 shark + 7 whale + 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + """ + + if n == 0: + return self.iloc[0:0] + return self.iloc[-n:] + + def sample( + self: FrameOrSeries, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ) -> FrameOrSeries: + """ + Return a random sample of items from an axis of object. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. Index + values in weights not found in sampled object will be ignored and + index values in sampled object not in weights will be assigned + weights of zero. + If called on a DataFrame, will accept the name of a column + when axis = 0. + Unless weights are a Series, weights must be same length as axis + being sampled. + If weights do not sum to 1, they will be normalized to sum to 1. + Missing values in the weights column will be treated as zero. + Infinite values not allowed. + random_state : int or numpy.random.RandomState, optional + Seed for the random number generator (if int), or numpy RandomState + object. + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames). + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing `n` items randomly + sampled from the caller object. 
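The weights handling described above (rescaling so the weights sum to 1, with missing weights treated as zero) is easiest to see in a short self-contained sketch; the numbers are illustrative:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": range(4)})

# Weights need not sum to 1: they are normalized, and the NaN weight becomes 0,
# so the first row can never be drawn.
w = pd.Series([np.nan, 1, 1, 2], index=df.index)
print(df.sample(n=2, weights=w, random_state=0))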
+ + See Also + -------- + numpy.random.choice: Generates a random sample from a given 1-D numpy + array. + + Notes + ----- + If `frac` > 1, `replacement` should be set to `True`. + + Examples + -------- + >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + Extract 3 random elements from the ``Series`` ``df['num_legs']``: + Note that we use `random_state` to ensure the reproducibility of + the examples. + + >>> df['num_legs'].sample(n=3, random_state=1) + fish 0 + spider 8 + falcon 2 + Name: num_legs, dtype: int64 + + A random 50% sample of the ``DataFrame`` with replacement: + + >>> df.sample(frac=0.5, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + An upsample sample of the ``DataFrame`` with replacement: + Note that `replace` parameter has to be `True` for `frac` parameter > 1. + + >>> df.sample(frac=2, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + + Using a DataFrame column as weights. Rows with larger value in the + `num_specimen_seen` column are more likely to be sampled. + + >>> df.sample(n=2, weights='num_specimen_seen', random_state=1) + num_legs num_wings num_specimen_seen + falcon 2 2 10 + fish 0 0 8 + """ + + if axis is None: + axis = self._stat_axis_number + + axis = self._get_axis_number(axis) + axis_length = self.shape[axis] + + # Process random_state argument + rs = com.random_state(random_state) + + # Check weights for compliance + if weights is not None: + + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(self.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, str): + if isinstance(self, ABCDataFrame): + if axis == 0: + try: + weights = self[weights] + except KeyError: + raise KeyError( + "String passed to weights not a valid column" + ) + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + else: + raise ValueError( + "Strings cannot be passed as weights " + "when sampling from a Series." + ) + + weights = pd.Series(weights, dtype="float64") + + if len(weights) != axis_length: + raise ValueError( + "Weights and axis to be sampled must be of same length" + ) + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + + # If has nan, set to zero. + weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + weights = weights.values + + # If no frac or n, default to n=1. + if n is None and frac is None: + n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." 
+ ) + elif n is not None and frac is None and n % 1 != 0: + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + raise ValueError("Please enter a value for `frac` OR `n`, not both") + + # Check for negative sizes + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide positive value." + ) + + locs = rs.choice(axis_length, size=n, replace=replace, p=weights) + return self.take(locs, axis=axis) + + _shared_docs[ + "pipe" + ] = r""" + Apply func(self, \*args, \*\*kwargs). + + Parameters + ---------- + func : function + Function to apply to the %(klass)s. + ``args``, and ``kwargs`` are passed into ``func``. + Alternatively a ``(callable, data_keyword)`` tuple where + ``data_keyword`` is a string indicating the keyword of + ``callable`` that expects the %(klass)s. + args : iterable, optional + Positional arguments passed into ``func``. + kwargs : mapping, optional + A dictionary of keyword arguments passed into ``func``. + + Returns + ------- + object : the return type of ``func``. + + See Also + -------- + DataFrame.apply + DataFrame.applymap + Series.map + + Notes + ----- + + Use ``.pipe`` when chaining together functions that expect + Series, DataFrames or GroupBy objects. Instead of writing + + >>> f(g(h(df), arg1=a), arg2=b, arg3=c) + + You can write + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe(f, arg2=b, arg3=c) + ... ) + + If you have a function that takes the data as (say) the second + argument, pass a tuple indicating which keyword expects the + data. For example, suppose ``f`` takes its data as ``arg2``: + + >>> (df.pipe(h) + ... .pipe(g, arg1=a) + ... .pipe((f, 'arg2'), arg1=a, arg3=c) + ... ) + """ + + @Appender(_shared_docs["pipe"] % _shared_doc_kwargs) + def pipe(self, func, *args, **kwargs): + return com.pipe(self, func, *args, **kwargs) + + _shared_docs["aggregate"] = dedent( + """ + Aggregate using one or more operations over the specified axis. + %(versionadded)s + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a %(klass)s or when passed to %(klass)s.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + %(axis)s + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. + %(see_also)s + Notes + ----- + `agg` is an alias for `aggregate`. Use the alias. + + A passed user-defined-function will be passed a Series for evaluation. + %(examples)s""" + ) + + _shared_docs[ + "transform" + ] = """ + Call ``func`` on self producing a %(klass)s with transformed values. + + Produced %(klass)s will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a %(klass)s or when passed to %(klass)s.apply. 
+ + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + %(axis)s + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + %(klass)s + A %(klass)s that must have the same length as self. + + Raises + ------ + ValueError : If the returned %(klass)s has a different length than self. + + See Also + -------- + %(klass)s.agg : Only perform aggregating type operations. + %(klass)s.apply : Invoke function on a %(klass)s. + + Examples + -------- + >>> df = pd.DataFrame({'A': range(3), 'B': range(1, 4)}) + >>> df + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1) + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting %(klass)s must have the same length as the + input %(klass)s, it is possible to provide several input functions: + + >>> s = pd.Series(range(3)) + >>> s + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([np.sqrt, np.exp]) + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + + # ---------------------------------------------------------------------- + # Attribute access + + def __finalize__( + self: FrameOrSeries, other, method=None, **kwargs + ) -> FrameOrSeries: + """ + Propagate metadata from other to self. + + Parameters + ---------- + other : the object from which to get the attributes that we are going + to propagate + method : optional, a passed method name ; possibly to take different + types of propagation actions based on this + + """ + if isinstance(other, NDFrame): + for name in other.attrs: + self.attrs[name] = other.attrs[name] + # For subclasses using _metadata. + for name in self._metadata: + object.__setattr__(self, name, getattr(other, name, None)) + return self + + def __getattr__(self, name: str): + """After regular attribute access, try looking up the name + This allows simpler access to columns for interactive use. + """ + + # Note: obj.x will always call obj.__getattribute__('x') prior to + # calling obj.__getattr__('x'). + + if ( + name in self._internal_names_set + or name in self._metadata + or name in self._accessors + ): + return object.__getattribute__(self, name) + else: + if self._info_axis._can_hold_identifiers_and_holds_name(name): + return self[name] + return object.__getattribute__(self, name) + + def __setattr__(self, name: str, value) -> None: + """After regular attribute access, try setting the name + This allows simpler access to columns for interactive use. + """ + + # first try regular attribute access via __getattribute__, so that + # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify + # the same attribute. + + try: + object.__getattribute__(self, name) + return object.__setattr__(self, name, value) + except AttributeError: + pass + + # if this fails, go on to more involved attribute setting + # (note that this matches __getattr__, above). 
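+        # The chain below tries, in order: names registered as internal or
+        # listed in ``_metadata`` become plain instance attributes; for any
+        # other name, an existing axis (``Index``) attribute is overwritten
+        # in place, an existing label on the info axis is routed through
+        # ``self[name] = value`` (a column update on a DataFrame), and a
+        # genuinely new name becomes a plain attribute, with a warning when
+        # it looks like an attempt to create a column via attribute access.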
+ if name in self._internal_names_set: + object.__setattr__(self, name, value) + elif name in self._metadata: + object.__setattr__(self, name, value) + else: + try: + existing = getattr(self, name) + if isinstance(existing, Index): + object.__setattr__(self, name, value) + elif name in self._info_axis: + self[name] = value + else: + object.__setattr__(self, name, value) + except (AttributeError, TypeError): + if isinstance(self, ABCDataFrame) and (is_list_like(value)): + warnings.warn( + "Pandas doesn't allow columns to be " + "created via a new attribute name - see " + "https://pandas.pydata.org/pandas-docs/" + "stable/indexing.html#attribute-access", + stacklevel=2, + ) + object.__setattr__(self, name, value) + + def _dir_additions(self): + """ add the string-like attributes from the info_axis. + If info_axis is a MultiIndex, it's first level values are used. + """ + additions = { + c + for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } + return super()._dir_additions().union(additions) + + # ---------------------------------------------------------------------- + # Consolidation of internals + + def _protect_consolidate(self, f): + """Consolidate _data -- if the blocks have changed, then clear the + cache + """ + blocks_before = len(self._data.blocks) + result = f() + if len(self._data.blocks) != blocks_before: + self._clear_item_cache() + return result + + def _consolidate_inplace(self) -> None: + """Consolidate data in place and return None""" + + def f(): + self._data = self._data.consolidate() + + self._protect_consolidate(f) + + def _consolidate(self, inplace: bool_t = False): + """ + Compute NDFrame with "consolidated" internals (data of each dtype + grouped together in a single ndarray). + + Parameters + ---------- + inplace : bool, default False + If False return new object, otherwise modify existing object. + + Returns + ------- + consolidated : same type as caller + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + self._consolidate_inplace() + else: + f = lambda: self._data.consolidate() + cons_data = self._protect_consolidate(f) + return self._constructor(cons_data).__finalize__(self) + + @property + def _is_mixed_type(self): + f = lambda: self._data.is_mixed_type + return self._protect_consolidate(f) + + @property + def _is_numeric_mixed_type(self): + f = lambda: self._data.is_numeric_mixed_type + return self._protect_consolidate(f) + + @property + def _is_datelike_mixed_type(self): + f = lambda: self._data.is_datelike_mixed_type + return self._protect_consolidate(f) + + def _check_inplace_setting(self, value) -> bool_t: + """ check whether we allow in-place setting with this type of value """ + + if self._is_mixed_type: + if not self._is_numeric_mixed_type: + + # allow an actual np.nan thru + if is_float(value) and np.isnan(value): + return True + + raise TypeError( + "Cannot do inplace boolean setting on " + "mixed-types with a non np.nan value" + ) + + return True + + def _get_numeric_data(self): + return self._constructor(self._data.get_numeric_data()).__finalize__(self) + + def _get_bool_data(self): + return self._constructor(self._data.get_bool_data()).__finalize__(self) + + # ---------------------------------------------------------------------- + # Internal Interface Methods + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. 
+ + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]], dtype=int64) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + self._consolidate_inplace() + return self._data.as_array(transpose=self._AXIS_REVERSED) + + @property + def _values(self) -> np.ndarray: + """internal implementation""" + return self.values + + @property + def _get_values(self) -> np.ndarray: + # compat + return self.values + + def _internal_get_values(self) -> np.ndarray: + """ + Return an ndarray after converting sparse values to dense. + + This is the same as ``.values`` for non-sparse data. For sparse + data contained in a `SparseArray`, the data are first + converted to a dense representation. + + Returns + ------- + numpy.ndarray + Numpy representation of DataFrame. + + See Also + -------- + values : Numpy representation of DataFrame. + SparseArray : Container for sparse data. + """ + return self.values + + @property + def dtypes(self): + """ + Return the dtypes in the DataFrame. + + This returns a Series with the data type of each column. + The result's index is the original DataFrame's columns. Columns + with mixed types are stored with the ``object`` dtype. See + :ref:`the User Guide ` for more. + + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> df = pd.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [pd.Timestamp('20180310')], + ... 'string': ['foo']}) + >>> df.dtypes + float float64 + int int64 + datetime datetime64[ns] + string object + dtype: object + """ + from pandas import Series + + return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) + + def _to_dict_of_blocks(self, copy: bool_t = True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. 
+ + Internal ONLY + """ + return { + k: self._constructor(v).__finalize__(self) + for k, v, in self._data.to_dict(copy=copy).items() + } + + def astype( + self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" + ) -> FrameOrSeries: + """ + Cast a pandas object to a specified dtype ``dtype``. + + Parameters + ---------- + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire pandas object to + the same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. + copy : bool, default True + Return a copy when ``copy=True`` (be very careful setting + ``copy=False`` as changes to values then may propagate to other + pandas objects). + errors : {'raise', 'ignore'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object. + + Returns + ------- + casted : same type as caller + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + numpy.ndarray.astype : Cast a numpy array to a specified type. + + Examples + -------- + Create a DataFrame: + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + Cast all columns to int32: + + >>> df.astype('int32').dtypes + col1 int32 + col2 int32 + dtype: object + + Cast col1 to int32 using a dictionary: + + >>> df.astype({'col1': 'int32'}).dtypes + col1 int32 + col2 int64 + dtype: object + + Create a series: + + >>> ser = pd.Series([1, 2], dtype='int32') + >>> ser + 0 1 + 1 2 + dtype: int32 + >>> ser.astype('int64') + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> ser.astype('category') + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = pd.api.types.CategoricalDtype( + ... categories=[2, 1], ordered=True) + >>> ser.astype(cat_dtype) + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using ``copy=False`` and changing data on a new + pandas object may propagate changes: + + >>> s1 = pd.Series([1, 2]) + >>> s2 = s1.astype('int64', copy=False) + >>> s2[0] = 10 + >>> s1 # note that s1[0] has changed too + 0 10 + 1 2 + dtype: int64 + """ + if is_dict_like(dtype): + if self.ndim == 1: # i.e. Series + if len(dtype) > 1 or self.name not in dtype: + raise KeyError( + "Only the Series name can be used for " + "the key in Series dtype mappings." + ) + new_type = dtype[self.name] + return self.astype(new_type, copy, errors) + + for col_name in dtype.keys(): + if col_name not in self: + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." 
+ ) + results = [] + for col_name, col in self.items(): + if col_name in dtype: + results.append( + col.astype(dtype=dtype[col_name], copy=copy, errors=errors) + ) + else: + results.append(col.copy() if copy else col) + + elif is_extension_array_dtype(dtype) and self.ndim > 1: + # GH 18099/22869: columnwise conversion to extension dtype + # GH 24704: use iloc to handle duplicate column names + results = [ + self.iloc[:, i].astype(dtype, copy=copy) + for i in range(len(self.columns)) + ] + + else: + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors) + return self._constructor(new_data).__finalize__(self) + + # GH 19920: retain column metadata after concat + result = pd.concat(results, axis=1, copy=False) + result.columns = self.columns + return result + + def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: + """ + Make a copy of this object's indices and data. + + When ``deep=True`` (default), a new object will be created with a + copy of the calling object's data and indices. Modifications to + the data or indices of the copy will not be reflected in the + original object (see notes below). + + When ``deep=False``, a new object will be created without copying + the calling object's data or index (only references to the data + and index are copied). Any changes to the data of the original + will be reflected in the shallow copy (and vice versa). + + Parameters + ---------- + deep : bool, default True + Make a deep copy, including a copy of the data and the indices. + With ``deep=False`` neither the indices nor the data are copied. + + Returns + ------- + copy : Series or DataFrame + Object type matches caller. + + Notes + ----- + When ``deep=True``, data is copied but actual Python objects + will not be copied recursively, only the reference to the object. + This is in contrast to `copy.deepcopy` in the Standard Library, + which recursively copies object data (see examples below). + + While ``Index`` objects are copied when ``deep=True``, the underlying + numpy array is not copied for performance reasons. Since ``Index`` is + immutable, the underlying data can be safely shared and a copy + is not needed. + + Examples + -------- + >>> s = pd.Series([1, 2], index=["a", "b"]) + >>> s + a 1 + b 2 + dtype: int64 + + >>> s_copy = s.copy() + >>> s_copy + a 1 + b 2 + dtype: int64 + + **Shallow copy versus default (deep) copy:** + + >>> s = pd.Series([1, 2], index=["a", "b"]) + >>> deep = s.copy() + >>> shallow = s.copy(deep=False) + + Shallow copy shares data and index with original. + + >>> s is shallow + False + >>> s.values is shallow.values and s.index is shallow.index + True + + Deep copy has own copy of data and index. + + >>> s is deep + False + >>> s.values is deep.values or s.index is deep.index + False + + Updates to the data shared by shallow copy and original is reflected + in both; deep copy remains unchanged. + + >>> s[0] = 3 + >>> shallow[1] = 4 + >>> s + a 3 + b 4 + dtype: int64 + >>> shallow + a 3 + b 4 + dtype: int64 + >>> deep + a 1 + b 2 + dtype: int64 + + Note that when copying an object containing Python objects, a deep copy + will copy the data, but will not do so recursively. Updating a nested + data object will be reflected in the deep copy. 
+ + >>> s = pd.Series([[1, 2], [3, 4]]) + >>> deep = s.copy() + >>> s[0][0] = 10 + >>> s + 0 [10, 2] + 1 [3, 4] + dtype: object + >>> deep + 0 [10, 2] + 1 [3, 4] + dtype: object + """ + data = self._data.copy(deep=deep) + return self._constructor(data).__finalize__(self) + + def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: + return self.copy(deep=deep) + + def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: + """ + Parameters + ---------- + memo, default None + Standard signature. Unused + """ + return self.copy(deep=True) + + def _convert( + self: FrameOrSeries, + datetime: bool_t = False, + numeric: bool_t = False, + timedelta: bool_t = False, + coerce: bool_t = False, + copy: bool_t = True, + ) -> FrameOrSeries: + """ + Attempt to infer better dtype for object columns + + Parameters + ---------- + datetime : bool, default False + If True, convert to date where possible. + numeric : bool, default False + If True, attempt to convert to numbers (including strings), with + unconvertible values becoming NaN. + timedelta : bool, default False + If True, convert to timedelta where possible. + coerce : bool, default False + If True, force conversion with unconvertible values converted to + nulls (NaN or NaT). + copy : bool, default True + If True, return a copy even if no copy is necessary (e.g. no + conversion was done). Note: This is meant for internal use, and + should not be confused with inplace. + + Returns + ------- + converted : same as input object + """ + validate_bool_kwarg(datetime, "datetime") + validate_bool_kwarg(numeric, "numeric") + validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") + validate_bool_kwarg(copy, "copy") + return self._constructor( + self._data.convert( + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) + ).__finalize__(self) + + def infer_objects(self: FrameOrSeries) -> FrameOrSeries: + """ + Attempt to infer better dtypes for object columns. + + Attempts soft conversion of object-dtyped + columns, leaving non-object and unconvertible + columns unchanged. The inference rules are the + same as during normal Series/DataFrame construction. + + .. versionadded:: 0.21.0 + + Returns + ------- + converted : same type as input object + + See Also + -------- + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to numeric type. + convert_dtypes : Convert argument to best possible dtype. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]}) + >>> df = df.iloc[1:] + >>> df + A + 1 1 + 2 2 + 3 3 + + >>> df.dtypes + A object + dtype: object + + >>> df.infer_objects().dtypes + A int64 + dtype: object + """ + # numeric=False necessary to only soft convert; + # python objects will still be converted to + # native numpy numeric types + return self._constructor( + self._data.convert( + datetime=True, numeric=False, timedelta=True, coerce=False, copy=True + ) + ).__finalize__(self) + + def convert_dtypes( + self: FrameOrSeries, + infer_objects: bool_t = True, + convert_string: bool_t = True, + convert_integer: bool_t = True, + convert_boolean: bool_t = True, + ) -> FrameOrSeries: + """ + Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + infer_objects : bool, default True + Whether object dtypes should be converted to the best possible types. 
+ convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + + Returns + ------- + Series or DataFrame + Copy of input object with new dtype. + + See Also + -------- + infer_objects : Infer dtypes of objects. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + to_numeric : Convert argument to a numeric type. + + Notes + ----- + + By default, ``convert_dtypes`` will attempt to convert a Series (or each + Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options + ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is + possible to turn off individual conversions to ``StringDtype``, the integer + extension types or ``BooleanDtype``, respectively. + + For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference + rules as during normal Series/DataFrame construction. Then, if possible, + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. + + If the dtype is integer, convert to an appropriate integer extension type. + + If the dtype is numeric, and consists of all integers, convert to an + appropriate integer extension type. + + In the future, as new dtypes are added that support ``pd.NA``, the results + of this method will change to support those new dtypes. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")), + ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")), + ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")), + ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + ... } + ... ) + + Start with a DataFrame with default dtypes. + + >>> df + a b c d e f + 0 1 x True h 10.0 NaN + 1 2 y False i NaN 100.5 + 2 3 z NaN NaN 20.0 200.0 + + >>> df.dtypes + a int32 + b object + c object + d object + e float64 + f float64 + dtype: object + + Convert the DataFrame to use best possible dtypes. + + >>> dfn = df.convert_dtypes() + >>> dfn + a b c d e f + 0 1 x True h 10 NaN + 1 2 y False i 100.5 + 2 3 z 20 200.0 + + >>> dfn.dtypes + a Int32 + b string + c boolean + d string + e Int64 + f float64 + dtype: object + + Start with a Series of strings and missing data represented by ``np.nan``. + + >>> s = pd.Series(["a", "b", np.nan]) + >>> s + 0 a + 1 b + 2 NaN + dtype: object + + Obtain a Series with dtype ``StringDtype``. + + >>> s.convert_dtypes() + 0 a + 1 b + 2 + dtype: string + """ + if self.ndim == 1: + return self._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) + else: + results = [ + col._convert_dtypes( + infer_objects, convert_string, convert_integer, convert_boolean + ) + for col_name, col in self.items() + ] + result = pd.concat(results, axis=1, copy=False) + return result + + # ---------------------------------------------------------------------- + # Filling NA's + + def fillna( + self: FrameOrSeries, + value=None, + method=None, + axis=None, + inplace: bool_t = False, + limit=None, + downcast=None, + ) -> Optional[FrameOrSeries]: + """ + Fill NA/NaN values using the specified method. 
+ + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use next valid observation to fill gap. + axis : %(axes_single_arg)s + Axis along which to fill missing values. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. + + See Also + -------- + interpolate : Fill NaN values using interpolation. + reindex : Conform object to new index. + asfreq : Convert TimeSeries to specified frequency. + + Examples + -------- + >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], + ... [3, 4, np.nan, 1], + ... [np.nan, np.nan, np.nan, 5], + ... [np.nan, 3, np.nan, 4]], + ... columns=list('ABCD')) + >>> df + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 NaN NaN NaN 5 + 3 NaN 3.0 NaN 4 + + Replace all NaN elements with 0s. + + >>> df.fillna(0) + A B C D + 0 0.0 2.0 0.0 0 + 1 3.0 4.0 0.0 1 + 2 0.0 0.0 0.0 5 + 3 0.0 3.0 0.0 4 + + We can also propagate non-null values forward or backward. + + >>> df.fillna(method='ffill') + A B C D + 0 NaN 2.0 NaN 0 + 1 3.0 4.0 NaN 1 + 2 3.0 4.0 NaN 5 + 3 3.0 3.0 NaN 4 + + Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, + 2, and 3 respectively. + + >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} + >>> df.fillna(value=values) + A B C D + 0 0.0 2.0 2.0 0 + 1 3.0 4.0 2.0 1 + 2 0.0 1.0 2.0 5 + 3 0.0 3.0 2.0 4 + + Only replace the first NaN element. + + >>> df.fillna(value=values, limit=1) + A B C D + 0 0.0 2.0 2.0 0 + 1 3.0 4.0 NaN 1 + 2 NaN 1.0 NaN 5 + 3 NaN 3.0 NaN 4 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + value, method = validate_fillna_kwargs(value, method) + + self._consolidate_inplace() + + # set the default here, so functions examining the signaure + # can detect if something was set (e.g. 
in groupby) (GH9221) + if axis is None: + axis = 0 + axis = self._get_axis_number(axis) + + if value is None: + + if self._is_mixed_type and axis == 1: + if inplace: + raise NotImplementedError() + result = self.T.fillna(method=method, limit=limit).T + + # need to downcast here because of all of the transposes + result._data = result._data.downcast() + + return result + + new_data = self._data.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + coerce=True, + downcast=downcast, + ) + else: + if len(self._get_axis(axis)) == 0: + return self + + if self.ndim == 1: + if isinstance(value, (dict, ABCSeries)): + value = create_series_with_explicit_dtype( + value, dtype_if_empty=object + ) + elif not is_list_like(value): + pass + else: + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + f'"{type(value).__name__}"' + ) + + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) + + elif isinstance(value, (dict, ABCSeries)): + if axis == 1: + raise NotImplementedError( + "Currently only can fill " + "with dict/Series column " + "by column" + ) + + result = self if inplace else self.copy() + for k, v in value.items(): + if k not in result: + continue + obj = result[k] + obj.fillna(v, limit=limit, inplace=True, downcast=downcast) + return result if not inplace else None + + elif not is_list_like(value): + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + new_data = self.where(self.notna(), value) + else: + raise ValueError(f"invalid fill value with a {type(value)}") + + if inplace: + self._update_inplace(new_data) + return None + else: + return self._constructor(new_data).__finalize__(self) + + def ffill( + self: FrameOrSeries, + axis=None, + inplace: bool_t = False, + limit=None, + downcast=None, + ) -> Optional[FrameOrSeries]: + """ + Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + + Returns + ------- + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. + """ + return self.fillna( + method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + def bfill( + self: FrameOrSeries, + axis=None, + inplace: bool_t = False, + limit=None, + downcast=None, + ) -> Optional[FrameOrSeries]: + """ + Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + + Returns + ------- + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. + """ + return self.fillna( + method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) + + _shared_docs[ + "replace" + ] = """ + Replace values given in `to_replace` with `value`. + + Values of the %(klass)s are replaced with other values dynamically. + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value. + + Parameters + ---------- + to_replace : str, regex, list, dict, Series, int, float, or None + How to find the values that will be replaced. + + * numeric, str or regex: + + - numeric: numeric values equal to `to_replace` will be + replaced with `value` + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. 
+ - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str, regex and numeric rules apply as above. + + * dict: + + - Dicts can be used to specify different replacement values + for different existing values. For example, + ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way the `value` + parameter should be `None`. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a' + and the value 'z' in column 'b' and replaces these values + with whatever is specified in `value`. The `value` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + ``{'a': {'b': np.nan}}``, are read as follows: look in column + 'a' for the value 'b' and replace it with NaN. The `value` + parameter should be ``None`` to use a nested dict in this + way. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + + * None: + + - This means that the `regex` argument must be a string, + compiled regular expression, or list, dict, ndarray or + Series of such elements. If `value` is also ``None`` then + this **must** be a nested dictionary or Series. + + See the examples section for examples of each of these. + value : scalar, dict, list, str, regex, default None + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + inplace : bool, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. + limit : int, default None + Maximum size gap to forward or backward fill. + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + `to_replace` must be ``None``. + method : {'pad', 'ffill', 'bfill', `None`} + The method to use when for replacement, when `to_replace` is a + scalar, list or tuple and `value` is ``None``. + + .. versionchanged:: 0.23.0 + Added to DataFrame. + + Returns + ------- + %(klass)s + Object after replacement. + + Raises + ------ + AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not + ``None``. + TypeError + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. 
+ * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to `to_replace` does not match the type of the + value being replaced + ValueError + * If a ``list`` or an ``ndarray`` is passed to `to_replace` and + `value` but they are not the same length. + + See Also + -------- + %(klass)s.fillna : Fill NA values. + %(klass)s.where : Replace values based on boolean condition. + Series.str.replace : Simple string replacement. + + Notes + ----- + * Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. + * Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. + * This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. + * When dict is used as the `to_replace` value, it is like + key(s) in the dict are the to_replace part and + value(s) in the dict are the value parameter. + + Examples + -------- + + **Scalar `to_replace` and `value`** + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.replace(0, 5) + 0 5 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + + >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 'C': ['a', 'b', 'c', 'd', 'e']}) + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + **List-like `to_replace`** + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + + >>> s.replace([1, 2], method='bfill') + 0 0 + 1 3 + 2 3 + 3 3 + 4 4 + dtype: int64 + + **dict-like `to_replace`** + + >>> df.replace({0: 10, 1: 100}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace({'A': 0, 'B': 5}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace({'A': {0: 100, 4: 400}}) + A B C + 0 100 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 400 9 e + + **Regular expression `to_replace`** + + >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}) + >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) + A B + 0 new abc + 1 foo new + 2 bait xyz + + >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True) + A B + 0 new abc + 1 foo bar + 2 bait xyz + + >>> df.replace(regex=r'^ba.$', value='new') + A B + 0 new abc + 1 foo new + 2 bait xyz + + >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}) + A B + 0 new abc + 1 xyz new + 2 bait xyz + + >>> df.replace(regex=[r'^ba.$', 'foo'], value='new') + A B + 0 new abc + 1 new new + 2 bait xyz + + Note that when replacing multiple ``bool`` or ``datetime64`` objects, + the data types in the `to_replace` parameter must match the data + type of the value being replaced: + + >>> df = pd.DataFrame({'A': [True, False, True], + ... 'B': [False, True, False]}) + >>> df.replace({'a string': 'new value', True: False}) # raises + Traceback (most recent call last): + ... + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + + This raises a ``TypeError`` because one of the ``dict`` keys is not of + the correct type for replacement. 
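+
+        As an illustration, a mapping whose keys are all of the matching
+        ``bool`` type goes through; here every ``True`` is simply mapped to
+        ``False``:
+
+        >>> df.replace({True: False})
+               A      B
+        0  False  False
+        1  False  False
+        2  False  False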
+ + Compare the behavior of ``s.replace({'a': None})`` and + ``s.replace('a', None)`` to understand the peculiarities + of the `to_replace` parameter: + + >>> s = pd.Series([10, 'a', 'a', 'b', 'a']) + + When one uses a dict as the `to_replace` value, it is like the + value(s) in the dict are equal to the `value` parameter. + ``s.replace({'a': None})`` is equivalent to + ``s.replace(to_replace={'a': None}, value=None, method=None)``: + + >>> s.replace({'a': None}) + 0 10 + 1 None + 2 None + 3 b + 4 None + dtype: object + + When ``value=None`` and `to_replace` is a scalar, list or + tuple, `replace` uses the method parameter (default 'pad') to do the + replacement. So this is why the 'a' values are being replaced by 10 + in rows 1 and 2 and 'b' in row 4 in this case. + The command ``s.replace('a', None)`` is actually equivalent to + ``s.replace(to_replace='a', value=None, method='pad')``: + + >>> s.replace('a', None) + 0 10 + 1 10 + 2 10 + 3 b + 4 b + dtype: object + """ + + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + inplace = validate_bool_kwarg(inplace, "inplace") + if not is_bool(regex) and to_replace is not None: + raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") + + self._consolidate_inplace() + + if value is None: + # passing a single value that is scalar like + # when value is None (GH5319), for compat + if not is_dict_like(to_replace) and not is_dict_like(regex): + to_replace = [to_replace] + + if isinstance(to_replace, (tuple, list)): + if isinstance(self, ABCDataFrame): + return self.apply( + _single_replace, args=(to_replace, method, inplace, limit) + ) + return _single_replace(self, to_replace, method, inplace, limit) + + if not is_dict_like(to_replace): + if not is_dict_like(regex): + raise TypeError( + 'If "to_replace" and "value" are both None ' + 'and "to_replace" is not a list, then ' + "regex must be a mapping" + ) + to_replace = regex + regex = True + + items = list(to_replace.items()) + keys, values = zip(*items) if items else ([], []) + + are_mappings = [is_dict_like(v) for v in values] + + if any(are_mappings): + if not all(are_mappings): + raise TypeError( + "If a nested mapping is passed, all values " + "of the top level mapping must be mappings" + ) + # passed a nested dict/Series + to_rep_dict = {} + value_dict = {} + + for k, v in items: + keys, values = list(zip(*v.items())) or ([], []) + + to_rep_dict[k] = list(keys) + value_dict[k] = list(values) + + to_replace, value = to_rep_dict, value_dict + else: + to_replace, value = keys, values + + return self.replace( + to_replace, value, inplace=inplace, limit=limit, regex=regex + ) + else: + + # need a non-zero len on all axes + if not self.size: + return self + + new_data = self._data + if is_dict_like(to_replace): + if is_dict_like(value): # {'A' : NA} -> {'A' : 0} + res = self if inplace else self.copy() + for c, src in to_replace.items(): + if c in value and c in self: + # object conversion is handled in + # series.replace which is called recursively + res[c] = res[c].replace( + to_replace=src, + value=value[c], + inplace=False, + regex=regex, + ) + return None if inplace else res + + # {'A': NA} -> 0 + elif not is_list_like(value): + keys = [(k, src) for k, src in to_replace.items() if k in self] + keys_len = len(keys) - 1 + for i, (k, src) in enumerate(keys): + convert = i == keys_len + new_data = new_data.replace( + to_replace=src, + value=value, + 
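+                            # ``filter=[k]`` limits this replacement pass to
+                            # the single column ``k``; ``convert`` is True
+                            # only for the last key, so the resulting blocks
+                            # are converted once rather than once per column.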
filter=[k], + inplace=inplace, + regex=regex, + convert=convert, + ) + else: + raise TypeError("value argument must be scalar, dict, or Series") + + elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if is_list_like(value): + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " + ) + + new_data = self._data.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) + + else: # [NA, ''] -> 0 + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) + elif to_replace is None: + if not ( + is_re_compilable(regex) + or is_list_like(regex) + or is_dict_like(regex) + ): + raise TypeError( + f"'regex' must be a string or a compiled regular expression " + f"or a list or dict of strings or regular expressions, " + f"you passed a {repr(type(regex).__name__)}" + ) + return self.replace( + regex, value, inplace=inplace, limit=limit, regex=True + ) + else: + + # dest iterable dict-like + if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} + new_data = self._data + + for k, v in value.items(): + if k in self: + new_data = new_data.replace( + to_replace=to_replace, + value=v, + filter=[k], + inplace=inplace, + regex=regex, + ) + + elif not is_list_like(value): # NA -> 0 + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) + else: + raise TypeError( + f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' + ) + + if inplace: + self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + _shared_docs[ + "interpolate" + ] = """ + Please note that only ``method='linear'`` is supported for + DataFrame/Series with a MultiIndex. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + + * 'linear': Ignore the index and treat the values as equally + spaced. This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', + 'barycentric', 'polynomial': Passed to + `scipy.interpolate.interp1d`. These methods use the numerical + values of the index. Both 'polynomial' and 'spline' require that + you also specify an `order` (int), e.g. + ``df.interpolate(method='polynomial', order=5)``. + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': + Wrappers around the SciPy interpolation methods of similar + names. See `Notes`. + * 'from_derivatives': Refers to + `scipy.interpolate.BPoly.from_derivatives` which + replaces 'piecewise_polynomial' interpolation method in + scipy 0.18. + axis : {0 or 'index', 1 or 'columns', None}, default None + Axis to interpolate along. + limit : int, optional + Maximum number of consecutive NaNs to fill. Must be greater than + 0. + inplace : bool, default False + Update the data in place if possible. + limit_direction : {'forward', 'backward', 'both'}, default 'forward' + If limit is specified, consecutive NaNs will be filled in this + direction. + limit_area : {`None`, 'inside', 'outside'}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. 
+ * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + .. versionadded:: 0.23.0 + + downcast : optional, 'infer' or None, defaults to None + Downcast dtypes if possible. + **kwargs + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values. + + See Also + -------- + fillna : Fill missing values using different methods. + scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials + (Akima interpolator). + scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the + Bernstein basis. + scipy.interpolate.interp1d : Interpolate a 1-D function. + scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh + interpolator). + scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic + interpolation. + scipy.interpolate.CubicSpline : Cubic spline data interpolator. + + Notes + ----- + The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. + For more information on their behavior, see the + `SciPy documentation + `__ + and `SciPy tutorial + `__. + + Examples + -------- + Filling in ``NaN`` in a :class:`~pandas.Series` via linear + interpolation. + + >>> s = pd.Series([0, 1, np.nan, 3]) + >>> s + 0 0.0 + 1 1.0 + 2 NaN + 3 3.0 + dtype: float64 + >>> s.interpolate() + 0 0.0 + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + + Filling in ``NaN`` in a Series by padding, but filling at most two + consecutive ``NaN`` at a time. + + >>> s = pd.Series([np.nan, "single_one", np.nan, + ... "fill_two_more", np.nan, np.nan, np.nan, + ... 4.71, np.nan]) + >>> s + 0 NaN + 1 single_one + 2 NaN + 3 fill_two_more + 4 NaN + 5 NaN + 6 NaN + 7 4.71 + 8 NaN + dtype: object + >>> s.interpolate(method='pad', limit=2) + 0 NaN + 1 single_one + 2 single_one + 3 fill_two_more + 4 fill_two_more + 5 fill_two_more + 6 NaN + 7 4.71 + 8 4.71 + dtype: object + + Filling in ``NaN`` in a Series via polynomial interpolation or splines: + Both 'polynomial' and 'spline' methods require that you also specify + an ``order`` (int). + + >>> s = pd.Series([0, 2, np.nan, 8]) + >>> s.interpolate(method='polynomial', order=2) + 0 0.000000 + 1 2.000000 + 2 4.666667 + 3 8.000000 + dtype: float64 + + Fill the DataFrame forward (that is, going down) along each column + using linear interpolation. + + Note how the last entry in column 'a' is interpolated differently, + because there is no entry after it to use for interpolation. + Note how the first entry in column 'b' remains ``NaN``, because there + is no entry before it to use for interpolation. + + >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), + ... (np.nan, 2.0, np.nan, np.nan), + ... (2.0, 3.0, np.nan, 9.0), + ... (np.nan, 4.0, -4.0, 16.0)], + ... columns=list('abcd')) + >>> df + a b c d + 0 0.0 NaN -1.0 1.0 + 1 NaN 2.0 NaN NaN + 2 2.0 3.0 NaN 9.0 + 3 NaN 4.0 -4.0 16.0 + >>> df.interpolate(method='linear', limit_direction='forward', axis=0) + a b c d + 0 0.0 NaN -1.0 1.0 + 1 1.0 2.0 -2.0 5.0 + 2 2.0 3.0 -3.0 9.0 + 3 2.0 4.0 -4.0 16.0 + + Using polynomial interpolation. 
+ + >>> df['d'].interpolate(method='polynomial', order=2) + 0 1.0 + 1 4.0 + 2 9.0 + 3 16.0 + Name: d, dtype: float64 + """ + + @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs, + ): + """ + Interpolate values according to different methods. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + axis = self._get_axis_number(axis) + + if axis == 0: + ax = self._info_axis_name + _maybe_transposed_self = self + elif axis == 1: + _maybe_transposed_self = self.T + ax = 1 + + ax = _maybe_transposed_self._get_axis_number(ax) + + if _maybe_transposed_self.ndim == 2: + alt_ax = 1 - ax + else: + alt_ax = ax + + if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear": + raise ValueError( + "Only `method=linear` interpolation is supported on MultiIndexes." + ) + + if _maybe_transposed_self._data.get_dtype_counts().get("object") == len( + _maybe_transposed_self.T + ): + raise TypeError( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + + # create/use the index + if method == "linear": + # prior default + index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax))) + else: + index = _maybe_transposed_self._get_axis(alt_ax) + methods = {"index", "values", "nearest", "time"} + is_numeric_or_datetime = ( + is_numeric_dtype(index) + or is_datetime64_any_dtype(index) + or is_timedelta64_dtype(index) + ) + if method not in methods and not is_numeric_or_datetime: + raise ValueError( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + + if isna(index).any(): + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." + ) + data = _maybe_transposed_self._data + new_data = data.interpolate( + method=method, + axis=ax, + index=index, + values=_maybe_transposed_self, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + **kwargs, + ) + + if inplace: + if axis == 1: + new_data = self._constructor(new_data).T._data + self._update_inplace(new_data) + else: + res = self._constructor(new_data).__finalize__(self) + if axis == 1: + res = res.T + return res + + # ---------------------------------------------------------------------- + # Timeseries methods Methods + + def asof(self, where, subset=None): + """ + Return the last row(s) without any NaNs before `where`. + + The last row (for each element in `where`, if list) without any + NaN is taken. + In case of a :class:`~pandas.DataFrame`, the last row without NaN + considering only the subset of columns (if not `None`) + + If there is no good value, NaN is returned for a Series or + a Series of NaN values for a DataFrame + + Parameters + ---------- + where : date or array-like of dates + Date(s) before which the last row(s) are returned. + subset : str or array-like of str, default `None` + For DataFrame, if not `None`, only use these columns to + check for NaNs. 
+ + Returns + ------- + scalar, Series, or DataFrame + + The return can be: + + * scalar : when `self` is a Series and `where` is a scalar + * Series: when `self` is a Series and `where` is an array-like, + or when `self` is a DataFrame and `where` is a scalar + * DataFrame : when `self` is a DataFrame and `where` is an + array-like + + Return scalar, Series, or DataFrame. + + See Also + -------- + merge_asof : Perform an asof merge. Similar to left join. + + Notes + ----- + Dates are assumed to be sorted. Raises if this is not the case. + + Examples + -------- + A Series and a scalar `where`. + + >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40]) + >>> s + 10 1.0 + 20 2.0 + 30 NaN + 40 4.0 + dtype: float64 + + >>> s.asof(20) + 2.0 + + For a sequence `where`, a Series is returned. The first value is + NaN, because the first element of `where` is before the first + index value. + + >>> s.asof([5, 20]) + 5 NaN + 20 2.0 + dtype: float64 + + Missing values are not considered. The following is ``2.0``, not + NaN, even though NaN is at the index location for ``30``. + + >>> s.asof(30) + 2.0 + + Take all columns into consideration + + >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50], + ... 'b': [None, None, None, None, 500]}, + ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', + ... '2018-02-27 09:02:00', + ... '2018-02-27 09:03:00', + ... '2018-02-27 09:04:00', + ... '2018-02-27 09:05:00'])) + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30'])) + a b + 2018-02-27 09:03:30 NaN NaN + 2018-02-27 09:04:30 NaN NaN + + Take a single column into consideration + + >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', + ... '2018-02-27 09:04:30']), + ... subset=['a']) + a b + 2018-02-27 09:03:30 30.0 NaN + 2018-02-27 09:04:30 40.0 NaN + """ + if isinstance(where, str): + where = Timestamp(where) + + if not self.index.is_monotonic: + raise ValueError("asof requires a sorted index") + + is_series = isinstance(self, ABCSeries) + if is_series: + if subset is not None: + raise ValueError("subset is not valid for Series") + else: + if subset is None: + subset = self.columns + if not is_list_like(subset): + subset = [subset] + + is_list = is_list_like(where) + if not is_list: + start = self.index[0] + if isinstance(self.index, PeriodIndex): + where = Period(where, freq=self.index.freq) + + if where < start: + if not is_series: + from pandas import Series + + return Series(index=self.columns, name=where, dtype=np.float64) + return np.nan + + # It's always much faster to use a *while* loop here for + # Series than pre-computing all the NAs. However a + # *while* loop is extremely expensive for DataFrame + # so we later pre-compute all the NAs and use the same + # code path whether *where* is a scalar or list. 
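+        # Concretely: for a Series with a scalar ``where`` the index is
+        # binary-searched via ``searchsorted`` and we then step backwards
+        # over NaNs; otherwise ``Index.asof_locs`` is used together with a
+        # pre-computed mask of rows that contain any NaN (restricted to
+        # ``subset`` for a DataFrame).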
+ # See PR: https://github.com/pandas-dev/pandas/pull/14476 + if is_series: + loc = self.index.searchsorted(where, side="right") + if loc > 0: + loc -= 1 + + values = self._values + while loc > 0 and isna(values[loc]): + loc -= 1 + return values[loc] + + if not isinstance(where, Index): + where = Index(where) if is_list else Index([where]) + + nulls = self.isna() if is_series else self[subset].isna().any(1) + if nulls.all(): + if is_series: + return self._constructor(np.nan, index=where, name=self.name) + elif is_list: + from pandas import DataFrame + + return DataFrame(np.nan, index=where, columns=self.columns) + else: + from pandas import Series + + return Series(np.nan, index=self.columns, name=where[0]) + + locs = self.index.asof_locs(where, ~(nulls.values)) + + # mask the missing + missing = locs == -1 + data = self.take(locs) + data.index = where + data.loc[missing] = np.nan + return data if is_list else data.iloc[-1] + + # ---------------------------------------------------------------------- + # Action Methods + + _shared_docs[ + "isna" + ] = """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as None or :attr:`numpy.NaN`, gets mapped to True + values. + Everything else gets mapped to False values. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + Returns + ------- + %(klass)s + Mask of bool values for each element in %(klass)s that + indicates whether an element is not an NA value. + + See Also + -------- + %(klass)s.isnull : Alias of isna. + %(klass)s.notna : Boolean inverse of isna. + %(klass)s.dropna : Omit axes labels with missing values. + isna : Top-level isna. + + Examples + -------- + Show which entries in a DataFrame are NA. + + >>> df = pd.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.isna() + age born name toy + 0 False True False True + 1 False False False False + 2 True False False False + + Show which entries in a Series are NA. + + >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.isna() + 0 False + 1 False + 2 True + dtype: bool + """ + + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + def isna(self: FrameOrSeries) -> FrameOrSeries: + return isna(self).__finalize__(self) + + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + def isnull(self: FrameOrSeries) -> FrameOrSeries: + return isna(self).__finalize__(self) + + _shared_docs[ + "notna" + ] = """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to True. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to False + values. + + Returns + ------- + %(klass)s + Mask of bool values for each element in %(klass)s that + indicates whether an element is not an NA value. + + See Also + -------- + %(klass)s.notnull : Alias of notna. + %(klass)s.isna : Boolean inverse of notna. + %(klass)s.dropna : Omit axes labels with missing values. 
+ notna : Top-level notna. + + Examples + -------- + Show which entries in a DataFrame are not NA. + + >>> df = pd.DataFrame({'age': [5, 6, np.NaN], + ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}) + >>> df + age born name toy + 0 5.0 NaT Alfred None + 1 6.0 1939-05-27 Batman Batmobile + 2 NaN 1940-04-25 Joker + + >>> df.notna() + age born name toy + 0 True False True False + 1 True True True True + 2 False True True True + + Show which entries in a Series are not NA. + + >>> ser = pd.Series([5, 6, np.NaN]) + >>> ser + 0 5.0 + 1 6.0 + 2 NaN + dtype: float64 + + >>> ser.notna() + 0 True + 1 True + 2 False + dtype: bool + """ + + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + def notna(self: FrameOrSeries) -> FrameOrSeries: + return notna(self).__finalize__(self) + + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + def notnull(self: FrameOrSeries) -> FrameOrSeries: + return notna(self).__finalize__(self) + + def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): + if (lower is not None and np.any(isna(lower))) or ( + upper is not None and np.any(isna(upper)) + ): + raise ValueError("Cannot use an NA value as a clip threshold") + + result = self + mask = isna(self.values) + + with np.errstate(all="ignore"): + if upper is not None: + subset = self.to_numpy() <= upper + result = result.where(subset, upper, axis=None, inplace=False) + if lower is not None: + subset = self.to_numpy() >= lower + result = result.where(subset, lower, axis=None, inplace=False) + + if np.any(mask): + result[mask] = np.nan + + if inplace: + self._update_inplace(result) + else: + return result + + def _clip_with_one_bound(self, threshold, method, axis, inplace): + + if axis is not None: + axis = self._get_axis_number(axis) + + # method is self.le for upper bound and self.ge for lower bound + if is_scalar(threshold) and is_number(threshold): + if method.__name__ == "le": + return self._clip_with_scalar(None, threshold, inplace=inplace) + return self._clip_with_scalar(threshold, None, inplace=inplace) + + subset = method(threshold, axis=axis) | isna(self) + + # GH #15390 + # In order for where method to work, the threshold must + # be transformed to NDFrame from other array like structure. + if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold): + if isinstance(self, ABCSeries): + threshold = self._constructor(threshold, index=self.index) + else: + threshold = _align_method_FRAME(self, threshold, axis) + return self.where(subset, threshold, axis=axis, inplace=inplace) + + def clip( + self: FrameOrSeries, + lower=None, + upper=None, + axis=None, + inplace: bool_t = False, + *args, + **kwargs, + ) -> FrameOrSeries: + """ + Trim values at input threshold(s). + + Assigns values outside boundary to boundary values. Thresholds + can be singular values or array like, and in the latter case + the clipping is performed element-wise in the specified axis. + + Parameters + ---------- + lower : float or array_like, default None + Minimum threshold value. All values below this + threshold will be set to it. + upper : float or array_like, default None + Maximum threshold value. All values above this + threshold will be set to it. + axis : int or str axis name, optional + Align object with lower and upper along the given axis. + inplace : bool, default False + Whether to perform the operation in place on the data. + + .. 
versionadded:: 0.21.0 + *args, **kwargs + Additional keywords have no effect but might be accepted + for compatibility with numpy. + + Returns + ------- + Series or DataFrame + Same type as calling object with the values outside the + clip boundaries replaced. + + Examples + -------- + >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} + >>> df = pd.DataFrame(data) + >>> df + col_0 col_1 + 0 9 -2 + 1 -3 -7 + 2 0 6 + 3 -1 8 + 4 5 -5 + + Clips per column using lower and upper thresholds: + + >>> df.clip(-4, 6) + col_0 col_1 + 0 6 -2 + 1 -3 -4 + 2 0 6 + 3 -1 6 + 4 5 -4 + + Clips using specific lower and upper thresholds per column element: + + >>> t = pd.Series([2, -4, -1, 6, 3]) + >>> t + 0 2 + 1 -4 + 2 -1 + 3 6 + 4 3 + dtype: int64 + + >>> df.clip(t, t + 4, axis=0) + col_0 col_1 + 0 6 2 + 1 -3 -4 + 2 0 3 + 3 6 8 + 4 5 3 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + axis = nv.validate_clip_with_axis(axis, args, kwargs) + if axis is not None: + axis = self._get_axis_number(axis) + + # GH 17276 + # numpy doesn't like NaN as a clip value + # so ignore + # GH 19992 + # numpy doesn't drop a list-like bound containing NaN + if not is_list_like(lower) and np.any(isna(lower)): + lower = None + if not is_list_like(upper) and np.any(isna(upper)): + upper = None + + # GH 2747 (arguments were reversed) + if lower is not None and upper is not None: + if is_scalar(lower) and is_scalar(upper): + lower, upper = min(lower, upper), max(lower, upper) + + # fast-path for scalars + if (lower is None or (is_scalar(lower) and is_number(lower))) and ( + upper is None or (is_scalar(upper) and is_number(upper)) + ): + return self._clip_with_scalar(lower, upper, inplace=inplace) + + result = self + if lower is not None: + result = result._clip_with_one_bound( + lower, method=self.ge, axis=axis, inplace=inplace + ) + if upper is not None: + if inplace: + result = self + result = result._clip_with_one_bound( + upper, method=self.le, axis=axis, inplace=inplace + ) + + return result + + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the + object, applying a function, and combining the results. This can be + used to group large amounts of data and compute operations on these + groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If an ndarray is passed, the + values are used as-is determine the groups. A label or list of + labels may be passed to group by the columns in ``self``. Notice + that a tuple is interpreted as a (single) key. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. + sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. 
Groupby preserves the order of rows within each group. + group_keys : bool, default True + When calling apply, add group keys to index to identify pieces. + squeeze : bool, default False + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. + + See Also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. + + Notes + ----- + See the `user guide + `_ for more. + """ + + def asfreq( + self: FrameOrSeries, + freq, + method=None, + how: Optional[str] = None, + normalize: bool_t = False, + fill_value=None, + ) -> FrameOrSeries: + """ + Convert TimeSeries to specified frequency. + + Optionally provide filling method to pad/backfill missing values. + + Returns the original data conformed to a new index with the specified + frequency. ``resample`` is more appropriate if an operation, such as + summarization, is necessary to represent the data at the new frequency. + + Parameters + ---------- + freq : DateOffset or str + method : {'backfill'/'bfill', 'pad'/'ffill'}, default None + Method to use for filling holes in reindexed Series (note this + does not fill NaNs that already were present): + + * 'pad' / 'ffill': propagate last valid observation forward to next + valid + * 'backfill' / 'bfill': use NEXT valid observation to fill. + how : {'start', 'end'}, default end + For PeriodIndex only (see PeriodIndex.asfreq). + normalize : bool, default False + Whether to reset output index to midnight. + fill_value : scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). + + Returns + ------- + converted : same type as caller + + See Also + -------- + reindex + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + + Start by creating a series with 4 one minute timestamps. + + >>> index = pd.date_range('1/1/2000', periods=4, freq='T') + >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) + >>> df = pd.DataFrame({'s':series}) + >>> df + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:03:00 3.0 + + Upsample the series into 30 second bins. + + >>> df.asfreq(freq='30S') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 NaN + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 NaN + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``fill value``. + + >>> df.asfreq(freq='30S', fill_value=9.0) + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 9.0 + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 9.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 9.0 + 2000-01-01 00:03:00 3.0 + + Upsample again, providing a ``method``. 
+ + >>> df.asfreq(freq='30S', method='bfill') + s + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 NaN + 2000-01-01 00:01:30 2.0 + 2000-01-01 00:02:00 2.0 + 2000-01-01 00:02:30 3.0 + 2000-01-01 00:03:00 3.0 + """ + from pandas.core.resample import asfreq + + return asfreq( + self, + freq, + method=method, + how=how, + normalize=normalize, + fill_value=fill_value, + ) + + def at_time( + self: FrameOrSeries, time, asof: bool_t = False, axis=None + ) -> FrameOrSeries: + """ + Select values at particular time of day (e.g. 9:30AM). + + Parameters + ---------- + time : datetime.time or str + axis : {0 or 'index', 1 or 'columns'}, default 0 + + .. versionadded:: 0.24.0 + + Returns + ------- + Series or DataFrame + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + between_time : Select values between particular times of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. + DatetimeIndex.indexer_at_time : Get just the index locations for + values at particular time of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='12H') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 00:00:00 1 + 2018-04-09 12:00:00 2 + 2018-04-10 00:00:00 3 + 2018-04-10 12:00:00 4 + + >>> ts.at_time('12:00') + A + 2018-04-09 12:00:00 2 + 2018-04-10 12:00:00 4 + """ + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) + try: + indexer = index.indexer_at_time(time, asof=asof) + except AttributeError: + raise TypeError("Index must be DatetimeIndex") + + return self._take_with_is_copy(indexer, axis=axis) + + def between_time( + self: FrameOrSeries, + start_time, + end_time, + include_start: bool_t = True, + include_end: bool_t = True, + axis=None, + ) -> FrameOrSeries: + """ + Select values between particular times of the day (e.g., 9:00-9:30 AM). + + By setting ``start_time`` to be later than ``end_time``, + you can get the times that are *not* between the two times. + + Parameters + ---------- + start_time : datetime.time or str + end_time : datetime.time or str + include_start : bool, default True + include_end : bool, default True + axis : {0 or 'index', 1 or 'columns'}, default 0 + + .. versionadded:: 0.24.0 + + Returns + ------- + Series or DataFrame + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + at_time : Select values at a particular time of the day. + first : Select initial periods of time series based on a date offset. + last : Select final periods of time series based on a date offset. + DatetimeIndex.indexer_between_time : Get just the index locations for + values between particular times of the day. 
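+
+        The ``include_start`` and ``include_end`` flags documented above are
+        not exercised in the examples below; a minimal illustrative sketch,
+        assuming the semantics stated in the parameter descriptions and the
+        same frame used in the Examples section:
+
+        >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
+        >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
+        >>> ts.between_time('0:20', '1:00', include_start=False)
+                             A
+        2018-04-11 00:40:00  3
+        2018-04-12 01:00:00  4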
+ + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 00:00:00 1 + 2018-04-10 00:20:00 2 + 2018-04-11 00:40:00 3 + 2018-04-12 01:00:00 4 + + >>> ts.between_time('0:15', '0:45') + A + 2018-04-10 00:20:00 2 + 2018-04-11 00:40:00 3 + + You get the times that are *not* between two times by setting + ``start_time`` later than ``end_time``: + + >>> ts.between_time('0:45', '0:15') + A + 2018-04-09 00:00:00 1 + 2018-04-12 01:00:00 4 + """ + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + + index = self._get_axis(axis) + try: + indexer = index.indexer_between_time( + start_time, + end_time, + include_start=include_start, + include_end=include_end, + ) + except AttributeError: + raise TypeError("Index must be DatetimeIndex") + + return self._take_with_is_copy(indexer, axis=axis) + + def resample( + self, + rule, + axis=0, + closed: Optional[str] = None, + label: Optional[str] = None, + convention: str = "start", + kind: Optional[str] = None, + loffset=None, + base: int = 0, + on=None, + level=None, + ): + """ + Resample time-series data. + + Convenience method for frequency conversion and resampling of time + series. Object must have a datetime-like index (`DatetimeIndex`, + `PeriodIndex`, or `TimedeltaIndex`), or pass datetime-like values + to the `on` or `level` keyword. + + Parameters + ---------- + rule : DateOffset, Timedelta or str + The offset string or object representing target conversion. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Which axis to use for up- or down-sampling. For `Series` this + will default to 0, i.e. along the rows. Must be + `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. + closed : {'right', 'left'}, default None + Which side of bin interval is closed. The default is 'left' + for all frequency offsets except for 'M', 'A', 'Q', 'BM', + 'BA', 'BQ', and 'W' which all have a default of 'right'. + label : {'right', 'left'}, default None + Which bin edge label to label bucket with. The default is 'left' + for all frequency offsets except for 'M', 'A', 'Q', 'BM', + 'BA', 'BQ', and 'W' which all have a default of 'right'. + convention : {'start', 'end', 's', 'e'}, default 'start' + For `PeriodIndex` only, controls whether to use the start or + end of `rule`. + kind : {'timestamp', 'period'}, optional, default None + Pass 'timestamp' to convert the resulting index to a + `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`. + By default the input representation is retained. + loffset : timedelta, default None + Adjust the resampled time labels. + base : int, default 0 + For frequencies that evenly subdivide 1 day, the "origin" of the + aggregated intervals. For example, for '5min' frequency, base could + range from 0 through 4. Defaults to 0. + on : str, optional + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. + + level : str or int, optional + For a MultiIndex, level (name or number) to use for + resampling. `level` must be datetime-like. + + Returns + ------- + Resampler object + + See Also + -------- + groupby : Group by mapping, function, label, or list of labels. + Series.resample : Resample a Series. + DataFrame.resample: Resample a DataFrame. + + Notes + ----- + See the `user guide + `_ + for more. + + To learn more about the offset strings, please see `this link + `__. 
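+
+        The ``base`` argument documented above is not shown in the examples
+        below; a minimal sketch (assuming the bin-anchoring behaviour stated
+        in the parameter description), reusing the nine-observation minute
+        series created in the Examples section:
+
+        >>> series.resample('3T', base=1).sum()  # doctest: +SKIP
+
+        With ``base=1`` the 3-minute bins are anchored at 00:01, 00:04, ...
+        rather than 00:00, 00:03, ..., so the first bin contains only the
+        00:00 observation.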
+ + Examples + -------- + + Start by creating a series with 9 one minute timestamps. + + >>> index = pd.date_range('1/1/2000', periods=9, freq='T') + >>> series = pd.Series(range(9), index=index) + >>> series + 2000-01-01 00:00:00 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:02:00 2 + 2000-01-01 00:03:00 3 + 2000-01-01 00:04:00 4 + 2000-01-01 00:05:00 5 + 2000-01-01 00:06:00 6 + 2000-01-01 00:07:00 7 + 2000-01-01 00:08:00 8 + Freq: T, dtype: int64 + + Downsample the series into 3 minute bins and sum the values + of the timestamps falling into a bin. + + >>> series.resample('3T').sum() + 2000-01-01 00:00:00 3 + 2000-01-01 00:03:00 12 + 2000-01-01 00:06:00 21 + Freq: 3T, dtype: int64 + + Downsample the series into 3 minute bins as above, but label each + bin using the right edge instead of the left. Please note that the + value in the bucket used as the label is not included in the bucket, + which it labels. For example, in the original series the + bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed + value in the resampled bucket with the label ``2000-01-01 00:03:00`` + does not include 3 (if it did, the summed value would be 6, not 3). + To include this value close the right side of the bin interval as + illustrated in the example below this one. + + >>> series.resample('3T', label='right').sum() + 2000-01-01 00:03:00 3 + 2000-01-01 00:06:00 12 + 2000-01-01 00:09:00 21 + Freq: 3T, dtype: int64 + + Downsample the series into 3 minute bins as above, but close the right + side of the bin interval. + + >>> series.resample('3T', label='right', closed='right').sum() + 2000-01-01 00:00:00 0 + 2000-01-01 00:03:00 6 + 2000-01-01 00:06:00 15 + 2000-01-01 00:09:00 15 + Freq: 3T, dtype: int64 + + Upsample the series into 30 second bins. + + >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows + 2000-01-01 00:00:00 0.0 + 2000-01-01 00:00:30 NaN + 2000-01-01 00:01:00 1.0 + 2000-01-01 00:01:30 NaN + 2000-01-01 00:02:00 2.0 + Freq: 30S, dtype: float64 + + Upsample the series into 30 second bins and fill the ``NaN`` + values using the ``pad`` method. + + >>> series.resample('30S').pad()[0:5] + 2000-01-01 00:00:00 0 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 1 + 2000-01-01 00:02:00 2 + Freq: 30S, dtype: int64 + + Upsample the series into 30 second bins and fill the + ``NaN`` values using the ``bfill`` method. + + >>> series.resample('30S').bfill()[0:5] + 2000-01-01 00:00:00 0 + 2000-01-01 00:00:30 1 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 2 + 2000-01-01 00:02:00 2 + Freq: 30S, dtype: int64 + + Pass a custom function via ``apply`` + + >>> def custom_resampler(array_like): + ... return np.sum(array_like) + 5 + ... + >>> series.resample('3T').apply(custom_resampler) + 2000-01-01 00:00:00 8 + 2000-01-01 00:03:00 17 + 2000-01-01 00:06:00 26 + Freq: 3T, dtype: int64 + + For a Series with a PeriodIndex, the keyword `convention` can be + used to control whether to use the start or end of `rule`. + + Resample a year by quarter using 'start' `convention`. Values are + assigned to the first quarter of the period. + + >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', + ... freq='A', + ... periods=2)) + >>> s + 2012 1 + 2013 2 + Freq: A-DEC, dtype: int64 + >>> s.resample('Q', convention='start').asfreq() + 2012Q1 1.0 + 2012Q2 NaN + 2012Q3 NaN + 2012Q4 NaN + 2013Q1 2.0 + 2013Q2 NaN + 2013Q3 NaN + 2013Q4 NaN + Freq: Q-DEC, dtype: float64 + + Resample quarters by month using 'end' `convention`. Values are + assigned to the last month of the period. 
+ + >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01', + ... freq='Q', + ... periods=4)) + >>> q + 2018Q1 1 + 2018Q2 2 + 2018Q3 3 + 2018Q4 4 + Freq: Q-DEC, dtype: int64 + >>> q.resample('M', convention='end').asfreq() + 2018-03 1.0 + 2018-04 NaN + 2018-05 NaN + 2018-06 2.0 + 2018-07 NaN + 2018-08 NaN + 2018-09 3.0 + 2018-10 NaN + 2018-11 NaN + 2018-12 4.0 + Freq: M, dtype: float64 + + For DataFrame objects, the keyword `on` can be used to specify the + column instead of the index for resampling. + + >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> df = pd.DataFrame(d) + >>> df['week_starting'] = pd.date_range('01/01/2018', + ... periods=8, + ... freq='W') + >>> df + price volume week_starting + 0 10 50 2018-01-07 + 1 11 60 2018-01-14 + 2 9 40 2018-01-21 + 3 13 100 2018-01-28 + 4 14 50 2018-02-04 + 5 18 100 2018-02-11 + 6 17 40 2018-02-18 + 7 19 50 2018-02-25 + >>> df.resample('M', on='week_starting').mean() + price volume + week_starting + 2018-01-31 10.75 62.5 + 2018-02-28 17.00 60.0 + + For a DataFrame with MultiIndex, the keyword `level` can be used to + specify on which level the resampling needs to take place. + + >>> days = pd.date_range('1/1/2000', periods=4, freq='D') + >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> df2 = pd.DataFrame(d2, + ... index=pd.MultiIndex.from_product([days, + ... ['morning', + ... 'afternoon']] + ... )) + >>> df2 + price volume + 2000-01-01 morning 10 50 + afternoon 11 60 + 2000-01-02 morning 9 40 + afternoon 13 100 + 2000-01-03 morning 14 50 + afternoon 18 100 + 2000-01-04 morning 17 40 + afternoon 19 50 + >>> df2.resample('D', level=0).sum() + price volume + 2000-01-01 21 110 + 2000-01-02 22 140 + 2000-01-03 32 150 + 2000-01-04 36 90 + """ + + from pandas.core.resample import resample + + axis = self._get_axis_number(axis) + return resample( + self, + freq=rule, + label=label, + closed=closed, + axis=axis, + kind=kind, + loffset=loffset, + convention=convention, + base=base, + key=on, + level=level, + ) + + def first(self: FrameOrSeries, offset) -> FrameOrSeries: + """ + Method to subset initial periods of time series data based on a date offset. + + Parameters + ---------- + offset : str, DateOffset, dateutil.relativedelta + + Returns + ------- + subset : same type as caller + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + last : Select final periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + + Get the rows for the first 3 days: + + >>> ts.first('3D') + A + 2018-04-09 1 + 2018-04-11 2 + + Notice the data for 3 first calender days were returned, not the first + 3 days observed in the dataset, and therefore data for 2018-04-13 was + not returned. + """ + if not isinstance(self.index, DatetimeIndex): + raise TypeError("'first' only supports a DatetimeIndex index") + + if len(self.index) == 0: + return self + + offset = to_offset(offset) + end_date = end = self.index[0] + offset + + # Tick-like, e.g. 
3 weeks + if not offset.is_anchored() and hasattr(offset, "_inc"): + if end_date in self.index: + end = self.index.searchsorted(end_date, side="left") + return self.iloc[:end] + + return self.loc[:end] + + def last(self: FrameOrSeries, offset) -> FrameOrSeries: + """ + Method to subset final periods of time series data based on a date offset. + + Parameters + ---------- + offset : str, DateOffset, dateutil.relativedelta + + Returns + ------- + subset : same type as caller + + Raises + ------ + TypeError + If the index is not a :class:`DatetimeIndex` + + See Also + -------- + first : Select initial periods of time series based on a date offset. + at_time : Select values at a particular time of the day. + between_time : Select values between particular times of the day. + + Examples + -------- + >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) + >>> ts + A + 2018-04-09 1 + 2018-04-11 2 + 2018-04-13 3 + 2018-04-15 4 + + Get the rows for the last 3 days: + + >>> ts.last('3D') + A + 2018-04-13 3 + 2018-04-15 4 + + Notice the data for 3 last calender days were returned, not the last + 3 observed days in the dataset, and therefore data for 2018-04-11 was + not returned. + """ + if not isinstance(self.index, DatetimeIndex): + raise TypeError("'last' only supports a DatetimeIndex index") + + if len(self.index) == 0: + return self + + offset = to_offset(offset) + + start_date = self.index[-1] - offset + start = self.index.searchsorted(start_date, side="right") + return self.iloc[start:] + + def rank( + self: FrameOrSeries, + axis=0, + method: str = "average", + numeric_only: Optional[bool_t] = None, + na_option: str = "keep", + ascending: bool_t = True, + pct: bool_t = False, + ) -> FrameOrSeries: + """ + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Index to direct ranking. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups. + + numeric_only : bool, optional + For DataFrame objects, rank only numeric columns if set to True. + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + + * keep: assign NaN rank to NaN values + * top: assign smallest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending. + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. + + See Also + -------- + core.groupby.GroupBy.rank : Rank of values within each group. + + Examples + -------- + + >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog', + ... 'spider', 'snake'], + ... 
'Number_legs': [4, 2, 4, 8, np.nan]}) + >>> df + Animal Number_legs + 0 cat 4.0 + 1 penguin 2.0 + 2 dog 4.0 + 3 spider 8.0 + 4 snake NaN + + The following example shows how the method behaves with the above + parameters: + + * default_rank: this is the default behaviour obtained without using + any parameter. + * max_rank: setting ``method = 'max'`` the records that have the + same values are ranked using the highest rank (e.g.: since 'cat' + and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.) + * NA_bottom: choosing ``na_option = 'bottom'``, if there are records + with NaN values they are placed at the bottom of the ranking. + * pct_rank: when setting ``pct = True``, the ranking is expressed as + percentile rank. + + >>> df['default_rank'] = df['Number_legs'].rank() + >>> df['max_rank'] = df['Number_legs'].rank(method='max') + >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom') + >>> df['pct_rank'] = df['Number_legs'].rank(pct=True) + >>> df + Animal Number_legs default_rank max_rank NA_bottom pct_rank + 0 cat 4.0 2.5 3.0 2.5 0.625 + 1 penguin 2.0 1.0 1.0 1.0 0.250 + 2 dog 4.0 2.5 3.0 2.5 0.625 + 3 spider 8.0 4.0 4.0 4.0 1.000 + 4 snake NaN NaN NaN 5.0 NaN + """ + axis = self._get_axis_number(axis) + + if na_option not in {"keep", "top", "bottom"}: + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + raise ValueError(msg) + + def ranker(data): + ranks = algos.rank( + data.values, + axis=axis, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + ranks = self._constructor(ranks, **data._construct_axes_dict()) + return ranks.__finalize__(self) + + # if numeric_only is None, and we can't get anything, we try with + # numeric_only=True + if numeric_only is None: + try: + return ranker(self) + except TypeError: + numeric_only = True + + if numeric_only: + data = self._get_numeric_data() + else: + data = self + + return ranker(data) + + _shared_docs[ + "align" + ] = """ + Align two objects on their axes with the specified join method. + + Join method is specified for each axis Index. + + Parameters + ---------- + other : DataFrame or Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : allowed axis of the other object, default None + Align on index (0), columns (1), or both (None). + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + copy : bool, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series: + + - pad / ffill: propagate last valid observation forward to next valid. + - backfill / bfill: use NEXT valid observation to fill gap. + + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + fill_axis : %(axes_single_arg)s, default 0 + Filling axis, method and limit. 
+ broadcast_axis : %(axes_single_arg)s, default None + Broadcast values along this axis, if aligning two objects of + different dimensions. + + Returns + ------- + (left, right) : (%(klass)s, type of other) + Aligned objects. + """ + + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + method = missing.clean_fill_method(method) + + if broadcast_axis == 1 and self.ndim != other.ndim: + if isinstance(self, ABCSeries): + # this means other is a DataFrame, and we need to broadcast + # self + cons = self._constructor_expanddim + df = cons( + {c: self for c in other.columns}, **other._construct_axes_dict() + ) + return df._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + elif isinstance(other, ABCSeries): + # this means self is a DataFrame, and we need to broadcast + # other + cons = other._constructor_expanddim + df = cons( + {c: other for c in self.columns}, **self._construct_axes_dict() + ) + return self._align_frame( + df, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + + if axis is not None: + axis = self._get_axis_number(axis) + if isinstance(other, ABCDataFrame): + return self._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + elif isinstance(other, ABCSeries): + return self._align_series( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + def _align_frame( + self, + other, + join="outer", + axis=None, + level=None, + copy: bool_t = True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): + # defaults + join_index, join_columns = None, None + ilidx, iridx = None, None + clidx, cridx = None, None + + is_series = isinstance(self, ABCSeries) + + if axis is None or axis == 0: + if not self.index.equals(other.index): + join_index, ilidx, iridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) + + if axis is None or axis == 1: + if not is_series and not self.columns.equals(other.columns): + join_columns, clidx, cridx = self.columns.join( + other.columns, how=join, level=level, return_indexers=True + ) + + if is_series: + reindexers = {0: [join_index, ilidx]} + else: + reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]} + + left = self._reindex_with_indexers( + reindexers, copy=copy, fill_value=fill_value, allow_dups=True + ) + # other must be always DataFrame + right = other._reindex_with_indexers( + {0: [join_index, iridx], 1: [join_columns, cridx]}, + copy=copy, + fill_value=fill_value, + allow_dups=True, + ) + + if method is not None: + left = self._ensure_type( + left.fillna(method=method, axis=fill_axis, limit=limit) + ) + right = self._ensure_type( + right.fillna(method=method, axis=fill_axis, limit=limit) + ) + + # if DatetimeIndex have different tz, convert to UTC + if is_datetime64tz_dtype(left.index): + if left.index.tz != right.index.tz: + if join_index is not None: + left.index = join_index + right.index = join_index + + return left.__finalize__(self), 
right.__finalize__(other) + + def _align_series( + self, + other, + join="outer", + axis=None, + level=None, + copy: bool_t = True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): + + is_series = isinstance(self, ABCSeries) + + # series/series compat, other must always be a Series + if is_series: + if axis: + raise ValueError("cannot align series to a series other than axis 0") + + # equal + if self.index.equals(other.index): + join_index, lidx, ridx = None, None, None + else: + join_index, lidx, ridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) + + left = self._reindex_indexer(join_index, lidx, copy) + right = other._reindex_indexer(join_index, ridx, copy) + + else: + # one has > 1 ndim + fdata = self._data + if axis == 0: + join_index = self.index + lidx, ridx = None, None + if not self.index.equals(other.index): + join_index, lidx, ridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=1) + + elif axis == 1: + join_index = self.columns + lidx, ridx = None, None + if not self.columns.equals(other.index): + join_index, lidx, ridx = self.columns.join( + other.index, how=join, level=level, return_indexers=True + ) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=0) + else: + raise ValueError("Must specify axis=0 or 1") + + if copy and fdata is self._data: + fdata = fdata.copy() + + left = self._constructor(fdata) + + if ridx is None: + right = other + else: + right = other.reindex(join_index, level=level) + + # fill + fill_na = notna(fill_value) or (method is not None) + if fill_na: + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) + right = right.fillna(fill_value, method=method, limit=limit) + + # if DatetimeIndex have different tz, convert to UTC + if is_series or (not is_series and axis == 0): + if is_datetime64tz_dtype(left.index): + if left.index.tz != right.index.tz: + if join_index is not None: + left.index = join_index + right.index = join_index + + return left.__finalize__(self), right.__finalize__(other) + + def _where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + """ + Equivalent to public method `where`, except that `other` is not + applied as a function even if callable. Used in __setitem__. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + # align the cond to same shape as myself + cond = com.apply_if_callable(cond, self) + if isinstance(cond, NDFrame): + cond, _ = cond.align(self, join="right", broadcast_axis=1) + else: + if not hasattr(cond, "shape"): + cond = np.asanyarray(cond) + if cond.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + cond = self._constructor(cond, **self._construct_axes_dict()) + + # make sure we are boolean + fill_value = bool(inplace) + cond = cond.fillna(fill_value) + + msg = "Boolean array expected for the condition, not {dtype}" + + if not isinstance(cond, ABCDataFrame): + # This is a single-dimensional object. 
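+                # A one-dimensional cond has a single dtype to validate here;
+                # the DataFrame branch below checks each column's dtype
+                # instead (an empty DataFrame cond skips the check).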
+ if not is_bool_dtype(cond): + raise ValueError(msg.format(dtype=cond.dtype)) + elif not cond.empty: + for dt in cond.dtypes: + if not is_bool_dtype(dt): + raise ValueError(msg.format(dtype=dt)) + + cond = -cond if inplace else cond + + # try to align with other + try_quick = True + if hasattr(other, "align"): + + # align with me + if other.ndim <= self.ndim: + + _, other = self.align( + other, join="left", axis=axis, level=level, fill_value=np.nan + ) + + # if we are NOT aligned, raise as we cannot where index + if axis is None and not all( + other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes) + ): + raise InvalidIndexError + + # slice me out of the other + else: + raise NotImplementedError( + "cannot align with a higher dimensional NDFrame" + ) + + if isinstance(other, np.ndarray): + + if other.shape != self.shape: + + if self.ndim == 1: + + icond = cond.values + + # GH 2745 / GH 4192 + # treat like a scalar + if len(other) == 1: + other = np.array(other[0]) + + # GH 3235 + # match True cond to other + elif len(cond[icond]) == len(other): + + # try to not change dtype at first (if try_quick) + if try_quick: + new_other = com.values_from_object(self) + new_other = new_other.copy() + new_other[icond] = other + other = new_other + + else: + raise ValueError( + "Length of replacements must equal series length" + ) + + else: + raise ValueError( + "other must be the same shape as self when an ndarray" + ) + + # we are the same shape, so create an actual object for alignment + else: + other = self._constructor(other, **self._construct_axes_dict()) + + if axis is None: + axis = 0 + + if self.ndim == getattr(other, "ndim", 0): + align = True + else: + align = self._get_axis_number(axis) == 1 + + block_axis = self._get_block_manager_axis(axis) + + if inplace: + # we may have different type blocks come out of putmask, so + # reconstruct the block manager + + self._check_inplace_setting(other) + new_data = self._data.putmask( + mask=cond, + new=other, + align=align, + inplace=True, + axis=block_axis, + transpose=self._AXIS_REVERSED, + ) + self._update_inplace(new_data) + + else: + new_data = self._data.where( + other=other, + cond=cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=block_axis, + ) + + return self._constructor(new_data).__finalize__(self) + + _shared_docs[ + "where" + ] = """ + Replace values where the condition is %(cond_rev)s. + + Parameters + ---------- + cond : bool %(klass)s, array-like, or callable + Where `cond` is %(cond)s, keep the original value. Where + %(cond_rev)s, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the %(klass)s and + should return boolean %(klass)s or array. The callable must + not change input %(klass)s (though pandas doesn't check it). + other : scalar, %(klass)s, or callable + Entries where `cond` is %(cond_rev)s are replaced with + corresponding value from `other`. + If other is callable, it is computed on the %(klass)s and + should return scalar or %(klass)s. The callable must not + change input %(klass)s (though pandas doesn't check it). + inplace : bool, default False + Whether to perform the operation in place on the data. + axis : int, default None + Alignment axis if needed. + level : int, default None + Alignment level if needed. + errors : str, {'raise', 'ignore'}, default 'raise' + Note that currently this parameter won't affect + the results and will always coerce to a suitable dtype. + + - 'raise' : allow exceptions to be raised. + - 'ignore' : suppress exceptions. 
On error return original object. + + try_cast : bool, default False + Try to cast the result back to the input type (if possible). + + Returns + ------- + Same type as caller + + See Also + -------- + :func:`DataFrame.%(name_other)s` : Return an object of same shape as + self. + + Notes + ----- + The %(name)s method is an application of the if-then idiom. For each + element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the + element is used; otherwise the corresponding element from the DataFrame + ``other`` is used. + + The signature for :func:`DataFrame.where` differs from + :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to + ``np.where(m, df1, df2)``. + + For further details and examples see the ``%(name)s`` documentation in + :ref:`indexing `. + + Examples + -------- + >>> s = pd.Series(range(5)) + >>> s.where(s > 0) + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + 4 4.0 + dtype: float64 + + >>> s.mask(s > 0) + 0 0.0 + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: float64 + + >>> s.where(s > 1, 10) + 0 10 + 1 10 + 2 2 + 3 3 + 4 4 + dtype: int64 + + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> df + A B + 0 0 1 + 1 2 3 + 2 4 5 + 3 6 7 + 4 8 9 + >>> m = df %% 3 == 0 + >>> df.where(m, -df) + A B + 0 0 -1 + 1 -2 3 + 2 -4 -5 + 3 6 -7 + 4 -8 9 + >>> df.where(m, -df) == np.where(m, df, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + >>> df.where(m, -df) == df.mask(~m, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + """ + + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="True", + cond_rev="False", + name="where", + name_other="mask", + ) + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + + other = com.apply_if_callable(other, self) + return self._where( + cond, other, inplace, axis, level, errors=errors, try_cast=try_cast + ) + + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="False", + cond_rev="True", + name="mask", + name_other="where", + ) + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") + cond = com.apply_if_callable(cond, self) + + # see gh-21891 + if not hasattr(cond, "__invert__"): + cond = np.array(cond) + + return self.where( + ~cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + try_cast=try_cast, + errors=errors, + ) + + _shared_docs[ + "shift" + ] = """ + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. + + Parameters + ---------- + periods : int + Number of periods to shift. Can be positive or negative. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + axis : {0 or 'index', 1 or 'columns', None}, default None + Shift direction. + fill_value : object, optional + The scalar value to use for newly introduced missing values. 
+ the default depends on the dtype of `self`. + For numeric data, ``np.nan`` is used. + For datetime, timedelta, or period data, etc. :attr:`NaT` is used. + For extension dtypes, ``self.dtype.na_value`` is used. + + .. versionchanged:: 0.24.0 + + Returns + ------- + %(klass)s + Copy of input object, shifted. + + See Also + -------- + Index.shift : Shift values of Index. + DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + tshift : Shift the time index, using the index's frequency if + available. + + Examples + -------- + >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45], + ... 'Col2': [13, 23, 18, 33, 48], + ... 'Col3': [17, 27, 22, 37, 52]}) + + >>> df.shift(periods=3) + Col1 Col2 Col3 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 10.0 13.0 17.0 + 4 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis='columns') + Col1 Col2 Col3 + 0 NaN 10.0 13.0 + 1 NaN 20.0 23.0 + 2 NaN 15.0 18.0 + 3 NaN 30.0 33.0 + 4 NaN 45.0 48.0 + + >>> df.shift(periods=3, fill_value=0) + Col1 Col2 Col3 + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 10 13 17 + 4 20 23 27 + """ + + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: + if periods == 0: + return self.copy() + + block_axis = self._get_block_manager_axis(axis) + if freq is None: + new_data = self._data.shift( + periods=periods, axis=block_axis, fill_value=fill_value + ) + else: + return self.tshift(periods, freq) + + return self._constructor(new_data).__finalize__(self) + + def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: + """ + Equivalent to `shift` without copying data. + + The shifted data will not include the dropped periods and the + shifted axis will be smaller than the original. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative. + + Returns + ------- + shifted : same type as caller + + Notes + ----- + While the `slice_shift` is faster than `shift`, you may pay for it + later during alignment. + """ + if periods == 0: + return self + + if periods > 0: + vslicer = slice(None, -periods) + islicer = slice(periods, None) + else: + vslicer = slice(-periods, None) + islicer = slice(None, periods) + + new_obj = self._slice(vslicer, axis=axis) + shifted_axis = self._get_axis(axis)[islicer] + new_obj.set_axis(shifted_axis, axis=axis, inplace=True) + + return new_obj.__finalize__(self) + + def tshift( + self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + ) -> FrameOrSeries: + """ + Shift the time index, using the index's frequency if available. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative. + freq : DateOffset, timedelta, or str, default None + Increment to use from the tseries module + or time rule expressed as a string (e.g. 'EOM'). + axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 + Corresponds to the axis that contains the Index. + + Returns + ------- + shifted : Series/DataFrame + + Notes + ----- + If freq is not specified then tries to use the freq or inferred_freq + attributes of the index. 
If neither of those attributes exist, a + ValueError is thrown + """ + + index = self._get_axis(axis) + if freq is None: + freq = getattr(index, "freq", None) + + if freq is None: + freq = getattr(index, "inferred_freq", None) + + if freq is None: + msg = "Freq was not given and was not set in the index" + raise ValueError(msg) + + if periods == 0: + return self + + if isinstance(freq, str): + freq = to_offset(freq) + + block_axis = self._get_block_manager_axis(axis) + if isinstance(index, PeriodIndex): + orig_freq = to_offset(index.freq) + if freq == orig_freq: + new_data = self._data.copy() + new_data.axes[block_axis] = index.shift(periods) + elif orig_freq is not None: + msg = ( + f"Given freq {freq.rule_code} does not match" + f" PeriodIndex freq {orig_freq.rule_code}" + ) + raise ValueError(msg) + else: + new_data = self._data.copy() + new_data.axes[block_axis] = index.shift(periods, freq) + + return self._constructor(new_data).__finalize__(self) + + def truncate( + self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True + ) -> FrameOrSeries: + """ + Truncate a Series or DataFrame before and after some index value. + + This is a useful shorthand for boolean indexing based on index + values above or below certain thresholds. + + Parameters + ---------- + before : date, str, int + Truncate all rows before this index value. + after : date, str, int + Truncate all rows after this index value. + axis : {0 or 'index', 1 or 'columns'}, optional + Axis to truncate. Truncates the index (rows) by default. + copy : bool, default is True, + Return a copy of the truncated section. + + Returns + ------- + type of caller + The truncated Series or DataFrame. + + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by label. + DataFrame.iloc : Select a subset of a DataFrame by position. + + Notes + ----- + If the index being truncated contains only datetime values, + `before` and `after` may be specified as strings instead of + Timestamps. + + Examples + -------- + >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'], + ... 'B': ['f', 'g', 'h', 'i', 'j'], + ... 'C': ['k', 'l', 'm', 'n', 'o']}, + ... index=[1, 2, 3, 4, 5]) + >>> df + A B C + 1 a f k + 2 b g l + 3 c h m + 4 d i n + 5 e j o + + >>> df.truncate(before=2, after=4) + A B C + 2 b g l + 3 c h m + 4 d i n + + The columns of a DataFrame can be truncated. + + >>> df.truncate(before="A", after="B", axis="columns") + A B + 1 a f + 2 b g + 3 c h + 4 d i + 5 e j + + For Series, only rows can be truncated. + + >>> df['A'].truncate(before=2, after=4) + 2 b + 3 c + 4 d + Name: A, dtype: object + + The index values in ``truncate`` can be datetimes or string + dates. + + >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s') + >>> df = pd.DataFrame(index=dates, data={'A': 1}) + >>> df.tail() + A + 2016-01-31 23:59:56 1 + 2016-01-31 23:59:57 1 + 2016-01-31 23:59:58 1 + 2016-01-31 23:59:59 1 + 2016-02-01 00:00:00 1 + + >>> df.truncate(before=pd.Timestamp('2016-01-05'), + ... after=pd.Timestamp('2016-01-10')).tail() + A + 2016-01-09 23:59:56 1 + 2016-01-09 23:59:57 1 + 2016-01-09 23:59:58 1 + 2016-01-09 23:59:59 1 + 2016-01-10 00:00:00 1 + + Because the index is a DatetimeIndex containing only dates, we can + specify `before` and `after` as strings. They will be coerced to + Timestamps before truncation. 
+ + >>> df.truncate('2016-01-05', '2016-01-10').tail() + A + 2016-01-09 23:59:56 1 + 2016-01-09 23:59:57 1 + 2016-01-09 23:59:58 1 + 2016-01-09 23:59:59 1 + 2016-01-10 00:00:00 1 + + Note that ``truncate`` assumes a 0 value for any unspecified time + component (midnight). This differs from partial string slicing, which + returns any partially matching dates. + + >>> df.loc['2016-01-05':'2016-01-10', :].tail() + A + 2016-01-10 23:59:55 1 + 2016-01-10 23:59:56 1 + 2016-01-10 23:59:57 1 + 2016-01-10 23:59:58 1 + 2016-01-10 23:59:59 1 + """ + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + # GH 17935 + # Check that index is sorted + if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: + raise ValueError("truncate requires a sorted index") + + # if we have a date index, convert to dates, otherwise + # treat like a slice + if ax.is_all_dates: + from pandas.core.tools.datetimes import to_datetime + + before = to_datetime(before) + after = to_datetime(after) + + if before is not None and after is not None: + if before > after: + raise ValueError(f"Truncate: {after} must be after {before}") + + slicer = [slice(None, None)] * self._AXIS_LEN + slicer[axis] = slice(before, after) + result = self.loc[tuple(slicer)] + + if isinstance(ax, MultiIndex): + setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) + + if copy: + result = result.copy() + + return result + + def tz_convert( + self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True + ) -> FrameOrSeries: + """ + Convert tz-aware axis to target time zone. + + Parameters + ---------- + tz : str or tzinfo object + axis : the axis to convert + level : int, str, default None + If axis is a MultiIndex, convert a specific level. Otherwise + must be None. + copy : bool, default True + Also make a copy of the underlying data. + + Returns + ------- + %(klass)s + Object with time zone converted axis. + + Raises + ------ + TypeError + If the axis is tz-naive. + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + def _tz_convert(ax, tz): + if not hasattr(ax, "tz_convert"): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError( + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" + ) + else: + ax = DatetimeIndex([], tz=tz) + else: + ax = ax.tz_convert(tz) + return ax + + # if a level is given it must be a MultiIndex level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + new_level = _tz_convert(ax.levels[level], tz) + ax = ax.set_levels(new_level, level=level) + else: + if level not in (None, 0, ax.name): + raise ValueError(f"The level {level} is not valid") + ax = _tz_convert(ax, tz) + + result = self._constructor(self._data, copy=copy) + result = result.set_axis(ax, axis=axis, inplace=False) + return result.__finalize__(self) + + def tz_localize( + self: FrameOrSeries, + tz, + axis=0, + level=None, + copy: bool_t = True, + ambiguous="raise", + nonexistent: str = "raise", + ) -> FrameOrSeries: + """ + Localize tz-naive index of a Series or DataFrame to target time zone. + + This operation localizes the Index. To localize the values in a + timezone-naive Series, use :meth:`Series.dt.tz_localize`. + + Parameters + ---------- + tz : str or tzinfo + axis : the axis to localize + level : int, str, default None + If axis ia a MultiIndex, localize a specific level. Otherwise + must be None. 
+ copy : bool, default True + Also make a copy of the underlying data. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from + 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at + 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the + `ambiguous` parameter dictates how ambiguous times should be + handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + nonexistent : str, default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. Valid values are: + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0 + + Returns + ------- + Series or DataFrame + Same type as the input. + + Raises + ------ + TypeError + If the TimeSeries is tz-aware and tz is not None. + + Examples + -------- + + Localize local times: + + >>> s = pd.Series([1], + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00'])) + >>> s.tz_localize('CET') + 2018-09-15 01:30:00+02:00 1 + dtype: int64 + + Be careful with DST changes. When there is sequential data, pandas + can infer the DST time: + + >>> s = pd.Series(range(7), + ... index=pd.DatetimeIndex(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) + >>> s.tz_localize('CET', ambiguous='infer') + 2018-10-28 01:30:00+02:00 0 + 2018-10-28 02:00:00+02:00 1 + 2018-10-28 02:30:00+02:00 2 + 2018-10-28 02:00:00+01:00 3 + 2018-10-28 02:30:00+01:00 4 + 2018-10-28 03:00:00+01:00 5 + 2018-10-28 03:30:00+01:00 6 + dtype: int64 + + In some cases, inferring the DST is impossible. In such cases, you can + pass an ndarray to the ambiguous parameter to set the DST explicitly + + >>> s = pd.Series(range(3), + ... index=pd.DatetimeIndex(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) + >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) + 2018-10-28 01:20:00+02:00 0 + 2018-10-28 02:36:00+02:00 1 + 2018-10-28 03:46:00+01:00 2 + dtype: int64 + + If the DST transition causes nonexistent times, you can shift these + dates forward or backwards with a timedelta object or `'shift_forward'` + or `'shift_backwards'`. + >>> s = pd.Series(range(2), + ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', + ... 
'2015-03-29 03:30:00'])) + >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') + 2015-03-29 03:00:00+02:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward') + 2015-03-29 01:59:59.999999999+01:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + 2015-03-29 03:30:00+02:00 0 + 2015-03-29 03:30:00+02:00 1 + dtype: int64 + """ + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") + if nonexistent not in nonexistent_options and not isinstance( + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" + ) + + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + def _tz_localize(ax, tz, ambiguous, nonexistent): + if not hasattr(ax, "tz_localize"): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError( + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" + ) + else: + ax = DatetimeIndex([], tz=tz) + else: + ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent) + return ax + + # if a level is given it must be a MultiIndex level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent) + ax = ax.set_levels(new_level, level=level) + else: + if level not in (None, 0, ax.name): + raise ValueError(f"The level {level} is not valid") + ax = _tz_localize(ax, tz, ambiguous, nonexistent) + + result = self._constructor(self._data, copy=copy) + result = result.set_axis(ax, axis=axis, inplace=False) + return result.__finalize__(self) + + # ---------------------------------------------------------------------- + # Numeric Methods + def abs(self: FrameOrSeries) -> FrameOrSeries: + """ + Return a Series/DataFrame with absolute numeric value of each element. + + This function only applies to elements that are all numeric. + + Returns + ------- + abs + Series/DataFrame containing the absolute value of each element. + + See Also + -------- + numpy.absolute : Calculate the absolute value element-wise. + + Notes + ----- + For ``complex`` inputs, ``1.2 + 1j``, the absolute value is + :math:`\\sqrt{ a^2 + b^2 }`. + + Examples + -------- + Absolute numeric values in a Series. + + >>> s = pd.Series([-1.10, 2, -3.33, 4]) + >>> s.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 + + Absolute numeric values in a Series with complex numbers. + + >>> s = pd.Series([1.2 + 1j]) + >>> s.abs() + 0 1.56205 + dtype: float64 + + Absolute numeric values in a Series with a Timedelta element. + + >>> s = pd.Series([pd.Timedelta('1 days')]) + >>> s.abs() + 0 1 days + dtype: timedelta64[ns] + + Select rows with data closest to certain value using argsort (from + `StackOverflow `__). + + >>> df = pd.DataFrame({ + ... 'a': [4, 5, 6, 7], + ... 'b': [10, 20, 30, 40], + ... 'c': [100, 50, -30, -50] + ... }) + >>> df + a b c + 0 4 10 100 + 1 5 20 50 + 2 6 30 -30 + 3 7 40 -50 + >>> df.loc[(df.c - 43).abs().argsort()] + a b c + 1 5 20 50 + 0 4 10 100 + 2 6 30 -30 + 3 7 40 -50 + """ + return np.abs(self) + + def describe( + self: FrameOrSeries, percentiles=None, include=None, exclude=None + ) -> FrameOrSeries: + """ + Generate descriptive statistics. 
+ + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + Analyzes both numeric and object series, as well + as ``DataFrame`` column sets of mixed data types. The output + will vary depending on what is provided. Refer to the notes + below for more detail. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should + fall between 0 and 1. The default is + ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. Ignored + for ``Series``. Here are the options: + + - 'all' : All columns of the input will be included in the output. + - A list-like of dtypes : Limits the results to the + provided data types. + To limit the result to numeric types submit + ``numpy.number``. To limit it instead to object columns submit + the ``numpy.object`` data type. Strings + can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + select pandas categorical columns, use ``'category'`` + - None (default) : The result will include all numeric columns. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. Ignored + for ``Series``. Here are the options: + + - A list-like of dtypes : Excludes the provided data types + from the result. To exclude numeric types submit + ``numpy.number``. To exclude object columns submit the data + type ``numpy.object``. Strings can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + exclude pandas categorical columns, use ``'category'`` + - None (default) : The result will exclude nothing. + + Returns + ------- + Series or DataFrame + Summary statistics of the Series or Dataframe provided. + + See Also + -------- + DataFrame.count: Count number of non-NA/null observations. + DataFrame.max: Maximum of the values in the object. + DataFrame.min: Minimum of the values in the object. + DataFrame.mean: Mean of the values. + DataFrame.std: Standard deviation of the observations. + DataFrame.select_dtypes: Subset of a DataFrame including/excluding + columns based on their dtype. + + Notes + ----- + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. + + For object data (e.g. strings or timestamps), the result's index + will include ``count``, ``unique``, ``top``, and ``freq``. The ``top`` + is the most common value. The ``freq`` is the most common value's + frequency. Timestamps also include the ``first`` and ``last`` items. + + If multiple object values have the highest count, then the + ``count`` and ``top`` results will be arbitrarily chosen from + among those with the highest count. + + For mixed data types provided via a ``DataFrame``, the default is to + return only an analysis of numeric columns. If the dataframe consists + only of object and categorical data without any numeric columns, the + default is to return an analysis of both the object and categorical + columns. If ``include='all'`` is provided as an option, the result + will include a union of attributes of each type. 
+ + The `include` and `exclude` parameters can be used to limit + which columns in a ``DataFrame`` are analyzed for the output. + The parameters are ignored when analyzing a ``Series``. + + Examples + -------- + Describing a numeric ``Series``. + + >>> s = pd.Series([1, 2, 3]) + >>> s.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + dtype: float64 + + Describing a categorical ``Series``. + + >>> s = pd.Series(['a', 'a', 'b', 'c']) + >>> s.describe() + count 4 + unique 3 + top a + freq 2 + dtype: object + + Describing a timestamp ``Series``. + + >>> s = pd.Series([ + ... np.datetime64("2000-01-01"), + ... np.datetime64("2010-01-01"), + ... np.datetime64("2010-01-01") + ... ]) + >>> s.describe() + count 3 + unique 2 + top 2010-01-01 00:00:00 + freq 2 + first 2000-01-01 00:00:00 + last 2010-01-01 00:00:00 + dtype: object + + Describing a ``DataFrame``. By default only numeric fields + are returned. + + >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']), + ... 'numeric': [1, 2, 3], + ... 'object': ['a', 'b', 'c'] + ... }) + >>> df.describe() + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Describing all columns of a ``DataFrame`` regardless of data type. + + >>> df.describe(include='all') + categorical numeric object + count 3 3.0 3 + unique 3 NaN 3 + top f NaN c + freq 1 NaN 1 + mean NaN 2.0 NaN + std NaN 1.0 NaN + min NaN 1.0 NaN + 25% NaN 1.5 NaN + 50% NaN 2.0 NaN + 75% NaN 2.5 NaN + max NaN 3.0 NaN + + Describing a column from a ``DataFrame`` by accessing it as + an attribute. + + >>> df.numeric.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + Name: numeric, dtype: float64 + + Including only numeric columns in a ``DataFrame`` description. + + >>> df.describe(include=[np.number]) + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Including only string columns in a ``DataFrame`` description. + + >>> df.describe(include=[np.object]) + object + count 3 + unique 3 + top c + freq 1 + + Including only categorical columns from a ``DataFrame`` description. + + >>> df.describe(include=['category']) + categorical + count 3 + unique 3 + top f + freq 1 + + Excluding numeric columns from a ``DataFrame`` description. + + >>> df.describe(exclude=[np.number]) + categorical object + count 3 3 + unique 3 3 + top f c + freq 1 1 + + Excluding object columns from a ``DataFrame`` description. 
+ + >>> df.describe(exclude=[np.object]) + categorical numeric + count 3 3.0 + unique 3 NaN + top f NaN + freq 1 NaN + mean NaN 2.0 + std NaN 1.0 + min NaN 1.0 + 25% NaN 1.5 + 50% NaN 2.0 + 75% NaN 2.5 + max NaN 3.0 + """ + if self.ndim == 2 and self.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") + + if percentiles is not None: + # explicit conversion of `percentiles` to list + percentiles = list(percentiles) + + # get them all to be in [0, 1] + validate_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + percentiles = np.asarray(percentiles) + else: + percentiles = np.array([0.25, 0.5, 0.75]) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts + + formatted_percentiles = format_percentiles(percentiles) + + def describe_numeric_1d(series): + stat_index = ( + ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + ) + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) + return pd.Series(d, index=stat_index, name=series.name) + + def describe_categorical_1d(data): + names = ["count", "unique"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + result = [data.count(), count_unique] + dtype = None + if result[1] > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + + if is_datetime64_any_dtype(data): + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + names += ["top", "freq"] + result += [np.nan, np.nan] + dtype = "object" + + return pd.Series(result, index=names, name=data.name, dtype=dtype) + + def describe_1d(data): + if is_bool_dtype(data): + return describe_categorical_1d(data) + elif is_numeric_dtype(data): + return describe_numeric_1d(data) + elif is_timedelta64_dtype(data): + return describe_numeric_1d(data) + else: + return describe_categorical_1d(data) + + if self.ndim == 1: + return describe_1d(self) + elif (include is None) and (exclude is None): + # when some numerics are found, keep only numerics + data = self.select_dtypes(include=[np.number]) + if len(data.columns) == 0: + data = self + elif include == "all": + if exclude is not None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + data = self + else: + data = self.select_dtypes(include=include, exclude=exclude) + + ldesc = [describe_1d(s) for _, s in data.items()] + # set a convenient order for rows + names: List[Optional[Hashable]] = [] + ldesc_indexes = sorted((x.index for x in ldesc), key=len) + for idxnames in ldesc_indexes: + for name in idxnames: + if name not in names: + names.append(name) + + d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) + d.columns = data.columns.copy() + return d + + _shared_docs[ + "pct_change" + ] = """ + Percentage change between the current and a 
prior element. + + Computes the percentage change from the immediately previous row by + default. This is useful in comparing the percentage of change in a time + series of elements. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'pad' + How to handle NAs before computing percent changes. + limit : int, default None + The number of consecutive NAs to fill before stopping. + freq : DateOffset, timedelta, or str, optional + Increment to use from time series API (e.g. 'M' or BDay()). + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift` or `Series.shift`. + + Returns + ------- + chg : Series or DataFrame + The same type as the calling object. + + See Also + -------- + Series.diff : Compute the difference of two elements in a Series. + DataFrame.diff : Compute the difference of two elements in a DataFrame. + Series.shift : Shift the index by some number of periods. + DataFrame.shift : Shift the index by some number of periods. + + Examples + -------- + **Series** + + >>> s = pd.Series([90, 91, 85]) + >>> s + 0 90 + 1 91 + 2 85 + dtype: int64 + + >>> s.pct_change() + 0 NaN + 1 0.011111 + 2 -0.065934 + dtype: float64 + + >>> s.pct_change(periods=2) + 0 NaN + 1 NaN + 2 -0.055556 + dtype: float64 + + See the percentage change in a Series where filling NAs with last + valid observation forward to next valid. + + >>> s = pd.Series([90, 91, None, 85]) + >>> s + 0 90.0 + 1 91.0 + 2 NaN + 3 85.0 + dtype: float64 + + >>> s.pct_change(fill_method='ffill') + 0 NaN + 1 0.011111 + 2 0.000000 + 3 -0.065934 + dtype: float64 + + **DataFrame** + + Percentage change in French franc, Deutsche Mark, and Italian lira from + 1980-01-01 to 1980-03-01. + + >>> df = pd.DataFrame({ + ... 'FR': [4.0405, 4.0963, 4.3149], + ... 'GR': [1.7246, 1.7482, 1.8519], + ... 'IT': [804.74, 810.01, 860.13]}, + ... index=['1980-01-01', '1980-02-01', '1980-03-01']) + >>> df + FR GR IT + 1980-01-01 4.0405 1.7246 804.74 + 1980-02-01 4.0963 1.7482 810.01 + 1980-03-01 4.3149 1.8519 860.13 + + >>> df.pct_change() + FR GR IT + 1980-01-01 NaN NaN NaN + 1980-02-01 0.013810 0.013684 0.006549 + 1980-03-01 0.053365 0.059318 0.061876 + + Percentage of change in GOOG and APPL stock volume. Shows computing + the percentage change between columns. + + >>> df = pd.DataFrame({ + ... '2016': [1769950, 30586265], + ... '2015': [1500923, 40912316], + ... '2014': [1371819, 41403351]}, + ... index=['GOOG', 'APPL']) + >>> df + 2016 2015 2014 + GOOG 1769950 1500923 1371819 + APPL 30586265 40912316 41403351 + + >>> df.pct_change(axis='columns') + 2016 2015 2014 + GOOG NaN -0.151997 -0.086016 + APPL NaN 0.337604 0.012002 + """ + + @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) + def pct_change( + self: FrameOrSeries, + periods=1, + fill_method="pad", + limit=None, + freq=None, + **kwargs, + ) -> FrameOrSeries: + # TODO: Not sure if above is correct - need someone to confirm. 
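    # Illustrative aside, not part of the pandas source being diffed: the
    # `fill_method` handling just below decides how NAs are treated before
    # the shift/divide. With made-up values, using only the public API:
    #   >>> s = pd.Series([1.0, None, 3.0])
    #   >>> s.pct_change().tolist()            # default fill_method='pad'
    #   [nan, 0.0, 2.0]
    #   >>> s.pct_change(fill_method=None).tolist()
    #   [nan, nan, nan]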
+ axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) + if fill_method is None: + data = self + else: + data = self._ensure_type( + self.fillna(method=fill_method, axis=axis, limit=limit) + ) + + rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) + return rs + + def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") + grouped = self.groupby(level=level, axis=axis, sort=False) + if hasattr(grouped, name) and skipna: + return getattr(grouped, name)(**kwargs) + axis = self._get_axis_number(axis) + method = getattr(type(self), name) + applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) + return grouped.aggregate(applyf) + + @classmethod + def _add_numeric_operations(cls): + """ + Add the operations to the cls; evaluate the doc strings again + """ + + axis_descr, name, name2 = _doc_parms(cls) + + cls.any = _make_logical_function( + cls, + "any", + name, + name2, + axis_descr, + _any_desc, + nanops.nanany, + _any_see_also, + _any_examples, + empty_value=False, + ) + cls.all = _make_logical_function( + cls, + "all", + name, + name2, + axis_descr, + _all_desc, + nanops.nanall, + _all_see_also, + _all_examples, + empty_value=True, + ) + + @Substitution( + desc="Return the mean absolute deviation of the values " + "for the requested axis.", + name1=name, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) + @Appender(_num_doc) + def mad(self, axis=None, skipna=None, level=None): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) + + data = self._get_numeric_data() + if axis == 0: + demeaned = data - data.mean(axis=0) + else: + demeaned = data.sub(data.mean(axis=1), axis=0) + return np.abs(demeaned).mean(axis=axis, skipna=skipna) + + cls.mad = mad + + cls.sem = _make_stat_function_ddof( + cls, + "sem", + name, + name2, + axis_descr, + "Return unbiased standard error of the mean over requested " + "axis.\n\nNormalized by N-1 by default. This can be changed " + "using the ddof argument", + nanops.nansem, + ) + cls.var = _make_stat_function_ddof( + cls, + "var", + name, + name2, + axis_descr, + "Return unbiased variance over requested axis.\n\nNormalized by " + "N-1 by default. This can be changed using the ddof argument", + nanops.nanvar, + ) + cls.std = _make_stat_function_ddof( + cls, + "std", + name, + name2, + axis_descr, + "Return sample standard deviation over requested axis." + "\n\nNormalized by N-1 by default. 
This can be changed using the " + "ddof argument", + nanops.nanstd, + ) + + cls.cummin = _make_cum_function( + cls, + "cummin", + name, + name2, + axis_descr, + "minimum", + np.minimum.accumulate, + "min", + np.inf, + np.nan, + _cummin_examples, + ) + cls.cumsum = _make_cum_function( + cls, + "cumsum", + name, + name2, + axis_descr, + "sum", + np.cumsum, + "sum", + 0.0, + np.nan, + _cumsum_examples, + ) + cls.cumprod = _make_cum_function( + cls, + "cumprod", + name, + name2, + axis_descr, + "product", + np.cumprod, + "prod", + 1.0, + np.nan, + _cumprod_examples, + ) + cls.cummax = _make_cum_function( + cls, + "cummax", + name, + name2, + axis_descr, + "maximum", + np.maximum.accumulate, + "max", + -np.inf, + np.nan, + _cummax_examples, + ) + + cls.sum = _make_min_count_stat_function( + cls, + "sum", + name, + name2, + axis_descr, + """Return the sum of the values for the requested axis.\n + This is equivalent to the method ``numpy.sum``.""", + nanops.nansum, + _stat_func_see_also, + _sum_examples, + ) + cls.mean = _make_stat_function( + cls, + "mean", + name, + name2, + axis_descr, + "Return the mean of the values for the requested axis.", + nanops.nanmean, + ) + cls.skew = _make_stat_function( + cls, + "skew", + name, + name2, + axis_descr, + "Return unbiased skew over requested axis.\n\nNormalized by N-1.", + nanops.nanskew, + ) + cls.kurt = _make_stat_function( + cls, + "kurt", + name, + name2, + axis_descr, + "Return unbiased kurtosis over requested axis.\n\n" + "Kurtosis obtained using Fisher's definition of\n" + "kurtosis (kurtosis of normal == 0.0). Normalized " + "by N-1.", + nanops.nankurt, + ) + cls.kurtosis = cls.kurt + cls.prod = _make_min_count_stat_function( + cls, + "prod", + name, + name2, + axis_descr, + "Return the product of the values for the requested axis.", + nanops.nanprod, + examples=_prod_examples, + ) + cls.product = cls.prod + cls.median = _make_stat_function( + cls, + "median", + name, + name2, + axis_descr, + "Return the median of the values for the requested axis.", + nanops.nanmedian, + ) + cls.max = _make_stat_function( + cls, + "max", + name, + name2, + axis_descr, + """Return the maximum of the values for the requested axis.\n + If you want the *index* of the maximum, use ``idxmax``. This is + the equivalent of the ``numpy.ndarray`` method ``argmax``.""", + nanops.nanmax, + _stat_func_see_also, + _max_examples, + ) + cls.min = _make_stat_function( + cls, + "min", + name, + name2, + axis_descr, + """Return the minimum of the values for the requested axis.\n + If you want the *index* of the minimum, use ``idxmin``. This is + the equivalent of the ``numpy.ndarray`` method ``argmin``.""", + nanops.nanmin, + _stat_func_see_also, + _min_examples, + ) + + @classmethod + def _add_series_or_dataframe_operations(cls): + """ + Add the series or dataframe only operations to the cls; evaluate + the doc strings again. 
+ """ + + from pandas.core.window import EWM, Expanding, Rolling, Window + + @Appender(Rolling.__doc__) + def rolling( + self, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): + axis = self._get_axis_number(axis) + + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + return Rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + cls.rolling = rolling + + @Appender(Expanding.__doc__) + def expanding(self, min_periods=1, center=False, axis=0): + axis = self._get_axis_number(axis) + return Expanding(self, min_periods=min_periods, center=center, axis=axis) + + cls.expanding = expanding + + @Appender(EWM.__doc__) + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): + axis = self._get_axis_number(axis) + return EWM( + self, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) + + cls.ewm = ewm + + @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs)) + def transform(self, func, *args, **kwargs): + result = self.agg(func, *args, **kwargs) + if is_scalar(result) or len(result) != len(self): + raise ValueError("transforms cannot produce aggregated results") + + return result + + # ---------------------------------------------------------------------- + # Misc methods + + _shared_docs[ + "valid_index" + ] = """ + Return index for %(position)s non-NA/null value. + + Returns + ------- + scalar : type of index + + Notes + ----- + If all elements are non-NA/null, returns None. + Also returns None for empty %(klass)s. + """ + + def _find_valid_index(self, how: str): + """ + Retrieves the index of the first valid value. + + Parameters + ---------- + how : {'first', 'last'} + Use this parameter to change between the first or last valid index. + + Returns + ------- + idx_first_valid : type of index + """ + + idxpos = find_valid_index(self._values, how) + if idxpos is None: + return None + return self.index[idxpos] + + @Appender( + _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"} + ) + def first_valid_index(self): + return self._find_valid_index("first") + + @Appender( + _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"} + ) + def last_valid_index(self): + return self._find_valid_index("last") + + +def _doc_parms(cls): + """Return a tuple of the doc parms.""" + axis_descr = ( + f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}" + ) + name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar" + name2 = cls.__name__ + return axis_descr, name, name2 + + +_num_doc = """ +%(desc)s + +Parameters +---------- +axis : %(axis_descr)s + Axis for the function to be applied on. +skipna : bool, default True + Exclude NA/null values when computing the result. +level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a %(name1)s. +numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. Not implemented for Series. 
+%(min_count)s\ +**kwargs + Additional keyword arguments to be passed to the function. + +Returns +------- +%(name1)s or %(name2)s (if level specified)\ +%(see_also)s\ +%(examples)s +""" + +_num_ddof_doc = """ +%(desc)s + +Parameters +---------- +axis : %(axis_descr)s +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a %(name1)s. +ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. +numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. Not implemented for Series. + +Returns +------- +%(name1)s or %(name2)s (if level specified)\n""" + +_bool_doc = """ +%(desc)s + +Parameters +---------- +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. + +bool_only : bool, default None + Include only boolean columns. If None, will attempt to use everything, + then use only boolean data. Not implemented for Series. +skipna : bool, default True + Exclude NA/null values. If the entire row/column is NA and skipna is + True, then the result will be %(empty_value)s, as for an empty row/column. + If skipna is False, then NA are treated as True, because these are not + equal to zero. +level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a %(name1)s. +**kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +%(name1)s or %(name2)s + If level is specified, then, %(name2)s is returned; otherwise, %(name1)s + is returned. + +%(see_also)s +%(examples)s""" + +_all_desc = """\ +Return whether all elements are True, potentially over an axis. + +Returns True unless there at least one element within a series or +along a Dataframe axis that is False or equivalent (e.g. zero or +empty).""" + +_all_examples = """\ +Examples +-------- +**Series** + +>>> pd.Series([True, True]).all() +True +>>> pd.Series([True, False]).all() +False +>>> pd.Series([]).all() +True +>>> pd.Series([np.nan]).all() +True +>>> pd.Series([np.nan]).all(skipna=False) +True + +**DataFrames** + +Create a dataframe from a dictionary. + +>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]}) +>>> df + col1 col2 +0 True True +1 True False + +Default behaviour checks if column-wise values all return True. + +>>> df.all() +col1 True +col2 False +dtype: bool + +Specify ``axis='columns'`` to check if row-wise values all return True. + +>>> df.all(axis='columns') +0 True +1 False +dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False +""" + +_all_see_also = """\ +See Also +-------- +Series.all : Return True if all elements are True. +DataFrame.any : Return True if one (or more) elements are True. +""" + +_cnum_doc = """ +Return cumulative %(desc)s over a DataFrame or Series axis. + +Returns a DataFrame or Series of the same size containing the cumulative +%(desc)s. 
+ +Parameters +---------- +axis : {0 or 'index', 1 or 'columns'}, default 0 + The index or the name of the axis. 0 is equivalent to None or 'index'. +skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. +*args, **kwargs : + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + +Returns +------- +%(name1)s or %(name2)s + +See Also +-------- +core.window.Expanding.%(accum_func_name)s : Similar functionality + but ignores ``NaN`` values. +%(name2)s.%(accum_func_name)s : Return the %(desc)s over + %(name2)s axis. +%(name2)s.cummax : Return cumulative maximum over %(name2)s axis. +%(name2)s.cummin : Return cumulative minimum over %(name2)s axis. +%(name2)s.cumsum : Return cumulative sum over %(name2)s axis. +%(name2)s.cumprod : Return cumulative product over %(name2)s axis. + +%(examples)s""" + +_cummin_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cummin() +0 2.0 +1 NaN +2 2.0 +3 -1.0 +4 -1.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cummin(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the minimum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cummin() + A B +0 2.0 1.0 +1 2.0 NaN +2 1.0 0.0 + +To iterate over columns and find the minimum in each row, +use ``axis=1`` + +>>> df.cummin(axis=1) + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 +""" + +_cumsum_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cumsum() +0 2.0 +1 NaN +2 7.0 +3 6.0 +4 6.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cumsum(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the sum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cumsum() + A B +0 2.0 1.0 +1 5.0 NaN +2 6.0 1.0 + +To iterate over columns and find the sum in each row, +use ``axis=1`` + +>>> df.cumsum(axis=1) + A B +0 2.0 3.0 +1 3.0 NaN +2 1.0 1.0 +""" + +_cumprod_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cumprod() +0 2.0 +1 NaN +2 10.0 +3 -10.0 +4 -0.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cumprod(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the product +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. 
+ +>>> df.cumprod() + A B +0 2.0 1.0 +1 6.0 NaN +2 6.0 0.0 + +To iterate over columns and find the product in each row, +use ``axis=1`` + +>>> df.cumprod(axis=1) + A B +0 2.0 2.0 +1 3.0 NaN +2 1.0 0.0 +""" + +_cummax_examples = """\ +Examples +-------- +**Series** + +>>> s = pd.Series([2, np.nan, 5, -1, 0]) +>>> s +0 2.0 +1 NaN +2 5.0 +3 -1.0 +4 0.0 +dtype: float64 + +By default, NA values are ignored. + +>>> s.cummax() +0 2.0 +1 NaN +2 5.0 +3 5.0 +4 5.0 +dtype: float64 + +To include NA values in the operation, use ``skipna=False`` + +>>> s.cummax(skipna=False) +0 2.0 +1 NaN +2 NaN +3 NaN +4 NaN +dtype: float64 + +**DataFrame** + +>>> df = pd.DataFrame([[2.0, 1.0], +... [3.0, np.nan], +... [1.0, 0.0]], +... columns=list('AB')) +>>> df + A B +0 2.0 1.0 +1 3.0 NaN +2 1.0 0.0 + +By default, iterates over rows and finds the maximum +in each column. This is equivalent to ``axis=None`` or ``axis='index'``. + +>>> df.cummax() + A B +0 2.0 1.0 +1 3.0 NaN +2 3.0 1.0 + +To iterate over columns and find the maximum in each row, +use ``axis=1`` + +>>> df.cummax(axis=1) + A B +0 2.0 2.0 +1 3.0 NaN +2 1.0 1.0 +""" + +_any_see_also = """\ +See Also +-------- +numpy.any : Numpy version of this method. +Series.any : Return whether any element is True. +Series.all : Return whether all elements are True. +DataFrame.any : Return whether any element is True over requested axis. +DataFrame.all : Return whether all elements are True over requested axis. +""" + +_any_desc = """\ +Return whether any element is True, potentially over an axis. + +Returns False unless there at least one element within a series or +along a Dataframe axis that is True or equivalent (e.g. non-zero or +non-empty).""" + +_any_examples = """\ +Examples +-------- +**Series** + +For Series input, the output is a scalar indicating whether any element +is True. + +>>> pd.Series([False, False]).any() +False +>>> pd.Series([True, False]).any() +True +>>> pd.Series([]).any() +False +>>> pd.Series([np.nan]).any() +False +>>> pd.Series([np.nan]).any(skipna=False) +True + +**DataFrame** + +Whether each column contains at least one True element (the default). + +>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]}) +>>> df + A B C +0 1 0 0 +1 2 2 0 + +>>> df.any() +A True +B True +C False +dtype: bool + +Aggregating over the columns. + +>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]}) +>>> df + A B +0 True 1 +1 False 2 + +>>> df.any(axis='columns') +0 True +1 True +dtype: bool + +>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]}) +>>> df + A B +0 True 1 +1 False 0 + +>>> df.any(axis='columns') +0 True +1 False +dtype: bool + +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + +`any` for an empty DataFrame is an empty Series. + +>>> pd.DataFrame([]).any() +Series([], dtype: bool) +""" + +_shared_docs[ + "stat_func_example" +] = """ + +Examples +-------- +>>> idx = pd.MultiIndex.from_arrays([ +... ['warm', 'warm', 'cold', 'cold'], +... ['dog', 'falcon', 'fish', 'spider']], +... names=['blooded', 'animal']) +>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx) +>>> s +blooded animal +warm dog 4 + falcon 2 +cold fish 0 + spider 8 +Name: legs, dtype: int64 + +>>> s.{stat_func}() +{default_output} + +{verb} using level names, as well as indices. 
+ +>>> s.{stat_func}(level='blooded') +blooded +warm {level_output_0} +cold {level_output_1} +Name: legs, dtype: int64 + +>>> s.{stat_func}(level=0) +blooded +warm {level_output_0} +cold {level_output_1} +Name: legs, dtype: int64""" + +_sum_examples = _shared_docs["stat_func_example"].format( + stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8 +) + +_sum_examples += """ + +By default, the sum of an empty or all-NA Series is ``0``. + +>>> pd.Series([]).sum() # min_count=0 is the default +0.0 + +This can be controlled with the ``min_count`` parameter. For example, if +you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + +>>> pd.Series([]).sum(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).sum() +0.0 + +>>> pd.Series([np.nan]).sum(min_count=1) +nan""" + +_max_examples = _shared_docs["stat_func_example"].format( + stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8 +) + +_min_examples = _shared_docs["stat_func_example"].format( + stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 +) + +_stat_func_see_also = """ + +See Also +-------- +Series.sum : Return the sum. +Series.min : Return the minimum. +Series.max : Return the maximum. +Series.idxmin : Return the index of the minimum. +Series.idxmax : Return the index of the maximum. +DataFrame.sum : Return the sum over the requested axis. +DataFrame.min : Return the minimum over the requested axis. +DataFrame.max : Return the maximum over the requested axis. +DataFrame.idxmin : Return the index of the minimum over the requested axis. +DataFrame.idxmax : Return the index of the maximum over the requested axis.""" + +_prod_examples = """ + +Examples +-------- +By default, the product of an empty or all-NA Series is ``1`` + +>>> pd.Series([]).prod() +1.0 + +This can be controlled with the ``min_count`` parameter + +>>> pd.Series([]).prod(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).prod() +1.0 + +>>> pd.Series([np.nan]).prod(min_count=1) +nan""" + +_min_count_stub = """\ +min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + + .. versionadded:: 0.22.0 + + Added with the default being 0. This means the sum of an all-NA + or empty Series is 0, and the product of an all-NA or empty + Series is 1. 
+""" + + +def _make_min_count_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count=_min_count_stub, + see_also=see_also, + examples=examples, + ) + @Appender(_num_doc) + def stat_func( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + if name == "sum": + nv.validate_sum(tuple(), kwargs) + elif name == "prod": + nv.validate_prod(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, min_count=min_count + ) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + + return set_function_name(stat_func, name, cls) + + +def _make_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also=see_also, + examples=examples, + ) + @Appender(_num_doc) + def stat_func( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + if name == "median": + nv.validate_median(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, name, axis=axis, skipna=skipna, numeric_only=numeric_only + ) + + return set_function_name(stat_func, name, cls) + + +def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): + @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) + @Appender(_num_ddof_doc) + def stat_func( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, ddof=ddof + ) + return self._reduce( + f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + ) + + return set_function_name(stat_func, name, cls) + + +def _make_cum_function( + cls, + name, + name1, + name2, + axis_descr, + desc, + accum_func, + accum_func_name, + mask_a, + mask_b, + examples, +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name=accum_func_name, + examples=examples, + ) + @Appender(_cnum_doc) + def cum_func(self, axis=None, skipna=True, *args, **kwargs): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) + if axis is None: + axis = self._stat_axis_number + else: + axis = self._get_axis_number(axis) + + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. 
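                # Illustrative aside, not part of the pandas source being diffed:
                # the user-visible behaviour this masking preserves is that NaT
                # positions survive a datetime cummin/cummax, e.g. (hypothetical
                # values):
                #   >>> s = pd.Series(pd.to_datetime(["2020-01-02", None, "2020-01-01"]))
                #   >>> s.cummin()
                #   0   2020-01-02
                #   1          NaT
                #   2   2020-01-01
                #   dtype: datetime64[ns]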
+ orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) + + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result + + result = self._data.apply(na_accum_func) + + d = self._construct_axes_dict() + d["copy"] = False + return self._constructor(result, **d).__finalize__(self) + + return set_function_name(cum_func, name, cls) + + +def _make_logical_function( + cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + see_also=see_also, + examples=examples, + empty_value=empty_value, + ) + @Appender(_bool_doc) + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + nv.validate_logical_func(tuple(), kwargs, fname=name) + if level is not None: + if bool_only is not None: + raise NotImplementedError( + "Option bool_only is not implemented with option level." + ) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", + ) + + return set_function_name(logical_func, name, cls) diff --git a/venv/Lib/site-packages/pandas/core/groupby/__init__.py b/venv/Lib/site-packages/pandas/core/groupby/__init__.py new file mode 100644 index 0000000..0c5d265 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/__init__.py @@ -0,0 +1,11 @@ +from pandas.core.groupby.generic import DataFrameGroupBy, NamedAgg, SeriesGroupBy +from pandas.core.groupby.groupby import GroupBy +from pandas.core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/venv/Lib/site-packages/pandas/core/groupby/base.py b/venv/Lib/site-packages/pandas/core/groupby/base.py new file mode 100644 index 0000000..700d8d5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/base.py @@ -0,0 +1,188 @@ +""" +Provide basic components for groupby. These definitions +hold the whitelist of methods that are exposed on the +SeriesGroupBy and the DataFrameGroupBy objects. +""" +import collections + +from pandas.core.dtypes.common import is_list_like, is_scalar + +OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) + + +class GroupByMixin: + """ + Provide the groupby facilities to the mixed object. 
+ """ + + def _gotitem(self, key, ndim, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + # create a new object to prevent aliasing + if subset is None: + subset = self.obj + + # we need to make a shallow copy of ourselves + # with the same groupby + kwargs = {attr: getattr(self, attr) for attr in self._attributes} + + # Try to select from a DataFrame, falling back to a Series + try: + groupby = self._groupby[key] + except IndexError: + groupby = self._groupby + + self = type(self)(subset, groupby=groupby, parent=self, **kwargs) + self._reset_cache() + if subset.ndim == 2: + if is_scalar(key) and key in subset or is_list_like(key): + self._selection = key + return self + + +# special case to prevent duplicate plots when catching exceptions when +# forwarding methods from NDFrames +plotting_methods = frozenset(["plot", "hist"]) + +common_apply_whitelist = ( + frozenset( + [ + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "corr", + "cov", + "diff", + ] + ) + | plotting_methods +) + +series_apply_whitelist = ( + ( + common_apply_whitelist + | { + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", + } + ) +) | frozenset(["dtype", "unique"]) + +dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"]) + +# cythonized transformations or canned "agg+broadcast", which do not +# require postprocessing of the result by transform. +cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) + +cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) + +# List of aggregation/reduction functions. +# These map each group to a single numeric value +reduction_kernels = frozenset( + [ + "all", + "any", + "count", + "first", + "idxmax", + "idxmin", + "last", + "mad", + "max", + "mean", + "median", + "min", + "ngroup", + "nth", + "nunique", + "prod", + # as long as `quantile`'s signature accepts only + # a single quantile value, it's a reduction. + # GH#27526 might change that. + "quantile", + "sem", + "size", + "skew", + "std", + "sum", + "var", + ] +) + +# List of transformation functions. +# a transformation is a function that, for each group, +# produces a result that has the same shape as the group. +transformation_kernels = frozenset( + [ + "backfill", + "bfill", + "corrwith", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "diff", + "ffill", + "fillna", + "pad", + "pct_change", + "rank", + "shift", + "tshift", + ] +) + +# these are all the public methods on Grouper which don't belong +# in either of the above lists +groupby_other_methods = frozenset( + [ + "agg", + "aggregate", + "apply", + "boxplot", + # corr and cov return ngroups*ncolumns rows, so they + # are neither a transformation nor a reduction + "corr", + "cov", + "describe", + "dtypes", + "expanding", + "filter", + "get_group", + "groups", + "head", + "hist", + "indices", + "ndim", + "ngroups", + "ohlc", + "pipe", + "plot", + "resample", + "rolling", + "tail", + "take", + "transform", + ] +) +# Valid values of `name` for `groupby.transform(name)` +# NOTE: do NOT edit this directly. New additions should be inserted +# into the appropriate list above. 
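# Illustrative aside, not part of the pandas source being diffed: the practical
# difference between the two kernel lists is how the result is shaped for
# ``groupby.transform`` -- a reduction is broadcast back to the group's length,
# while a transformation already returns one value per row, e.g.:
#   >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
#   >>> df.groupby("g")["x"].transform("sum").tolist()
#   [3, 3, 3]
#   >>> df.groupby("g")["x"].transform("cumsum").tolist()
#   [1, 3, 3]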
+transform_kernel_whitelist = reduction_kernels | transformation_kernels diff --git a/venv/Lib/site-packages/pandas/core/groupby/categorical.py b/venv/Lib/site-packages/pandas/core/groupby/categorical.py new file mode 100644 index 0000000..399ed9d --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/categorical.py @@ -0,0 +1,99 @@ +import numpy as np + +from pandas.core.algorithms import unique1d +from pandas.core.arrays.categorical import ( + Categorical, + CategoricalDtype, + _recode_for_categories, +) + + +def recode_for_groupby(c: Categorical, sort: bool, observed: bool): + """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + c : Categorical + sort : boolean + The value of the sort parameter groupby was called with. + observed : boolean + Account only for the observed values + + Returns + ------- + New Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. + Categorical or None + If we are observed, return the original categorical, otherwise None + """ + + # we only care about observed values + if observed: + unique_codes = unique1d(c.codes) + + take_codes = unique_codes[unique_codes != -1] + if c.ordered: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + categories = c.categories.take(take_codes) + codes = _recode_for_categories(c.codes, c.categories, categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=c.ordered) + return Categorical(codes, dtype=dtype, fastpath=True), c + + # Already sorted according to c.categories; all is fine + if sort: + return c, None + + # sort=False should order groups in as-encountered order (GH-8868) + cat = c.unique() + + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)]) + + return c.reorder_categories(cat.categories), None + + +def recode_from_groupby(c: Categorical, sort: bool, ci): + """ + Reverse the codes_to_groupby to account for sort / observed. + + Parameters + ---------- + c : Categorical + sort : boolean + The value of the sort parameter groupby was called with. 
+ ci : CategoricalIndex + The codes / categories to recode + + Returns + ------- + CategoricalIndex + """ + + # we re-order to the original category orderings + if sort: + return ci.set_categories(c.categories) + + # we are not sorting, so add unobserved to the end + return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) diff --git a/venv/Lib/site-packages/pandas/core/groupby/generic.py b/venv/Lib/site-packages/pandas/core/groupby/generic.py new file mode 100644 index 0000000..c49677f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/generic.py @@ -0,0 +1,2080 @@ +""" +Define the SeriesGroupBy and DataFrameGroupBy +classes that hold the groupby interfaces (and some implementations). + +These are user facing as the result of the ``df.groupby(...)`` operations, +which here returns a DataFrameGroupBy object. +""" +from collections import abc, defaultdict, namedtuple +import copy +from functools import partial +from textwrap import dedent +import typing +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + FrozenSet, + Iterable, + List, + Mapping, + Sequence, + Tuple, + Type, + Union, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import Timestamp, lib +from pandas._typing import FrameOrSeries +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.cast import ( + maybe_convert_objects, + maybe_downcast_numeric, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, + is_bool, + is_dict_like, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna + +import pandas.core.algorithms as algorithms +from pandas.core.base import DataError, SpecificationError +import pandas.core.common as com +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.frame import DataFrame +from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs +from pandas.core.groupby import base +from pandas.core.groupby.groupby import ( + GroupBy, + _apply_docs, + _transform_template, + get_groupby, +) +from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same +import pandas.core.indexes.base as ibase +from pandas.core.internals import BlockManager, make_block +from pandas.core.series import Series + +from pandas.plotting import boxplot_frame_groupby + +if TYPE_CHECKING: + from pandas.core.internals import Block + + +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +# TODO(typing) the return value on this callable should be any *scalar*. +AggScalar = Union[str, Callable[..., Any]] +# TODO: validate types on ScalarResult and move to _typing +# Blocked from using by https://github.com/python/mypy/issues/1484 +# See note at _mangle_lambda_list +ScalarResult = typing.TypeVar("ScalarResult") + + +def generate_property(name: str, klass: Type[FrameOrSeries]): + """ + Create a property for a GroupBy subclass to dispatch to DataFrame/Series. 
+ + Parameters + ---------- + name : str + klass : {DataFrame, Series} + + Returns + ------- + property + """ + + def prop(self): + return self._make_wrapper(name) + + parent_method = getattr(klass, name) + prop.__doc__ = parent_method.__doc__ or "" + prop.__name__ = name + return property(prop) + + +def pin_whitelisted_properties(klass: Type[FrameOrSeries], whitelist: FrozenSet[str]): + """ + Create GroupBy member defs for DataFrame/Series names in a whitelist. + + Parameters + ---------- + klass : DataFrame or Series class + class where members are defined. + whitelist : frozenset[str] + Set of names of klass methods to be constructed + + Returns + ------- + class decorator + + Notes + ----- + Since we don't want to override methods explicitly defined in the + base class, any such name is skipped. + """ + + def pinner(cls): + for name in whitelist: + if hasattr(cls, name): + # don't override anything that was explicitly defined + # in the base class + continue + + prop = generate_property(name, klass) + setattr(cls, name, prop) + + return cls + + return pinner + + +@pin_whitelisted_properties(Series, base.series_apply_whitelist) +class SeriesGroupBy(GroupBy): + _apply_whitelist = base.series_apply_whitelist + + def _iterate_slices(self) -> Iterable[Series]: + yield self._selected_obj + + @property + def _selection_name(self): + """ + since we are a series, we by definition only have + a single name, but may be the result of a selection or + the name of our object + """ + if self._selection is None: + return self.obj.name + else: + return self._selection + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.Series.groupby.apply + pandas.Series.groupby.transform + pandas.Series.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg('min') + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum='min', + ... maximum='max', + ... ) + minimum maximum + 1 1 2 + 2 3 4 + """ + ) + + @Appender( + _apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"] + ) + ) + def apply(self, func, *args, **kwargs): + return super().apply(func, *args, **kwargs) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func=None, *args, **kwargs): + + relabeling = func is None + columns = None + no_arg_message = "Must provide 'func' or named aggregation **kwargs." + if relabeling: + columns = list(kwargs) + func = [kwargs[col] for col in columns] + kwargs = {} + if not columns: + raise TypeError(no_arg_message) + + if isinstance(func, str): + return getattr(self, func)(*args, **kwargs) + + elif isinstance(func, abc.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. 
+ func = _maybe_mangle_lambdas(func) + ret = self._aggregate_multiple_funcs(func) + if relabeling: + ret.columns = columns + else: + cyfunc = self._get_cython_func(func) + if cyfunc and not args and not kwargs: + return getattr(self, cyfunc)() + + if self.grouper.nkeys > 1: + return self._python_agg_general(func, *args, **kwargs) + + try: + return self._python_agg_general(func, *args, **kwargs) + except (ValueError, KeyError): + # TODO: KeyError is raised in _python_agg_general, + # see see test_groupby.test_basic + result = self._aggregate_named(func, *args, **kwargs) + + index = Index(sorted(result), name=self.grouper.names[0]) + ret = create_series_with_explicit_dtype( + result, index=index, dtype_if_empty=object + ) + + if not self.as_index: # pragma: no cover + print("Warning, ignoring as_index=True") + + if isinstance(ret, dict): + from pandas import concat + + ret = concat(ret, axis=1) + return ret + + agg = aggregate + + def _aggregate_multiple_funcs(self, arg): + if isinstance(arg, dict): + + # show the deprecation, but only if we + # have not shown a higher level one + # GH 15931 + if isinstance(self._selected_obj, Series): + raise SpecificationError("nested renamer is not supported") + + columns = list(arg.keys()) + arg = arg.items() + elif any(isinstance(x, (tuple, list)) for x in arg): + arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + + # indicated column order + columns = next(zip(*arg)) + else: + # list of functions / function names + columns = [] + for f in arg: + columns.append(com.get_callable_name(f) or f) + + arg = zip(columns, arg) + + results = {} + for name, func in arg: + obj = self + + # reset the cache so that we + # only include the named selection + if name in self._selected_obj: + obj = copy.copy(obj) + obj._reset_cache() + obj._selection = name + results[name] = obj.aggregate(func) + + if any(isinstance(x, DataFrame) for x in results.values()): + # let higher level handle + return results + + return DataFrame(results, columns=columns) + + def _wrap_series_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy operation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + index : pd.Index + Index to apply to the output. + + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output and columns will only contain one + element. The exception is operations that expand dimensions, like ohlc. + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result: Union[Series, DataFrame] + if len(output) > 1: + result = DataFrame(indexed_output, index=index) + result.columns = columns + else: + result = Series(indexed_output[0], index=index, name=columns[0]) + + return result + + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output will only contain one element. + The exception is operations that expand dimensions, like ohlc. 
+ """ + result = self._wrap_series_output( + output=output, index=self.grouper.result_index + ) + return self._reindex_output(result)._convert(datetime=True) + + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Series: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[base.OutputKey, Union[Series, np.ndarray]] + Dict with a sole key of 0 and a value of the result values. + + Returns + ------- + Series + + Notes + ----- + output should always contain one element. It is specified as a dict + for consistency with DataFrame methods and _wrap_aggregated_output. + """ + assert len(output) == 1 + result = self._wrap_series_output(output=output, index=self.obj.index) + + # No transformations increase the ndim of the result + assert isinstance(result, Series) + return result + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + if len(keys) == 0: + # GH #6265 + return Series([], name=self._selection_name, index=keys, dtype=np.float64) + + def _get_index() -> Index: + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(keys, names=self.grouper.names) + else: + index = Index(keys, name=self.grouper.names[0]) + return index + + if isinstance(values[0], dict): + # GH #823 #24880 + index = _get_index() + result = self._reindex_output(DataFrame(values, index=index)) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + result = result.stack(dropna=self.observed) + result.name = self._selection_name + return result + + if isinstance(values[0], Series): + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + elif isinstance(values[0], DataFrame): + # possible that Series -> DataFrame by applied function + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + else: + # GH #6265 #24880 + result = Series(data=values, index=_get_index(), name=self._selection_name) + return self._reindex_output(result) + + def _aggregate_named(self, func, *args, **kwargs): + result = {} + + for name, group in self: + group.name = name + output = func(group, *args, **kwargs) + if isinstance(output, (Series, Index, np.ndarray)): + raise ValueError("Must produce aggregated value") + result[name] = output + + return result + + @Substitution(klass="Series", selected="A.") + @Appender(_transform_template) + def transform(self, func, *args, **kwargs): + func = self._get_cython_func(func) or func + + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transform or canned "agg+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + return self._transform_fast(result, func) + + def _transform_general(self, func, *args, **kwargs): + """ + Transform with a non-str `func`. 
+ """ + klass = type(self._selected_obj) + + results = [] + for name, group in self: + object.__setattr__(group, "name", name) + res = func(group, *args, **kwargs) + + if isinstance(res, (ABCDataFrame, ABCSeries)): + res = res._values + + indexer = self._get_index(name) + ser = klass(res, indexer) + results.append(ser) + + # check for empty "results" to avoid concat ValueError + if results: + from pandas.core.reshape.concat import concat + + result = concat(results).sort_index() + else: + result = Series(dtype=np.float64) + + # we will only try to coerce the result type if + # we have a numeric dtype, as these are *always* user-defined funcs + # the cython take a different path (and casting) + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype): + result = maybe_downcast_to_dtype(result, dtype) + + result.name = self._selected_obj.name + result.index = self._selected_obj.index + return result + + def _transform_fast(self, result, func_nm: str) -> Series: + """ + fast version of transform, only applicable to + builtin/cythonizable functions + """ + ids, _, ngroup = self.grouper.group_info + cast = self._transform_should_cast(func_nm) + out = algorithms.take_1d(result._values, ids) + if cast: + out = self._try_cast(out, self.obj) + return Series(out, index=self.obj.index, name=self.obj.name) + + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a Series excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + func : function + To apply to each group. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + if False, groups that evaluate False are filled with NaNs. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) + 1 2 + 3 4 + 5 6 + Name: B, dtype: int64 + + Returns + ------- + filtered : Series + """ + if isinstance(func, str): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + # Interpret np.nan as False. + def true_and_notna(x, *args, **kwargs) -> bool: + b = wrapper(x, *args, **kwargs) + return b and notna(b) + + try: + indices = [ + self._get_index(name) for name, group in self if true_and_notna(group) + ] + except (ValueError, TypeError): + raise TypeError("the filter must return a boolean result") + + filtered = self._apply_filter(indices, dropna) + return filtered + + def nunique(self, dropna: bool = True) -> Series: + """ + Return number of unique elements in the group. + + Returns + ------- + Series + Number of unique values within each group. 
+ """ + ids, _, _ = self.grouper.group_info + + val = self.obj._internal_get_values() + + # GH 27951 + # temporary fix while we wait for NumPy bug 12629 to be fixed + val[isna(val)] = np.datetime64("NaT") + + try: + sorter = np.lexsort((val, ids)) + except TypeError: # catches object dtypes + msg = f"val.dtype must be object, got {val.dtype}" + assert val.dtype == object, msg + val, _ = algorithms.factorize(val, sort=False) + sorter = np.lexsort((val, ids)) + _isna = lambda a: a == -1 + else: + _isna = isna + + ids, val = ids[sorter], val[sorter] + + # group boundaries are where group ids change + # unique observations are where sorted values change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + inc = np.r_[1, val[1:] != val[:-1]] + + # 1st item of each group is a new unique observation + mask = _isna(val) + if dropna: + inc[idx] = 1 + inc[mask] = 0 + else: + inc[mask & np.r_[False, mask[:-1]]] = 0 + inc[idx] = 1 + + out = np.add.reduceat(inc, idx).astype("int64", copy=False) + if len(ids): + # NaN/NaT group exists if the head of ids is -1, + # so remove it from res and exclude its index from idx + if ids[0] == -1: + res = out[1:] + idx = idx[np.flatnonzero(idx)] + else: + res = out + else: + res = out[1:] + ri = self.grouper.result_index + + # we might have duplications among the bins + if len(res) != len(ri): + res, out = np.zeros(len(ri), dtype=out.dtype), res + res[ids[idx]] = out + + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) + + @Appender(Series.describe.__doc__) + def describe(self, **kwargs): + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): + + from pandas.core.reshape.tile import cut + from pandas.core.reshape.merge import _get_join_indexers + + if bins is not None and not np.iterable(bins): + # scalar bins cannot be done at top level + # in a backward compatible way + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) + + ids, _, _ = self.grouper.group_info + val = self.obj._internal_get_values() + + # groupby removes null keys from groupings + mask = ids != -1 + ids, val = ids[mask], val[mask] + + if bins is None: + lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] + else: + + # lab is a Categorical with categories an IntervalIndex + lab = cut(Series(val), bins, include_lowest=True) + lev = lab.cat.categories + lab = lev.take(lab.cat.codes) + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] + + if is_interval_dtype(lab): + # TODO: should we do this inside II? + sorter = np.lexsort((lab.left, lab.right, ids)) + else: + sorter = np.lexsort((lab, ids)) + + ids, lab = ids[sorter], lab[sorter] + + # group boundaries are where group ids change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + + # new values are where sorted labels change + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] + inc[idx] = True # group boundaries are also new values + out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts + + # num. 
of times each group should be repeated + rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) + + # multi-index components + codes = self.grouper.reconstructed_codes + codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + names = self.grouper.names + [self._selection_name] + + if dropna: + mask = codes[-1] != -1 + if mask.all(): + dropna = False + else: + out, codes = out[mask], [level_codes[mask] for level_codes in codes] + + if normalize: + out = out.astype("float") + d = np.diff(np.r_[idx, len(ids)]) + if dropna: + m = ids[lab == -1] + np.add.at(d, m, -1) + acc = rep(d)[mask] + else: + acc = rep(d) + out /= acc + + if sort and bins is None: + cat = ids[inc][mask] if dropna else ids[inc] + sorter = np.lexsort((out if ascending else -out, cat)) + out, codes[-1] = out[sorter], codes[-1][sorter] + + if bins is None: + mi = MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=False + ) + + if is_integer_dtype(out): + out = ensure_int64(out) + return Series(out, index=mi, name=self._selection_name) + + # for compat. with libgroupby.value_counts need to ensure every + # bin is present at every index level, null filled with zeros + diff = np.zeros(len(out), dtype="bool") + for level_codes in codes[:-1]: + diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] + + ncat, nbin = diff.sum(), len(levels[-1]) + + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] + + right = [diff.cumsum() - 1, codes[-1]] + + _, idx = _get_join_indexers(left, right, sort=False, how="left") + out = np.where(idx != -1, out[idx], 0) + + if sort: + sorter = np.lexsort((out if ascending else -out, left[0])) + out, left[-1] = out[sorter], left[-1][sorter] + + # build the multi-index w/ full levels + def build_codes(lev_codes: np.ndarray) -> np.ndarray: + return np.repeat(lev_codes[diff], nbin) + + codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] + codes.append(left[-1]) + + mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) + + if is_integer_dtype(out): + out = ensure_int64(out) + return Series(out, index=mi, name=self._selection_name) + + def count(self) -> Series: + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series + Count of values within each group. 
+ """ + ids, _, ngroups = self.grouper.group_info + val = self.obj._internal_get_values() + + mask = (ids != -1) & ~isna(val) + ids = ensure_platform_int(ids) + minlength = ngroups or 0 + out = np.bincount(ids[mask], minlength=minlength) + + result = Series( + out, + index=self.grouper.result_index, + name=self._selection_name, + dtype="int64", + ) + return self._reindex_output(result, fill_value=0) + + def _apply_to_column_groupbys(self, func): + """ return a pass thru """ + return func(self) + + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): + """Calculate pct_change of each value to previous entry in group""" + # TODO: Remove this conditional when #23918 is fixed + if freq: + return self.apply( + lambda x: x.pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 + filled = getattr(self, fill_method)(limit=limit) + fill_grp = filled.groupby(self.grouper.codes) + shifted = fill_grp.shift(periods=periods, freq=freq) + + return (filled / shifted) - 1 + + +@pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) +class DataFrameGroupBy(GroupBy): + + _apply_whitelist = base.dataframe_apply_whitelist + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.DataFrame.groupby.apply + pandas.DataFrame.groupby.transform + pandas.DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame({'A': [1, 1, 2, 2], + ... 'B': [1, 2, 3, 4], + ... 'C': np.random.randn(4)}) + + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby('A').agg('min') + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby('A').agg(['min', 'max']) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby('A').B.agg(['min', 'max']) + min max + A + 1 1 2 + 2 3 4 + + Different aggregations per column + + >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) + B C + min max sum + A + 1 1 2 0.590716 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + b_min c_sum + A + 1 1 -1.956929 + 2 3 -0.322183 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. 
+ """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func=None, *args, **kwargs): + + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = _normalize_keyword_aggregation(kwargs) + + kwargs = {} + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + func = _maybe_mangle_lambdas(func) + + result, how = self._aggregate(func, *args, **kwargs) + if how is None: + return result + + if result is None: + + # grouper specific aggregations + if self.grouper.nkeys > 1: + return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + result = self._aggregate_frame(func) + + else: + + # try to treat as if we are passing a list + try: + result = self._aggregate_multiple_funcs([func], _axis=self.axis) + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + else: + result.columns = Index( + result.columns.levels[0], name=self._selected_obj.columns.name + ) + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = np.arange(len(result)) + + if relabeling: + + # used reordered index of columns + result = result.iloc[:, order] + result.columns = columns + + return result._convert(datetime=True) + + agg = aggregate + + def _iterate_slices(self) -> Iterable[Series]: + obj = self._selected_obj + if self.axis == 1: + obj = obj.T + + if isinstance(obj, Series) and obj.name not in self.exclusions: + # Occurs when doing DataFrameGroupBy(...)["X"] + yield obj + else: + for label, values in obj.items(): + if label in self.exclusions: + continue + + yield values + + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ) -> DataFrame: + agg_blocks, agg_items = self._cython_agg_blocks( + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) + return self._wrap_agged_blocks(agg_blocks, items=agg_items) + + def _cython_agg_blocks( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ) -> "Tuple[List[Block], Index]": + # TODO: the actual managing of mgr_locs is a PITA + # here, it should happen via BlockManager.combine + + data: BlockManager = self._get_data_to_aggregate() + + if numeric_only: + data = data.get_numeric_data(copy=False) + + agg_blocks: List[Block] = [] + new_items: List[np.ndarray] = [] + deleted_items: List[np.ndarray] = [] + no_result = object() + for block in data.blocks: + # Avoid inheriting result from earlier in the loop + result = no_result + locs = block.mgr_locs.as_array + try: + result, _ = self.grouper.aggregate( + block.values, how, axis=1, min_count=min_count + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + + if alt is None: + # we cannot perform the operation + # in an alternate way, exclude 
the block + assert how == "ohlc" + deleted_items.append(locs) + continue + + # call our grouper again with only this block + obj = self.obj[data.items[locs]] + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + + s = get_groupby(obj, self.grouper) + try: + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + except TypeError: + # we may have an exception in trying to aggregate + # continue and exclude the block + deleted_items.append(locs) + continue + else: + result = cast(DataFrame, result) + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + + finally: + assert not isinstance(result, DataFrame) + + if result is not no_result: + # see if we can cast the block back to the original dtype + result = maybe_downcast_numeric(result, block.dtype) + + if block.is_extension and isinstance(result, np.ndarray): + # e.g. block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype + ) + except ValueError: + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) + + agg_block: Block = block.make_block(result) + + new_items.append(locs) + agg_blocks.append(agg_block) + + if not agg_blocks: + raise DataError("No numeric types to aggregate") + + # reset the locs in the blocks to correspond to our + # current ordering + indexer = np.concatenate(new_items) + agg_items = data.items.take(np.sort(indexer)) + + if deleted_items: + + # we need to adjust the indexer to account for the + # items we have removed + # really should be done in internals :< + + deleted = np.concatenate(deleted_items) + ai = np.arange(len(data)) + mask = np.zeros(len(data)) + mask[deleted] = 1 + indexer = (ai - mask.cumsum())[indexer] + + offset = 0 + for blk in agg_blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] + offset += loc + + return agg_blocks, agg_items + + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: + if self.grouper.nkeys != 1: + raise AssertionError("Number of keys must be 1") + + axis = self.axis + obj = self._obj_with_exclusions + + result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} + if axis != obj._info_axis_number: + for name, data in self: + fres = func(data, *args, **kwargs) + result[name] = fres + else: + for name in self.indices: + data = self.get_group(name, obj=obj) + fres = func(data, *args, **kwargs) + result[name] = fres + + return self._wrap_frame_output(result, obj) + + def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: + # only for axis==0 + + obj = self._obj_with_exclusions + result: Dict[Union[int, str], NDFrame] = {} + cannot_agg = [] + for item in obj: + data = obj[item] + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + + cast = self._transform_should_cast(func) + try: + result[item] = colg.aggregate(func, *args, **kwargs) + + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named, handle at higher level + # see test_apply_with_mutated_index + raise + # otherwise we get here from an AttributeError in _make_wrapper + cannot_agg.append(item) + continue + + 
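+                # Illustrative sketch (hypothetical frame): columns whose aggregation
+                # raises end up in `cannot_agg` and are dropped from the result,
+                # mirroring how the block-wise path above excludes blocks it cannot
+                # aggregate, e.g.
+                # >>> df = pd.DataFrame({"A": [1, 1, 2], "B": [1.0, 2.0, 3.0], "C": list("xyz")})
+                # >>> df.groupby("A").mean()   # "C" is dropped; only "B" is aggregated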
else: + if cast: + result[item] = self._try_cast(result[item], data) + + result_columns = obj.columns + if cannot_agg: + result_columns = result_columns.drop(cannot_agg) + + return DataFrame(result, columns=result_columns) + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + if len(keys) == 0: + return DataFrame(index=keys) + + key_names = self.grouper.names + + # GH12824. + def first_not_none(values): + try: + return next(com.not_none(*values)) + except StopIteration: + return None + + v = first_not_none(values) + + if v is None: + # GH9684. If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. + return DataFrame() + elif isinstance(v, DataFrame): + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + elif self.grouper.groupings is not None: + if len(self.grouper.groupings) > 1: + key_index = self.grouper.result_index + + else: + ping = self.grouper.groupings[0] + if len(keys) == ping.ngroups: + key_index = ping.group_index + key_index.name = key_names[0] + + key_lookup = Index(keys) + indexer = key_lookup.get_indexer(key_index) + + # reorder the values + values = [values[i] for i in indexer] + else: + + key_index = Index(keys, name=key_names[0]) + + # don't use the key indexer + if not self.as_index: + key_index = None + + # make Nones an empty object + v = first_not_none(values) + if v is None: + return DataFrame() + elif isinstance(v, NDFrame): + + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = v._construct_axes_dict() + if v._constructor is Series: + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) + else: + backup = v._constructor(**kwargs) + + values = [x if (x is not None) else backup for x in values] + + v = values[0] + + if isinstance(v, (np.ndarray, Index, Series)): + if isinstance(v, Series): + applied_index = self._selected_obj._get_axis(self.axis) + all_indexed_same = all_indexes_same([x.index for x in values]) + singular_series = len(values) == 1 and applied_index.nlevels == 1 + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.core.reshape.concat import concat + + return concat(values) + + if not all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) + + if self.axis == 0 and isinstance(v, ABCSeries): + # GH6124 if the list of Series have a consistent name, + # then propagate that name to the result. + index = v.index.copy() + if index.name is None: + # Only propagate the series name to the result + # if all series have a consistent name. If the + # series do not have a consistent name, do + # nothing. 
+ names = {v.name for v in values} + if len(names) == 1: + index.name = list(names)[0] + + # normally use vstack as its faster than concat + # and if we have mi-columns + if ( + isinstance(v.index, MultiIndex) + or key_index is None + or isinstance(key_index, MultiIndex) + ): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values, index=key_index, columns=index + ) + else: + # GH5788 instead of stacking; concat gets the + # dtypes correct + from pandas.core.reshape.concat import concat + + result = concat( + values, + keys=key_index, + names=key_index.names, + axis=self.axis, + ).unstack() + result.columns = index + elif isinstance(v, ABCSeries): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values.T, index=v.index, columns=key_index + ) + else: + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? we used to do this + # after raising AttributeError above + return Series(values, index=key_index, name=self._selection_name) + + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + so = self._selected_obj + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): + result = _recast_datetimelike_result(result) + else: + result = result._convert(datetime=True) + + return self._reindex_output(result) + + # values are not series or array-like but scalars + else: + # only coerce dates if we find at least 1 datetime + should_coerce = any(isinstance(x, Timestamp) for x in values) + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + return Series(values, index=key_index)._convert( + datetime=True, coerce=should_coerce + ) + + else: + # Handle cases like BinGrouper + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + + def _transform_general(self, func, *args, **kwargs): + from pandas.core.reshape.concat import concat + + applied = [] + obj = self._obj_with_exclusions + gen = self.grouper.get_iterator(obj, axis=self.axis) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + path = None + for name, group in gen: + object.__setattr__(group, "name", name) + + if path is None: + # Try slow path and fast path. 
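+                # Illustrative usage sketch (hypothetical data) of the general
+                # transform path taken for user-defined callables: the function is
+                # applied per group and the pieces are re-assembled to align with
+                # the original index, e.g.
+                # >>> df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1.0, 2.0, 3.0]})
+                # >>> df.groupby("A").transform(lambda g: g - g.mean())
+                # returns a frame indexed like df, with "B" demeaned within each group.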
+ try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except ValueError: + msg = "transform must return a scalar value for each group" + raise ValueError(msg) + else: + res = path(group) + + if isinstance(res, Series): + + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if not np.prod(group.shape): + continue + elif res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index + else: + r = DataFrame( + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) + + applied.append(r) + else: + applied.append(res) + + concat_index = obj.columns if self.axis == 0 else obj.index + other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 + concatenated = concat(applied, axis=self.axis, verify_integrity=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) + return self._set_result_index_ordered(concatenated) + + @Substitution(klass="DataFrame", selected="") + @Appender(_transform_template) + def transform(self, func, *args, **kwargs): + + # optimized transforms + func = self._get_cython_func(func) or func + + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transformation or canned "reduction+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + + # a reduction transform + if not isinstance(result, DataFrame): + return self._transform_general(func, *args, **kwargs) + + obj = self._obj_with_exclusions + + # nuisance columns + if not result.columns.equals(obj.columns): + return self._transform_general(func, *args, **kwargs) + + return self._transform_fast(result, func) + + def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: + """ + Fast transform path for aggregations + """ + # if there were groups with no observations (Categorical only?) 
+ # try casting data to original dtype + cast = self._transform_should_cast(func_nm) + + obj = self._obj_with_exclusions + + # for each col, reshape to to size of original frame + # by take operation + ids, _, ngroup = self.grouper.group_info + output = [] + for i, _ in enumerate(result.columns): + res = algorithms.take_1d(result.iloc[:, i].values, ids) + # TODO: we have no test cases that get here with EA dtypes; + # try_cast may not be needed if EAs never get here + if cast: + res = self._try_cast(res, obj.iloc[:, i]) + output.append(res) + + return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) + + def _define_paths(self, func, *args, **kwargs): + if isinstance(func, str): + fast_path = lambda group: getattr(group, func)(*args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis + ) + else: + fast_path = lambda group: func(group, *args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: func(x, *args, **kwargs), axis=self.axis + ) + return fast_path, slow_path + + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): + path = slow_path + res = slow_path(group) + + # if we make it here, test if we can use the fast path + try: + res_fast = fast_path(group) + except AssertionError: + raise + except Exception: + # GH#29631 For user-defined function, we cant predict what may be + # raised; see test_transform.test_transform_fastpath_raises + return path, res + + # verify fast path does not change columns (and names), otherwise + # its results cannot be joined with those of the slow path + if not isinstance(res_fast, DataFrame): + return path, res + + if not res_fast.columns.equals(group.columns): + return path, res + + if res_fast.equals(res): + path = fast_path + + return path, res + + def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: + # iterate through columns + output = {} + inds = [] + for i, col in enumerate(obj): + try: + output[col] = self[col].transform(wrapper) + except TypeError: + # e.g. trying to call nanmean with string values + pass + else: + inds.append(i) + + if len(output) == 0: + raise TypeError("Transform function invalid for data types") + + columns = obj.columns + if len(output) < len(obj.columns): + columns = columns.take(inds) + + return DataFrame(output, index=obj.index, columns=columns) + + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a DataFrame excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + f : function + Function to apply to each subframe. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + If False, groups that evaluate False are filled with NaNs. + + Returns + ------- + filtered : DataFrame + + Notes + ----- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> grouped.filter(lambda x: x['B'].mean() > 3.) 
+ A B C + 1 bar 2 5.0 + 3 bar 4 1.0 + 5 bar 6 9.0 + """ + + indices = [] + + obj = self._selected_obj + gen = self.grouper.get_iterator(obj, axis=self.axis) + + for name, group in gen: + object.__setattr__(group, "name", name) + + res = func(group, *args, **kwargs) + + try: + res = res.squeeze() + except AttributeError: # allow e.g., scalars and frames to pass + pass + + # interpret the result of the filter + if is_bool(res) or (is_scalar(res) and isna(res)): + if res and notna(res): + indices.append(self._get_index(name)) + else: + # non scalars aren't allowed + raise TypeError( + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" + ) + + return self._apply_filter(indices, dropna) + + def __getitem__(self, key): + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise warning + warnings.warn( + "Indexing with multiple keys (implicitly converted to a tuple " + "of keys) will be deprecated, use a list instead.", + FutureWarning, + stacklevel=2, + ) + return super().__getitem__(key) + + def _gotitem(self, key, ndim: int, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + + if ndim == 2: + if subset is None: + subset = self.obj + return DataFrameGroupBy( + subset, + self.grouper, + selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index, + observed=self.observed, + ) + elif ndim == 1: + if subset is None: + subset = self.obj[key] + return SeriesGroupBy( + subset, selection=key, grouper=self.grouper, observed=self.observed + ) + + raise AssertionError("invalid ndim for _gotitem") + + def _wrap_frame_output(self, result, obj) -> DataFrame: + result_index = self.grouper.levels[0] + + if self.axis == 0: + return DataFrame(result, index=obj.columns, columns=result_index).T + else: + return DataFrame(result, index=obj.index, columns=result_index) + + def _get_data_to_aggregate(self) -> BlockManager: + obj = self._obj_with_exclusions + if self.axis == 1: + return obj.T._data + else: + return obj._data + + def _insert_inaxis_grouper_inplace(self, result): + # zip in reverse so we can always insert at loc 0 + izip = zip( + *map( + reversed, + ( + self.grouper.names, + self.grouper.get_group_levels(), + [grp.in_axis for grp in self.grouper.groupings], + ), + ) + ) + + for name, lev, in_axis in izip: + if in_axis: + result.insert(0, name, lev) + + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy aggregations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. 
+ + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result = result._consolidate() + else: + index = self.grouper.result_index + result.index = index + + if self.axis == 1: + result = result.T + + return self._reindex_output(result)._convert(datetime=True) + + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy transformations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns + result.index = self.obj.index + + return result + + def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + if not self.as_index: + index = np.arange(blocks[0].values.shape[-1]) + mgr = BlockManager(blocks, axes=[items, index]) + result = DataFrame(mgr) + + self._insert_inaxis_grouper_inplace(result) + result = result._consolidate() + else: + index = self.grouper.result_index + mgr = BlockManager(blocks, axes=[items, index]) + result = DataFrame(mgr) + + if self.axis == 1: + result = result.T + + return self._reindex_output(result)._convert(datetime=True) + + def _iterate_column_groupbys(self): + for i, colname in enumerate(self._selected_obj.columns): + yield colname, SeriesGroupBy( + self._selected_obj.iloc[:, i], + selection=colname, + grouper=self.grouper, + exclusions=self.exclusions, + ) + + def _apply_to_column_groupbys(self, func): + from pandas.core.reshape.concat import concat + + return concat( + (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), + keys=self._selected_obj.columns, + axis=1, + ) + + def count(self): + """ + Compute count of group, excluding missing values. + + Returns + ------- + DataFrame + Count of values within each group. + """ + data = self._get_data_to_aggregate() + ids, _, ngroups = self.grouper.group_info + mask = ids != -1 + + vals = ( + (mask & ~_isna_ndarraylike(np.atleast_2d(blk.get_values()))) + for blk in data.blocks + ) + locs = (blk.mgr_locs for blk in data.blocks) + + counted = ( + lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals + ) + blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] + + return self._wrap_agged_blocks(blocks, items=data.items) + + def nunique(self, dropna: bool = True): + """ + Return DataFrame with number of distinct observations per group for + each column. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + nunique: DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 
'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + id value1 value2 + id + egg 1 1 1 + ham 1 1 2 + spam 1 2 1 + + Check for rows with the same id but conflicting values: + + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + """ + + obj = self._selected_obj + + def groupby_series(obj, col=None): + return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( + dropna=dropna + ) + + if isinstance(obj, Series): + results = groupby_series(obj) + else: + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions + from pandas.core.reshape.concat import concat + + axis_number = obj._get_axis_number(self.axis) + other_axis = int(not axis_number) + if axis_number == 0: + iter_func = obj.items + else: + iter_func = obj.iterrows + + results = [groupby_series(content, label) for label, content in iter_func()] + results = concat(results, axis=1) + + if axis_number == 1: + results = results.T + + results._get_axis(other_axis).names = obj._get_axis(other_axis).names + + if not self.as_index: + results.index = ibase.default_index(len(results)) + return results + + boxplot = boxplot_frame_groupby + + +def _is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) + + +def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs + to the old Dict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + col_idx_order : List[int] + List of columns indices. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + ({'input': ['sum']}, ('output',), [('input', 'sum')]) + """ + # Normalize the aggregation functions as Mapping[column, List[func]], + # process normally, then fixup the names. 
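+    # Illustrative sketch (hypothetical frame) of the named-aggregation spec this
+    # helper normalizes: keyword names become output labels and each value is a
+    # (column, aggfunc) pair, e.g.
+    # >>> df.groupby("A").agg(b_min=("B", "min"), c_sum=("C", "sum"))
+    # is reshaped here into {"B": ["min"], "C": ["sum"]} plus the output labels
+    # ("b_min", "c_sum") and the column order used to rearrange the result.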
+ # TODO: aggspec type: typing.Dict[str, List[AggScalar]] + # May be hitting https://github.com/python/mypy/issues/5958 + # saying it doesn't have an attribute __name__ + aggspec = defaultdict(list) + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + aggspec[column].append(aggfunc) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] + + +# TODO: Can't use, because mypy doesn't like us setting __name__ +# error: "partial[Any]" has no attribute "__name__" +# the type is: +# typing.Sequence[Callable[..., ScalarResult]] +# -> typing.Sequence[Callable[..., ScalarResult]]: + + +def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: + """ + Possibly mangle a list of aggfuncs. + + Parameters + ---------- + aggfuncs : Sequence + + Returns + ------- + mangled: list-like + A new AggSpec sequence, where lambdas have been converted + to have unique names. + + Notes + ----- + If just one aggfunc is passed, the name will not be mangled. + """ + if len(aggfuncs) <= 1: + # don't mangle for .agg([lambda x: .]) + return aggfuncs + i = 0 + mangled_aggfuncs = [] + for aggfunc in aggfuncs: + if com.get_callable_name(aggfunc) == "": + aggfunc = partial(aggfunc) + aggfunc.__name__ = f"" + i += 1 + mangled_aggfuncs.append(aggfunc) + + return mangled_aggfuncs + + +def _maybe_mangle_lambdas(agg_spec: Any) -> Any: + """ + Make new lambdas with unique names. + + Parameters + ---------- + agg_spec : Any + An argument to GroupBy.agg. + Non-dict-like `agg_spec` are pass through as is. + For dict-like `agg_spec` a new spec is returned + with name-mangled lambdas. + + Returns + ------- + mangled : Any + Same type as the input. 
+ + Examples + -------- + >>> _maybe_mangle_lambdas('sum') + 'sum' + + >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP + [, + .f(*args, **kwargs)>] + """ + is_dict = is_dict_like(agg_spec) + if not (is_dict or is_list_like(agg_spec)): + return agg_spec + mangled_aggspec = type(agg_spec)() # dict or OrderdDict + + if is_dict: + for key, aggfuncs in agg_spec.items(): + if is_list_like(aggfuncs) and not is_dict_like(aggfuncs): + mangled_aggfuncs = _managle_lambda_list(aggfuncs) + else: + mangled_aggfuncs = aggfuncs + + mangled_aggspec[key] = mangled_aggfuncs + else: + mangled_aggspec = _managle_lambda_list(agg_spec) + + return mangled_aggspec + + +def _recast_datetimelike_result(result: DataFrame) -> DataFrame: + """ + If we have date/time like in the original, then coerce dates + as we are stacking can easily have object dtypes here. + + Parameters + ---------- + result : DataFrame + + Returns + ------- + DataFrame + + Notes + ----- + - Assumes Groupby._selected_obj has ndim==2 and at least one + datetimelike column + """ + result = result.copy() + + obj_cols = [ + idx + for idx in range(len(result.columns)) + if is_object_dtype(result.dtypes.iloc[idx]) + ] + + # See GH#26285 + for n in obj_cols: + converted = maybe_convert_objects( + result.iloc[:, n].values, convert_numeric=False + ) + + result.iloc[:, n] = converted + return result diff --git a/venv/Lib/site-packages/pandas/core/groupby/groupby.py b/venv/Lib/site-packages/pandas/core/groupby/groupby.py new file mode 100644 index 0000000..a1e5692 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/groupby.py @@ -0,0 +1,2575 @@ +""" +Provide the groupby split-apply-combine paradigm. Define the GroupBy +class providing the base-class of operations. + +The SeriesGroupBy and DataFrameGroupBy sub-class +(defined in pandas.core.groupby.generic) +expose these user-facing objects to provide specific functionality. 
+""" + +from contextlib import contextmanager +import datetime +from functools import partial, wraps +import inspect +import re +import types +from typing import ( + Callable, + Dict, + FrozenSet, + Hashable, + Iterable, + List, + Mapping, + Optional, + Tuple, + Type, + Union, +) + +import numpy as np + +from pandas._config.config import option_context + +from pandas._libs import Timestamp +import pandas._libs.groupby as libgroupby +from pandas._typing import FrameOrSeries, Scalar +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ( + ensure_float, + is_datetime64_dtype, + is_extension_array_dtype, + is_integer_dtype, + is_numeric_dtype, + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import nanops +import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea +from pandas.core.base import DataError, PandasObject, SelectionMixin +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.groupby import base, ops +from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex +from pandas.core.series import Series +from pandas.core.sorting import get_group_index_sorter + +_common_see_also = """ + See Also + -------- + Series.%(name)s + DataFrame.%(name)s +""" + +_apply_docs = dict( + template=""" + Apply function `func` group-wise and combine the results together. + + The function passed to `apply` must take a {input} as its first + argument and return a DataFrame, Series or scalar. `apply` will + then take care of combining the results back together into a single + dataframe or series. `apply` is therefore a highly flexible + grouping method. + + While `apply` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like `agg` or `transform`. Pandas offers a wide range of method that will + be much faster than using `apply` for their specific purposes, so try to + use them before reaching for `apply`. + + Parameters + ---------- + func : callable + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to `func`. + + Returns + ------- + applied : Series or DataFrame + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + """, + dataframe_examples=""" + >>> df = pd.DataFrame({'A': 'a a b'.split(), + 'B': [1,2,3], + 'C': [4,6, 5]}) + >>> g = df.groupby('A') + + Notice that ``g`` has two groups, ``a`` and ``b``. + Calling `apply` in various ways, we can get different grouping results: + + Example 1: below the function passed to `apply` takes a DataFrame as + its argument and returns a DataFrame. 
`apply` combines the result for + each group together into a new DataFrame: + + >>> g[['B', 'C']].apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + Example 2: The function passed to `apply` takes a DataFrame as + its argument and returns a Series. `apply` combines the result for + each group together into a new DataFrame: + + >>> g[['B', 'C']].apply(lambda x: x.max() - x.min()) + B C + A + a 1 2 + b 0 0 + + Example 3: The function passed to `apply` takes a DataFrame as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.C.max() - x.B.min()) + A + a 5 + b 2 + dtype: int64 + """, + series_examples=""" + >>> s = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g = s.groupby(s.index) + + From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. + Calling `apply` in various ways, we can get different grouping results: + + Example 1: The function passed to `apply` takes a Series as + its argument and returns a Series. `apply` combines the result for + each group together into a new Series: + + >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2) + 0 0.0 + 1 0.5 + 2 4.0 + dtype: float64 + + Example 2: The function passed to `apply` takes a Series as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + + Notes + ----- + In the current implementation `apply` calls `func` twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if `func` has + side-effects, as they will take effect twice for the first + group. + + Examples + -------- + {examples} + """, +) + +_pipe_template = """ +Apply a function `func` with arguments to this %(klass)s object and return +the function's result. + +%(versionadded)s + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy or Resampler objects. +Instead of writing + +>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) + +You can write + +>>> (df.groupby('group') +... .pipe(f) +... .pipe(g, arg1=a) +... .pipe(h, arg2=b, arg3=c)) + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, string) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +args : iterable, optional + Positional arguments passed into `func`. +kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +object : the return type of `func`. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. 
+ +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + +_transform_template = """ +Call function producing a like-indexed %(klass)s on each group and +return a %(klass)s having the same indexes as the original object +filled with the transformed values + +Parameters +---------- +f : function + Function to apply to each group + +Returns +------- +%(klass)s + +See Also +-------- +aggregate, transform + +Notes +----- +Each group is endowed the attribute 'name' in case you need to know +which group you are working on. + +The current implementation imposes three requirements on f: + +* f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. +* if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. +* f must not mutate groups. Mutation is not supported and may + produce unexpected results. + +Examples +-------- + +# Same shape +>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', +... 'foo', 'bar'], +... 'B' : ['one', 'one', 'two', 'three', +... 'two', 'two'], +... 'C' : [1, 5, 5, 2, 5, 5], +... 'D' : [2.0, 5., 8., 1., 2., 9.]}) +>>> grouped = df.groupby('A') +>>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + C D +0 -1.154701 -0.577350 +1 0.577350 0.000000 +2 0.577350 1.154701 +3 -1.154701 -1.000000 +4 0.577350 -0.577350 +5 0.577350 1.000000 + +# Broadcastable +>>> grouped.transform(lambda x: x.max() - x.min()) + C D +0 4 6.0 +1 3 8.0 +2 4 6.0 +3 3 8.0 +4 4 6.0 +5 3 8.0 +""" + + +class GroupByPlot(PandasObject): + """ + Class implementing the .plot attribute for groupby objects. + """ + + def __init__(self, groupby): + self._groupby = groupby + + def __call__(self, *args, **kwargs): + def f(self): + return self.plot(*args, **kwargs) + + f.__name__ = "plot" + return self._groupby.apply(f) + + def __getattr__(self, name: str): + def attr(*args, **kwargs): + def f(self): + return getattr(self.plot, name)(*args, **kwargs) + + return self._groupby.apply(f) + + return attr + + +@contextmanager +def _group_selection_context(groupby): + """ + Set / reset the _group_selection_context. 
+ """ + groupby._set_group_selection() + yield groupby + groupby._reset_group_selection() + + +_KeysArgType = Union[ + Hashable, + List[Hashable], + Callable[[Hashable], Hashable], + List[Callable[[Hashable], Hashable]], + Mapping[Hashable, Hashable], +] + + +class _GroupBy(PandasObject, SelectionMixin): + _group_selection = None + _apply_whitelist: FrozenSet[str] = frozenset() + + def __init__( + self, + obj: NDFrame, + keys: Optional[_KeysArgType] = None, + axis: int = 0, + level=None, + grouper: "Optional[ops.BaseGrouper]" = None, + exclusions=None, + selection=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, + ): + + self._selection = selection + + assert isinstance(obj, NDFrame), type(obj) + obj._consolidate_inplace() + + self.level = level + + if not as_index: + if not isinstance(obj, DataFrame): + raise TypeError("as_index=False only valid with DataFrame") + if axis != 0: + raise ValueError("as_index=False only valid for axis=0") + + self.as_index = as_index + self.keys = keys + self.sort = sort + self.group_keys = group_keys + self.squeeze = squeeze + self.observed = observed + self.mutated = mutated + + if grouper is None: + from pandas.core.groupby.grouper import get_grouper + + grouper, exclusions, obj = get_grouper( + obj, + keys, + axis=axis, + level=level, + sort=sort, + observed=observed, + mutated=self.mutated, + ) + + self.obj = obj + self.axis = obj._get_axis_number(axis) + self.grouper = grouper + self.exclusions = set(exclusions) if exclusions else set() + + def __len__(self) -> int: + return len(self.groups) + + def __repr__(self) -> str: + # TODO: Better repr for GroupBy object + return object.__repr__(self) + + def _assure_grouper(self): + """ + We create the grouper on instantiation sub-classes may have a + different policy. + """ + pass + + @property + def groups(self): + """ + Dict {group name -> group labels}. + """ + self._assure_grouper() + return self.grouper.groups + + @property + def ngroups(self): + self._assure_grouper() + return self.grouper.ngroups + + @property + def indices(self): + """ + Dict {group name -> group indices}. + """ + self._assure_grouper() + return self.grouper.indices + + def _get_indices(self, names): + """ + Safe get multiple indices, translate keys for + datelike to underlying repr. 
+ """ + + def get_converter(s): + # possibly convert to the actual key types + # in the indices, could be a Timestamp or a np.datetime64 + if isinstance(s, datetime.datetime): + return lambda key: Timestamp(key) + elif isinstance(s, np.datetime64): + return lambda key: Timestamp(key).asm8 + else: + return lambda key: key + + if len(names) == 0: + return [] + + if len(self.indices) > 0: + index_sample = next(iter(self.indices)) + else: + index_sample = None # Dummy sample + + name_sample = names[0] + if isinstance(index_sample, tuple): + if not isinstance(name_sample, tuple): + msg = "must supply a tuple to get_group with multiple grouping keys" + raise ValueError(msg) + if not len(name_sample) == len(index_sample): + try: + # If the original grouper was a tuple + return [self.indices[name] for name in names] + except KeyError: + # turns out it wasn't a tuple + msg = ( + "must supply a same-length tuple to get_group " + "with multiple grouping keys" + ) + raise ValueError(msg) + + converters = [get_converter(s) for s in index_sample] + names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) + + else: + converter = get_converter(index_sample) + names = (converter(name) for name in names) + + return [self.indices.get(name, []) for name in names] + + def _get_index(self, name): + """ + Safe get index, translate keys for datelike to underlying repr. + """ + return self._get_indices([name])[0] + + @cache_readonly + def _selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy + + if self._selection is None or isinstance(self.obj, Series): + if self._group_selection is not None: + return self.obj[self._group_selection] + return self.obj + else: + return self.obj[self._selection] + + def _reset_group_selection(self): + """ + Clear group based selection. + + Used for methods needing to return info on each group regardless of + whether a group selection was previously set. + """ + if self._group_selection is not None: + # GH12839 clear cached selection too when changing group selection + self._group_selection = None + self._reset_cache("_selected_obj") + + def _set_group_selection(self): + """ + Create group based selection. + + Used when selection is not passed directly but instead via a grouper. 
+ + NOTE: this should be paired with a call to _reset_group_selection + """ + grp = self.grouper + if not ( + self.as_index + and getattr(grp, "groupings", None) is not None + and self.obj.ndim > 1 + and self._group_selection is None + ): + return + + ax = self.obj._info_axis + groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] + + if len(groupers): + # GH12839 clear selected obj cache when group selection changes + self._group_selection = ax.difference(Index(groupers), sort=False).tolist() + self._reset_cache("_selected_obj") + + def _set_result_index_ordered(self, result): + # set the result index on the passed values object and + # return the new object, xref 8046 + + # the values/counts are repeated according to the group index + # shortcut if we have an already ordered grouper + if not self.grouper.is_monotonic: + index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) + result.set_axis(index, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) + + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result + + def _dir_additions(self): + return self.obj._dir_additions() | self._apply_whitelist + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + @Substitution( + klass="GroupBy", + versionadded=".. versionadded:: 0.21.0", + examples="""\ +>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) +>>> df + A B +0 a 1 +1 b 2 +2 a 3 +3 b 4 + +To get the difference between each groups maximum and minimum value in one +pass, you can do + +>>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + B +A +a 2 +b 2""", + ) + @Appender(_pipe_template) + def pipe(self, func, *args, **kwargs): + return com.pipe(self, func, *args, **kwargs) + + plot = property(GroupByPlot) + + def _make_wrapper(self, name): + assert name in self._apply_whitelist + + self._set_group_selection() + + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._selected_obj, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) + + f = getattr(type(self._selected_obj), name) + sig = inspect.signature(f) + + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None: + kwargs["axis"] = self.axis + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self.apply(curried) + + try: + return self.apply(curried) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? 
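+                    # The TypeError is not the known dtype-related reduction
+                    # failure, so re-raise it unchanged rather than falling
+                    # back to the item-by-item aggregation path below.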
+ raise + + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError + raise ValueError + + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result + + wrapper.__name__ = name + return wrapper + + def get_group(self, name, obj=None): + """ + Construct DataFrame from group with provided name. + + Parameters + ---------- + name : object + The name of the group to get as a DataFrame. + obj : DataFrame, default None + The DataFrame to take the DataFrame out of. If + it is None, the object groupby was called on will + be used. + + Returns + ------- + group : same type as obj + """ + if obj is None: + obj = self._selected_obj + + inds = self._get_index(name) + if not len(inds): + raise KeyError(name) + + return obj._take_with_is_copy(inds, axis=self.axis) + + def __iter__(self): + """ + Groupby iterator. + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + return self.grouper.get_iterator(self.obj, axis=self.axis) + + @Appender( + _apply_docs["template"].format( + input="dataframe", examples=_apply_docs["dataframe_examples"] + ) + ) + def apply(self, func, *args, **kwargs): + + func = self._is_builtin_func(func) + + # this is needed so we don't try and wrap strings. If we could + # resolve functions to their callable functions prior, this + # wouldn't be needed + if args or kwargs: + if callable(func): + + @wraps(func) + def f(g): + with np.errstate(all="ignore"): + return func(g, *args, **kwargs) + + elif hasattr(nanops, "nan" + func): + # TODO: should we wrap this in to e.g. _is_builtin_func? + f = getattr(nanops, "nan" + func) + + else: + raise ValueError( + "func must be a callable if args or kwargs are supplied" + ) + else: + f = func + + # ignore SettingWithCopy here in case the user mutates + with option_context("mode.chained_assignment", None): + try: + result = self._python_apply_general(f) + except TypeError: + # gh-20949 + # try again, with .apply acting as a filtering + # operation, by excluding the grouping column + # This would normally not be triggered + # except if the udf is trying an operation that + # fails on *some* columns, e.g. a numeric operation + # on a string grouper column + + with _group_selection_context(self): + return self._python_apply_general(f) + + return result + + def _python_apply_general(self, f): + keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) + + return self._wrap_applied_output( + keys, values, not_indexed_same=mutated or self.mutated + ) + + def _iterate_slices(self) -> Iterable[Series]: + raise AbstractMethodError(self) + + def transform(self, func, *args, **kwargs): + raise AbstractMethodError(self) + + def _cumcount_array(self, ascending: bool = True): + """ + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. 
+ + Notes + ----- + this is currently implementing sort=False + (though the default is sort=True) for groupby in general + """ + ids, _, ngroups = self.grouper.group_info + sorter = get_group_index_sorter(ids, ngroups) + ids, count = ids[sorter], len(ids) + + if count == 0: + return np.empty(0, dtype=np.int64) + + run = np.r_[True, ids[:-1] != ids[1:]] + rep = np.diff(np.r_[np.nonzero(run)[0], count]) + out = (~run).cumsum() + + if ascending: + out -= np.repeat(out[run], rep) + else: + out = np.repeat(out[np.r_[run[1:], True]], rep) - out + + rev = np.empty(count, dtype=np.intp) + rev[sorter] = np.arange(count, dtype=np.intp) + return out[rev].astype(np.int64, copy=False) + + def _try_cast(self, result, obj, numeric_only: bool = False): + """ + Try to cast the result to our obj original type, + we may have roundtripped through object in the mean-time. + + If numeric_only is True, then only try to cast numerics + and not datetimelikes. + + """ + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype + + if not is_scalar(result): + if is_extension_array_dtype(dtype) and dtype.kind != "M": + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. + + if len(result) and isinstance(result[0], dtype.type): + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) + + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) + + return result + + def _transform_should_cast(self, func_nm: str) -> bool: + """ + Parameters + ---------- + func_nm: str + The name of the aggregation function being performed + + Returns + ------- + bool + Whether transform should attempt to cast the result of aggregation + """ + return (self.size().fillna(0) > 0).any() and ( + func_nm not in base.cython_cast_blacklist + ) + + def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): + output: Dict[base.OutputKey, np.ndarray] = {} + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name + is_numeric = is_numeric_dtype(obj.dtype) + if numeric_only and not is_numeric: + continue + + try: + result, _ = self.grouper.transform(obj.values, how, **kwargs) + except NotImplementedError: + continue + + if self._transform_should_cast(how): + result = self._try_cast(result, obj) + + key = base.OutputKey(label=name, position=idx) + output[key] = result + + if len(output) == 0: + raise DataError("No numeric types to aggregate") + + return self._wrap_transformed_output(output) + + def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + raise AbstractMethodError(self) + + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): + raise AbstractMethodError(self) + + def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + raise AbstractMethodError(self) + + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ): + output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {} + # Ideally we would be able to enumerate self._iterate_slices and use + # the index from enumeration as the key of output, but ohlc in particular + # returns a (n x 4) array. 
Output requires 1D ndarrays as values, so we + # need to slice that up into 1D arrays + idx = 0 + for obj in self._iterate_slices(): + name = obj.name + is_numeric = is_numeric_dtype(obj.dtype) + if numeric_only and not is_numeric: + continue + + result, agg_names = self.grouper.aggregate( + obj._values, how, min_count=min_count + ) + + if agg_names: + # e.g. ohlc + assert len(agg_names) == result.shape[1] + for result_column, result_name in zip(result.T, agg_names): + key = base.OutputKey(label=result_name, position=idx) + output[key] = self._try_cast(result_column, obj) + idx += 1 + else: + assert result.ndim == 1 + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj) + idx += 1 + + if len(output) == 0: + raise DataError("No numeric types to aggregate") + + return self._wrap_aggregated_output(output) + + def _python_agg_general(self, func, *args, **kwargs): + func = self._is_builtin_func(func) + f = lambda x: func(x, *args, **kwargs) + + # iterate through "columns" ex exclusions to populate output dict + output: Dict[base.OutputKey, np.ndarray] = {} + + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name + if self.grouper.ngroups == 0: + # agg_series below assumes ngroups > 0 + continue + + try: + # if this function is invalid for this dtype, we will ignore it. + func(obj[:0]) + except TypeError: + continue + except AssertionError: + raise + except Exception: + # Our function depends on having a non-empty argument + # See test_groupby_agg_err_catching + pass + + result, counts = self.grouper.agg_series(obj, f) + assert result is not None + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj, numeric_only=True) + + if len(output) == 0: + return self._python_apply_general(f) + + if self.grouper._filter_empty_groups: + + mask = counts.ravel() > 0 + for key, result in output.items(): + + # since we are masking, make sure that we have a float object + values = result + if is_numeric_dtype(values.dtype): + values = ensure_float(values) + + output[key] = self._try_cast(values[mask], result) + + return self._wrap_aggregated_output(output) + + def _concat_objects(self, keys, values, not_indexed_same: bool = False): + from pandas.core.reshape.concat import concat + + def reset_identity(values): + # reset the identities of the components + # of the values to prevent aliasing + for v in com.not_none(*values): + ax = v._get_axis(self.axis) + ax._reset_identity() + return values + + if not not_indexed_same: + result = concat(values, axis=self.axis) + ax = self._selected_obj._get_axis(self.axis) + + if isinstance(result, Series): + result = result.reindex(ax) + else: + + # this is a very unfortunate situation + # we have a multi-index that is NOT lexsorted + # and we have a result which is duplicated + # we can't reindex, so we resort to this + # GH 14776 + if isinstance(ax, MultiIndex) and not ax.is_unique: + indexer = algorithms.unique1d( + result.index.get_indexer_for(ax.values) + ) + result = result.take(indexer, axis=self.axis) + else: + result = result.reindex(ax, axis=self.axis) + + elif self.group_keys: + + values = reset_identity(values) + if self.as_index: + + # possible MI return case + group_keys = keys + group_levels = self.grouper.levels + group_names = self.grouper.names + + result = concat( + values, + axis=self.axis, + keys=group_keys, + levels=group_levels, + names=group_names, + sort=False, + ) + else: + + # GH5610, returns a MI, with the first level being a + # range index + keys = 
list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) + else: + values = reset_identity(values) + result = concat(values, axis=self.axis) + + if isinstance(result, Series) and self._selection_name is not None: + + result.name = self._selection_name + + return result + + def _apply_filter(self, indices, dropna): + if len(indices) == 0: + indices = np.array([], dtype="int64") + else: + indices = np.sort(np.concatenate(indices)) + if dropna: + filtered = self._selected_obj.take(indices, axis=self.axis) + else: + mask = np.empty(len(self._selected_obj.index), dtype=bool) + mask.fill(False) + mask[indices.astype(int)] = True + # mask fails to broadcast when passed to where; broadcast manually. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. + return filtered + + +class GroupBy(_GroupBy): + """ + Class for grouping and aggregating relational data. + + See aggregate, transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : str + Most users should ignore this + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarchical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: + + :: + + df.groupby(mapper).std() + + rather than + + :: + + df.groupby(mapper).aggregate(np.std) + + You can pass arguments to these "wrapped" functions, too. + + See the online documentation for full exposition on these topics and much + more + """ + + def _bool_agg(self, val_test, skipna): + """ + Shared func to call any / all Cython GroupBy implementations. + """ + + def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: + if is_object_dtype(vals): + vals = np.array([bool(x) for x in vals]) + else: + vals = vals.astype(np.bool) + + return vals.view(np.uint8), np.bool + + def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: + return result.astype(inference, copy=False) + + return self._get_cythonized_result( + "group_any_all", + aggregate=True, + cython_dtype=np.dtype(np.uint8), + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + val_test=val_test, + skipna=skipna, + ) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def any(self, skipna: bool = True): + """ + Return True if any value in the group is truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. 
+ + Returns + ------- + bool + """ + return self._bool_agg("any", skipna) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def all(self, skipna: bool = True): + """ + Return True if all values in the group are truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. + + Returns + ------- + bool + """ + return self._bool_agg("all", skipna) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def count(self): + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series or DataFrame + Count of values within each group. + """ + + # defined here for API doc + raise NotImplementedError + + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def mean(self, *args, **kwargs): + """ + Compute mean of groups, excluding missing values. + + Returns + ------- + pandas.Series or pandas.DataFrame + %(see_also)s + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5], + ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) + + Groupby one column and return the mean of the remaining columns in + each group. + + >>> df.groupby('A').mean() + B C + A + 1 3.0 1.333333 + 2 4.0 1.500000 + + Groupby two columns and return the mean of the remaining column. + + >>> df.groupby(['A', 'B']).mean() + C + A B + 1 2.0 2 + 4.0 1 + 2 3.0 1 + 5.0 2 + + Groupby one column and return the mean of only particular column in + the group. + + >>> df.groupby('A')['B'].mean() + A + 1 3.0 + 2 4.0 + Name: B, dtype: float64 + """ + nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"]) + return self._cython_agg_general( + "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs + ) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def median(self, **kwargs): + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Returns + ------- + Series or DataFrame + Median of values within each group. + """ + return self._cython_agg_general( + "median", + alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), + **kwargs, + ) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def std(self, ddof: int = 1, *args, **kwargs): + """ + Compute standard deviation of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + Returns + ------- + Series or DataFrame + Standard deviation of values within each group. + """ + + # TODO: implement at Cython level? + nv.validate_groupby_func("std", args, kwargs) + return np.sqrt(self.var(ddof=ddof, **kwargs)) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def var(self, ddof: int = 1, *args, **kwargs): + """ + Compute variance of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + Returns + ------- + Series or DataFrame + Variance of values within each group. 
+ """ + nv.validate_groupby_func("var", args, kwargs) + if ddof == 1: + return self._cython_agg_general( + "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), **kwargs + ) + else: + f = lambda x: x.var(ddof=ddof, **kwargs) + with _group_selection_context(self): + return self._python_agg_general(f) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def sem(self, ddof: int = 1): + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + """ + return self.std(ddof=ddof) / np.sqrt(self.count()) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def size(self): + """ + Compute group sizes. + + Returns + ------- + Series + Number of rows in each group. + """ + result = self.grouper.size() + + if isinstance(self.obj, Series): + result.name = self.obj.name + return self._reindex_output(result, fill_value=0) + + @classmethod + def _add_numeric_operations(cls): + """ + Add numeric operations to the GroupBy generically. + """ + + def groupby_function( + name: str, + alias: str, + npfunc, + numeric_only: bool = True, + min_count: int = -1, + ): + + _local_template = """ + Compute %(f)s of group values. + + Returns + ------- + Series or DataFrame + Computed %(f)s of values within each group. + """ + + @Substitution(name="groupby", f=name) + @Appender(_common_see_also) + @Appender(_local_template) + def f(self, **kwargs): + if "numeric_only" not in kwargs: + kwargs["numeric_only"] = numeric_only + if "min_count" not in kwargs: + kwargs["min_count"] = min_count + + self._set_group_selection() + + # try a cython aggregation if we can + try: + return self._cython_agg_general(alias, alt=npfunc, **kwargs) + except DataError: + pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + else: + raise + + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result + + set_function_name(f, name, cls) + + return f + + def first_compat(x, axis=0): + def first(x): + x = x.to_numpy() + + x = x[notna(x)] + if len(x) == 0: + return np.nan + return x[0] + + if isinstance(x, DataFrame): + return x.apply(first, axis=axis) + else: + return first(x) + + def last_compat(x, axis=0): + def last(x): + x = x.to_numpy() + x = x[notna(x)] + if len(x) == 0: + return np.nan + return x[-1] + + if isinstance(x, DataFrame): + return x.apply(last, axis=axis) + else: + return last(x) + + cls.sum = groupby_function("sum", "add", np.sum, min_count=0) + cls.prod = groupby_function("prod", "prod", np.prod, min_count=0) + cls.min = groupby_function("min", "min", np.min, numeric_only=False) + cls.max = groupby_function("max", "max", np.max, numeric_only=False) + cls.first = groupby_function("first", "first", first_compat, numeric_only=False) + cls.last = groupby_function("last", "last", last_compat, numeric_only=False) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def ohlc(self) -> DataFrame: + """ + Compute sum of values, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Returns + ------- + DataFrame + Open, high, low and close values within each group. 
+ """ + + return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) + + @Appender(DataFrame.describe.__doc__) + def describe(self, **kwargs): + with _group_selection_context(self): + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() + + def resample(self, rule, *args, **kwargs): + """ + Provide resampling when using a TimeGrouper. + + Given a grouper, the function resamples it according to a string + "string" -> "frequency". + + See the :ref:`frequency aliases ` + documentation for more details. + + Parameters + ---------- + rule : str or DateOffset + The offset string or object representing target grouper conversion. + *args, **kwargs + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + + Returns + ------- + Grouper + Return a new grouper with our resampler appended. + + See Also + -------- + Grouper : Specify a frequency to resample with when + grouping by a key. + DatetimeIndex.resample : Frequency conversion and resampling of + time series. + + Examples + -------- + >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') + >>> df = pd.DataFrame(data=4 * [range(2)], + ... index=idx, + ... columns=['a', 'b']) + >>> df.iloc[2, 0] = 5 + >>> df + a b + 2000-01-01 00:00:00 0 1 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:02:00 5 1 + 2000-01-01 00:03:00 0 1 + + Downsample the DataFrame into 3 minute bins and sum the values of + the timestamps falling into a bin. + + >>> df.groupby('a').resample('3T').sum() + a b + a + 0 2000-01-01 00:00:00 0 2 + 2000-01-01 00:03:00 0 1 + 5 2000-01-01 00:00:00 5 1 + + Upsample the series into 30 second bins. + + >>> df.groupby('a').resample('30S').sum() + a b + a + 0 2000-01-01 00:00:00 0 1 + 2000-01-01 00:00:30 0 0 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:01:30 0 0 + 2000-01-01 00:02:00 0 0 + 2000-01-01 00:02:30 0 0 + 2000-01-01 00:03:00 0 1 + 5 2000-01-01 00:02:00 5 1 + + Resample by month. Values are assigned to the month of the period. + + >>> df.groupby('a').resample('M').sum() + a b + a + 0 2000-01-31 0 3 + 5 2000-01-31 5 1 + + Downsample the series into 3 minute bins as above, but close the right + side of the bin interval. + + >>> df.groupby('a').resample('3T', closed='right').sum() + a b + a + 0 1999-12-31 23:57:00 0 1 + 2000-01-01 00:00:00 0 2 + 5 2000-01-01 00:00:00 5 1 + + Downsample the series into 3 minute bins and close the right side of + the bin interval, but label each bin using the right edge instead of + the left. + + >>> df.groupby('a').resample('3T', closed='right', label='right').sum() + a b + a + 0 2000-01-01 00:00:00 0 1 + 2000-01-01 00:03:00 0 2 + 5 2000-01-01 00:03:00 5 1 + + Add an offset of twenty seconds. + + >>> df.groupby('a').resample('3T', loffset='20s').sum() + a b + a + 0 2000-01-01 00:00:20 0 2 + 2000-01-01 00:03:20 0 1 + 5 2000-01-01 00:00:20 5 1 + """ + from pandas.core.resample import get_resampler_for_grouping + + return get_resampler_for_grouping(self, rule, *args, **kwargs) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def rolling(self, *args, **kwargs): + """ + Return a rolling grouper, providing rolling functionality per group. + """ + from pandas.core.window import RollingGroupby + + return RollingGroupby(self, *args, **kwargs) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def expanding(self, *args, **kwargs): + """ + Return an expanding grouper, providing expanding + functionality per group. 
+ """ + from pandas.core.window import ExpandingGroupby + + return ExpandingGroupby(self, *args, **kwargs) + + def _fill(self, direction, limit=None): + """ + Shared function for `pad` and `backfill` to call Cython method. + + Parameters + ---------- + direction : {'ffill', 'bfill'} + Direction passed to underlying Cython function. `bfill` will cause + values to be filled backwards. `ffill` and any other values will + default to a forward fill + limit : int, default None + Maximum number of consecutive values to fill. If `None`, this + method will convert to -1 prior to passing to Cython + + Returns + ------- + `Series` or `DataFrame` with filled values + + See Also + -------- + pad + backfill + """ + # Need int value for Cython + if limit is None: + limit = -1 + + return self._get_cythonized_result( + "group_fillna_indexer", + needs_mask=True, + cython_dtype=np.dtype(np.int64), + result_is_index=True, + direction=direction, + limit=limit, + ) + + @Substitution(name="groupby") + def pad(self, limit=None): + """ + Forward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.pad + DataFrame.pad + Series.fillna + DataFrame.fillna + """ + return self._fill("ffill", limit=limit) + + ffill = pad + + @Substitution(name="groupby") + def backfill(self, limit=None): + """ + Backward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.backfill + DataFrame.backfill + Series.fillna + DataFrame.fillna + """ + return self._fill("bfill", limit=limit) + + bfill = backfill + + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: + """ + Take the nth row from each group if n is an int, or a subset of rows + if n is a list of ints. + + If dropna, will take the nth non-null row, dropna is either + 'all' or 'any'; this is equivalent to calling dropna(how=dropna) + before the groupby. + + Parameters + ---------- + n : int or list of ints + A single nth value for the row or a list of nth values. + dropna : None or str, optional + Apply the specified dropna operation before counting which row is + the nth row. Needs to be None, 'any' or 'all'. + + Returns + ------- + Series or DataFrame + N-th value within each group. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + B + A + 1 NaN + 2 3.0 + >>> g.nth(1) + B + A + 1 2.0 + 2 5.0 + >>> g.nth(-1) + B + A + 1 4.0 + 2 5.0 + >>> g.nth([0, 1]) + B + A + 1 NaN + 1 2.0 + 2 3.0 + 2 5.0 + + Specifying `dropna` allows count ignoring ``NaN`` + + >>> g.nth(0, dropna='any') + B + A + 1 2.0 + 2 3.0 + + NaNs denote group exhausted when using dropna + + >>> g.nth(3, dropna='any') + B + A + 1 NaN + 2 NaN + + Specifying `as_index=False` in `groupby` keeps the original index. 
+ + >>> df.groupby('A', as_index=False).nth(1) + A B + 1 1 2.0 + 4 2 5.0 + """ + + valid_containers = (set, list, tuple) + if not isinstance(n, (valid_containers, int)): + raise TypeError("n needs to be an int or a list/set/tuple of ints") + + if not dropna: + + if isinstance(n, int): + nth_values = [n] + elif isinstance(n, valid_containers): + nth_values = list(set(n)) + + nth_array = np.array(nth_values, dtype=np.intp) + self._set_group_selection() + + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) + mask = mask_left | mask_right + + ids, _, _ = self.grouper.group_info + + # Drop NA values in grouping + mask = mask & (ids != -1) + + out = self._selected_obj[mask] + if not self.as_index: + return out + + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] + + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) + + out = self._reindex_output(out) + return out.sort_index() if self.sort else out + + # dropna is truthy + if isinstance(n, valid_containers): + raise ValueError("dropna option with a list of nth values is not supported") + + if dropna not in ["any", "all"]: + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError( + "For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + f"(was passed {dropna})." + ) + + # old behaviour, but with all and any support for DataFrames. + # modified in GH 7559 to have better perf + max_len = n if n >= 0 else -1 - n + dropped = self.obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + if self.keys is None and self.level is None: + + # we don't have the grouper info available + # (e.g. we have selected out + # a column that is not in the current object) + axis = self.grouper.axis + grouper = axis[axis.isin(dropped.index)] + + else: + + # create a grouper with the original parameters, but on dropped + # object + from pandas.core.groupby.grouper import get_grouper + + grouper, _, _ = get_grouper( + dropped, + key=self.keys, + axis=self.axis, + level=self.level, + sort=self.sort, + mutated=self.mutated, + ) + + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) + sizes, result = grb.size(), grb.nth(n) + mask = (sizes < max_len).values + + # set the results which don't meet the criteria + if len(result) and mask.any(): + result.loc[mask] = np.nan + + # reset/reindex to the original groups + if len(self.obj) == len(dropped) or len(result) == len( + self.grouper.result_index + ): + result.index = self.grouper.result_index + else: + result = result.reindex(self.grouper.result_index) + + return result + + def quantile(self, q=0.5, interpolation: str = "linear"): + """ + Return group values at the given quantile, a la numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value(s) between 0 and 1 providing the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Method to use when the desired quantile falls between two points. + + Returns + ------- + Series or DataFrame + Return type determined by caller of GroupBy object. + + See Also + -------- + Series.quantile : Similar method for Series. + DataFrame.quantile : Similar method for DataFrame. + numpy.percentile : NumPy method to compute qth percentile. + + Examples + -------- + >>> df = pd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... 
['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + """ + from pandas import concat + + def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: + if is_object_dtype(vals): + raise TypeError( + "'quantile' cannot be performed against 'object' dtypes!" + ) + + inference = None + if is_integer_dtype(vals): + inference = np.int64 + elif is_datetime64_dtype(vals): + inference = "datetime64[ns]" + vals = vals.astype(np.float) + + return vals, inference + + def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: + if inference: + # Check for edge case + if not ( + is_integer_dtype(inference) + and interpolation in {"linear", "midpoint"} + ): + vals = vals.astype(inference) + + return vals + + if is_scalar(q): + return self._get_cythonized_result( + "group_quantile", + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.dtype(np.float64), + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + else: + results = [ + self._get_cythonized_result( + "group_quantile", + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.dtype(np.float64), + pre_processing=pre_processor, + post_processing=post_processor, + q=qi, + interpolation=interpolation, + ) + for qi in q + ] + result = concat(results, axis=0, keys=q) + # fix levels to place quantiles on the inside + # TODO(GH-10710): Ideally, we could write this as + # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] + # but this hits https://github.com/pandas-dev/pandas/issues/10710 + # which doesn't reorder the list-like `q` on the inner level. + order = list(range(1, result.index.nlevels)) + [0] + + # temporarily saves the index names + index_names = np.array(result.index.names) + + # set index names to positions to avoid confusion + result.index.names = np.arange(len(index_names)) + + # place quantiles on the inside + result = result.reorder_levels(order) + + # restore the index names in order + result.index.names = index_names[order] + + # reorder rows to keep things sorted + indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() + return result.take(indices) + + @Substitution(name="groupby") + def ngroup(self, ascending: bool = True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Returns + ------- + Series + Unique numbers for each group. + + See Also + -------- + .cumcount : Number the rows in each group. 
+ + Examples + -------- + + >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').ngroup() + 0 0 + 1 0 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 + >>> df.groupby('A').ngroup(ascending=False) + 0 1 + 1 1 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() + 0 0 + 1 0 + 2 1 + 3 3 + 4 2 + 5 0 + dtype: int64 + """ + + with _group_selection_context(self): + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result + + @Substitution(name="groupby") + def cumcount(self, ascending: bool = True): + """ + Number each item in each group from 0 to the length of that group - 1. + + Essentially this is equivalent to + + >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + + Returns + ------- + Series + Sequence number of each element within each group. + + See Also + -------- + .ngroup : Number the groups themselves. + + Examples + -------- + + >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], + ... columns=['A']) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').cumcount() + 0 0 + 1 1 + 2 2 + 3 0 + 4 1 + 5 3 + dtype: int64 + >>> df.groupby('A').cumcount(ascending=False) + 0 3 + 1 2 + 2 1 + 3 1 + 4 0 + 5 0 + dtype: int64 + """ + + with _group_selection_context(self): + index = self._selected_obj.index + cumcounts = self._cumcount_array(ascending=ascending) + return Series(cumcounts, index) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def rank( + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, + axis: int = 0, + ): + """ + Provide the rank of values within each group. + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. + ascending : bool, default True + False for ranks by high (1) to low (N). + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. + pct : bool, default False + Compute percentage rank of data within each group. + axis : int, default 0 + The axis of the object over which to compute the rank. + + Returns + ------- + DataFrame with ranking of values within each group + """ + if na_option not in {"keep", "top", "bottom"}: + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + raise ValueError(msg) + return self._cython_transform( + "rank", + numeric_only=False, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + axis=axis, + ) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def cumprod(self, axis=0, *args, **kwargs): + """ + Cumulative product for each group. 
+ + Returns + ------- + Series or DataFrame + """ + nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) + if axis != 0: + return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) + + return self._cython_transform("cumprod", **kwargs) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def cumsum(self, axis=0, *args, **kwargs): + """ + Cumulative sum for each group. + + Returns + ------- + Series or DataFrame + """ + nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) + if axis != 0: + return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) + + return self._cython_transform("cumsum", **kwargs) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def cummin(self, axis=0, **kwargs): + """ + Cumulative min for each group. + + Returns + ------- + Series or DataFrame + """ + if axis != 0: + return self.apply(lambda x: np.minimum.accumulate(x, axis)) + + return self._cython_transform("cummin", numeric_only=False) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def cummax(self, axis=0, **kwargs): + """ + Cumulative max for each group. + + Returns + ------- + Series or DataFrame + """ + if axis != 0: + return self.apply(lambda x: np.maximum.accumulate(x, axis)) + + return self._cython_transform("cummax", numeric_only=False) + + def _get_cythonized_result( + self, + how: str, + cython_dtype: np.dtype, + aggregate: bool = False, + needs_values: bool = False, + needs_mask: bool = False, + needs_ngroups: bool = False, + result_is_index: bool = False, + pre_processing=None, + post_processing=None, + **kwargs, + ): + """ + Get result for Cythonized functions. + + Parameters + ---------- + how : str, Cythonized function name to be called + cython_dtype : np.dtype + Type of the array that will be modified by the Cython call. + aggregate : bool, default False + Whether the result should be aggregated to match the number of + groups + needs_values : bool, default False + Whether the values should be a part of the Cython call + signature + needs_mask : bool, default False + Whether boolean mask needs to be part of the Cython call + signature + needs_ngroups : bool, default False + Whether number of groups is part of the Cython call signature + result_is_index : bool, default False + Whether the result of the Cython operation is an index of + values to be retrieved, instead of the actual values themselves + pre_processing : function, default None + Function to be applied to `values` prior to passing to Cython. + Function should return a tuple where the first element is the + values to be passed to Cython and the second element is an optional + type which the values should be converted to after being returned + by the Cython operation. Raises if `needs_values` is False. + post_processing : function, default None + Function to be applied to result of Cython function. Should accept + an array of values as the first argument and type inferences as its + second argument, i.e. the signature should be + (ndarray, Type). 
+ **kwargs : dict + Extra arguments to be passed back to Cython funcs + + Returns + ------- + `Series` or `DataFrame` with filled values + """ + if result_is_index and aggregate: + raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") + if post_processing: + if not callable(pre_processing): + raise ValueError("'post_processing' must be a callable!") + if pre_processing: + if not callable(pre_processing): + raise ValueError("'pre_processing' must be a callable!") + if not needs_values: + raise ValueError( + "Cannot use 'pre_processing' without specifying 'needs_values'!" + ) + + grouper = self.grouper + + labels, _, ngroups = grouper.group_info + output: Dict[base.OutputKey, np.ndarray] = {} + base_func = getattr(libgroupby, how) + + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name + values = obj._data._values + + if aggregate: + result_sz = ngroups + else: + result_sz = len(values) + + result = np.zeros(result_sz, dtype=cython_dtype) + func = partial(base_func, result, labels) + inferences = None + + if needs_values: + vals = values + if pre_processing: + vals, inferences = pre_processing(vals) + func = partial(func, vals) + + if needs_mask: + mask = isna(values).view(np.uint8) + func = partial(func, mask) + + if needs_ngroups: + func = partial(func, ngroups) + + func(**kwargs) # Call func to modify indexer values in place + + if result_is_index: + result = algorithms.take_nd(values, result) + + if post_processing: + result = post_processing(result, inferences) + + key = base.OutputKey(label=name, position=idx) + output[key] = result + + if aggregate: + return self._wrap_aggregated_output(output) + else: + return self._wrap_transformed_output(output) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + """ + Shift each group by periods observations. + + Parameters + ---------- + periods : int, default 1 + Number of periods to shift. + freq : frequency string + axis : axis to shift, default 0 + fill_value : optional + + .. versionadded:: 0.24.0 + + Returns + ------- + Series or DataFrame + Object shifted within each group. + """ + + if freq is not None or axis != 0 or not isna(fill_value): + return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) + + return self._get_cythonized_result( + "group_shift_indexer", + cython_dtype=np.dtype(np.int64), + needs_ngroups=True, + result_is_index=True, + periods=periods, + ) + + @Substitution(name="groupby") + @Appender(_common_see_also) + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): + """ + Calculate pct_change of each value to previous entry in group. + + Returns + ------- + Series or DataFrame + Percentage changes within each group. + """ + if freq is not None or axis != 0: + return self.apply( + lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, + ) + ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 + filled = getattr(self, fill_method)(limit=limit) + fill_grp = filled.groupby(self.grouper.codes) + shifted = fill_grp.shift(periods=periods, freq=freq) + return (filled / shifted) - 1 + + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def head(self, n=5): + """ + Return first n rows of each group. 
+ + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Does not work for negative values of `n`. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], + ... columns=['A', 'B']) + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(-1) + Empty DataFrame + Columns: [A, B] + Index: [] + """ + self._reset_group_selection() + mask = self._cumcount_array() < n + return self._selected_obj[mask] + + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def tail(self, n=5): + """ + Return last n rows of each group. + + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Does not work for negative values of `n`. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], + ... columns=['A', 'B']) + >>> df.groupby('A').tail(1) + A B + 1 a 2 + 3 b 2 + >>> df.groupby('A').tail(-1) + Empty DataFrame + Columns: [A, B] + Index: [] + """ + self._reset_group_selection() + mask = self._cumcount_array(ascending=False) < n + return self._selected_obj[mask] + + def _reindex_output( + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: + """ + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. + + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output : Series or DataFrame + Object resulting from grouping and applying an operation. + fill_value : scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. + + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ + groupings = self.grouper.groupings + if groupings is None: + return output + elif len(groupings) == 1: + return output + + # if we only care about the observed values + # we are done + elif self.observed: + return output + + # reindexing only applies to a Categorical grouper + elif not any( + isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings + ): + return output + + levels_list = [ping.group_index for ping in groupings] + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names + ).sortlevel() + + if self.as_index: + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } + return output.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `output`, and then reset the in-axis grouper columns. 
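+        # Note: the as_index=False case also reaches this point; the trailing
+        # reset_index(drop=True) below restores a default integer index.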
+ + # Select in-axis groupers + in_axis_grps = ( + (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis + ) + g_nums, g_names = zip(*in_axis_grps) + + output = output.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + output = output.reset_index(level=g_nums) + + return output.reset_index(drop=True) + + +GroupBy._add_numeric_operations() + + +@Appender(GroupBy.__doc__) +def get_groupby( + obj: NDFrame, + by: Optional[_KeysArgType] = None, + axis: int = 0, + level=None, + grouper: "Optional[ops.BaseGrouper]" = None, + exclusions=None, + selection=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, +) -> GroupBy: + + klass: Type[GroupBy] + if isinstance(obj, Series): + from pandas.core.groupby.generic import SeriesGroupBy + + klass = SeriesGroupBy + elif isinstance(obj, DataFrame): + from pandas.core.groupby.generic import DataFrameGroupBy + + klass = DataFrameGroupBy + else: + raise TypeError(f"invalid type: {obj}") + + return klass( + obj=obj, + keys=by, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + mutated=mutated, + ) diff --git a/venv/Lib/site-packages/pandas/core/groupby/grouper.py b/venv/Lib/site-packages/pandas/core/groupby/grouper.py new file mode 100644 index 0000000..0b89e70 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/grouper.py @@ -0,0 +1,658 @@ +""" +Provide user facing operators for doing the split part of the +split-apply-combine paradigm. +""" + +from typing import Dict, Hashable, List, Optional, Tuple + +import numpy as np + +from pandas._typing import FrameOrSeries +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + ensure_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCSeries + +import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical, ExtensionArray +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.groupby import ops +from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby +from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex +from pandas.core.series import Series + +from pandas.io.formats.printing import pprint_thing + + +class Grouper: + """ + A Grouper allows the user to specify a groupby instruction for an object. + + This specification will select a column via the key parameter, or if the + level and/or axis parameters are given, a level of the index of the target + object. + + If `axis` and/or `level` are passed as keywords to both `Grouper` and + `groupby`, the values passed to `Grouper` take precedence. + + Parameters + ---------- + key : str, defaults to None + Groupby key, which selects the grouping column of the target. + level : name/number, defaults to None + The level for the target index. 
+ freq : str / frequency object, defaults to None + This will groupby the specified frequency if the target selection + (via key or level) is a datetime-like object. For full specification + of available frequencies, please see `here + `_. + axis : str, int, defaults to 0 + Number/name of the axis. + sort : bool, default to False + Whether to sort the resulting labels. + closed : {'left' or 'right'} + Closed end of interval. Only when `freq` parameter is passed. + label : {'left' or 'right'} + Interval boundary to use for labeling. + Only when `freq` parameter is passed. + convention : {'start', 'end', 'e', 's'} + If grouper is PeriodIndex and `freq` parameter is passed. + base : int, default 0 + Only when `freq` parameter is passed. + loffset : str, DateOffset, timedelta object + Only when `freq` parameter is passed. + + Returns + ------- + A specification for a groupby instruction + + Examples + -------- + + Syntactic sugar for ``df.groupby('A')`` + + >>> df.groupby(Grouper(key='A')) + + Specify a resample operation on the column 'date' + + >>> df.groupby(Grouper(key='date', freq='60s')) + + Specify a resample operation on the level 'date' on the columns axis + with a frequency of 60s + + >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) + """ + + _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") + + def __new__(cls, *args, **kwargs): + if kwargs.get("freq") is not None: + from pandas.core.resample import TimeGrouper + + cls = TimeGrouper + return super().__new__(cls) + + def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): + self.key = key + self.level = level + self.freq = freq + self.axis = axis + self.sort = sort + + self.grouper = None + self.obj = None + self.indexer = None + self.binner = None + self._grouper = None + + @property + def ax(self): + return self.grouper + + def _get_grouper(self, obj, validate: bool = True): + """ + Parameters + ---------- + obj : the subject object + validate : boolean, default True + if True, validate the grouper + + Returns + ------- + a tuple of binner, grouper, obj (possibly sorted) + """ + + self._set_grouper(obj) + self.grouper, _, self.obj = get_grouper( + self.obj, + [self.key], + axis=self.axis, + level=self.level, + sort=self.sort, + validate=validate, + ) + return self.binner, self.grouper, self.obj + + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): + """ + given an object and the specifications, setup the internal grouper + for this particular specification + + Parameters + ---------- + obj : Series or DataFrame + sort : bool, default False + whether the resulting grouper should be sorted + """ + assert obj is not None + + if self.key is not None and self.level is not None: + raise ValueError("The Grouper cannot specify both a key and a level!") + + # Keep self.grouper value before overriding + if self._grouper is None: + self._grouper = self.grouper + + # the key must be a valid info item + if self.key is not None: + key = self.key + # The 'on' is already defined + if getattr(self.grouper, "name", None) == key and isinstance( + obj, ABCSeries + ): + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError(f"The grouper name {key} is not found") + ax = Index(obj[key], name=key) + + else: + ax = obj._get_axis(self.axis) + if self.level is not None: + level = self.level + + # if a level is given it must be a mi level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + ax = 
Index(ax._get_level_values(level), name=ax.names[level]) + + else: + if level not in (0, ax.name): + raise ValueError(f"The level {level} is not valid") + + # possibly sort + if (self.sort or sort) and not ax.is_monotonic: + # use stable sort to support first, last, nth + indexer = self.indexer = ax.argsort(kind="mergesort") + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis) + + self.obj = obj + self.grouper = ax + return self.grouper + + @property + def groups(self): + return self.grouper.groups + + def __repr__(self) -> str: + attrs_list = ( + f"{attr_name}={repr(getattr(self, attr_name))}" + for attr_name in self._attributes + if getattr(self, attr_name) is not None + ) + attrs = ", ".join(attrs_list) + cls_name = type(self).__name__ + return f"{cls_name}({attrs})" + + +class Grouping: + """ + Holds the grouping information for a single key + + Parameters + ---------- + index : Index + grouper : + obj Union[DataFrame, Series]: + name : + level : + observed : bool, default False + If we are a Categorical, use the observed values + in_axis : if the Grouping is a column in self.obj and hence among + Groupby.exclusions list + + Returns + ------- + **Attributes**: + * indices : dict of {group -> index_list} + * codes : ndarray, group codes + * group_index : unique groups + * groups : dict of {group -> label_list} + """ + + def __init__( + self, + index: Index, + grouper=None, + obj: Optional[FrameOrSeries] = None, + name=None, + level=None, + sort: bool = True, + observed: bool = False, + in_axis: bool = False, + ): + self.name = name + self.level = level + self.grouper = _convert_grouper(index, grouper) + self.all_grouper = None + self.index = index + self.sort = sort + self.obj = obj + self.observed = observed + self.in_axis = in_axis + + # right place for this? 
+ if isinstance(grouper, (Series, Index)) and name is None: + self.name = grouper.name + + if isinstance(grouper, MultiIndex): + self.grouper = grouper.values + + # we have a single grouper which may be a myriad of things, + # some of which are dependent on the passing in level + + if level is not None: + if not isinstance(level, int): + if level not in index.names: + raise AssertionError(f"Level {level} not in index") + level = index.names.index(level) + + if self.name is None: + self.name = index.names[level] + + ( + self.grouper, + self._codes, + self._group_index, + ) = index._get_grouper_for_level(self.grouper, level) + + # a passed Grouper like, directly get the grouper in the same way + # as single grouper groupby, use the group_info to get codes + elif isinstance(self.grouper, Grouper): + # get the new grouper; we already have disambiguated + # what key/level refer to exactly, don't need to + # check again as we have by this point converted these + # to an actual value (rather than a pd.Grouper) + _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) + if self.name is None: + self.name = grouper.result_index.name + self.obj = self.grouper.obj + self.grouper = grouper._get_grouper() + + else: + if self.grouper is None and self.name is not None and self.obj is not None: + self.grouper = self.obj[self.name] + + elif isinstance(self.grouper, (list, tuple)): + self.grouper = com.asarray_tuplesafe(self.grouper) + + # a passed Categorical + elif is_categorical_dtype(self.grouper): + + self.grouper, self.all_grouper = recode_for_groupby( + self.grouper, self.sort, observed + ) + categories = self.grouper.categories + + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes + self._codes = self.grouper.codes + if observed: + codes = algorithms.unique1d(self.grouper.codes) + codes = codes[codes != -1] + if sort or self.grouper.ordered: + codes = np.sort(codes) + else: + codes = np.arange(len(categories)) + + self._group_index = CategoricalIndex( + Categorical.from_codes( + codes=codes, categories=categories, ordered=self.grouper.ordered + ), + name=self.name, + ) + + # we are done + if isinstance(self.grouper, Grouping): + self.grouper = self.grouper.grouper + + # no level passed + elif not isinstance( + self.grouper, (Series, Index, ExtensionArray, np.ndarray) + ): + if getattr(self.grouper, "ndim", 1) != 1: + t = self.name or str(type(self.grouper)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") + self.grouper = self.index.map(self.grouper) + if not ( + hasattr(self.grouper, "__len__") + and len(self.grouper) == len(self.index) + ): + grper = pprint_thing(self.grouper) + errmsg = ( + "Grouper result violates len(labels) == " + f"len(data)\nresult: {grper}" + ) + self.grouper = None # Try for sanity + raise AssertionError(errmsg) + + # if we have a date/time-like grouper, make sure that we have + # Timestamps like + if getattr(self.grouper, "dtype", None) is not None: + if is_datetime64_dtype(self.grouper): + self.grouper = self.grouper.astype("datetime64[ns]") + elif is_timedelta64_dtype(self.grouper): + + self.grouper = self.grouper.astype("timedelta64[ns]") + + def __repr__(self) -> str: + return f"Grouping({self.name})" + + def __iter__(self): + return iter(self.indices) + + _codes: Optional[np.ndarray] = None + _group_index: Optional[Index] = None + + @property + def ngroups(self) -> int: + return len(self.group_index) + + @cache_readonly + def indices(self): + # we have a list of groupers + if 
isinstance(self.grouper, ops.BaseGrouper): + return self.grouper.indices + + values = ensure_categorical(self.grouper) + return values._reverse_indexer() + + @property + def codes(self) -> np.ndarray: + if self._codes is None: + self._make_codes() + return self._codes + + @cache_readonly + def result_index(self) -> Index: + if self.all_grouper is not None: + return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + return self.group_index + + @property + def group_index(self) -> Index: + if self._group_index is None: + self._make_codes() + assert self._group_index is not None + return self._group_index + + def _make_codes(self) -> None: + if self._codes is None or self._group_index is None: + # we have a list of groupers + if isinstance(self.grouper, ops.BaseGrouper): + codes = self.grouper.codes_info + uniques = self.grouper.result_index + else: + codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + uniques = Index(uniques, name=self.name) + self._codes = codes + self._group_index = uniques + + @cache_readonly + def groups(self) -> Dict[Hashable, np.ndarray]: + return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) + + +def get_grouper( + obj: FrameOrSeries, + key=None, + axis: int = 0, + level=None, + sort: bool = True, + observed: bool = False, + mutated: bool = False, + validate: bool = True, +) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": + """ + Create and return a BaseGrouper, which is an internal + mapping of how to create the grouper indexers. + This may be composed of multiple Grouping objects, indicating + multiple groupers + + Groupers are ultimately index mappings. They can originate as: + index mappings, keys to columns, functions, or Groupers + + Groupers enable local references to axis,level,sort, while + the passed in axis, level, and sort are 'global'. + + This routine tries to figure out what the passing in references + are and then creates a Grouping for each one, combined into + a BaseGrouper. + + If observed & we have a categorical grouper, only show the observed + values. + + If validate, then check for key/level overlaps. + + """ + group_axis = obj._get_axis(axis) + + # validate that the passed single level is compatible with the passed + # axis of the object + if level is not None: + # TODO: These if-block and else-block are almost same. + # MultiIndex instance check is removable, but it seems that there are + # some processes only for non-MultiIndex in else-block, + # eg. `obj.index.name != level`. We have to consider carefully whether + # these are applicable for MultiIndex. Even if these are applicable, + # we need to check if it makes no side effect to subsequent processes + # on the outside of this condition. 
+ # (GH 17621) + if isinstance(group_axis, MultiIndex): + if is_list_like(level) and len(level) == 1: + level = level[0] + + if key is None and is_scalar(level): + # Get the level values from group_axis + key = group_axis.get_level_values(level) + level = None + + else: + # allow level to be a length-one list-like object + # (e.g., level=[0]) + # GH 13901 + if is_list_like(level): + nlevels = len(level) + if nlevels == 1: + level = level[0] + elif nlevels == 0: + raise ValueError("No group keys passed!") + else: + raise ValueError("multiple levels only valid with MultiIndex") + + if isinstance(level, str): + if obj._get_axis(axis).name != level: + raise ValueError( + f"level name {level} is not the name " + f"of the {obj._get_axis_name(axis)}" + ) + elif level > 0 or level < -1: + raise ValueError("level > 0 or level < -1 only valid with MultiIndex") + + # NOTE: `group_axis` and `group_axis.get_level_values(level)` + # are same in this section. + level = None + key = group_axis + + # a passed-in Grouper, directly convert + if isinstance(key, Grouper): + binner, grouper, obj = key._get_grouper(obj, validate=False) + if key.key is None: + return grouper, [], obj + else: + return grouper, [key.key], obj + + # already have a BaseGrouper, just return it + elif isinstance(key, ops.BaseGrouper): + return key, [], obj + + if not isinstance(key, list): + keys = [key] + match_axis_length = False + else: + keys = key + match_axis_length = len(keys) == len(group_axis) + + # what are we after, exactly? + any_callable = any(callable(g) or isinstance(g, dict) for g in keys) + any_groupers = any(isinstance(g, Grouper) for g in keys) + any_arraylike = any( + isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys + ) + + # is this an index replacement? + if ( + not any_callable + and not any_arraylike + and not any_groupers + and match_axis_length + and level is None + ): + if isinstance(obj, DataFrame): + all_in_columns_index = all( + g in obj.columns or g in obj.index.names for g in keys + ) + else: + assert isinstance(obj, Series) + all_in_columns_index = all(g in obj.index.names for g in keys) + + if not all_in_columns_index: + keys = [com.asarray_tuplesafe(keys)] + + if isinstance(level, (tuple, list)): + if key is None: + keys = [None] * len(level) + levels = level + else: + levels = [level] * len(keys) + + groupings: List[Grouping] = [] + exclusions: List[Hashable] = [] + + # if the actual grouper should be obj[key] + def is_in_axis(key) -> bool: + if not _is_label_like(key): + items = obj._data.items + try: + items.get_loc(key) + except (KeyError, TypeError): + # TypeError shows up here if we pass e.g. 
Int64Index + return False + + return True + + # if the grouper is obj[name] + def is_in_obj(gpr) -> bool: + if not hasattr(gpr, "name"): + return False + try: + return gpr is obj[gpr.name] + except (KeyError, IndexError): + return False + + for i, (gpr, level) in enumerate(zip(keys, levels)): + + if is_in_obj(gpr): # df.groupby(df['name']) + in_axis, name = True, gpr.name + exclusions.append(name) + + elif is_in_axis(gpr): # df.groupby('name') + if gpr in obj: + if validate: + obj._check_label_or_level_ambiguity(gpr, axis=axis) + in_axis, name, gpr = True, gpr, obj[gpr] + exclusions.append(name) + elif obj._is_level_reference(gpr, axis=axis): + in_axis, name, level, gpr = False, None, gpr, None + else: + raise KeyError(gpr) + elif isinstance(gpr, Grouper) and gpr.key is not None: + # Add key to exclusions + exclusions.append(gpr.key) + in_axis, name = False, None + else: + in_axis, name = False, None + + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " + "must be same length" + ) + + # create the Grouping + # allow us to passing the actual Grouping as the gpr + ping = ( + Grouping( + group_axis, + gpr, + obj=obj, + name=name, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis, + ) + if not isinstance(gpr, Grouping) + else gpr + ) + + groupings.append(ping) + + if len(groupings) == 0 and len(obj): + raise ValueError("No group keys passed!") + elif len(groupings) == 0: + groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + + # create the internals grouper + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + return grouper, exclusions, obj + + +def _is_label_like(val) -> bool: + return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) + + +def _convert_grouper(axis: Index, grouper): + if isinstance(grouper, dict): + return grouper.get + elif isinstance(grouper, Series): + if grouper.index.equals(axis): + return grouper._values + else: + return grouper.reindex(axis)._values + elif isinstance(grouper, (list, Series, Index, np.ndarray)): + if len(grouper) != len(axis): + raise ValueError("Grouper and axis must be same length") + return grouper + else: + return grouper diff --git a/venv/Lib/site-packages/pandas/core/groupby/ops.py b/venv/Lib/site-packages/pandas/core/groupby/ops.py new file mode 100644 index 0000000..2e95daa --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/groupby/ops.py @@ -0,0 +1,941 @@ +""" +Provide classes to perform the groupby aggregate operations. + +These are not exposed to the user and provide implementations of the grouping +operations, primarily in cython. These classes (BaseGrouper and BinGrouper) +are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. 
+""" + +import collections +from typing import List, Optional, Sequence, Tuple, Type + +import numpy as np + +from pandas._libs import NaT, iNaT, lib +import pandas._libs.groupby as libgroupby +import pandas._libs.reduction as libreduction +from pandas._typing import FrameOrSeries +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_int_or_float, + ensure_platform_int, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_integer_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import _maybe_fill, isna + +import pandas.core.algorithms as algorithms +from pandas.core.base import SelectionMixin +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.groupby import base, grouper +from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_flattened_iterator, + get_group_index, + get_group_index_sorter, + get_indexer_dict, +) + + +class BaseGrouper: + """ + This is an internal Grouper class, which actually holds + the generated groups + + Parameters + ---------- + axis : Index + groupings : Sequence[Grouping] + all the grouping instances to handle in this grouper + for example for grouper list to groupby, need to pass the list + sort : bool, default True + whether this grouper will give sorted result or not + group_keys : bool, default True + mutated : bool, default False + indexer : intp array, optional + the indexer created by Grouper + some groupers (TimeGrouper) will sort its axis and its + group_info is also sorted, so need the indexer to reorder + + """ + + def __init__( + self, + axis: Index, + groupings: "Sequence[grouper.Grouping]", + sort: bool = True, + group_keys: bool = True, + mutated: bool = False, + indexer: Optional[np.ndarray] = None, + ): + assert isinstance(axis, Index), axis + + self._filter_empty_groups = self.compressed = len(groupings) != 1 + self.axis = axis + self._groupings: List[grouper.Grouping] = list(groupings) + self.sort = sort + self.group_keys = group_keys + self.mutated = mutated + self.indexer = indexer + + @property + def groupings(self) -> List["grouper.Grouping"]: + return self._groupings + + @property + def shape(self): + return tuple(ping.ngroups for ping in self.groupings) + + def __iter__(self): + return iter(self.indices) + + @property + def nkeys(self) -> int: + return len(self.groupings) + + def get_iterator(self, data: FrameOrSeries, axis: int = 0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + splitter = self._get_splitter(data, axis=axis) + keys = self._get_group_keys() + for key, (i, group) in zip(keys, splitter): + yield key, group + + def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": + comp_ids, _, ngroups = self.group_info + return get_splitter(data, comp_ids, ngroups, axis=axis) + + def _get_grouper(self): + """ + We are a grouper as part of another's groupings. + + We have a specific method of grouping, so cannot + convert to a Index for our grouper. 
+ """ + return self.groupings[0].grouper + + def _get_group_keys(self): + if len(self.groupings) == 1: + return self.levels[0] + else: + comp_ids, _, ngroups = self.group_info + + # provide "flattened" iterator for multi-group setting + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) + + def apply(self, f, data: FrameOrSeries, axis: int = 0): + mutated = self.mutated + splitter = self._get_splitter(data, axis=axis) + group_keys = self._get_group_keys() + result_values = None + + sdata: FrameOrSeries = splitter._get_sorted_data() + if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): + # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 + # if we pass EA instead of ndarray + # TODO: can we have a workaround for EAs backed by ndarray? + pass + + elif ( + com.get_callable_name(f) not in base.plotting_methods + and isinstance(splitter, FrameSplitter) + and axis == 0 + # fast_apply/libreduction doesn't allow non-numpy backed indexes + and not sdata.index._has_complex_internals + ): + try: + result_values, mutated = splitter.fast_apply(f, group_keys) + + except libreduction.InvalidApply as err: + # This Exception is raised if `f` triggers an exception + # but it is preferable to raise the exception in Python. + if "Let this error raise above us" not in str(err): + # TODO: can we infer anything about whether this is + # worth-retrying in pure-python? + raise + + else: + # If the fast apply path could be used we can return here. + # Otherwise we need to fall back to the slow implementation. + if len(result_values) == len(group_keys): + return group_keys, result_values, mutated + + for key, (i, group) in zip(group_keys, splitter): + object.__setattr__(group, "name", key) + + # result_values is None if fast apply path wasn't taken + # or fast apply aborted with an unexpected exception. + # In either case, initialize the result list and perform + # the slow iteration. + if result_values is None: + result_values = [] + + # If result_values is not None we're in the case that the + # fast apply loop was broken prematurely but we have + # already the result for the first group which we can reuse. + elif i == 0: + continue + + # group might be modified + group_axes = group.axes + res = f(group) + if not _is_indexed_like(res, group_axes): + mutated = True + result_values.append(res) + + return group_keys, result_values, mutated + + @cache_readonly + def indices(self): + """ dict {group name -> group indices} """ + if len(self.groupings) == 1: + return self.groupings[0].indices + else: + codes_list = [ping.codes for ping in self.groupings] + keys = [com.values_from_object(ping.group_index) for ping in self.groupings] + return get_indexer_dict(codes_list, keys) + + @property + def codes(self) -> List[np.ndarray]: + return [ping.codes for ping in self.groupings] + + @property + def levels(self) -> List[Index]: + return [ping.group_index for ping in self.groupings] + + @property + def names(self): + return [ping.name for ping in self.groupings] + + def size(self) -> Series: + """ + Compute group sizes. 
+ """ + ids, _, ngroup = self.group_info + ids = ensure_platform_int(ids) + if ngroup: + out = np.bincount(ids[ids != -1], minlength=ngroup) + else: + out = [] + return Series(out, index=self.result_index, dtype="int64") + + @cache_readonly + def groups(self): + """ dict {group name -> group labels} """ + if len(self.groupings) == 1: + return self.groupings[0].groups + else: + to_groupby = zip(*(ping.grouper for ping in self.groupings)) + to_groupby = Index(to_groupby) + return self.axis.groupby(to_groupby) + + @cache_readonly + def is_monotonic(self) -> bool: + # return if my group orderings are monotonic + return Index(self.group_info[0]).is_monotonic + + @cache_readonly + def group_info(self): + comp_ids, obs_group_ids = self._get_compressed_codes() + + ngroups = len(obs_group_ids) + comp_ids = ensure_int64(comp_ids) + return comp_ids, obs_group_ids, ngroups + + @cache_readonly + def codes_info(self) -> np.ndarray: + # return the codes of items in original grouped axis + codes, _, _ = self.group_info + if self.indexer is not None: + sorter = np.lexsort((codes, self.indexer)) + codes = codes[sorter] + return codes + + def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: + all_codes = self.codes + if len(all_codes) > 1: + group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) + return compress_group_index(group_index, sort=self.sort) + + ping = self.groupings[0] + return ping.codes, np.arange(len(ping.group_index)) + + @cache_readonly + def ngroups(self) -> int: + return len(self.result_index) + + @property + def reconstructed_codes(self) -> List[np.ndarray]: + codes = self.codes + comp_ids, obs_ids, _ = self.group_info + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) + + @cache_readonly + def result_index(self) -> Index: + if not self.compressed and len(self.groupings) == 1: + return self.groupings[0].result_index.rename(self.names[0]) + + codes = self.reconstructed_codes + levels = [ping.result_index for ping in self.groupings] + result = MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + ) + return result + + def get_group_levels(self): + if not self.compressed and len(self.groupings) == 1: + return [self.groupings[0].result_index] + + name_list = [] + for ping, codes in zip(self.groupings, self.reconstructed_codes): + codes = ensure_platform_int(codes) + levels = ping.result_index.take(codes) + + name_list.append(levels) + + return name_list + + # ------------------------------------------------------------ + # Aggregation functions + + _cython_functions = { + "aggregate": { + "add": "group_add", + "prod": "group_prod", + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": "group_median", + "var": "group_var", + "first": "group_nth", + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": "group_rank", + }, + } + + _cython_arity = {"ohlc": 4} # OHLC + + _name_functions = {"ohlc": ["open", "high", "low", "close"]} + + def _is_builtin_func(self, arg): + """ + if we define an builtin function for this argument, return it, + otherwise return the arg + """ + return SelectionMixin._builtin_table.get(arg, arg) + + def _get_cython_function(self, kind: str, how: str, values, is_numeric: bool): + + dtype_str = values.dtype.name + ftype = self._cython_functions[kind][how] + + # see if there is a fused-type version of function + # only 
valid for numeric + f = getattr(libgroupby, ftype, None) + if f is not None and is_numeric: + return f + + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, "object"]: + f2 = getattr(libgroupby, f"{ftype}_{dt}", None) + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # disallow this function so we get a NotImplementedError below + # instead of a TypeError at runtime + f = None + + func = f + + if func is None: + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + + return func + + def _get_cython_func_and_vals( + self, kind: str, how: str, values: np.ndarray, is_numeric: bool + ): + """ + Find the appropriate cython function, casting if necessary. + + Parameters + ---------- + kind : sttr + how : srt + values : np.ndarray + is_numeric : bool + + Returns + ------- + func : callable + values : np.ndarray + """ + try: + func = self._get_cython_function(kind, how, values, is_numeric) + except NotImplementedError: + if is_numeric: + try: + values = ensure_float64(values) + except TypeError: + if lib.infer_dtype(values, skipna=False) == "complex": + values = values.astype(complex) + else: + raise + func = self._get_cython_function(kind, how, values, is_numeric) + else: + raise + return func, values + + def _cython_operation( + self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs + ) -> Tuple[np.ndarray, Optional[List[str]]]: + """ + Returns the values of a cython operation as a Tuple of [data, names]. + + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). + """ + + assert kind in ["transform", "aggregate"] + orig_values = values + + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + elif values.ndim == 2: + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 1, axis + + # can we do this operation with our cython functions + # if not raise NotImplementedError + + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. adding datetimes + + # categoricals are only 1d, so we + # are not setup for dim transforming + if is_categorical_dtype(values) or is_sparse(values): + raise NotImplementedError(f"{values.dtype} dtype not supported") + elif is_datetime64_any_dtype(values): + if how in ["add", "prod", "cumsum", "cumprod"]: + raise NotImplementedError( + f"datetime64 type does not support {how} operations" + ) + elif is_timedelta64_dtype(values): + if how in ["prod", "cumprod"]: + raise NotImplementedError( + f"timedelta64 type does not support {how} operations" + ) + + if is_datetime64tz_dtype(values.dtype): + # Cast to naive; we'll cast back at the end of the function + # TODO: possible need to reshape? kludge can be avoided when + # 2D EA is allowed. 
+ values = values.view("M8[ns]") + + is_datetimelike = needs_i8_conversion(values.dtype) + is_numeric = is_numeric_dtype(values.dtype) + + if is_datetimelike: + values = values.view("int64") + is_numeric = True + elif is_bool_dtype(values.dtype): + values = ensure_float64(values) + elif is_integer_dtype(values): + # we use iNaT for the missing value on ints + # so pre-convert to guard this condition + if (values == iNaT).any(): + values = ensure_float64(values) + else: + values = ensure_int_or_float(values) + elif is_numeric and not is_complex_dtype(values): + values = ensure_float64(values) + else: + values = values.astype(object) + + arity = self._cython_arity.get(how, 1) + + vdim = values.ndim + swapped = False + if vdim == 1: + values = values[:, None] + out_shape = (self.ngroups, arity) + else: + if axis > 0: + swapped = True + assert axis == 1, axis + values = values.T + if arity > 1: + raise NotImplementedError( + "arity of more than 1 is not supported for the 'how' argument" + ) + out_shape = (self.ngroups,) + values.shape[1:] + + func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) + + if how == "rank": + out_dtype = "float" + else: + if is_numeric: + out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" + else: + out_dtype = "object" + + codes, _, _ = self.group_info + + if kind == "aggregate": + result = _maybe_fill( + np.empty(out_shape, dtype=out_dtype), fill_value=np.nan + ) + counts = np.zeros(self.ngroups, dtype=np.int64) + result = self._aggregate( + result, counts, values, codes, func, is_datetimelike, min_count + ) + elif kind == "transform": + result = _maybe_fill( + np.empty_like(values, dtype=out_dtype), fill_value=np.nan + ) + + # TODO: min_count + result = self._transform( + result, values, codes, func, is_datetimelike, **kwargs + ) + + if is_integer_dtype(result) and not is_datetimelike: + mask = result == iNaT + if mask.any(): + result = result.astype("float64") + result[mask] = np.nan + elif ( + how == "add" + and is_integer_dtype(orig_values.dtype) + and is_extension_array_dtype(orig_values.dtype) + ): + # We need this to ensure that Series[Int64Dtype].resample().sum() + # remains int64 dtype. + # Two options for avoiding this special case + # 1. mask-aware ops and avoid casting to float with NaN above + # 2. 
specify the result dtype when calling this method + result = result.astype("int64") + + if kind == "aggregate" and self._filter_empty_groups and not counts.all(): + assert result.ndim != 2 + result = result[counts > 0] + + if vdim == 1 and arity == 1: + result = result[:, 0] + + names: Optional[List[str]] = self._name_functions.get(how, None) + + if swapped: + result = result.swapaxes(0, axis) + + if is_datetime64tz_dtype(orig_values.dtype): + result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) + + return result, names + + def aggregate( + self, values, how: str, axis: int = 0, min_count: int = -1 + ) -> Tuple[np.ndarray, Optional[List[str]]]: + return self._cython_operation( + "aggregate", values, how, axis, min_count=min_count + ) + + def transform(self, values, how: str, axis: int = 0, **kwargs): + return self._cython_operation("transform", values, how, axis, **kwargs) + + def _aggregate( + self, + result, + counts, + values, + comp_ids, + agg_func, + is_datetimelike: bool, + min_count: int = -1, + ): + if agg_func is libgroupby.group_nth: + # different signature from the others + # TODO: should we be using min_count instead of hard-coding it? + agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) + else: + agg_func(result, counts, values, comp_ids, min_count) + + return result + + def _transform( + self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs + ): + + comp_ids, _, ngroups = self.group_info + transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) + + return result + + def agg_series(self, obj: Series, func): + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + + if len(obj) == 0: + # SeriesGrouper would raise if we were to call _aggregate_series_fast + return self._aggregate_series_pure_python(obj, func) + + elif is_extension_array_dtype(obj.dtype): + # _aggregate_series_fast would raise TypeError when + # calling libreduction.Slider + # In the datetime64tz case it would incorrectly cast to tz-naive + # TODO: can we get a performant workaround for EAs backed by ndarray? 
+ return self._aggregate_series_pure_python(obj, func) + + elif obj.index._has_complex_internals: + # Pre-empt TypeError in _aggregate_series_fast + return self._aggregate_series_pure_python(obj, func) + + try: + return self._aggregate_series_fast(obj, func) + except ValueError as err: + if "Function does not reduce" in str(err): + # raised in libreduction + pass + else: + raise + return self._aggregate_series_pure_python(obj, func) + + def _aggregate_series_fast(self, obj: Series, func): + # At this point we have already checked that + # - obj.index is not a MultiIndex + # - obj is backed by an ndarray, not ExtensionArray + # - len(obj) > 0 + # - ngroups != 0 + func = self._is_builtin_func(func) + + group_index, _, ngroups = self.group_info + + # avoids object / Series creation overhead + dummy = obj._get_values(slice(None, 0)) + indexer = get_group_index_sorter(group_index, ngroups) + obj = obj.take(indexer) + group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) + grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) + result, counts = grouper.get_result() + return result, counts + + def _aggregate_series_pure_python(self, obj: Series, func): + + group_index, _, ngroups = self.group_info + + counts = np.zeros(ngroups, dtype=int) + result = None + + splitter = get_splitter(obj, group_index, ngroups, axis=0) + + for label, group in splitter: + res = func(group) + if result is None: + if isinstance(res, (Series, Index, np.ndarray)): + if len(res) == 1: + # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) + # FIXME: are we potentially losing import res.index info? + res = res.item() + else: + raise ValueError("Function does not reduce") + result = np.empty(ngroups, dtype="O") + + counts[label] = group.shape[0] + result[label] = res + + assert result is not None + result = lib.maybe_convert_objects(result, try_float=0) + # TODO: try_cast back to EA? + + return result, counts + + +class BinGrouper(BaseGrouper): + """ + This is an internal Grouper class + + Parameters + ---------- + bins : the split index of binlabels to group the item of axis + binlabels : the label list + filter_empty : boolean, default False + mutated : boolean, default False + indexer : a intp array + + Examples + -------- + bins: [2, 4, 6, 8, 10] + binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', + '2005-01-05', '2005-01-07', '2005-01-09'], + dtype='datetime64[ns]', freq='2D') + + the group_info, which contains the label of each item in grouped + axis, the index of label in label list, group number, is + + (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) + + means that, the grouped axis has 10 items, can be grouped into 5 + labels, the first and second items belong to the first label, the + third and forth items belong to the second label, and so on + + """ + + def __init__( + self, + bins, + binlabels, + filter_empty: bool = False, + mutated: bool = False, + indexer=None, + ): + self.bins = ensure_int64(bins) + self.binlabels = ensure_index(binlabels) + self._filter_empty_groups = filter_empty + self.mutated = mutated + self.indexer = indexer + + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise in libreduction. 
+ assert len(self.binlabels) == len(self.bins) + + @cache_readonly + def groups(self): + """ dict {group name -> group labels} """ + + # this is mainly for compat + # GH 3881 + result = { + key: value + for key, value in zip(self.binlabels, self.bins) + if key is not NaT + } + return result + + @property + def nkeys(self) -> int: + return 1 + + def _get_grouper(self): + """ + We are a grouper as part of another's groupings. + + We have a specific method of grouping, so cannot + convert to a Index for our grouper. + """ + return self + + def get_iterator(self, data: FrameOrSeries, axis: int = 0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis) + length = len(data.axes[axis]) + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + if label is not NaT: + yield label, slicer(start, edge) + start = edge + + if start < length: + yield self.binlabels[-1], slicer(start, None) + + @cache_readonly + def indices(self): + indices = collections.defaultdict(list) + + i = 0 + for label, bin in zip(self.binlabels, self.bins): + if i < bin: + if label is not NaT: + indices[label] = list(range(i, bin)) + i = bin + return indices + + @cache_readonly + def group_info(self): + ngroups = self.ngroups + obs_group_ids = np.arange(ngroups) + rep = np.diff(np.r_[0, self.bins]) + + rep = ensure_platform_int(rep) + if ngroups == len(self.bins): + comp_ids = np.repeat(np.arange(ngroups), rep) + else: + comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) + + return ( + comp_ids.astype("int64", copy=False), + obs_group_ids.astype("int64", copy=False), + ngroups, + ) + + @cache_readonly + def reconstructed_codes(self) -> List[np.ndarray]: + # get unique result indices, and prepend 0 as groupby starts from the first + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + + @cache_readonly + def result_index(self): + if len(self.binlabels) != 0 and isna(self.binlabels[0]): + return self.binlabels[1:] + + return self.binlabels + + @property + def levels(self): + return [self.binlabels] + + @property + def names(self): + return [self.binlabels.name] + + @property + def groupings(self) -> "List[grouper.Grouping]": + return [ + grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) + for lvl, name in zip(self.levels, self.names) + ] + + def agg_series(self, obj: Series, func): + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result + + if is_extension_array_dtype(obj.dtype): + # pre-empt SeriesBinGrouper from raising TypeError + return self._aggregate_series_pure_python(obj, func) + + dummy = obj[:0] + grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) + return grouper.get_result() + + +def _is_indexed_like(obj, axes) -> bool: + if isinstance(obj, Series): + if len(axes) > 1: + return False + return obj.index.equals(axes[0]) + elif isinstance(obj, DataFrame): + return obj.index.equals(axes[0]) + + return False + + +# ---------------------------------------------------------------------- +# Splitting / application + + +class DataSplitter: + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): + self.data = data + self.labels = ensure_int64(labels) + self.ngroups = ngroups + + self.axis = axis + assert isinstance(axis, int), axis + + @cache_readonly + def slabels(self): + # Sorted labels + 
return algorithms.take_nd(self.labels, self.sort_idx, allow_fill=False) + + @cache_readonly + def sort_idx(self): + # Counting sort indexer + return get_group_index_sorter(self.labels, self.ngroups) + + def __iter__(self): + sdata = self._get_sorted_data() + + if self.ngroups == 0: + # we are inside a generator, rather than raise StopIteration + # we merely return signal the end + return + + starts, ends = lib.generate_slices(self.slabels, self.ngroups) + + for i, (start, end) in enumerate(zip(starts, ends)): + yield i, self._chop(sdata, slice(start, end)) + + def _get_sorted_data(self) -> FrameOrSeries: + return self.data.take(self.sort_idx, axis=self.axis) + + def _chop(self, sdata, slice_obj: slice) -> NDFrame: + raise AbstractMethodError(self) + + +class SeriesSplitter(DataSplitter): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: + return sdata._get_values(slice_obj) + + +class FrameSplitter(DataSplitter): + def fast_apply(self, f, names): + # must return keys::list, values::list, mutated::bool + starts, ends = lib.generate_slices(self.slabels, self.ngroups) + + sdata = self._get_sorted_data() + return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) + + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + if self.axis == 0: + return sdata.iloc[slice_obj] + else: + return sdata._slice(slice_obj, axis=1) + + +def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: + if isinstance(data, Series): + klass: Type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass(data, *args, **kwargs) diff --git a/venv/Lib/site-packages/pandas/core/index.py b/venv/Lib/site-packages/pandas/core/index.py new file mode 100644 index 0000000..8cff53d --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/index.py @@ -0,0 +1,31 @@ +import warnings + +from pandas.core.indexes.api import ( # noqa:F401 + CategoricalIndex, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + IntervalIndex, + InvalidIndexError, + MultiIndex, + NaT, + NumericIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, + _new_Index, + ensure_index, + ensure_index_from_sequences, + get_objs_combined_axis, +) +from pandas.core.indexes.multi import _sparsify # noqa:F401 + +# GH#30193 +warnings.warn( + "pandas.core.index is deprecated and will be removed in a future version. " + "The public classes are available in the top-level namespace.", + FutureWarning, + stacklevel=2, +) diff --git a/venv/Lib/site-packages/pandas/core/indexers.py b/venv/Lib/site-packages/pandas/core/indexers.py new file mode 100644 index 0000000..fe47552 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexers.py @@ -0,0 +1,416 @@ +""" +Low-dependency indexing utilities. +""" +import warnings + +import numpy as np + +from pandas._typing import Any, AnyArrayLike + +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_list_like, +) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +# ----------------------------------------------------------- +# Indexer Identification + + +def is_list_like_indexer(key) -> bool: + """ + Check if we have a list-like indexer that is *not* a NamedTuple. 
+ + Parameters + ---------- + key : object + + Returns + ------- + bool + """ + # allow a list_like, but exclude NamedTuples which can be indexers + return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) + + +def is_scalar_indexer(indexer, arr_value) -> bool: + """ + Return True if we are all scalar indexers. + + Returns + ------- + bool + """ + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + return False + + +def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: + """ + Check if we have an empty indexer. + + Parameters + ---------- + indexer : object + arr_value : np.ndarray + + Returns + ------- + bool + """ + if is_list_like(indexer) and not len(indexer): + return True + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + return False + + +# ----------------------------------------------------------- +# Indexer Validation + + +def check_setitem_lengths(indexer, value, values) -> None: + """ + Validate that value and indexer are the same length. + + An special-case is allowed for when the indexer is a boolean array + and the number of true values equals the length of ``value``. In + this case, no exception is raised. + + Parameters + ---------- + indexer : sequence + Key for the setitem. + value : array-like + Value for the setitem. + values : array-like + Values being set into. + + Returns + ------- + None + + Raises + ------ + ValueError + When the indexer is an ndarray or list and the lengths don't match. + """ + # boolean with truth values == len of the value is ok too + if isinstance(indexer, (np.ndarray, list)): + if is_list_like(value) and len(indexer) != len(value): + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) + + elif isinstance(indexer, slice): + # slice + if is_list_like(value) and len(values): + if len(value) != length_of_indexer(indexer, values): + raise ValueError( + "cannot set using a slice indexer with a " + "different length than the value" + ) + + +def validate_indices(indices: np.ndarray, n: int) -> None: + """ + Perform bounds-checking for an indexer. + + -1 is allowed for indicating missing values. + + Parameters + ---------- + indices : ndarray + n : int + Length of the array being indexed. + + Raises + ------ + ValueError + + Examples + -------- + >>> validate_indices([1, 2], 3) + # OK + >>> validate_indices([1, -2], 3) + ValueError + >>> validate_indices([1, 2, 3], 3) + IndexError + >>> validate_indices([-1, -1], 0) + # OK + >>> validate_indices([0, 1], 0) + IndexError + """ + if len(indices): + min_idx = indices.min() + if min_idx < -1: + msg = f"'indices' contains values less than allowed ({min_idx} < -1)" + raise ValueError(msg) + + max_idx = indices.max() + if max_idx >= n: + raise IndexError("indices are out-of-bounds") + + +# ----------------------------------------------------------- +# Indexer Conversion + + +def maybe_convert_indices(indices, n: int): + """ + Attempt to convert indices into valid, positive indices. + + If we have negative indices, translate to positive here. + If we have indices that are out-of-bounds, raise an IndexError. 
+ + Parameters + ---------- + indices : array-like + Array of indices that we are to convert. + n : int + Number of elements in the array that we are indexing. + + Returns + ------- + array-like + An array-like of positive indices that correspond to the ones + that were passed in initially to this function. + + Raises + ------ + IndexError + One of the converted indices either exceeded the number of, + elements (specified by `n`), or was still negative. + """ + if isinstance(indices, list): + indices = np.array(indices) + if len(indices) == 0: + # If `indices` is empty, np.array will return a float, + # and will cause indexing errors. + return np.empty(0, dtype=np.intp) + + mask = indices < 0 + if mask.any(): + indices = indices.copy() + indices[mask] += n + + mask = (indices >= n) | (indices < 0) + if mask.any(): + raise IndexError("indices are out-of-bounds") + return indices + + +# ----------------------------------------------------------- +# Unsorted + + +def length_of_indexer(indexer, target=None) -> int: + """ + Return the length of a single non-tuple indexer which could be a slice. + + Returns + ------- + int + """ + if target is not None and isinstance(indexer, slice): + target_len = len(target) + start = indexer.start + stop = indexer.stop + step = indexer.step + if start is None: + start = 0 + elif start < 0: + start += target_len + if stop is None or stop > target_len: + stop = target_len + elif stop < 0: + stop += target_len + if step is None: + step = 1 + elif step < 0: + start, stop = stop + 1, start + 1 + step = -step + return (stop - start + step - 1) // step + elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)): + return len(indexer) + elif not is_list_like_indexer(indexer): + return 1 + raise AssertionError("cannot find the length of the indexer") + + +def deprecate_ndim_indexing(result): + """ + Helper function to raise the deprecation warning for multi-dimensional + indexing on 1D Series/Index. + + GH#27125 indexer like idx[:, None] expands dim, but we cannot do that + and keep an index, so we currently return ndarray, which is deprecated + (Deprecation GH#30588). + """ + if np.ndim(result) > 1: + warnings.warn( + "Support for multi-dimensional indexing (e.g. `index[:, None]`) " + "on an Index is deprecated and will be removed in a future " + "version. Convert to a numpy array before indexing instead.", + DeprecationWarning, + stacklevel=3, + ) + + +# ----------------------------------------------------------- +# Public indexer validation + + +def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: + """ + Check if `indexer` is a valid array indexer for `array`. + + For a boolean mask, `array` and `indexer` are checked to have the same + length. The dtype is validated, and if it is an integer or boolean + ExtensionArray, it is checked if there are missing values present, and + it is converted to the appropriate numpy array. Other dtypes will raise + an error. + + Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed + through as is. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array-like + The array that is being indexed (only used for the length). + indexer : array-like or list-like + The array-like that's used to index. List-like input that is not yet + a numpy array or an ExtensionArray is converted to one. Other input + types are passed through as is + + Returns + ------- + numpy.ndarray + The validated indexer as a numpy array that can be used to index. 
+ + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `indexer` cannot be converted to a numpy ndarray to index + (e.g. presence of missing values). + + See Also + -------- + api.types.is_bool_dtype : Check if `key` is of boolean dtype. + + Examples + -------- + When checking a boolean mask, a boolean ndarray is returned when the + arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.array([1, 2]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Boolean index has wrong length: 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + Traceback (most recent call last): + ... + ValueError: Cannot mask with a boolean indexer containing NA values + + A numpy boolean mask will get passed through (if the length is correct): + + >>> mask = np.array([True, False]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + + Similarly for integer indexers, an integer ndarray is returned when it is + a valid indexer, otherwise an error is (for integer indexers, a matching + length is not required): + + >>> indexer = pd.array([0, 2], dtype="Int64") + >>> arr = pd.array([1, 2, 3]) + >>> pd.api.indexers.check_array_indexer(arr, indexer) + array([0, 2]) + + >>> indexer = pd.array([0, pd.NA], dtype="Int64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) + Traceback (most recent call last): + ... + ValueError: Cannot index with an integer indexer containing NA values + + For non-integer/boolean dtypes, an appropriate error is raised: + + >>> indexer = np.array([0., 2.], dtype="float64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) + Traceback (most recent call last): + ... 
+ IndexError: arrays used as indices must be of integer or boolean type + """ + from pandas.core.construction import array as pd_array + + # whathever is not an array-like is returned as-is (possible valid array + # indexers that are not array-like: integer, slice, Ellipsis, None) + # In this context, tuples are not considered as array-like, as they have + # a specific meaning in indexing (multi-dimensional indexing) + if is_list_like(indexer): + if isinstance(indexer, tuple): + return indexer + else: + return indexer + + # convert list-likes to array + if not is_array_like(indexer): + indexer = pd_array(indexer) + if len(indexer) == 0: + # empty list is converted to float array by pd.array + indexer = np.array([], dtype=np.intp) + + dtype = indexer.dtype + if is_bool_dtype(dtype): + try: + indexer = np.asarray(indexer, dtype=bool) + except ValueError: + raise ValueError("Cannot mask with a boolean indexer containing NA values") + + # GH26658 + if len(indexer) != len(array): + raise IndexError( + f"Boolean index has wrong length: " + f"{len(indexer)} instead of {len(array)}" + ) + elif is_integer_dtype(dtype): + try: + indexer = np.asarray(indexer, dtype=np.intp) + except ValueError: + raise ValueError( + "Cannot index with an integer indexer containing NA values" + ) + else: + raise IndexError("arrays used as indices must be of integer or boolean type") + + return indexer diff --git a/venv/Lib/site-packages/pandas/core/indexes/__init__.py b/venv/Lib/site-packages/pandas/core/indexes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/indexes/accessors.py b/venv/Lib/site-packages/pandas/core/indexes/accessors.py new file mode 100644 index 0000000..db774a0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/accessors.py @@ -0,0 +1,338 @@ +""" +datetimelike delegation +""" +import numpy as np + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_arraylike, + is_integer_dtype, + is_list_like, + is_period_arraylike, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray +from pandas.core.base import NoNewAttributesMixin, PandasObject +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + + +class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): + def __init__(self, data, orig): + if not isinstance(data, ABCSeries): + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + self._parent = data + self.orig = orig + self.name = getattr(data, "name", None) + self._freeze() + + def _get_values(self): + data = self._parent + if is_datetime64_dtype(data.dtype): + return DatetimeIndex(data, copy=False, name=self.name) + + elif is_datetime64tz_dtype(data.dtype): + return DatetimeIndex(data, copy=False, name=self.name) + + elif is_timedelta64_dtype(data.dtype): + return TimedeltaIndex(data, copy=False, name=self.name) + + else: + if is_period_arraylike(data): + # TODO: use to_period_array + return PeriodArray(data, copy=False) + if is_datetime_arraylike(data): + return DatetimeIndex(data, copy=False, name=self.name) + + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + def _delegate_property_get(self, name): + from pandas import Series + + 
values = self._get_values() + + result = getattr(values, name) + + # maybe need to upcast (ints) + if isinstance(result, np.ndarray): + if is_integer_dtype(result): + result = result.astype("int64") + elif not is_list_like(result): + return result + + result = np.asarray(result) + + if self.orig is not None: + index = self.orig.index + else: + index = self._parent.index + # return the result as a Series, which is by definition a copy + result = Series(result, index=index, name=self.name) + + # setting this object will show a SettingWithCopyWarning/Error + result._is_copy = ( + "modifications to a property of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) + + return result + + def _delegate_property_set(self, name, value, *args, **kwargs): + raise ValueError( + "modifications to a property of a datetimelike object are not supported. " + "Change values on the original." + ) + + def _delegate_method(self, name, *args, **kwargs): + from pandas import Series + + values = self._get_values() + + method = getattr(values, name) + result = method(*args, **kwargs) + + if not is_list_like(result): + return result + + result = Series(result, index=self._parent.index, name=self.name) + + # setting this object will show a SettingWithCopyWarning/Error + result._is_copy = ( + "modifications to a method of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) + + return result + + +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_methods, typ="method" +) +class DatetimeProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Examples + -------- + >>> s.dt.hour + >>> s.dt.second + >>> s.dt.quarter + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + """ + + def to_pydatetime(self): + """ + Return the data as an array of native Python datetime objects. + + Timezone information is retained if present. + + .. warning:: + + Python's datetime uses microsecond resolution, which is lower than + pandas (nanosecond). The values are truncated. + + Returns + ------- + numpy.ndarray + Object dtype array containing native Python datetime objects. + + See Also + -------- + datetime.datetime : Standard library value for a datetime. + + Examples + -------- + >>> s = pd.Series(pd.date_range('20180310', periods=2)) + >>> s + 0 2018-03-10 + 1 2018-03-11 + dtype: datetime64[ns] + + >>> s.dt.to_pydatetime() + array([datetime.datetime(2018, 3, 10, 0, 0), + datetime.datetime(2018, 3, 11, 0, 0)], dtype=object) + + pandas' nanosecond precision is truncated to microseconds. 
+ + >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns')) + >>> s + 0 2018-03-10 00:00:00.000000000 + 1 2018-03-10 00:00:00.000000001 + dtype: datetime64[ns] + + >>> s.dt.to_pydatetime() + array([datetime.datetime(2018, 3, 10, 0, 0), + datetime.datetime(2018, 3, 10, 0, 0)], dtype=object) + """ + return self._get_values().to_pydatetime() + + @property + def freq(self): + return self._get_values().inferred_freq + + +@delegate_names( + delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=TimedeltaArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", +) +class TimedeltaProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Examples + -------- + >>> s.dt.hours + >>> s.dt.seconds + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + """ + + def to_pytimedelta(self): + """ + Return an array of native `datetime.timedelta` objects. + + Python's standard `datetime` library uses a different representation + timedelta's. This method converts a Series of pandas Timedeltas + to `datetime.timedelta` format with the same length as the original + Series. + + Returns + ------- + numpy.ndarray + Array of 1D containing data with `datetime.timedelta` type. + + See Also + -------- + datetime.timedelta + + Examples + -------- + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d')) + >>> s + 0 0 days + 1 1 days + 2 2 days + 3 3 days + 4 4 days + dtype: timedelta64[ns] + + >>> s.dt.to_pytimedelta() + array([datetime.timedelta(0), datetime.timedelta(1), + datetime.timedelta(2), datetime.timedelta(3), + datetime.timedelta(4)], dtype=object) + """ + return self._get_values().to_pytimedelta() + + @property + def components(self): + """ + Return a Dataframe of the components of the Timedeltas. + + Returns + ------- + DataFrame + + Examples + -------- + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) + >>> s + 0 00:00:00 + 1 00:00:01 + 2 00:00:02 + 3 00:00:03 + 4 00:00:04 + dtype: timedelta64[ns] + >>> s.dt.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 0 0 0 0 0 0 0 + 1 0 0 0 1 0 0 0 + 2 0 0 0 2 0 0 0 + 3 0 0 0 3 0 0 0 + 4 0 0 0 4 0 0 0 + """ # noqa: E501 + return self._get_values().components.set_index(self._parent.index) + + @property + def freq(self): + return self._get_values().inferred_freq + + +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method" +) +class PeriodProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Examples + -------- + >>> s.dt.hour + >>> s.dt.second + >>> s.dt.quarter + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + """ + + +class CombinedDatetimelikeProperties( + DatetimeProperties, TimedeltaProperties, PeriodProperties +): + def __new__(cls, data): + # CombinedDatetimelikeProperties isn't really instantiated. Instead + # we need to choose which parent (datetime or timedelta) is + # appropriate. Since we're checking the dtypes anyway, we'll just + # do all the validation here. 
+ from pandas import Series + + if not isinstance(data, ABCSeries): + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + orig = data if is_categorical_dtype(data) else None + if orig is not None: + data = Series( + orig.array, + name=orig.name, + copy=False, + dtype=orig.values.categories.dtype, + ) + + if is_datetime64_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_datetime64tz_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_timedelta64_dtype(data.dtype): + return TimedeltaProperties(data, orig) + elif is_period_arraylike(data): + return PeriodProperties(data, orig) + elif is_datetime_arraylike(data): + return DatetimeProperties(data, orig) + + raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/venv/Lib/site-packages/pandas/core/indexes/api.py b/venv/Lib/site-packages/pandas/core/indexes/api.py new file mode 100644 index 0000000..4072d06 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/api.py @@ -0,0 +1,301 @@ +import textwrap +from typing import List, Set + +from pandas._libs import NaT, lib + +import pandas.core.common as com +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _new_Index, + ensure_index, + ensure_index_from_sequences, +) +from pandas.core.indexes.category import CategoricalIndex +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.interval import IntervalIndex +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.numeric import ( + Float64Index, + Int64Index, + NumericIndex, + UInt64Index, +) +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.range import RangeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + +_sort_msg = textwrap.dedent( + """\ +Sorting because non-concatenation axis is not aligned. A future version +of pandas will change to not sort by default. + +To accept the future behavior, pass 'sort=False'. + +To retain the current behavior and silence the warning, pass 'sort=True'. +""" +) + + +__all__ = [ + "Index", + "MultiIndex", + "NumericIndex", + "Float64Index", + "Int64Index", + "CategoricalIndex", + "IntervalIndex", + "RangeIndex", + "UInt64Index", + "InvalidIndexError", + "TimedeltaIndex", + "PeriodIndex", + "DatetimeIndex", + "_new_Index", + "NaT", + "ensure_index", + "ensure_index_from_sequences", + "get_objs_combined_axis", + "union_indexes", + "get_consensus_names", + "all_indexes_same", +] + + +def get_objs_combined_axis( + objs, intersect: bool = False, axis=0, sort: bool = True +) -> Index: + """ + Extract combined index: return intersection or union (depending on the + value of "intersect") of indexes on given axis, or None if all objects + lack indexes (e.g. they are numpy arrays). + + Parameters + ---------- + objs : list + Series or DataFrame objects, may be mix of the two. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + axis : {0 or 'index', 1 or 'outer'}, default 0 + The axis to extract indexes from. + sort : bool, default True + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ + obs_idxes = [obj._get_axis(axis) for obj in objs] + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) + + +def _get_distinct_objs(objs: List[Index]) -> List[Index]: + """ + Return a list with distinct elements of "objs" (different ids). + Preserves order. 
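As a quick sketch of the dispatch in `CombinedDatetimelikeProperties.__new__` above: the accessor class actually returned depends on the Series dtype, and non-datetimelike data is rejected.

```python
import pandas as pd

dt = pd.Series(pd.date_range("2020-01-01", periods=2))
td = pd.Series(pd.to_timedelta(["1 days", "2 days"]))
per = pd.Series(pd.period_range("2020-01", periods=2, freq="M"))

print(type(dt.dt).__name__)    # DatetimeProperties
print(type(td.dt).__name__)    # TimedeltaProperties
print(type(per.dt).__name__)   # PeriodProperties

try:
    pd.Series([1, 2, 3]).dt    # not datetimelike
except AttributeError as exc:
    print(exc)                 # Can only use .dt accessor with datetimelike values
```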
+ """ + ids: Set[int] = set() + res = [] + for obj in objs: + if id(obj) not in ids: + ids.add(id(obj)) + res.append(obj) + return res + + +def _get_combined_index( + indexes: List[Index], intersect: bool = False, sort: bool = False +) -> Index: + """ + Return the union or intersection of indexes. + + Parameters + ---------- + indexes : list of Index or list objects + When intersect=True, do not accept list of lists. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + sort : bool, default False + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ + # TODO: handle index names! + indexes = _get_distinct_objs(indexes) + if len(indexes) == 0: + index = Index([]) + elif len(indexes) == 1: + index = indexes[0] + elif intersect: + index = indexes[0] + for other in indexes[1:]: + index = index.intersection(other) + else: + index = union_indexes(indexes, sort=sort) + index = ensure_index(index) + + if sort: + try: + index = index.sort_values() + except TypeError: + pass + return index + + +def union_indexes(indexes, sort=True) -> Index: + """ + Return the union of indexes. + + The behavior of sort and names is not consistent. + + Parameters + ---------- + indexes : list of Index or list objects + sort : bool, default True + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ + if len(indexes) == 0: + raise AssertionError("Must have at least 1 Index to union") + if len(indexes) == 1: + result = indexes[0] + if isinstance(result, list): + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + def _unique_indices(inds) -> Index: + """ + Convert indexes to lists and concatenate them, removing duplicates. + + The final dtype is inferred. + + Parameters + ---------- + inds : list of Index or list objects + + Returns + ------- + Index + """ + + def conv(i): + if isinstance(i, Index): + i = i.tolist() + return i + + return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) + + if kind == "special": + result = indexes[0] + + if hasattr(result, "union_many"): + # DatetimeIndex + return result.union_many(indexes[1:]) + else: + for other in indexes[1:]: + result = result.union(other) + return result + elif kind == "array": + index = indexes[0] + for other in indexes[1:]: + if not index.equals(other): + return _unique_indices(indexes) + + name = get_consensus_names(indexes)[0] + if name != index.name: + index = index._shallow_copy(name=name) + return index + else: # kind='list' + return _unique_indices(indexes) + + +def _sanitize_and_check(indexes): + """ + Verify the type of indexes and convert lists to Index. + + Cases: + + - [list, list, ...]: Return ([list, list, ...], 'list') + - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...]) + Lists are sorted and converted to Index. + - [Index, Index, ...]: Return ([Index, Index, ...], TYPE) + TYPE = 'special' if at least one special type, 'array' otherwise. 
+ + Parameters + ---------- + indexes : list of Index or list objects + + Returns + ------- + sanitized_indexes : list of Index or list objects + type : {'list', 'array', 'special'} + """ + kinds = list({type(index) for index in indexes}) + + if list in kinds: + if len(kinds) > 1: + indexes = [ + Index(com.try_sort(x)) if not isinstance(x, Index) else x + for x in indexes + ] + kinds.remove(list) + else: + return indexes, "list" + + if len(kinds) > 1 or Index not in kinds: + return indexes, "special" + else: + return indexes, "array" + + +def get_consensus_names(indexes): + """ + Give a consensus 'names' to indexes. + + If there's exactly one non-empty 'names', return this, + otherwise, return empty. + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + list + A list representing the consensus 'names' found. + """ + # find the non-none names, need to tupleify to make + # the set hashable, then reverse on return + consensus_names = {tuple(i.names) for i in indexes if com.any_not_none(*i.names)} + if len(consensus_names) == 1: + return list(list(consensus_names)[0]) + return [None] * indexes[0].nlevels + + +def all_indexes_same(indexes): + """ + Determine if all indexes contain the same elements. + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + bool + True if all indexes contain the same elements, False otherwise. + """ + first = indexes[0] + for index in indexes[1:]: + if not first.equals(index): + return False + return True diff --git a/venv/Lib/site-packages/pandas/core/indexes/base.py b/venv/Lib/site-packages/pandas/core/indexes/base.py new file mode 100644 index 0000000..4fcddb5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/base.py @@ -0,0 +1,5576 @@ +from datetime import datetime +import operator +from textwrap import dedent +from typing import Dict, FrozenSet, Hashable, Optional, Union +import warnings + +import numpy as np + +from pandas._libs import algos as libalgos, index as libindex, lib +import pandas._libs.join as libjoin +from pandas._libs.lib import is_datetime_array +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs.timezones import tz_compare +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution, cache_readonly + +from pandas.core.dtypes import concat as _concat +from pandas.core.dtypes.cast import maybe_cast_to_integer_array +from pandas.core.dtypes.common import ( + ensure_categorical, + ensure_int64, + ensure_object, + ensure_platform_int, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.generic import ( + ABCCategorical, + ABCDataFrame, + ABCDatetimeArray, + ABCDatetimeIndex, + ABCIndexClass, + ABCIntervalIndex, + ABCMultiIndex, + ABCPandasArray, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) +from pandas.core.dtypes.missing import array_equivalent, isna + +from pandas.core import ops +from pandas.core.accessor import 
CachedAccessor +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray +from pandas.core.base import IndexOpsMixin, PandasObject +import pandas.core.common as com +from pandas.core.construction import extract_array +from pandas.core.indexers import deprecate_ndim_indexing, maybe_convert_indices +from pandas.core.indexes.frozen import FrozenList +import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name +from pandas.core.ops.invalid import make_invalid_op +from pandas.core.strings import StringMethods + +from pandas.io.formats.printing import ( + default_pprint, + format_object_attrs, + format_object_summary, + pprint_thing, +) + +__all__ = ["Index"] + +_unsortable_types = frozenset(("mixed", "mixed-integer")) + +_index_doc_kwargs = dict( + klass="Index", + inplace="", + target_klass="Index", + raises_section="", + unique="Index", + duplicated="np.ndarray", +) +_index_shared_docs = dict() + + +def _make_comparison_op(op, cls): + def cmp_method(self, other): + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): + if other.ndim > 0 and len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if is_object_dtype(self) and isinstance(other, ABCCategorical): + left = type(other)(self._values, dtype=other.dtype) + return op(left, other) + elif is_object_dtype(self) and isinstance(other, ExtensionArray): + # e.g. PeriodArray + with np.errstate(all="ignore"): + result = op(self.values, other) + + elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): + # don't pass MultiIndex + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY(op, self.values, other) + + else: + with np.errstate(all="ignore"): + result = op(self.values, np.asarray(other)) + + if is_bool_dtype(result): + return result + return ops.invalid_comparison(self, other, op) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + +def _make_arithmetic_op(op, cls): + def index_arithmetic_method(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCTimedeltaIndex)): + return NotImplemented + + from pandas import Series + + result = op(Series(self), other) + if isinstance(result, tuple): + return (Index(result[0]), Index(result[1])) + return Index(result) + + name = f"__{op.__name__}__" + # TODO: docstring? + return set_function_name(index_arithmetic_method, name, cls) + + +class InvalidIndexError(Exception): + pass + + +_o_dtype = np.dtype(object) +_Identity = object + + +def _new_Index(cls, d): + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__. + """ + # required for backward compat, because PI can't be instantiated with + # ordinals through __new__ GH #13277 + if issubclass(cls, ABCPeriodIndex): + from pandas.core.indexes.period import _new_PeriodIndex + + return _new_PeriodIndex(cls, **d) + + if issubclass(cls, ABCMultiIndex): + if "labels" in d and "codes" not in d: + # GH#23752 "labels" kwarg has been replaced with "codes" + d["codes"] = d.pop("labels") + + return cls.__new__(cls, **d) + + +class Index(IndexOpsMixin, PandasObject): + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: object) + If dtype is None, we find the dtype that best fits the data. + If an actual dtype is provided, we coerce to that dtype if it's safe. 
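A minimal sketch of the constructor behaviour described in the `Index` docstring: the dtype is inferred when omitted and coerced (only when safe) when given.

```python
import pandas as pd

print(pd.Index([1, 2, 3]))                   # Int64Index([1, 2, 3], dtype='int64')
print(pd.Index([1, 2, 3], dtype="float64"))  # Float64Index([1.0, 2.0, 3.0], dtype='float64')
print(pd.Index(list("abc")))                 # Index(['a', 'b', 'c'], dtype='object')
```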
+ Otherwise, an error will be raised. + copy : bool + Make a copy of input ndarray. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + + See Also + -------- + RangeIndex : Index implementing a monotonic integer range. + CategoricalIndex : Index of :class:`Categorical` s. + MultiIndex : A multi-level, or hierarchical, Index. + IntervalIndex : An Index of :class:`Interval` s. + DatetimeIndex, TimedeltaIndex, PeriodIndex + Int64Index, UInt64Index, Float64Index + + Notes + ----- + An Index instance can **only** contain hashable objects + + Examples + -------- + >>> pd.Index([1, 2, 3]) + Int64Index([1, 2, 3], dtype='int64') + + >>> pd.Index(list('abc')) + Index(['a', 'b', 'c'], dtype='object') + """ + + # tolist is not actually deprecated, just suppressed in the __dir__ + _deprecations: FrozenSet[str] = ( + PandasObject._deprecations + | IndexOpsMixin._deprecations + | frozenset(["contains", "set_value"]) + ) + + # To hand over control to subclasses + _join_precedence = 1 + + # Cython methods; see github.com/cython/cython/issues/2647 + # for why we need to wrap these instead of making them class attributes + # Moreover, cython will choose the appropriate-dtyped sub-function + # given the dtypes of the passed arguments + def _left_indexer_unique(self, left, right): + return libjoin.left_join_indexer_unique(left, right) + + def _left_indexer(self, left, right): + return libjoin.left_join_indexer(left, right) + + def _inner_indexer(self, left, right): + return libjoin.inner_join_indexer(left, right) + + def _outer_indexer(self, left, right): + return libjoin.outer_join_indexer(left, right) + + _typ = "index" + _data: Union[ExtensionArray, np.ndarray] + _id = None + _name: Optional[Hashable] = None + # MultiIndex.levels previously allowed setting the index name. We + # don't allow this anymore, and raise if it happens rather than + # failing silently. + _no_setting_name: bool = False + _comparables = ["name"] + _attributes = ["name"] + _is_numeric_dtype = False + _can_hold_na = True + + # would we like our indexing holder to defer to us + _defer_to_indexing = False + + # prioritize current class for _shallow_copy_with_infer, + # used to infer integers as datetime-likes + _infer_as_myclass = False + + _engine_type = libindex.ObjectEngine + # whether we support partial string indexing. Overridden + # in DatetimeIndex and PeriodIndex + _supports_partial_string_indexing = False + + _accessors = {"str"} + + str = CachedAccessor("str", StringMethods) + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, + ) -> "Index": + + from pandas.core.indexes.range import RangeIndex + + name = maybe_extract_name(name, data, cls) + + if isinstance(data, ABCPandasArray): + # ensure users don't accidentally put a PandasArray in an index. + data = data.to_numpy() + + # range + if isinstance(data, RangeIndex): + return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) + elif isinstance(data, range): + return RangeIndex.from_range(data, dtype=dtype, name=name) + + # categorical + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + # Delay import for perf. 
https://github.com/pandas-dev/pandas/pull/31423 + from pandas.core.indexes.category import CategoricalIndex + + return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) + + # interval + elif is_interval_dtype(data) or is_interval_dtype(dtype): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + from pandas.core.indexes.interval import IntervalIndex + + closed = kwargs.pop("closed", None) + if is_dtype_equal(_o_dtype, dtype): + return IntervalIndex( + data, name=name, copy=copy, closed=closed, **kwargs + ).astype(object) + return IntervalIndex( + data, dtype=dtype, name=name, copy=copy, closed=closed, **kwargs + ) + + elif ( + is_datetime64_any_dtype(data) + or is_datetime64_any_dtype(dtype) + or "tz" in kwargs + ): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + from pandas import DatetimeIndex + + if is_dtype_equal(_o_dtype, dtype): + # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, + # will raise in the where `data` is already tz-aware. So + # we leave it out of this step and cast to object-dtype after + # the DatetimeIndex construction. + # Note we can pass copy=False because the .astype below + # will always make a copy + return DatetimeIndex(data, copy=False, name=name, **kwargs).astype( + object + ) + else: + return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) + + elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + from pandas import TimedeltaIndex + + if is_dtype_equal(_o_dtype, dtype): + # Note we can pass copy=False because the .astype below + # will always make a copy + return TimedeltaIndex(data, copy=False, name=name, **kwargs).astype( + object + ) + else: + return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) + + elif is_period_dtype(data) or is_period_dtype(dtype): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + from pandas import PeriodIndex + + if is_dtype_equal(_o_dtype, dtype): + return PeriodIndex(data, copy=False, name=name, **kwargs).astype(object) + return PeriodIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) + + # extension dtype + elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): + if not (dtype is None or is_object_dtype(dtype)): + # coerce to the provided dtype + ea_cls = dtype.construct_array_type() + data = ea_cls._from_sequence(data, dtype=dtype, copy=False) + else: + data = np.asarray(data, dtype=object) + + # coerce to the object dtype + data = data.astype(object) + return Index(data, dtype=object, copy=copy, name=name, **kwargs) + + # index-like + elif isinstance(data, (np.ndarray, Index, ABCSeries)): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + from pandas.core.indexes.numeric import ( + Float64Index, + Int64Index, + UInt64Index, + ) + + if dtype is not None: + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? 
+ + # maybe coerce to a sub-class + if is_signed_integer_dtype(data.dtype): + return Int64Index(data, copy=copy, dtype=dtype, name=name) + elif is_unsigned_integer_dtype(data.dtype): + return UInt64Index(data, copy=copy, dtype=dtype, name=name) + elif is_float_dtype(data.dtype): + return Float64Index(data, copy=copy, dtype=dtype, name=name) + elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): + subarr = data.astype("object") + else: + subarr = com.asarray_tuplesafe(data, dtype=object) + + # asarray_tuplesafe does not always copy underlying data, + # so need to make sure that this happens + if copy: + subarr = subarr.copy() + + if dtype is None: + new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) + if new_dtype is not None: + return cls( + new_data, dtype=new_dtype, copy=False, name=name, **kwargs + ) + + if kwargs: + raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + return cls._simple_new(subarr, name, **kwargs) + + elif hasattr(data, "__array__"): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) + elif data is None or is_scalar(data): + raise cls._scalar_data_error(data) + else: + if tupleize_cols and is_list_like(data): + # GH21470: convert iterable to list before determining if empty + if is_iterator(data): + data = list(data) + + if data and all(isinstance(e, tuple) for e in data): + # we must be all tuples, otherwise don't construct + # 10697 + from pandas.core.indexes.multi import MultiIndex + + return MultiIndex.from_tuples( + data, names=name or kwargs.get("names") + ) + # other iterable of some kind + subarr = com.asarray_tuplesafe(data, dtype=object) + return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) + + """ + NOTE for new Index creation: + + - _simple_new: It returns new Index with the same type as the caller. + All metadata (such as name) must be provided by caller's responsibility. + Using _shallow_copy is recommended because it fills these metadata + otherwise specified. + + - _shallow_copy: It returns new Index with the same type (using + _simple_new), but fills caller's metadata otherwise specified. Passed + kwargs will overwrite corresponding metadata. + + - _shallow_copy_with_infer: It returns new Index inferring its type + from passed values. It fills caller's metadata otherwise specified as the + same as _shallow_copy. + + See each method's docstring. + """ + + @property + def asi8(self): + """ + Integer representation of the values. + + Returns + ------- + ndarray + An ndarray with int64 dtype. + """ + return None + + @classmethod + def _simple_new(cls, values, name=None, dtype=None): + """ + We require that we have a dtype compat for the values. If we are passed + a non-dtype compat, then coerce using the constructor. + + Must be careful not to recurse. + """ + if isinstance(values, (ABCSeries, ABCIndexClass)): + # Index._data must always be an ndarray. + # This is no-copy for when _values is an ndarray, + # which should be always at this point. + values = np.asarray(values._values) + + result = object.__new__(cls) + result._data = values + # _index_data is a (temporary?) fix to ensure that the direct data + # manipulation we do in `_libs/reduction.pyx` continues to work. + # We need access to the actual ndarray, since we're messing with + # data buffers and strides. We don't re-use `_ndarray_values`, since + # we actually set this value too. 
+ result._index_data = values + result._name = name + + return result._reset_identity() + + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + # Index Internals Methods + + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. + """ + return {k: getattr(self, k, None) for k in self._attributes} + + _index_shared_docs[ + "_shallow_copy" + ] = """ + Create a new Index with the same class as the caller, don't copy the + data, use the same object attributes with passed in attributes taking + precedence. + + *this is an internal non-public method* + + Parameters + ---------- + values : the values to create the new Index, optional + kwargs : updates the default attributes for this Index + """ + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, values=None, **kwargs): + if values is None: + values = self.values + attributes = self._get_attributes_dict() + attributes.update(kwargs) + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype + + # _simple_new expects an the type of self._data + values = getattr(values, "_values", values) + if isinstance(values, ABCDatetimeArray): + # `self.values` returns `self` for tz-aware, so we need to unwrap + # more specifically + values = values.asi8 + + return self._simple_new(values, **attributes) + + def _shallow_copy_with_infer(self, values, **kwargs): + """ + Create a new Index inferring the class with passed value, don't copy + the data, use the same object attributes with passed in attributes + taking precedence. + + *this is an internal non-public method* + + Parameters + ---------- + values : the values to create the new Index, optional + kwargs : updates the default attributes for this Index + """ + attributes = self._get_attributes_dict() + attributes.update(kwargs) + attributes["copy"] = False + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype + if self._infer_as_myclass: + try: + return self._constructor(values, **attributes) + except (TypeError, ValueError): + pass + return Index(values, **attributes) + + def _update_inplace(self, result, **kwargs): + # guard when called from IndexOpsMixin + raise TypeError("Index can't be updated inplace") + + def is_(self, other) -> bool: + """ + More flexible, faster check like ``is`` but that works through views. + + Note: this is *not* the same as ``Index.identical()``, which checks + that metadata is also the same. + + Parameters + ---------- + other : object + other object to compare against. + + Returns + ------- + True if both have same underlying data, False otherwise : bool + """ + # use something other than None to be clearer + return self._id is getattr(other, "_id", Ellipsis) and self._id is not None + + def _reset_identity(self): + """ + Initializes or resets ``_id`` attribute with new object. + """ + self._id = _Identity() + return self + + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + + # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so + # `self` is not passed into the lambda. + _ndarray_values = self._ndarray_values + return self._engine_type(lambda: _ndarray_values, len(self)) + + # -------------------------------------------------------------------- + # Array-Like Methods + + # ndarray compat + def __len__(self) -> int: + """ + Return the length of the Index. 
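To illustrate the identity machinery above (`is_`, `_reset_identity`, `view`): `is_` checks whether two objects share the same underlying identity (so it works through views), while `equals` compares elements. A sketch under the assumption that the vendored version behaves as the code above reads:

```python
import pandas as pd

idx = pd.Index([1, 2, 3])
view = idx.view()    # shares identity and data with idx
copy = idx.copy()    # new identity, same elements

print(idx.is_(view))     # True  -- same underlying data
print(idx.is_(copy))     # False -- different identity
print(idx.equals(copy))  # True  -- element-wise equality
```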
+ """ + return len(self._data) + + def __array__(self, dtype=None) -> np.ndarray: + """ + The array interface, return my values. + """ + return np.asarray(self._data, dtype=dtype) + + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc. + """ + result = lib.item_from_zerodim(result) + if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: + return result + + attrs = self._get_attributes_dict() + return Index(result, **attrs) + + @cache_readonly + def dtype(self): + """ + Return the dtype object of the underlying data. + """ + return self._data.dtype + + def ravel(self, order="C"): + """ + Return an ndarray of the flattened values of the underlying data. + + Returns + ------- + numpy.ndarray + Flattened array. + + See Also + -------- + numpy.ndarray.ravel + """ + return self._ndarray_values.ravel(order=order) + + def view(self, cls=None): + + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, "_typ"): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result + + _index_shared_docs[ + "astype" + ] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + Note that any signed integer `dtype` is treated as ``'int64'``, + and any unsigned integer `dtype` is treated as ``'uint64'``, + regardless of the size. + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + """ + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self + + elif is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + + return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) + + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) + + try: + casted = self.values.astype(dtype, copy=copy) + except (TypeError, ValueError): + raise TypeError(f"Cannot cast {type(self).__name__} to dtype {dtype}") + return Index(casted, name=self.name, dtype=dtype) + + _index_shared_docs[ + "take" + ] = """ + Return a new %(klass)s of the values selected by the indices. + + For internal compatibility with numpy arrays. + + Parameters + ---------- + indices : list + Indices to be taken. + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError. + + Returns + ------- + numpy.ndarray + Elements of given indices. 
+ + See Also + -------- + numpy.ndarray.take + """ + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable( + self.values, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + else: + if allow_fill and fill_value is not None: + cls_name = type(self).__name__ + raise ValueError( + f"Unable to fill values because {cls_name} cannot contain NA" + ) + taken = self.values.take(indices) + return self._shallow_copy(taken) + + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan + ): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + raise ValueError( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=na_value + ) + else: + taken = values.take(indices) + return taken + + _index_shared_docs[ + "repeat" + ] = """ + Repeat elements of a %(klass)s. + + Returns a new %(klass)s where each element of the current %(klass)s + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + %(klass)s. + axis : None + Must be ``None``. Has no effect but is accepted for compatibility + with numpy. + + Returns + ------- + repeated_index : %(klass)s + Newly created %(klass)s with repeated elements. + + See Also + -------- + Series.repeat : Equivalent function for Series. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + >>> idx.repeat(2) + Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object') + >>> idx.repeat([1, 2, 3]) + Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') + """ + + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) + def repeat(self, repeats, axis=None): + repeats = ensure_platform_int(repeats) + nv.validate_repeat(tuple(), dict(axis=axis)) + return self._shallow_copy(self._values.repeat(repeats)) + + # -------------------------------------------------------------------- + # Copying Methods + + _index_shared_docs[ + "copy" + ] = """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. + + Parameters + ---------- + name : str, optional + deep : bool, default False + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
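The `repeat` examples from the docstring above, runnable as-is:

```python
import pandas as pd

idx = pd.Index(["a", "b", "c"])

# A scalar repeats every element; an array gives per-element counts.
print(idx.repeat(2))          # Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
print(idx.repeat([1, 2, 3]))  # Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
```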
+ """ + + @Appender(_index_shared_docs["copy"]) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + if deep: + new_index = self._shallow_copy(self._data.copy()) + else: + new_index = self._shallow_copy() + + names = kwargs.get("names") + names = self._validate_names(name=name, names=names, deep=deep) + new_index = new_index.set_names(names) + + if dtype: + new_index = new_index.astype(dtype) + return new_index + + def __copy__(self, **kwargs): + return self.copy(**kwargs) + + def __deepcopy__(self, memo=None): + """ + Parameters + ---------- + memo, default None + Standard signature. Unused + """ + return self.copy(deep=True) + + # -------------------------------------------------------------------- + # Rendering Methods + + def __repr__(self): + """ + Return a string representation for this object. + """ + klass_name = type(self).__name__ + data = self._format_data() + attrs = self._format_attrs() + space = self._format_space() + attrs_str = [f"{k}={v}" for k, v in attrs] + prepr = f",{space}".join(attrs_str) + + # no data provided, just attributes + if data is None: + data = "" + + res = f"{klass_name}({data}{prepr})" + + return res + + def _format_space(self): + + # using space here controls if the attributes + # are line separated or not (the default) + + # max_seq_items = get_option('display.max_seq_items') + # if len(self) > max_seq_items: + # space = "\n%s" % (' ' * (len(klass) + 1)) + return " " + + @property + def _formatter_func(self): + """ + Return the formatter function. + """ + return default_pprint + + def _format_data(self, name=None): + """ + Return the formatted data as a unicode string. + """ + + # do we want to justify (only do so for non-objects) + is_justify = not ( + self.inferred_type in ("string", "unicode") + or ( + self.inferred_type == "categorical" and is_object_dtype(self.categories) + ) + ) + + return format_object_summary( + self, self._formatter_func, is_justify=is_justify, name=name + ) + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value). + """ + return format_object_attrs(self) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values + + def format(self, name=False, formatter=None, **kwargs): + """ + Render a string representation of the Index. + """ + header = [] + if name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header, **kwargs) + + def _format_with_header(self, header, na_rep="NaN", **kwargs): + values = self.values + + from pandas.io.formats.format import format_array + + if is_categorical_dtype(values.dtype): + values = np.array(values) + + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) + + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] + + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() + + else: + result = _trim_front(format_array(values, None, justify="left")) + return header + result + + def to_native_types(self, slicer=None, **kwargs): + """ + Format specified values of `self` and return them. + + Parameters + ---------- + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. 
+ kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: + + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values. + + Returns + ------- + numpy.ndarray + Formatted values. + """ + + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep="", quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + + values[mask] = na_rep + return values + + def _summary(self, name=None): + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if hasattr(head, "format") and not isinstance(head, str): + head = head.format() + tail = self[-1] + if hasattr(tail, "format") and not isinstance(tail, str): + tail = tail.format() + index_summary = f", {head} to {tail}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + return f"{name}: {len(self)} entries{index_summary}" + + # -------------------------------------------------------------------- + # Conversion Methods + + def to_flat_index(self): + """ + Identity method. + + .. versionadded:: 0.24.0 + + This is implemented for compatibility with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Dame of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + """ + + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. 
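The conversion helpers in this block are easiest to see side by side; a sketch based on the docstring examples:

```python
import pandas as pd

idx = pd.Index(["Ant", "Bear", "Cow"], name="animal")

# to_series keeps the index values as both index and data ...
print(idx.to_series())

# ... while to_frame puts them in a column; index=False swaps in a fresh
# RangeIndex, and name= overrides the column label.
print(idx.to_frame(index=False, name="zoo"))
```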
To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + + from pandas import DataFrame + + if name is None: + name = self.name or 0 + result = DataFrame({name: self._values.copy()}) + + if index: + result.index = self + return result + + # -------------------------------------------------------------------- + # Name-Centric Methods + + @property + def name(self): + return self._name + + @name.setter + def name(self, value): + if self._no_setting_name: + # Used in MultiIndex.levels to avoid silently ignoring name updates. + raise RuntimeError( + "Cannot set name on a level of a MultiIndex. Use " + "'MultiIndex.set_names' instead." + ) + maybe_extract_name(value, None, type(self)) + self._name = value + + def _validate_names(self, name=None, names=None, deep=False): + """ + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. + """ + from copy import deepcopy + + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names + else: + if not is_list_like(name): + return [name] + return name + + def _get_names(self): + return FrozenList((self.name,)) + + def _set_names(self, values, level=None): + """ + Set new names on index. Each name has to be a hashable type. + + Parameters + ---------- + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None + + Raises + ------ + TypeError if each name is not hashable. + """ + if not is_list_like(values): + raise ValueError("Names must be a list-like") + if len(values) != 1: + raise ValueError(f"Length of new names must be 1, got {len(values)}") + + # GH 20527 + # All items in 'name' need to be hashable: + for name in values: + if not is_hashable(name): + raise TypeError(f"{type(self).__name__}.name must be a hashable type") + self._name = values[0] + + names = property(fset=_set_names, fget=_get_names) + + def set_names(self, names, level=None, inplace=False): + """ + Set Index or MultiIndex name. + + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.rename : Able to set new names without level. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... 
[2018, 2019]]) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx.set_names(['kind', 'year'], inplace=True) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.set_names('species', level=0) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + """ + + if level is not None and not isinstance(self, ABCMultiIndex): + raise ValueError("Level must be None for non-MultiIndex") + + if level is not None and not is_list_like(level) and is_list_like(names): + raise TypeError("Names must be a string when a single level is provided.") + + if not is_list_like(names) and level is None and self.nlevels > 1: + raise TypeError("Must pass list-like as `names`.") + + if not is_list_like(names): + names = [names] + if level is not None and not is_list_like(level): + level = [level] + + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_names(names, level=level) + if not inplace: + return idx + + def rename(self, name, inplace=False): + """ + Alter Index or MultiIndex name. + + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. + + Parameters + ---------- + name : label or list of labels + Name(s) to set. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index + The same type as the caller or None if inplace is True. + + See Also + -------- + Index.set_names : Able to set new names partially and by level. + + Examples + -------- + >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') + >>> idx.rename('grade') + Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]], + ... names=['kind', 'year']) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.rename(['species', 'year']) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + >>> idx.rename('species') + Traceback (most recent call last): + TypeError: Must pass list-like as `names`. + """ + return self.set_names([name], inplace=inplace) + + # -------------------------------------------------------------------- + # Level-Centric Methods + + @property + def nlevels(self) -> int: + """ + Number of levels. + """ + return 1 + + def _sort_levels_monotonic(self): + """ + Compat with MultiIndex. + """ + return self + + def _validate_index_level(self, level): + """ + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. + + """ + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError( + "Too many levels: Index has only 1 level, " + f"{level} is not a valid level number" + ) + elif level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) + elif level != self.name: + raise KeyError( + f"Requested level ({level}) does not match index name ({self.name})" + ) + + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 + + def sortlevel(self, level=None, ascending=True, sort_remaining=None): + """ + For internal compatibility with with the Index API. 
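The naming methods above in action; a minimal sketch reusing the docstring's data:

```python
import pandas as pd

idx = pd.Index([1, 2, 3, 4])

# Both return a new Index unless inplace=True is passed.
print(idx.set_names("quarter"))  # Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
print(idx.rename("grade"))       # equivalent for a single-level Index

mi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]],
                                names=["kind", "year"])
# On a MultiIndex a single level can be targeted with `level=`.
print(mi.set_names("species", level=0).names)  # ['species', 'year']
```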
+ + Sort the Index. This is for compat with MultiIndex + + Parameters + ---------- + ascending : bool, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + Index + """ + return self.sort_values(return_indexer=True, ascending=ascending) + + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. + + Returns + ------- + Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError( + f"Cannot remove {len(level)} levels from an index with {self.nlevels} " + "levels: at least one level must be left." + ) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_codes.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_codes[0] == -1 + result = new_levels[0].take(new_codes[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result._name = new_names[0] + return result + else: + from pandas.core.indexes.multi import MultiIndex + + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + + _index_shared_docs[ + "_get_grouper_for_level" + ] = """ + Get index grouper corresponding to an index level + + Parameters + ---------- + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level + + Returns + ------- + grouper : Index + Index of values to group on. + labels : ndarray of int or None + Array of locations in level_index. + uniques : Index or None + Index of unique values for level. 
+ """ + + @Appender(_index_shared_docs["_get_grouper_for_level"]) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) + + return grouper, None, None + + # -------------------------------------------------------------------- + # Introspection Methods + + @property + def is_monotonic(self) -> bool: + """ + Alias for is_monotonic_increasing. + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """ + Return if the index is monotonic increasing (only equal or + increasing) values. + + Examples + -------- + >>> Index([1, 2, 3]).is_monotonic_increasing + True + >>> Index([1, 2, 2]).is_monotonic_increasing + True + >>> Index([1, 3, 2]).is_monotonic_increasing + False + """ + return self._engine.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self) -> bool: + """ + Return if the index is monotonic decreasing (only equal or + decreasing) values. + + Examples + -------- + >>> Index([3, 2, 1]).is_monotonic_decreasing + True + >>> Index([3, 2, 2]).is_monotonic_decreasing + True + >>> Index([3, 1, 2]).is_monotonic_decreasing + False + """ + return self._engine.is_monotonic_decreasing + + @property + def _is_strictly_monotonic_increasing(self) -> bool: + """ + Return if the index is strictly monotonic increasing + (only increasing) values. + + Examples + -------- + >>> Index([1, 2, 3])._is_strictly_monotonic_increasing + True + >>> Index([1, 2, 2])._is_strictly_monotonic_increasing + False + >>> Index([1, 3, 2])._is_strictly_monotonic_increasing + False + """ + return self.is_unique and self.is_monotonic_increasing + + @property + def _is_strictly_monotonic_decreasing(self) -> bool: + """ + Return if the index is strictly monotonic decreasing + (only decreasing) values. + + Examples + -------- + >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing + True + >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing + False + >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing + False + """ + return self.is_unique and self.is_monotonic_decreasing + + @cache_readonly + def is_unique(self) -> bool: + """ + Return if the index has unique values. + """ + return self._engine.is_unique + + @property + def has_duplicates(self) -> bool: + return not self.is_unique + + def is_boolean(self) -> bool: + return self.inferred_type in ["boolean"] + + def is_integer(self) -> bool: + return self.inferred_type in ["integer"] + + def is_floating(self) -> bool: + return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] + + def is_numeric(self) -> bool: + return self.inferred_type in ["integer", "floating"] + + def is_object(self) -> bool: + return is_object_dtype(self.dtype) + + def is_categorical(self) -> bool: + """ + Check if the Index holds categorical data. + + Returns + ------- + boolean + True if the Index is categorical. + + See Also + -------- + CategoricalIndex : Index for categorical data. + + Examples + -------- + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... 
"Watermelon"]).astype("category") + >>> idx.is_categorical() + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_categorical() + False + + >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() + False + """ + return self.inferred_type in ["categorical"] + + def is_interval(self) -> bool: + return self.inferred_type in ["interval"] + + def is_mixed(self) -> bool: + return self.inferred_type in ["mixed"] + + def holds_integer(self): + """ + Whether the type is an integer type. + """ + return self.inferred_type in ["integer", "mixed-integer"] + + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self, skipna=False) + + @cache_readonly + def is_all_dates(self) -> bool: + return is_datetime_array(ensure_object(self.values)) + + # -------------------------------------------------------------------- + # Pickle Methods + + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + # Null Handling Methods + + _na_value = np.nan + """The expected NA value to use with this index.""" + + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. + """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values + + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + return self._isnan.nonzero()[0] + else: + return np.array([], dtype=np.int64) + + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False + + def isna(self): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA. + + See Also + -------- + Index.notna : Boolean inverse of isna. + Index.dropna : Omit entries with missing values. + isna : Top-level isna. + Series.isna : Detect missing values in Series object. + + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) + + Empty strings are not considered NA values. None is considered an NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... 
pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) + """ + return self._isnan + + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + Returns + ------- + numpy.ndarray + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + + notnull = notna + + _index_shared_docs[ + "fillna" + ] = """ + Fill NA/NaN values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + filled : Index + """ + + @Appender(_index_shared_docs["fillna"]) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() + + _index_shared_docs[ + "dropna" + ] = """ + Return Index without NA/NaN values. + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + return self._shallow_copy(self._values[~self._isnan]) + return self._shallow_copy() + + # -------------------------------------------------------------------- + # Uniqueness Methods + + _index_shared_docs[ + "index_unique" + ] = """ + Return unique values in the index. Uniques are returned in order + of appearance, this does NOT sort. + + Parameters + ---------- + level : int or str, optional, default None + Only return values from specified level (for MultiIndex). + + .. 
versionadded:: 0.23.0 + + Returns + ------- + Index without duplicates + + See Also + -------- + unique + Series.unique + """ + + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = super().unique() + return self._shallow_copy(result) + + def drop_duplicates(self, keep="first"): + """ + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + return super().drop_duplicates(keep=keep) + + def duplicated(self, keep="first"): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.duplicated : Equivalent method on pandas.Series. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Index.drop_duplicates : Remove duplicate values from Index. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + """ + return super().duplicated(keep=keep) + + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. 
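# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# The duplicate-handling methods above compose: drop_duplicates(keep=...) is
# effectively a boolean mask built from duplicated(keep=...).
import pandas as pd

idx = pd.Index(["lama", "cow", "lama", "beetle", "lama", "hippo"])
assert idx.drop_duplicates(keep="last").equals(idx[~idx.duplicated(keep="last")])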
+ + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. + + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods + + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + from pandas import Series + + return Index(Series(self) + other) + + def __radd__(self, other): + from pandas import Series + + return Index(other + Series(self)) + + def __iadd__(self, other): + # alias for __add__ + return self + other + + def __sub__(self, other): + return Index(np.array(self) - other) + + def __rsub__(self, other): + # wrap Series to ensure we pin name correctly + from pandas import Series + + return Index(other - Series(self)) + + def __and__(self, other): + return self.intersection(other) + + def __or__(self, other): + return self.union(other) + + def __xor__(self, other): + return self.symmetric_difference(other) + + def __nonzero__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + __bool__ = __nonzero__ + + # -------------------------------------------------------------------- + # Set Operation Methods + + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) + return self + + def _union_incompatible_dtypes(self, other, sort): + """ + Casts this and other index to object dtype to allow the formation + of a union between incompatible types. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + this = self.astype(object, copy=False) + # cast to Index for when `other` is list-like + other = Index(other).astype(object, copy=False) + return Index.union(this, other, sort=sort).astype(object, copy=False) + + def _is_compatible_with_other(self, other): + """ + Check whether this and the other dtype are compatible with each other. + Meaning a union can be formed between them without needing to be cast + to dtype object. + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + bool + """ + return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) + + def _validate_sort_keyword(self, sort): + if sort not in [None, False]: + raise ValueError( + "The 'sort' keyword only takes the values of " + f"None or False; {sort} was passed." + ) + + def union(self, other, sort=None): + """ + Form the union of two Index objects. + + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + .. versionchanged:: 0.25.0 + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. 
`self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + + Returns + ------- + union : Index + + Examples + -------- + + Union matching dtypes + + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if not self._is_compatible_with_other(other): + return self._union_incompatible_dtypes(other, sort=sort) + + return self._union(other, sort=sort) + + def _union(self, other, sort): + """ + Specific union logic should go here. In subclasses, union behavior + should be overwritten here rather than in `self.union`. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + + if not len(other) or self.equals(other): + return self._get_reconciled_name_object(other) + + if not len(self): + return other._get_reconciled_name_object(self) + + # TODO(EA): setops-refactor, clean all this up + if is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + + if sort is None and self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) + + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + else: + # find indexes of things in "other" that are not in "self" + if self.is_unique: + indexer = self.get_indexer(other) + indexer = (indexer == -1).nonzero()[0] + else: + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) + + if len(indexer) > 0: + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) + result = concat_compat((lvals, other_diff)) + + else: + result = lvals + + if sort is None: + try: + result = algos.safe_sort(result) + except TypeError as err: + warnings.warn( + f"{err}, sort order is undefined for incomparable objects", + RuntimeWarning, + stacklevel=3, + ) + + # for subclasses + return self._wrap_setop_result(other, result) + + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) + + _index_shared_docs[ + "intersection" + ] = """ + Form the intersection of two Index objects. + + This returns a new Index with elements common to the index and `other`. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + .. versionadded:: 0.24.0 + + .. 
versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``, to match + the behaviour of 0.23.4 and earlier. + + Returns + ------- + intersection : Index + + Examples + -------- + + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + """ + + # TODO: standardize return type of non-union setops type(self vs other) + @Appender(_index_shared_docs["intersection"]) + def intersection(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other = ensure_index(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this.intersection(other, sort=sort) + + # TODO(EA): setops-refactor, clean all this up + lvals = self._values + rvals = other._values + + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(lvals, rvals)[0] + return self._wrap_setop_result(other, result) + except TypeError: + pass + + try: + indexer = Index(rvals).get_indexer(lvals) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except (InvalidIndexError, IncompatibleFrequency): + # InvalidIndexError raised by get_indexer if non-unique + # IncompatibleFrequency raised by PeriodIndex.get_indexer + indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = indexer[indexer != -1] + + taken = other.take(indexer) + res_name = get_op_result_name(self, other) + + if sort is None: + taken = algos.safe_sort(taken.values) + return self._shallow_copy(taken, name=res_name) + + taken.name = res_name + return taken + + def difference(self, other, sort=None): + """ + Return a new Index with elements from the index that are not in + `other`. + + This is the set difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + + Returns + ------- + difference : Index + + Examples + -------- + + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) + + other, result_name = self._convert_can_do_setop(other) + + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) + the_diff = this.values.take(label_diff) + if sort is None: + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + + return this._shallow_copy(the_diff, name=result_name) + + def symmetric_difference(self, other, result_name=None, sort=None): + """ + Compute the symmetric difference of two Index objects. 
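# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# As the Notes section below states, symmetric_difference is the union of the
# two one-sided differences; a quick check of that equivalence:
import pandas as pd

idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([2, 3, 4, 5])
assert idx1.symmetric_difference(idx2).equals(
    idx1.difference(idx2).union(idx2.difference(idx1))
)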
+ + Parameters + ---------- + other : Index or array-like + result_name : str + sort : False or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + + Returns + ------- + symmetric_difference : Index + + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Int64Index([1, 5], dtype='int64') + + You can also use the ``^`` operator: + + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update + + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d( + np.arange(this.size), common_indexer, assume_unique=True + ) + left_diff = this._values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other._values.take(right_indexer) + + the_diff = concat_compat([left_diff, right_diff]) + if sort is None: + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + + attribs = self._get_attributes_dict() + attribs["name"] = result_name + if "freq" in attribs: + attribs["freq"] = None + return self._shallow_copy_with_infer(the_diff, **attribs) + + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError("Input must be Index or array-like") + return True + + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name + + # -------------------------------------------------------------------- + # Indexing Methods + + _index_shared_docs[ + "get_loc" + ] = """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + tolerance : int or float, optional + Maximum distance from index value for inexact matches. The value of + the index at the matching location most satisfy the equation + ``abs(index[loc] - key) <= tolerance``. + + .. 
versionadded:: 0.21.0 (list-like tolerance) + + Returns + ------- + loc : int if unique index, slice if monotonic index, else mask + + Examples + -------- + >>> unique_index = pd.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + + >>> monotonic_index = pd.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + + >>> non_monotonic_index = pd.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True], dtype=bool) + """ + + @Appender(_index_shared_docs["get_loc"]) + def get_loc(self, key, method=None, tolerance=None): + if method is None: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if using pad, " + "backfill or nearest lookups" + ) + try: + return self._engine.get_loc(key) + except KeyError: + return self._engine.get_loc(self._maybe_cast_indexer(key)) + indexer = self.get_indexer([key], method=method, tolerance=tolerance) + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError("get_loc requires scalar valued input") + loc = indexer.item() + if loc == -1: + raise KeyError(key) + return loc + + _index_shared_docs[ + "get_indexer" + ] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + .. versionadded:: 0.21.0 (list-like tolerance) + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + %(raises_section)s + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) + + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. + """ + + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) + + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. 
+ # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return ensure_platform_int(np.repeat(-1, target.size)) + + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer( + ptarget, method=method, limit=limit, tolerance=tolerance + ) + + if not is_dtype_equal(self.dtype, target.dtype): + this = self.astype(object) + target = target.astype(object) + return this.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + + if not self.is_unique: + raise InvalidIndexError( + "Reindexing only valid with uniquely valued Index objects" + ) + + if method == "pad" or method == "backfill": + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == "nearest": + indexer = self._get_nearest_indexer(target, limit, tolerance) + else: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if doing pad, " + "backfill or nearest reindexing" + ) + if limit is not None: + raise ValueError( + "limit argument only valid if doing pad, " + "backfill or nearest reindexing" + ) + + indexer = self._engine.get_indexer(target._ndarray_values) + + return ensure_platform_int(indexer) + + def _convert_tolerance(self, tolerance, target): + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError("list-like tolerance size must match target index size") + return tolerance + + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = ( + self._engine.get_pad_indexer + if method == "pad" + else self._engine.get_backfill_indexer + ) + indexer = method(target._ndarray_values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance( + target._ndarray_values, indexer, tolerance + ) + return indexer + + def _get_fill_indexer_searchsorted(self, target, method, limit=None): + """ + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. + """ + if limit is not None: + raise ValueError( + f"limit argument for {repr(method)} method only well-defined " + "if index and target are monotonic" + ) + + side = "left" if method == "pad" else "right" + + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = indexer == -1 + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) + if side == "left": + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer + + def _get_nearest_indexer(self, target, limit, tolerance): + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). 
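# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# How the pad / nearest fill indexers above behave from the public API:
# tolerance turns a too-distant nearest match back into -1 (missing).
import pandas as pd

idx = pd.Index([0, 5, 10])
print(idx.get_indexer([1, 6], method="pad"))                 # [0 1]  previous label
print(idx.get_indexer([1, 6], method="nearest"))             # [0 1]  closest label
print(idx.get_indexer([3], method="nearest", tolerance=1))   # [-1]   nearest is 2 away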
+ """ + left_indexer = self.get_indexer(target, "pad", limit=limit) + right_indexer = self.get_indexer(target, "backfill", limit=limit) + + target = np.asarray(target) + left_distances = abs(self.values[left_indexer] - target) + right_distances = abs(self.values[right_indexer] - target) + + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where( + op(left_distances, right_distances) | (right_indexer == -1), + left_indexer, + right_indexer, + ) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + return indexer + + def _filter_indexer_tolerance(self, target, indexer, tolerance): + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) + return indexer + + # -------------------------------------------------------------------- + # Indexer Conversion Methods + + _index_shared_docs[ + "_convert_scalar_indexer" + ] = """ + Convert a scalar indexer. + + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ["ix", "loc", "getitem", "iloc", None] + + if kind == "iloc": + return self._validate_indexer("positional", key, kind) + + if len(self) and not isinstance(self, ABCMultiIndex): + + # we can raise here if we are definitive that this + # is positional indexing (eg. .ix on with a float) + # or label indexing if we are using a type able + # to be represented in the index + + if kind in ["getitem", "ix"] and is_float(key): + if not self.is_floating(): + return self._invalid_indexer("label", key) + + elif kind in ["loc"] and is_float(key): + + # we want to raise KeyError on string/mixed here + # technically we *could* raise a TypeError + # on anything but mixed though + if self.inferred_type not in [ + "floating", + "mixed-integer-float", + "integer-na", + "string", + "unicode", + "mixed", + ]: + self._invalid_indexer("label", key) + + elif kind in ["loc"] and is_integer(key): + if not self.holds_integer(): + self._invalid_indexer("label", key) + + return key + + _index_shared_docs[ + "_convert_slice_indexer" + ] = """ + Convert a slice indexer. + + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. 
+ + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ + + @Appender(_index_shared_docs["_convert_slice_indexer"]) + def _convert_slice_indexer(self, key: slice, kind=None): + assert kind in ["ix", "loc", "getitem", "iloc", None] + + # validate iloc + if kind == "iloc": + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) + + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step + + # figure out if this is a positional indexer + def is_int(v): + return v is None or is_integer(v) + + is_null_slicer = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not ( + self.is_integer() or self.is_categorical() + ) + + if kind == "getitem": + """ + called from the getitem slicers, validate that we are in fact + integers + """ + if self.is_integer() or is_index_slice: + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) + + # convert the slice to an indexer here + + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type in ["mixed-integer-float", "integer-na"]: + raise + + if is_null_slicer: + indexer = key + elif is_positional: + indexer = key + else: + indexer = self.slice_indexer(start, stop, step, kind=kind) + + return indexer + + def _convert_listlike_indexer(self, keyarr, kind=None): + """ + Parameters + ---------- + keyarr : list-like + Indexer to convert. + + Returns + ------- + indexer : numpy.ndarray or None + Return an ndarray or None if cannot convert. + keyarr : numpy.ndarray + Return tuple-safe keys. + """ + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) + else: + keyarr = self._convert_arr_indexer(keyarr) + + indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr + + _index_shared_docs[ + "_convert_arr_indexer" + ] = """ + Convert an array-like indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : array-like + Indexer to convert. + + Returns + ------- + converted_keyarr : array-like + """ + + @Appender(_index_shared_docs["_convert_arr_indexer"]) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + return keyarr + + _index_shared_docs[ + "_convert_index_indexer" + ] = """ + Convert an Index indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. + + Returns + ------- + converted_keyarr : Index (or sub-class) + """ + + @Appender(_index_shared_docs["_convert_index_indexer"]) + def _convert_index_indexer(self, keyarr): + return keyarr + + _index_shared_docs[ + "_convert_list_indexer" + ] = """ + Convert a list-like indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. 
+ kind : iloc, ix, loc, optional + + Returns + ------- + positional indexer or None + """ + + @Appender(_index_shared_docs["_convert_list_indexer"]) + def _convert_list_indexer(self, keyarr, kind=None): + if ( + kind in [None, "iloc", "ix"] + and is_integer_dtype(keyarr) + and not self.is_floating() + and not isinstance(keyarr, ABCPeriodIndex) + ): + + if self.inferred_type == "mixed-integer": + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + # missing values are flagged as -1 by get_indexer and negative + # indices are already converted to positive indices in the + # above if-statement, so the negative flags are changed to + # values outside the range of indices so as to trigger an + # IndexError in maybe_convert_indices + indexer[indexer < 0] = len(self) + + return maybe_convert_indices(indexer, len(self)) + + elif not self.inferred_type == "integer": + keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) + return keyarr + + return None + + def _invalid_indexer(self, form, key): + """ + Consistent invalid indexer message. + """ + raise TypeError( + f"cannot do {form} indexing on {type(self)} with these " + f"indexers [{key}] of {type(key)}" + ) + + # -------------------------------------------------------------------- + # Reindex Methods + + def _can_reindex(self, indexer): + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") + + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): + """ + Create index with target's values (move/add/delete values + as necessary). + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index. + indexer : np.ndarray or None + Indices of output values in original index. + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, "name") + + # GH7774: preserve dtype/tz if target is empty and not an Index. 
+ target = _ensure_has_len(target) # target may be an iterator + + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop("freq", None) # don't preserve freq + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, dtype=self.dtype, **attrs) + else: + target = ensure_index(target) + + if level is not None: + if method is not None: + raise TypeError("Fill method not supported if level passed") + _, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True + ) + else: + if self.equals(target): + indexer = None + else: + # check is_overlapping for IntervalIndex compat + if self.is_unique and not getattr(self, "is_overlapping", False): + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + else: + if method is not None or limit is not None: + raise ValueError( + "cannot reindex a non-unique index " + "with a method or limit" + ) + indexer, missing = self.get_indexer_non_unique(target) + + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name + + return target, indexer + + def _reindex_non_unique(self, target): + """ + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index. + indexer : np.ndarray or None + Indices of output values in original index. + + """ + + target = ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None + + if len(missing): + length = np.arange(len(indexer)) + + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = ensure_int64(length[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = ensure_int64(length[check]) + + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # a unique indexer + if target.is_unique: + + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + + # need to retake to have the same size as the indexer + indexer[~check] = -1 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + new_index = self._shallow_copy_with_infer(new_labels) + return new_index, indexer, new_indexer + + # -------------------------------------------------------------------- + # Join Methods + + _index_shared_docs[ + "join" + ] = """ + Compute join_index and indexers to conform data + structures to the new index. + + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : bool, default False + sort : bool, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword). 
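# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# The reindex path above returns both the target index and the indexer mapping
# its labels back to positions in the original index (-1 for missing labels).
import pandas as pd

idx = pd.Index(["a", "b", "c"])
new_index, indexer = idx.reindex(["b", "c", "d"])
print(new_index)   # Index(['b', 'c', 'd'], dtype='object')
print(indexer)     # [ 1  2 -1]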
+ + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + self_is_mi = isinstance(self, ABCMultiIndex) + other_is_mi = isinstance(other, ABCMultiIndex) + + # try to figure out the join level + # GH3662 + if level is None and (self_is_mi or other_is_mi): + + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, return_indexers=return_indexers) + + # join on the level + if level is not None and (self_is_mi or other_is_mi): + return self._join_level( + other, level, how=how, return_indexers=return_indexers + ) + + other = ensure_index(other) + + if len(other) == 0 and how in ("left", "outer"): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index + + if len(self) == 0 and how in ("right", "outer"): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {"right": "left", "left": "right"}.get(how, how) + result = other.join( + self, how=how, level=level, return_indexers=return_indexers + ) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this.join(other, how=how, return_indexers=return_indexers) + + _validate_join_method(how) + + if not self.is_unique and not other.is_unique: + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) + else: + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) + elif self.is_monotonic and other.is_monotonic: + try: + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) + except TypeError: + pass + + if how == "left": + join_index = self + elif how == "right": + join_index = other + elif how == "inner": + # TODO: sort=False here for backwards compat. It may + # be better to use the sort parameter passed into join + join_index = self.intersection(other, sort=False) + elif how == "outer": + # TODO: sort=True here for backwards compat. 
It may + # be better to use the sort parameter passed into join + join_index = self.union(other) + + if sort: + join_index = join_index.sort_values() + + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index + + def _join_multi(self, other, how, return_indexers=True): + from pandas.core.indexes.multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + + # figure out join names + self_names = set(com.not_none(*self.names)) + other_names = set(com.not_none(*other.names)) + overlap = self_names & other_names + + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") + + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + if self_is_mi and other_is_mi: + + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) + + # if only the order differs + if not len(ldrop_names + rdrop_names): + self_jnlevels = self + other_jnlevels = other.reorder_levels(self.names) + else: + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) + + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join( + other_jnlevels, how, return_indexers=True + ) + + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names + + levels, codes, names = _restore_dropped_levels_multijoin( + self, other, dropped_names, join_idx, lidx, ridx + ) + + # Re-create the multi-index + multi_join_idx = MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=False + ) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] + + # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {"right": "left", "left": "right"}.get(how, how) + + level = other.names.index(jl) + result = self._join_level( + other, level, how=how, return_indexers=return_indexers + ) + + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result + + def _join_non_unique(self, other, how="left", return_indexers=False): + from pandas.core.reshape.merge import _get_join_indexers + + left_idx, right_idx = _get_join_indexers( + [self._ndarray_values], [other._ndarray_values], how=how, sort=True + ) + + left_idx = ensure_platform_int(left_idx) + right_idx = ensure_platform_int(right_idx) + + join_index = np.asarray(self._ndarray_values.take(left_idx)) + mask = left_idx == -1 + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index + + def _join_level( + self, other, level, how="left", return_indexers=False, keep_order=True + ): + """ + The join method *only* affects the level of the resulting + MultiIndex. 
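# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# The plain (non-level) join dispatched above can also hand back the indexers
# needed to align the underlying data.
import pandas as pd

left = pd.Index([1, 2, 3, 4])
right = pd.Index([3, 4, 5, 6])
joined, lidx, ridx = left.join(right, how="inner", return_indexers=True)
print(joined)   # Int64Index([3, 4], dtype='int64')
print(lidx)     # [2 3]
print(ridx)     # [0 1]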
Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. + + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. + """ + from pandas.core.indexes.multi import MultiIndex + + def _get_leaf_sorter(labels): + """ + Returns sorter for the inner most level while preserving the + order of higher levels. + """ + if labels[0].size == 0: + return np.empty(0, dtype="int64") + + if len(labels) == 1: + lab = ensure_int64(labels[0]) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) + return sorter + + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] + + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_int64(starts)) + + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError("Join on level between two MultiIndex objects is ambiguous") + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {"right": "left", "left": "right"}.get(how, how) + + level = left._get_level_number(level) + old_level = left.levels[level] + + if not right.is_unique: + raise NotImplementedError( + "Index._join_level on non-unique index is not implemented" + ) + + new_level, left_lev_indexer, right_lev_indexer = old_level.join( + right, how=how, return_indexers=True + ) + + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.codes[: level + 1]) + join_index = left[left_indexer] + + else: + left_lev_indexer = ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) + + new_lev_codes = algos.take_nd( + rev_indexer, left.codes[level], allow_fill=False + ) + + new_codes = list(left.codes) + new_codes[level] = new_lev_codes + + new_levels = list(left.levels) + new_levels[level] = new_level + + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_lev_codes != -1 + if not mask.all(): + new_codes = [lab[mask] for lab in new_codes] + left_indexer = left_indexer[mask] + + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_codes.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_codes, ngroups + ) + + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0] :] + new_codes = [lab[left_indexer] for lab in new_codes] + + else: # sort the leaves + mask = new_lev_codes != -1 + mask_all = mask.all() + if not mask_all: + new_codes = [lab[mask] for lab in new_codes] + + left_indexer = _get_leaf_sorter(new_codes[: level + 1]) + new_codes = [lab[left_indexer] for lab in new_codes] + + # left_indexers are w.r.t masked frame. + # reverse to original frame! 
+ if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] + + join_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=left.names, + verify_integrity=False, + ) + + if right_lev_indexer is not None: + right_indexer = algos.take_nd( + right_lev_indexer, join_index.codes[level], allow_fill=False + ) + else: + right_indexer = join_index.codes[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + if return_indexers: + left_indexer = ( + None if left_indexer is None else ensure_platform_int(left_indexer) + ) + right_indexer = ( + None if right_indexer is None else ensure_platform_int(right_indexer) + ) + return join_index, left_indexer, right_indexer + else: + return join_index + + def _join_monotonic(self, other, how="left", return_indexers=False): + if self.equals(other): + ret_index = other if how == "right" else self + if return_indexers: + return ret_index, None, None + else: + return ret_index + + sv = self._ndarray_values + ov = other._ndarray_values + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == "left": + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == "right": + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == "inner": + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == "outer": + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == "left": + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == "right": + join_index, ridx, lidx = self._left_indexer(ov, sv) + elif how == "inner": + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == "outer": + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + else: + return join_index + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) + + # -------------------------------------------------------------------- + # Uncategorized Methods + + @property + def values(self): + """ + Return an array representing the data in the Index. + + .. warning:: + + We recommend using :attr:`Index.array` or + :meth:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + + Returns + ------- + array: numpy.ndarray or ExtensionArray + + See Also + -------- + Index.array : Reference to the underlying data. + Index.to_numpy : A NumPy array representing the underlying data. + """ + return self._data.view(np.ndarray) + + @cache_readonly + @Appender(IndexOpsMixin.array.__doc__) # type: ignore + def array(self) -> ExtensionArray: + array = self._data + if isinstance(array, np.ndarray): + from pandas.core.arrays.numpy_ import PandasArray + + array = PandasArray(array) + return array + + @property + def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. 
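# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# The representations discussed above, from the public side: .values is a NumPy
# view, .array is the (Extension)Array wrapper, .to_numpy() materialises an ndarray.
import pandas as pd

idx = pd.Index([1, 2, 3])
print(type(idx.values))   # <class 'numpy.ndarray'>
print(type(idx.array))    # PandasArray wrapping the same data
print(idx.to_numpy())     # [1 2 3]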
+ + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + + See Also + -------- + values + _ndarray_values + """ + return self._data + + def _internal_get_values(self): + """ + Return `Index` data as an `numpy.ndarray`. + + Returns + ------- + numpy.ndarray + A one-dimensional numpy array of the `Index` values. + + See Also + -------- + Index.values : The attribute that _internal_get_values wraps. + + Examples + -------- + Getting the `Index` values of a `DataFrame`: + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index._internal_get_values() + array(['a', 'b', 'c'], dtype=object) + + Standalone `Index` values: + + >>> idx = pd.Index(['1', '2', '3']) + >>> idx._internal_get_values() + array(['1', '2', '3'], dtype=object) + + `MultiIndex` arrays also have only one dimension: + + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... names=('number', 'letter')) + >>> midx._internal_get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx._internal_get_values().ndim + 1 + """ + return self.values + + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super().memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + + _index_shared_docs[ + "where" + ] = """ + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : bool array-like with the same length as self + other : scalar, or array-like + + Returns + ------- + Index + """ + + @Appender(_index_shared_docs["where"]) + def where(self, cond, other=None): + if other is None: + other = self._na_value + + dtype = self.dtype + values = self.values + + if is_bool(other) or is_bool_dtype(other): + + # bools force casting + values = values.astype(object) + dtype = None + + values = np.where(cond, values, other) + + if self._is_numeric_dtype and np.any(isna(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return self._shallow_copy_with_infer(values, dtype=dtype) + + # construction helpers + @classmethod + def _scalar_data_error(cls, data): + # We return the TypeError so that we can raise it from the constructor + # in order to keep mypy happy + return TypeError( + f"{cls.__name__}(...) must be called with a collection of some " + f"kind, {repr(data)} was passed" + ) + + @classmethod + def _string_data_error(cls, data): + raise TypeError( + "String dtype not supported, you may need " + "to explicitly cast to a numeric type" + ) + + def _coerce_scalar_to_index(self, item): + """ + We need to coerce a scalar to a compat for our index type. 
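# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# Index.where, defined above, keeps entries where cond is True and falls back to
# `other` (the index's NA value by default) elsewhere.
import pandas as pd

idx = pd.Index([1.0, 2.0, 3.0, 4.0])
print(idx.where(idx > 2))            # Float64Index([nan, nan, 3.0, 4.0], dtype='float64')
print(idx.where(idx > 2, other=0))   # Float64Index([0.0, 0.0, 3.0, 4.0], dtype='float64')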
+ + Parameters + ---------- + item : scalar item to coerce + """ + dtype = self.dtype + + if self._is_numeric_dtype and isna(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return Index([item], dtype=dtype, **self._get_attributes_dict()) + + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self + + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. + """ + return value + + def _assert_can_do_op(self, value): + """ + Check value is valid for scalar op. + """ + if not is_scalar(value): + raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + + @property + def _has_complex_internals(self): + """ + Indicates if an index is not directly backed by a numpy array + """ + # used to avoid libreduction code paths, which raise or require conversion + return False + + def _is_memory_usage_qualified(self) -> bool: + """ + Return a boolean if we need a qualified .info display. + """ + return self.is_object() + + def is_type_compatible(self, kind) -> bool: + """ + Whether the index type is compatible with the provided type. + """ + return kind == self.inferred_type + + _index_shared_docs[ + "contains" + ] = """ + Return a boolean indicating whether the provided key is in the index. + + Parameters + ---------- + key : label + The key to check if it is present in the index. + + Returns + ------- + bool + Whether the key search is in the index. + + See Also + -------- + Index.isin : Returns an ndarray of boolean dtype indicating whether the + list-like key is in the index. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx + Int64Index([1, 2, 3, 4], dtype='int64') + + >>> 2 in idx + True + >>> 6 in idx + False + """ + + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) + def __contains__(self, key) -> bool: + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False + + def __hash__(self): + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") + + def __getitem__(self, key): + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + getitem = self._data.__getitem__ + promote = self._shallow_copy + + if is_scalar(key): + key = com.cast_scalar_indexer(key) + return getitem(key) + + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return promote(getitem(key)) + + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + + key = com.values_from_object(key) + result = getitem(key) + if not is_scalar(result): + if np.ndim(result) > 1: + deprecate_ndim_indexing(result) + return result + return promote(result) + else: + return result + + def _can_hold_identifiers_and_holds_name(self, name) -> bool: + """ + Faster check for ``name in self`` when we know `name` is a Python + identifier (e.g. in NDFrame.__getattr__, which hits this to support + . key lookup). 
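# --- Editor's illustrative sketch (not part of this patch); example data is hypothetical ---
# __getitem__ above accepts scalars, slices, integer arrays and boolean masks;
# any non-scalar selection comes back as an Index.
import pandas as pd

idx = pd.Index(["a", "b", "c", "d"])
print(idx[0])                            # a  (scalar, not an Index)
print(idx[[0, 2]])                       # Index(['a', 'c'], dtype='object')
print(idx[[True, False, True, False]])   # Index(['a', 'c'], dtype='object')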
For indexes that can't hold identifiers (everything + but object & categorical) we just return False. + + https://github.com/pandas-dev/pandas/issues/19764 + """ + if self.is_object() or self.is_categorical(): + return name in self + return False + + def append(self, other): + """ + Append a collection of Index options together. + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if not isinstance(obj, Index): + raise TypeError("all inputs must be Index") + + names = {obj.name for obj in to_concat} + name = None if len(names) > 1 else self.name + + return self._concat(to_concat, name) + + def _concat(self, to_concat, name): + + typs = _concat.get_dtype_kinds(to_concat) + + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return Index._concat_same_dtype(self, to_concat, name=name) + + def _concat_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class. + """ + # must be overridden in specific classes + klasses = ( + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex, + ExtensionArray, + ABCIntervalIndex, + ) + to_concat = [ + x.astype(object) if isinstance(x, klasses) else x for x in to_concat + ] + + self = to_concat[0] + attribs = self._get_attributes_dict() + attribs["name"] = name + + to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] + + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) + + def putmask(self, mask, value): + """ + Return a new Index of the values set with the mask. + + Returns + ------- + Index + + See Also + -------- + numpy.ndarray.putmask + """ + values = self.values.copy() + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err + + # coerces to object + return self.astype(object).putmask(mask, value) + + def equals(self, other) -> bool: + """ + Determine if two Index objects contain the same elements. + + Returns + ------- + bool + True if "other" is an Index and it has the same elements as calling + index; False otherwise. + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) + + if isinstance(other, ABCMultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(self.dtype): + if self.nlevels != other.nlevels: + return False + + return array_equivalent( + com.values_from_object(self), com.values_from_object(other) + ) + + def identical(self, other) -> bool: + """ + Similar to equals, but check that other comparable attributes are + also equal. + + Returns + ------- + bool + If two Index objects have equal elements and same type True, + otherwise False. + """ + return ( + self.equals(other) + and all( + ( + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables + ) + ) + and type(self) == type(other) + ) + + def asof(self, label): + """ + Return the label from the index, or, if not present, the previous one. + + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. 
+ + Parameters + ---------- + label : object + The label up to which the method returns the latest index label. + + Returns + ------- + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. + + See Also + -------- + Series.asof : Return the latest value in a Series up to the + passed index. + merge_asof : Perform an asof merge (similar to left join but it + matches on nearest key rather than equal key). + Index.get_loc : An `asof` is a thin wrapper around `get_loc` + with method='pad'. + + Examples + -------- + `Index.asof` returns the latest index label up to the passed label. + + >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) + >>> idx.asof('2014-01-01') + '2013-12-31' + + If the label is in the index, the method returns the passed label. + + >>> idx.asof('2014-01-02') + '2014-01-02' + + If all of the labels in the index are later than the passed label, + NaN is returned. + + >>> idx.asof('1999-01-02') + nan + + If the index is not sorted, an error is raised. + + >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', + ... '2014-01-03']) + >>> idx_not_sorted.asof('2013-12-31') + Traceback (most recent call last): + ValueError: index must be monotonic increasing or decreasing + """ + try: + loc = self.get_loc(label, method="pad") + except KeyError: + return self._na_value + else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + return self[loc] + + def asof_locs(self, where, mask): + """ + Find the locations (indices) of the labels from the index for + every entry in the `where` argument. + + As in the `asof` function, if the label (a particular entry in + `where`) is not in the index, the latest index label up to the + passed label is chosen and its index returned. + + If all of the labels in the index are later than a label in `where`, + -1 is returned. + + `mask` is used to ignore NA values in the index during calculation. + + Parameters + ---------- + where : Index + An Index consisting of an array of timestamps. + mask : array-like + Array of booleans denoting where values in the original + data are not NA. + + Returns + ------- + numpy.ndarray + An array of locations (indices) of the labels from the Index + which correspond to the return values of the `asof` function + for every element in `where`. + """ + locs = self.values[mask].searchsorted(where.values, side="right") + locs = np.where(locs > 0, locs - 1, 0) + + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 + + return result + + def sort_values(self, return_indexer=False, ascending=True): + """ + Return a sorted copy of the index. + + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. + + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. + + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. + + See Also + -------- + Series.sort_values : Sort values of a Series. + DataFrame.sort_values : Sort values in a DataFrame. 
+ + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def sort(self, *args, **kwargs): + """ + Use sort_values instead. + """ + raise TypeError("cannot sort an Index object in-place, use sort_values instead") + + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or str, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + + Returns + ------- + pandas.Index + Shifted index. + + See Also + -------- + Series.shift : Shift values of Series. + + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. + + Examples + -------- + Put the first 5 month starts of 2011 into an index. + + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') + + Shift the index by 10 days. + + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) + + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. + + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') + """ + raise NotImplementedError(f"Not supported for type {type(self).__name__}") + + def argsort(self, *args, **kwargs): + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. + + Returns + ------- + numpy.ndarray + Integer indices that would sort the index if used as + an indexer. + + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. + + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') + + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) + + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + result = self.asi8 + if result is None: + result = np.array(self) + return result.argsort(*args, **kwargs) + + _index_shared_docs[ + "get_value" + ] = """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. 
+ + Returns + ------- + scalar + A value in the Series with the index of the key value in self. + """ + + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) + def get_value(self, series, key): + + # if we have something that is Index-like, then + # use this, e.g. DatetimeIndex + # Things like `Series._get_value` (via .at) pass the EA directly here. + s = extract_array(series, extract_numpy=True) + if isinstance(s, ExtensionArray): + if is_scalar(key): + # GH 20882, 21257 + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + elif is_integer(key): + return s[key] + else: + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) + + s = com.values_from_object(series) + k = com.values_from_object(key) + + k = self._convert_scalar_indexer(k, kind="getitem") + try: + return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + + try: + return libindex.get_value_at(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: + raise e1 + except TypeError: + # e.g. "[False] is an invalid key" + if is_scalar(key): + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. + + .. deprecated:: 1.0 + + Notes + ----- + Only use this if you know what you're doing. + """ + warnings.warn( + ( + "The 'set_value' method is deprecated, and " + "will be removed in a future version." + ), + FutureWarning, + stacklevel=2, + ) + self._engine.set_value( + com.values_from_object(arr), com.values_from_object(key), value + ) + + _index_shared_docs[ + "get_indexer_non_unique" + ] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : ndarray of int + An indexer into the target of the values not found. + These correspond to the -1 in the indexer array. + """ + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if is_categorical(target): + tgt_values = np.asarray(target) + elif self.is_all_dates and target.is_all_dates: # GH 30399 + tgt_values = target.asi8 + else: + tgt_values = target._ndarray_values + + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + + def get_indexer_for(self, target, **kwargs): + """ + Guaranteed return of an indexer even when non-unique. 
+ + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. + + Returns + ------- + numpy.ndarray + List of indices. + """ + if self.is_unique: + return self.get_indexer(target, **kwargs) + indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer + + def _maybe_promote(self, other): + # A hack, but it works + + if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): + return type(other)(self), other + elif self.inferred_type == "boolean": + if not is_object_dtype(self.dtype): + return self.astype("object"), other.astype("object") + return self, other + + def groupby(self, values) -> Dict[Hashable, np.ndarray]: + """ + Group the index labels by a given array of values. + + Parameters + ---------- + values : array + Values used to determine the groups. + + Returns + ------- + dict + {group name -> group labels} + """ + + # TODO: if we are a MultiIndex, we can do better + # that converting to tuples + if isinstance(values, ABCMultiIndex): + values = values.values + values = ensure_categorical(values) + result = values._reverse_indexer() + + # map to the label + result = {k: self.take(v) for k, v in result.items()} + + return result + + def map(self, mapper, na_action=None): + """ + Map values using input correspondence (a dict, Series, or function). + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + + from pandas.core.indexes.multi import MultiIndex + + new_values = super()._map_values(mapper, na_action=na_action) + + attributes = self._get_attributes_dict() + + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get("name"): + names = [attributes.get("name")] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, names=names) + + attributes["copy"] = False + if not new_values.size: + # empty + attributes["dtype"] = self.dtype + + return Index(new_values, **attributes) + + def isin(self, values, level=None): + """ + Return a boolean array where the index values are in `values`. + + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. + + Parameters + ---------- + values : set or list-like + Sought values. + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). + + Returns + ------- + is_contained : ndarray + NumPy array of boolean values. + + See Also + -------- + Series.isin : Same for Series. + DataFrame.isin : Same method for DataFrames. + + Notes + ----- + In the case of `MultiIndex` you must either specify `values` as a + list-like object containing tuples that are the same length as the + number of levels, or specify `level`. Otherwise it will raise a + ``ValueError``. + + If `level` is specified: + + - if it is the name of one *and only one* index level, use that level; + - otherwise it should be a number indicating level position. 
+ + Examples + -------- + >>> idx = pd.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + + Check whether each index value in a list of values. + >>> idx.isin([1, 4]) + array([ True, False, False]) + + >>> midx = pd.MultiIndex.from_arrays([[1,2,3], + ... ['red', 'blue', 'green']], + ... names=('number', 'color')) + >>> midx + MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], + codes=[[0, 1, 2], [2, 0, 1]], + names=['number', 'color']) + + Check whether the strings in the 'color' level of the MultiIndex + are in a list of colors. + + >>> midx.isin(['red', 'orange', 'yellow'], level='color') + array([ True, False, False]) + + To check across the levels of a MultiIndex, pass a list of tuples: + + >>> midx.isin([(1, 'red'), (3, 'red')]) + array([ True, False, False]) + + For a DatetimeIndex, string values in `values` are converted to + Timestamps. + + >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] + >>> dti = pd.to_datetime(dates) + >>> dti + DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], + dtype='datetime64[ns]', freq=None) + + >>> dti.isin(['2000-03-11']) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self, values) + + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + # this is for partial string indexing, + # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex + raise NotImplementedError + + def slice_indexer(self, start=None, end=None, step=None, kind=None): + """ + For an ordered or unique index, compute the slice indexer for input + labels and step. + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning. + end : label, default None + If None, defaults to the end. + step : int, default None + kind : str, default None + + Returns + ------- + indexer : slice + + Raises + ------ + KeyError : If key does not exist, or key is not unique and index is + not ordered. + + Notes + ----- + This function assumes that the data is sorted, so use at your own peril + + Examples + -------- + This is a method on all index types. For example you can do: + + >>> idx = pd.Index(list('abcd')) + >>> idx.slice_indexer(start='b', end='c') + slice(1, 3) + + >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) + >>> idx.slice_indexer(start='b', end=('c', 'g')) + slice(1, 3) + """ + start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) + + # return a slice + if not is_scalar(start_slice): + raise AssertionError("Start slice bound is non-scalar") + if not is_scalar(end_slice): + raise AssertionError("End slice bound is non-scalar") + + return slice(start_slice, end_slice, step) + + def _maybe_cast_indexer(self, key): + """ + If we have a float key and are not a floating index, then try to cast + to an int if equivalent. + """ + + if is_float(key) and not self.is_floating(): + try: + ckey = int(key) + if ckey == key: + key = ckey + except (OverflowError, ValueError, TypeError): + pass + return key + + def _validate_indexer(self, form, key, kind): + """ + If we are positional indexer, validate that we have appropriate + typed bounds must be an integer. 
+ """ + assert kind in ["ix", "loc", "getitem", "iloc"] + + if key is None: + pass + elif is_integer(key): + pass + elif kind in ["iloc", "getitem"]: + self._invalid_indexer(form, key) + return key + + _index_shared_docs[ + "_maybe_cast_slice_bound" + ] = """ + This function should be overloaded in subclasses that allow non-trivial + casting on label-slice bounds, e.g. datetime-like indices allowing + strings containing formatted datetimes. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + label : object + + Notes + ----- + Value of `side` parameter should be validated in caller. + """ + + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) + def _maybe_cast_slice_bound(self, label, side, kind): + assert kind in ["ix", "loc", "getitem", None] + + # We are a plain index here (sub-class override this method if they + # wish to have special treatment for floats/ints, e.g. Float64Index and + # datetimelike Indexes + # reject them + if is_float(label): + if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): + self._invalid_indexer("slice", label) + + # we are trying to find integer bounds on a non-integer based index + # this is rejected (generally .loc gets you here) + elif is_integer(label): + self._invalid_indexer("slice", label) + + return label + + def _searchsorted_monotonic(self, label, side="left"): + if self.is_monotonic_increasing: + return self.searchsorted(label, side=side) + elif self.is_monotonic_decreasing: + # np.searchsorted expects ascending sort order, have to reverse + # everything for it to work (element ordering, search side and + # resulting value). + pos = self[::-1].searchsorted( + label, side="right" if side == "left" else "left" + ) + return len(self) - pos + + raise ValueError("index must be monotonic increasing or decreasing") + + def get_slice_bound(self, label, side, kind): + """ + Calculate slice bound that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if ``side=='right'``) position + of given label. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + int + Index of label. + """ + assert kind in ["ix", "loc", "getitem", None] + + if side not in ("left", "right"): + raise ValueError( + f"Invalid value for side kwarg, must be either" + f" 'left' or 'right': {side}" + ) + + original_label = label + + # For datetime indices label may be a string that has to be converted + # to datetime boundary according to its resolution. + label = self._maybe_cast_slice_bound(label, side, kind) + + # we need to look up the label + try: + slc = self.get_loc(label) + except KeyError as err: + try: + return self._searchsorted_monotonic(label, side) + except ValueError: + # raise the original KeyError + raise err + + if isinstance(slc, np.ndarray): + # get_loc may return a boolean array or an array of indices, which + # is OK as long as they are representable by a slice. 
+ if is_bool_dtype(slc): + slc = lib.maybe_booleans_to_slice(slc.view("u1")) + else: + slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) + if isinstance(slc, np.ndarray): + raise KeyError( + f"Cannot get {side} slice bound for non-unique " + f"label: {repr(original_label)}" + ) + + if isinstance(slc, slice): + if side == "left": + return slc.start + else: + return slc.stop + else: + if side == "right": + return slc + 1 + else: + return slc + + def slice_locs(self, start=None, end=None, step=None, kind=None): + """ + Compute slice locations for input labels. + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning. + end : label, default None + If None, defaults to the end. + step : int, defaults None + If None, defaults to 1. + kind : {'ix', 'loc', 'getitem'} or None + + Returns + ------- + start, end : int + + See Also + -------- + Index.get_loc : Get location for a single label. + + Notes + ----- + This method only works if the index is monotonic or unique. + + Examples + -------- + >>> idx = pd.Index(list('abcd')) + >>> idx.slice_locs(start='b', end='c') + (1, 3) + """ + inc = step is None or step >= 0 + + if not inc: + # If it's a reverse slice, temporarily swap bounds. + start, end = end, start + + # GH 16785: If start and end happen to be date strings with UTC offsets + # attempt to parse and check that the offsets are the same + if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)): + try: + ts_start = Timestamp(start) + ts_end = Timestamp(end) + except (ValueError, TypeError): + pass + else: + if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): + raise ValueError("Both dates must have the same UTC offset") + + start_slice = None + if start is not None: + start_slice = self.get_slice_bound(start, "left", kind) + if start_slice is None: + start_slice = 0 + + end_slice = None + if end is not None: + end_slice = self.get_slice_bound(end, "right", kind) + if end_slice is None: + end_slice = len(self) + + if not inc: + # Bounds at this moment are swapped, swap them back and shift by 1. + # + # slice_locs('B', 'A', step=-1): s='B', e='A' + # + # s='A' e='B' + # AFTER SWAP: | | + # v ------------------> V + # ----------------------------------- + # | | |A|A|A|A| | | | | |B|B| | | | | + # ----------------------------------- + # ^ <------------------ ^ + # SHOULD BE: | | + # end=s-1 start=e-1 + # + end_slice, start_slice = start_slice - 1, end_slice - 1 + + # i == -1 triggers ``len(self) + i`` selection that points to the + # last element, not before-the-first one, subtracting len(self) + # compensates that. + if end_slice == -1: + end_slice -= len(self) + if start_slice == -1: + start_slice -= len(self) + + return start_slice, end_slice + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted. + + Returns + ------- + new_index : Index + """ + return self._shallow_copy(np.delete(self._data, loc)) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. + + Follows Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + """ + _self = np.asarray(self) + item = self._coerce_scalar_to_index(item)._ndarray_values + idx = np.concatenate((_self[:loc], item, _self[loc:])) + return self._shallow_copy_with_infer(idx) + + def drop(self, labels, errors="raise"): + """ + Make new Index with passed list of labels deleted. 
+ + Parameters + ---------- + labels : array-like + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + + Returns + ------- + dropped : Index + + Raises + ------ + KeyError + If not all of the labels are found in the selected axis + """ + arr_dtype = "object" if self.dtype == "object" else None + labels = com.index_labels_to_array(labels, dtype=arr_dtype) + indexer = self.get_indexer(labels) + mask = indexer == -1 + if mask.any(): + if errors != "ignore": + raise KeyError(f"{labels[mask]} not found in axis") + indexer = indexer[~mask] + return self.delete(indexer) + + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods + + @classmethod + def _add_comparison_methods(cls): + """ + Add in comparison methods. + """ + cls.__eq__ = _make_comparison_op(operator.eq, cls) + cls.__ne__ = _make_comparison_op(operator.ne, cls) + cls.__lt__ = _make_comparison_op(operator.lt, cls) + cls.__gt__ = _make_comparison_op(operator.gt, cls) + cls.__le__ = _make_comparison_op(operator.le, cls) + cls.__ge__ = _make_comparison_op(operator.ge, cls) + + @classmethod + def _add_numeric_methods_add_sub_disabled(cls): + """ + Add in the numeric add/sub methods to disable. + """ + cls.__add__ = make_invalid_op("__add__") + cls.__radd__ = make_invalid_op("__radd__") + cls.__iadd__ = make_invalid_op("__iadd__") + cls.__sub__ = make_invalid_op("__sub__") + cls.__rsub__ = make_invalid_op("__rsub__") + cls.__isub__ = make_invalid_op("__isub__") + + @classmethod + def _add_numeric_methods_disabled(cls): + """ + Add in numeric methods to disable other than add/sub. + """ + cls.__pow__ = make_invalid_op("__pow__") + cls.__rpow__ = make_invalid_op("__rpow__") + cls.__mul__ = make_invalid_op("__mul__") + cls.__rmul__ = make_invalid_op("__rmul__") + cls.__floordiv__ = make_invalid_op("__floordiv__") + cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") + cls.__truediv__ = make_invalid_op("__truediv__") + cls.__rtruediv__ = make_invalid_op("__rtruediv__") + cls.__mod__ = make_invalid_op("__mod__") + cls.__divmod__ = make_invalid_op("__divmod__") + cls.__neg__ = make_invalid_op("__neg__") + cls.__pos__ = make_invalid_op("__pos__") + cls.__abs__ = make_invalid_op("__abs__") + cls.__inv__ = make_invalid_op("__inv__") + + @classmethod + def _add_numeric_methods_binary(cls): + """ + Add in numeric methods. + """ + cls.__add__ = _make_arithmetic_op(operator.add, cls) + cls.__radd__ = _make_arithmetic_op(ops.radd, cls) + cls.__sub__ = _make_arithmetic_op(operator.sub, cls) + cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) + cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls) + cls.__pow__ = _make_arithmetic_op(operator.pow, cls) + + cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) + cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) + + # TODO: rmod? rdivmod? + cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) + cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) + cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__mul__ = _make_arithmetic_op(operator.mul, cls) + cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) + + @classmethod + def _add_numeric_methods_unary(cls): + """ + Add in numeric unary methods. 
+ """ + + def _make_evaluate_unary(op, opstr): + def _evaluate_numeric_unary(self): + + attrs = self._get_attributes_dict() + return Index(op(self.values), **attrs) + + _evaluate_numeric_unary.__name__ = opstr + return _evaluate_numeric_unary + + cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__") + cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__") + cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__") + cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__") + + @classmethod + def _add_numeric_methods(cls): + cls._add_numeric_methods_unary() + cls._add_numeric_methods_binary() + + @classmethod + def _add_logical_methods(cls): + """ + Add in logical methods. + """ + _doc = """ + %(desc)s + + Parameters + ---------- + *args + These parameters will be passed to numpy.%(outname)s. + **kwargs + These parameters will be passed to numpy.%(outname)s. + + Returns + ------- + %(outname)s : bool or array_like (if axis is specified) + A single element array_like may be converted to bool.""" + + _index_shared_docs["index_all"] = dedent( + """ + + See Also + -------- + Index.any : Return whether any element in an Index is True. + Series.any : Return whether any element in a Series is True. + Series.all : Return whether all elements in a Series are True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. + + Examples + -------- + **all** + + True, because nonzero integers are considered True. + + >>> pd.Index([1, 2, 3]).all() + True + + False, because ``0`` is considered False. + + >>> pd.Index([0, 1, 2]).all() + False + + **any** + + True, because ``1`` is considered True. + + >>> pd.Index([0, 0, 1]).any() + True + + False, because ``0`` is considered False. + + >>> pd.Index([0, 0, 0]).any() + False + """ + ) + + _index_shared_docs["index_any"] = dedent( + """ + + See Also + -------- + Index.all : Return whether all elements are True. + Series.all : Return whether all elements are True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. + + Examples + -------- + >>> index = pd.Index([0, 1, 2]) + >>> index.any() + True + + >>> index = pd.Index([0, 0, 0]) + >>> index.any() + False + """ + ) + + def _make_logical_function(name, desc, f): + @Substitution(outname=name, desc=desc) + @Appender(_index_shared_docs["index_" + name]) + @Appender(_doc) + def logical_func(self, *args, **kwargs): + result = f(self.values) + if ( + isinstance(result, (np.ndarray, ABCSeries, Index)) + and result.ndim == 0 + ): + # return NumPy type + return result.dtype.type(result.item()) + else: # pragma: no cover + return result + + logical_func.__name__ = name + return logical_func + + cls.all = _make_logical_function( + "all", "Return whether all elements are True.", np.all + ) + cls.any = _make_logical_function( + "any", "Return whether any element is True.", np.any + ) + + @classmethod + def _add_logical_methods_disabled(cls): + """ + Add in logical methods to disable. + """ + cls.all = make_invalid_op("all") + cls.any = make_invalid_op("any") + + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. 
+ """ + # not using "(len(self), )" to return "correct" shape if the values + # consists of a >1 D array (see GH-27775) + # overridden in MultiIndex.shape to avoid materializing the values + return self._values.shape + + +Index._add_numeric_methods_disabled() +Index._add_logical_methods() +Index._add_comparison_methods() + + +def ensure_index_from_sequences(sequences, names=None): + """ + Construct an index from sequences of data. + + A single sequence returns an Index. Many sequences returns a + MultiIndex. + + Parameters + ---------- + sequences : sequence of sequences + names : sequence of str + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> ensure_index_from_sequences([[1, 2, 3]], names=['name']) + Int64Index([1, 2, 3], dtype='int64', name='name') + + >>> ensure_index_from_sequences([['a', 'a'], ['a', 'b']], + names=['L1', 'L2']) + MultiIndex([('a', 'a'), + ('a', 'b')], + names=['L1', 'L2']) + + See Also + -------- + ensure_index + """ + from pandas.core.indexes.multi import MultiIndex + + if len(sequences) == 1: + if names is not None: + names = names[0] + return Index(sequences[0], name=names) + else: + return MultiIndex.from_arrays(sequences, names=names) + + +def ensure_index(index_like, copy=False): + """ + Ensure that we have an index from some index-like object. + + Parameters + ---------- + index : sequence + An Index or other sequence + copy : bool + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> ensure_index(['a', 'b']) + Index(['a', 'b'], dtype='object') + + >>> ensure_index([('a', 'a'), ('b', 'c')]) + Index([('a', 'a'), ('b', 'c')], dtype='object') + + >>> ensure_index([['a', 'a'], ['b', 'c']]) + MultiIndex([('a', 'b'), + ('a', 'c')], + dtype='object') + ) + + See Also + -------- + ensure_index_from_sequences + """ + if isinstance(index_like, Index): + if copy: + index_like = index_like.copy() + return index_like + if hasattr(index_like, "name"): + return Index(index_like, name=index_like.name, copy=copy) + + if is_iterator(index_like): + index_like = list(index_like) + + # must check for exactly list here because of strict type + # check in clean_index_list + if isinstance(index_like, list): + if type(index_like) != list: + index_like = list(index_like) + + converted, all_arrays = lib.clean_index_list(index_like) + + if len(converted) > 0 and all_arrays: + from pandas.core.indexes.multi import MultiIndex + + return MultiIndex.from_arrays(converted) + else: + index_like = converted + else: + # clean_index_list does the equivalent of copying + # so only need to do this if not list instance + if copy: + from copy import copy + + index_like = copy(index_like) + + return Index(index_like) + + +def _ensure_has_len(seq): + """ + If seq is an iterator, put its values into a list. + """ + try: + len(seq) + except TypeError: + return list(seq) + else: + return seq + + +def _trim_front(strings): + """ + Trims zeros and decimal points. + """ + trimmed = strings + while len(strings) > 0 and all(x[0] == " " for x in trimmed): + trimmed = [x[1:] for x in trimmed] + return trimmed + + +def _validate_join_method(method): + if method not in ["left", "right", "inner", "outer"]: + raise ValueError(f"do not recognize join method {method}") + + +def default_index(n): + from pandas.core.indexes.range import RangeIndex + + return RangeIndex(0, n, name=None) + + +def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: + """ + If no name is passed, then extract it from data, validating hashability. 
+ """ + if name is None and isinstance(obj, (Index, ABCSeries)): + # Note we don't just check for "name" attribute since that would + # pick up e.g. dtype.name + name = obj.name + + # GH#29069 + if not is_hashable(name): + raise TypeError(f"{cls.__name__}.name must be a hashable type") + + return name + + +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _maybe_cast_data_without_dtype(subarr): + """ + If we have an arraylike input but no passed dtype, try to infer + a supported dtype. 
+ + Parameters + ---------- + subarr : np.ndarray, Index, or Series + + Returns + ------- + converted : np.ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + """ + # Runtime import needed bc IntervalArray imports Index + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + DatetimeArray, + TimedeltaArray, + ) + + inferred = lib.infer_dtype(subarr, skipna=False) + + if inferred == "integer": + try: + data = _try_convert_to_int_array(subarr, False, None) + return data, data.dtype + except ValueError: + pass + + return subarr, object + + elif inferred in ["floating", "mixed-integer-float", "integer-na"]: + # TODO: Returns IntegerArray for integer-na case in the future + return subarr, np.float64 + + elif inferred == "interval": + try: + data = IntervalArray._from_sequence(subarr, copy=False) + return data, data.dtype + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass + elif inferred == "boolean": + # don't support boolean explicitly ATM + pass + elif inferred != "string": + if inferred.startswith("datetime"): + try: + data = DatetimeArray._from_sequence(subarr, copy=False) + return data, data.dtype + except (ValueError, OutOfBoundsDatetime): + # GH 27011 + # If we have mixed timezones, just send it + # down the base constructor + pass + + elif inferred.startswith("timedelta"): + data = TimedeltaArray._from_sequence(subarr, copy=False) + return data, data.dtype + elif inferred == "period": + try: + data = PeriodArray._from_sequence(subarr) + return data, data.dtype + except IncompatibleFrequency: + pass + + return subarr, subarr.dtype + + +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. + """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. 
+ try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError diff --git a/venv/Lib/site-packages/pandas/core/indexes/category.py b/venv/Lib/site-packages/pandas/core/indexes/category.py new file mode 100644 index 0000000..5120136 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/category.py @@ -0,0 +1,884 @@ +from typing import Any, List +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import index as libindex +from pandas._libs.hashtable import duplicated_int64 +from pandas._typing import AnyArrayLike +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_categorical_dtype, + is_interval_dtype, + is_list_like, + is_scalar, +) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ABCCategorical, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import accessor +from pandas.core.algorithms import take_1d +from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.extension import ExtensionIndex, inherit_names +import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) + + +@inherit_names( + [ + "argsort", + "_internal_get_values", + "tolist", + "codes", + "categories", + "ordered", + "_reverse_indexer", + "searchsorted", + "is_dtype_equal", + "min", + "max", + ], + Categorical, +) +@accessor.delegate_names( + delegate=Categorical, + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + ], + typ="method", + overwrite=True, +) +class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): + """ + Index based on an underlying :class:`Categorical`. + + CategoricalIndex, like Categorical, can only take on a limited, + and usually fixed, number of possible values (`categories`). Also, + like Categorical, it might have an order, but numerical operations + (additions, divisions, ...) are not possible. + + Parameters + ---------- + data : array-like (1-dimensional) + The values of the categorical. If `categories` are given, values not in + `categories` will be replaced with NaN. + categories : index-like, optional + The categories for the categorical. Items need to be unique. + If the categories are not given here (and also not in `dtype`), they + will be inferred from the `data`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + + .. versionadded:: 0.21.0 + copy : bool, default False + Make a copy of input ndarray. + name : object, optional + Name to be stored in the index. 
+ + Attributes + ---------- + codes + categories + ordered + + Methods + ------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + as_ordered + as_unordered + map + + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. + + See Also + -------- + Index : The base pandas Index type. + Categorical : A categorical array. + CategoricalDtype : Type for categorical data. + + Notes + ----- + See the `user guide + `_ + for more. + + Examples + -------- + >>> pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + ``CategoricalIndex`` can also be instantiated from a ``Categorical``: + + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> pd.CategoricalIndex(c) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + Ordered ``CategoricalIndex`` can have a min and max value. + + >>> ci = pd.CategoricalIndex(['a','b','c','a','b','c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> ci + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['c', 'b', 'a'], ordered=True, dtype='category') # noqa + >>> ci.min() + 'c' + """ + + _typ = "categoricalindex" + + _raw_inherit = { + "argsort", + "_internal_get_values", + "tolist", + "codes", + "categories", + "ordered", + "_reverse_indexer", + "searchsorted", + } + + codes: np.ndarray + categories: Index + + @property + def _engine_type(self): + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). 
+ return { + np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] + + _attributes = ["name"] + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None + ): + + dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) + + name = maybe_extract_name(name, data, cls) + + if not is_categorical_dtype(data): + # don't allow scalars + # if data is None, then categories must be provided + if is_scalar(data): + if data is not None or categories is None: + raise cls._scalar_data_error(data) + data = [] + + data = cls._create_categorical(data, dtype=dtype) + + data = data.copy() if copy else data + + return cls._simple_new(data, name=name) + + def _create_from_codes(self, codes, dtype=None, name=None): + """ + *this is an internal non-public method* + + create the correct categorical from codes + + Parameters + ---------- + codes : new codes + dtype: CategoricalDtype, defaults to existing + name : optional name attribute, defaults to existing + + Returns + ------- + CategoricalIndex + """ + + if dtype is None: + dtype = self.dtype + if name is None: + name = self.name + cat = Categorical.from_codes(codes, dtype=dtype) + return CategoricalIndex(cat, name=name) + + @classmethod + def _create_categorical(cls, data, dtype=None): + """ + *this is an internal non-public method* + + create the correct categorical from data and the properties + + Parameters + ---------- + data : data for new Categorical + dtype : CategoricalDtype, defaults to existing + + Returns + ------- + Categorical + """ + if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data): + data = data.values + + if not isinstance(data, ABCCategorical): + return Categorical(data, dtype=dtype) + + if isinstance(dtype, CategoricalDtype) and dtype != data.dtype: + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) + return data + + @classmethod + def _simple_new(cls, values, name=None, dtype=None): + result = object.__new__(cls) + + values = cls._create_categorical(values, dtype=dtype) + result._data = values + result.name = name + + result._reset_identity() + result._no_setting_name = False + return result + + # -------------------------------------------------------------------- + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, values=None, dtype=None, **kwargs): + if dtype is None: + dtype = self.dtype + return super()._shallow_copy(values=values, dtype=dtype, **kwargs) + + def _is_dtype_compat(self, other) -> bool: + """ + *this is an internal non-public method* + + provide a comparison between the dtype of self and other (coercing if + needed) + + Raises + ------ + TypeError if the dtypes are not compatible + """ + if is_categorical_dtype(other): + if isinstance(other, CategoricalIndex): + other = other._values + if not other.is_dtype_equal(self): + raise TypeError( + "categories must match existing categories when appending" + ) + else: + values = other + if not is_list_like(values): + values = [values] + other = CategoricalIndex(self._create_categorical(other, dtype=self.dtype)) + if not other.isin(values).all(): + raise TypeError( + "cannot append a non-category item to a CategoricalIndex" + ) + + return other + + def equals(self, other): + """ + Determine if two CategoricalIndex objects contain the same elements. 
+ + Returns + ------- + bool + If two CategoricalIndex objects have equal elements True, + otherwise False. + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + try: + other = self._is_dtype_compat(other) + if isinstance(other, type(self)): + other = other._data + return self._data.equals(other) + except (TypeError, ValueError): + pass + + return False + + # -------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + return self.categories._formatter_func + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) + attrs = [ + ( + "categories", + ibase.default_pprint(self.categories, max_seq_items=max_categories), + ), + ("ordered", self.ordered), + ] + if self.name is not None: + attrs.append(("name", ibase.default_pprint(self.name))) + attrs.append(("dtype", f"'{self.dtype.name}'")) + max_seq_items = get_option("display.max_seq_items") or len(self) + if len(self) > max_seq_items: + attrs.append(("length", len(self))) + return attrs + + # -------------------------------------------------------------------- + + @property + def inferred_type(self) -> str: + return "categorical" + + @property + def values(self): + """ return the underlying data, which is a Categorical """ + return self._data + + @property + def _has_complex_internals(self): + # used to avoid libreduction code paths, which raise or require conversion + return True + + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) + # We use _shallow_copy rather than the Index implementation + # (which uses _constructor) in order to preserve dtype. + return self._shallow_copy(result, name=name) + + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) + def __contains__(self, key) -> bool: + # if key is a NaN, check if any NaN is in self. + if is_scalar(key) and isna(key): + return self.hasnans + + return contains(self, key, container=self._engine) + + def __array__(self, dtype=None) -> np.ndarray: + """ the array interface, return my values """ + return np.array(self._data, dtype=dtype) + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + from pandas import IntervalIndex + + return IntervalIndex(np.array(self)) + elif is_categorical_dtype(dtype): + # GH 18630 + dtype = self.dtype.update_dtype(dtype) + if dtype == self.dtype: + return self.copy() if copy else self + + return Index.astype(self, dtype=dtype, copy=copy) + + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + return self._data.codes == -1 + + @Appender(ibase._index_shared_docs["fillna"]) + def fillna(self, value, downcast=None): + self._assert_can_do_op(value) + return CategoricalIndex(self._data.fillna(value), name=self.name) + + @cache_readonly + def _engine(self): + # we are going to look things up with the codes themselves. + # To avoid a reference cycle, bind `codes` to a local variable, so + # `self` is not passed into the lambda. 
+ codes = self.codes + return self._engine_type(lambda: codes, len(self)) + + # introspection + @cache_readonly + def is_unique(self) -> bool: + return self._engine.is_unique + + @property + def is_monotonic_increasing(self): + return self._engine.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self) -> bool: + return self._engine.is_monotonic_decreasing + + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = self.values.unique() + # CategoricalIndex._shallow_copy keeps original dtype + # if not otherwise specified + return self._shallow_copy(result, dtype=result.dtype) + + @Appender(Index.duplicated.__doc__) + def duplicated(self, keep="first"): + codes = self.codes.astype("i8") + return duplicated_int64(codes, keep) + + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.astype("object") + + def get_loc(self, key, method=None): + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None} + * default: exact matches only. + + Returns + ------- + loc : int if unique index, slice if monotonic index, else mask + + Raises + ------ + KeyError : if the key is not in the index + + Examples + -------- + >>> unique_index = pd.CategoricalIndex(list('abc')) + >>> unique_index.get_loc('b') + 1 + + >>> monotonic_index = pd.CategoricalIndex(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + + >>> non_monotonic_index = pd.CategoricalIndex(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True], dtype=bool) + """ + code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) + try: + return self._engine.get_loc(code) + except KeyError: + raise KeyError(key) + + def get_value(self, series: AnyArrayLike, key: Any): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + + Parameters + ---------- + series : Series, ExtensionArray, Index, or ndarray + 1-dimensional array to take values from + key: : scalar + The value of this index at the position of the desired value, + otherwise the positional index of the desired value + + Returns + ------- + Any + The element of the series at the position indicated by the key + """ + try: + k = com.values_from_object(key) + k = self._convert_scalar_indexer(k, kind="getitem") + indexer = self.get_loc(k) + return series.take([indexer])[0] + except (KeyError, TypeError): + pass + + # we might be a positional inexer + return super().get_value(series, key) + + @Appender(_index_shared_docs["where"]) + def where(self, cond, other=None): + # TODO: Investigate an alternative implementation with + # 1. copy the underlying Categorical + # 2. setitem with `cond` and `other` + # 3. Rebuild CategoricalIndex. 
+ if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + cat = Categorical(values, dtype=self.dtype) + return self._shallow_copy(cat, **self._get_attributes_dict()) + + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + if method is not None: + raise NotImplementedError( + "argument method is not implemented for CategoricalIndex.reindex" + ) + if level is not None: + raise NotImplementedError( + "argument level is not implemented for CategoricalIndex.reindex" + ) + if limit is not None: + raise NotImplementedError( + "argument limit is not implemented for CategoricalIndex.reindex" + ) + + target = ibase.ensure_index(target) + + missing: List[int] + if self.equals(target): + indexer = None + missing = [] + else: + indexer, missing = self.get_indexer_non_unique(np.array(target)) + + if len(self.codes) and indexer is not None: + new_target = self.take(indexer) + else: + new_target = target + + # filling in missing if needed + if len(missing): + cats = self.categories.get_indexer(target) + + if (cats == -1).any(): + # coerce to a regular index here! + result = Index(np.array(self), name=self.name) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) + else: + + codes = new_target.codes.copy() + codes[indexer == -1] = cats[missing] + new_target = self._create_from_codes(codes) + + # we always want to return an Index type here + # to be consistent with .reindex for other index types (e.g. they don't + # coerce based on the actual values, only on the dtype) + # unless we had an initial Categorical to begin with + # in which case we are going to conform to the passed Categorical + new_target = np.asarray(new_target) + if is_categorical_dtype(target): + new_target = target._shallow_copy(new_target, name=self.name) + else: + new_target = Index(new_target, name=self.name) + + return new_target, indexer + + def _reindex_non_unique(self, target): + """ reindex from a non-unique; which CategoricalIndex's are almost + always + """ + new_target, indexer = self.reindex(target) + new_indexer = None + + check = indexer == -1 + if check.any(): + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[check] = -1 + + cats = self.categories.get_indexer(target) + if not (cats == -1).any(): + # .reindex returns normal Index. 
Revert to CategoricalIndex if + # all targets are included in my categories + new_target = self._shallow_copy(new_target) + + return new_target, indexer, new_indexer + + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ibase.ensure_index(target) + + if self.is_unique and self.equals(target): + return np.arange(len(self), dtype="intp") + + if method == "pad" or method == "backfill": + raise NotImplementedError( + "method='pad' and method='backfill' not " + "implemented yet for CategoricalIndex" + ) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet for CategoricalIndex" + ) + + if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target): + if self.values.equals(target.values): + # we have the same codes + codes = target.codes + else: + codes = _recode_for_categories( + target.codes, target.categories, self.values.categories + ) + else: + if isinstance(target, CategoricalIndex): + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) + + indexer, _ = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer) + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ibase.ensure_index(target) + + if isinstance(target, CategoricalIndex): + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + target = target.values + + codes = self.categories.get_indexer(target) + indexer, missing = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer), missing + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + if kind == "loc": + try: + return self.categories._convert_scalar_indexer(key, kind=kind) + except TypeError: + self._invalid_indexer("label", key) + return super()._convert_scalar_indexer(key, kind=kind) + + @Appender(_index_shared_docs["_convert_list_indexer"]) + def _convert_list_indexer(self, keyarr, kind=None): + # Return our indexer or raise if all of the values are not included in + # the categories + + if self.categories._defer_to_indexing: + indexer = self.categories._convert_list_indexer(keyarr, kind=kind) + return Index(self.codes).get_indexer_for(indexer) + + indexer = self.categories.get_indexer(np.asarray(keyarr)) + if (indexer == -1).any(): + raise KeyError( + "a list-indexer must only include values that are in the categories" + ) + + return self.get_indexer(keyarr) + + @Appender(_index_shared_docs["_convert_arr_indexer"]) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + + if self.categories._defer_to_indexing: + return keyarr + + return self._shallow_copy(keyarr) + + @Appender(_index_shared_docs["_convert_index_indexer"]) + def _convert_index_indexer(self, keyarr): + return self._shallow_copy(keyarr) + + def take_nd(self, *args, **kwargs): + """Alias for `take`""" + warnings.warn( + "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead", + FutureWarning, + stacklevel=2, + ) + return self.take(*args, **kwargs) + + 
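+    # Editorial illustration (added; not part of the upstream pandas source):
+    # a doctest-style sketch of `get_indexer` above; target values are mapped
+    # to positions in this index, with -1 for values not found among the
+    # categories (output approximate):
+    #   >>> ci = pd.CategoricalIndex(["a", "b", "c"])
+    #   >>> ci.get_indexer(["b", "c", "d"])
+    #   array([ 1,  2, -1])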
@Appender(_index_shared_docs["_maybe_cast_slice_bound"]) + def _maybe_cast_slice_bound(self, label, side, kind): + if kind == "loc": + return label + + return super()._maybe_cast_slice_bound(label, side, kind) + + def map(self, mapper): + """ + Map values using input correspondence (a dict, Series, or function). + + Maps the values (their categories, not the codes) of the index to new + categories. If the mapping correspondence is one-to-one the result is a + :class:`~pandas.CategoricalIndex` which has the same order property as + the original, otherwise an :class:`~pandas.Index` is returned. + + If a `dict` or :class:`~pandas.Series` is used any unmapped category is + mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` + will be returned. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + + Returns + ------- + pandas.CategoricalIndex or pandas.Index + Mapped index. + + See Also + -------- + Index.map : Apply a mapping correspondence on an + :class:`~pandas.Index`. + Series.map : Apply a mapping correspondence on a + :class:`~pandas.Series`. + Series.apply : Apply more complex functions on a + :class:`~pandas.Series`. + + Examples + -------- + >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) + >>> idx + CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], + ordered=False, dtype='category') + >>> idx.map(lambda x: x.upper()) + CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], + ordered=False, dtype='category') + >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) + CategoricalIndex(['first', 'second', 'third'], categories=['first', + 'second', 'third'], ordered=False, dtype='category') + + If the mapping is one-to-one the ordering of the categories is + preserved: + + >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) + >>> idx + CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], + ordered=True, dtype='category') + >>> idx.map({'a': 3, 'b': 2, 'c': 1}) + CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, + dtype='category') + + If the mapping is not one-to-one an :class:`~pandas.Index` is returned: + + >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'}) + Index(['first', 'second', 'first'], dtype='object') + + If a `dict` is used, all unmapped categories are mapped to `NaN` and + the result is an :class:`~pandas.Index`: + + >>> idx.map({'a': 'first', 'b': 'second'}) + Index(['first', 'second', nan], dtype='object') + """ + return self._shallow_copy_with_infer(self.values.map(mapper)) + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + return self._create_from_codes(np.delete(self.codes, loc)) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. 
Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not in the categories + + """ + code = self.categories.get_indexer([item]) + if (code == -1) and not (is_scalar(item) and isna(item)): + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) + + codes = self.codes + codes = np.concatenate((codes[:loc], code, codes[loc:])) + return self._create_from_codes(codes) + + def _concat(self, to_concat, name): + # if calling index is category, don't check dtype of others + return CategoricalIndex._concat_same_dtype(self, to_concat, name) + + def _concat_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class + ValueError if other is not in the categories + """ + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + result = self._create_from_codes(codes, name=name) + # if name is None, _create_from_codes sets self.name + result.name = name + return result + + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._values, name) + return prop # no wrapping for now + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the ._values """ + method = getattr(self._values, name) + if "inplace" in kwargs: + raise ValueError("cannot use inplace with CategoricalIndex") + res = method(*args, **kwargs) + if is_scalar(res) or name in self._raw_inherit: + return res + return CategoricalIndex(res, name=self.name) + + +CategoricalIndex._add_numeric_methods_add_sub_disabled() +CategoricalIndex._add_numeric_methods_disabled() +CategoricalIndex._add_logical_methods_disabled() diff --git a/venv/Lib/site-packages/pandas/core/indexes/datetimelike.py b/venv/Lib/site-packages/pandas/core/indexes/datetimelike.py new file mode 100644 index 0000000..c98b4f2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/datetimelike.py @@ -0,0 +1,913 @@ +""" +Base and utility classes for tseries type pandas objects. 
+""" +import operator +from typing import List, Optional, Set + +import numpy as np + +from pandas._libs import NaT, iNaT, join as libjoin, lib +from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import timezones +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_int64, + is_bool_dtype, + is_categorical_dtype, + is_dtype_equal, + is_float, + is_integer, + is_list_like, + is_period_dtype, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms +from pandas.core.accessor import PandasDelegate +from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.extension import ( + ExtensionIndex, + inherit_names, + make_wrapped_arith_op, +) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name +from pandas.core.tools.timedeltas import to_timedelta + +from pandas.tseries.frequencies import DateOffset, to_offset + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + + +def _join_i8_wrapper(joinf, with_indexers: bool = True): + """ + Create the join wrapper methods. + """ + + @staticmethod # type: ignore + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + left = left.view("i8") + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + right = right.view("i8") + + results = joinf(left, right) + if with_indexers: + # dtype should be timedelta64[ns] for TimedeltaIndex + # and datetime64[ns] for DatetimeIndex + dtype = left.dtype.base + + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results + + return wrapper + + +@inherit_names( + ["inferred_freq", "_isnan", "_resolution", "resolution"], + DatetimeLikeArrayMixin, + cache=True, +) +@inherit_names( + ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"], + DatetimeLikeArrayMixin, +) +class DatetimeIndexOpsMixin(ExtensionIndex): + """ + Common ops mixin to support a unified interface datetimelike Index. + """ + + _data: ExtensionArray + freq: Optional[DateOffset] + freqstr: Optional[str] + _resolution: int + _bool_ops: List[str] = [] + _field_ops: List[str] = [] + + hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore + _hasnans = hasnans # for index / array -agnostic code + + @property + def is_all_dates(self) -> bool: + return True + + # ------------------------------------------------------------------------ + # Abstract data attributes + + @property + def values(self): + # Note: PeriodArray overrides this to return an ndarray of objects. + return self._data._data + + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc. 
+ """ + result = lib.item_from_zerodim(result) + if is_bool_dtype(result) or lib.is_scalar(result): + return result + + attrs = self._get_attributes_dict() + if not is_period_dtype(self) and attrs["freq"]: + # no need to infer if freq is None + attrs["freq"] = "infer" + return Index(result, **attrs) + + # ------------------------------------------------------------------------ + + def equals(self, other) -> bool: + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if not isinstance(other, ABCIndexClass): + return False + elif not isinstance(other, type(self)): + try: + other = type(self)(other) + except (ValueError, TypeError, OverflowError): + # e.g. + # ValueError -> cannot parse str entry, or OutOfBoundsDatetime + # TypeError -> trying to convert IntervalIndex to DatetimeIndex + # OverflowError -> Index([very_large_timedeltas]) + return False + + if not is_dtype_equal(self.dtype, other.dtype): + # have different timezone + return False + + return np.array_equal(self.asi8, other.asi8) + + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) + def __contains__(self, key): + try: + res = self.get_loc(key) + return ( + is_scalar(res) + or isinstance(res, slice) + or (is_list_like(res) and len(res)) + ) + except (KeyError, TypeError, ValueError): + return False + + def sort_values(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index. + """ + if return_indexer: + _as = self.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + return sorted_index, _as + else: + # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # because the treatment of NaT has been changed to put NaT last + # instead of first. + sorted_values = np.sort(self.asi8) + attribs = self._get_attributes_dict() + freq = attribs["freq"] + + if freq is not None and not is_period_dtype(self): + if freq.n > 0 and not ascending: + freq = freq * -1 + elif freq.n < 0 and ascending: + freq = freq * -1 + attribs["freq"] = freq + + if not ascending: + sorted_values = sorted_values[::-1] + + return self._simple_new(sorted_values, **attribs) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_int64(indices) + + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + if isinstance(maybe_slice, slice): + return self[maybe_slice] + + return ExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs + ) + + _can_hold_na = True + + _na_value = NaT + """The expected NA value to use with this index.""" + + def _convert_tolerance(self, tolerance, target): + tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) + + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError("list-like tolerance size must match target index size") + return tolerance + + def tolist(self) -> List: + """ + Return a list of the underlying data. + """ + return list(self.astype(object)) + + def min(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the minimum value of the Index or minimum along + an axis. + + See Also + -------- + numpy.ndarray.min + Series.min : Return the minimum value in a Series. 
+ """ + nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + i8 = self.asi8 + try: + # quick check + if len(i8) and self.is_monotonic: + if i8[0] != iNaT: + return self._box_func(i8[0]) + + if self.hasnans: + if skipna: + min_stamp = self[~self._isnan].asi8.min() + else: + return self._na_value + else: + min_stamp = i8.min() + return self._box_func(min_stamp) + except ValueError: + return self._na_value + + def argmin(self, axis=None, skipna=True, *args, **kwargs): + """ + Returns the indices of the minimum values along an axis. + + See `numpy.ndarray.argmin` for more information on the + `axis` parameter. + + See Also + -------- + numpy.ndarray.argmin + """ + nv.validate_argmin(args, kwargs) + nv.validate_minmax_axis(axis) + + i8 = self.asi8 + if self.hasnans: + mask = self._isnan + if mask.all() or not skipna: + return -1 + i8 = i8.copy() + i8[mask] = np.iinfo("int64").max + return i8.argmin() + + def max(self, axis=None, skipna=True, *args, **kwargs): + """ + Return the maximum value of the Index or maximum along + an axis. + + See Also + -------- + numpy.ndarray.max + Series.max : Return the maximum value in a Series. + """ + nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + i8 = self.asi8 + try: + # quick check + if len(i8) and self.is_monotonic: + if i8[-1] != iNaT: + return self._box_func(i8[-1]) + + if self.hasnans: + if skipna: + max_stamp = self[~self._isnan].asi8.max() + else: + return self._na_value + else: + max_stamp = i8.max() + return self._box_func(max_stamp) + except ValueError: + return self._na_value + + def argmax(self, axis=None, skipna=True, *args, **kwargs): + """ + Returns the indices of the maximum values along an axis. + + See `numpy.ndarray.argmax` for more information on the + `axis` parameter. + + See Also + -------- + numpy.ndarray.argmax + """ + nv.validate_argmax(args, kwargs) + nv.validate_minmax_axis(axis) + + i8 = self.asi8 + if self.hasnans: + mask = self._isnan + if mask.all() or not skipna: + return -1 + i8 = i8.copy() + i8[mask] = 0 + return i8.argmax() + + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_with_header(self, header, na_rep="NaT", **kwargs): + return header + list(self._format_native_types(na_rep, **kwargs)) + + @property + def _formatter_func(self): + raise AbstractMethodError(self) + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value). + """ + attrs = super()._format_attrs() + for attrib in self._attributes: + if attrib == "freq": + freq = self.freqstr + if freq is not None: + freq = repr(freq) + attrs.append(("freq", freq)) + return attrs + + # -------------------------------------------------------------------- + + def _convert_scalar_indexer(self, key, kind=None): + """ + We don't allow integer or float indexing on datetime-like when using + loc. 
+ + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ + + assert kind in ["ix", "loc", "getitem", "iloc", None] + + # we don't allow integer/float indexing for loc + # we don't allow float indexing for ix/getitem + if is_scalar(key): + is_int = is_integer(key) + is_flt = is_float(key) + if kind in ["loc"] and (is_int or is_flt): + self._invalid_indexer("index", key) + elif kind in ["ix", "getitem"] and is_flt: + self._invalid_indexer("index", key) + + return super()._convert_scalar_indexer(key, kind=kind) + + __add__ = make_wrapped_arith_op("__add__") + __radd__ = make_wrapped_arith_op("__radd__") + __sub__ = make_wrapped_arith_op("__sub__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + __truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") + + def isin(self, values, level=None): + """ + Compute boolean array of whether each index value is found in the + passed set of values. + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + if level is not None: + self._validate_index_level(level) + + if not isinstance(values, type(self)): + try: + values = type(self)(values) + except ValueError: + return self.astype(object).isin(values) + + return algorithms.isin(self.asi8, values.asi8) + + @Appender(_index_shared_docs["where"] % _index_doc_kwargs) + def where(self, cond, other=None): + values = self.view("i8") + + if is_scalar(other) and isna(other): + other = NaT.value + + else: + # Do type inference if necessary up front + # e.g. we passed PeriodIndex.values and got an ndarray of Periods + other = Index(other) + + if is_categorical_dtype(other): + # e.g. we have a Categorical holding self.dtype + if needs_i8_conversion(other.categories): + other = other._internal_get_values() + + if not is_dtype_equal(self.dtype, other.dtype): + raise TypeError(f"Where requires matching dtype, not {other.dtype}") + + other = other.view("i8") + + result = np.where(cond, values, other).astype("i8") + return self._shallow_copy(result) + + def _summary(self, name=None): + """ + Return a summarized representation. + + Parameters + ---------- + name : str + Name to use in the summary representation. + + Returns + ------- + str + Summarized representation of the index. + """ + formatter = self._formatter_func + if len(self) > 0: + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + result = f"{name}: {len(self)} entries{index_summary}" + if self.freq: + result += f"\nFreq: {self.freqstr}" + + # display as values, not quoted + result = result.replace("'", "") + return result + + def _concat_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class. 
+ """ + attribs = self._get_attributes_dict() + attribs["name"] = name + # do not pass tz to set because tzlocal cannot be hashed + if len({str(x.dtype) for x in to_concat}) != 1: + raise ValueError("to_concat must have the same tz") + + new_data = type(self._values)._concat_same_type(to_concat).asi8 + + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 + if not is_period_dtype(self) and not is_diff_evenly_spaced: + # reset freq + attribs["freq"] = None + + return self._simple_new(new_data, **attribs) + + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + + .. versionchanged:: 0.24.0 + + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + + Returns + ------- + pandas.DatetimeIndex + Shifted index. + + See Also + -------- + Index.shift : Shift values of Index. + PeriodIndex.shift : Shift values of PeriodIndex. + """ + result = self._data._time_shift(periods, freq=freq) + return type(self)(result, name=self.name) + + # -------------------------------------------------------------------- + # List-like Methods + + def delete(self, loc): + new_i8s = np.delete(self.asi8, loc) + + freq = None + if is_period_dtype(self): + freq = self.freq + elif is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq + + return self._shallow_copy(new_i8s, freq=freq) + + +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex + """ + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + def _set_freq(self, freq): + """ + Set the _freq attribute on our underlying DatetimeArray. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaIndex case, we assume this + # is a Tick offset. 
+ pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + self._data._freq = freq + + def _shallow_copy(self, values=None, **kwargs): + if values is None: + values = self._data + if isinstance(values, type(self)): + values = values._data + + attributes = self._get_attributes_dict() + + if "freq" not in kwargs and self.freq is not None: + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.freq is None: + del attributes["freq"] + + attributes.update(kwargs) + return self._simple_new(values, **attributes) + + # -------------------------------------------------------------------- + # Set Operation Methods + + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx._set_freq(None) + return new_idx + + def intersection(self, other, sort=False): + """ + Specialized intersection for DatetimeIndex/TimedeltaIndex. + + May be much faster than Index.intersection + + Parameters + ---------- + other : Same type as self or array-like + sort : False or None, default False + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + .. versionchanged:: 0.25.0 + + The `sort` keyword is added + + Returns + ------- + y : Index or same type as self + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if len(self) == 0: + return self.copy() + if len(other) == 0: + return other.copy() + + if not isinstance(other, type(self)): + result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + result._set_freq("infer") + return result + + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.is_anchored() + or (not self.is_monotonic or not other.is_monotonic) + ): + result = Index.intersection(self, other, sort=sort) + + # Invalidate the freq of `result`, which may not be correct at + # this point, depending on the values. 
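+            # Editorial illustration (added; not part of the upstream pandas
+            # source): this branch is reached, for example, when the two
+            # indexes have different frequencies; the overlapping values are
+            # still found via Index.intersection and the freq is re-inferred
+            # afterwards:
+            #   >>> a = pd.date_range("2020-01-01", periods=5, freq="D")
+            #   >>> b = pd.date_range("2020-01-03", periods=5, freq="2D")
+            #   >>> a.intersection(b)  # contains 2020-01-03 and 2020-01-05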
+ + result._set_freq(None) + result = self._shallow_copy( + result._data, name=result.name, dtype=result.dtype, freq=None + ) + if result.freq is None: + result._set_freq("infer") + return result + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _can_fast_union(self, other) -> bool: + if not isinstance(other, type(self)): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + freq) or right_start in left + except ValueError: + # if we are comparing a freq that does not propagate timezones + # this will raise + return False + + def _fast_union(self, other, sort=None): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + elif sort is False: + # TDIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side="left") + right_chunk = right.values[:loc] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + left, right = other, self + + left_end = left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side="right") + right_chunk = right.values[loc:] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + return left + + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other, sort=sort) + else: + result = Index._union(this, other, sort=sort) + if isinstance(result, type(self)): + assert result._data.dtype == this.dtype + if result.freq is None: + result._set_freq("infer") + return result + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) + _left_indexer_unique = _join_i8_wrapper( + libjoin.left_join_indexer_unique, with_indexers=False + ) + + def join( + self, other, how: str = "left", level=None, return_indexers=False, sort=False + ): + """ + See Index.join + """ + if self._is_convertible_to_index_for_join(other): + try: + 
other = type(self)(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other + + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if ( + isinstance(other, type(self)) + and self.freq == other.freq + and self._can_fast_union(other) + ): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + kwargs = {} + if hasattr(self, "tz"): + kwargs["tz"] = getattr(other, "tz", None) + return self._simple_new(joined, name, **kwargs) + + +class DatetimelikeDelegateMixin(PandasDelegate): + """ + Delegation mechanism, specific for Datetime, Timedelta, and Period types. + + Functionality is delegated from the Index class to an Array class. A + few things can be customized + + * _delegated_methods, delegated_properties : List + The list of property / method names being delagated. 
+    * raw_methods : Set
+        The set of methods whose results should *not* be
+        boxed in an index, after being returned from the array
+    * raw_properties : Set
+        The set of properties whose results should *not* be
+        boxed in an index, after being returned from the array
+    """
+
+    # raw_methods : dispatch methods that shouldn't be boxed in an Index
+    _raw_methods: Set[str] = set()
+    # raw_properties : dispatch properties that shouldn't be boxed in an Index
+    _raw_properties: Set[str] = set()
+    _data: ExtensionArray
+
+    def _delegate_property_get(self, name, *args, **kwargs):
+        result = getattr(self._data, name)
+        if name not in self._raw_properties:
+            result = Index(result, name=self.name)
+        return result
+
+    def _delegate_property_set(self, name, value, *args, **kwargs):
+        setattr(self._data, name, value)
+
+    def _delegate_method(self, name, *args, **kwargs):
+        result = operator.methodcaller(name, *args, **kwargs)(self._data)
+        if name not in self._raw_methods:
+            result = Index(result, name=self.name)
+        return result
diff --git a/venv/Lib/site-packages/pandas/core/indexes/datetimes.py b/venv/Lib/site-packages/pandas/core/indexes/datetimes.py
new file mode 100644
index 0000000..2241921
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/core/indexes/datetimes.py
@@ -0,0 +1,1288 @@
+from datetime import datetime, time, timedelta, tzinfo
+import operator
+from typing import Optional
+import warnings
+
+import numpy as np
+
+from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts
+from pandas._libs.tslibs import ccalendar, fields, parsing, timezones
+from pandas.util._decorators import Appender, Substitution, cache_readonly
+
+from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna
+
+from pandas.core.accessor import delegate_names
+from pandas.core.arrays.datetimes import (
+    DatetimeArray,
+    tz_to_dtype,
+    validate_tz_from_dtype,
+)
+from pandas.core.base import _shared_docs
+import pandas.core.common as com
+from pandas.core.indexes.base import Index, maybe_extract_name
+from pandas.core.indexes.datetimelike import (
+    DatetimelikeDelegateMixin,
+    DatetimeTimedeltaMixin,
+)
+from pandas.core.indexes.extension import inherit_names
+from pandas.core.ops import get_op_result_name
+import pandas.core.tools.datetimes as tools
+
+from pandas.tseries.frequencies import Resolution, to_offset
+from pandas.tseries.offsets import Nano, prefix_mapping
+
+
+def _new_DatetimeIndex(cls, d):
+    """
+    This is called upon unpickling, rather than the default which doesn't
+    have arguments and breaks __new__
+    """
+    if "data" in d and not isinstance(d["data"], DatetimeIndex):
+        # Avoid need to verify integrity by calling simple_new directly
+        data = d.pop("data")
+        result = cls._simple_new(data, **d)
+    else:
+        with warnings.catch_warnings():
+            # TODO: If we knew what was going in to **d, we might be able to
+            # go through _simple_new instead
+            warnings.simplefilter("ignore")
+            result = cls.__new__(cls, **d)
+
+    return result
+
+
+class DatetimeDelegateMixin(DatetimelikeDelegateMixin):
+    # Most attrs are dispatched via datetimelike_{ops,methods}
+    # Some are "raw" methods, the result is not re-boxed in an Index
+    # We also have a few "extra" attrs, which may or may not be raw,
+    # which we don't want to expose in the .dt accessor.
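+    # Editorial illustration (added; not part of the upstream pandas source):
+    # "delegation" here means that attribute access on the DatetimeIndex is
+    # forwarded to the underlying DatetimeArray; non-"raw" results are boxed
+    # back into an Index, while "raw" ones are returned as-is (reprs are
+    # approximate for the pandas version vendored in this diff):
+    #   >>> idx = pd.date_range("2020-01-01", periods=2)
+    #   >>> idx.month  # delegated property, boxed into an Index
+    #   Int64Index([1, 1], dtype='int64')
+    #   >>> idx.date   # "raw" property, returned as a plain ndarray
+    #   array([datetime.date(2020, 1, 1), datetime.date(2020, 1, 2)], dtype=object)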
+ _extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] + _extra_raw_methods = [ + "to_pydatetime", + "_local_timestamps", + "_has_same_tz", + "_format_native_types", + "__iter__", + ] + _extra_raw_properties = ["_box_func", "tz", "tzinfo", "dtype"] + _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties + _delegated_methods = ( + DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods + ) + _raw_properties = ( + {"date", "time", "timetz"} + | set(DatetimeArray._bool_ops) + | set(_extra_raw_properties) + ) + _raw_methods = set(_extra_raw_methods) + + +@inherit_names(["_timezone", "is_normalized", "_resolution"], DatetimeArray, cache=True) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + ], + DatetimeArray, +) +@delegate_names( + DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + DatetimeArray, + DatetimeDelegateMixin._delegated_methods, + typ="method", + overwrite=True, +) +class DatetimeIndex(DatetimeTimedeltaMixin, DatetimeDelegateMixin): + """ + Immutable ndarray of datetime64 data, represented internally as int64, and + which can be boxed to Timestamp objects that are subclasses of datetime and + carry metadata such as frequency information. + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional datetime-like data to construct index with. + copy : bool + Make a copy of input ndarray. + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation. + tz : pytz.timezone or dateutil.tz.tzfile + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for ambiguous + times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + name : object + Name to be stored in the index. + dayfirst : bool, default False + If True, parse dates in `data` with the day first order. + yearfirst : bool, default False + If True parse dates in `data` with the year first order. + + Attributes + ---------- + year + month + day + hour + minute + second + microsecond + nanosecond + date + time + timetz + dayofyear + weekofyear + week + dayofweek + weekday + quarter + tz + freq + freqstr + is_month_start + is_month_end + is_quarter_start + is_quarter_end + is_year_start + is_year_end + is_leap_year + inferred_freq + + Methods + ------- + normalize + strftime + snap + tz_convert + tz_localize + round + floor + ceil + to_period + to_perioddelta + to_pydatetime + to_series + to_frame + month_name + day_name + mean + + See Also + -------- + Index : The base pandas Index type. + TimedeltaIndex : Index of timedelta64 data. + PeriodIndex : Index of Period data. + to_datetime : Convert argument to datetime. 
+ date_range : Create a fixed-frequency DatetimeIndex. + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + """ + + _typ = "datetimeindex" + + _engine_type = libindex.DatetimeEngine + _supports_partial_string_indexing = True + + _comparables = ["name", "freqstr", "tz"] + _attributes = ["name", "tz", "freq"] + + _is_numeric_dtype = False + _infer_as_myclass = True + + tz: Optional[tzinfo] + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + freq=None, + tz=None, + normalize=False, + closed=None, + ambiguous="raise", + dayfirst=False, + yearfirst=False, + dtype=None, + copy=False, + name=None, + ): + + if is_scalar(data): + raise TypeError( + f"{cls.__name__}() must be called with a " + f"collection of some kind, {repr(data)} was passed" + ) + + # - Cases checked above all return/raise before reaching here - # + + name = maybe_extract_name(name, data, cls) + + dtarr = DatetimeArray._from_sequence( + data, + dtype=dtype, + copy=copy, + tz=tz, + freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + ) + + subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) + return subarr + + @classmethod + def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): + """ + We require the we have a dtype compat for the values + if we are passed a non-dtype compat, then coerce using the constructor + """ + if isinstance(values, DatetimeArray): + if tz: + tz = validate_tz_from_dtype(dtype, tz) + dtype = DatetimeTZDtype(tz=tz) + elif dtype is None: + dtype = _NS_DTYPE + + values = DatetimeArray(values, freq=freq, dtype=dtype) + tz = values.tz + freq = values.freq + values = values._data + + # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes + if isinstance(values, DatetimeIndex): + values = values._data + + dtype = tz_to_dtype(tz) + dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) + assert isinstance(dtarr, DatetimeArray) + + result = object.__new__(cls) + result._data = dtarr + result.name = name + result._no_setting_name = False + # For groupby perf. See note in indexes/base about _index_data + result._index_data = dtarr._data + result._reset_identity() + return result + + # -------------------------------------------------------------------- + + def __array__(self, dtype=None) -> np.ndarray: + return np.asarray(self._data, dtype=dtype) + + @cache_readonly + def _is_dates_only(self) -> bool: + """ + Return a boolean if we are only dates (and don't have a timezone) + + Returns + ------- + bool + """ + from pandas.io.formats.format import _is_dates_only + + return _is_dates_only(self.values) and self.tz is None + + def __reduce__(self): + + # we use a special reduce here because we need + # to simply set the .tz (and not reinterpret it) + + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_DatetimeIndex, (type(self), d), None + + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. 
+ """ + if self._has_same_tz(value): + return Timestamp(value).asm8 + raise ValueError("Passed item and index have different timezone") + + # -------------------------------------------------------------------- + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return libts.ints_to_pydatetime(self.asi8, self.tz) + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_datetime64 + + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: f"'{formatter(x, tz=self.tz)}'" + + # -------------------------------------------------------------------- + # Set Operation Methods + + def union_many(self, others): + """ + A bit of a hack to accelerate unioning a collection of indexes. + """ + this = self + + for other in others: + if not isinstance(this, DatetimeIndex): + this = Index.union(this, other) + continue + + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + + this, other = this._maybe_utc_convert(other) + + if this._can_fast_union(other): + this = this._fast_union(other) + else: + dtype = this.dtype + this = Index.union(this, other) + if isinstance(this, DatetimeIndex): + # TODO: we shouldn't be setting attributes like this; + # in all the tests this equality already holds + this._data._dtype = dtype + return this + + def _wrap_setop_result(self, other, result): + name = get_op_result_name(self, other) + return self._shallow_copy(result, name=name, freq=None, tz=self.tz) + + # -------------------------------------------------------------------- + + def _get_time_micros(self): + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._data._local_timestamps() + return fields.get_time_micros(values) + + def to_series(self, keep_tz=lib.no_default, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. + + Parameters + ---------- + keep_tz : optional, defaults True + Return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set, the resulting + Series will have a datetime64[ns] dtype. + + Otherwise the Series will have an datetime64[ns, tz] dtype; the + tz will be preserved. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. TZ aware + objects will have the tz removed. + + .. versionchanged:: 1.0.0 + The default value is now True. In a future version, + this keyword will be removed entirely. Stop passing the + argument to obtain the future behavior and silence the warning. + + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + """ + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + if keep_tz is not lib.no_default: + if keep_tz: + warnings.warn( + "The 'keep_tz' keyword in DatetimeIndex.to_series " + "is deprecated and will be removed in a future version. " + "You can stop passing 'keep_tz' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + else: + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. 
If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) + else: + keep_tz = True + + if keep_tz and self.tz is not None: + # preserve the tz & copy + values = self.copy(deep=True) + else: + values = self.values.copy() + + return Series(values, index=index, name=name) + + def snap(self, freq="S"): + """ + Snap time stamps to nearest occurring frequency. + + Returns + ------- + DatetimeIndex + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.is_on_offset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) + + def _parsed_string_to_bounds(self, reso, parsed): + """ + Calculate datetime bounds for parsed time string and its resolution. + + Parameters + ---------- + reso : Resolution + Resolution provided by parsed string. + parsed : datetime + Datetime from parsed string. + + Returns + ------- + lower, upper: pd.Timestamp + + """ + valid_resos = { + "year", + "month", + "quarter", + "day", + "hour", + "minute", + "second", + "minute", + "second", + "microsecond", + } + if reso not in valid_resos: + raise KeyError + if reso == "year": + start = Timestamp(parsed.year, 1, 1) + end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) + elif reso == "month": + d = ccalendar.get_days_in_month(parsed.year, parsed.month) + start = Timestamp(parsed.year, parsed.month, 1) + end = Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999) + elif reso == "quarter": + qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead + d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month + start = Timestamp(parsed.year, parsed.month, 1) + end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) + elif reso == "day": + start = Timestamp(parsed.year, parsed.month, parsed.day) + end = start + timedelta(days=1) - Nano(1) + elif reso == "hour": + start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) + end = start + timedelta(hours=1) - Nano(1) + elif reso == "minute": + start = Timestamp( + parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute + ) + end = start + timedelta(minutes=1) - Nano(1) + elif reso == "second": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + ) + end = start + timedelta(seconds=1) - Nano(1) + elif reso == "microsecond": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + parsed.microsecond, + ) + end = start + timedelta(microseconds=1) - Nano(1) + # GH 24076 + # If an incoming date string contained a UTC offset, need to localize + # the parsed date to this offset first before aligning with the index's + # timezone + if parsed.tzinfo is not None: + if self.tz is None: + raise ValueError( + "The index must be timezone aware when indexing " + "with a date string with a UTC offset" + ) + start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) + end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) + elif self.tz is not None: + start = start.tz_localize(self.tz) + end = end.tz_localize(self.tz) + return start, end + + def _partial_date_slice( + self, reso: str, parsed, use_lhs: bool = True, 
use_rhs: bool = True + ): + """ + Parameters + ---------- + reso : str + use_lhs : bool, default True + use_rhs : bool, default True + """ + is_monotonic = self.is_monotonic + if ( + is_monotonic + and reso in ["day", "hour", "minute", "second"] + and self._resolution >= Resolution.get_reso(reso) + ): + # These resolution/monotonicity validations came from GH3931, + # GH3452 and GH2369. + + # See also GH14826 + raise KeyError + + if reso == "microsecond": + # _partial_date_slice doesn't allow microsecond resolution, but + # _parsed_string_to_bounds allows it. + raise KeyError + + t1, t2 = self._parsed_string_to_bounds(reso, parsed) + stamps = self.asi8 + + if is_monotonic: + + # we are out of range + if len(stamps) and ( + (use_lhs and t1.value < stamps[0] and t2.value < stamps[0]) + or ((use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1])) + ): + raise KeyError + + # a monotonic (sorted) series can be sliced + left = stamps.searchsorted(t1.value, side="left") if use_lhs else None + right = stamps.searchsorted(t2.value, side="right") if use_rhs else None + + return slice(left, right) + + lhs_mask = (stamps >= t1.value) if use_lhs else True + rhs_mask = (stamps <= t2.value) if use_rhs else True + + # try to find a the dates + return (lhs_mask & rhs_mask).nonzero()[0] + + def _maybe_promote(self, other): + if other.inferred_type == "date": + other = DatetimeIndex(other) + return self, other + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + + if isinstance(key, (datetime, np.datetime64)): + return self.get_value_maybe_box(series, key) + + if isinstance(key, time): + locs = self.indexer_at_time(key) + return series.take(locs) + + try: + value = Index.get_value(self, series, key) + except KeyError: + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError, KeyError): + pass + + try: + return self.get_value_maybe_box(series, key) + except (TypeError, ValueError, KeyError): + raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) + + def get_value_maybe_box(self, series, key): + # needed to localize naive datetimes + if self.tz is not None: + key = Timestamp(key) + if key.tzinfo is not None: + key = key.tz_convert(self.tz) + else: + key = key.tz_localize(self.tz) + elif not isinstance(key, Timestamp): + key = Timestamp(key) + values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) + return com.maybe_box(self, values, series, key) + + def get_loc(self, key, method=None, tolerance=None): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + + if tolerance is not None: + # try converting tolerance now, so errors don't get swallowed by + # the try/except clauses below + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + + if isinstance(key, datetime): + # needed to localize naive datetimes + if key.tzinfo is None: + key = Timestamp(key, tz=self.tz) + else: + key = Timestamp(key).tz_convert(self.tz) + return Index.get_loc(self, key, method, tolerance) + + elif isinstance(key, timedelta): + # GH#20464 + raise TypeError( + f"Cannot index {type(self).__name__} with {type(key).__name__}" + ) + + if isinstance(key, time): + if method is not None: + raise NotImplementedError( + "cannot yet lookup inexact labels when key is a time object" + ) + return self.indexer_at_time(key) + + try: + return Index.get_loc(self, key, method, tolerance) + except (KeyError, 
ValueError, TypeError): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass + + try: + stamp = Timestamp(key) + if stamp.tzinfo is not None and self.tz is not None: + stamp = stamp.tz_convert(self.tz) + else: + stamp = stamp.tz_localize(self.tz) + return Index.get_loc(self, stamp, method, tolerance) + except KeyError: + raise KeyError(key) + except ValueError as e: + # list-like tolerance size must match target index size + if "list-like" in str(e): + raise e + raise KeyError(key) + + def _maybe_cast_slice_bound(self, label, side, kind): + """ + If label is a string, cast it to datetime according to resolution. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + label : object + + Notes + ----- + Value of `side` parameter should be validated in caller. + """ + assert kind in ["ix", "loc", "getitem", None] + + if is_float(label) or isinstance(label, time) or is_integer(label): + self._invalid_indexer("slice", label) + + if isinstance(label, str): + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) + _, parsed, reso = parsing.parse_time_string(label, freq) + lower, upper = self._parsed_string_to_bounds(reso, parsed) + # lower, upper form the half-open interval: + # [parsed, parsed + 1 freq) + # because label may be passed to searchsorted + # the bounds need swapped if index is reverse sorted and has a + # length > 1 (is_monotonic_decreasing gives True for empty + # and length 1 index) + if self._is_strictly_monotonic_decreasing and len(self) > 1: + return upper if side == "left" else lower + return lower if side == "left" else upper + else: + return label + + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) + _, parsed, reso = parsing.parse_time_string(key, freq) + loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) + return loc + + def slice_indexer(self, start=None, end=None, step=None, kind=None): + """ + Return indexer for specified label slice. + Index.slice_indexer, customized to handle time slicing. + + In addition to functionality provided by Index.slice_indexer, does the + following: + + - if both `start` and `end` are instances of `datetime.time`, it + invokes `indexer_between_time` + - if `start` and `end` are both either string or None perform + value-based selection in non-monotonic cases. + + """ + # For historical reasons DatetimeIndex supports slices between two + # instances of datetime.time as if it were applying a slice mask to + # an array of (self.hour, self.minute, self.seconds, self.microsecond). + if isinstance(start, time) and isinstance(end, time): + if step is not None and step != 1: + raise ValueError("Must have step size of 1 with time slices") + return self.indexer_between_time(start, end) + + if isinstance(start, time) or isinstance(end, time): + raise KeyError("Cannot mix time and non-time slice keys") + + try: + return Index.slice_indexer(self, start, end, step, kind=kind) + except KeyError: + # For historical reasons DatetimeIndex by default supports + # value-based partial (aka string) slices on non-monotonic arrays, + # let's try that. 
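+            # Editorial illustration (added; not part of the upstream pandas
+            # source): string bounds in a label slice are parsed to datetimes,
+            # and the fallback below extends this to non-monotonic indexes.
+            # A simple (monotonic) example of such a slice:
+            #   >>> ser = pd.Series(range(3), index=pd.to_datetime(
+            #   ...     ["2020-01-30", "2020-01-31", "2020-02-01"]))
+            #   >>> ser.loc["2020-01-30":"2020-01-31"]  # first two rows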
+ if (start is None or isinstance(start, str)) and ( + end is None or isinstance(end, str) + ): + mask = True + if start is not None: + start_casted = self._maybe_cast_slice_bound(start, "left", kind) + mask = start_casted <= self + + if end is not None: + end_casted = self._maybe_cast_slice_bound(end, "right", kind) + mask = (self <= end_casted) & mask + + indexer = mask.nonzero()[0][::step] + if len(indexer) == len(self): + return slice(None) + else: + return indexer + else: + raise + + # -------------------------------------------------------------------- + + @Substitution(klass="DatetimeIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, (np.ndarray, Index)): + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self._data)(value) + self._data._check_compatible_with(value) + + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) + + elif not isinstance(value, DatetimeArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + + return self._data.searchsorted(value, side=side) + + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "datetime" + + @property + def inferred_type(self) -> str: + # b/c datetime is represented as microseconds since the epoch, make + # sure we can't have ambiguous indexing + return "datetime64" + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. + + Returns + ------- + new_index : Index + """ + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): + # GH 18295 + item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. timedeltat64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) + + # check freq can be preserved on edge cases + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = item.asm8 + + try: + new_i8s = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) + return self._shallow_copy(new_i8s, freq=freq) + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item, str): + return self.astype(object).insert(loc, item) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + def indexer_at_time(self, time, asof=False): + """ + Return index locations of index values at particular time of day + (e.g. 9:30AM). + + Parameters + ---------- + time : datetime.time or str + datetime.time or string in appropriate format ("%H:%M", "%H%M", + "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p"). 
+ + Returns + ------- + values_at_time : array of integers + + See Also + -------- + indexer_between_time, DataFrame.at_time + """ + if asof: + raise NotImplementedError("'asof' argument is not supported") + + if isinstance(time, str): + from dateutil.parser import parse + + time = parse(time).time() + + if time.tzinfo: + if self.tz is None: + raise ValueError("Index must be timezone aware.") + time_micros = self.tz_convert(time.tzinfo)._get_time_micros() + else: + time_micros = self._get_time_micros() + micros = _time_to_micros(time) + return (micros == time_micros).nonzero()[0] + + def indexer_between_time( + self, start_time, end_time, include_start=True, include_end=True + ): + """ + Return index locations of values between particular times of day + (e.g., 9:00-9:30AM). + + Parameters + ---------- + start_time, end_time : datetime.time, str + datetime.time or string in appropriate format ("%H:%M", "%H%M", + "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p"). + include_start : bool, default True + include_end : bool, default True + + Returns + ------- + values_between_time : array of integers + + See Also + -------- + indexer_at_time, DataFrame.between_time + """ + start_time = tools.to_time(start_time) + end_time = tools.to_time(end_time) + time_micros = self._get_time_micros() + start_micros = _time_to_micros(start_time) + end_micros = _time_to_micros(end_time) + + if include_start and include_end: + lop = rop = operator.le + elif include_start: + lop = operator.le + rop = operator.lt + elif include_end: + lop = operator.lt + rop = operator.le + else: + lop = rop = operator.lt + + if start_time <= end_time: + join_op = operator.and_ + else: + join_op = operator.or_ + + mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros)) + + return mask.nonzero()[0] + + +DatetimeIndex._add_numeric_methods_disabled() +DatetimeIndex._add_logical_methods_disabled() + + +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=None, + **kwargs, +) -> DatetimeIndex: + """ + Return a fixed frequency DatetimeIndex. + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + end : str or datetime-like, optional + Right bound for generating dates. + periods : int, optional + Number of periods to generate. + freq : str or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5H'. See + :ref:`here ` for a list of + frequency aliases. + tz : str or tzinfo, optional + Time zone name for returning localized DatetimeIndex, for example + 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is + timezone-naive. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + closed : {None, 'left', 'right'}, optional + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None, the default). + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + rng : DatetimeIndex + + See Also + -------- + DatetimeIndex : An immutable container for datetimes. + timedelta_range : Return a fixed frequency TimedeltaIndex. + period_range : Return a fixed frequency PeriodIndex. + interval_range : Return a fixed frequency IntervalIndex. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. 
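The two time-of-day indexers above are what drive `DataFrame.at_time` and `DataFrame.between_time`. A short usage sketch:

```python
import pandas as pd

idx = pd.date_range("2020-01-01 09:00", periods=6, freq="30T")
# 09:00, 09:30, 10:00, 10:30, 11:00, 11:30

# Positions whose time of day is exactly 10:00
idx.indexer_at_time("10:00")                  # array([2])

# Positions between 09:30 and 10:30, both endpoints included by default
idx.indexer_between_time("09:30", "10:30")    # array([1, 2, 3])
```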
If ``freq`` is omitted, the resulting + ``DatetimeIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + **Specifying the values** + + The next four examples generate the same `DatetimeIndex`, but vary + the combination of `start`, `end` and `periods`. + + Specify `start` and `end`, with the default daily frequency. + + >>> pd.date_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `start` and `periods`, the number of periods (days). + + >>> pd.date_range(start='1/1/2018', periods=8) + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `end` and `periods`, the number of periods (days). + + >>> pd.date_range(end='1/1/2018', periods=8) + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + Specify `start`, `end`, and `periods`; the frequency is generated + automatically (linearly spaced). + + >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + + **Other Parameters** + + Changed the `freq` (frequency) to ``'M'`` (month end frequency). + + >>> pd.date_range(start='1/1/2018', periods=5, freq='M') + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq='M') + + Multiples are allowed + + >>> pd.date_range(start='1/1/2018', periods=5, freq='3M') + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + `freq` can also be specified as an Offset object. + + >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + Specify `tz` to set the timezone. + + >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo') + DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', + '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', + '2018-01-05 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq='D') + + `closed` controls whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed=None) + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + Use ``closed='left'`` to exclude `end` if it falls on the boundary. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='left') + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], + dtype='datetime64[ns]', freq='D') + + Use ``closed='right'`` to exclude `start` if it falls on the boundary. 
+ + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='right') + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + """ + + if freq is None and com.any_none(periods, start, end): + freq = "D" + + dtarr = DatetimeArray._generate_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + closed=closed, + **kwargs, + ) + return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) + + +def bdate_range( + start=None, + end=None, + periods=None, + freq="B", + tz=None, + normalize=True, + name=None, + weekmask=None, + holidays=None, + closed=None, + **kwargs, +) -> DatetimeIndex: + """ + Return a fixed frequency DatetimeIndex, with business day as the default + frequency. + + Parameters + ---------- + start : str or datetime-like, default None + Left bound for generating dates. + end : str or datetime-like, default None + Right bound for generating dates. + periods : int, default None + Number of periods to generate. + freq : str or DateOffset, default 'B' (business daily) + Frequency strings can have multiples, e.g. '5H'. + tz : str or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Beijing. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + weekmask : str or None, default None + Weekmask of valid business days, passed to ``numpy.busdaycalendar``, + only used when custom frequency strings are passed. The default + value None is equivalent to 'Mon Tue Wed Thu Fri'. + + .. versionadded:: 0.21.0 + + holidays : list-like or None, default None + Dates to exclude from the set of valid business days, passed to + ``numpy.busdaycalendar``, only used when custom frequency strings + are passed. + + .. versionadded:: 0.21.0 + + closed : str, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None). + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + DatetimeIndex + + Notes + ----- + Of the four parameters: ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. Specifying ``freq`` is a requirement + for ``bdate_range``. Use ``date_range`` if specifying ``freq`` is not + desired. + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + Note how the two weekend days are skipped in the result. 
+ + >>> pd.bdate_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-08'], + dtype='datetime64[ns]', freq='B') + """ + if freq is None: + msg = "freq must be specified for bdate_range; use date_range instead" + raise TypeError(msg) + + if isinstance(freq, str) and freq.startswith("C"): + try: + weekmask = weekmask or "Mon Tue Wed Thu Fri" + freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) + except (KeyError, TypeError): + msg = f"invalid custom frequency string: {freq}" + raise ValueError(msg) + elif holidays or weekmask: + msg = ( + "a custom frequency string is required when holidays or " + f"weekmask are passed, got frequency {freq}" + ) + raise ValueError(msg) + + return date_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + closed=closed, + **kwargs, + ) + + +def _time_to_micros(time): + seconds = time.hour * 60 * 60 + 60 * time.minute + time.second + return 1000000 * seconds + time.microsecond diff --git a/venv/Lib/site-packages/pandas/core/indexes/extension.py b/venv/Lib/site-packages/pandas/core/indexes/extension.py new file mode 100644 index 0000000..d5664d7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/extension.py @@ -0,0 +1,295 @@ +""" +Shared methods for Index subclasses backed by ExtensionArray. +""" +from typing import List + +import numpy as np + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_dtype_equal, + is_object_dtype, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.arrays import ExtensionArray +from pandas.core.indexers import deprecate_ndim_indexing +from pandas.core.indexes.base import Index +from pandas.core.ops import get_op_result_name + + +def inherit_from_data(name: str, delegate, cache: bool = False, wrap: bool = False): + """ + Make an alias for a method of the underlying ExtensionArray. + + Parameters + ---------- + name : str + Name of an attribute the class should inherit from its EA parent. + delegate : class + cache : bool, default False + Whether to convert wrapped properties into cache_readonly + wrap : bool, default False + Whether to wrap the inherited result in an Index. 
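As a side note on the `bdate_range` branch above that handles custom frequency strings: `weekmask` and `holidays` are only honoured when the frequency starts with `'C'`, in which case they are forwarded to `numpy.busdaycalendar`. A hedged example (the dates, weekmask, and holiday are arbitrary):

```python
import pandas as pd

# Plain business days: Saturdays and Sundays are skipped.
pd.bdate_range(start="2018-01-01", end="2018-01-08")

# Custom business days: treat Friday/Saturday as the weekend and skip a holiday.
pd.bdate_range(
    start="2018-01-01",
    end="2018-01-08",
    freq="C",
    weekmask="Sun Mon Tue Wed Thu",
    holidays=["2018-01-02"],
)
```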
+ + Returns + ------- + attribute, method, property, or cache_readonly + """ + + attr = getattr(delegate, name) + + if isinstance(attr, property): + if cache: + + def cached(self): + return getattr(self._data, name) + + cached.__name__ = name + cached.__doc__ = attr.__doc__ + method = cache_readonly(cached) + + else: + + def fget(self): + result = getattr(self._data, name) + if wrap: + if isinstance(result, type(self._data)): + return type(self)._simple_new(result, name=self.name) + return Index(result, name=self.name) + return result + + def fset(self, value): + setattr(self._data, name, value) + + fget.__name__ = name + fget.__doc__ = attr.__doc__ + + method = property(fget, fset) + + elif not callable(attr): + # just a normal attribute, no wrapping + method = attr + + else: + + def method(self, *args, **kwargs): + result = attr(self._data, *args, **kwargs) + if wrap: + if isinstance(result, type(self._data)): + return type(self)._simple_new(result, name=self.name) + return Index(result, name=self.name) + return result + + method.__name__ = name + method.__doc__ = attr.__doc__ + return method + + +def inherit_names(names: List[str], delegate, cache: bool = False, wrap: bool = False): + """ + Class decorator to pin attributes from an ExtensionArray to a Index subclass. + + Parameters + ---------- + names : List[str] + delegate : class + cache : bool, default False + wrap : bool, default False + Whether to wrap the inherited result in an Index. + """ + + def wrapper(cls): + for name in names: + meth = inherit_from_data(name, delegate, cache=cache, wrap=wrap) + setattr(cls, name, meth) + + return cls + + return wrapper + + +def _make_wrapped_comparison_op(opname): + """ + Create a comparison method that dispatches to ``._data``. + """ + + def wrapper(self, other): + if isinstance(other, ABCSeries): + # the arrays defer to Series for comparison ops but the indexes + # don't, so we have to unwrap here. + other = other._values + + other = _maybe_unwrap_index(other) + + op = getattr(self._data, opname) + return op(other) + + wrapper.__name__ = opname + return wrapper + + +def make_wrapped_arith_op(opname): + def method(self, other): + if ( + isinstance(other, Index) + and is_object_dtype(other.dtype) + and type(other) is not Index + ): + # We return NotImplemented for object-dtype index *subclasses* so they have + # a chance to implement ops before we unwrap them. + # See https://github.com/pandas-dev/pandas/issues/31109 + return NotImplemented + meth = getattr(self._data, opname) + result = meth(_maybe_unwrap_index(other)) + return _wrap_arithmetic_op(self, other, result) + + method.__name__ = opname + return method + + +def _wrap_arithmetic_op(self, other, result): + if result is NotImplemented: + return NotImplemented + + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return ( + _wrap_arithmetic_op(self, other, result[0]), + _wrap_arithmetic_op(self, other, result[1]), + ) + + if not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + + res_name = get_op_result_name(self, other) + result.name = res_name + return result + + +def _maybe_unwrap_index(obj): + """ + If operating against another Index object, we need to unwrap the underlying + data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray + implementation, otherwise we will incorrectly return NotImplemented. 
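A toy sketch (hypothetical `FakeArray` / `FakeIndex` names, not pandas code) of the delegation pattern that `inherit_from_data` builds for the plain property case: the Index-like wrapper exposes an attribute by forwarding to its backing array.

```python
class FakeArray:
    def __init__(self, values):
        self.values = values

    @property
    def nbytes(self):
        return len(self.values) * 8


class FakeIndex:
    def __init__(self, array):
        self._data = array


def inherit(name, delegate):
    # mirrors the "property, no wrap, no cache" branch of inherit_from_data
    def fget(self):
        return getattr(self._data, name)
    fget.__doc__ = getattr(delegate, name).__doc__
    return property(fget)


FakeIndex.nbytes = inherit("nbytes", FakeArray)
print(FakeIndex(FakeArray([1, 2, 3])).nbytes)   # 24
```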
+ + Parameters + ---------- + obj : object + + Returns + ------- + unwrapped object + """ + if isinstance(obj, Index): + return obj._data + return obj + + +class ExtensionIndex(Index): + """ + Index subclass for indexes backed by ExtensionArray. + """ + + _data: ExtensionArray + + __eq__ = _make_wrapped_comparison_op("__eq__") + __ne__ = _make_wrapped_comparison_op("__ne__") + __lt__ = _make_wrapped_comparison_op("__lt__") + __gt__ = _make_wrapped_comparison_op("__gt__") + __le__ = _make_wrapped_comparison_op("__le__") + __ge__ = _make_wrapped_comparison_op("__ge__") + + def __getitem__(self, key): + result = self._data[key] + if isinstance(result, type(self._data)): + return type(self)(result, name=self.name) + + # Includes cases where we get a 2D ndarray back for MPL compat + deprecate_ndim_indexing(result) + return result + + def __iter__(self): + return self._data.__iter__() + + @property + def _ndarray_values(self) -> np.ndarray: + return self._data._ndarray_values + + @Appender(Index.dropna.__doc__) + def dropna(self, how="any"): + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + return self._shallow_copy(self._data[~self._isnan]) + return self._shallow_copy() + + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + result = self._data.repeat(repeats, axis=axis) + return self._shallow_copy(result) + + @Appender(Index.take.__doc__) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + + taken = self._assert_take_fillable( + self._data, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + return type(self)(taken, name=self.name) + + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + result = self._data.unique() + return self._shallow_copy(result) + + def _get_unique_index(self, dropna=False): + if self.is_unique and not dropna: + return self + + result = self._data.unique() + if dropna and self.hasnans: + result = result[~result.isna()] + return self._shallow_copy(result) + + @Appender(Index.map.__doc__) + def map(self, mapper, na_action=None): + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + try: + result = mapper(self) + + # Try to use this result if we can + if isinstance(result, np.ndarray): + result = Index(result) + + if not isinstance(result, Index): + raise TypeError("The map function must return an Index object") + return result + except Exception: + return self.astype(object).map(mapper) + + @Appender(Index.astype.__doc__) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype) and copy is False: + # Ensure that self.astype(self.dtype) is self + return self + + new_values = self._data.astype(dtype, copy=copy) + + # pass copy=False because any copying will be done in the + # _data.astype call above + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) diff --git a/venv/Lib/site-packages/pandas/core/indexes/frozen.py b/venv/Lib/site-packages/pandas/core/indexes/frozen.py new file mode 100644 index 0000000..909643d --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/frozen.py @@ -0,0 +1,107 @@ +""" +frozen (immutable) data structures to support MultiIndexing + +These are used for: + +- .names (FrozenList) + +""" + +from typing import Any + +from pandas.core.base import 
PandasObject + +from pandas.io.formats.printing import pprint_thing + + +class FrozenList(PandasObject, list): + """ + Container that doesn't allow setting item *but* + because it's technically non-hashable, will be used + for lookups, appropriately, etc. + """ + + # Side note: This has to be of type list. Otherwise, + # it messes up PyTables type checks. + + def union(self, other) -> "FrozenList": + """ + Returns a FrozenList with other concatenated to the end of self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are concatenating. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + if isinstance(other, tuple): + other = list(other) + return type(self)(super().__add__(other)) + + def difference(self, other) -> "FrozenList": + """ + Returns a FrozenList with elements from other removed from self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are removing self. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + other = set(other) + temp = [x for x in self if x not in other] + return type(self)(temp) + + # TODO: Consider deprecating these in favor of `union` (xref gh-15506) + __add__ = __iadd__ = union + + def __getitem__(self, n): + if isinstance(n, slice): + return type(self)(super().__getitem__(n)) + return super().__getitem__(n) + + def __radd__(self, other): + if isinstance(other, tuple): + other = list(other) + return type(self)(other + list(self)) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super().__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other): + return type(self)(super().__mul__(other)) + + __imul__ = __mul__ + + def __reduce__(self): + return type(self), (list(self),) + + def __hash__(self): + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs): + """ + This method will not function because object is immutable. 
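Since `FrozenList` is an internal container, the following is only an illustration of the behaviour defined above, not public API:

```python
from pandas.core.indexes.frozen import FrozenList

names = FrozenList(["a", "b"])

names.union(["c"])         # FrozenList(['a', 'b', 'c'])
names.difference(["a"])    # FrozenList(['b'])
names + ["d"]              # __add__ is union, so FrozenList(['a', 'b', 'd'])

try:
    names[0] = "z"         # mutation is disabled
except TypeError as err:
    print(err)             # 'FrozenList' does not support mutable operations.
```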
+ """ + raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") + + def __str__(self) -> str: + return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + + def __repr__(self) -> str: + return f"{type(self).__name__}({str(self)})" + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + pop = append = extend = remove = sort = insert = _disabled diff --git a/venv/Lib/site-packages/pandas/core/indexes/interval.py b/venv/Lib/site-packages/pandas/core/indexes/interval.py new file mode 100644 index 0000000..a5ab7cb --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/interval.py @@ -0,0 +1,1383 @@ +""" define the IntervalIndex """ +from operator import le, lt +import textwrap +from typing import Any, Optional, Tuple, Union + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs.interval import Interval, IntervalMixin, IntervalTree +from pandas._typing import AnyArrayLike +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._exceptions import rewrite_exception + +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_categorical, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core.algorithms import take_1d +from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _index_shared_docs, + default_pprint, + ensure_index, + maybe_extract_name, +) +from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.extension import ExtensionIndex, inherit_names +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.ops import get_op_result_name + +from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import DateOffset + +_VALID_CLOSED = {"left", "right", "both", "neither"} +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + +_index_doc_kwargs.update( + dict( + klass="IntervalIndex", + qualname="IntervalIndex", + target_klass="IntervalIndex or list of Intervals", + name=textwrap.dedent( + """\ + name : object, optional + Name to be stored in the index. 
+ """ + ), + ) +) + + +def _get_next_label(label): + dtype = getattr(label, "dtype", type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = "datetime64" + if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): + return label + np.timedelta64(1, "ns") + elif is_integer_dtype(dtype): + return label + 1 + elif is_float_dtype(dtype): + return np.nextafter(label, np.infty) + else: + raise TypeError(f"cannot determine next label for type {repr(type(label))}") + + +def _get_prev_label(label): + dtype = getattr(label, "dtype", type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = "datetime64" + if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): + return label - np.timedelta64(1, "ns") + elif is_integer_dtype(dtype): + return label - 1 + elif is_float_dtype(dtype): + return np.nextafter(label, -np.infty) + else: + raise TypeError(f"cannot determine next label for type {repr(type(label))}") + + +def _new_IntervalIndex(cls, d): + """ + This is called upon unpickling, rather than the default which doesn't have + arguments and breaks __new__. + """ + return cls.from_arrays(**d) + + +class SetopCheck: + """ + This is called to decorate the set operations of IntervalIndex + to perform the type check in advance. + """ + + def __init__(self, op_name): + self.op_name = op_name + + def __call__(self, setop): + def func(intvidx_self, other, sort=False): + intvidx_self._assert_can_do_setop(other) + other = ensure_index(other) + + if not isinstance(other, IntervalIndex): + result = getattr(intvidx_self.astype(object), self.op_name)(other) + if self.op_name in ("difference",): + result = result.astype(intvidx_self.dtype) + return result + elif intvidx_self.closed != other.closed: + raise ValueError( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side" + ) + + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + raise TypeError( + f"can only do {self.op_name} between two IntervalIndex " + "objects that have compatible dtypes" + ) + + return setop(intvidx_self, other, sort) + + return func + + +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs["name"], + versionadded="0.20.0", + extra_attributes="is_overlapping\nvalues\n", + extra_methods="", + examples=textwrap.dedent( + """\ + Examples + -------- + A new ``IntervalIndex`` is typically constructed using + :func:`interval_range`: + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], + closed='right', + dtype='interval[int64]') + + It may also be constructed using one of the constructor + methods: :meth:`IntervalIndex.from_arrays`, + :meth:`IntervalIndex.from_breaks`, and :meth:`IntervalIndex.from_tuples`. + + See further examples in the doc strings of ``interval_range`` and the + mentioned constructor methods. 
+ """ + ), + ) +) +@inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) +@inherit_names( + [ + "__len__", + "__array__", + "overlaps", + "contains", + "size", + "dtype", + "left", + "right", + "length", + ], + IntervalArray, +) +@inherit_names( + ["is_non_overlapping_monotonic", "mid", "_ndarray_values", "closed"], + IntervalArray, + cache=True, +) +class IntervalIndex(IntervalMixin, ExtensionIndex): + _typ = "intervalindex" + _comparables = ["name"] + _attributes = ["name", "closed"] + + # we would like our indexing holder to defer to us + _defer_to_indexing = True + + # Immutable, so we are able to cache computations like isna in '_mask' + _mask = None + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data, + closed=None, + dtype=None, + copy: bool = False, + name=None, + verify_integrity: bool = True, + ): + + name = maybe_extract_name(name, data, cls) + + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray( + data, + closed=closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) + + return cls._simple_new(array, name) + + @classmethod + def _simple_new(cls, array, name, closed=None): + """ + Construct from an IntervalArray + + Parameters + ---------- + array : IntervalArray + name : str + Attached as result.name + closed : Any + Ignored. + """ + result = IntervalMixin.__new__(cls) + result._data = array + result.name = name + result._no_setting_name = False + result._reset_identity() + return result + + @classmethod + @Appender( + _interval_shared_docs["from_breaks"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) + def from_breaks( + cls, breaks, closed: str = "right", name=None, copy: bool = False, dtype=None + ): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks( + breaks, closed=closed, copy=copy, dtype=dtype + ) + return cls._simple_new(array, name=name) + + @classmethod + @Appender( + _interval_shared_docs["from_arrays"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) + def from_arrays( + cls, + left, + right, + closed: str = "right", + name=None, + copy: bool = False, + dtype=None, + ): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays( + left, right, closed, copy=copy, dtype=dtype + ) + return cls._simple_new(array, name=name) + + @classmethod + @Appender( + _interval_shared_docs["from_tuples"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) + IntervalIndex([(0, 1], (1, 2]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) + def from_tuples( + cls, data, closed: str = "right", name=None, copy: bool = False, dtype=None + ): + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, 
left=None, right=None, **kwargs): + result = self._data._shallow_copy(left=left, right=right) + attributes = self._get_attributes_dict() + attributes.update(kwargs) + return self._simple_new(result, **attributes) + + @cache_readonly + def _isnan(self): + """ + Return a mask indicating if each value is NA. + """ + if self._mask is None: + self._mask = isna(self.left) + return self._mask + + @cache_readonly + def _engine(self): + left = self._maybe_convert_i8(self.left) + right = self._maybe_convert_i8(self.right) + return IntervalTree(left, right, closed=self.closed) + + def __contains__(self, key) -> bool: + """ + return a boolean if this key is IN the index + We *only* accept an Interval + + Parameters + ---------- + key : Interval + + Returns + ------- + bool + """ + if not isinstance(key, Interval): + return False + + try: + self.get_loc(key) + return True + except KeyError: + return False + + @cache_readonly + def _multiindex(self): + return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) + + @cache_readonly + def values(self): + """ + Return the IntervalIndex's data as an IntervalArray. + """ + return self._data + + @cache_readonly + def _values(self): + return self._data + + @property + def _has_complex_internals(self): + # used to avoid libreduction code paths, which raise or require conversion + return True + + def __array_wrap__(self, result, context=None): + # we don't want the superclass implementation + return result + + def __reduce__(self): + d = dict(left=self.left, right=self.right) + d.update(self._get_attributes_dict()) + return _new_IntervalIndex, (type(self), d), None + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + with rewrite_exception("IntervalArray", type(self).__name__): + new_values = self.values.astype(dtype, copy=copy) + if is_interval_dtype(new_values): + return self._shallow_copy(new_values.left, new_values.right) + return Index.astype(self, dtype, copy=copy) + + @property + def inferred_type(self) -> str: + """Return a string of the type inferred from the values""" + return "interval" + + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep: bool = False) -> int: + # we don't use an explicit engine + # so return the bytes here + return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) + + @cache_readonly + def is_monotonic(self) -> bool: + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ + return self.is_monotonic_increasing + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + """ + Return True if the IntervalIndex is monotonic increasing (only equal or + increasing values), else False + """ + return self._engine.is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + """ + Return True if the IntervalIndex is monotonic decreasing (only equal or + decreasing values), else False + """ + return self[::-1].is_monotonic_increasing + + @cache_readonly + def is_unique(self): + """ + Return True if the IntervalIndex contains unique elements, else False. 
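A brief illustration of the membership rules encoded above: `in` only accepts `Interval` objects, while `get_loc` also accepts points that fall inside an interval.

```python
import pandas as pd

idx = pd.IntervalIndex.from_breaks([0, 1, 2, 3])

pd.Interval(0, 1, closed="right") in idx    # True
1.5 in idx                                  # False: 1.5 is not an Interval

idx.get_loc(1.5)                            # 1, the position of (1, 2]
```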
+ """ + left = self.left + right = self.right + + if self.isna().sum() > 1: + return False + + if left.is_unique or right.is_unique: + return True + + seen_pairs = set() + check_idx = np.where(left.duplicated(keep=False))[0] + for idx in check_idx: + pair = (left[idx], right[idx]) + if pair in seen_pairs: + return False + seen_pairs.add(pair) + + return True + + @property + def is_overlapping(self): + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Returns + ------- + bool + Boolean indicating if the IntervalIndex has overlapping intervals. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + IntervalIndex.overlaps : Check an IntervalIndex elementwise for + overlaps. + + Examples + -------- + >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index + IntervalIndex([(0, 2], (1, 3], (4, 5]], + closed='right', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that share closed endpoints overlap: + + >>> index = pd.interval_range(0, 3, closed='both') + >>> index + IntervalIndex([[0, 1], [1, 2], [2, 3]], + closed='both', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> index = pd.interval_range(0, 3, closed='left') + >>> index + IntervalIndex([[0, 1), [1, 2), [2, 3)], + closed='left', + dtype='interval[int64]') + >>> index.is_overlapping + False + """ + # GH 23309 + return self._engine.is_overlapping + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + if kind == "iloc": + return super()._convert_scalar_indexer(key, kind=kind) + return key + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(_index_shared_docs["_convert_list_indexer"]) + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. + """ + locs = self.get_indexer_for(keyarr) + + # we have missing values + if (locs == -1).any(): + raise KeyError + + return locs + + def _can_reindex(self, indexer: np.ndarray) -> None: + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if self.is_overlapping and len(indexer): + raise ValueError("cannot reindex from an overlapping axis") + + def _needs_i8_conversion(self, key): + """ + Check if a given key needs i8 conversion. Conversion is necessary for + Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An + Interval-like requires conversion if it's endpoints are one of the + aforementioned types. + + Assumes that any list-like data has already been cast to an Index. 
+ + Parameters + ---------- + key : scalar or Index-like + The key that should be checked for i8 conversion + + Returns + ------- + bool + """ + if is_interval_dtype(key) or isinstance(key, Interval): + return self._needs_i8_conversion(key.left) + + i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex) + return isinstance(key, i8_types) + + def _maybe_convert_i8(self, key): + """ + Maybe convert a given key to it's equivalent i8 value(s). Used as a + preprocessing step prior to IntervalTree queries (self._engine), which + expects numeric data. + + Parameters + ---------- + key : scalar or list-like + The key that should maybe be converted to i8. + + Returns + ------- + scalar or list-like + The original key if no conversion occurred, int if converted scalar, + Int64Index if converted list-like. + """ + original = key + if is_list_like(key): + key = ensure_index(key) + + if not self._needs_i8_conversion(key): + return original + + scalar = is_scalar(key) + if is_interval_dtype(key) or isinstance(key, Interval): + # convert left/right and reconstruct + left = self._maybe_convert_i8(key.left) + right = self._maybe_convert_i8(key.right) + constructor = Interval if scalar else IntervalIndex.from_arrays + return constructor(left, right, closed=self.closed) + + if scalar: + # Timestamp/Timedelta + key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) + else: + # DatetimeIndex/TimedeltaIndex + key_dtype, key_i8 = key.dtype, Index(key.asi8) + if key.hasnans: + # convert NaT from it's i8 value to np.nan so it's not viewed + # as a valid value, maybe causing errors (e.g. is_overlapping) + key_i8 = key_i8.where(~key._isnan) + + # ensure consistency with IntervalIndex subtype + subtype = self.dtype.subtype + + if not is_dtype_equal(subtype, key_dtype): + raise ValueError( + f"Cannot index an IntervalIndex of subtype {subtype} with " + f"values of dtype {key_dtype}" + ) + + return key_i8 + + def _check_method(self, method): + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for IntervalIndex" + ) + + raise ValueError("Invalid fill method") + + def _searchsorted_monotonic(self, label, side, exclude_label=False): + if not self.is_non_overlapping_monotonic: + raise KeyError( + "can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing" + ) + + if isinstance(label, IntervalMixin): + raise NotImplementedError("Interval objects are not currently supported") + + # GH 20921: "not is_monotonic_increasing" for the second condition + # instead of "is_monotonic_decreasing" to account for single element + # indexes being both increasing and decreasing + if (side == "left" and self.left.is_monotonic_increasing) or ( + side == "right" and not self.left.is_monotonic_increasing + ): + sub_idx = self.right + if self.open_right or exclude_label: + label = _get_next_label(label) + else: + sub_idx = self.left + if self.open_left or exclude_label: + label = _get_prev_label(label) + + return sub_idx._searchsorted_monotonic(label, side) + + def get_loc( + self, key: Any, method: Optional[str] = None, tolerance=None + ) -> Union[int, slice, np.ndarray]: + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + method : {None}, optional + * default: matches where the label is within an interval only. 
+ + Returns + ------- + int if unique index, slice if monotonic index, else mask + + Examples + -------- + >>> i1, i2 = pd.Interval(0, 1), pd.Interval(1, 2) + >>> index = pd.IntervalIndex([i1, i2]) + >>> index.get_loc(1) + 0 + + You can also supply a point inside an interval. + + >>> index.get_loc(1.5) + 1 + + If a label is in several intervals, you get the locations of all the + relevant intervals. + + >>> i3 = pd.Interval(0, 2) + >>> overlapping_index = pd.IntervalIndex([i1, i2, i3]) + >>> overlapping_index.get_loc(0.5) + array([ True, False, True]) + + Only exact matches will be returned if an interval is provided. + + >>> index.get_loc(pd.Interval(0, 1)) + 0 + """ + self._check_method(method) + + # list-like are invalid labels for II but in some cases may work, e.g + # single element array of comparable type, so guard against them early + if is_list_like(key): + raise KeyError(key) + + if isinstance(key, Interval): + if self.closed != key.closed: + raise KeyError(key) + mask = (self.left == key.left) & (self.right == key.right) + else: + # assume scalar + op_left = le if self.closed_left else lt + op_right = le if self.closed_right else lt + try: + mask = op_left(self.left, key) & op_right(key, self.right) + except TypeError: + # scalar is not comparable to II subtype --> invalid label + raise KeyError(key) + + matches = mask.sum() + if matches == 0: + raise KeyError(key) + elif matches == 1: + return mask.argmax() + return lib.maybe_booleans_to_slice(mask.view("u1")) + + @Substitution( + **dict( + _index_doc_kwargs, + **{ + "raises_section": textwrap.dedent( + """ + Raises + ------ + NotImplementedError + If any method argument other than the default of + None is specified as these are not yet implemented. + """ + ) + }, + ) + ) + @Appender(_index_shared_docs["get_indexer"]) + def get_indexer( + self, + target: AnyArrayLike, + method: Optional[str] = None, + limit: Optional[int] = None, + tolerance: Optional[Any] = None, + ) -> np.ndarray: + + self._check_method(method) + + if self.is_overlapping: + raise InvalidIndexError( + "cannot handle overlapping indices; " + "use IntervalIndex.get_indexer_non_unique" + ) + + target_as_index = ensure_index(target) + + if isinstance(target_as_index, IntervalIndex): + # equal indexes -> 1:1 positional match + if self.equals(target_as_index): + return np.arange(len(self), dtype="intp") + + # different closed or incompatible subtype -> no matches + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): + return np.repeat(np.intp(-1), len(target_as_index)) + + # non-overlapping -> at most one match per interval in target_as_index + # want exact matches -> need both left/right to match, so defer to + # left/right get_indexer, compare elementwise, equality -> match + left_indexer = self.left.get_indexer(target_as_index.left) + right_indexer = self.right.get_indexer(target_as_index.right) + indexer = np.where(left_indexer == right_indexer, left_indexer, -1) + elif is_categorical(target_as_index): + # get an indexer for unique categories then propagate to codes via take_1d + categories_indexer = self.get_indexer(target_as_index.categories) + indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) + elif not is_object_dtype(target_as_index): + # homogeneous scalar index: use IntervalTree + target_as_index = self._maybe_convert_i8(target_as_index) + indexer = self._engine.get_indexer(target_as_index.values) + else: + # 
heterogeneous scalar index: defer elementwise to get_loc + # (non-overlapping so get_loc guarantees scalar of KeyError) + indexer = [] + for key in target_as_index: + try: + loc = self.get_loc(key) + except KeyError: + loc = -1 + indexer.append(loc) + + return ensure_platform_int(indexer) + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique( + self, target: AnyArrayLike + ) -> Tuple[np.ndarray, np.ndarray]: + target_as_index = ensure_index(target) + + # check that target_as_index IntervalIndex is compatible + if isinstance(target_as_index, IntervalIndex): + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): + # different closed or incompatible subtype -> no matches + return ( + np.repeat(-1, len(target_as_index)), + np.arange(len(target_as_index)), + ) + + if is_object_dtype(target_as_index) or isinstance( + target_as_index, IntervalIndex + ): + # target_as_index might contain intervals: defer elementwise to get_loc + indexer, missing = [], [] + for i, key in enumerate(target_as_index): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + indexer.append(locs) + indexer = np.concatenate(indexer) + else: + target_as_index = self._maybe_convert_i8(target_as_index) + indexer, missing = self._engine.get_indexer_non_unique( + target_as_index.values + ) + + return ensure_platform_int(indexer), ensure_platform_int(missing) + + def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: + """ + Guaranteed return of an indexer even when overlapping. + + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. + + Returns + ------- + numpy.ndarray + List of indices. + """ + if self.is_overlapping: + return self.get_indexer_non_unique(target)[0] + return self.get_indexer(target, **kwargs) + + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) + def get_value(self, series: ABCSeries, key: Any) -> Any: + + if com.is_bool_indexer(key): + loc = key + elif is_list_like(key): + if self.is_overlapping: + loc, missing = self.get_indexer_non_unique(key) + if len(missing): + raise KeyError + else: + loc = self.get_indexer(key) + elif isinstance(key, slice): + if not (key.step is None or key.step == 1): + raise ValueError("cannot support not-default step in a slice") + loc = self._convert_slice_indexer(key, kind="getitem") + else: + loc = self.get_loc(key) + return series.iloc[loc] + + @Appender(_index_shared_docs["where"]) + def where(self, cond, other=None): + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy(values) + + def delete(self, loc): + """ + Return a new IntervalIndex with passed location(-s) deleted + + Returns + ------- + IntervalIndex + """ + new_left = self.left.delete(loc) + new_right = self.right.delete(loc) + return self._shallow_copy(new_left, new_right) + + def insert(self, loc, item): + """ + Return a new IntervalIndex inserting new item at location. Follows + Python list.append semantics for negative values. 
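A hedged sketch of how the overlapping check above changes indexing behaviour (the exact ordering of the returned positions is an engine detail, hence the `sorted`):

```python
import pandas as pd

overlapping = pd.IntervalIndex.from_tuples([(0, 2), (1, 3)])

# get_indexer refuses overlapping indices ...
try:
    overlapping.get_indexer([1.5])
except Exception as err:
    print(type(err).__name__)            # InvalidIndexError

# ... get_indexer_non_unique handles them instead
indexer, missing = overlapping.get_indexer_non_unique([1.5])
sorted(indexer)                          # [0, 1]: both intervals contain 1.5
```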
Only Interval + objects and NA can be inserted into an IntervalIndex + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + IntervalIndex + """ + if isinstance(item, Interval): + if item.closed != self.closed: + raise ValueError( + "inserted item must be closed on the same side as the index" + ) + left_insert = item.left + right_insert = item.right + elif is_scalar(item) and isna(item): + # GH 18295 + left_insert = right_insert = item + else: + raise ValueError( + "can only insert Interval objects and NA into an IntervalIndex" + ) + + new_left = self.left.insert(loc, left_insert) + new_right = self.right.insert(loc, right_insert) + return self._shallow_copy(new_left, new_right) + + def _concat_same_dtype(self, to_concat, name): + """ + assert that we all have the same .closed + we allow a 0-len index here as well + """ + if not len({i.closed for i in to_concat if len(i)}) == 1: + raise ValueError( + "can only append two IntervalIndex objects " + "that are closed on the same side" + ) + return super()._concat_same_dtype(to_concat, name) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + result = self._data.take( + indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs + ) + return self._shallow_copy(result) + + def __getitem__(self, value): + result = self._data[value] + if isinstance(result, IntervalArray): + return self._shallow_copy(result) + else: + # scalar + return result + + # -------------------------------------------------------------------- + # Rendering Methods + # __repr__ associated methods are based on MultiIndex + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + + def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): + # GH 28210: use base method but with different default na_rep + return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) + + def _format_data(self, name=None): + + # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical + n = len(self) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) + + formatter = str + + if n == 0: + summary = "[]" + elif n == 1: + first = formatter(self[0]) + summary = f"[{first}]" + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = f"[{first}, {last}]" + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + head_joined = ", ".join(head) + tail_joined = ", ".join(tail) + summary = f"[{head_joined} ... {tail_joined}]" + else: + tail = [formatter(x) for x in self] + joined = ", ".join(tail) + summary = f"[{joined}]" + + return summary + "," + self._format_space() + + def _format_attrs(self): + attrs = [("closed", repr(self.closed))] + if self.name is not None: + attrs.append(("name", default_pprint(self.name))) + attrs.append(("dtype", f"'{self.dtype}'")) + return attrs + + def _format_space(self) -> str: + space = " " * (len(type(self).__name__) + 1) + return f"\n{space}" + + # -------------------------------------------------------------------- + + def argsort(self, *args, **kwargs): + return np.lexsort((self.right, self.left)) + + def equals(self, other) -> bool: + """ + Determines if two IntervalIndex objects contain the same elements. 
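A short example of the `insert` semantics implemented above: the inserted `Interval` must share the index's `closed` side, and NA becomes a missing interval.

```python
import numpy as np
import pandas as pd

idx = pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])

# closed='right' matches the index, so this succeeds
idx.insert(1, pd.Interval(5, 6, closed="right"))
# IntervalIndex([(0, 1], (5, 6], (1, 2]], closed='right', ...)

# NA is also allowed
idx.insert(0, np.nan)
```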
+ """ + if self.is_(other): + return True + + # if we can coerce to an II + # then we can compare + if not isinstance(other, IntervalIndex): + if not is_interval_dtype(other): + return False + other = Index(getattr(other, ".values", other)) + + return ( + self.left.equals(other.left) + and self.right.equals(other.right) + and self.closed == other.closed + ) + + @Appender(_index_shared_docs["intersection"]) + @SetopCheck(op_name="intersection") + def intersection( + self, other: "IntervalIndex", sort: bool = False + ) -> "IntervalIndex": + if self.left.is_unique and self.right.is_unique: + taken = self._intersection_unique(other) + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: + # Swap other/self if other is unique and self does not have + # multiple NaNs + taken = other._intersection_unique(self) + else: + # duplicates + taken = self._intersection_non_unique(other) + + if sort is None: + taken = taken.sort_values() + + return taken + + def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": + """ + Used when the IntervalIndex does not have any common endpoint, + no mater left or right. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + IntervalIndex + """ + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + + return self.take(indexer) + + def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": + """ + Used when the IntervalIndex does have some common endpoints, + on either sides. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + IntervalIndex + """ + mask = np.zeros(len(self), dtype=bool) + + if self.hasnans and other.hasnans: + first_nan_loc = np.arange(len(self))[self.isna()][0] + mask[first_nan_loc] = True + + other_tups = set(zip(other.left, other.right)) + for i, tup in enumerate(zip(self.left, self.right)): + if tup in other_tups: + mask[i] = True + + return self[mask] + + def _setop(op_name: str, sort=None): + @SetopCheck(op_name=op_name) + def func(self, other, sort=sort): + result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) + result_name = get_op_result_name(self, other) + + # GH 19101: ensure empty results have correct dtype + if result.empty: + result = result.values.astype(self.dtype.subtype) + else: + result = result.values + + return type(self).from_tuples(result, closed=self.closed, name=result_name) + + return func + + @property + def is_all_dates(self) -> bool: + """ + This is False even when left/right contain datetime-like objects, + as the check is done on the Interval itself + """ + return False + + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") + + # TODO: arithmetic operations + + # GH#30817 until IntervalArray implements inequalities, get them from Index + def __lt__(self, other): + return Index.__lt__(self, other) + + def __le__(self, other): + return Index.__le__(self, other) + + def __gt__(self, other): + return Index.__gt__(self, other) + + def __ge__(self, other): + return Index.__ge__(self, other) + + +IntervalIndex._add_logical_methods_disabled() + + +def _is_valid_endpoint(endpoint) -> bool: + """ + Helper for interval_range to check if start/end are valid types. 
+ """ + return any( + [ + is_number(endpoint), + isinstance(endpoint, Timestamp), + isinstance(endpoint, Timedelta), + endpoint is None, + ] + ) + + +def _is_type_compatible(a, b) -> bool: + """ + Helper for interval_range to check type compat of start/end/freq. + """ + is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) + is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) + return ( + (is_number(a) and is_number(b)) + or (is_ts_compat(a) and is_ts_compat(b)) + or (is_td_compat(a) and is_td_compat(b)) + or com.any_none(a, b) + ) + + +def interval_range( + start=None, end=None, periods=None, freq=None, name=None, closed="right" +): + """ + Return a fixed frequency IntervalIndex. + + Parameters + ---------- + start : numeric or datetime-like, default None + Left bound for generating intervals. + end : numeric or datetime-like, default None + Right bound for generating intervals. + periods : int, default None + Number of periods to generate. + freq : numeric, str, or DateOffset, default None + The length of each interval. Must be consistent with the type of start + and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 + for numeric and 'D' for datetime-like. + name : str, default None + Name of the resulting IntervalIndex. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + IntervalIndex + + See Also + -------- + IntervalIndex : An Index of intervals that are all closed on the same side. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``IntervalIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end``, inclusively. + + To learn more about datetime-like frequency strings, please see `this link + `__. + + Examples + -------- + Numeric ``start`` and ``end`` is supported. + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], + closed='right', dtype='interval[int64]') + + Additionally, datetime-like input is also supported. + + >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), + ... end=pd.Timestamp('2017-01-04')) + IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], + (2017-01-03, 2017-01-04]], + closed='right', dtype='interval[datetime64[ns]]') + + The ``freq`` parameter specifies the frequency between the left and right. + endpoints of the individual intervals within the ``IntervalIndex``. For + numeric ``start`` and ``end``, the frequency must also be numeric. + + >>> pd.interval_range(start=0, periods=4, freq=1.5) + IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], + closed='right', dtype='interval[float64]') + + Similarly, for datetime-like ``start`` and ``end``, the frequency must be + convertible to a DateOffset. + + >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), + ... periods=3, freq='MS') + IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], + (2017-03-01, 2017-04-01]], + closed='right', dtype='interval[datetime64[ns]]') + + Specify ``start``, ``end``, and ``periods``; the frequency is generated + automatically (linearly spaced). 
+ + >>> pd.interval_range(start=0, end=6, periods=4) + IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], + closed='right', + dtype='interval[float64]') + + The ``closed`` parameter specifies which endpoints of the individual + intervals within the ``IntervalIndex`` are closed. + + >>> pd.interval_range(end=5, periods=4, closed='both') + IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], + closed='both', dtype='interval[int64]') + """ + start = com.maybe_box_datetimelike(start) + end = com.maybe_box_datetimelike(end) + endpoint = start if start is not None else end + + if freq is None and com.any_none(periods, start, end): + freq = 1 if is_number(endpoint) else "D" + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + + if not _is_valid_endpoint(start): + raise ValueError(f"start must be numeric or datetime-like, got {start}") + elif not _is_valid_endpoint(end): + raise ValueError(f"end must be numeric or datetime-like, got {end}") + + if is_float(periods): + periods = int(periods) + elif not is_integer(periods) and periods is not None: + raise TypeError(f"periods must be a number, got {periods}") + + if freq is not None and not is_number(freq): + try: + freq = to_offset(freq) + except ValueError: + raise ValueError( + f"freq must be numeric or convertible to DateOffset, got {freq}" + ) + + # verify type compatibility + if not all( + [ + _is_type_compatible(start, end), + _is_type_compatible(start, freq), + _is_type_compatible(end, freq), + ] + ): + raise TypeError("start, end, freq need to be type compatible") + + # +1 to convert interval count to breaks count (n breaks = n-1 intervals) + if periods is not None: + periods += 1 + + if is_number(endpoint): + # force consistency between start/end/freq (lower end if freq skips it) + if com.all_not_none(start, end, freq): + end -= (end - start) % freq + + # compute the period/start/end if unspecified (at most one) + if periods is None: + periods = int((end - start) // freq) + 1 + elif start is None: + start = end - (periods - 1) * freq + elif end is None: + end = start + (periods - 1) * freq + + breaks = np.linspace(start, end, periods) + if all(is_integer(x) for x in com.not_none(start, end, freq)): + # np.linspace always produces float output + breaks = maybe_downcast_to_dtype(breaks, "int64") + else: + # delegate to the appropriate range function + if isinstance(endpoint, Timestamp): + range_func = date_range + else: + range_func = timedelta_range + + breaks = range_func(start=start, end=end, periods=periods, freq=freq) + + return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/venv/Lib/site-packages/pandas/core/indexes/multi.py b/venv/Lib/site-packages/pandas/core/indexes/multi.py new file mode 100644 index 0000000..75b9666 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/multi.py @@ -0,0 +1,3510 @@ +import datetime +from sys import getsizeof +from typing import Hashable, List, Optional, Sequence, Union +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs +from pandas._libs.hashtable import duplicated_int64 +from pandas.compat.numpy import function as nv +from pandas.errors import PerformanceWarning, UnsortedIndexError +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.cast import coerce_indexer_dtype +from 
pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.dtypes.missing import array_equivalent, isna + +import pandas.core.algorithms as algos +from pandas.core.arrays import Categorical +from pandas.core.arrays.categorical import factorize_from_iterables +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _index_shared_docs, + ensure_index, +) +from pandas.core.indexes.frozen import FrozenList +import pandas.core.missing as missing +from pandas.core.sorting import ( + get_group_index, + indexer_from_factorized, + lexsort_indexer, +) +from pandas.core.util.hashing import hash_tuple, hash_tuples + +from pandas.io.formats.printing import ( + format_object_attrs, + format_object_summary, + pprint_thing, +) + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") +) + + +class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """ + This class manages a MultiIndex by mapping label combinations to positive + integers. + """ + + _base = libindex.UInt64Engine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one uint64 (each), in a strictly + monotonic way (i.e. respecting the lexicographic order of integer + combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype uint64 + Integer(s) representing one combination (each). + """ + # Shift the representation of each level by the pre-calculated number + # of bits: + codes <<= self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer: + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """ + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. + """ + + _base = libindex.ObjectEngine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of + integer combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int, or 1-dimensional array of dtype object + Integer(s) representing one combination (each). + """ + + # Shift the representation of each level by the pre-calculated number + # of bits. Since this can overflow uint64, first make sure we are + # working with Python integers: + codes = codes.astype("object") << self.offsets + + # Now sum and OR are in fact interchangeable. 
This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndex(Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + + Parameters + ---------- + levels : sequence of arrays + The unique labels for each level. + codes : sequence of arrays + Integers for each level designating which label at each location. + + .. versionadded:: 0.24.0 + sortorder : optional int + Level of sortedness (must be lexicographically sorted by that + level). + names : optional sequence of objects + Names for each of the index levels. (name is accepted for compat). + copy : bool, default False + Copy the meta-data. + verify_integrity : bool, default True + Check that the levels/codes are consistent and valid. + + Attributes + ---------- + names + levels + codes + nlevels + levshape + + Methods + ------- + from_arrays + from_tuples + from_product + from_frame + set_levels + set_codes + to_frame + to_flat_index + is_lexsorted + sortlevel + droplevel + swaplevel + reorder_levels + remove_unused_levels + get_locs + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Create a MultiIndex from the cartesian product + of iterables. + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + Index : The base pandas Index type. + + Notes + ----- + See the `user guide + `_ + for more. + + Examples + -------- + A new ``MultiIndex`` is typically constructed using one of the helper + methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` + and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): + + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + See further examples for how to construct a MultiIndex in the doc strings + of the mentioned helper methods. 
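As a concrete illustration of the levels/codes layout described above, here is a minimal sketch (the labels are invented for this example):

import pandas as pd

mi = pd.MultiIndex.from_arrays(
    [[1, 1, 2, 2], ["red", "blue", "red", "blue"]],
    names=("number", "color"),
)

# Each level stores the unique labels for that level; each codes array
# stores integer positions into the corresponding level.
print(mi.levels)   # FrozenList([[1, 2], ['blue', 'red']])
print(mi.codes)    # FrozenList([[0, 0, 1, 1], [1, 0, 1, 0]])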
+ """ + + _deprecations = Index._deprecations | frozenset() + + # initialize to zero-length tuples to make everything work + _typ = "multiindex" + _names = FrozenList() + _levels = FrozenList() + _codes = FrozenList() + _comparables = ["names"] + rename = Index.set_names + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + levels=None, + codes=None, + sortorder=None, + names=None, + dtype=None, + copy=False, + name=None, + verify_integrity: bool = True, + _set_identity: bool = True, + ): + + # compat with Index + if name is not None: + names = name + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must be the same.") + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + + result = object.__new__(MultiIndex) + + # we've already validated levels and codes, so shortcut here + result._set_levels(levels, copy=copy, validate=False) + result._set_codes(codes, copy=copy, validate=False) + + result._names = [None] * len(levels) + if names is not None: + # handles name validation + result._set_names(names) + + if sortorder is not None: + result.sortorder = int(sortorder) + else: + result.sortorder = sortorder + + if verify_integrity: + new_codes = result._verify_integrity() + result._codes = new_codes + + if _set_identity: + result._reset_identity() + + return result + + def _validate_codes(self, level: List, code: List): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). + + Returns + ------- + new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). + """ + null_mask = isna(level) + if np.any(null_mask): + code = np.where(null_mask[code], -1, code) + return code + + def _verify_integrity( + self, codes: Optional[List] = None, levels: Optional[List] = None + ): + """ + Parameters + ---------- + codes : optional list + Codes to check for validity. Defaults to current codes. + levels : optional list + Levels to check for validity. Defaults to current levels. + + Raises + ------ + ValueError + If length of levels and codes don't match, if the codes for any + level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + new codes where code value = -1 if it corresponds to a + NaN level. + """ + # NOTE: Currently does not check, among other things, that cached + # nlevels matches nor that sortorder matches actually sortorder. + codes = codes or self.codes + levels = levels or self.levels + + if len(levels) != len(codes): + raise ValueError( + "Length of levels and codes must match. NOTE: " + "this index is in an inconsistent state." + ) + codes_length = len(codes[0]) + for i, (level, level_codes) in enumerate(zip(levels, codes)): + if len(level_codes) != codes_length: + raise ValueError( + f"Unequal code lengths: {[len(code_) for code_ in codes]}" + ) + if len(level_codes) and level_codes.max() >= len(level): + raise ValueError( + f"On level {i}, code max ({level_codes.max()}) >= length of " + f"level ({len(level)}). 
NOTE: this index is in an " + "inconsistent state" + ) + if len(level_codes) and level_codes.min() < -1: + raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1") + if not level.is_unique: + raise ValueError( + f"Level values must be unique: {list(level)} on level {i}" + ) + if self.sortorder is not None: + if self.sortorder > self._lexsort_depth(): + raise ValueError( + "Value for sortorder must be inferior or equal to actual " + f"lexsort_depth: sortorder {self.sortorder} " + f"with lexsort_depth {self._lexsort_depth()}" + ) + + codes = [ + self._validate_codes(level, code) for level, code in zip(levels, codes) + ] + new_codes = FrozenList(codes) + return new_codes + + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default): + """ + Convert arrays to MultiIndex. + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + error_msg = "Input must be a list / sequence of array-likes." + if not is_list_like(arrays): + raise TypeError(error_msg) + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if elements of array are list-like + for array in arrays: + if not is_list_like(array): + raise TypeError(error_msg) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError("all arrays must be same length") + + codes, levels = factorize_from_iterables(arrays) + if names is lib.no_default: + names = [getattr(arr, "name", None) for arr in arrays] + + return MultiIndex( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex. + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... 
(2, 'red'), (2, 'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + if not is_list_like(tuples): + raise TypeError("Input must be a list / sequence of tuple-likes.") + elif is_iterator(tuples): + tuples = list(tuples) + + if len(tuples) == 0: + if names is None: + raise TypeError("Cannot infer number of levels from empty list") + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = tuples._values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = zip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=lib.no_default): + """ + Make a MultiIndex from the cartesian product of multiple iterables. + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + .. versionchanged:: 1.0.0 + + If not explicitly provided, names will be inferred from the + elements of iterables if an element has a name attribute + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + ... names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + + codes, levels = factorize_from_iterables(iterables) + if names is lib.no_default: + names = [getattr(it, "name", None) for it in iterables] + + codes = cartesian_product(codes) + return MultiIndex(levels, codes, sortorder=sortorder, names=names) + + @classmethod + def from_frame(cls, df, sortorder=None, names=None): + """ + Make a MultiIndex from a DataFrame. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... 
columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + + >>> pd.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + if not isinstance(df, ABCDataFrame): + raise TypeError("Input must be a DataFrame") + + column_names, columns = zip(*df.items()) + names = column_names if names is None else names + return cls.from_arrays(columns, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + + @property + def levels(self): + result = [ + x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) + ] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True + return FrozenList(result) + + @property + def _values(self): + # We override here, since our parent uses _data, which we don't use. + return self.values + + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. + """ + # overriding the base Index.shape definition to avoid materializing + # the values (GH-27384, GH-27775) + return (len(self),) + + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. + + Raises + ------ + ValueError + """ + raise ValueError( + "MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples." + ) + + def _set_levels( + self, levels, level=None, copy=False, validate=True, verify_integrity=False + ): + # This is NOT part of the levels property because it should be + # externally not allowed to set levels. User beware if you change + # _levels directly + if validate: + if len(levels) == 0: + raise ValueError("Must set non-zero number of levels.") + if level is None and len(levels) != self.nlevels: + raise ValueError("Length of levels must match number of levels.") + if level is not None and len(levels) != len(level): + raise ValueError("Length of levels must match length of level.") + + if level is None: + new_levels = FrozenList( + ensure_index(lev, copy=copy)._shallow_copy() for lev in levels + ) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_levels = list(self._levels) + for lev_num, lev in zip(level_numbers, levels): + new_levels[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() + new_levels = FrozenList(new_levels) + + if verify_integrity: + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes + + names = self.names + self._levels = new_levels + if any(names): + self._set_names(names) + + self._tuples = None + self._reset_cache() + + def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): + """ + Set new levels on MultiIndex. Defaults to returning new index. + + Parameters + ---------- + levels : sequence or list of sequence + New level(s) to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + inplace : bool + If True, mutates in place. + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. 
+ + Returns + ------- + new index (of same type and class...etc) + + Examples + -------- + >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b', 'c'], level=0) + MultiIndex([('a', 'one'), + ('a', 'two'), + ('b', 'one'), + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b'], level='bar') + MultiIndex([(1, 'a'), + (1, 'b'), + (2, 'a'), + (2, 'b'), + (3, 'a'), + (3, 'b')], + names=['foo', 'bar']) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2)], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) + """ + if is_list_like(levels) and not isinstance(levels, Index): + levels = list(levels) + + if level is not None and not is_list_like(level): + if not is_list_like(levels): + raise TypeError("Levels must be list-like") + if is_list_like(levels[0]): + raise TypeError("Levels must be list-like") + level = [level] + levels = [levels] + elif level is None or is_list_like(level): + if not is_list_like(levels) or not is_list_like(levels[0]): + raise TypeError("Levels must be list of lists-like") + + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._reset_identity() + idx._set_levels( + levels, level=level, validate=True, verify_integrity=verify_integrity + ) + if not inplace: + return idx + + @property + def codes(self): + return self._codes + + def _set_codes( + self, codes, level=None, copy=False, validate=True, verify_integrity=False + ): + if validate: + if level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if level is not None and len(codes) != len(level): + raise ValueError("Length of codes must match length of levels.") + + if level is None: + new_codes = FrozenList( + _coerce_indexer_frozen(level_codes, lev, copy=copy).view() + for lev, level_codes in zip(self._levels, codes) + ) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_codes = list(self._codes) + for lev_num, level_codes in zip(level_numbers, codes): + lev = self.levels[lev_num] + new_codes[lev_num] = _coerce_indexer_frozen(level_codes, lev, copy=copy) + new_codes = FrozenList(new_codes) + + if verify_integrity: + new_codes = self._verify_integrity(codes=new_codes) + + self._codes = new_codes + + self._tuples = None + self._reset_cache() + + def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): + """ + Set new codes on MultiIndex. Defaults to returning + new index. + + .. versionadded:: 0.24.0 + + New name for deprecated method `set_labels`. + + Parameters + ---------- + codes : sequence or list of sequence + New codes to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + inplace : bool + If True, mutates in place. + verify_integrity : bool (default True) + If True, checks that levels and codes are compatible. 
+ + Returns + ------- + new index (of same type and class...etc) + + Examples + -------- + >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([1, 0, 1, 0], level=0) + MultiIndex([(2, 'one'), + (1, 'two'), + (2, 'one'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([0, 0, 1, 1], level='bar') + MultiIndex([(1, 'one'), + (1, 'one'), + (2, 'two'), + (2, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + """ + if level is not None and not is_list_like(level): + if not is_list_like(codes): + raise TypeError("Codes must be list-like") + if is_list_like(codes[0]): + raise TypeError("Codes must be list-like") + level = [level] + codes = [codes] + elif level is None or is_list_like(level): + if not is_list_like(codes) or not is_list_like(codes[0]): + raise TypeError("Codes must be list of lists-like") + + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._reset_identity() + idx._set_codes(codes, level=level, verify_integrity=verify_integrity) + if not inplace: + return idx + + def copy( + self, + names=None, + dtype=None, + levels=None, + codes=None, + deep=False, + _set_identity=False, + **kwargs, + ): + """ + Make a copy of this object. Names, dtype, levels and codes can be + passed and will be set on new copy. + + Parameters + ---------- + names : sequence, optional + dtype : numpy dtype or pandas type, optional + levels : sequence, optional + codes : sequence, optional + + Returns + ------- + copy : MultiIndex + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + This could be potentially expensive on large MultiIndex objects. + """ + name = kwargs.get("name") + names = self._validate_names(name=name, names=names, deep=deep) + if "labels" in kwargs: + raise TypeError("'labels' argument has been removed; use 'codes' instead") + if deep: + from copy import deepcopy + + if levels is None: + levels = deepcopy(self.levels) + if codes is None: + codes = deepcopy(self.codes) + else: + if levels is None: + levels = self.levels + if codes is None: + codes = self.codes + return MultiIndex( + levels=levels, + codes=codes, + names=names, + sortorder=self.sortorder, + verify_integrity=False, + _set_identity=_set_identity, + ) + + def __array__(self, dtype=None) -> np.ndarray: + """ the array interface, return my values """ + return self.values + + def view(self, cls=None): + """ this is defined as a copy with the same identity """ + result = self.copy() + result._id = self._id + return result + + def _shallow_copy_with_infer(self, values, **kwargs): + # On equal MultiIndexes the difference is empty. 
+ # Therefore, an empty MultiIndex is returned GH13490 + if len(values) == 0: + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + **kwargs, + ) + return self._shallow_copy(values, **kwargs) + + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) + def __contains__(self, key) -> bool: + hash(key) + try: + self.get_loc(key) + return True + except (LookupError, TypeError, ValueError): + return False + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, values=None, **kwargs): + if values is not None: + names = kwargs.pop("names", kwargs.pop("name", self.names)) + # discards freq + kwargs.pop("freq", None) + return MultiIndex.from_tuples(values, names=names, **kwargs) + return self.copy(**kwargs) + + @cache_readonly + def dtype(self) -> np.dtype: + return np.dtype("O") + + def _is_memory_usage_qualified(self) -> bool: + """ return a boolean if we need a qualified .info display """ + + def f(l): + return "mixed" in l or "string" in l or "unicode" in l + + return any(f(l) for l in self._inferred_type_levels) + + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep: bool = False) -> int: + # we are overwriting our base class to avoid + # computing .values here which could materialize + # a tuple representation unnecessarily + return self._nbytes(deep) + + @cache_readonly + def nbytes(self) -> int: + """ return the number of bytes in the underlying data """ + return self._nbytes(False) + + def _nbytes(self, deep: bool = False) -> int: + """ + return the number of bytes in the underlying data + deeply introspect the level data if deep=True + + include the engine hashtable + + *this is in internal routine* + + """ + + # for implementations with no useful getsizeof (PyPy) + objsize = 24 + + level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels) + label_nbytes = sum(i.nbytes for i in self.codes) + names_nbytes = sum(getsizeof(i, objsize) for i in self.names) + result = level_nbytes + label_nbytes + names_nbytes + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + + # -------------------------------------------------------------------- + # Rendering Methods + def _formatter_func(self, tup): + """ + Formats each item in tup according to its level's formatter function. + """ + formatter_funcs = [level._formatter_func for level in self.levels] + return tuple(func(val) for func, val in zip(formatter_funcs, tup)) + + def _format_data(self, name=None): + """ + Return the formatted data as a unicode string + """ + return format_object_summary( + self, self._formatter_func, name=name, line_break_each_value=True + ) + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value). + """ + return format_object_attrs(self, include_dtype=False) + + def _format_native_types(self, na_rep="nan", **kwargs): + new_levels = [] + new_codes = [] + + # go through the levels and format them + for level, level_codes in zip(self.levels, self.codes): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = level_codes == -1 + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + assert not level_codes.flags.writeable # i.e. 
copy is needed + level_codes = level_codes.copy() # make writeable + level_codes[mask] = nan_index + new_levels.append(level) + new_codes.append(level_codes) + + if len(new_levels) == 1: + # a single-level multi-index + return Index(new_levels[0].take(new_codes[0]))._format_native_types() + else: + # reconstruct the multi-index + mi = MultiIndex( + levels=new_levels, + codes=new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + return mi.values + + def format( + self, + space=2, + sparsify=None, + adjoin=True, + names=False, + na_rep=None, + formatter=None, + ): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(level_codes).format(formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_1d(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append( + pprint_thing(name, escape_chars=("\t", "\r", "\n")) + if name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = "" + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthy" value + if sparsify not in [True, 1]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = _sparsify( + result_levels, start=int(names), sentinel=sentinel + ) + + if adjoin: + from pandas.io.formats.format import _get_adjustment + + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split("\n") + else: + return result_levels + + # -------------------------------------------------------------------- + + def __len__(self) -> int: + return len(self.codes[0]) + + def _get_names(self): + return FrozenList(self._names) + + def _set_names(self, names, level=None, validate=True): + """ + Set new names on index. Each name has to be a hashable type. + + Parameters + ---------- + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None + validate : boolean, default True + validate that the names match level lengths + + Raises + ------ + TypeError if each name is not hashable. + + Notes + ----- + sets names on levels. WARNING: mutates! + + Note that you generally want to set this *after* changing levels, so + that it only acts on copies + """ + # GH 15110 + # Don't allow a single string for names in a MultiIndex + if names is not None and not is_list_like(names): + raise ValueError("Names should be list-like for a MultiIndex") + names = list(names) + + if validate: + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." 
+ ) + + if level is None: + level = range(self.nlevels) + else: + level = [self._get_level_number(lev) for lev in level] + + # set the name + for lev, name in zip(level, names): + if name is not None: + # GH 20527 + # All items in 'names' need to be hashable: + if not is_hashable(name): + raise TypeError( + f"{type(self).__name__}.name must be a hashable type" + ) + self._names[lev] = name + + names = property( + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" + ) + + @Appender(_index_shared_docs["_get_grouper_for_level"]) + def _get_grouper_for_level(self, mapper, level): + indexer = self.codes[level] + level_index = self.levels[level] + + if mapper is not None: + # Handle group mapping function and return + level_values = self.levels[level].take(indexer) + grouper = level_values.map(mapper) + return grouper, None, None + + codes, uniques = algos.factorize(indexer, sort=True) + + if len(uniques) > 0 and uniques[0] == -1: + # Handle NAs + mask = indexer != -1 + ok_codes, uniques = algos.factorize(indexer[mask], sort=True) + + codes = np.empty(len(indexer), dtype=indexer.dtype) + codes[mask] = ok_codes + codes[~mask] = -1 + + if len(uniques) < len(level_index): + # Remove unobserved levels from level_index + level_index = level_index.take(uniques) + else: + # break references back to us so that setting the name + # on the output of a groupby doesn't reflect back here. + level_index = level_index.copy() + + if len(level_index): + grouper = level_index.take(codes) + else: + grouper = level_index.take(codes, fill_value=True) + + return grouper, codes, level_index + + @property + def _constructor(self): + return MultiIndex.from_tuples + + @cache_readonly + def inferred_type(self) -> str: + return "mixed" + + def _get_level_number(self, level) -> int: + count = self.names.count(level) + if (count > 1) and not is_integer(level): + raise ValueError( + f"The name {level} occurs multiple times, use a level number" + ) + try: + level = self.names.index(level) + except ValueError: + if not is_integer(level): + raise KeyError(f"Level {level} not found") + elif level < 0: + level += self.nlevels + if level < 0: + orig_level = level - self.nlevels + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels," + f" {orig_level} is not a valid level number" + ) + # Note: levels are zero-based + elif level >= self.nlevels: + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels, " + f"not {level + 1}" + ) + return level + + _tuples = None + + @cache_readonly + def _engine(self): + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes (including -1 for NaN): + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. 
Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) + + @property + def values(self): + if self._tuples is not None: + return self._tuples + + values = [] + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals._internal_get_values() + if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) + + self._tuples = lib.fast_zip(values) + return self._tuples + + @property + def _has_complex_internals(self): + # used to avoid libreduction code paths, which raise or require conversion + return True + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + + if all(x.is_monotonic for x in self.levels): + # If each level is sorted, we can operate on the codes directly. GH27495 + return libalgos.is_lexsorted( + [x.astype("int64", copy=False) for x in self.codes] + ) + + # reversed() because lexsort() wants the most significant key last. + values = [ + self._get_level_values(i).values for i in reversed(range(len(self.levels))) + ] + try: + sort_order = np.lexsort(values) + return Index(sort_order).is_monotonic + except TypeError: + + # we have mixed types and np.lexsort is not happy + return Index(self.values).is_monotonic + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + """ + return if the index is monotonic decreasing (only equal or + decreasing) values. 
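The monotonicity checks above reduce to a lexicographic comparison of the codes when every level is itself sorted. A small sketch (tuples invented for illustration):

import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
print(mi.is_monotonic_increasing)        # True: tuples are lexsorted
print(mi[::-1].is_monotonic_increasing)  # False
print(mi[::-1].is_monotonic_decreasing)  # True: reverse of an increasing index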
+ """ + # monotonic decreasing if and only if reverse is monotonic increasing + return self[::-1].is_monotonic_increasing + + @cache_readonly + def _have_mixed_levels(self): + """ return a boolean list indicated if we have mixed levels """ + return ["mixed" in l for l in self._inferred_type_levels] + + @cache_readonly + def _inferred_type_levels(self): + """ return a list of the inferred types, one for each level """ + return [i.inferred_type for i in self.levels] + + @cache_readonly + def _hashed_values(self): + """ return a uint64 ndarray of my hashed values """ + return hash_tuples(self) + + def _hashed_indexing_key(self, key): + """ + validate and return the hash for the provided key + + *this is internal for use for the cython routines* + + Parameters + ---------- + key : string or tuple + + Returns + ------- + np.uint64 + + Notes + ----- + we need to stringify if we have mixed levels + """ + + if not isinstance(key, tuple): + return hash_tuples(key) + + if not len(key) == self.nlevels: + raise KeyError + + def f(k, stringify): + if stringify and not isinstance(k, str): + k = str(k) + return k + + key = tuple( + f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels) + ) + return hash_tuple(key) + + @Appender(Index.duplicated.__doc__) + def duplicated(self, keep="first"): + shape = map(len, self.levels) + ids = get_group_index(self.codes, shape, sort=False, xnull=False) + + return duplicated_int64(ids, keep) + + def fillna(self, value=None, downcast=None): + """ + fillna is not implemented for MultiIndex + """ + raise NotImplementedError("isna is not defined for MultiIndex") + + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): + nans = [level_codes == -1 for level_codes in self.codes] + if how == "any": + indexer = np.any(nans, axis=0) + elif how == "all": + indexer = np.all(nans, axis=0) + else: + raise ValueError(f"invalid how option: {how}") + + new_codes = [level_codes[~indexer] for level_codes in self.codes] + return self.copy(codes=new_codes, deep=True) + + def get_value(self, series, key): + # Label-based + s = com.values_from_object(series) + k = com.values_from_object(key) + + def _try_mi(k): + # TODO: what if a level contains tuples?? + loc = self.get_loc(k) + new_values = series._values[loc] + new_index = self[loc] + new_index = maybe_droplevels(new_index, k) + return series._constructor( + new_values, index=new_index, name=series.name + ).__finalize__(self) + + try: + return self._engine.get_value(s, k) + except KeyError as e1: + try: + return _try_mi(key) + except KeyError: + pass + + try: + return libindex.get_value_at(s, k) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + + # a Timestamp will raise a TypeError in a multi-index + # rather than a KeyError, try it here + # note that a string that 'looks' like a Timestamp will raise + # a KeyError! 
(GH5725) + if isinstance(key, (datetime.datetime, np.datetime64, str)): + try: + return _try_mi(key) + except KeyError: + raise + except (IndexError, ValueError, TypeError): + pass + + try: + return _try_mi(Timestamp(key)) + except ( + KeyError, + TypeError, + IndexError, + ValueError, + tslibs.OutOfBoundsDatetime, + ): + pass + + raise InvalidIndexError(key) + + def _get_level_values(self, level, unique=False): + """ + Return vector of label values for requested level, + equal to the length of the index + + **this is an internal method** + + Parameters + ---------- + level : int level + unique : bool, default False + if True, drop duplicated values + + Returns + ------- + values : ndarray + """ + + lev = self.levels[level] + level_codes = self.codes[level] + name = self._names[level] + if unique: + level_codes = algos.unique(level_codes) + filled = algos.take_1d(lev._values, level_codes, fill_value=lev._na_value) + return lev._shallow_copy(filled, name=name) + + def get_level_values(self, level): + """ + Return vector of label values for requested level, + equal to the length of the index. + + Parameters + ---------- + level : int or str + ``level`` is either the integer position of the level in the + MultiIndex, or the name of the level. + + Returns + ------- + values : Index + Values is a level of this MultiIndex converted to + a single :class:`Index` (or subclass thereof). + + Examples + -------- + + Create a MultiIndex: + + >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) + >>> mi.names = ['level_1', 'level_2'] + + Get level values by supplying level as either integer or name: + + >>> mi.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object', name='level_1') + >>> mi.get_level_values('level_2') + Index(['d', 'e', 'f'], dtype='object', name='level_2') + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return values + + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) + def unique(self, level=None): + + if level is None: + return super().unique() + else: + level = self._get_level_number(level) + return self._get_level_values(level=level, unique=True) + + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + + name : list / sequence of strings, optional + The passed names should substitute index level names. + + Returns + ------- + DataFrame : a DataFrame containing the original MultiIndex data. + + See Also + -------- + DataFrame + """ + + from pandas import DataFrame + + if name is not None: + if not is_list_like(name): + raise TypeError("'name' must be a list / sequence of column names.") + + if len(name) != len(self.levels): + raise ValueError( + "'name' should have same length as number of levels on index." 
+ ) + idx_names = name + else: + idx_names = self.names + + # Guarantee resulting column order - PY36+ dict maintains insertion order + result = DataFrame( + { + (level if lvlname is None else lvlname): self._get_level_values(level) + for lvlname, level in zip(idx_names, range(len(self.levels))) + }, + copy=False, + ) + + if index: + result.index = self + return result + + def to_flat_index(self): + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + .. versionadded:: 0.24.0 + + Returns + ------- + pd.Index + Index with the MultiIndex data represented in Tuples. + + Notes + ----- + This method will simply return the caller if called by anything other + than a MultiIndex. + + Examples + -------- + >>> index = pd.MultiIndex.from_product( + ... [['foo', 'bar'], ['baz', 'qux']], + ... names=['a', 'b']) + >>> index.to_flat_index() + Index([('foo', 'baz'), ('foo', 'qux'), + ('bar', 'baz'), ('bar', 'qux')], + dtype='object') + """ + return Index(self.values, tupleize_cols=False) + + @property + def is_all_dates(self) -> bool: + return False + + def is_lexsorted(self) -> bool: + """ + Return True if the codes are lexicographically sorted. + + Returns + ------- + bool + """ + return self.lexsort_depth == self.nlevels + + @cache_readonly + def lexsort_depth(self): + if self.sortorder is not None: + return self.sortorder + + return self._lexsort_depth() + + def _lexsort_depth(self) -> int: + """ + Compute and return the lexsort_depth, the number of levels of the + MultiIndex that are sorted lexically + + Returns + ------ + int + """ + int64_codes = [ensure_int64(level_codes) for level_codes in self.codes] + for k in range(self.nlevels, 0, -1): + if libalgos.is_lexsorted(int64_codes[:k]): + return k + return 0 + + def _sort_levels_monotonic(self): + """ + This is an *internal* function. + + Create a new MultiIndex from the current to monotonically sorted + items IN the levels. This does not actually make the entire MultiIndex + monotonic, JUST the levels. + + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. + + Returns + ------- + MultiIndex + + Examples + -------- + + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + + >>> mi.sort_values() + MultiIndex([('a', 'aa'), + ('a', 'bb'), + ('b', 'aa'), + ('b', 'bb')], + ) + """ + + if self.is_lexsorted() and self.is_monotonic: + return self + + new_levels = [] + new_codes = [] + + for lev, level_codes in zip(self.levels, self.codes): + + if not lev.is_monotonic: + try: + # indexer to reorder the levels + indexer = lev.argsort() + except TypeError: + pass + else: + lev = lev.take(indexer) + + # indexer to reorder the level codes + indexer = ensure_int64(indexer) + ri = lib.get_reverse_indexer(indexer, len(indexer)) + level_codes = algos.take_1d(ri, level_codes) + + new_levels.append(lev) + new_codes.append(level_codes) + + return MultiIndex( + new_levels, + new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + def remove_unused_levels(self): + """ + Create a new MultiIndex from the current that removes + unused levels, meaning that they are not expressed in the labels. + + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. 
+ + Returns + ------- + MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi + MultiIndex([(0, 'a'), + (0, 'b'), + (1, 'a'), + (1, 'b')], + ) + + >>> mi[2:] + MultiIndex([(1, 'a'), + (1, 'b')], + ) + + The 0 from the first level is not represented + and can be removed + + >>> mi2 = mi[2:].remove_unused_levels() + >>> mi2.levels + FrozenList([[1], ['a', 'b']]) + """ + + new_levels = [] + new_codes = [] + + changed = False + for lev, level_codes in zip(self.levels, self.codes): + + # Since few levels are typically unused, bincount() is more + # efficient than unique() - however it only accepts positive values + # (and drops order): + uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 + has_na = int(len(uniques) and (uniques[0] == -1)) + + if len(uniques) != len(lev) + has_na: + # We have unused levels + changed = True + + # Recalculate uniques, now preserving order. + # Can easily be cythonized by exploiting the already existing + # "uniques" and stop parsing "level_codes" when all items + # are found: + uniques = algos.unique(level_codes) + if has_na: + na_idx = np.where(uniques == -1)[0] + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] + + # codes get mapped from uniques to 0:len(uniques) + # -1 (if present) is mapped to last position + code_mapping = np.zeros(len(lev) + has_na) + # ... and reassigned value -1: + code_mapping[uniques] = np.arange(len(uniques)) - has_na + + level_codes = code_mapping[level_codes] + + # new levels are simple + lev = lev.take(uniques[has_na:]) + + new_levels.append(lev) + new_codes.append(level_codes) + + result = self.view() + + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_codes(new_codes, validate=False) + + return result + + @property + def nlevels(self) -> int: + """ + Integer number of levels in this MultiIndex. + """ + return len(self._levels) + + @property + def levshape(self): + """ + A tuple with the length of each level. 
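The bincount trick used by remove_unused_levels above can be reproduced in isolation. A standalone sketch with made-up codes (-1 marks missing values, matching the codes layout described above):

import numpy as np

level_codes = np.array([2, 2, -1, 0])   # level position 1 is never referenced

# Shift by +1 so that -1 (NaN) falls into bin 0, keep the non-empty
# bins, then shift back. Order is dropped here, which is why the real
# method recomputes the uniques with algos.unique when a change is found.
uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1
print(uniques)   # [-1  0  2]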
+ """ + return tuple(len(x) for x in self.levels) + + def __reduce__(self): + """Necessary for making this object picklable""" + d = dict( + levels=list(self.levels), + codes=list(self.codes), + sortorder=self.sortorder, + names=list(self.names), + ) + return ibase._new_Index, (type(self), d), None + + def __setstate__(self, state): + """Necessary for making this object picklable""" + + if isinstance(state, dict): + levels = state.get("levels") + codes = state.get("codes") + sortorder = state.get("sortorder") + names = state.get("names") + + elif isinstance(state, tuple): + + nd_state, own_state = state + levels, codes, sortorder, names = own_state + + self._set_levels([Index(x) for x in levels], validate=False) + self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) + self._set_names(names) + self.sortorder = sortorder + self._reset_identity() + + def __getitem__(self, key): + if is_scalar(key): + key = com.cast_scalar_indexer(key) + + retval = [] + for lev, level_codes in zip(self.levels, self.codes): + if level_codes[key] == -1: + retval.append(np.nan) + else: + retval.append(lev[level_codes[key]]) + + return tuple(retval) + else: + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + sortorder = self.sortorder + else: + # cannot be sure whether the result will be sorted + sortorder = None + + if isinstance(key, Index): + key = np.asarray(key) + + new_codes = [level_codes[key] for level_codes in self.codes] + + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=None + ): + """ Internal method to handle NA filling of take """ + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + raise ValueError(msg) + taken = [lab.take(indices) for lab in self.codes] + mask = indices == -1 + if mask.any(): + masked = [] + for new_label in taken: + label_values = new_label + label_values[mask] = na_value + masked.append(np.asarray(label_values)) + taken = masked + else: + taken = [lab.take(indices) for lab in self.codes] + return taken + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + if not isinstance(other, (list, tuple)): + other = [other] + + if all( + (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other + ): + arrays = [] + for i in range(self.nlevels): + label = self._get_level_values(i) + appended = [o._get_level_values(i) for o in other] + arrays.append(label.append(appended)) + return MultiIndex.from_arrays(arrays, names=self.names) + + to_concat = (self.values,) + tuple(k._values for k in other) + new_tuples = np.concatenate(to_concat) + + # if all(isinstance(x, MultiIndex) for x in 
other): + try: + return MultiIndex.from_tuples(new_tuples, names=self.names) + except (TypeError, IndexError): + return Index(new_tuples) + + def argsort(self, *args, **kwargs): + return self.values.argsort(*args, **kwargs) + + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + repeats = ensure_platform_int(repeats) + return MultiIndex( + levels=self.levels, + codes=[ + level_codes.view(np.ndarray).astype(np.intp).repeat(repeats) + for level_codes in self.codes + ], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + def where(self, cond, other=None): + raise NotImplementedError(".where is not supported for MultiIndex operations") + + def drop(self, codes, level=None, errors="raise"): + """ + Make new MultiIndex with passed list of codes deleted + + Parameters + ---------- + codes : array-like + Must be a list of tuples + level : int or level name, default None + errors : str, default 'raise' + + Returns + ------- + dropped : MultiIndex + """ + if level is not None: + return self._drop_from_level(codes, level, errors) + + if not isinstance(codes, (np.ndarray, Index)): + try: + codes = com.index_labels_to_array(codes, dtype=object) + except ValueError: + pass + + inds = [] + for level_codes in codes: + try: + loc = self.get_loc(level_codes) + # get_loc returns either an integer, a slice, or a boolean + # mask + if isinstance(loc, int): + inds.append(loc) + elif isinstance(loc, slice): + inds.extend(range(loc.start, loc.stop)) + elif com.is_bool_indexer(loc): + if self.lexsort_depth == 0: + warnings.warn( + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", + PerformanceWarning, + stacklevel=3, + ) + loc = loc.nonzero()[0] + inds.extend(loc) + else: + msg = f"unsupported indexer of type {type(loc)}" + raise AssertionError(msg) + except KeyError: + if errors != "ignore": + raise + + return self.delete(inds) + + def _drop_from_level(self, codes, level, errors="raise"): + codes = com.index_labels_to_array(codes) + i = self._get_level_number(level) + index = self.levels[i] + values = index.get_indexer(codes) + + mask = ~algos.isin(self.codes[i], values) + if mask.all() and errors != "ignore": + raise KeyError(f"labels {codes} not found in level") + + return self[mask] + + def swaplevel(self, i=-2, j=-1): + """ + Swap level i with level j. + + Calling this method does not change the ordering of the values. + + Parameters + ---------- + i : int, str, default -2 + First level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + j : int, str, default -1 + Second level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + + Returns + ------- + MultiIndex + A new MultiIndex. + + See Also + -------- + Series.swaplevel : Swap levels i and j in a MultiIndex. + Dataframe.swaplevel : Swap levels i and j in a MultiIndex on a + particular axis. + + Examples + -------- + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... 
codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + >>> mi.swaplevel(0, 1) + MultiIndex([('bb', 'a'), + ('aa', 'a'), + ('bb', 'b'), + ('aa', 'b')], + ) + """ + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + i = self._get_level_number(i) + j = self._get_level_number(j) + + new_levels[i], new_levels[j] = new_levels[j], new_levels[i] + new_codes[i], new_codes[j] = new_codes[j], new_codes[i] + new_names[i], new_names[j] = new_names[j], new_names[i] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def reorder_levels(self, order): + """ + Rearrange levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + + Returns + ------- + MultiIndex + """ + order = [self._get_level_number(i) for i in order] + if len(order) != self.nlevels: + raise AssertionError( + f"Length of order must be same as number of levels ({self.nlevels})," + f" got {len(order)}" + ) + new_levels = [self.levels[i] for i in order] + new_codes = [self.codes[i] for i in order] + new_names = [self.names[i] for i in order] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def _get_codes_for_sorting(self): + """ + we categorizing our codes by using the + available categories (all, not just observed) + excluding any missing ones (-1); this is in preparation + for sorting, where we need to disambiguate that -1 is not + a valid valid + """ + + def cats(level_codes): + return np.arange( + np.array(level_codes).max() + 1 if len(level_codes) else 0, + dtype=level_codes.dtype, + ) + + return [ + Categorical.from_codes(level_codes, cats(level_codes), ordered=True) + for level_codes in self.codes + ] + + def sortlevel(self, level=0, ascending=True, sort_remaining=True): + """ + Sort MultiIndex at the requested level. The result will respect the + original ordering of the associated factor at that level. + + Parameters + ---------- + level : list-like, int or str, default 0 + If a string is given, must be a name of the level. + If list-like must be names or ints of levels. + ascending : bool, default True + False to sort in descending order. + Can also be a list to specify a directed ordering. + sort_remaining : sort by the remaining levels after level + + Returns + ------- + sorted_index : pd.MultiIndex + Resulting index. + indexer : np.ndarray + Indices of output values in original index. 
+ """ + if isinstance(level, (str, int)): + level = [level] + level = [self._get_level_number(lev) for lev in level] + sortorder = None + + # we have a directed ordering via ascending + if isinstance(ascending, list): + if not len(level) == len(ascending): + raise ValueError("level must have same length as ascending") + + indexer = lexsort_indexer( + [self.codes[lev] for lev in level], orders=ascending + ) + + # level ordering + else: + + codes = list(self.codes) + shape = list(self.levshape) + + # partition codes and shape + primary = tuple(codes[lev] for lev in level) + primshp = tuple(shape[lev] for lev in level) + + # Reverse sorted to retain the order of + # smaller indices that needs to be removed + for lev in sorted(level, reverse=True): + codes.pop(lev) + shape.pop(lev) + + if sort_remaining: + primary += primary + tuple(codes) + primshp += primshp + tuple(shape) + else: + sortorder = level[0] + + indexer = indexer_from_factorized(primary, primshp, compress=False) + + if not ascending: + indexer = indexer[::-1] + + indexer = ensure_platform_int(indexer) + new_codes = [level_codes.take(indexer) for level_codes in self.codes] + + new_index = MultiIndex( + codes=new_codes, + levels=self.levels, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + return new_index, indexer + + def _convert_listlike_indexer(self, keyarr, kind=None): + """ + Parameters + ---------- + keyarr : list-like + Indexer to convert. + + Returns + ------- + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys + """ + indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind) + + # are we indexing a specific level + if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple): + level = 0 + _, indexer = self.reindex(keyarr, level=level) + + # take all + if indexer is None: + indexer = np.arange(len(self)) + + check = self.levels[0].get_indexer(keyarr) + mask = check == -1 + if mask.any(): + raise KeyError(f"{keyarr[mask]} not in index") + + return indexer, keyarr + + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + + # empty indexer + if is_list_like(target) and not len(target): + return ensure_platform_int(np.array([])) + + if not isinstance(target, MultiIndex): + try: + target = MultiIndex.from_tuples(target) + except (TypeError, ValueError): + + # let's instead try with a straight Index + if method is None: + return Index(self.values).get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + + if not self.is_unique: + raise ValueError("Reindexing only valid with uniquely valued Index objects") + + if method == "pad" or method == "backfill": + if tolerance is not None: + raise NotImplementedError( + "tolerance not implemented yet for MultiIndex" + ) + indexer = self._engine.get_indexer(target, method, limit) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " + "for MultiIndex; see GitHub issue 9365" + ) + else: + indexer = self._engine.get_indexer(target) + + return ensure_platform_int(indexer) + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + return super().get_indexer_non_unique(target) + + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): + """ + Create index with 
target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.MultiIndex + Resulting index + indexer : np.ndarray or None + Indices of output values in original index. + + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, "names") + + if level is not None: + if method is not None: + raise TypeError("Fill method not supported if level passed") + + # GH7774: preserve dtype/tz if target is empty and not an Index. + # target may be an iterator + target = ibase._ensure_has_len(target) + if len(target) == 0 and not isinstance(target, Index): + idx = self.levels[level] + attrs = idx._get_attributes_dict() + attrs.pop("freq", None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) + else: + target = ensure_index(target) + target, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True, keep_order=False + ) + else: + target = ensure_index(target) + if self.equals(target): + indexer = None + else: + if self.is_unique: + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + else: + raise ValueError("cannot handle a non-unique multi-index!") + + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + # hopefully? + target = MultiIndex.from_tuples(target) + + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): + target = target.copy(deep=False) + target.names = self.names + + return target, indexer + + def get_slice_bound( + self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + kind : {'loc', 'getitem'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left", kind="loc") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right", kind="loc") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + """ + + if not isinstance(label, tuple): + label = (label,) + return self._partial_tup_index(label, side=side) + + def slice_locs(self, start=None, end=None, step=None, kind=None): + """ + For an ordered MultiIndex, compute the slice locations for input + labels. + + The input labels can be tuples representing partial levels, e.g. for a + MultiIndex with 3 levels, you can pass a single value (corresponding to + the first level), or a 1-, 2-, or 3-tuple. 
+ + Parameters + ---------- + start : label or tuple, default None + If None, defaults to the beginning + end : label or tuple + If None, defaults to the end + step : int or None + Slice step + kind : string, optional, defaults None + + Returns + ------- + (start, end) : (int, int) + + Notes + ----- + This method only works if the MultiIndex is properly lexsorted. So, + if only the first 2 levels of a 3-level MultiIndex are lexsorted, + you can only pass two levels to ``.slice_locs``. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')], + ... names=['A', 'B']) + + Get the slice locations from the beginning of 'b' in the first level + until the end of the multiindex: + + >>> mi.slice_locs(start='b') + (1, 4) + + Like above, but stop at the end of 'b' in the first level and 'f' in + the second level: + + >>> mi.slice_locs(start='b', end=('b', 'f')) + (1, 3) + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + """ + # This function adds nothing to its parent implementation (the magic + # happens in get_slice_bound method), but it adds meaningful doc. + return super().slice_locs(start, end, step, kind=kind) + + def _partial_tup_index(self, tup, side="left"): + if len(tup) > self.lexsort_depth: + raise UnsortedIndexError( + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" + f" ({self.lexsort_depth})" + ) + + n = len(tup) + start, end = 0, len(self) + zipped = zip(tup, self.levels, self.codes) + for k, (lab, lev, labs) in enumerate(zipped): + section = labs[start:end] + + if lab not in lev and np.ndim(lab) == 0 and not isna(lab): + if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): + raise TypeError(f"Level type mismatch: {lab}") + + # short circuit + loc = lev.searchsorted(lab, side=side) + if side == "right" and loc >= 0: + loc -= 1 + return start + section.searchsorted(loc, side=side) + + idx = self._get_loc_single_level_index(lev, lab) + if k < n - 1: + end = start + section.searchsorted(idx, side="right") + start = start + section.searchsorted(idx, side="left") + else: + return start + section.searchsorted(idx, side=side) + + def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: + """ + If key is NA value, location of index unify as -1. + + Parameters + ---------- + level_index: Index + key : label + + Returns + ------- + loc : int + If key is NA value, loc is -1 + Else, location of key in index. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + """ + + if is_scalar(key) and isna(key): + return -1 + else: + return level_index.get_loc(key) + + def get_loc(self, key, method=None): + """ + Get location for a label or a tuple of labels as an integer, slice or + boolean mask. + + Parameters + ---------- + key : label or tuple of labels (one for each level) + method : None + + Returns + ------- + loc : int, slice object or boolean mask + If the key is past the lexsort depth, the return may be a + boolean mask array, otherwise it is always a slice or int. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Notes + ----- + The key cannot be a slice, list of same-level labels, a boolean mask, + or a sequence of such. 
If you want to use those, use + :meth:`MultiIndex.get_locs` instead. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_loc('b') + slice(1, 3, None) + + >>> mi.get_loc(('b', 'e')) + 1 + """ + if method is not None: + raise NotImplementedError( + "only the default get_loc method is " + "currently supported for MultiIndex" + ) + + def _maybe_to_slice(loc): + """convert integer indexer to boolean mask or slice if possible""" + if not isinstance(loc, np.ndarray) or loc.dtype != "int64": + return loc + + loc = lib.maybe_indices_to_slice(loc, len(self)) + if isinstance(loc, slice): + return loc + + mask = np.empty(len(self), dtype="bool") + mask.fill(False) + mask[loc] = True + return mask + + if not isinstance(key, (tuple, list)): + # not including list here breaks some indexing, xref #30892 + loc = self._get_level_indexer(key, level=0) + return _maybe_to_slice(loc) + + keylen = len(key) + if self.nlevels < keylen: + raise KeyError( + f"Key length ({keylen}) exceeds index depth ({self.nlevels})" + ) + + if keylen == self.nlevels and self.is_unique: + return self._engine.get_loc(key) + + # -- partial selection or non-unique index + # break the key into 2 parts based on the lexsort_depth of the index; + # the first part returns a continuous slice of the index; the 2nd part + # needs linear search within the slice + i = self.lexsort_depth + lead_key, follow_key = key[:i], key[i:] + start, stop = ( + self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self)) + ) + + if start == stop: + raise KeyError(key) + + if not follow_key: + return slice(start, stop) + + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=10, + ) + + loc = np.arange(start, stop, dtype="int64") + + for i, k in enumerate(follow_key, len(lead_key)): + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) + if not mask.all(): + loc = loc[mask] + if not len(loc): + raise KeyError(key) + + return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop) + + def get_loc_level(self, key, level=0, drop_level: bool = True): + """ + Get both the location for the requested label(s) and the + resulting sliced index. + + Parameters + ---------- + key : label or sequence of labels + level : int/level name or list thereof, optional + drop_level : bool, default True + If ``False``, the resulting index will not drop any level. + + Returns + ------- + loc : A 2-tuple where the elements are: + Element 0: int, slice object or boolean array + Element 1: The resulting sliced multiindex/index. If the key + contains all levels, this will be ``None``. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], + ... 
names=['A', 'B']) + + >>> mi.get_loc_level('b') + (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) + + >>> mi.get_loc_level('e', level='B') + (array([False, True, False], dtype=bool), + Index(['b'], dtype='object', name='A')) + + >>> mi.get_loc_level(['b', 'e']) + (1, None) + """ + + # different name to distinguish from maybe_droplevels + def maybe_mi_droplevels(indexer, levels, drop_level: bool): + if not drop_level: + return self[indexer] + # kludgearound + orig_index = new_index = self[indexer] + levels = [self._get_level_number(i) for i in levels] + for i in sorted(levels, reverse=True): + try: + new_index = new_index.droplevel(i) + except ValueError: + + # no dropping here + return orig_index + return new_index + + if isinstance(level, (tuple, list)): + if len(key) != len(level): + raise AssertionError( + "Key for location must have same length as number of levels" + ) + result = None + for lev, k in zip(level, key): + loc, new_index = self.get_loc_level(k, level=lev) + if isinstance(loc, slice): + mask = np.zeros(len(self), dtype=bool) + mask[loc] = True + loc = mask + + result = loc if result is None else result & loc + + return result, maybe_mi_droplevels(result, level, drop_level) + + level = self._get_level_number(level) + + # kludge for #1796 + if isinstance(key, list): + key = tuple(key) + + if isinstance(key, tuple) and level == 0: + + try: + if key in self.levels[0]: + indexer = self._get_level_indexer(key, level=level) + new_index = maybe_mi_droplevels(indexer, [0], drop_level) + return indexer, new_index + except TypeError: + pass + + if not any(isinstance(k, slice) for k in key): + + # partial selection + # optionally get indexer to avoid re-calculation + def partial_selection(key, indexer=None): + if indexer is None: + indexer = self.get_loc(key) + ilevels = [ + i for i in range(len(key)) if key[i] != slice(None, None) + ] + return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level) + + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + try: + return (self._engine.get_loc(key), None) + except KeyError as e: + raise KeyError(key) from e + else: + return partial_selection(key) + else: + indexer = None + for i, k in enumerate(key): + if not isinstance(k, slice): + k = self._get_level_indexer(k, level=i) + if isinstance(k, slice): + # everything + if k.start == 0 and k.stop == len(self): + k = slice(None, None) + else: + k_index = k + + if isinstance(k, slice): + if k == slice(None, None): + continue + else: + raise TypeError(key) + + if indexer is None: + indexer = k_index + else: # pragma: no cover + indexer &= k_index + if indexer is None: + indexer = slice(None, None) + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] + return indexer, maybe_mi_droplevels(indexer, ilevels, drop_level) + else: + indexer = self._get_level_indexer(key, level=level) + return indexer, maybe_mi_droplevels(indexer, [level], drop_level) + + def _get_level_indexer(self, key, level=0, indexer=None): + # return an indexer, boolean array or a slice showing where the key is + # in the totality of values + # if the indexer is provided, then use this + + level_index = self.levels[level] + level_codes = self.codes[level] + + def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): + # given the inputs and the codes/indexer, compute an indexer set + # if we have a provided indexer, then this need not consider + # the entire labels set + + r = np.arange(start, stop, step) + if indexer is not 
None and len(indexer) != len(codes): + + # we have an indexer which maps the locations in the labels + # that we have already selected (and is not an indexer for the + # entire set) otherwise this is wasteful so we only need to + # examine locations that are in this set the only magic here is + # that the result are the mappings to the set that we have + # selected + from pandas import Series + + mapper = Series(indexer) + indexer = codes.take(ensure_platform_int(indexer)) + result = Series(Index(indexer).isin(r).nonzero()[0]) + m = result.map(mapper)._ndarray_values + + else: + m = np.zeros(len(codes), dtype=bool) + m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True + + return m + + if isinstance(key, slice): + # handle a slice, returning a slice if we can + # otherwise a boolean indexer + + try: + if key.start is not None: + start = level_index.get_loc(key.start) + else: + start = 0 + if key.stop is not None: + stop = level_index.get_loc(key.stop) + else: + stop = len(level_index) - 1 + step = key.step + except KeyError: + + # we have a partial slice (like looking up a partial date + # string) + start = stop = level_index.slice_indexer( + key.start, key.stop, key.step, kind="loc" + ) + step = start.step + + if isinstance(start, slice) or isinstance(stop, slice): + # we have a slice for start and/or stop + # a partial date slicer on a DatetimeIndex generates a slice + # note that the stop ALREADY includes the stopped point (if + # it was a string sliced) + start = getattr(start, "start", start) + stop = getattr(stop, "stop", stop) + return convert_indexer(start, stop, step) + + elif level > 0 or self.lexsort_depth == 0 or step is not None: + # need to have like semantics here to right + # searching as when we are using a slice + # so include the stop+1 (so we include stop) + return convert_indexer(start, stop + 1, step) + else: + # sorted, so can return slice object -> view + i = level_codes.searchsorted(start, side="left") + j = level_codes.searchsorted(stop, side="right") + return slice(i, j, step) + + else: + + code = self._get_loc_single_level_index(level_index, key) + + if level > 0 or self.lexsort_depth == 0: + # Desired level is not sorted + locs = np.array(level_codes == code, dtype=bool, copy=False) + if not locs.any(): + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return locs + + i = level_codes.searchsorted(code, side="left") + j = level_codes.searchsorted(code, side="right") + if i == j: + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return slice(i, j) + + def get_locs(self, seq): + """ + Get location for a sequence of labels. + + Parameters + ---------- + seq : label, slice, list, mask or a sequence of such + You should use one of the above for each level. + If a level should not be used, set it to ``slice(None)``. + + Returns + ------- + numpy.ndarray + NumPy array of integers suitable for passing to iloc. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). 
+ + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_locs('b') # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP + array([2], dtype=int64) + """ + from pandas.core.indexes.numeric import Int64Index + + # must be lexsorted to at least as many levels + true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] + if true_slices and true_slices[-1] >= self.lexsort_depth: + raise UnsortedIndexError( + "MultiIndex slicing requires the index to be lexsorted: slicing " + f"on levels {true_slices}, lexsort depth {self.lexsort_depth}" + ) + # indexer + # this is the list of all values that we want to select + n = len(self) + indexer = None + + def _convert_to_indexer(r): + # return an indexer + if isinstance(r, slice): + m = np.zeros(n, dtype=bool) + m[r] = True + r = m.nonzero()[0] + elif com.is_bool_indexer(r): + if len(r) != n: + raise ValueError( + "cannot index with a boolean indexer " + "that is not the same length as the " + "index" + ) + r = r.nonzero()[0] + return Int64Index(r) + + def _update_indexer(idxr, indexer=indexer): + if indexer is None: + indexer = Index(np.arange(n)) + if idxr is None: + return indexer + return indexer & idxr + + for i, k in enumerate(seq): + + if com.is_bool_indexer(k): + # a boolean indexer, must be the same length! + k = np.asarray(k) + indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) + + elif is_list_like(k): + # a collection of labels to include from this level (these + # are or'd) + indexers = None + for x in k: + try: + idxrs = _convert_to_indexer( + self._get_level_indexer(x, level=i, indexer=indexer) + ) + indexers = idxrs if indexers is None else indexers | idxrs + except KeyError: + + # ignore not founds + continue + + if indexers is not None: + indexer = _update_indexer(indexers, indexer=indexer) + else: + # no matches we are done + return Int64Index([])._ndarray_values + + elif com.is_null_slice(k): + # empty slice + indexer = _update_indexer(None, indexer=indexer) + + elif isinstance(k, slice): + + # a slice, include BOTH of the labels + indexer = _update_indexer( + _convert_to_indexer( + self._get_level_indexer(k, level=i, indexer=indexer) + ), + indexer=indexer, + ) + else: + # a single label + indexer = _update_indexer( + _convert_to_indexer( + self.get_loc_level(k, level=i, drop_level=False)[0] + ), + indexer=indexer, + ) + + # empty indexer + if indexer is None: + return Int64Index([])._ndarray_values + return indexer._ndarray_values + + def truncate(self, before=None, after=None): + """ + Slice index between two labels / tuples, return new MultiIndex + + Parameters + ---------- + before : label or tuple, can be partial. Default None + None defaults to start + after : label or tuple, can be partial. 
Default None + None defaults to end + + Returns + ------- + truncated : MultiIndex + """ + if after and before and after < before: + raise ValueError("after < before") + + i, j = self.levels[0].slice_locs(before, after) + left, right = self.slice_locs(before, after) + + new_levels = list(self.levels) + new_levels[0] = new_levels[0][i:j] + + new_codes = [level_codes[left:right] for level_codes in self.codes] + new_codes[0] = new_codes[0] - i + + return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) + + def equals(self, other) -> bool: + """ + Determines if two MultiIndex objects have the same labeling information + (the levels themselves do not necessarily have to be the same) + + See Also + -------- + equal_levels + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if not isinstance(other, MultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(other.dtype): + if self.nlevels != other.nlevels: + return False + + other_vals = com.values_from_object(ensure_index(other)) + return array_equivalent(self._ndarray_values, other_vals) + + if self.nlevels != other.nlevels: + return False + + if len(self) != len(other): + return False + + for i in range(self.nlevels): + self_codes = self.codes[i] + self_codes = self_codes[self_codes != -1] + self_values = algos.take_nd( + np.asarray(self.levels[i]._values), self_codes, allow_fill=False + ) + + other_codes = other.codes[i] + other_codes = other_codes[other_codes != -1] + other_values = algos.take_nd( + np.asarray(other.levels[i]._values), other_codes, allow_fill=False + ) + + # since we use NaT both datetime64 and timedelta64 + # we can have a situation where a level is typed say + # timedelta64 in self (IOW it has other values than NaT) + # but types datetime64 in other (where its all NaT) + # but these are equivalent + if len(self_values) == 0 and len(other_values) == 0: + continue + + if not array_equivalent(self_values, other_values): + return False + + return True + + def equal_levels(self, other): + """ + Return True if the levels of both MultiIndex objects are the same + + """ + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + if not self.levels[i].equals(other.levels[i]): + return False + return True + + def union(self, other, sort=None): + """ + Form the union of two MultiIndex objects + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + sort : False or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + + Returns + ------- + Index + + >>> index.union(index2) + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) + + if len(other) == 0 or self.equals(other): + return self + + # TODO: Index.union returns other when `len(self)` is 0. 
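`MultiIndex.equals` in the hunk above compares the realized tuples rather than the level objects themselves, and `union` rebuilds a new `MultiIndex` from the combined tuples, keeping the names when both sides agree. A minimal usage sketch of that public behaviour, separate from the diffed file and assuming only a stock pandas install (the index contents and names are made up for illustration):

```python
import pandas as pd

left = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)], names=["k1", "k2"])
right = pd.MultiIndex.from_tuples([("b", 1), ("c", 3)], names=["k1", "k2"])

# equals() only looks at the materialized tuples, so rebuilding the same
# tuples through a different constructor still compares equal.
print(left.equals(pd.MultiIndex.from_tuples(list(left))))  # True

# union() merges the tuples from both sides; because the names match,
# the result keeps names=['k1', 'k2'].
print(left.union(right))  # tuples ('a', 1), ('a', 2), ('b', 1), ('c', 3)
```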
+ + uniq_tuples = lib.fast_unique_multiple( + [self._ndarray_values, other._ndarray_values], sort=sort + ) + + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) + + def intersection(self, other, sort=False): + """ + Form the intersection of two MultiIndex objects. + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + sort : False or None, default False + Sort the resulting MultiIndex if possible + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``, to match + behaviour from before 0.24.0 + + Returns + ------- + Index + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) + + if self.equals(other): + return self + + self_tuples = self._ndarray_values + other_tuples = other._ndarray_values + uniq_tuples = set(self_tuples) & set(other_tuples) + + if sort is None: + uniq_tuples = sorted(uniq_tuples) + + if len(uniq_tuples) == 0: + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + else: + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) + + def difference(self, other, sort=None): + """ + Compute set difference of two MultiIndex objects + + Parameters + ---------- + other : MultiIndex + sort : False or None, default None + Sort the resulting MultiIndex if possible + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + + Returns + ------- + diff : MultiIndex + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_names = self._convert_can_do_setop(other) + + if len(other) == 0: + return self + + if self.equals(other): + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) + difference = this.values.take(label_diff) + if sort is None: + difference = sorted(difference) + + if len(difference) == 0: + return MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + else: + return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if is_categorical_dtype(dtype): + msg = "> 1 ndim Categorical are not supported at this time" + raise NotImplementedError(msg) + elif not is_object_dtype(dtype): + raise TypeError( + f"Setting {type(self)} dtype to anything other " + "than object is not supported" + ) + elif copy is True: + return self._shallow_copy() + return self + + def _convert_can_do_setop(self, other): + result_names = self.names + + if not hasattr(other, "names"): + if len(other) == 0: + other = MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + verify_integrity=False, + ) + else: + msg = "other must be a MultiIndex or a list of tuples" + try: + other = MultiIndex.from_tuples(other) + except TypeError: + raise TypeError(msg) + else: + result_names = self.names if self.names == other.names else None + return other, result_names + + def insert(self, loc, item): + """ + Make 
new MultiIndex inserting new item at location + + Parameters + ---------- + loc : int + item : tuple + Must be same length as number of levels in the MultiIndex + + Returns + ------- + new_index : Index + """ + # Pad the key with empty strings if lower levels of the key + # aren't specified: + if not isinstance(item, tuple): + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + + new_levels = [] + new_codes = [] + for k, level, level_codes in zip(item, self.levels, self.codes): + if k not in level: + # have to insert into level + # must insert at end otherwise you have to recompute all the + # other codes + lev_loc = len(level) + level = level.insert(lev_loc, k) + else: + lev_loc = level.get_loc(k) + + new_levels.append(level) + new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) + + def delete(self, loc): + """ + Make new index with passed location deleted + + Returns + ------- + new_index : MultiIndex + """ + new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) + + def _wrap_joined_index(self, joined, other): + names = self.names if self.names == other.names else None + return MultiIndex.from_tuples(joined, names=names) + + @Appender(Index.isin.__doc__) + def isin(self, values, level=None): + if level is None: + values = MultiIndex.from_tuples(values, names=self.names).values + return algos.isin(self.values, values) + else: + num = self._get_level_number(level) + levs = self.get_level_values(num) + + if levs.size == 0: + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) + + +MultiIndex._add_numeric_methods_disabled() +MultiIndex._add_numeric_methods_add_sub_disabled() +MultiIndex._add_logical_methods_disabled() + + +def _sparsify(label_list, start: int = 0, sentinel=""): + pivoted = list(zip(*label_list)) + k = len(label_list) + + result = pivoted[: start + 1] + prev = pivoted[start] + + for cur in pivoted[start + 1 :]: + sparse_cur = [] + + for i, (p, t) in enumerate(zip(prev, cur)): + if i == k - 1: + sparse_cur.append(t) + result.append(sparse_cur) + break + + if p == t: + sparse_cur.append(sentinel) + else: + sparse_cur.extend(cur[i:]) + result.append(sparse_cur) + break + + prev = cur + + return list(zip(*result)) + + +def _get_na_rep(dtype) -> str: + return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") + + +def maybe_droplevels(index, key): + """ + Attempt to drop level or levels from the given index. + + Parameters + ---------- + index: Index + key : scalar or tuple + + Returns + ------- + Index + """ + # drop levels + original_index = index + if isinstance(key, tuple): + for _ in key: + try: + index = index.droplevel(0) + except ValueError: + # we have dropped too much, so back out + return original_index + else: + try: + index = index.droplevel(0) + except ValueError: + pass + + return index + + +def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: + """ + Coerce the array_like indexer to the smallest integer dtype that can encode all + of the given categories. + + Parameters + ---------- + array_like : array-like + categories : array-like + copy : bool + + Returns + ------- + np.ndarray + Non-writeable. 
+ """ + array_like = coerce_indexer_dtype(array_like, categories) + if copy: + array_like = array_like.copy() + array_like.flags.writeable = False + return array_like diff --git a/venv/Lib/site-packages/pandas/core/indexes/numeric.py b/venv/Lib/site-packages/pandas/core/indexes/numeric.py new file mode 100644 index 0000000..b9b4428 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/numeric.py @@ -0,0 +1,531 @@ +import numpy as np + +from pandas._libs import index as libindex, lib +from pandas._typing import Dtype +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool, + is_bool_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer_dtype, + is_scalar, + is_signed_integer_dtype, + is_unsigned_integer_dtype, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.generic import ( + ABCFloat64Index, + ABCInt64Index, + ABCRangeIndex, + ABCSeries, + ABCUInt64Index, +) +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms +import pandas.core.common as com +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _index_shared_docs, + maybe_extract_name, +) +from pandas.core.ops import get_op_result_name + +_num_index_shared_docs = dict() + + +class NumericIndex(Index): + """ + Provide numeric type operations. + + This is an abstract class. + """ + + _is_numeric_dtype = True + + def __new__(cls, data=None, dtype=None, copy=False, name=None): + cls._validate_dtype(dtype) + + # Coerce to ndarray if not already ndarray or Index + if not isinstance(data, (np.ndarray, Index)): + if is_scalar(data): + raise cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + + data = np.asarray(data, dtype=dtype) + + if issubclass(data.dtype.type, str): + cls._string_data_error(data) + + if copy or not is_dtype_equal(data.dtype, cls._default_dtype): + subarr = np.array(data, dtype=cls._default_dtype, copy=copy) + cls._assert_safe_casting(data, subarr) + else: + subarr = data + + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + + name = maybe_extract_name(name, data, cls) + return cls._simple_new(subarr, name=name) + + @classmethod + def _validate_dtype(cls, dtype: Dtype) -> None: + if dtype is None: + return + validation_metadata = { + "int64index": (is_signed_integer_dtype, "signed integer"), + "uint64index": (is_unsigned_integer_dtype, "unsigned integer"), + "float64index": (is_float_dtype, "float"), + "rangeindex": (is_signed_integer_dtype, "signed integer"), + } + + validation_func, expected = validation_metadata[cls._typ] + if not validation_func(dtype): + raise ValueError( + f"Incorrect `dtype` passed: expected {expected}, received {dtype}" + ) + + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) + def _maybe_cast_slice_bound(self, label, side, kind): + assert kind in ["ix", "loc", "getitem", None] + + # we will try to coerce to integers + return self._maybe_cast_indexer(label) + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, values=None, **kwargs): + if values is not None and not self._can_hold_na: + # Ensure we are not returning an Int64Index with float data: + return self._shallow_copy_with_infer(values=values, **kwargs) + return super()._shallow_copy(values=values, **kwargs) + + def _convert_for_op(self, value): + 
""" + Convert value to be insertable to ndarray. + """ + if is_bool(value) or is_bool_dtype(value): + # force conversion to object + # so we don't lose the bools + raise TypeError + + return value + + def _convert_tolerance(self, tolerance, target): + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError("list-like tolerance size must match target index size") + if not np.issubdtype(tolerance.dtype, np.number): + if tolerance.ndim > 0: + raise ValueError( + f"tolerance argument for {type(self).__name__} must contain " + "numeric elements if it is list type" + ) + else: + raise ValueError( + f"tolerance argument for {type(self).__name__} must be numeric " + f"if it is a scalar: {repr(tolerance)}" + ) + return tolerance + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Subclasses need to override this only if the process of casting data + from some accepted dtype to the internal dtype(s) bears the risk of + truncation (e.g. float to int). + """ + pass + + def _concat_same_dtype(self, indexes, name): + result = type(indexes[0])(np.concatenate([x._values for x in indexes])) + return result.rename(name) + + @property + def is_all_dates(self) -> bool: + """ + Checks that all the labels are datetime objects. + """ + return False + + @Appender(Index.insert.__doc__) + def insert(self, loc, item): + # treat NA values as nans: + if is_scalar(item) and isna(item): + item = self._na_value + return super().insert(loc, item) + + def _union(self, other, sort): + # Right now, we treat union(int, float) a bit special. + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # We may change union(int, float) to go to object. + # float | [u]int -> float (the special case) + # | -> T + # | -> object + needs_cast = (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or ( + is_integer_dtype(other.dtype) and is_float_dtype(self.dtype) + ) + if needs_cast: + first = self.astype("float") + second = other.astype("float") + return first._union(second, sort) + else: + return super()._union(other, sort) + + +_num_index_shared_docs[ + "class_descr" +] = """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects. %(klass)s is a special case + of `Index` with purely %(ltype)s labels. %(extra)s. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: %(dtype)s) + copy : bool + Make a copy of input ndarray. + name : object + Name to be stored in the index. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + Index : The base pandas Index type. + + Notes + ----- + An Index instance can **only** contain hashable objects. +""" + +_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") + + +class IntegerIndex(NumericIndex): + """ + This is an abstract class for Int64Index, UInt64Index. + """ + + def __contains__(self, key) -> bool: + """ + Check if key is a float and has a decimal. If it has, return False. 
+ """ + hash(key) + try: + if is_float(key) and int(key) != key: + return False + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False + + +class Int64Index(IntegerIndex): + __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args + + _typ = "int64index" + _can_hold_na = False + _engine_type = libindex.Int64Engine + _default_dtype = np.int64 + + @property + def inferred_type(self) -> str: + """ + Always 'integer' for ``Int64Index`` + """ + return "integer" + + @property + def asi8(self) -> np.ndarray: + # do not cache or you'll create a memory leak + return self.values.view("i8") + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ["ix", "loc", "getitem", "iloc", None] + + # don't coerce ilocs to integers + if kind != "iloc": + key = self._maybe_cast_indexer(key) + return super()._convert_scalar_indexer(key, kind=kind) + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Int64Index(joined, name=name) + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as ints. + """ + if not issubclass(data.dtype.type, np.signedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _is_compatible_with_other(self, other): + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) + for obj in [self, other] + ) + + +Int64Index._add_numeric_methods() +Int64Index._add_logical_methods() + +_uint64_descr_args = dict( + klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" +) + + +class UInt64Index(IntegerIndex): + __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args + + _typ = "uint64index" + _can_hold_na = False + _engine_type = libindex.UInt64Engine + _default_dtype = np.uint64 + + @property + def inferred_type(self) -> str: + """ + Always 'integer' for ``UInt64Index`` + """ + return "integer" + + @property + def asi8(self) -> np.ndarray: + # do not cache or you'll create a memory leak + return self.values.view("u8") + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ["ix", "loc", "getitem", "iloc", None] + + # don't coerce ilocs to integers + if kind != "iloc": + key = self._maybe_cast_indexer(key) + return super()._convert_scalar_indexer(key, kind=kind) + + @Appender(_index_shared_docs["_convert_arr_indexer"]) + def _convert_arr_indexer(self, keyarr): + # Cast the indexer to uint64 if possible so that the values returned + # from indexing are also uint64. + dtype = None + if is_integer_dtype(keyarr) or ( + lib.infer_dtype(keyarr, skipna=False) == "integer" + ): + dtype = np.uint64 + + return com.asarray_tuplesafe(keyarr, dtype=dtype) + + @Appender(_index_shared_docs["_convert_index_indexer"]) + def _convert_index_indexer(self, keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. + if keyarr.is_integer(): + return keyarr.astype(np.uint64) + return keyarr + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return UInt64Index(joined, name=name) + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as uints. 
+ """ + if not issubclass(data.dtype.type, np.unsignedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _is_compatible_with_other(self, other): + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCUInt64Index, ABCFloat64Index)) + for obj in [self, other] + ) + + +UInt64Index._add_numeric_methods() +UInt64Index._add_logical_methods() + +_float64_descr_args = dict( + klass="Float64Index", dtype="float64", ltype="float", extra="" +) + + +class Float64Index(NumericIndex): + __doc__ = _num_index_shared_docs["class_descr"] % _float64_descr_args + + _typ = "float64index" + _engine_type = libindex.Float64Engine + _default_dtype = np.float64 + + @property + def inferred_type(self) -> str: + """ + Always 'floating' for ``Float64Index`` + """ + return "floating" + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if needs_i8_conversion(dtype): + raise TypeError( + f"Cannot convert Float64Index to dtype {dtype}; integer " + "values are required for conversion" + ) + elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype): + # TODO(jreback); this can change once we have an EA Index type + # GH 13149 + arr = astype_nansafe(self.values, dtype=dtype) + return Int64Index(arr) + return super().astype(dtype, copy=copy) + + @Appender(_index_shared_docs["_convert_scalar_indexer"]) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ["ix", "loc", "getitem", "iloc", None] + + if kind == "iloc": + return self._validate_indexer("positional", key, kind) + + return key + + @Appender(_index_shared_docs["_convert_slice_indexer"]) + def _convert_slice_indexer(self, key, kind=None): + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key + + if kind == "iloc": + return super()._convert_slice_indexer(key, kind=kind) + + # translate to locations + return self.slice_indexer(key.start, key.stop, key.step, kind=kind) + + def _format_native_types( + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs + ): + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + self.values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + return formatter.get_result_as_array() + + def get_value(self, series, key): + """ + We always want to get an index value, never a value. + """ + if not is_scalar(key): + raise InvalidIndexError + + k = com.values_from_object(key) + loc = self.get_loc(k) + new_values = com.values_from_object(series)[loc] + + return new_values + + def equals(self, other) -> bool: + """ + Determines if two Index objects contain the same elements. 
+ """ + if self is other: + return True + + if not isinstance(other, Index): + return False + + # need to compare nans locations and make sure that they are the same + # since nans don't compare equal this is a bit tricky + try: + if not isinstance(other, Float64Index): + other = self._constructor(other) + if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: + return False + left, right = self._ndarray_values, other._ndarray_values + return ((left == right) | (self._isnan & other._isnan)).all() + except (TypeError, ValueError): + return False + + def __contains__(self, other) -> bool: + if super().__contains__(other): + return True + + try: + # if other is a sequence this throws a ValueError + return np.isnan(other) and self.hasnans + except ValueError: + try: + return len(other) <= 1 and other.item() in self + except AttributeError: + return len(other) <= 1 and other in self + except TypeError: + pass + except TypeError: + pass + + return False + + @Appender(_index_shared_docs["get_loc"]) + def get_loc(self, key, method=None, tolerance=None): + try: + if np.all(np.isnan(key)) or is_bool(key): + nan_idxs = self._nan_idxs + try: + return nan_idxs.item() + except ValueError: + if not len(nan_idxs): + raise KeyError(key) + return nan_idxs + except (TypeError, NotImplementedError): + pass + return super().get_loc(key, method=method, tolerance=tolerance) + + @cache_readonly + def is_unique(self) -> bool: + return super().is_unique and self._nan_idxs.size < 2 + + @Appender(Index.isin.__doc__) + def isin(self, values, level=None): + if level is not None: + self._validate_index_level(level) + return algorithms.isin(np.array(self), values) + + def _is_compatible_with_other(self, other): + return super()._is_compatible_with_other(other) or all( + isinstance( + type(obj), + (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + ) + for obj in [self, other] + ) + + +Float64Index._add_numeric_methods() +Float64Index._add_logical_methods_disabled() diff --git a/venv/Lib/site-packages/pandas/core/indexes/period.py b/venv/Lib/site-packages/pandas/core/indexes/period.py new file mode 100644 index 0000000..6877cf0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/period.py @@ -0,0 +1,915 @@ +from datetime import datetime, timedelta +import weakref + +import numpy as np + +from pandas._libs import index as libindex +from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution +from pandas._libs.tslibs.period import Period +from pandas.util._decorators import Appender, Substitution, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_bool_dtype, + is_datetime64_any_dtype, + is_dtype_equal, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_object_dtype, + pandas_dtype, +) + +from pandas.core.accessor import delegate_names +from pandas.core.arrays.period import ( + PeriodArray, + period_array, + raise_on_incompatible, + validate_dtype_freq, +) +from pandas.core.base import _shared_docs +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + _index_shared_docs, + ensure_index, + maybe_extract_name, +) +from pandas.core.indexes.datetimelike import ( + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, +) +from pandas.core.indexes.datetimes import DatetimeIndex, Index +from pandas.core.indexes.numeric import Int64Index +from pandas.core.missing import isna +from pandas.core.ops import get_op_result_name +from 
pandas.core.tools.datetimes import DateParseError, parse_time_string + +from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) + + +# --- Period index sketch + + +def _new_PeriodIndex(cls, **d): + # GH13277 for unpickling + values = d.pop("data") + if values.dtype == "int64": + freq = d.pop("freq", None) + values = PeriodArray(values, freq=freq) + return cls._simple_new(values, **d) + else: + return cls(values, **d) + + +class PeriodDelegateMixin(DatetimelikeDelegateMixin): + """ + Delegate from PeriodIndex to PeriodArray. + """ + + _raw_methods = {"_format_native_types"} + _raw_properties = {"is_leap_year", "freq"} + + _delegated_properties = PeriodArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = set(PeriodArray._datetimelike_methods) | _raw_methods + + +@delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") +@delegate_names( + PeriodArray, PeriodDelegateMixin._delegated_methods, typ="method", overwrite=True +) +class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): + """ + Immutable ndarray holding ordinal values indicating regular periods in time. + + Index keys are boxed to Period objects which carries the metadata (eg, + frequency information). + + Parameters + ---------- + data : array-like (1d int np.ndarray or PeriodArray), optional + Optional period-like data to construct index with. + copy : bool + Make a copy of input ndarray. + freq : str or period object, optional + One of pandas period strings or corresponding objects + year : int, array, or Series, default None + month : int, array, or Series, default None + quarter : int, array, or Series, default None + day : int, array, or Series, default None + hour : int, array, or Series, default None + minute : int, array, or Series, default None + second : int, array, or Series, default None + tz : object, default None + Timezone for converting datetime64 data to Periods. + dtype : str or PeriodDtype, default None + + Attributes + ---------- + day + dayofweek + dayofyear + days_in_month + daysinmonth + end_time + freq + freqstr + hour + is_leap_year + minute + month + quarter + qyear + second + start_time + week + weekday + weekofyear + year + + Methods + ------- + asfreq + strftime + to_timestamp + + See Also + -------- + Index : The base pandas Index type. + Period : Represents a period of time. + DatetimeIndex : Index with datetime64 data. + TimedeltaIndex : Index of timedelta64 data. + period_range : Create a fixed-frequency PeriodIndex. 
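The class docstring above describes a `PeriodIndex` as ordinal integers boxed into `Period` objects, with the frequency carried as metadata. A small, illustrative construction sketch (not part of the diffed file; it assumes only a stock pandas install):

```python
import pandas as pd

# Two equivalent ways to build a monthly PeriodIndex.
pi = pd.PeriodIndex(["2020-01", "2020-02", "2020-03"], freq="M")
rng = pd.period_range(start="2020-01", periods=3, freq="M")

print(pi.equals(rng))        # True: same ordinals and the same frequency
print(pi[0], pi[0].ordinal)  # Period('2020-01', 'M') and its integer ordinal
print(pi.to_timestamp())     # DatetimeIndex of the period start times
```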
+ + Examples + -------- + >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr) + """ + + _typ = "periodindex" + _attributes = ["name", "freq"] + + # define my properties & methods for delegation + _is_numeric_dtype = False + _infer_as_myclass = True + + _data: PeriodArray + + _engine_type = libindex.PeriodEngine + _supports_partial_string_indexing = True + + # ------------------------------------------------------------------------ + # Index Constructors + + def __new__( + cls, + data=None, + ordinal=None, + freq=None, + tz=None, + dtype=None, + copy=False, + name=None, + **fields, + ): + + valid_field_set = { + "year", + "month", + "day", + "quarter", + "hour", + "minute", + "second", + } + + if not set(fields).issubset(valid_field_set): + argument = list(set(fields) - valid_field_set)[0] + raise TypeError(f"__new__() got an unexpected keyword argument {argument}") + + name = maybe_extract_name(name, data, cls) + + if data is None and ordinal is None: + # range-based. + data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields) + # PeriodArray._generate range does validation that fields is + # empty when really using the range-based constructor. + freq = freq2 + + data = PeriodArray(data, freq=freq) + else: + freq = validate_dtype_freq(dtype, freq) + + # PeriodIndex allow PeriodIndex(period_index, freq=different) + # Let's not encourage that kind of behavior in PeriodArray. + + if freq and isinstance(data, cls) and data.freq != freq: + # TODO: We can do some of these with no-copy / coercion? + # e.g. D -> 2D seems to be OK + data = data.asfreq(freq) + + if data is None and ordinal is not None: + # we strangely ignore `ordinal` if data is passed. + ordinal = np.asarray(ordinal, dtype=np.int64) + data = PeriodArray(ordinal, freq) + else: + # don't pass copy here, since we copy later. + data = period_array(data=data, freq=freq) + + if copy: + data = data.copy() + + return cls._simple_new(data, name=name) + + @classmethod + def _simple_new(cls, values, name=None, freq=None, **kwargs): + """ + Create a new PeriodIndex. + + Parameters + ---------- + values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64] + Values that can be converted to a PeriodArray without inference + or coercion. + + """ + # TODO: raising on floats is tested, but maybe not useful. + # Should the callers know not to pass floats? + # At the very least, I think we can ensure that lists aren't passed. + if isinstance(values, list): + values = np.asarray(values) + if is_float_dtype(values): + raise TypeError("PeriodIndex._simple_new does not accept floats.") + if freq: + freq = Period._maybe_convert_freq(freq) + values = PeriodArray(values, freq=freq) + + if not isinstance(values, PeriodArray): + raise TypeError("PeriodIndex._simple_new only accepts PeriodArray") + result = object.__new__(cls) + result._data = values + # For groupby perf. 
See note in indexes/base about _index_data + result._index_data = values._data + result.name = name + result._reset_identity() + return result + + # ------------------------------------------------------------------------ + # Data + + @property + def values(self): + return np.asarray(self) + + @property + def _has_complex_internals(self): + # used to avoid libreduction code paths, which raise or require conversion + return True + + def _shallow_copy(self, values=None, **kwargs): + # TODO: simplify, figure out type of values + if values is None: + values = self._data + + if isinstance(values, type(self)): + values = values._data + + if not isinstance(values, PeriodArray): + if isinstance(values, np.ndarray) and values.dtype == "i8": + values = PeriodArray(values, freq=self.freq) + else: + # GH#30713 this should never be reached + raise TypeError(type(values), getattr(values, "dtype", None)) + + # We don't allow changing `freq` in _shallow_copy. + validate_dtype_freq(self.dtype, kwargs.get("freq")) + attributes = self._get_attributes_dict() + + attributes.update(kwargs) + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype + return self._simple_new(values, **attributes) + + def _shallow_copy_with_infer(self, values=None, **kwargs): + """ we always want to return a PeriodIndex """ + return self._shallow_copy(values=values, **kwargs) + + @property + def _box_func(self): + """Maybe box an ordinal or Period""" + # TODO(DatetimeArray): Avoid double-boxing + # PeriodArray takes care of boxing already, so we need to check + # whether we're given an ordinal or a Period. It seems like some + # places outside of indexes/period.py are calling this _box_func, + # but passing data that's already boxed. + def func(x): + if isinstance(x, Period) or x is NaT: + return x + else: + return Period._from_ordinal(ordinal=x, freq=self.freq) + + return func + + def _maybe_convert_timedelta(self, other): + """ + Convert timedelta-like input to an integer multiple of self.freq + + Parameters + ---------- + other : timedelta, np.timedelta64, DateOffset, int, np.ndarray + + Returns + ------- + converted : int, np.ndarray[int64] + + Raises + ------ + IncompatibleFrequency : if the input cannot be written as a multiple + of self.freq. Note IncompatibleFrequency subclasses ValueError. 
+ """ + if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)): + offset = frequencies.to_offset(self.freq.rule_code) + if isinstance(offset, Tick): + # _check_timedeltalike_freq_compat will raise if incompatible + delta = self._data._check_timedeltalike_freq_compat(other) + return delta + elif isinstance(other, DateOffset): + freqstr = other.rule_code + base = libfrequencies.get_base_alias(freqstr) + if base == self.freq.rule_code: + return other.n + + raise raise_on_incompatible(self, other) + elif is_integer(other): + # integer is passed to .shift via + # _add_datetimelike_methods basically + # but ufunc may pass integer to _add_delta + return other + + # raise when input doesn't have freq + raise raise_on_incompatible(self, None) + + # ------------------------------------------------------------------------ + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.astype(object).values + + @property + def _formatter_func(self): + return self.array._formatter(boxed=False) + + # ------------------------------------------------------------------------ + # Indexing + + @cache_readonly + def _engine(self): + # To avoid a reference cycle, pass a weakref of self to _engine_type. + period = weakref.ref(self) + return self._engine_type(period, len(self)) + + @Appender(_index_shared_docs["contains"]) + def __contains__(self, key) -> bool: + if isinstance(key, Period): + if key.freq != self.freq: + return False + else: + return key.ordinal in self._engine + else: + try: + self.get_loc(key) + return True + except (TypeError, KeyError): + # TypeError can be reached if we pass a tuple that is not hashable + return False + + @cache_readonly + def _int64index(self): + return Int64Index._simple_new(self.asi8, name=self.name) + + # ------------------------------------------------------------------------ + # Index Methods + + def __array__(self, dtype=None) -> np.ndarray: + if is_integer_dtype(dtype): + return self.asi8 + else: + return self.astype(object).values + + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc. Needs additional handling as + PeriodIndex stores internal data as int dtype + + Replace this to __numpy_ufunc__ in future version + """ + if isinstance(context, tuple) and len(context) > 0: + func = context[0] + if func is np.add: + pass + elif func is np.subtract: + name = self.name + left = context[1][0] + right = context[1][1] + if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex): + name = left.name if left.name == right.name else None + return Index(result, name=name) + elif isinstance(left, Period) or isinstance(right, Period): + return Index(result, name=name) + elif isinstance(func, np.ufunc): + if "M->M" not in func.types: + msg = f"ufunc '{func.__name__}' not supported for the PeriodIndex" + # This should be TypeError, but TypeError cannot be raised + # from here because numpy catches. 
+ raise ValueError(msg) + + if is_bool_dtype(result): + return result + # the result is object dtype array of Period + # cannot pass _simple_new as it is + return type(self)(result, freq=self.freq, name=self.name) + + def asof_locs(self, where, mask): + """ + where : array of timestamps + mask : array of booleans where data is not NA + + """ + where_idx = where + if isinstance(where_idx, DatetimeIndex): + where_idx = PeriodIndex(where_idx.values, freq=self.freq) + + locs = self._ndarray_values[mask].searchsorted( + where_idx._ndarray_values, side="right" + ) + + locs = np.where(locs > 0, locs - 1, 0) + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[ + (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first]) + ] = -1 + + return result + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True, how="start"): + dtype = pandas_dtype(dtype) + + if is_datetime64_any_dtype(dtype): + # 'how' is index-specific, isn't part of the EA interface. + tz = getattr(dtype, "tz", None) + return self.to_timestamp(how=how).tz_localize(tz) + + # TODO: should probably raise on `how` here, so we don't ignore it. + return super().astype(dtype, copy=copy) + + @Substitution(klass="PeriodIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, Period) or value is NaT: + self._data._check_compatible_with(value) + elif isinstance(value, str): + try: + value = Period(value, freq=self.freq) + except DateParseError: + raise KeyError(f"Cannot interpret '{value}' as period") + elif not isinstance(value, PeriodArray): + raise TypeError( + "PeriodIndex.searchsorted requires either a Period or PeriodArray" + ) + + return self._data.searchsorted(value, side=side, sorter=sorter) + + @property + def is_full(self) -> bool: + """ + Returns True if this PeriodIndex is range-like in that all Periods + between start and end are present, in order. + """ + if len(self) == 0: + return True + if not self.is_monotonic: + raise ValueError("Index is not monotonic") + values = self.asi8 + return ((values[1:] - values[:-1]) < 2).all() + + @property + def inferred_type(self) -> str: + # b/c data is represented as ints make sure we can't have ambiguous + # indexing + return "period" + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. 
Only use this if you + know what you're doing + """ + s = com.values_from_object(series) + try: + value = super().get_value(s, key) + except (KeyError, IndexError): + if isinstance(key, str): + asdt, parsed, reso = parse_time_string(key, self.freq) + grp = resolution.Resolution.get_freq_group(reso) + freqn = resolution.get_freq_group(self.freq) + + vals = self._ndarray_values + + # if our data is higher resolution than requested key, slice + if grp < freqn: + iv = Period(asdt, freq=(grp, 1)) + ord1 = iv.asfreq(self.freq, how="S").ordinal + ord2 = iv.asfreq(self.freq, how="E").ordinal + + if ord2 < vals[0] or ord1 > vals[-1]: + raise KeyError(key) + + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) + key = slice(pos[0], pos[1] + 1) + return series[key] + elif grp == freqn: + key = Period(asdt, freq=self.freq).ordinal + return com.maybe_box( + self, self._int64index.get_value(s, key), series, key + ) + else: + raise KeyError(key) + + period = Period(key, self.freq) + key = period.value if isna(period) else period.ordinal + return com.maybe_box(self, self._int64index.get_value(s, key), series, key) + else: + return com.maybe_box(self, value, series, key) + + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + target = ensure_index(target) + + if isinstance(target, PeriodIndex): + if target.freq != self.freq: + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + + target = target.asi8 + self_index = self._int64index + else: + self_index = self + + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) + return Index.get_indexer(self_index, target, method, limit, tolerance) + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + + if isinstance(target, PeriodIndex): + if target.freq != self.freq: + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + target = target.asi8 + + indexer, missing = self._int64index.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + + def get_loc(self, key, method=None, tolerance=None): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + try: + return self._engine.get_loc(key) + except KeyError: + if is_integer(key): + raise + + try: + asdt, parsed, reso = parse_time_string(key, self.freq) + key = asdt + except TypeError: + pass + except DateParseError: + # A string with invalid format + raise KeyError(f"Cannot interpret '{key}' as period") + + try: + key = Period(key, freq=self.freq) + except ValueError: + # we cannot construct the Period + # as we have an invalid type + raise KeyError(key) + + try: + ordinal = iNaT if key is NaT else key.ordinal + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + return self._int64index.get_loc(ordinal, method, tolerance) + + except KeyError: + raise KeyError(key) + + def _maybe_cast_slice_bound(self, label, side, kind): + """ + If label is a string or a datetime, cast it to Period.ordinal according + to resolution. + + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + bound : Period or object + + Notes + ----- + Value of `side` parameter should be validated in caller. 
+ + """ + assert kind in ["ix", "loc", "getitem"] + + if isinstance(label, datetime): + return Period(label, freq=self.freq) + elif isinstance(label, str): + try: + _, parsed, reso = parse_time_string(label, self.freq) + bounds = self._parsed_string_to_bounds(reso, parsed) + return bounds[0 if side == "left" else 1] + except ValueError: + # string cannot be parsed as datetime-like + # TODO: we need tests for this case + raise KeyError(label) + elif is_integer(label) or is_float(label): + self._invalid_indexer("slice", label) + + return label + + def _parsed_string_to_bounds(self, reso, parsed): + if reso == "year": + t1 = Period(year=parsed.year, freq="A") + elif reso == "month": + t1 = Period(year=parsed.year, month=parsed.month, freq="M") + elif reso == "quarter": + q = (parsed.month - 1) // 3 + 1 + t1 = Period(year=parsed.year, quarter=q, freq="Q-DEC") + elif reso == "day": + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq="D") + elif reso == "hour": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + freq="H", + ) + elif reso == "minute": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + freq="T", + ) + elif reso == "second": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + second=parsed.second, + freq="S", + ) + else: + raise KeyError(reso) + return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) + + def _get_string_slice(self, key): + if not self.is_monotonic: + raise ValueError("Partial indexing only valid for ordered time series") + + key, parsed, reso = parse_time_string(key, self.freq) + grp = resolution.Resolution.get_freq_group(reso) + freqn = resolution.get_freq_group(self.freq) + if reso in ["day", "hour", "minute", "second"] and not grp < freqn: + raise KeyError(key) + + t1, t2 = self._parsed_string_to_bounds(reso, parsed) + return slice( + self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") + ) + + def _convert_tolerance(self, tolerance, target): + tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError("list-like tolerance size must match target index size") + return self._maybe_convert_timedelta(tolerance) + + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.astype(object).insert(loc, item) + + idx = np.concatenate( + (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) + ) + return self._shallow_copy(idx) + + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + if not isinstance(other, PeriodIndex): + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + if return_indexers: + result, lidx, ridx = result + return self._apply_meta(result), lidx, ridx + return self._apply_meta(result) + + # ------------------------------------------------------------------------ + # Set Operation Methods + + def _assert_can_do_setop(self, other): + super()._assert_can_do_setop(other) + + # *Can't* use PeriodIndexes of different freqs + # *Can* use PeriodIndex/DatetimeIndex + if isinstance(other, 
PeriodIndex) and self.freq != other.freq: + raise raise_on_incompatible(self, other) + + def intersection(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if not is_dtype_equal(self.dtype, other.dtype): + # TODO: fastpath for if we have a different PeriodDtype + this = self.astype("O") + other = other.astype("O") + return this.intersection(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.intersection(i8other, sort=sort) + + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result + + def difference(self, other, sort=None): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) + + if self.equals(other): + # pass an empty PeriodArray with the appropriate dtype + return self._shallow_copy(self._data[:0]) + + if is_object_dtype(other): + return self.astype(object).difference(other).astype(self.dtype) + + elif not is_dtype_equal(self.dtype, other.dtype): + return self + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.difference(i8other, sort=sort) + + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result + + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this._union(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self._union(i8other, sort=sort) + + res_name = get_op_result_name(self, other) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result + + # ------------------------------------------------------------------------ + + def _apply_meta(self, rawarr): + if not isinstance(rawarr, PeriodIndex): + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) + return rawarr + + def memory_usage(self, deep=False): + result = super().memory_usage(deep=deep) + if hasattr(self, "_cache") and "_int64index" in self._cache: + result += self._int64index.memory_usage(deep=deep) + return result + + +PeriodIndex._add_numeric_methods_disabled() +PeriodIndex._add_logical_methods_disabled() + + +def period_range( + start=None, end=None, periods=None, freq=None, name=None +) -> PeriodIndex: + """ + Return a fixed frequency PeriodIndex. + + The day (calendar) is the default frequency. + + Parameters + ---------- + start : str or period-like, default None + Left bound for generating periods. + end : str or period-like, default None + Right bound for generating periods. + periods : int, default None + Number of periods to generate. + freq : str or DateOffset, optional + Frequency alias. By default the freq is taken from `start` or `end` + if those are Period objects. Otherwise, the default is ``"D"`` for + daily frequency. + name : str, default None + Name of the resulting PeriodIndex. 
+ + Returns + ------- + PeriodIndex + + Notes + ----- + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + + >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', + '2017-06', '2017-06', '2017-07', '2017-08', '2017-09', + '2017-10', '2017-11', '2017-12', '2018-01'], + dtype='period[M]', freq='M') + + If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor + endpoints for a ``PeriodIndex`` with frequency matching that of the + ``period_range`` constructor. + + >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), + ... end=pd.Period('2017Q2', freq='Q'), freq='M') + PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], + dtype='period[M]', freq='M') + """ + if com.count_not_none(start, end, periods) != 2: + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): + freq = "D" + + data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={}) + data = PeriodArray(data, freq=freq) + return PeriodIndex(data, name=name) diff --git a/venv/Lib/site-packages/pandas/core/indexes/range.py b/venv/Lib/site-packages/pandas/core/indexes/range.py new file mode 100644 index 0000000..b4cc71a --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/range.py @@ -0,0 +1,811 @@ +from datetime import timedelta +import operator +from sys import getsizeof +from typing import Optional, Union +import warnings + +import numpy as np + +from pandas._libs import index as libindex +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_platform_int, + ensure_python_int, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCTimedeltaIndex + +from pandas.core import ops +import pandas.core.common as com +from pandas.core.construction import extract_array +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops.common import unpack_zerodim_and_defer + +from pandas.io.formats.printing import pprint_thing + + +class RangeIndex(Int64Index): + """ + Immutable Index implementing a monotonic integer range. + + RangeIndex is a memory-saving special case of Int64Index limited to + representing monotonic ranges. Using RangeIndex may in some instances + improve computing speed. + + This is the default index type used + by DataFrame and Series when no explicit index is provided by the user. + + Parameters + ---------- + start : int (default: 0), or other RangeIndex instance + If int and "stop" is not given, interpreted as "stop" instead. + stop : int (default: 0) + step : int (default: 1) + name : object, optional + Name to be stored in the index. + copy : bool, default False + Unused, accepted for homogeneity with other index types. + + Attributes + ---------- + start + stop + step + + Methods + ------- + from_range + + See Also + -------- + Index : The base pandas Index type. + Int64Index : Index of int64 data. 
+ """ + + _typ = "rangeindex" + _engine_type = libindex.Int64Engine + _range: range + + # check whether self._data has been called + _cached_data: Optional[np.ndarray] = None + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, + ): + + cls._validate_dtype(dtype) + name = maybe_extract_name(name, start, cls) + + # RangeIndex + if isinstance(start, RangeIndex): + start = start._range + return cls._simple_new(start, dtype=dtype, name=name) + + # validate the arguments + if com.all_none(start, stop, step): + raise TypeError("RangeIndex(...) must be called with integers") + + start = ensure_python_int(start) if start is not None else 0 + + if stop is None: + start, stop = 0, start + else: + stop = ensure_python_int(stop) + + step = ensure_python_int(step) if step is not None else 1 + if step == 0: + raise ValueError("Step must not be zero") + + rng = range(start, stop, step) + return cls._simple_new(rng, dtype=dtype, name=name) + + @classmethod + def from_range(cls, data, name=None, dtype=None): + """ + Create RangeIndex from a range object. + + Returns + ------- + RangeIndex + """ + if not isinstance(data, range): + raise TypeError( + f"{cls.__name__}(...) must be called with object coercible to a " + f"range, {repr(data)} was passed" + ) + + cls._validate_dtype(dtype) + return cls._simple_new(data, dtype=dtype, name=name) + + @classmethod + def _simple_new(cls, values, name=None, dtype=None): + result = object.__new__(cls) + + # handle passed None, non-integers + if values is None: + # empty + values = range(0, 0, 1) + elif not isinstance(values, range): + return Index(values, dtype=dtype, name=name) + + result._range = values + result.name = name + + result._reset_identity() + return result + + # -------------------------------------------------------------------- + + @cache_readonly + def _constructor(self): + """ return the class to use for construction """ + return Int64Index + + @property + def _data(self): + """ + An int array that for performance reasons is created only when needed. + + The constructed array is saved in ``_cached_data``. This allows us to + check if the array has been created without accessing ``_data`` and + triggering the construction. 
+ """ + if self._cached_data is None: + self._cached_data = np.arange( + self.start, self.stop, self.step, dtype=np.int64 + ) + return self._cached_data + + @cache_readonly + def _int64index(self): + return Int64Index._simple_new(self._data, name=self.name) + + def _get_data_as_items(self): + """ return a list of tuples of start, stop, step """ + rng = self._range + return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] + + def __reduce__(self): + d = self._get_attributes_dict() + d.update(dict(self._get_data_as_items())) + return ibase._new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_attrs(self): + """ + Return a list of tuples of the (attr, formatted_value) + """ + attrs = self._get_data_as_items() + if self.name is not None: + attrs.append(("name", ibase.default_pprint(self.name))) + return attrs + + def _format_data(self, name=None): + # we are formatting thru the attributes + return None + + def _format_with_header(self, header, na_rep="NaN", **kwargs): + return header + list(map(pprint_thing, self._range)) + + # -------------------------------------------------------------------- + _deprecation_message = ( + "RangeIndex.{} is deprecated and will be " + "removed in a future version. Use RangeIndex.{} " + "instead" + ) + + @cache_readonly + def start(self): + """ + The value of the `start` parameter (``0`` if this was not supplied). + """ + # GH 25710 + return self._range.start + + @property + def _start(self): + """ + The value of the `start` parameter (``0`` if this was not supplied). + + .. deprecated:: 0.25.0 + Use ``start`` instead. + """ + warnings.warn( + self._deprecation_message.format("_start", "start"), + FutureWarning, + stacklevel=2, + ) + return self.start + + @cache_readonly + def stop(self): + """ + The value of the `stop` parameter. + """ + return self._range.stop + + @property + def _stop(self): + """ + The value of the `stop` parameter. + + .. deprecated:: 0.25.0 + Use ``stop`` instead. + """ + # GH 25710 + warnings.warn( + self._deprecation_message.format("_stop", "stop"), + FutureWarning, + stacklevel=2, + ) + return self.stop + + @cache_readonly + def step(self): + """ + The value of the `step` parameter (``1`` if this was not supplied). + """ + # GH 25710 + return self._range.step + + @property + def _step(self): + """ + The value of the `step` parameter (``1`` if this was not supplied). + + .. deprecated:: 0.25.0 + Use ``step`` instead. + """ + # GH 25710 + warnings.warn( + self._deprecation_message.format("_step", "step"), + FutureWarning, + stacklevel=2, + ) + return self.step + + @cache_readonly + def nbytes(self) -> int: + """ + Return the number of bytes in the underlying data. 
+ """ + rng = self._range + return getsizeof(rng) + sum( + getsizeof(getattr(rng, attr_name)) + for attr_name in ["start", "stop", "step"] + ) + + def memory_usage(self, deep: bool = False) -> int: + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self.nbytes + + @property + def dtype(self) -> np.dtype: + return np.dtype(np.int64) + + @property + def is_unique(self) -> bool: + """ return if the index has unique values """ + return True + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + return self._range.step > 0 or len(self) <= 1 + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + return self._range.step < 0 or len(self) <= 1 + + @property + def has_duplicates(self) -> bool: + return False + + def __contains__(self, key: Union[int, np.integer]) -> bool: + hash(key) + try: + key = ensure_python_int(key) + except TypeError: + return False + return key in self._range + + @Appender(_index_shared_docs["get_loc"]) + def get_loc(self, key, method=None, tolerance=None): + if is_integer(key) and method is None and tolerance is None: + new_key = int(key) + try: + return self._range.index(new_key) + except ValueError: + raise KeyError(key) + return super().get_loc(key, method=method, tolerance=tolerance) + + @Appender(_index_shared_docs["get_indexer"]) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + if com.any_not_none(method, tolerance, limit) or not is_list_like(target): + return super().get_indexer( + target, method=method, tolerance=tolerance, limit=limit + ) + + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # GH 28678: work on reversed range for simplicity + reverse = self._range[::-1] + start, stop, step = reverse.start, reverse.stop, reverse.step + + target_array = np.asarray(target) + if not (is_integer_dtype(target_array) and target_array.ndim == 1): + # checks/conversions/roundings are delegated to general method + return super().get_indexer(target, method=method, tolerance=tolerance) + + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step + + if step != self.step: + # We reversed this range: transform to original locs + locs[valid] = len(self) - 1 - locs[valid] + return ensure_platform_int(locs) + + def tolist(self): + return list(self._range) + + @Appender(_index_shared_docs["_shallow_copy"]) + def _shallow_copy(self, values=None, **kwargs): + if values is None: + name = kwargs.get("name", self.name) + return self._simple_new(self._range, name=name) + else: + kwargs.setdefault("name", self.name) + return self._int64index._shallow_copy(values, **kwargs) + + @Appender(ibase._index_shared_docs["copy"]) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + self._validate_dtype(dtype) + if name is None: + name = self.name + return self.from_range(self._range, name=name) + + def _minmax(self, meth): + no_steps = len(self) - 1 + if no_steps == -1: + return np.nan + elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): + return self.start + + return self.start + self.step * no_steps + + def min(self, axis=None, skipna=True, 
*args, **kwargs): + """The minimum value of the RangeIndex""" + nv.validate_minmax_axis(axis) + nv.validate_min(args, kwargs) + return self._minmax("min") + + def max(self, axis=None, skipna=True, *args, **kwargs): + """The maximum value of the RangeIndex""" + nv.validate_minmax_axis(axis) + nv.validate_max(args, kwargs) + return self._minmax("max") + + def argsort(self, *args, **kwargs): + """ + Returns the indices that would sort the index and its + underlying data. + + Returns + ------- + argsorted : numpy array + + See Also + -------- + numpy.ndarray.argsort + """ + nv.validate_argsort(args, kwargs) + + if self._range.step > 0: + return np.arange(len(self)) + else: + return np.arange(len(self) - 1, -1, -1) + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if isinstance(other, RangeIndex): + return self._range == other._range + return super().equals(other) + + def intersection(self, other, sort=False): + """ + Form the intersection of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Sort the resulting index if possible + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + Returns + ------- + intersection : Index + """ + self._validate_sort_keyword(sort) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if not isinstance(other, RangeIndex): + return super().intersection(other, sort=sort) + + if not len(self) or not len(other): + return self._simple_new(None) + + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) + if int_high <= int_low: + return self._simple_new(None) + + # Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use + # cheaper alternative + gcd, s, t = self._extended_gcd(first.step, second.step) + + # check whether element sets intersect + if (first.start - second.start) % gcd: + return self._simple_new(None) + + # calculate parameters for the RangeIndex describing the + # intersection disregarding the lower bounds + tmp_start = first.start + (second.start - first.start) * first.step // gcd * s + new_step = first.step * second.step // gcd + new_range = range(tmp_start, int_high, new_step) + new_index = self._simple_new(new_range) + + # adjust index to limiting interval + new_start = new_index._min_fitting_element(int_low) + new_range = range(new_start, new_index.stop, new_index.step) + new_index = self._simple_new(new_range) + + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): + new_index = new_index[::-1] + if sort is None: + new_index = new_index.sort_values() + return new_index + + def _min_fitting_element(self, lower_limit): + """Returns the smallest element greater than or equal to the limit""" + no_steps = -(-(lower_limit - self.start) // abs(self.step)) + return self.start + abs(self.step) * no_steps + + def _max_fitting_element(self, upper_limit): + """Returns the largest element smaller than or equal to the limit""" + no_steps = (upper_limit - self.start) // abs(self.step) + return self.start + abs(self.step) * no_steps + + def _extended_gcd(self, a, b): + """ + Extended Euclidean algorithms to 
solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t + + def _union(self, other, sort): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + sort : False or None, default None + Whether to sort resulting index. ``sort=None`` returns a + monotonically increasing ``RangeIndex`` if possible or a sorted + ``Int64Index`` if not. ``sort=False`` always returns an + unsorted ``Int64Index`` + + .. versionadded:: 0.25.0 + + Returns + ------- + union : Index + """ + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + if isinstance(other, RangeIndex) and sort is None: + start_s, step_s = self.start, self.step + end_s = self.start + self.step * (len(self) - 1) + start_o, step_o = other.start, other.step + end_o = other.start + other.step * (len(other) - 1) + if self.step < 0: + start_s, step_s, end_s = end_s, -step_s, start_s + if other.step < 0: + start_o, step_o, end_o = end_o, -step_o, start_o + if len(self) == 1 and len(other) == 1: + step_s = step_o = abs(self.start - other.start) + elif len(self) == 1: + step_s = step_o + elif len(other) == 1: + step_o = step_s + start_r = min(start_s, start_o) + end_r = max(end_s, end_o) + if step_o == step_s: + if ( + (start_s - start_o) % step_s == 0 + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): + return type(self)(start_r, end_r + step_s, step_s) + if ( + (step_s % 2 == 0) + and (abs(start_s - start_o) <= step_s / 2) + and (abs(end_s - end_o) <= step_s / 2) + ): + return type(self)(start_r, end_r + step_s / 2, step_s / 2) + elif step_o % step_s == 0: + if ( + (start_o - start_s) % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): + return type(self)(start_r, end_r + step_s, step_s) + elif step_s % step_o == 0: + if ( + (start_s - start_o) % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): + return type(self)(start_r, end_r + step_o, step_o) + return self._int64index._union(other, sort=sort) + + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + if how == "outer" and self is not other: + # note: could return RangeIndex in more circumstances + return self._int64index.join(other, how, level, return_indexers, sort) + + return super().join(other, how, level, return_indexers, sort) + + def _concat_same_dtype(self, indexes, name): + """ + Concatenates multiple RangeIndex instances. All members of "indexes" must + be of type RangeIndex; result will be RangeIndex if possible, Int64Index + otherwise. 
E.g.: + indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) + indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5]) + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + for obj in non_empty_indexes: + rng: range = obj._range + + if start is None: + # This is set by the first non-empty index + start = rng.start + if step is None and len(rng) > 1: + step = rng.step + elif step is None: + # First non-empty index had only one element + if rng.start == start: + result = Int64Index(np.concatenate([x._values for x in indexes])) + return result.rename(name) + + step = rng.start - start + + non_consecutive = (step != rng.step and len(rng) > 1) or ( + next_ is not None and rng.start != next_ + ) + if non_consecutive: + result = Int64Index(np.concatenate([x._values for x in indexes])) + return result.rename(name) + + if step is not None: + next_ = rng[-1] + step + + if non_empty_indexes: + # Get the stop value from "next" or alternatively + # from the last non-empty index + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step).rename(name) + + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0).rename(name) + + def __len__(self) -> int: + """ + return the length of the RangeIndex + """ + return len(self._range) + + @property + def size(self) -> int: + return len(self) + + def __getitem__(self, key): + """ + Conserve RangeIndex type for scalar and slice keys. + """ + if isinstance(key, slice): + new_range = self._range[key] + return self._simple_new(new_range, name=self.name) + elif is_integer(key): + new_key = int(key) + try: + return self._range[new_key] + except IndexError: + raise IndexError( + f"index {key} is out of bounds for axis 0 with size {len(self)}" + ) + elif is_scalar(key): + raise IndexError( + "only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices" + ) + # fall back to Int64Index + return super().__getitem__(key) + + @unpack_zerodim_and_defer("__floordiv__") + def __floordiv__(self, other): + + if is_integer(other) and other != 0: + if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + start = self.start // other + step = self.step // other + stop = start + len(self) * step + new_range = range(start, stop, step or 1) + return self._simple_new(new_range, name=self.name) + if len(self) == 1: + start = self.start // other + new_range = range(start, start + 1, 1) + return self._simple_new(new_range, name=self.name) + return self._int64index // other + + def all(self) -> bool: + return 0 not in self._range + + def any(self) -> bool: + return any(self._range) + + @classmethod + def _add_numeric_methods_binary(cls): + """ add in numeric methods, specialized to RangeIndex """ + + def _make_evaluate_binop(op, step=False): + """ + Parameters + ---------- + op : callable that accepts 2 parms + perform the binary op + step : callable, optional, default to False + op to apply to the step parm if not None + if False, use the existing step + """ + + @unpack_zerodim_and_defer(op.__name__) + def _evaluate_numeric_binop(self, other): + if isinstance(other, ABCTimedeltaIndex): + # Defer to TimedeltaIndex implementation + return NotImplemented + elif isinstance(other, (timedelta, np.timedelta64)): + # GH#19333 is_integer evaluated True on timedelta64, + # so we need to catch these 
explicitly + return op(self._int64index, other) + elif is_timedelta64_dtype(other): + # Must be an np.ndarray; GH#22390 + return op(self._int64index, other) + + other = extract_array(other, extract_numpy=True) + attrs = self._get_attributes_dict() + + left, right = self, other + + try: + # apply if we have an override + if step: + with np.errstate(all="ignore"): + rstep = step(left.step, right) + + # we don't have a representable op + # so return a base index + if not is_integer(rstep) or not rstep: + raise ValueError + + else: + rstep = left.step + + with np.errstate(all="ignore"): + rstart = op(left.start, right) + rstop = op(left.stop, right) + + result = type(self)(rstart, rstop, rstep, **attrs) + + # for compat with numpy / Int64Index + # even if we can represent as a RangeIndex, return + # as a Float64Index if we have float-like descriptors + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") + + return result + + except (ValueError, TypeError, ZeroDivisionError): + # Defer to Int64Index implementation + return op(self._int64index, other) + # TODO: Do attrs get handled reliably? + + name = f"__{op.__name__}__" + return compat.set_function_name(_evaluate_numeric_binop, name, cls) + + cls.__add__ = _make_evaluate_binop(operator.add) + cls.__radd__ = _make_evaluate_binop(ops.radd) + cls.__sub__ = _make_evaluate_binop(operator.sub) + cls.__rsub__ = _make_evaluate_binop(ops.rsub) + cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) + cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) + cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv) + + +RangeIndex._add_numeric_methods() diff --git a/venv/Lib/site-packages/pandas/core/indexes/timedeltas.py b/venv/Lib/site-packages/pandas/core/indexes/timedeltas.py new file mode 100644 index 0000000..c78020f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexes/timedeltas.py @@ -0,0 +1,509 @@ +""" implement the TimedeltaIndex """ +from datetime import datetime + +import numpy as np + +from pandas._libs import NaT, Timedelta, index as libindex +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.common import ( + _TD_DTYPE, + is_float, + is_integer, + is_list_like, + is_scalar, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna + +from pandas.core.accessor import delegate_names +from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays.timedeltas import TimedeltaArray, _is_convertible_to_td +from pandas.core.base import _shared_docs +import pandas.core.common as com +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.datetimelike import ( + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, + DatetimeTimedeltaMixin, +) +from pandas.core.indexes.extension import inherit_names + +from pandas.tseries.frequencies import to_offset + + +class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): + # Most attrs are dispatched via datetimelike_{ops,methods} + # Some are "raw" methods, the result is not re-boxed in an Index + # We also have a few "extra" attrs, which may or may not be raw, + # which we don't want to expose in the .dt accessor. 
+ _raw_properties = {"components", "_box_func"} + _raw_methods = {"to_pytimedelta", "sum", "std", "median", "_format_native_types"} + + _delegated_properties = TimedeltaArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = TimedeltaArray._datetimelike_methods + list(_raw_methods) + + +@inherit_names( + ["_box_values", "__neg__", "__pos__", "__abs__"], TimedeltaArray, wrap=True +) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + "_other_ops", + ], + TimedeltaArray, +) +@delegate_names( + TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + TimedeltaArray, + TimedeltaDelegateMixin._delegated_methods, + typ="method", + overwrite=True, +) +class TimedeltaIndex( + DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin, +): + """ + Immutable ndarray of timedelta64 data, represented internally as int64, and + which can be boxed to timedelta objects. + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional timedelta-like data to construct index with. + unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional + Which is an integer/float number. + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation. + copy : bool + Make a copy of input ndarray. + name : object + Name to be stored in the index. + + Attributes + ---------- + days + seconds + microseconds + nanoseconds + components + inferred_freq + + Methods + ------- + to_pytimedelta + to_series + round + floor + ceil + to_frame + mean + + See Also + -------- + Index : The base pandas Index type. + Timedelta : Represents a duration between two dates or times. + DatetimeIndex : Index of datetime64 data. + PeriodIndex : Index of Period data. + timedelta_range : Create a fixed-frequency TimedeltaIndex. + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + """ + + _typ = "timedeltaindex" + + _engine_type = libindex.TimedeltaEngine + + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] + _is_numeric_dtype = True + _infer_as_myclass = True + + # ------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + unit=None, + freq=None, + closed=None, + dtype=_TD_DTYPE, + copy=False, + name=None, + ): + name = maybe_extract_name(name, data, cls) + + if is_scalar(data): + raise TypeError( + f"{cls.__name__}() must be called with a " + f"collection of some kind, {repr(data)} was passed" + ) + + if unit in {"Y", "y", "M"}: + raise ValueError( + "Units 'M' and 'Y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." 
+ ) + + if isinstance(data, TimedeltaArray): + if copy: + data = data.copy() + return cls._simple_new(data, name=name, freq=freq) + + if isinstance(data, TimedeltaIndex) and freq is None and name is None: + if copy: + return data.copy() + else: + return data._shallow_copy() + + # - Cases checked above all return/raise before reaching here - # + + tdarr = TimedeltaArray._from_sequence( + data, freq=freq, unit=unit, dtype=dtype, copy=copy + ) + return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name) + + @classmethod + def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): + # `dtype` is passed by _shallow_copy in corner cases, should always + # be timedelta64[ns] if present + if not isinstance(values, TimedeltaArray): + values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) + else: + if freq is None: + freq = values.freq + assert isinstance(values, TimedeltaArray), type(values) + assert dtype == _TD_DTYPE, dtype + assert values.dtype == "m8[ns]", values.dtype + + tdarr = TimedeltaArray._simple_new(values._data, freq=freq) + result = object.__new__(cls) + result._data = tdarr + result._name = name + # For groupby perf. See note in indexes/base about _index_data + result._index_data = tdarr._data + + result._reset_identity() + return result + + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + + return _get_format_timedelta64(self, box=True) + + # ------------------------------------------------------------------- + + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): + # Have to repeat the check for 'timedelta64' (not ns) dtype + # so that we can return a numeric index, since pandas will return + # a TimedeltaIndex when dtype='timedelta' + result = self._data.astype(dtype, copy=copy) + if self.hasnans: + return Index(result, name=self.name) + return Index(result.astype("i8"), name=self.name) + return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) + + def _maybe_promote(self, other): + if other.inferred_type == "timedelta": + other = TimedeltaIndex(other) + return self, other + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. 
Only use this if you + know what you're doing + """ + + if _is_convertible_to_td(key): + key = Timedelta(key) + return self.get_value_maybe_box(series, key) + + try: + value = Index.get_value(self, series, key) + except KeyError: + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError, KeyError): + pass + + try: + return self.get_value_maybe_box(series, key) + except (TypeError, ValueError, KeyError): + raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) + + def get_value_maybe_box(self, series, key: Timedelta): + values = self._engine.get_value(com.values_from_object(series), key) + return com.maybe_box(self, values, series, key) + + def get_loc(self, key, method=None, tolerance=None): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + if is_list_like(key) or (isinstance(key, datetime) and key is not NaT): + # GH#20464 datetime check here is to ensure we don't allow + # datetime objects to be incorrectly treated as timedelta + # objects; NaT is a special case because it plays a double role + # as Not-A-Timedelta + raise TypeError + + if isna(key): + key = NaT + + if tolerance is not None: + # try converting tolerance now, so errors don't get swallowed by + # the try/except clauses below + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) + + if _is_convertible_to_td(key) or key is NaT: + key = Timedelta(key) + return Index.get_loc(self, key, method, tolerance) + + try: + return Index.get_loc(self, key, method, tolerance) + except (KeyError, ValueError, TypeError): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError): + pass + + try: + stamp = Timedelta(key) + return Index.get_loc(self, stamp, method, tolerance) + except (KeyError, ValueError): + raise KeyError(key) + + def _maybe_cast_slice_bound(self, label, side, kind): + """ + If label is a string, cast it to timedelta according to resolution. 
+ + Parameters + ---------- + label : object + side : {'left', 'right'} + kind : {'ix', 'loc', 'getitem'} + + Returns + ------- + label : object + """ + assert kind in ["ix", "loc", "getitem", None] + + if isinstance(label, str): + parsed = Timedelta(label) + lbound = parsed.round(parsed.resolution_string) + if side == "left": + return lbound + else: + return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") + elif is_integer(label) or is_float(label): + self._invalid_indexer("slice", label) + + return label + + def _get_string_slice(self, key): + if is_integer(key) or is_float(key) or key is NaT: + self._invalid_indexer("slice", key) + loc = self._partial_td_slice(key) + return loc + + def _partial_td_slice(self, key): + + # given a key, try to figure out a location for a partial slice + if not isinstance(key, str): + return key + + raise NotImplementedError + + @Substitution(klass="TimedeltaIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, (np.ndarray, Index)): + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self._data)(value) + self._data._check_compatible_with(value) + + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) + + elif not isinstance(value, TimedeltaArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + + return self._data.searchsorted(value, side=side, sorter=sorter) + + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "timedelta" + + @property + def inferred_type(self) -> str: + return "timedelta64" + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + If not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. + + Returns + ------- + new_index : Index + """ + # try to convert if possible + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): + # GH 18295 + item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. 
datetime64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) + + # check freq can be preserved on edge cases + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = item.asm8 + + try: + new_i8s = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) + return self._shallow_copy(new_i8s, freq=freq) + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item, str): + return self.astype(object).insert(loc, item) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + +TimedeltaIndex._add_logical_methods_disabled() + + +def timedelta_range( + start=None, end=None, periods=None, freq=None, name=None, closed=None +) -> TimedeltaIndex: + """ + Return a fixed frequency TimedeltaIndex, with day as the default + frequency. + + Parameters + ---------- + start : str or timedelta-like, default None + Left bound for generating timedeltas. + end : str or timedelta-like, default None + Right bound for generating timedeltas. + periods : int, default None + Number of periods to generate. + freq : str or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5H'. + name : str, default None + Name of the resulting TimedeltaIndex. + closed : str, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None). + + Returns + ------- + rng : TimedeltaIndex + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + + >>> pd.timedelta_range(start='1 day', periods=4) + TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``closed`` parameter specifies which endpoint is included. The default + behavior is to include both endpoints. + + >>> pd.timedelta_range(start='1 day', periods=4, closed='right') + TimedeltaIndex(['2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``freq`` parameter specifies the frequency of the TimedeltaIndex. + Only fixed frequencies can be passed, non-fixed frequencies such as + 'M' (month end) will raise. + + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H') + TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', + '1 days 18:00:00', '2 days 00:00:00'], + dtype='timedelta64[ns]', freq='6H') + + Specify ``start``, ``end``, and ``periods``; the frequency is generated + automatically (linearly spaced). 
+ + >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) + TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', + '5 days 00:00:00'], + dtype='timedelta64[ns]', freq=None) + """ + if freq is None and com.any_none(periods, start, end): + freq = "D" + + freq, freq_infer = dtl.maybe_infer_freq(freq) + tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed) + return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name) diff --git a/venv/Lib/site-packages/pandas/core/indexing.py b/venv/Lib/site-packages/pandas/core/indexing.py new file mode 100644 index 0000000..5624bb5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/indexing.py @@ -0,0 +1,2485 @@ +from typing import Hashable, List, Tuple, Union + +import numpy as np + +from pandas._libs.indexing import _NDFrameIndexerBase +from pandas._libs.lib import item_from_zerodim +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ( + is_float, + is_integer, + is_iterator, + is_list_like, + is_numeric_dtype, + is_scalar, + is_sequence, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries +from pandas.core.dtypes.missing import _infer_fill_value, isna + +import pandas.core.common as com +from pandas.core.indexers import ( + check_array_indexer, + is_list_like_indexer, + length_of_indexer, +) +from pandas.core.indexes.api import Index, InvalidIndexError + +# "null slice" +_NS = slice(None, None) + + +# the public IndexSlicerMaker +class _IndexSlice: + """ + Create an object to more easily perform multi-index slicing. + + See Also + -------- + MultiIndex.remove_unused_levels : New MultiIndex with no unused levels. + + Notes + ----- + See :ref:`Defined Levels ` + for further info on slicing a MultiIndex. + + Examples + -------- + + >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) + >>> columns = ['foo', 'bar'] + >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), + index=midx, columns=columns) + + Using the default slice command: + + >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + + Using the IndexSlice class for a more intuitive command: + + >>> idx = pd.IndexSlice + >>> dfmi.loc[idx[:, 'B0':'B1'], :] + foo bar + A0 B0 0 1 + B1 2 3 + A1 B0 8 9 + B1 10 11 + """ + + def __getitem__(self, arg): + return arg + + +IndexSlice = _IndexSlice() + + +class IndexingError(Exception): + pass + + +class IndexingMixin: + """Mixin for adding .loc/.iloc/.at/.iat to Datafames and Series. + """ + + @property + def iloc(self) -> "_iLocIndexer": + """ + Purely integer-location based indexing for selection by position. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on some value. 
+ + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. + + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. + + Examples + -------- + + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. + + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> "_LocIndexer": + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label ` + + Raises + ------ + KeyError + If any items are not found. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. 
+ + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. + + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. 
+ + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> "_AtIndexer": + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> "_iAtIndexer": + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + +class _NDFrameIndexer(_NDFrameIndexerBase): + _valid_types: str + axis = None + + def __call__(self, axis=None): + # we need to return a copy of ourselves + new_self = type(self)(self.name, self.obj) + + if axis is not None: + axis = self.obj._get_axis_number(axis) + new_self.axis = axis + return new_self + + # TODO: remove once geopandas no longer needs this + def __getitem__(self, key): + # Used in ix and downstream in geopandas _CoordinateIndexer + if type(key) is tuple: + # Note: we check the type exactly instead of with isinstance + # because NamedTuple is checked separately. 
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key) + try: + values = self.obj._get_value(*key) + except (KeyError, TypeError, InvalidIndexError, AttributeError): + # TypeError occurs here if the key has non-hashable entries, + # generally slice or list. + # TODO(ix): most/all of the TypeError cases here are for ix, + # so this check can be removed once ix is removed. + # The InvalidIndexError is only catched for compatibility + # with geopandas, see + # https://github.com/pandas-dev/pandas/issues/27258 + # TODO: The AttributeError is for IntervalIndex which + # incorrectly implements get_value, see + # https://github.com/pandas-dev/pandas/issues/27865 + pass + else: + if is_scalar(values): + return values + + return self._getitem_tuple(key) + else: + # we by definition only have the 0th axis + axis = self.axis or 0 + + key = com.apply_if_callable(key, self.obj) + return self._getitem_axis(key, axis=axis) + + def _get_label(self, label, axis: int): + if self.ndim == 1: + # for perf reasons we want to try _xs first + # as its basically direct indexing + # but will fail when the index is not present + # see GH5667 + return self.obj._xs(label, axis=axis) + elif isinstance(label, tuple) and isinstance(label[axis], slice): + raise IndexingError("no slices here, handle elsewhere") + + return self.obj._xs(label, axis=axis) + + def _get_loc(self, key: int, axis: int): + return self.obj._ixs(key, axis=axis) + + def _slice(self, obj, axis: int, kind=None): + return self.obj._slice(obj, axis=axis, kind=kind) + + def _get_setitem_indexer(self, key): + if self.axis is not None: + return self._convert_tuple(key) + + ax = self.obj._get_axis(0) + + if isinstance(ax, ABCMultiIndex) and self.name != "iloc": + try: + return ax.get_loc(key) + except (TypeError, KeyError, InvalidIndexError): + # TypeError e.g. passed a bool + pass + + if isinstance(key, tuple): + try: + return self._convert_tuple(key) + except IndexingError: + pass + + if isinstance(key, range): + return list(key) + + axis = self.axis or 0 + try: + return self._convert_to_indexer(key, axis=axis) + except TypeError as e: + + # invalid indexer type vs 'other' indexing errors + if "cannot do" in str(e): + raise + raise IndexingError(key) + + def __setitem__(self, key, value): + if isinstance(key, tuple): + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + key = com.apply_if_callable(key, self.obj) + indexer = self._get_setitem_indexer(key) + self._setitem_with_indexer(indexer, value) + + def _validate_key(self, key, axis: int): + """ + Ensure that key is valid for current indexer. + + Parameters + ---------- + key : scalar, slice or list-like + Key requested. + axis : int + Dimension on which the indexing is being made. + + Raises + ------ + TypeError + If the key (or some element of it) has wrong type. + IndexError + If the key (or some element of it) is out of bounds. + KeyError + If the key was not found. + """ + raise AbstractMethodError(self) + + def _has_valid_tuple(self, key: Tuple): + """ + Check the key for valid keys across my indexer. 
+ """ + for i, k in enumerate(key): + if i >= self.ndim: + raise IndexingError("Too many indexers") + try: + self._validate_key(k, i) + except ValueError: + raise ValueError( + "Location based indexing can only have " + f"[{self._valid_types}] types" + ) + + def _is_nested_tuple_indexer(self, tup: Tuple) -> bool: + """ + Returns + ------- + bool + """ + if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): + return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) + return False + + def _convert_tuple(self, key): + keyidx = [] + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + for i in range(self.ndim): + if i == axis: + keyidx.append(self._convert_to_indexer(key, axis=axis)) + else: + keyidx.append(slice(None)) + else: + for i, k in enumerate(key): + if i >= self.ndim: + raise IndexingError("Too many indexers") + idx = self._convert_to_indexer(k, axis=i) + keyidx.append(idx) + return tuple(keyidx) + + def _convert_scalar_indexer(self, key, axis: int): + # if we are accessing via lowered dim, use the last dim + ax = self.obj._get_axis(min(axis, self.ndim - 1)) + # a scalar + return ax._convert_scalar_indexer(key, kind=self.name) + + def _convert_slice_indexer(self, key: slice, axis: int): + # if we are accessing via lowered dim, use the last dim + ax = self.obj._get_axis(min(axis, self.ndim - 1)) + return ax._convert_slice_indexer(key, kind=self.name) + + def _has_valid_setitem_indexer(self, indexer) -> bool: + return True + + def _has_valid_positional_setitem_indexer(self, indexer) -> bool: + """ + Validate that a positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally. + + Returns + ------- + bool + """ + if isinstance(indexer, dict): + raise IndexError(f"{self.name} cannot enlarge its target object") + else: + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like_indexer(i): + # should check the elements? 
+ pass + elif is_integer(i): + if i >= len(ax): + raise IndexError( + f"{self.name} cannot enlarge its target object" + ) + elif isinstance(i, dict): + raise IndexError(f"{self.name} cannot enlarge its target object") + + return True + + def _setitem_with_indexer(self, indexer, value): + self._has_valid_setitem_indexer(indexer) + + # also has the side effect of consolidating in-place + from pandas import Series + + info_axis = self.obj._info_axis_number + + # maybe partial set + take_split_path = self.obj._is_mixed_type + + # if there is only one block/type, still have to take split path + # unless the block is one-dimensional or it can hold the value + if not take_split_path and self.obj._data.blocks: + (blk,) = self.obj._data.blocks + if 1 < blk.ndim: # in case of dict, keys are indices + val = list(value.values()) if isinstance(value, dict) else value + take_split_path = not blk._can_hold_element(val) + + # if we have any multi-indexes that have non-trivial slices + # (not null slices) then we must take the split path, xref + # GH 10360, GH 27841 + if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): + for i, ax in zip(indexer, self.obj.axes): + if isinstance(ax, ABCMultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): + take_split_path = True + break + + if isinstance(indexer, tuple): + nindexer = [] + for i, idx in enumerate(indexer): + if isinstance(idx, dict): + + # reindex the axis to the new value + # and set inplace + key, _ = convert_missing_indexer(idx) + + # if this is the items axes, then take the main missing + # path first + # this correctly sets the dtype and avoids cache issues + # essentially this separates out the block that is needed + # to possibly be modified + if self.ndim > 1 and i == self.obj._info_axis_number: + + # add the new item, and set the value + # must have all defined axes if we have a scalar + # or a list-like on the non-info axes if we have a + # list-like + len_non_info_axes = ( + len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i + ) + if any(not l for l in len_non_info_axes): + if not is_list_like_indexer(value): + raise ValueError( + "cannot set a frame with no " + "defined index and a scalar" + ) + self.obj[key] = value + return self.obj + + # add a new item with the dtype setup + self.obj[key] = _infer_fill_value(value) + + new_indexer = convert_from_missing_indexer_tuple( + indexer, self.obj.axes + ) + self._setitem_with_indexer(new_indexer, value) + + return self.obj + + # reindex the axis + # make sure to clear the cache because we are + # just replacing the block manager here + # so the object is the same + index = self.obj._get_axis(i) + labels = index.insert(len(index), key) + self.obj._data = self.obj.reindex(labels, axis=i)._data + self.obj._maybe_update_cacher(clear=True) + self.obj._is_copy = None + + nindexer.append(labels.get_loc(key)) + + else: + nindexer.append(idx) + + indexer = tuple(nindexer) + else: + + indexer, missing = convert_missing_indexer(indexer) + + if missing: + return self._setitem_with_indexer_missing(indexer, value) + + # set + item_labels = self.obj._get_axis(info_axis) + + # align and set the values + if take_split_path: + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 + assert info_axis == 1 + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + + if isinstance(value, ABCSeries): + value = self._align_series(indexer, value) + + info_idx = indexer[info_axis] + if is_integer(info_idx): + info_idx = [info_idx] + 
labels = item_labels[info_idx] + + # if we have a partial multiindex, then need to adjust the plane + # indexer here + if len(labels) == 1 and isinstance( + self.obj[labels[0]].axes[0], ABCMultiIndex + ): + item = labels[0] + obj = self.obj[item] + index = obj.index + idx = indexer[:info_axis][0] + + plane_indexer = tuple([idx]) + indexer[info_axis + 1 :] + lplane_indexer = length_of_indexer(plane_indexer[0], index) + + # require that we are setting the right number of values that + # we are indexing + if ( + is_list_like_indexer(value) + and np.iterable(value) + and lplane_indexer != len(value) + ): + + if len(obj[idx]) != len(value): + raise ValueError( + "cannot set using a multi-index " + "selection indexer with a different " + "length than the value" + ) + + # make sure we have an ndarray + value = getattr(value, "values", value).ravel() + + # we can directly set the series here + # as we select a slice indexer on the mi + idx = index._convert_slice_indexer(idx) + obj._consolidate_inplace() + obj = obj.copy() + obj._data = obj._data.setitem(indexer=tuple([idx]), value=value) + self.obj[item] = obj + return + + # non-mi + else: + plane_indexer = indexer[:info_axis] + indexer[info_axis + 1 :] + plane_axis = self.obj.axes[:info_axis][0] + lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis) + + def setter(item, v): + s = self.obj[item] + pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH6149 (null slice), GH10408 (full bounds) + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): + s = v + else: + # set the item, possibly having a dtype change + s._consolidate_inplace() + s = s.copy() + s._data = s._data.setitem(indexer=pi, value=v) + s._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj[item] = s + + # we need an iterable, with a ndim of at least 1 + # eg. don't pass through np.array(0) + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: + + # we have an equal len Frame + if isinstance(value, ABCDataFrame): + sub_indexer = list(indexer) + multiindex_indexer = isinstance(labels, ABCMultiIndex) + + for item in labels: + if item in value: + sub_indexer[info_axis] = item + v = self._align_series( + tuple(sub_indexer), value[item], multiindex_indexer + ) + else: + v = np.nan + + setter(item, v) + + # we have an equal len ndarray/convertible to our labels + # hasattr first, to avoid coercing to ndarray without reason. + # But we may be relying on the ndarray coercion to check ndim. + # Why not just convert to an ndarray earlier on if needed? 
+ elif np.ndim(value) == 2: + + # note that this coerces the dtype if we are mixed + # GH 7551 + value = np.array(value, dtype=object) + if len(labels) != value.shape[1]: + raise ValueError( + "Must have equal len keys and value " + "when setting with an ndarray" + ) + + for i, item in enumerate(labels): + + # setting with a list, recoerces + setter(item, value[:, i].tolist()) + + # we have an equal len list/ndarray + elif _can_do_equal_len( + labels, value, plane_indexer, lplane_indexer, self.obj + ): + setter(labels[0], value) + + # per label values + else: + + if len(labels) != len(value): + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + for item, v in zip(labels, value): + setter(item, v) + else: + + # scalar + for item in labels: + setter(item, value) + + else: + if isinstance(indexer, tuple): + indexer = maybe_convert_ix(*indexer) + + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): + self.obj[item_labels[indexer[info_axis]]] = value + return + + if isinstance(value, (ABCSeries, dict)): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. + value = self._align_series(indexer, Series(value)) + + elif isinstance(value, ABCDataFrame): + value = self._align_frame(indexer, value) + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._consolidate_inplace() + self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True) + + def _setitem_with_indexer_missing(self, indexer, value): + """ + Insert new row(s) or column(s) into the Series or DataFrame. + """ + from pandas import Series + + # reindex the axis to the new value + # and set inplace + if self.ndim == 1: + index = self.obj.index + new_index = index.insert(len(index), indexer) + + # we have a coerced indexer, e.g. a float + # that matches in an Int64Index, so + # we will not create a duplicate index, rather + # index to that element + # e.g. 
0.0 -> 0 + # GH#12246 + if index.is_unique: + new_indexer = index.get_indexer([new_index[-1]]) + if (new_indexer != -1).any(): + return self._setitem_with_indexer(new_indexer, value) + + # this preserves dtype of the value + new_values = Series([value])._values + if len(self.obj._values): + # GH#22717 handle casting compatibility that np.concatenate + # does incorrectly + new_values = concat_compat([self.obj._values, new_values]) + self.obj._data = self.obj._constructor( + new_values, index=new_index, name=self.obj.name + )._data + self.obj._maybe_update_cacher(clear=True) + return self.obj + + elif self.ndim == 2: + + if not len(self.obj.columns): + # no columns and scalar + raise ValueError("cannot set a frame with no defined columns") + + if isinstance(value, ABCSeries): + # append a Series + value = value.reindex(index=self.obj.columns, copy=True) + value.name = indexer + + else: + # a list-list + if is_list_like_indexer(value): + # must have conforming columns + if len(value) != len(self.obj.columns): + raise ValueError("cannot set a row with mismatched columns") + + value = Series(value, index=self.obj.columns, name=indexer) + + self.obj._data = self.obj.append(value)._data + self.obj._maybe_update_cacher(clear=True) + return self.obj + + def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = False): + """ + Parameters + ---------- + indexer : tuple, slice, scalar + Indexer used to get the locations that will be set to `ser`. + ser : pd.Series + Values to assign to the locations specified by `indexer`. + multiindex_indexer : boolean, optional + Defaults to False. Should be set to True if `indexer` was from + a `pd.MultiIndex`, to avoid unnecessary broadcasting. + + Returns + ------- + `np.array` of `ser` broadcast to the appropriate shape for assignment + to the locations selected by `indexer` + """ + if isinstance(indexer, (slice, np.ndarray, list, Index)): + indexer = tuple([indexer]) + + if isinstance(indexer, tuple): + + # flatten np.ndarray indexers + def ravel(i): + return i.ravel() if isinstance(i, np.ndarray) else i + + indexer = tuple(map(ravel, indexer)) + + aligners = [not com.is_null_slice(idx) for idx in indexer] + sum_aligners = sum(aligners) + single_aligner = sum_aligners == 1 + is_frame = self.ndim == 2 + obj = self.obj + + # are we a single alignable value on a non-primary + # dim (e.g. panel: 1,2, or frame: 0) ? 
+ # hence need to align to a single axis dimension + # rather that find all valid dims + + # frame + if is_frame: + single_aligner = single_aligner and aligners[0] + + # we have a frame, with multiple indexers on both axes; and a + # series, so need to broadcast (see GH5206) + if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): + ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values + + # single indexer + if len(indexer) > 1 and not multiindex_indexer: + len_indexer = len(indexer[1]) + ser = np.tile(ser, len_indexer).reshape(len_indexer, -1).T + + return ser + + for i, idx in enumerate(indexer): + ax = obj.axes[i] + + # multiple aligners (or null slices) + if is_sequence(idx) or isinstance(idx, slice): + if single_aligner and com.is_null_slice(idx): + continue + new_ix = ax[idx] + if not is_list_like_indexer(new_ix): + new_ix = Index([new_ix]) + else: + new_ix = Index(new_ix) + if ser.index.equals(new_ix) or not len(new_ix): + return ser._values.copy() + + return ser.reindex(new_ix)._values + + # 2 dims + elif single_aligner: + + # reindex along index + ax = self.obj.axes[1] + if ser.index.equals(ax) or not len(ax): + return ser._values.copy() + return ser.reindex(ax)._values + + elif is_scalar(indexer): + ax = self.obj._get_axis(1) + + if ser.index.equals(ax): + return ser._values.copy() + + return ser.reindex(ax)._values + + raise ValueError("Incompatible indexer with Series") + + def _align_frame(self, indexer, df: ABCDataFrame): + is_frame = self.ndim == 2 + + if isinstance(indexer, tuple): + + idx, cols = None, None + sindexers = [] + for i, ix in enumerate(indexer): + ax = self.obj.axes[i] + if is_sequence(ix) or isinstance(ix, slice): + if isinstance(ix, np.ndarray): + ix = ix.ravel() + if idx is None: + idx = ax[ix] + elif cols is None: + cols = ax[ix] + else: + break + else: + sindexers.append(i) + + if idx is not None and cols is not None: + + if df.index.equals(idx) and df.columns.equals(cols): + val = df.copy()._values + else: + val = df.reindex(idx, columns=cols)._values + return val + + elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: + ax = self.obj.index[indexer] + if df.index.equals(ax): + val = df.copy()._values + else: + + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if ( + isinstance(ax, ABCMultiIndex) + and isinstance(df.index, ABCMultiIndex) + and ax.nlevels != df.index.nlevels + ): + raise TypeError( + "cannot align on a multi-index with out " + "specifying the join levels" + ) + + val = df.reindex(index=ax)._values + return val + + raise ValueError("Incompatible indexer with DataFrame") + + def _getitem_tuple(self, tup: Tuple): + try: + return self._getitem_lowerdim(tup) + except IndexingError: + pass + + # no multi-index, so validate all of the indexers + self._has_valid_tuple(tup) + + # ugly hack for GH #836 + if self._multi_take_opportunity(tup): + return self._multi_take(tup) + + # no shortcut needed + retval = self.obj + for i, key in enumerate(tup): + if com.is_null_slice(key): + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=i) + + return retval + + def _multi_take_opportunity(self, tup: Tuple) -> bool: + """ + Check whether there is the possibility to use ``_multi_take``. + + Currently the limit is that all axes being indexed, must be indexed with + list-likes. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + bool + Whether the current indexing, + can be passed through `_multi_take`. 
+ """ + if not all(is_list_like_indexer(x) for x in tup): + return False + + # just too complicated + if any(com.is_bool_indexer(x) for x in tup): + return False + + return True + + def _multi_take(self, tup: Tuple): + """ + Create the indexers for the passed tuple of keys, and + executes the take operation. This allows the take operation to be + executed all at once, rather than once for each dimension. + Improving efficiency. + + Parameters + ---------- + tup : tuple + Tuple of indexers, one per axis. + + Returns + ------- + values: same type as the object being indexed + """ + # GH 836 + o = self.obj + d = { + axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, o._AXIS_ORDERS) + } + return o._reindex_with_indexers(d, copy=True, allow_dups=True) + + def _convert_for_reindex(self, key, axis: int): + return key + + def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): + # we have an axis0 multi-index, handle or raise + axis = self.axis or 0 + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=axis) + except TypeError: + # slices are unhashable + pass + except KeyError as ek: + # raise KeyError if number of indexers match + # else IndexingError will be raised + if len(tup) <= self.obj.index.nlevels and len(tup) > self.ndim: + raise ek + + return None + + def _getitem_lowerdim(self, tup: Tuple): + + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis) + + # we may have a nested tuples indexer here + if self._is_nested_tuple_indexer(tup): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here + ax0 = self.obj._get_axis(0) + # ...but iloc should handle the tuple as simple integer-location + # instead of checking it as multiindex representation (GH 13797) + if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + if len(tup) > self.ndim: + raise IndexingError("Too many indexers. handle elsewhere") + + for i, key in enumerate(tup): + if is_label_like(key) or isinstance(key, tuple): + section = self._getitem_axis(key, axis=i) + + # we have yielded a scalar ? + if not is_list_like_indexer(section): + return section + + elif section.ndim == self.ndim: + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS + new_key = tup[:i] + (_NS,) + tup[i + 1 :] + + else: + new_key = tup[:i] + tup[i + 1 :] + + # unfortunately need an odious kludge here because of + # DataFrame transposing convention + if ( + isinstance(section, ABCDataFrame) + and i > 0 + and len(new_key) == 2 + ): + a, b = new_key + new_key = b, a + + if len(new_key) == 1: + new_key = new_key[0] + + # Slices should return views, but calling iloc/loc with a null + # slice returns a new object. 
+ if com.is_null_slice(new_key): + return section + # This is an elided recursive call to iloc/loc/etc' + return getattr(section, self.name)[new_key] + + raise IndexingError("not applicable") + + def _getitem_nested_tuple(self, tup: Tuple): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionality here + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + # this is a series with a multi-index specified a tuple of + # selectors + axis = self.axis or 0 + return self._getitem_axis(tup, axis=axis) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + axis = 0 + for i, key in enumerate(tup): + + if com.is_null_slice(key): + axis += 1 + continue + + current_ndim = obj.ndim + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) + axis += 1 + + # if we have a scalar, we are done + if is_scalar(obj) or not hasattr(obj, "ndim"): + break + + # has the dim of the obj changed? + # GH 7199 + if obj.ndim < current_ndim: + axis -= 1 + + return obj + + # TODO: remove once geopandas no longer needs __getitem__ + def _getitem_axis(self, key, axis: int): + if is_iterator(key): + key = list(key) + self._validate_key(key, axis) + + labels = self.obj._get_axis(axis) + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + elif is_list_like_indexer(key) and not ( + isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) + ): + + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") + + return self._getitem_iterable(key, axis=axis) + else: + + # maybe coerce a float scalar to integer + key = labels._maybe_cast_indexer(key) + + if is_integer(key): + if axis == 0 and isinstance(labels, ABCMultiIndex): + try: + return self._get_label(key, axis=axis) + except (KeyError, TypeError): + if self.obj.index.levels[0].is_integer(): + raise + + # this is the fallback! (for a non-float, non-integer index) + if not labels.is_floating() and not labels.is_integer(): + return self._get_loc(key, axis=axis) + + return self._get_label(key, axis=axis) + + def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): + """ + Transform a list-like of keys into a new index and an indexer. + + Parameters + ---------- + key : list-like + Targeted labels. + axis: int + Dimension on which the indexing is being made. + raise_missing: bool, default False + Whether to raise a KeyError if some labels were not found. + Will be removed in the future, and then this method will always behave as + if ``raise_missing=True``. + + Raises + ------ + KeyError + If at least one key was requested but none was found, and + raise_missing=True. + + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique). + values : array-like + Indexer for the return object, -1 denotes keys not found. 
+ """ + o = self.obj + ax = o._get_axis(axis) + + # Have the index compute an indexer or return None + # if it cannot handle: + indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) + return ax[indexer], indexer + + if ax.is_unique and not getattr(ax, "is_overlapping", False): + # If we are trying to get actual keys from empty Series, we + # patiently wait for a KeyError later on - otherwise, convert + if len(ax) or not len(key): + key = self._convert_for_reindex(key, axis) + indexer = ax.get_indexer_for(key) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) + + self._validate_read_indexer( + keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing + ) + return keyarr, indexer + + def _getitem_iterable(self, key, axis: int): + """ + Index current object with an an iterable key. + + The iterable key can be a boolean indexer or a collection of keys. + + Parameters + ---------- + key : iterable + Targeted labels or boolean indexer. + axis: int + Dimension on which the indexing is being made. + + Raises + ------ + KeyError + If no key was found. Will change in the future to raise if not all + keys were found. + IndexingError + If the boolean indexer is unalignable with the object being + indexed. + + Returns + ------- + scalar, DataFrame, or Series: indexed value(s). + """ + # caller is responsible for ensuring non-None axis + self._validate_key(key, axis) + + labels = self.obj._get_axis(axis) + + if com.is_bool_indexer(key): + # A boolean indexer + key = check_bool_indexer(labels, key) + (inds,) = key.nonzero() + return self.obj._take_with_is_copy(inds, axis=axis) + else: + # A collection of keys + keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) + return self.obj._reindex_with_indexers( + {axis: [keyarr, indexer]}, copy=True, allow_dups=True + ) + + def _validate_read_indexer( + self, key, indexer, axis: int, raise_missing: bool = False + ): + """ + Check that indexer can be used to return a result. + + e.g. at least one element was found, + unless the list of keys was actually empty. + + Parameters + ---------- + key : list-like + Targeted labels (only used to show correct error message). + indexer: array-like of booleans + Indices corresponding to the key, + (with -1 indicating not found). + axis: int + Dimension on which the indexing is being made. + raise_missing: bool + Whether to raise a KeyError if some labels are not found. Will be + removed in the future, and then this method will always behave as + if raise_missing=True. + + Raises + ------ + KeyError + If at least one key was requested but none was found, and + raise_missing=True. + """ + ax = self.obj._get_axis(axis) + + if len(key) == 0: + return + + # Count missing values: + missing = (indexer < 0).sum() + + if missing: + if missing == len(indexer): + axis_name = self.obj._get_axis_name(axis) + raise KeyError(f"None of [{key}] are in the [{axis_name}]") + + # We (temporarily) allow for some missing keys with .loc, except in + # some cases (e.g. 
setting) in which "raise_missing" will be False + if not (self.name == "loc" and not raise_missing): + not_found = list(set(key) - set(ax)) + raise KeyError(f"{not_found} not in index") + + # we skip the warning on Categorical/Interval + # as this check is actually done (check for + # non-missing values), but a bit later in the + # code, so we want to avoid warning & then + # just raising + if not (ax.is_categorical() or ax.is_interval()): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported, see " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) + + def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): + """ + Convert indexing key into something we can use to do actual fancy + indexing on a ndarray. + + Examples + ix[:5] -> slice(0, 5) + ix[[1,2,3]] -> [1,2,3] + ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) + + Going by Zen of Python? + 'In the face of ambiguity, refuse the temptation to guess.' + raise AmbiguousIndexError with integer labels? + - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + + if isinstance(obj, slice): + return self._convert_slice_indexer(obj, axis) + + # try to find out correct indexer, if not type correct raise + try: + obj = self._convert_scalar_indexer(obj, axis) + except TypeError: + # but we will allow setting + pass + + # see if we are positional in nature + is_int_index = labels.is_integer() + is_int_positional = is_integer(obj) and not is_int_index + + # if we are a label return me + try: + return labels.get_loc(obj) + except LookupError: + if isinstance(obj, tuple) and isinstance(labels, ABCMultiIndex): + if len(obj) == labels.nlevels: + return {"key": obj} + raise + except TypeError: + pass + except ValueError: + if not is_int_positional: + raise + + # a positional + if is_int_positional: + + # if we are setting and its not a valid location + # its an insert which fails by definition + + if self.name == "loc": + # always valid + return {"key": obj} + + if obj >= self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): + # a positional + raise ValueError("cannot set by positional indexing with enlargement") + + return obj + + if is_nested_tuple(obj, labels): + return labels.get_locs(obj) + + elif is_list_like_indexer(obj): + + if com.is_bool_indexer(obj): + obj = check_bool_indexer(labels, obj) + (inds,) = obj.nonzero() + return inds + else: + # When setting, missing keys are not allowed, even with .loc: + return self._get_listlike_indexer(obj, axis, raise_missing=True)[1] + else: + try: + return labels.get_loc(obj) + except LookupError: + # allow a not found key only if we are a setter + if not is_list_like_indexer(obj): + return {"key": obj} + raise + + def _get_slice_axis(self, slice_obj: slice, axis: int): + # caller is responsible for ensuring non-None axis + obj = self.obj + + if not need_slice(slice_obj): + return obj.copy(deep=False) + + indexer = self._convert_slice_indexer(slice_obj, axis) + return self._slice(indexer, axis=axis, kind="iloc") + + +class _LocationIndexer(_NDFrameIndexer): + def __getitem__(self, key): + if type(key) is tuple: + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + if self._is_scalar_access(key): + try: + return self._getitem_scalar(key) + except (KeyError, IndexError, AttributeError): + pass + return self._getitem_tuple(key) + else: + # we by definition only have the 0th axis + axis = self.axis or 0 + + 
maybe_callable = com.apply_if_callable(key, self.obj) + return self._getitem_axis(maybe_callable, axis=axis) + + def _is_scalar_access(self, key: Tuple): + raise NotImplementedError() + + def _getitem_scalar(self, key): + raise NotImplementedError() + + def _getitem_axis(self, key, axis: int): + raise NotImplementedError() + + def _getbool_axis(self, key, axis: int): + # caller is responsible for ensuring non-None axis + labels = self.obj._get_axis(axis) + key = check_bool_indexer(labels, key) + inds = key.nonzero()[0] + return self.obj._take_with_is_copy(inds, axis=axis) + + def _get_slice_axis(self, slice_obj: slice, axis: int): + """ + This is pretty simple as we just have to deal with labels. + """ + # caller is responsible for ensuring non-None axis + obj = self.obj + if not need_slice(slice_obj): + return obj.copy(deep=False) + + labels = obj._get_axis(axis) + indexer = labels.slice_indexer( + slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name + ) + + if isinstance(indexer, slice): + return self._slice(indexer, axis=axis, kind="iloc") + else: + # DatetimeIndex overrides Index.slice_indexer and may + # return a DatetimeIndex instead of a slice object. + return self.obj._take_with_is_copy(indexer, axis=axis) + + +@Appender(IndexingMixin.loc.__doc__) +class _LocIndexer(_LocationIndexer): + _valid_types = ( + "labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean" + ) + + @Appender(_NDFrameIndexer._validate_key.__doc__) + def _validate_key(self, key, axis: int): + + # valid for a collection of labels (we check their presence later) + # slice of labels (where start-end in labels) + # slice of integers (only if in the labels) + # boolean + + if isinstance(key, slice): + return + + if com.is_bool_indexer(key): + return + + if not is_list_like_indexer(key): + self._convert_scalar_indexer(key, axis) + + def _is_scalar_access(self, key: Tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_scalar(k): + return False + + ax = self.obj.axes[i] + if isinstance(ax, ABCMultiIndex): + return False + + if isinstance(k, str) and ax._supports_partial_string_indexing: + # partial string indexing, df.loc['2000', 'A'] + # should not be considered scalar + return False + + if not ax.is_unique: + return False + + return True + + def _getitem_scalar(self, key): + # a fast-path to scalar access + # if not, raise + values = self.obj._get_value(*key) + return values + + def _get_partial_string_timestamp_match_key(self, key, labels): + """ + Translate any partial string timestamp matches in key, returning the + new key. + + (GH 10331) + """ + if isinstance(labels, ABCMultiIndex): + if ( + isinstance(key, str) + and labels.levels[0]._supports_partial_string_indexing + ): + # Convert key '2016-01-01' to + # ('2016-01-01'[, slice(None, None, None)]+) + key = tuple([key] + [slice(None)] * (len(labels.levels) - 1)) + + if isinstance(key, tuple): + # Convert (..., '2016-01-01', ...) in tuple to + # (..., slice('2016-01-01', '2016-01-01', None), ...) 
+ new_key = [] + for i, component in enumerate(key): + if ( + isinstance(component, str) + and labels.levels[i]._supports_partial_string_indexing + ): + new_key.append(slice(component, component, None)) + else: + new_key.append(component) + key = tuple(new_key) + + return key + + def _getitem_axis(self, key, axis: int): + key = item_from_zerodim(key) + if is_iterator(key): + key = list(key) + + labels = self.obj._get_axis(axis) + key = self._get_partial_string_timestamp_match_key(key, labels) + + if isinstance(key, slice): + self._validate_key(key, axis) + return self._get_slice_axis(key, axis=axis) + elif com.is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif is_list_like_indexer(key): + + # convert various list-like indexers + # to a list of keys + # we will use the *values* of the object + # and NOT the index if its a PandasObject + if isinstance(labels, ABCMultiIndex): + + if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1: + # Series, or 0,1 ndim ndarray + # GH 14730 + key = list(key) + elif isinstance(key, ABCDataFrame): + # GH 15438 + raise NotImplementedError( + "Indexing a MultiIndex with a " + "DataFrame key is not " + "implemented" + ) + elif hasattr(key, "ndim") and key.ndim > 1: + raise NotImplementedError( + "Indexing a MultiIndex with a " + "multidimensional key is not " + "implemented" + ) + + if ( + not isinstance(key, tuple) + and len(key) + and not isinstance(key[0], tuple) + ): + key = tuple([key]) + + # an iterable multi-selection + if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): + + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") + + return self._getitem_iterable(key, axis=axis) + + # nested tuple slicing + if is_nested_tuple(key, labels): + locs = labels.get_locs(key) + indexer = [slice(None)] * self.ndim + indexer[axis] = locs + return self.obj.iloc[tuple(indexer)] + + # fall thru to straight lookup + self._validate_key(key, axis) + return self._get_label(key, axis=axis) + + +@Appender(IndexingMixin.iloc.__doc__) +class _iLocIndexer(_LocationIndexer): + _valid_types = ( + "integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array" + ) + _get_slice_axis = _NDFrameIndexer._get_slice_axis + + def _validate_key(self, key, axis: int): + if com.is_bool_indexer(key): + if hasattr(key, "index") and isinstance(key.index, Index): + if key.index.inferred_type == "integer": + raise NotImplementedError( + "iLocation based boolean " + "indexing on an integer type " + "is not available" + ) + raise ValueError( + "iLocation based boolean indexing cannot use " + "an indexable as a mask" + ) + return + + if isinstance(key, slice): + return + elif is_integer(key): + self._validate_integer(key, axis) + elif isinstance(key, tuple): + # a tuple should already have been caught by this point + # so don't treat a tuple as a valid indexer + raise IndexingError("Too many indexers") + elif is_list_like_indexer(key): + arr = np.array(key) + len_axis = len(self.obj._get_axis(axis)) + + # check that the key has a numeric dtype + if not is_numeric_dtype(arr.dtype): + raise IndexError(f".iloc requires numeric indexers, got {arr}") + + # check that the key does not exceed the maximum size of the index + if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): + raise IndexError("positional indexers are out-of-bounds") + else: + raise ValueError(f"Can only index by location with a [{self._valid_types}]") + + def 
_has_valid_setitem_indexer(self, indexer): + self._has_valid_positional_setitem_indexer(indexer) + + def _is_scalar_access(self, key: Tuple) -> bool: + """ + Returns + ------- + bool + """ + # this is a shortcut accessor to both .loc and .iloc + # that provide the equivalent access of .at and .iat + # a) avoid getting things via sections and (to minimize dtype changes) + # b) provide a performant path + if len(key) != self.ndim: + return False + + for i, k in enumerate(key): + if not is_integer(k): + return False + + ax = self.obj.axes[i] + if not ax.is_unique: + return False + + return True + + def _getitem_scalar(self, key): + # a fast-path to scalar access + # if not, raise + values = self.obj._get_value(*key, takeable=True) + return values + + def _validate_integer(self, key: int, axis: int) -> None: + """ + Check that 'key' is a valid position in the desired axis. + + Parameters + ---------- + key : int + Requested position. + axis : int + Desired axis. + + Raises + ------ + IndexError + If 'key' is not a valid position in axis 'axis'. + """ + len_axis = len(self.obj._get_axis(axis)) + if key >= len_axis or key < -len_axis: + raise IndexError("single positional indexer is out-of-bounds") + + def _getitem_tuple(self, tup: Tuple): + + self._has_valid_tuple(tup) + try: + return self._getitem_lowerdim(tup) + except IndexingError: + pass + + retval = self.obj + axis = 0 + for i, key in enumerate(tup): + if com.is_null_slice(key): + axis += 1 + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=axis) + + # if the dim was reduced, then pass a lower-dim the next time + if retval.ndim < self.ndim: + # TODO: this is never reached in tests; can we confirm that + # it is impossible? + axis -= 1 + + # try to get for the next axis + axis += 1 + + return retval + + def _get_list_axis(self, key, axis: int): + """ + Return Series values by list or array of integers. + + Parameters + ---------- + key : list-like positional indexer + axis : int + + Returns + ------- + Series object + + Notes + ----- + `axis` can only be zero. + """ + try: + return self.obj._take_with_is_copy(key, axis=axis) + except IndexError: + # re-raise with different error message + raise IndexError("positional indexers are out-of-bounds") + + def _getitem_axis(self, key, axis: int): + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + + if isinstance(key, list): + key = np.asarray(key) + + if com.is_bool_indexer(key): + self._validate_key(key, axis) + return self._getbool_axis(key, axis=axis) + + # a list of integers + elif is_list_like_indexer(key): + return self._get_list_axis(key, axis=axis) + + # a single integer + else: + key = item_from_zerodim(key) + if not is_integer(key): + raise TypeError("Cannot index by location index with a non-integer key") + + # validate the location + self._validate_integer(key, axis) + + return self._get_loc(key, axis=axis) + + # raise_missing is included for compat with the parent class signature + def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): + """ + Much simpler as we only have to deal with our valid types. 
+ """ + # make need to convert a float key + if isinstance(obj, slice): + return self._convert_slice_indexer(obj, axis) + + elif is_float(obj): + return self._convert_scalar_indexer(obj, axis) + + try: + self._validate_key(obj, axis) + return obj + except ValueError: + raise ValueError(f"Can only index by location with a [{self._valid_types}]") + + +class _ScalarAccessIndexer(_NDFrameIndexerBase): + """ + Access scalars quickly. + """ + + def _convert_key(self, key, is_setter: bool = False): + raise AbstractMethodError(self) + + def __getitem__(self, key): + if not isinstance(key, tuple): + + # we could have a convertible item here (e.g. Timestamp) + if not is_list_like_indexer(key): + key = tuple([key]) + else: + raise ValueError("Invalid call for scalar access (getting)!") + + key = self._convert_key(key) + return self.obj._get_value(*key, takeable=self._takeable) + + def __setitem__(self, key, value): + if isinstance(key, tuple): + key = tuple(com.apply_if_callable(x, self.obj) for x in key) + else: + # scalar callable may return tuple + key = com.apply_if_callable(key, self.obj) + + if not isinstance(key, tuple): + key = _tuplify(self.ndim, key) + if len(key) != self.ndim: + raise ValueError("Not enough indexers for scalar access (setting)!") + key = list(self._convert_key(key, is_setter=True)) + key.append(value) + self.obj._set_value(*key, takeable=self._takeable) + + +@Appender(IndexingMixin.at.__doc__) +class _AtIndexer(_ScalarAccessIndexer): + _takeable = False + + def _convert_key(self, key, is_setter: bool = False): + """ + Require they keys to be the same type as the index. (so we don't + fallback) + """ + # allow arbitrary setting + if is_setter: + return list(key) + + for ax, i in zip(self.obj.axes, key): + if ax.is_integer(): + if not is_integer(i): + raise ValueError( + "At based indexing on an integer index " + "can only have integer indexers" + ) + else: + if is_integer(i) and not ax.holds_integer(): + raise ValueError( + "At based indexing on an non-integer " + "index can only have non-integer " + "indexers" + ) + return key + + +@Appender(IndexingMixin.iat.__doc__) +class _iAtIndexer(_ScalarAccessIndexer): + _takeable = True + + def _convert_key(self, key, is_setter: bool = False): + """ + Require integer args. (and convert to label arguments) + """ + for a, i in zip(self.obj.axes, key): + if not is_integer(i): + raise ValueError("iAt based indexing can only have integer indexers") + return key + + +def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: + """ + Given an indexer for the first dimension, create an equivalent tuple + for indexing over all dimensions. + + Parameters + ---------- + ndim : int + loc : object + + Returns + ------- + tuple + """ + _tup: List[Union[Hashable, slice]] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) + + +def convert_to_index_sliceable(obj, key): + """ + If we are index sliceable, then return my slicer, otherwise return None. 
+ """ + idx = obj.index + if isinstance(key, slice): + return idx._convert_slice_indexer(key, kind="getitem") + + elif isinstance(key, str): + + # we are an actual column + if key in obj._data.items: + return None + + # We might have a datetimelike string that we can translate to a + # slice here via partial string indexing + if idx._supports_partial_string_indexing: + try: + return idx._get_string_slice(key) + except (KeyError, ValueError, NotImplementedError): + return None + + return None + + +def check_bool_indexer(index: Index, key) -> np.ndarray: + """ + Check if key is a valid boolean indexer for an object with such index and + perform reindexing or conversion if needed. + + This function assumes that is_bool_indexer(key) == True. + + Parameters + ---------- + index : Index + Index of the object on which the indexing is done. + key : list-like + Boolean indexer to check. + + Returns + ------- + np.array + Resulting key. + + Raises + ------ + IndexError + If the key does not have the same length as index. + IndexingError + If the index of the key is unalignable to index. + """ + result = key + if isinstance(key, ABCSeries) and not key.index.equals(index): + result = result.reindex(index) + mask = isna(result._values) + if mask.any(): + raise IndexingError( + "Unalignable boolean Series provided as " + "indexer (index of the boolean Series and of " + "the indexed object do not match)." + ) + result = result.astype(bool)._values + else: + # key might be sparse / object-dtype bool, check_array_indexer needs bool array + result = np.asarray(result, dtype=bool) + result = check_array_indexer(index, result) + + return result + + +def convert_missing_indexer(indexer): + """ + Reverse convert a missing indexer, which is a dict + return the scalar indexer and a boolean indicating if we converted + """ + if isinstance(indexer, dict): + + # a missing key (but not a tuple indexer) + indexer = indexer["key"] + + if isinstance(indexer, bool): + raise KeyError("cannot use a single bool to index into setitem") + return indexer, True + + return indexer, False + + +def convert_from_missing_indexer_tuple(indexer, axes): + """ + Create a filtered indexer that doesn't have any missing indexers. + """ + + def get_indexer(_i, _idx): + return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx + + return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)) + + +def maybe_convert_ix(*args): + """ + We likely want to take the cross-product. + """ + ixify = True + for arg in args: + if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)): + ixify = False + + if ixify: + return np.ix_(*args) + else: + return args + + +def is_nested_tuple(tup, labels) -> bool: + """ + Returns + ------- + bool + """ + # check for a compatible nested tuple and multiindexes among the axes + if not isinstance(tup, tuple): + return False + + for i, k in enumerate(tup): + + if is_list_like(k) or isinstance(k, slice): + return isinstance(labels, ABCMultiIndex) + + return False + + +def is_label_like(key) -> bool: + """ + Returns + ------- + bool + """ + # select a label or row + return not isinstance(key, slice) and not is_list_like_indexer(key) + + +def need_slice(obj) -> bool: + """ + Returns + ------- + bool + """ + return ( + obj.start is not None + or obj.stop is not None + or (obj.step is not None and obj.step != 1) + ) + + +def _non_reducing_slice(slice_): + """ + Ensurse that a slice doesn't reduce to a Series or Scalar. 
+ + Any user-paseed `subset` should have this called on it + to make sure we're always working with DataFrames. + """ + # default to column slice, like DataFrame + # ['A', 'B'] -> IndexSlices[:, ['A', 'B']] + kinds = (ABCSeries, np.ndarray, Index, list, str) + if isinstance(slice_, kinds): + slice_ = IndexSlice[:, slice_] + + def pred(part) -> bool: + """ + Returns + ------- + bool + True if slice does *not* reduce, + False if `part` is a tuple. + """ + # true when slice does *not* reduce, False when part is a tuple, + # i.e. MultiIndex slice + return (isinstance(part, slice) or is_list_like(part)) and not isinstance( + part, tuple + ) + + if not is_list_like(slice_): + if not isinstance(slice_, slice): + # a 1-d slice, like df.loc[1] + slice_ = [[slice_]] + else: + # slice(a, b, c) + slice_ = [slice_] # to tuplize later + else: + slice_ = [part if pred(part) else [part] for part in slice_] + return tuple(slice_) + + +def _maybe_numeric_slice(df, slice_, include_bool=False): + """ + Want nice defaults for background_gradient that don't break + with non-numeric data. But if slice_ is passed go with that. + """ + if slice_ is None: + dtypes = [np.number] + if include_bool: + dtypes.append(bool) + slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns] + return slice_ + + +def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj) -> bool: + """ + Returns + ------- + bool + True if we have an equal len settable. + """ + if not len(labels) == 1 or not np.iterable(value) or is_scalar(plane_indexer[0]): + return False + + item = labels[0] + index = obj[item].index + + values_len = len(value) + # equal len list/ndarray + if len(index) == values_len: + return True + elif lplane_indexer == values_len: + return True + + return False diff --git a/venv/Lib/site-packages/pandas/core/internals/__init__.py b/venv/Lib/site-packages/pandas/core/internals/__init__.py new file mode 100644 index 0000000..37a3405 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/internals/__init__.py @@ -0,0 +1,47 @@ +from pandas.core.internals.blocks import ( # io.pytables, io.packers + Block, + BoolBlock, + CategoricalBlock, + ComplexBlock, + DatetimeBlock, + DatetimeTZBlock, + ExtensionBlock, + FloatBlock, + IntBlock, + ObjectBlock, + TimeDeltaBlock, + _block_shape, + _safe_reshape, + make_block, +) +from pandas.core.internals.managers import ( + BlockManager, + SingleBlockManager, + _transform_index, + concatenate_block_managers, + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) + +__all__ = [ + "Block", + "BoolBlock", + "CategoricalBlock", + "ComplexBlock", + "DatetimeBlock", + "DatetimeTZBlock", + "ExtensionBlock", + "FloatBlock", + "IntBlock", + "ObjectBlock", + "TimeDeltaBlock", + "_safe_reshape", + "make_block", + "_block_shape", + "BlockManager", + "SingleBlockManager", + "_transform_index", + "concatenate_block_managers", + "create_block_manager_from_arrays", + "create_block_manager_from_blocks", +] diff --git a/venv/Lib/site-packages/pandas/core/internals/blocks.py b/venv/Lib/site-packages/pandas/core/internals/blocks.py new file mode 100644 index 0000000..5fcd796 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/internals/blocks.py @@ -0,0 +1,3194 @@ +from datetime import datetime, timedelta +import functools +import inspect +import re +from typing import Any, List +import warnings + +import numpy as np + +from pandas._libs import NaT, algos as libalgos, lib, tslib, writers +from pandas._libs.index import convert_scalar +import pandas._libs.internals as 
libinternals +from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs.timezones import tz_compare +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import ( + astype_nansafe, + find_common_type, + infer_dtype_from, + infer_dtype_from_scalar, + maybe_downcast_numeric, + maybe_downcast_to_dtype, + maybe_infer_dtype_type, + maybe_promote, + maybe_upcast, + soft_convert_objects, +) +from pandas.core.dtypes.common import ( + _NS_DTYPE, + _TD_DTYPE, + ensure_platform_int, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_object_dtype, + is_period_dtype, + is_re, + is_re_compilable, + is_sparse, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.concat import concat_categorical, concat_datetime +from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCExtensionArray, + ABCPandasArray, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + _isna_compat, + array_equivalent, + is_valid_nat_for_dtype, + isna, +) + +import pandas.core.algorithms as algos +from pandas.core.arrays import ( + Categorical, + DatetimeArray, + ExtensionArray, + PandasArray, + PandasDtype, + TimedeltaArray, +) +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.construction import extract_array +from pandas.core.indexers import ( + check_setitem_lengths, + is_empty_indexer, + is_scalar_indexer, +) +import pandas.core.missing as missing +from pandas.core.nanops import nanpercentile + +from pandas.io.formats.printing import pprint_thing + + +class Block(PandasObject): + """ + Canonical n-dimensional unit of homogeneous dtype contained in a pandas + data structure + + Index-ignorant; let the container take care of that + """ + + __slots__ = ["_mgr_locs", "values", "ndim"] + is_numeric = False + is_float = False + is_integer = False + is_complex = False + is_datetime = False + is_datetimetz = False + is_timedelta = False + is_bool = False + is_object = False + is_categorical = False + is_extension = False + _can_hold_na = False + _can_consolidate = True + _verify_integrity = True + _validate_ndim = True + _ftype = "dense" + _concatenator = staticmethod(np.concatenate) + + def __init__(self, values, placement, ndim=None): + self.ndim = self._check_ndim(values, ndim) + self.mgr_locs = placement + self.values = values + + if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): + raise ValueError( + f"Wrong number of items passed {len(self.values)}, " + f"placement implies {len(self.mgr_locs)}" + ) + + def _check_ndim(self, values, ndim): + """ + ndim inference and validation. + + Infers ndim from 'values' if not provided to __init__. + Validates that values.ndim and ndim are consistent if and only if + the class variable '_validate_ndim' is True. + + Parameters + ---------- + values : array-like + ndim : int or None + + Returns + ------- + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + if ndim is None: + ndim = values.ndim + + if self._validate_ndim and values.ndim != ndim: + raise ValueError( + "Wrong number of dimensions. 
" + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) + return ndim + + @property + def _holder(self): + """The array-like that can hold the underlying values. + + None for 'Block', overridden by subclasses that don't + use an ndarray. + """ + return None + + @property + def _consolidate_key(self): + return (self._can_consolidate, self.dtype.name) + + @property + def _is_single_block(self): + return self.ndim == 1 + + @property + def is_view(self): + """ return a boolean if I am possibly a view """ + return self.values.base is not None + + @property + def is_datelike(self): + """ return True if I am a non-datelike """ + return self.is_datetime or self.is_timedelta + + def is_categorical_astype(self, dtype): + """ + validate that we have a astypeable to categorical, + returns a boolean if we are a categorical + """ + if dtype is Categorical or dtype is CategoricalDtype: + # this is a pd.Categorical, but is not + # a valid type for astypeing + raise TypeError(f"invalid type {dtype} for astype") + + elif is_categorical_dtype(dtype): + return True + + return False + + def external_values(self, dtype=None): + """ + The array that Series.values returns (public attribute). + + This has some historical constraints, and is overridden in block + subclasses to return the correct array (e.g. period returns + object ndarray and datetimetz a datetime64[ns] ndarray instead of + proper extension array). + """ + return self.values + + def internal_values(self, dtype=None): + """ return an internal format, currently just the ndarray + this should be the pure internal API format + """ + return self.values + + def array_values(self) -> ExtensionArray: + """ + The array that Series.array returns. Always an ExtensionArray. + """ + return PandasArray(self.values) + + def get_values(self, dtype=None): + """ + return an internal format, currently just the ndarray + this is often overridden to handle to_dense like operations + """ + if is_object_dtype(dtype): + return self.values.astype(object) + return self.values + + def get_block_values(self, dtype=None): + """ + This is used in the JSON C code + """ + return self.get_values(dtype=dtype) + + def to_dense(self): + return self.values.view() + + @property + def fill_value(self): + return np.nan + + @property + def mgr_locs(self): + return self._mgr_locs + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, libinternals.BlockPlacement): + new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an + array + """ + return self.dtype + + def make_block(self, values, placement=None) -> "Block": + """ + Create a new block, with type inference propagate any values that are + not specified + """ + if placement is None: + placement = self.mgr_locs + + return make_block(values, placement=placement, ndim=self.ndim) + + def make_block_same_class(self, values, placement=None, ndim=None): + """ Wrap given values in a block of same type as self. 
""" + if placement is None: + placement = self.mgr_locs + if ndim is None: + ndim = self.ndim + return make_block(values, placement=placement, ndim=ndim, klass=type(self)) + + def __repr__(self) -> str: + # don't want to print out all of the items here + name = type(self).__name__ + if self._is_single_block: + + result = f"{name}: {len(self)} dtype: {self.dtype}" + + else: + + shape = " x ".join(pprint_thing(s) for s in self.shape) + result = ( + f"{name}: {pprint_thing(self.mgr_locs.indexer)}, " + f"{shape}, dtype: {self.dtype}" + ) + + return result + + def __len__(self) -> int: + return len(self.values) + + def __getstate__(self): + return self.mgr_locs.indexer, self.values + + def __setstate__(self, state): + self.mgr_locs = libinternals.BlockPlacement(state[0]) + self.values = state[1] + self.ndim = self.values.ndim + + def _slice(self, slicer): + """ return a slice of my values """ + return self.values[slicer] + + def getitem_block(self, slicer, new_mgr_locs=None): + """ + Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. + """ + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if self._validate_ndim and new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") + + return self.make_block_same_class(new_values, new_mgr_locs) + + @property + def shape(self): + return self.values.shape + + @property + def dtype(self): + return self.values.dtype + + @property + def ftype(self): + if getattr(self.values, "_pandas_ftype", False): + dtype = self.dtype.subtype + else: + dtype = self.dtype + return f"{dtype}:{self._ftype}" + + def merge(self, other): + return _merge_blocks([self, other]) + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1) + ) + + def iget(self, i): + return self.values[i] + + def set(self, locs, values): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + self.values[locs] = values + + def delete(self, loc): + """ + Delete given loc(-s) from block in-place. + """ + self.values = np.delete(self.values, loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) + + def apply(self, func, **kwargs): + """ apply the function to my values; return a block if we are not + one + """ + with np.errstate(all="ignore"): + result = func(self.values, **kwargs) + + if is_extension_array_dtype(result) and result.ndim > 1: + # if we get a 2D ExtensionArray, we need to split it into 1D pieces + nbs = [] + for i, loc in enumerate(self.mgr_locs): + vals = result[i] + nv = _block_shape(vals, ndim=self.ndim) + block = self.make_block(values=nv, placement=[loc]) + nbs.append(block) + return nbs + + if not isinstance(result, Block): + result = self.make_block(values=_block_shape(result, ndim=self.ndim)) + + return result + + def fillna(self, value, limit=None, inplace=False, downcast=None): + """ fillna on the block with the value. 
If we fail, then convert to + ObjectBlock and try again + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + mask = isna(self.values) + if limit is not None: + limit = libalgos._validate_limit(None, limit=limit) + mask[mask.cumsum(self.ndim - 1) > limit] = False + + if not self._can_hold_na: + if inplace: + return self + else: + return self.copy() + + if self._can_hold_element(value): + # equivalent: _try_coerce_args(value) would not raise + blocks = self.putmask(mask, value, inplace=inplace) + return self._maybe_downcast(blocks, downcast) + + # we can't process the value, but nothing to do + if not mask.any(): + return self if inplace else self.copy() + + # operate column-by-column + def f(mask, val, idx): + block = self.coerce_to_target_dtype(value) + + # slice out our block + if idx is not None: + # i.e. self.ndim == 2 + block = block.getitem_block(slice(idx, idx + 1)) + return block.fillna(value, limit=limit, inplace=inplace, downcast=None) + + return self.split_and_operate(None, f, inplace) + + def split_and_operate(self, mask, f, inplace: bool): + """ + split the block per-column, and apply the callable f + per-column, return a new block for each. Handle + masking which will not change a block unless needed. + + Parameters + ---------- + mask : 2-d boolean mask + f : callable accepting (1d-mask, 1d values, indexer) + inplace : boolean + + Returns + ------- + list of blocks + """ + + if mask is None: + mask = np.broadcast_to(True, shape=self.shape) + + new_values = self.values + + def make_a_block(nv, ref_loc): + if isinstance(nv, list): + assert len(nv) == 1, nv + assert isinstance(nv[0], Block) + block = nv[0] + else: + # Put back the dimension that was taken from it and make + # a block out of the result. + nv = _block_shape(nv, ndim=self.ndim) + block = self.make_block(values=nv, placement=ref_loc) + return block + + # ndim == 1 + if self.ndim == 1: + if mask.any(): + nv = f(mask, new_values, None) + else: + nv = new_values if inplace else new_values.copy() + block = make_a_block(nv, self.mgr_locs) + return [block] + + # ndim > 1 + new_blocks = [] + for i, ref_loc in enumerate(self.mgr_locs): + m = mask[i] + v = new_values[i] + + # need a new block + if m.any(): + nv = f(m, v, i) + else: + nv = v if inplace else v.copy() + + block = make_a_block(nv, [ref_loc]) + new_blocks.append(block) + + return new_blocks + + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: + + # no need to downcast our float + # unless indicated + if downcast is None and ( + self.is_float or self.is_timedelta or self.is_datetime + ): + return blocks + + return _extend_blocks([b.downcast(downcast) for b in blocks]) + + def downcast(self, dtypes=None): + """ try to downcast each item to the dict of dtypes if present """ + + # turn it off completely + if dtypes is False: + return self + + values = self.values + + # single block handling + if self._is_single_block: + + # try to cast all non-floats here + if dtypes is None: + dtypes = "infer" + + nv = maybe_downcast_to_dtype(values, dtypes) + return self.make_block(nv) + + # ndim > 1 + if dtypes is None: + return self + + if not (dtypes == "infer" or isinstance(dtypes, dict)): + raise ValueError( + "downcast must have a dictionary or 'infer' as its argument" + ) + elif dtypes != "infer": + raise AssertionError("dtypes as dict is not supported yet") + + # operate column-by-column + # this is expensive as it splits the blocks items-by-item + def f(mask, val, idx): + val = maybe_downcast_to_dtype(val, dtype="infer") + return 
val + + return self.split_and_operate(None, f, False) + + def astype(self, dtype, copy: bool = False, errors: str = "raise"): + """ + Coerce to the new dtype. + + Parameters + ---------- + dtype : str, dtype convertible + copy : bool, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'ignore' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + Block + """ + errors_legal_values = ("raise", "ignore") + + if errors not in errors_legal_values: + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" + ) + raise ValueError(invalid_arg) + + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." + ) + raise TypeError(msg) + + # may need to convert to categorical + if self.is_categorical_astype(dtype): + + if is_categorical_dtype(self.values): + # GH 10696/18593: update an existing categorical efficiently + return self.make_block(self.values.astype(dtype, copy=copy)) + + return self.make_block(Categorical(self.values, dtype=dtype)) + + dtype = pandas_dtype(dtype) + + # astype processing + if is_dtype_equal(self.dtype, dtype): + if copy: + return self.copy() + return self + + # force the copy here + if self.is_extension: + # TODO: Should we try/except this astype? + values = self.values.astype(dtype) + else: + if issubclass(dtype.type, str): + + # use native type formatting for datetime/tz/timedelta + if self.is_datelike: + values = self.to_native_types() + + # astype formatting + else: + values = self.get_values() + + else: + values = self.get_values(dtype=dtype) + + # _astype_nansafe works fine with 1-d only + vals1d = values.ravel() + try: + values = astype_nansafe(vals1d, dtype, copy=True) + except (ValueError, TypeError): + # e.g. astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "raise": + raise + newb = self.copy() if copy else self + return newb + + # TODO(extension) + # should we make this attribute? + if isinstance(values, np.ndarray): + values = values.reshape(self.shape) + + newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) + + if newb.is_numeric and self.is_numeric: + if newb.shape != self.shape: + raise TypeError( + f"cannot set astype for copy = [{copy}] for dtype " + f"({self.dtype.name} [{self.shape}]) to different shape " + f"({newb.dtype.name} [{newb.shape}])" + ) + return newb + + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ): + """ attempt to coerce any object types to better types return a copy + of the block (if copy = True) by definition we are not an ObjectBlock + here! 
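Block.astype above is what Series.astype / DataFrame.astype ultimately dispatch to; the errors keyword decides whether a failed cast raises or hands back the original object. A small sketch with made-up data, using only the public API:

import pandas as pd

s = pd.Series(["1", "2", "bad"])

# errors="raise" (the default) propagates the failure from astype_nansafe
try:
    s.astype(int)
except ValueError as exc:
    print("cast failed:", exc)

# errors="ignore" swallows the failure and returns the original data
print(s.astype(int, errors="ignore").dtype)   # object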
+ """ + + return self.copy() if copy else self + + def _can_hold_element(self, element: Any) -> bool: + """ require the same dtype as ourselves """ + dtype = self.values.dtype.type + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, dtype) + return isinstance(element, dtype) + + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + values = self.get_values() + + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + itemsize = writers.word_len(na_rep) + + if not self.is_object and not quoting and itemsize: + values = values.astype(f"= 1: + if self.ndim - 1 == new.ndim and axis == 1: + new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) + new = new.astype(new_values.dtype) + + # we require exact matches between the len of the + # values we are setting (or is compat). np.putmask + # doesn't check this and will simply truncate / pad + # the output, but we want sane error messages + # + # TODO: this prob needs some better checking + # for 2D cases + if ( + is_list_like(new) + and np.any(mask[mask]) + and getattr(new, "ndim", 1) == 1 + ): + if mask[mask].shape[-1] == len(new): + # GH 30567 + # If length of ``new`` is less than the length of ``new_values``, + # `np.putmask` would first repeat the ``new`` array and then + # assign the masked values hence produces incorrect result. + # `np.place` on the other hand uses the ``new`` values at it is + # to place in the masked locations of ``new_values`` + np.place(new_values, mask, new) + elif mask.shape[-1] == len(new) or len(new) == 1: + np.putmask(new_values, mask, new) + else: + raise ValueError("cannot assign mismatch length to masked array") + else: + np.putmask(new_values, mask, new) + + # maybe upcast me + elif mask.any(): + if transpose: + mask = mask.T + if isinstance(new, np.ndarray): + new = new.T + axis = new_values.ndim - axis - 1 + + # Pseudo-broadcast + if getattr(new, "ndim", 0) >= 1: + if self.ndim - 1 == new.ndim: + new_shape = list(new.shape) + new_shape.insert(axis, 1) + new = new.reshape(tuple(new_shape)) + + # operate column-by-column + def f(mask, val, idx): + + if idx is None: + # ndim==1 case. 
+ n = new + else: + + if isinstance(new, np.ndarray): + n = np.squeeze(new[idx % new.shape[0]]) + else: + n = np.array(new) + + # type of the new block + dtype, _ = maybe_promote(n.dtype) + + # we need to explicitly astype here to make a copy + n = n.astype(dtype) + + nv = _putmask_smart(val, mask, n) + return nv + + new_blocks = self.split_and_operate(mask, f, inplace) + return new_blocks + + if inplace: + return [self] + + if transpose: + new_values = new_values.T + + return [self.make_block(new_values)] + + def coerce_to_target_dtype(self, other): + """ + coerce the current block to a dtype compat for other + we will return a block, possibly object, and not raise + + we can also safely try to coerce to the same dtype + and will receive the same block + """ + + # if we cannot then coerce to object + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + + if is_dtype_equal(self.dtype, dtype): + return self + + if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): + # we don't upcast to bool + return self.astype(object) + + elif (self.is_float or self.is_complex) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + ): + # don't coerce float/complex to int + return self + + elif ( + self.is_datetime + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): + + # not a datetime + if not ( + (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) + and self.is_datetime + ): + return self.astype(object) + + # don't upcast timezone with different timezone or no timezone + mytz = getattr(self.dtype, "tz", None) + othertz = getattr(dtype, "tz", None) + + if not tz_compare(mytz, othertz): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) + + elif self.is_timedelta or is_timedelta64_dtype(dtype): + + # not a timedelta + if not (is_timedelta64_dtype(dtype) and self.is_timedelta): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) + + try: + return self.astype(dtype) + except (ValueError, TypeError, OverflowError): + return self.astype(object) + + def interpolate( + self, + method="pad", + axis=0, + index=None, + values=None, + inplace=False, + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + coerce=False, + downcast=None, + **kwargs, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") + + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. 
+ # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + + # a fill na type method + try: + m = missing.clean_fill_method(method) + except ValueError: + m = None + + if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate_with_fill( + method=m, + axis=axis, + inplace=inplace, + limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast, + ) + # validate the interp method + m = missing.clean_interp_method(method, **kwargs) + + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate( + method=m, + index=index, + values=values, + axis=axis, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + inplace=inplace, + downcast=downcast, + **kwargs, + ) + + def _interpolate_with_fill( + self, + method="pad", + axis=0, + inplace=False, + limit=None, + fill_value=None, + coerce=False, + downcast=None, + ): + """ fillna but using the interpolate machinery """ + + inplace = validate_bool_kwarg(inplace, "inplace") + + # if we are coercing, then don't force the conversion + # if the block can't hold the type + if coerce: + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] + + values = self.values if inplace else self.values.copy() + + # We only get here for non-ExtensionBlock + fill_value = convert_scalar(self.values, fill_value) + + values = missing.interpolate_2d( + values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype, + ) + + blocks = [self.make_block_same_class(values, ndim=self.ndim)] + return self._maybe_downcast(blocks, downcast) + + def _interpolate( + self, + method=None, + index=None, + values=None, + fill_value=None, + axis=0, + limit=None, + limit_direction="forward", + limit_area=None, + inplace=False, + downcast=None, + **kwargs, + ): + """ interpolate using scipy wrappers """ + + inplace = validate_bool_kwarg(inplace, "inplace") + data = self.values if inplace else self.values.copy() + + # only deal with floats + if not self.is_float: + if not self.is_integer: + return self + data = data.astype(np.float64) + + if fill_value is None: + fill_value = self.fill_value + + if method in ("krogh", "piecewise_polynomial", "pchip"): + if not index.is_monotonic: + raise ValueError( + f"{method} interpolation requires that the index be monotonic." + ) + # process 1-d slices in the axis direction + + def func(x): + + # process a 1-d slice, returning it + # should the axis argument be handled below in apply_along_axis? + # i.e. 
not an arg to missing.interpolate_1d + return missing.interpolate_1d( + index, + x, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + bounds_error=False, + **kwargs, + ) + + # interp each column independently + interp_values = np.apply_along_axis(func, axis, data) + + blocks = [self.make_block_same_class(interp_values)] + return self._maybe_downcast(blocks, downcast) + + def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block.bb + + """ + + # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock + # so need to preserve types + # sparse is treated like an ndarray, but needs .get_values() shaping + + values = self.values + + if fill_tuple is None: + fill_value = self.fill_value + allow_fill = False + else: + fill_value = fill_tuple[0] + allow_fill = True + + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value + ) + + # Called from three places in managers, all of which satisfy + # this assertion + assert not (axis == 0 and new_mgr_locs is None) + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + if not is_dtype_equal(new_values.dtype, self.dtype): + return self.make_block(new_values, new_mgr_locs) + else: + return self.make_block_same_class(new_values, new_mgr_locs) + + def diff(self, n: int, axis: int = 1) -> List["Block"]: + """ return block for the diff of the values """ + new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) + # We use block_shape for ExtensionBlock subclasses, which may call here + # via a super. + new_values = _block_shape(new_values, ndim=self.ndim) + return [self.make_block(values=new_values)] + + def shift(self, periods, axis=0, fill_value=None): + """ shift the block by periods, possibly upcast """ + + # convert integer to float if necessary. need to do a lot more than + # that, handle boolean etc also + new_values, fill_value = maybe_upcast(self.values, fill_value) + + # make sure array sent to np.roll is c_contiguous + f_ordered = new_values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + + if np.prod(new_values.shape): + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) + + axis_indexer = [slice(None)] * self.ndim + if periods > 0: + axis_indexer[axis] = slice(None, periods) + else: + axis_indexer[axis] = slice(periods, None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return [self.make_block(new_values)] + + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: + """ + evaluate the block; return result block(s) from the result + + Parameters + ---------- + other : a ndarray/object + cond : the condition to respect + align : boolean, perform alignment on other/cond + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. 
On error return original object + axis : int + + Returns + ------- + a new block(s), the result of the func + """ + import pandas.core.computation.expressions as expressions + + assert errors in ["raise", "ignore"] + transpose = self.ndim == 2 + + values = self.values + orig_other = other + if transpose: + values = values.T + + other = getattr(other, "_values", getattr(other, "values", other)) + cond = getattr(cond, "values", cond) + + # If the default broadcasting would go in the wrong direction, then + # explicitly reshape other instead + if getattr(other, "ndim", 0) >= 1: + if values.ndim - 1 == other.ndim and axis == 1: + other = other.reshape(tuple(other.shape + (1,))) + elif transpose and values.ndim == self.ndim - 1: + cond = cond.T + + if not hasattr(cond, "shape"): + raise ValueError("where must have a condition that is ndarray like") + + # our where function + def func(cond, values, other): + + if not ( + (self.is_integer or self.is_bool) + and lib.is_float(other) + and np.isnan(other) + ): + # np.where will cast integer array to floats in this case + if not self._can_hold_element(other): + raise TypeError + if lib.is_scalar(other) and isinstance(values, np.ndarray): + other = convert_scalar(values, other) + + fastres = expressions.where(cond, values, other) + return fastres + + if cond.ravel().all(): + result = values + else: + # see if we can operate on the entire block, or need item-by-item + # or if we are a single block (ndim == 1) + try: + result = func(cond, values, other) + except TypeError: + + # we cannot coerce, return a compat dtype + # we are explicitly ignoring errors + block = self.coerce_to_target_dtype(other) + blocks = block.where( + orig_other, + cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=axis, + ) + return self._maybe_downcast(blocks, "infer") + + if self._can_hold_na or self.ndim == 1: + + if transpose: + result = result.T + + return [self.make_block(result)] + + # might need to separate out blocks + axis = cond.ndim - 1 + cond = cond.swapaxes(axis, 0) + mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) + + result_blocks = [] + for m in [mask, ~mask]: + if m.any(): + taken = result.take(m.nonzero()[0], axis=axis) + r = maybe_downcast_numeric(taken, self.dtype) + nb = self.make_block(r.T, placement=self.mgr_locs[m]) + result_blocks.append(nb) + + return result_blocks + + def equals(self, other) -> bool: + if self.dtype != other.dtype or self.shape != other.shape: + return False + return array_equivalent(self.values, other.values) + + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + n_rows : int + Only used in ExtensionBlock._unstack + fill_value : int + Only used in ExtensionBlock._unstack + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. 
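Block.where above is reached through the public .where / .mask methods; when the replacement value cannot be held by the current dtype, the block is coerced via coerce_to_target_dtype before retrying. An illustrative sketch with invented values:

import pandas as pd

s = pd.Series([1, 2, 3, 4])

# NaN cannot live in an integer block, so the result is upcast to float
print(s.where(s > 2).dtype)               # float64

# an integer fill value can be held, so the values stay integers
print(s.where(s > 2, other=0).tolist())   # [0, 0, 3, 4]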
+ """ + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [make_block(new_values, placement=new_placement)] + return blocks, mask + + def quantile(self, qs, interpolation="linear", axis=0): + """ + compute the quantiles of the + + Parameters + ---------- + qs: a scalar or list of the quantiles to be computed + interpolation: type of interpolation, default 'linear' + axis: axis to compute, default 0 + + Returns + ------- + Block + """ + # We should always have ndim == 2 because Series dispatches to DataFrame + assert self.ndim == 2 + + values = self.get_values() + + is_empty = values.shape[axis] == 0 + orig_scalar = not is_list_like(qs) + if orig_scalar: + # make list-like, unpack later + qs = [qs] + + if is_empty: + # create the array of na_values + # 2d len(values) * len(qs) + result = np.repeat( + np.array([self.fill_value] * len(qs)), len(values) + ).reshape(len(values), len(qs)) + else: + # asarray needed for Sparse, see GH#24600 + mask = np.asarray(isna(values)) + result = nanpercentile( + values, + np.array(qs) * 100, + axis=axis, + na_value=self.fill_value, + mask=mask, + ndim=values.ndim, + interpolation=interpolation, + ) + + result = np.array(result, copy=False) + result = result.T + + if orig_scalar and not lib.is_scalar(result): + # result could be scalar in case with is_empty and self.ndim == 1 + assert result.shape[-1] == 1, result.shape + result = result[..., 0] + result = lib.item_from_zerodim(result) + + ndim = np.ndim(result) + return make_block(result, placement=np.arange(len(result)), ndim=ndim) + + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + A new block if there is anything to replace or the original block. + """ + + if mask.any(): + if not regex: + self = self.coerce_to_target_dtype(value) + return self.putmask(mask, value, inplace=inplace) + else: + return self._replace_single( + to_replace, + value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) + return self + + +class NonConsolidatableMixIn: + """ hold methods for the nonconsolidatable blocks """ + + _can_consolidate = False + _verify_integrity = False + _validate_ndim = False + + def __init__(self, values, placement, ndim=None): + """Initialize a non-consolidatable block. + + 'ndim' may be inferred from 'placement'. + + This will call continue to call __init__ for the other base + classes mixed in with this Mixin. 
+ """ + # Placement must be converted to BlockPlacement so that we can check + # its length + if not isinstance(placement, libinternals.BlockPlacement): + placement = libinternals.BlockPlacement(placement) + + # Maybe infer ndim from placement + if ndim is None: + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + super().__init__(values, placement, ndim=ndim) + + @property + def shape(self): + if self.ndim == 1: + return ((len(self.values)),) + return (len(self.mgr_locs), len(self.values)) + + def iget(self, col): + + if self.ndim == 2 and isinstance(col, tuple): + col, loc = col + if not com.is_null_slice(col) and col != 0: + raise IndexError(f"{self} only contains one item") + elif isinstance(col, slice): + if col != slice(None): + raise NotImplementedError(col) + return self.values[[loc]] + return self.values[loc] + else: + if col != 0: + raise IndexError(f"{self} only contains one item") + return self.values + + def should_store(self, value): + return isinstance(value, self._holder) + + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values + + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): + """ + putmask the data to the block; we must be a single block and not + generate other blocks + + return the resulting block + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block, the result of the putmask + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + # use block's copy logic. + # .values may be an Index which does shallow copy by default + new_values = self.values if inplace else self.copy().values + + if isinstance(new, np.ndarray) and len(new) == len(mask): + new = new[mask] + + mask = _safe_reshape(mask, new_values.shape) + + new_values[mask] = new + return [self.make_block(values=new_values)] + + def _get_unstack_items(self, unstacker, new_columns): + """ + Get the placement, values, and mask for a Block unstack. + + This is shared between ObjectBlock and ExtensionBlock. They + differ in that ObjectBlock passes the values, while ExtensionBlock + passes the dummy ndarray of positions to be used by a take + later. + + Parameters + ---------- + unstacker : pandas.core.reshape.reshape._Unstacker + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + new_placement : ndarray[int] + The placement of the new columns in `new_columns`. + new_values : Union[ndarray, ExtensionArray] + The first return value from _Unstacker.get_new_values. + mask : ndarray[bool] + The second return value from _Unstacker.get_new_values. + """ + # shared with ExtensionBlock + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + return new_placement, new_values, mask + + +class ExtensionBlock(NonConsolidatableMixIn, Block): + """Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + ExtensionArrays are limited to 1-D. 
+ """ + + is_extension = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super().__init__(values, placement, ndim) + + def _maybe_coerce_values(self, values): + """ + Unbox to an extension array. + + This will unbox an ExtensionArray stored in an Index or Series. + ExtensionArrays pass through. No dtype coercion is done. + + Parameters + ---------- + values : Index, Series, ExtensionArray + + Returns + ------- + ExtensionArray + """ + return extract_array(values) + + @property + def _holder(self): + # For extension blocks, the holder is values-dependent. + return type(self.values) + + @property + def fill_value(self): + # Used in reindex_indexer + return self.values.dtype.na_value + + @property + def _can_hold_na(self): + # The default ExtensionArray._can_hold_na is True + return self._holder._can_hold_na + + @property + def is_view(self): + """Extension arrays are never treated as views.""" + return False + + @property + def is_numeric(self): + return self.values.dtype._is_numeric + + def setitem(self, indexer, value): + """Set the value inplace, returning a same-typed block. + + This differs from Block.setitem by not allowing setitem to change + the dtype of the Block. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice + The subset of self.values to set + value : object + The value being set + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. + """ + if isinstance(indexer, tuple): + # we are always 1-D + indexer = indexer[0] + + check_setitem_lengths(indexer, value, self.values) + self.values[indexer] = value + return self + + def get_values(self, dtype=None): + # ExtensionArrays must be iterable, so this works. + values = np.asarray(self.values) + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def array_values(self) -> ExtensionArray: + return self.values + + def to_dense(self): + return np.asarray(self.values) + + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + """override to use ExtensionArray astype for the conversion""" + values = self.values + if slicer is not None: + values = values[slicer] + mask = isna(values) + + values = np.asarray(values.astype(object)) + values[mask] = na_rep + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routing + # if its REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) + + # Called from three places in managers, all of which satisfy + # this assertion + assert not (self.ndim == 1 and new_mgr_locs is None) + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _can_hold_element(self, element: Any) -> bool: + # XXX: We may need to think about pushing this onto the array. + # We're doing the same as CategoricalBlock here. 
+ return True + + def _slice(self, slicer): + """ return a slice of my values """ + + # slice the category + # return same dims as we currently have + + if isinstance(slicer, tuple) and len(slicer) == 2: + if not com.is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim categorical") + slicer = slicer[1] + + return self.values[slicer] + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._holder._concat_same_type([blk.values for blk in to_concat]) + placement = placement or slice(0, len(values), 1) + return self.make_block_same_class(values, ndim=self.ndim, placement=placement) + + def fillna(self, value, limit=None, inplace=False, downcast=None): + values = self.values if inplace else self.values.copy() + values = values.fillna(value=value, limit=limit) + return [ + self.make_block_same_class( + values=values, placement=self.mgr_locs, ndim=self.ndim + ) + ] + + def interpolate( + self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs + ): + + values = self.values if inplace else self.values.copy() + return self.make_block_same_class( + values=values.fillna(value=fill_value, method=method, limit=limit), + placement=self.mgr_locs, + ) + + def diff(self, n: int, axis: int = 1) -> List["Block"]: + if axis == 1: + # we are by definition 1D. + axis = 0 + return super().diff(n, axis) + + def shift( + self, + periods: int, + axis: libinternals.BlockPlacement = 0, + fill_value: Any = None, + ) -> List["ExtensionBlock"]: + """ + Shift the block by `periods`. + + Dispatches to underlying ExtensionArray and re-boxes in an + ExtensionBlock. + """ + return [ + self.make_block_same_class( + self.values.shift(periods=periods, fill_value=fill_value), + placement=self.mgr_locs, + ndim=self.ndim, + ) + ] + + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: + if isinstance(other, ABCDataFrame): + # ExtensionArrays are 1-D, so if we get here then + # `other` should be a DataFrame with a single column. + assert other.shape[1] == 1 + other = other.iloc[:, 0] + + other = extract_array(other, extract_numpy=True) + + if isinstance(cond, ABCDataFrame): + assert cond.shape[1] == 1 + cond = cond.iloc[:, 0] + + cond = extract_array(cond, extract_numpy=True) + + if lib.is_scalar(other) and isna(other): + # The default `other` for Series / Frame is np.nan + # we want to replace that with the correct NA value + # for the type + other = self.dtype.na_value + + if is_sparse(self.values): + # TODO(SparseArray.__setitem__): remove this if condition + # We need to re-infer the type of the data after doing the + # where, for cases where the subtypes don't match + dtype = None + else: + dtype = self.dtype + + result = self.values.copy() + icond = ~cond + if lib.is_scalar(other): + set_other = other + else: + set_other = other[icond] + try: + result[icond] = set_other + except (NotImplementedError, TypeError): + # NotImplementedError for class not implementing `__setitem__` + # TypeError for SparseArray, which implements just to raise + # a TypeError + result = self._holder._from_sequence( + np.where(cond, self.values, other), dtype=dtype + ) + + return [self.make_block_same_class(result, placement=self.mgr_locs)] + + @property + def _ftype(self): + return getattr(self.values, "_pandas_ftype", Block._ftype) + + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + # ExtensionArray-safe unstack. 
+ # We override ObjectBlock._unstack, which unstacks directly on the + # values of the array. For EA-backed blocks, this would require + # converting to a 2-D ndarray of objects. + # Instead, we unstack an ndarray of integer positions, followed by + # a `take` on the actual values. + dummy_arr = np.arange(n_rows) + dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) + unstacker = dummy_unstacker(dummy_arr) + + new_placement, new_values, mask = self._get_unstack_items( + unstacker, new_columns + ) + + blocks = [ + self.make_block_same_class( + self.values.take(indices, allow_fill=True, fill_value=fill_value), + [place], + ) + for indices, place in zip(new_values.T, new_placement) + ] + return blocks, mask + + +class ObjectValuesExtensionBlock(ExtensionBlock): + """ + Block providing backwards-compatibility for `.values`. + + Used by PeriodArray and IntervalArray to ensure that + Series[T].values is an ndarray of objects. + """ + + def external_values(self, dtype=None): + return self.values.astype(object) + + +class NumericBlock(Block): + __slots__ = () + is_numeric = True + _can_hold_na = True + + +class FloatOrComplexBlock(NumericBlock): + __slots__ = () + + def equals(self, other) -> bool: + if self.dtype != other.dtype or self.shape != other.shape: + return False + left, right = self.values, other.values + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +class FloatBlock(FloatOrComplexBlock): + __slots__ = () + is_float = True + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( + tipo.type, (np.datetime64, np.timedelta64) + ) + return isinstance( + element, (float, int, np.floating, np.int_) + ) and not isinstance( + element, + (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64), + ) + + def to_native_types( + self, + slicer=None, + na_rep="", + float_format=None, + decimal=".", + quoting=None, + **kwargs, + ): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + + # see gh-13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == ".": + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype="object") + + values[mask] = na_rep + return values + + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + return formatter.get_result_as_array() + + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype + + +class ComplexBlock(FloatOrComplexBlock): + __slots__ = () + is_complex = True + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) + return isinstance( + element, (float, int, complex, np.float_, np.int_) + ) and not isinstance(element, (bool, np.bool_)) + + def should_store(self, value): + return issubclass(value.dtype.type, np.complexfloating) + + +class 
IntBlock(NumericBlock): + __slots__ = () + is_integer = True + _can_hold_na = False + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return ( + issubclass(tipo.type, np.integer) + and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) + and self.dtype.itemsize >= tipo.itemsize + ) + return is_integer(element) + + def should_store(self, value): + return is_integer_dtype(value) and value.dtype == self.dtype + + +class DatetimeLikeBlockMixin: + """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" + + @property + def _holder(self): + return DatetimeArray + + @property + def fill_value(self): + return np.datetime64("NaT", "ns") + + def get_values(self, dtype=None): + """ + return object dtype as boxed values, such as Timestamps/Timedelta + """ + if is_object_dtype(dtype): + values = self.values.ravel() + result = self._holder(values).astype(object) + return result.reshape(self.values.shape) + return self.values + + +class DatetimeBlock(DatetimeLikeBlockMixin, Block): + __slots__ = () + is_datetime = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super().__init__(values, placement=placement, ndim=ndim) + + @property + def _can_hold_na(self): + return True + + def _maybe_coerce_values(self, values): + """ + Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + + if isinstance(values, DatetimeArray): + values = values._data + + assert isinstance(values, np.ndarray), type(values) + return values + + def astype(self, dtype, copy: bool = False, errors: str = "raise"): + """ + these automatically copy, so copy=True has no effect + raise on an except if raise == True + """ + dtype = pandas_dtype(dtype) + + # if we are passed a datetime64[ns, tz] + if is_datetime64tz_dtype(dtype): + values = self.values + if getattr(values, "tz", None) is None: + values = DatetimeArray(values).tz_localize("UTC") + values = values.tz_convert(dtype.tz) + return self.make_block(values) + + # delegate + return super().astype(dtype=dtype, copy=copy, errors=errors) + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + if self.is_datetimetz: + # require exact match, since non-nano does not exist + return is_dtype_equal(tipo, self.dtype) or is_valid_nat_for_dtype( + element, self.dtype + ) + + # GH#27419 if we get a non-nano datetime64 object + return is_datetime64_dtype(tipo) + elif element is NaT: + return True + elif isinstance(element, datetime): + if self.is_datetimetz: + return tz_compare(element.tzinfo, self.dtype.tz) + return element.tzinfo is None + + return is_valid_nat_for_dtype(element, self.dtype) + + def to_native_types( + self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs + ): + """ convert to our native types format, slicing if desired """ + + values = self.values + i8values = self.values.view("i8") + + if slicer is not None: + values = values[..., slicer] + i8values = i8values[..., slicer] + + from pandas.io.formats.format import _get_format_datetime64_from_values + + fmt = _get_format_datetime64_from_values(values, date_format) + + result = 
tslib.format_array_from_datetime( + i8values.ravel(), + tz=getattr(self.values, "tz", None), + format=fmt, + na_rep=na_rep, + ).reshape(i8values.shape) + return np.atleast_2d(result) + + def should_store(self, value): + return ( + issubclass(value.dtype.type, np.datetime64) + and not is_datetime64tz_dtype(value) + and not is_extension_array_dtype(value) + ) + + def set(self, locs, values): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + values = conversion.ensure_datetime64ns(values, copy=False) + + self.values[locs] = values + + def external_values(self): + return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + + def array_values(self) -> ExtensionArray: + return DatetimeArray._simple_new(self.values) + + +class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): + """ implement a datetime64 block with a tz attribute """ + + __slots__ = () + is_datetimetz = True + is_extension = True + + _can_hold_element = DatetimeBlock._can_hold_element + to_native_types = DatetimeBlock.to_native_types + fill_value = np.datetime64("NaT", "ns") + + @property + def _holder(self): + return DatetimeArray + + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : DatetimeArray + """ + if not isinstance(values, self._holder): + values = self._holder(values) + + if values.tz is None: + raise ValueError("cannot create a DatetimeTZBlock without a tz") + + return values + + @property + def is_view(self): + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values._data.base is not None + + def get_values(self, dtype=None): + """ + Returns an ndarray of values. + + Parameters + ---------- + dtype : np.dtype + Only `object`-like dtypes are respected here (not sure + why). + + Returns + ------- + values : ndarray + When ``dtype=object``, then and object-dtype ndarray of + boxed values is returned. Otherwise, an M8[ns] ndarray + is returned. + + DatetimeArray is always 1-d. ``get_values`` will reshape + the return value to be the same dimensionality as the + block. + """ + values = self.values + if is_object_dtype(dtype): + values = values.astype(object) + + values = np.asarray(values) + + if self.ndim == 2: + # Ensure that our shape is correct for DataFrame. + # ExtensionArrays are always 1-D, even in a DataFrame when + # the analogous NumPy-backed column would be a 2-D ndarray. + values = values.reshape(1, -1) + return values + + def to_dense(self): + # we request M8[ns] dtype here, even though it discards tzinfo, + # as lots of code (e.g. anything using values_from_object) + # expects that behavior. + return np.asarray(self.values, dtype=_NS_DTYPE) + + def _slice(self, slicer): + """ return a slice of my values """ + if isinstance(slicer, tuple): + col, loc = slicer + if not com.is_null_slice(col) and col != 0: + raise IndexError(f"{self} only contains one item") + return self.values[loc] + return self.values[slicer] + + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. + + Parameters + ---------- + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. + + Returns + ------- + A list with a new TimeDeltaBlock. + + Notes + ----- + The arguments here are mimicking shift so they are called correctly + by apply. 
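DatetimeTZBlock.fillna (a few lines below) only keeps the tz-aware dtype when the fill value matches the block's timezone; otherwise it falls back to object via astype(object). A sketch against the public API with invented dates, reflecting the behaviour of the pandas version vendored here:

import pandas as pd

s = pd.Series(pd.date_range("2020-01-24", periods=3, tz="Asia/Shanghai"))
s[1] = pd.NaT

# same timezone: dtype stays datetime64[ns, Asia/Shanghai]
print(s.fillna(s[0]).dtype)

# naive timestamp: the block cannot hold it, so the result becomes object
print(s.fillna(pd.Timestamp("2020-01-25")).dtype)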
+ """ + if axis == 0: + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype("timedelta64[ns]") + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + + def concat_same_type(self, to_concat, placement=None): + # need to handle concat([tz1, tz2]) here, since DatetimeArray + # only handles cases where all the tzs are the same. + # Instead of placing the condition here, it could also go into the + # is_uniform_join_units check, but I'm not sure what is better. + if len({x.dtype for x in to_concat}) > 1: + values = concat_datetime([x.values for x in to_concat]) + placement = placement or slice(0, len(values), 1) + + if self.ndim > 1: + values = np.atleast_2d(values) + return ObjectBlock(values, ndim=self.ndim, placement=placement) + return super().concat_same_type(to_concat, placement) + + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + if self._can_hold_element(value): + return super().fillna(value, limit, inplace, downcast) + + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast + ) + + def setitem(self, indexer, value): + # https://github.com/pandas-dev/pandas/issues/24020 + # Need a dedicated setitem until #24020 (type promotion in setitem + # for extension arrays) is designed and implemented. + if self._can_hold_element(value) or ( + isinstance(indexer, np.ndarray) and indexer.size == 0 + ): + return super().setitem(indexer, value) + + obj_vals = self.values.astype(object) + newb = make_block( + obj_vals, placement=self.mgr_locs, klass=ObjectBlock, ndim=self.ndim + ) + return newb.setitem(indexer, value) + + def equals(self, other) -> bool: + # override for significant performance improvement + if self.dtype != other.dtype or self.shape != other.shape: + return False + return (self.values.view("i8") == other.values.view("i8")).all() + + def quantile(self, qs, interpolation="linear", axis=0): + naive = self.values.view("M8[ns]") + + # kludge for 2D block with 1D values + naive = naive.reshape(self.shape) + + blk = self.make_block(naive) + res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) + + # ravel is kludge for 2D block with 1D values, assumes column-like + aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) + return self.make_block_same_class(aware, ndim=res_blk.ndim) + + +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): + __slots__ = () + is_timedelta = True + _can_hold_na = True + is_numeric = False + fill_value = np.timedelta64("NaT", "ns") + + def __init__(self, values, placement, ndim=None): + if values.dtype != _TD_DTYPE: + values = conversion.ensure_timedelta64ns(values) + if isinstance(values, TimedeltaArray): + values = values._data + assert isinstance(values, np.ndarray), type(values) + super().__init__(values, placement=placement, ndim=ndim) + + @property + def _holder(self): + return TimedeltaArray + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, np.timedelta64) + elif element is NaT: + return True + elif 
isinstance(element, (timedelta, np.timedelta64)): + return True + return is_valid_nat_for_dtype(element, self.dtype) + + def fillna(self, value, **kwargs): + + # allow filling with integers to be + # interpreted as nanoseconds + if is_integer(value): + # Deprecation GH#24694, GH#19233 + raise TypeError( + "Passing integers to fillna for timedelta64[ns] dtype is no " + "longer supported. To obtain the old behavior, pass " + "`pd.Timedelta(seconds=n)` instead." + ) + return super().fillna(value, **kwargs) + + def should_store(self, value): + return issubclass( + value.dtype.type, np.timedelta64 + ) and not is_extension_array_dtype(value) + + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isna(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = "NaT" + rvalues[mask] = na_rep + imask = (~mask).ravel() + + # FIXME: + # should use the formats.format.Timedelta64Formatter here + # to figure what format to pass to the Timedelta + # e.g. to not show the decimals say + rvalues.flat[imask] = np.array( + [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]], + dtype=object, + ) + return rvalues + + def external_values(self, dtype=None): + return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) + + def array_values(self) -> ExtensionArray: + return TimedeltaArray._simple_new(self.values) + + +class BoolBlock(NumericBlock): + __slots__ = () + is_bool = True + _can_hold_na = False + + def _can_hold_element(self, element: Any) -> bool: + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, np.bool_) + return isinstance(element, (bool, np.bool_)) + + def should_store(self, value): + return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( + value + ) + + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): + inplace = validate_bool_kwarg(inplace, "inplace") + to_replace_values = np.atleast_1d(to_replace) + if not np.can_cast(to_replace_values, bool): + return self + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + + +class ObjectBlock(Block): + __slots__ = () + is_object = True + _can_hold_na = True + + def __init__(self, values, placement=None, ndim=2): + if issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + super().__init__(values, ndim=ndim, placement=placement) + + @property + def is_bool(self): + """ we can be a bool if we have only bool values but are of type + object + """ + return lib.is_bool_array(self.values.ravel()) + + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ): + """ attempt to coerce any object types to better types return a copy of + the block (if copy = True) by definition we ARE an ObjectBlock!!!!! + + can return multiple blocks! 
+ """ + + # operate column-by-column + def f(mask, val, idx): + shape = val.shape + values = soft_convert_objects( + val.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) + if isinstance(values, np.ndarray): + # TODO: allow EA once reshape is supported + values = values.reshape(shape) + + values = _block_shape(values, ndim=self.ndim) + return values + + if self.ndim == 2: + blocks = self.split_and_operate(None, f, False) + else: + values = f(None, self.values.ravel(), None) + blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)] + + return blocks + + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: + + if downcast is not None: + return blocks + + # split and convert the blocks + return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) + + def _can_hold_element(self, element: Any) -> bool: + return True + + def should_store(self, value): + return not ( + issubclass( + value.dtype.type, + (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), + ) + or is_extension_array_dtype(value) + ) + + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list + + result_blocks = [] + blocks = [self] + + if not either_list and is_re(to_replace): + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=True, + convert=convert, + ) + elif not (either_list or regex): + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + elif both_lists: + for to_rep, v in zip(to_replace, value): + result_blocks = [] + for b in blocks: + result = b._replace_single( + to_rep, + v, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + elif to_rep_is_list and regex: + for to_rep in to_replace: + result_blocks = [] + for b in blocks: + result = b._replace_single( + to_rep, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + convert=convert, + regex=regex, + ) + + def _replace_single( + self, + to_replace, + value, + inplace=False, + filter=None, + regex=False, + convert=True, + mask=None, + ): + """ + Replace elements by the given value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + filter : list, optional + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. 
+ + Returns + ------- + a new block, the result after replacing + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + # to_replace is regex compilable + to_rep_re = regex and is_re_compilable(to_replace) + + # regex is regex compilable + regex_re = is_re_compilable(regex) + + # only one will survive + if to_rep_re and regex_re: + raise AssertionError( + "only one of to_replace and regex can be regex compilable" + ) + + # if regex was passed as something that can be a regex (rather than a + # boolean) + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + if is_re(to_replace): + pattern = to_replace.pattern + else: + pattern = to_replace + + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + return super().replace( + to_replace, value, inplace=inplace, filter=filter, regex=regex + ) + + new_values = self.values if inplace else self.values.copy() + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, str): + + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return value if rx.search(s) is not None else s + else: + return s + + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return rx.sub(value, s) + else: + return s + + f = np.vectorize(re_replacer, otypes=[self.dtype]) + + if filter is None: + filt = slice(None) + else: + filt = self.mgr_locs.isin(filter).nonzero()[0] + + if mask is None: + new_values[filt] = f(new_values[filt]) + else: + new_values[filt][mask] = f(new_values[filt][mask]) + + # convert + block = self.make_block(new_values) + if convert: + block = block.convert(numeric=False) + return block + + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + A new block if there is anything to replace or the original block. 
+ """ + if mask.any(): + block = super()._replace_coerce( + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) + if convert: + block = [b.convert(numeric=False, copy=True) for b in block] + return block + if convert: + return [self.convert(numeric=False, copy=True)] + return self + + +class CategoricalBlock(ExtensionBlock): + __slots__ = () + is_categorical = True + _verify_integrity = True + _can_hold_na = True + _concatenator = staticmethod(concat_categorical) + + def __init__(self, values, placement, ndim=None): + # coerce to categorical if we can + values = extract_array(values) + assert isinstance(values, Categorical), type(values) + super().__init__(values, placement=placement, ndim=ndim) + + @property + def _holder(self): + return Categorical + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an + array + """ + return np.object_ + + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. + return self.values._internal_get_values() + + def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + # Categorical is always one dimension + values = values[slicer] + mask = isna(values) + values = np.array(values, dtype="object") + values[mask] = na_rep + + # we are expected to return a 2-d ndarray + return values.reshape(1, len(values)) + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. + + If / when we decide we don't like that behavior: + + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. + """ + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), ndim=self.ndim + ) + + def replace( + self, + to_replace, + value, + inplace: bool = False, + filter=None, + regex: bool = False, + convert: bool = True, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + result = self if inplace else self.copy() + if filter is None: # replace was called on a series + result.values.replace(to_replace, value, inplace=True) + if convert: + return result.convert(numeric=False, copy=not inplace) + else: + return result + else: # replace was called on a DataFrame + if not isna(value): + result.values.add_categories(value, inplace=True) + return super(CategoricalBlock, result).replace( + to_replace, value, inplace, filter, regex, convert + ) + + +# ----------------------------------------------------------------- +# Constructor Helpers + + +def get_block_type(values, dtype=None): + """ + Find the appropriate Block subclass to use for the given values and dtype. 
+ + Parameters + ---------- + values : ndarray-like + dtype : numpy or pandas dtype + + Returns + ------- + cls : class, subclass of Block + """ + dtype = dtype or values.dtype + vtype = dtype.type + + if is_sparse(dtype): + # Need this first(ish) so that Sparse[datetime] is sparse + cls = ExtensionBlock + elif is_categorical(values): + cls = CategoricalBlock + elif issubclass(vtype, np.datetime64): + assert not is_datetime64tz_dtype(values) + cls = DatetimeBlock + elif is_datetime64tz_dtype(values): + cls = DatetimeTZBlock + elif is_interval_dtype(dtype) or is_period_dtype(dtype): + cls = ObjectValuesExtensionBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock + elif issubclass(vtype, np.floating): + cls = FloatBlock + elif issubclass(vtype, np.timedelta64): + assert issubclass(vtype, np.integer) + cls = TimeDeltaBlock + elif issubclass(vtype, np.complexfloating): + cls = ComplexBlock + elif issubclass(vtype, np.integer): + cls = IntBlock + elif dtype == np.bool_: + cls = BoolBlock + else: + cls = ObjectBlock + return cls + + +def make_block(values, placement, klass=None, ndim=None, dtype=None): + # Ensure that we don't allow PandasArray / PandasDtype in internals. + # For now, blocks should be backed by ndarrays when possible. + if isinstance(values, ABCPandasArray): + values = values.to_numpy() + if ndim and ndim > 1: + values = np.atleast_2d(values) + + if isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + + if klass is None: + dtype = dtype or values.dtype + klass = get_block_type(values, dtype) + + elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values): + # TODO: This is no longer hit internally; does it need to be retained + # for e.g. pyarrow? + values = DatetimeArray._simple_new(values, dtype=dtype) + + return klass(values, ndim=ndim, placement=placement) + + +# ----------------------------------------------------------------- + + +def _extend_blocks(result, blocks=None): + """ return a new extended blocks, given the result """ + from pandas.core.internals import BlockManager + + if blocks is None: + blocks = [] + if isinstance(result, list): + for r in result: + if isinstance(r, list): + blocks.extend(r) + else: + blocks.append(r) + elif isinstance(result, BlockManager): + blocks.extend(result.blocks) + else: + blocks.append(result) + return blocks + + +def _block_shape(values, ndim=1, shape=None): + """ guarantee the shape of the values to be at least 1 d """ + if values.ndim < ndim: + if shape is None: + shape = values.shape + if not is_extension_array_dtype(values): + # TODO: https://github.com/pandas-dev/pandas/issues/23023 + # block.shape is incorrect for "2D" ExtensionArrays + # We can't, and don't need to, reshape. + values = values.reshape(tuple((1,) + shape)) + return values + + +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): + + if len(blocks) == 1: + return blocks[0] + + if _can_consolidate: + + if dtype is None: + if len({b.dtype for b in blocks}) != 1: + raise AssertionError("_merge_blocks are invalid!") + + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. 
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + new_values = np.vstack([b.values for b in blocks]) + + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] + + return make_block(new_values, placement=new_mgr_locs) + + # no merge + return blocks + + +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a ExtensionArray or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, ABCExtensionArray): + arr = arr.reshape(new_shape) + return arr + + +def _putmask_smart(v, mask, n): + """ + Return a new ndarray, try to preserve dtype if possible. + + Parameters + ---------- + v : `values`, updated in-place (array like) + mask : np.ndarray + Applies to both sides (array like). + n : `new values` either scalar or an array like aligned with `values` + + Returns + ------- + values : ndarray with updated values + this *may* be a copy of the original + + See Also + -------- + ndarray.putmask + """ + + # we cannot use np.asarray() here as we cannot have conversions + # that numpy does when numeric are mixed with strings + + # n should be the length of the mask or a scalar here + if not is_list_like(n): + n = np.repeat(n, len(mask)) + + # see if we are only masking values that if putted + # will work in the current dtype + try: + nn = n[mask] + except TypeError: + # TypeError: only integer scalar arrays can be converted to a scalar index + pass + else: + # make sure that we have a nullable type + # if we have nulls + if not _isna_compat(v, nn[0]): + pass + elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): + # only compare integers/floats + pass + elif not (is_float_dtype(v.dtype) or is_integer_dtype(v.dtype)): + # only compare integers/floats + pass + else: + + # we ignore ComplexWarning here + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", np.ComplexWarning) + nn_at = nn.astype(v.dtype) + + comp = nn == nn_at + if is_list_like(comp) and comp.all(): + nv = v.copy() + nv[mask] = nn_at + return nv + + n = np.asarray(n) + + def _putmask_preserve(nv, n): + try: + nv[mask] = n[mask] + except (IndexError, ValueError): + nv[mask] = n + return nv + + # preserves dtype if possible + if v.dtype.kind == n.dtype.kind: + return _putmask_preserve(v, n) + + # change the dtype if needed + dtype, _ = maybe_promote(n.dtype) + + if is_extension_array_dtype(v.dtype) and is_object_dtype(dtype): + v = v._internal_get_values(dtype) + else: + v = v.astype(dtype) + + return _putmask_preserve(v, n) diff --git a/venv/Lib/site-packages/pandas/core/internals/concat.py b/venv/Lib/site-packages/pandas/core/internals/concat.py new file mode 100644 index 0000000..c75373b --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/internals/concat.py @@ -0,0 +1,493 @@ +# TODO: Needs a better name; too many modules are already called "concat" +from collections import defaultdict +import copy + +import numpy as np + +from pandas._libs import internals as libinternals, tslibs +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + _get_dtype, + 
is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.missing import isna + +import pandas.core.algorithms as algos + + +def get_mgr_concatenation_plan(mgr, indexers): + """ + Construct concatenation plan for given block manager and indexers. + + Parameters + ---------- + mgr : BlockManager + indexers : dict of {axis: indexer} + + Returns + ------- + plan : list of (BlockPlacement, JoinUnit) tuples + + """ + # Calculate post-reindex shape , save for item axis which will be separate + # for each block anyway. + mgr_shape = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) + + if 0 in indexers: + ax0_indexer = indexers.pop(0) + blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) + blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + else: + + if mgr._is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + + ax0_indexer = None + blknos = mgr._blknos + blklocs = mgr._blklocs + + plan = [] + for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): + + assert placements.is_slice_like + + join_unit_indexers = indexers.copy() + + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) + + if blkno == -1: + unit = JoinUnit(None, shape) + else: + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) + and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ( + ( + ax0_indexer is None + and blk.mgr_locs.is_slice_like + and blk.mgr_locs.as_slice.step == 1 + ) + or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all() + ) + ) + + # Omit indexer if no item reindexing is required. + if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer + + unit = JoinUnit(blk, shape, join_unit_indexers) + + plan.append((placements, unit)) + + return plan + + +class JoinUnit: + def __init__(self, block, shape, indexers=None): + # Passing shape explicitly is required for cases when block is None. + if indexers is None: + indexers = {} + self.block = block + self.indexers = indexers + self.shape = shape + + def __repr__(self) -> str: + return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" + + @cache_readonly + def needs_filling(self): + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. + if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + if self.block is None: + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return self.block.dtype + else: + return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) + + @cache_readonly + def is_na(self): + if self.block is None: + return True + + if not self.block._can_hold_na: + return False + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 1000 value + # was chosen rather arbitrarily. 
+ values = self.block.values + if self.block.is_categorical: + values_flat = values.categories + elif is_sparse(self.block.values.dtype): + return False + elif self.block.is_extension: + values_flat = values + else: + values_flat = values.ravel(order="K") + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isna(values_flat[i : i + chunk_len]).all(): + return False + + return True + + def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is None: + # No upcasting is necessary + fill_value = self.block.fill_value + values = self.block.get_values() + else: + fill_value = upcasted_na + + if self.is_na: + if getattr(self.block, "is_object", False): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = self.block.values.ravel(order="K") + if len(values) and values[0] is None: + fill_value = None + + if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype( + empty_dtype + ): + if self.block is None: + array = empty_dtype.construct_array_type() + return array( + np.full(self.shape[1], fill_value.value), dtype=empty_dtype + ) + elif getattr(self.block, "is_categorical", False): + pass + elif getattr(self.block, "is_extension", False): + pass + else: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + missing_arr.fill(fill_value) + return missing_arr + + if not self.indexers: + if not self.block._can_consolidate: + # preserve these for validation in concat_compat + return self.block.values + + if self.block.is_bool and not self.block.is_categorical: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.object_).values + elif self.block.is_extension: + values = self.block.values + else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.get_values() + + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. + values = values.view() + + else: + for ax, indexer in self.indexers.items(): + values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value) + + return values + + +def concatenate_join_units(join_units, concat_axis, copy): + """ + Concatenate values from several join units along selected axis. + """ + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. + raise AssertionError("Concatenating join units along axis0") + + empty_dtype, upcasted_na = _get_empty_dtype_and_na(join_units) + + to_concat = [ + ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) + for ju in join_units + ] + + if len(to_concat) == 1: + # Only one block, nothing to concatenate. + concat_values = to_concat[0] + if copy: + if isinstance(concat_values, np.ndarray): + # non-reindexed (=not yet copied) arrays are made into a view + # in JoinUnit.get_reindexed_values + if concat_values.base is not None: + concat_values = concat_values.copy() + else: + concat_values = concat_values.copy() + else: + concat_values = concat_compat(to_concat, axis=concat_axis) + + return concat_values + + +def _get_empty_dtype_and_na(join_units): + """ + Return dtype and N/A values to use when concatenating specified units. 
+ + Returned N/A value may be None which means there was no casting involved. + + Returns + ------- + dtype + na + """ + if len(join_units) == 1: + blk = join_units[0].block + if blk is None: + return np.float64, np.nan + + if _is_uniform_reindex(join_units): + # FIXME: integrate property + empty_dtype = join_units[0].block.dtype + upcasted_na = join_units[0].block.fill_value + return empty_dtype, upcasted_na + + has_none_blocks = False + dtypes = [None] * len(join_units) + for i, unit in enumerate(join_units): + if unit.block is None: + has_none_blocks = True + else: + dtypes[i] = unit.dtype + + upcast_classes = defaultdict(list) + null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + if is_categorical_dtype(dtype): + upcast_cls = "category" + elif is_datetime64tz_dtype(dtype): + upcast_cls = "datetimetz" + elif issubclass(dtype.type, np.bool_): + upcast_cls = "bool" + elif issubclass(dtype.type, np.object_): + upcast_cls = "object" + elif is_datetime64_dtype(dtype): + upcast_cls = "datetime" + elif is_timedelta64_dtype(dtype): + upcast_cls = "timedelta" + elif is_sparse(dtype): + upcast_cls = dtype.subtype.name + elif is_extension_array_dtype(dtype): + upcast_cls = "object" + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name + else: + upcast_cls = "float" + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + # TODO: de-duplicate with maybe_promote? + # create the result + if "object" in upcast_classes: + return np.dtype(np.object_), np.nan + elif "bool" in upcast_classes: + if has_none_blocks: + return np.dtype(np.object_), np.nan + else: + return np.dtype(np.bool_), None + elif "category" in upcast_classes: + return np.dtype(np.object_), np.nan + elif "datetimetz" in upcast_classes: + # GH-25014. We use NaT instead of iNaT, since this eventually + # ends up in DatetimeArray.take, which does not allow iNaT. + dtype = upcast_classes["datetimetz"] + return dtype[0], tslibs.NaT + elif "datetime" in upcast_classes: + return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") + elif "timedelta" in upcast_classes: + return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") + else: # pragma + try: + g = np.find_common_type(upcast_classes, []) + except TypeError: + # At least one is an ExtensionArray + return np.dtype(np.object_), np.nan + else: + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): + if has_none_blocks: + return np.float64, np.nan + else: + return g, None + + msg = "invalid dtype determination in get_concat_dtype" + raise AssertionError(msg) + + +def is_uniform_join_units(join_units): + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + concatenate_join_units (which uses `concat_compat`). + + """ + return ( + # all blocks need to have the same type + all(type(ju.block) is type(join_units[0].block) for ju in join_units) + and # noqa + # no blocks that would get missing values (can lead to type upcasts) + # unless we're an extension dtype. 
+ all(not ju.is_na or ju.block.is_extension for ju in join_units) + and + # no blocks with indexers (as then the dimensions do not fit) + all(not ju.indexers for ju in join_units) + and + # only use this path when there is something to concatenate + len(join_units) > 1 + ) + + +def _is_uniform_reindex(join_units) -> bool: + return ( + # TODO: should this be ju.block._can_hold_na? + all(ju.block and ju.block.is_extension for ju in join_units) + and len({ju.block.dtype.name for ju in join_units}) == 1 + ) + + +def _trim_join_unit(join_unit, length): + """ + Reduce join_unit's shape along item axis to length. + + Extra items that didn't fit are returned as a separate block. + """ + + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers + + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] + + extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] + join_unit.shape = (length,) + join_unit.shape[1:] + + return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) + + +def combine_concat_plans(plans, concat_axis): + """ + Combine multiple concatenation plans into one. + + existing_plan is updated in-place. + """ + if len(plans) == 1: + for p in plans[0]: + yield p[0], [p[1]] + + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None + + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc + + if last_plc is not None: + offset += last_plc.as_slice.stop + + else: + num_ended = [0] + + def _next_or_none(seq): + retval = next(seq, None) + if retval is None: + num_ended[0] += 1 + return retval + + plans = list(map(iter, plans)) + next_items = list(map(_next_or_none, plans)) + + while num_ended[0] != len(next_items): + if num_ended[0] > 0: + raise ValueError("Plan shapes are not aligned") + + placements, units = zip(*next_items) + + lengths = list(map(len, placements)) + min_len, max_len = min(lengths), max(lengths) + + if min_len == max_len: + yield placements[0], units + next_items[:] = map(_next_or_none, plans) + else: + yielded_placement = None + yielded_units = [None] * len(next_items) + for i, (plc, unit) in enumerate(next_items): + yielded_units[i] = unit + if len(plc) > min_len: + # _trim_join_unit updates unit in place, so only + # placement needs to be sliced to skip min_len. + next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len)) + else: + yielded_placement = plc + next_items[i] = _next_or_none(plans[i]) + + yield yielded_placement, yielded_units diff --git a/venv/Lib/site-packages/pandas/core/internals/construction.py b/venv/Lib/site-packages/pandas/core/internals/construction.py new file mode 100644 index 0000000..3a92cfd --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/internals/construction.py @@ -0,0 +1,626 @@ +""" +Functions for preparing various inputs passed to the DataFrame or Series +constructors before passing them to a BlockManager. 
+""" +from collections import abc + +import numpy as np +import numpy.ma as ma + +from pandas._libs import lib + +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + maybe_cast_to_datetime, + maybe_convert_platform, + maybe_infer_to_datetimelike, + maybe_upcast, +) +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) + +from pandas.core import algorithms, common as com +from pandas.core.arrays import Categorical +from pandas.core.construction import sanitize_array +from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import ( + Index, + ensure_index, + get_objs_combined_axis, + union_indexes, +) +from pandas.core.internals import ( + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) + +# --------------------------------------------------------------------- +# BlockManager Interface + + +def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + + Needs to handle a lot of exceptional cases. + """ + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + # from BlockManager perspective + axes = [ensure_index(columns), index] + + return create_block_manager_from_arrays(arrays, arr_names, axes) + + +def masked_rec_array_to_mgr(data, index, columns, dtype, copy): + """ + Extract from a masked rec array and create the manager. + """ + + # essentially process a record array then fill it + fill_value = data.fill_value + fdata = ma.getdata(data) + if index is None: + index = get_names_from_index(fdata) + if index is None: + index = ibase.default_index(len(data)) + index = ensure_index(index) + + if columns is not None: + columns = ensure_index(columns) + arrays, arr_columns = to_arrays(fdata, columns) + + # fill if needed + new_arrays = [] + for fv, arr, col in zip(fill_value, arrays, arr_columns): + # TODO: numpy docs suggest fv must be scalar, but could it be + # non-scalar for object dtype? 
+ assert lib.is_scalar(fv), fv + mask = ma.getmaskarray(data[col]) + if mask.any(): + arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + + # create the manager + arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns) + if columns is None: + columns = arr_columns + + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + + if copy: + mgr = mgr.copy() + return mgr + + +# --------------------------------------------------------------------- +# DataFrame Constructor Interface + + +def init_ndarray(values, index, columns, dtype=None, copy=False): + # input must be a ndarray, list, Series, index + + if isinstance(values, ABCSeries): + if columns is None: + if values.name is not None: + columns = [values.name] + if index is None: + index = values.index + else: + values = values.reindex(index) + + # zero len case (GH #2234) + if not len(values) and columns is not None and len(columns): + values = np.empty((0, 1), dtype=object) + + # we could have a categorical type passed or coerced to 'category' + # recast this to an arrays_to_mgr + if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype( + dtype + ): + + if not hasattr(values, "dtype"): + values = prep_ndarray(values, copy=copy) + values = values.ravel() + elif copy: + values = values.copy() + + index, columns = _get_axes(len(values), 1, index, columns) + return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): + # GH#19157 + + if isinstance(values, np.ndarray) and values.ndim > 1: + # GH#12513 a EA dtype passed with a 2D array, split into + # multiple EAs that view the values + values = [values[:, n] for n in range(values.shape[1])] + else: + values = [values] + + if columns is None: + columns = list(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + + # by definition an array here + # the dtypes will be coerced to a single dtype + values = prep_ndarray(values, copy=copy) + + if dtype is not None: + if not is_dtype_equal(values.dtype, dtype): + try: + values = values.astype(dtype) + except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 + raise ValueError( + f"failed to cast to '{dtype}' (Exception was: {orig})" + ) from orig + + index, columns = _get_axes(*values.shape, index=index, columns=columns) + values = values.T + + # if we don't have a dtype specified, then try to convert objects + # on the entire block; this is to convert if we have datetimelike's + # embedded in an object type + if dtype is None and is_object_dtype(values): + + if values.ndim == 2 and values.shape[0] != 1: + # transpose and separate blocks + + dvals_list = [maybe_infer_to_datetimelike(row) for row in values] + for n in range(len(dvals_list)): + if isinstance(dvals_list[n], np.ndarray): + dvals_list[n] = dvals_list[n].reshape(1, -1) + + from pandas.core.internals.blocks import make_block + + # TODO: What about re-joining object columns? + block_values = [ + make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) + ] + + else: + datelike_vals = maybe_infer_to_datetimelike(values) + block_values = [datelike_vals] + else: + block_values = [values] + + return create_block_manager_from_blocks(block_values, [columns, index]) + + +def init_dict(data, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. 
+ """ + if columns is not None: + from pandas.core.series import Series + + arrays = Series(data, index=columns, dtype=object) + data_names = arrays.index + + missing = arrays.isna() + if index is None: + # GH10856 + # raise ValueError if only scalars in dict + index = extract_index(arrays[~missing]) + else: + index = ensure_index(index) + + # no obvious "empty" int column + if missing.any() and not is_integer_dtype(dtype): + if dtype is None or np.issubdtype(dtype, np.flexible): + # GH#1783 + nan_dtype = object + else: + nan_dtype = dtype + val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) + arrays.loc[missing] = [val] * missing.sum() + + else: + keys = list(data.keys()) + columns = data_names = Index(keys) + arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) + # GH#24096 need copy to be deep for datetime64tz case + # TODO: See if we can avoid these copies + arrays = [ + arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays + ] + arrays = [ + arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays + ] + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + + +# --------------------------------------------------------------------- + + +def prep_ndarray(values, copy=True) -> np.ndarray: + if not isinstance(values, (np.ndarray, ABCSeries, Index)): + if len(values) == 0: + return np.empty((0, 0), dtype=object) + elif isinstance(values, range): + arr = np.arange(values.start, values.stop, values.step, dtype="int64") + return arr[..., np.newaxis] + + def convert(v): + return maybe_convert_platform(v) + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + try: + if is_list_like(values[0]) or hasattr(values[0], "len"): + values = np.array([convert(v) for v in values]) + elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: + # GH#21861 + values = np.array([convert(v) for v in values]) + else: + values = convert(values) + except (ValueError, TypeError): + values = convert(values) + + else: + + # drop subclass info, do not copy data + values = np.asarray(values) + if copy: + values = values.copy() + + if values.ndim == 1: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise ValueError("Must pass 2-d input") + + return values + + +def _homogenize(data, index, dtype=None): + oindex = None + homogenized = [] + + for val in data: + if isinstance(val, ABCSeries): + if dtype is not None: + val = val.astype(dtype) + if val.index is not index: + # Forces alignment. 
No need to copy data since we + # are putting it into an ndarray later + val = val.reindex(index, copy=False) + else: + if isinstance(val, dict): + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): + val = com.dict_compat(val) + else: + val = dict(val) + val = lib.fast_multiget(val, oindex.values, default=np.nan) + val = sanitize_array( + val, index, dtype=dtype, copy=False, raise_cast_failure=False + ) + + homogenized.append(val) + + return homogenized + + +def extract_index(data): + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for val in data: + if isinstance(val, ABCSeries): + have_series = True + indexes.append(val.index) + elif isinstance(val, dict): + have_dicts = True + indexes.append(list(val.keys())) + elif is_list_like(val) and getattr(val, "ndim", 1) == 1: + have_raw_arrays = True + raw_lengths.append(len(val)) + + if not indexes and not raw_lengths: + raise ValueError("If using all scalar values, you must pass an index") + + if have_series: + index = union_indexes(indexes) + elif have_dicts: + index = union_indexes(indexes, sort=False) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError("arrays must all be same length") + + if have_dicts: + raise ValueError( + "Mixing dicts with non-Series may lead to ambiguous ordering." + ) + + if have_series: + if lengths[0] != len(index): + msg = ( + f"array length {lengths[0]} does not match index " + f"length {len(index)}" + ) + raise ValueError(msg) + else: + index = ibase.default_index(lengths[0]) + + return ensure_index(index) + + +def reorder_arrays(arrays, arr_columns, columns): + # reorder according to the columns + if ( + columns is not None + and len(columns) + and arr_columns is not None + and len(arr_columns) + ): + indexer = ensure_index(arr_columns).get_indexer(columns) + arr_columns = ensure_index([arr_columns[i] for i in indexer]) + arrays = [arrays[i] for i in indexer] + return arrays, arr_columns + + +def get_names_from_index(data): + has_some_name = any(getattr(s, "name", None) is not None for s in data) + if not has_some_name: + return ibase.default_index(len(data)) + + index = list(range(len(data))) + count = 0 + for i, s in enumerate(data): + n = getattr(s, "name", None) + if n is not None: + index[i] = n + else: + index[i] = f"Unnamed {count}" + count += 1 + + return index + + +def _get_axes(N, K, index, columns): + # helper to create the axes as indexes + # return axes or defaults + + if index is None: + index = ibase.default_index(N) + else: + index = ensure_index(index) + + if columns is None: + columns = ibase.default_index(K) + else: + columns = ensure_index(columns) + return index, columns + + +# --------------------------------------------------------------------- +# Conversion of Inputs to Arrays + + +def to_arrays(data, columns, coerce_float=False, dtype=None): + """ + Return list of arrays, columns. 
+ """ + if isinstance(data, ABCDataFrame): + if columns is not None: + arrays = [ + data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) + if col in columns + ] + else: + columns = data.columns + arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] + + return arrays, columns + + if not len(data): + if isinstance(data, np.ndarray): + columns = data.dtype.names + if columns is not None: + return [[]] * len(columns), columns + return [], [] # columns if columns is not None else [] + if isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + elif isinstance(data[0], abc.Mapping): + return _list_of_dict_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) + elif isinstance(data[0], ABCSeries): + return _list_of_series_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) + elif isinstance(data[0], Categorical): + if columns is None: + columns = ibase.default_index(len(data)) + return data, columns + elif ( + isinstance(data, (np.ndarray, ABCSeries, Index)) + and data.dtype.names is not None + ): + + columns = list(data.dtype.names) + arrays = [data[k] for k in columns] + return arrays, columns + else: + # last ditch effort + data = [tuple(x) for x in data] + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + + +def _list_to_arrays(data, columns, coerce_float=False, dtype=None): + if len(data) > 0 and isinstance(data[0], tuple): + content = list(lib.to_object_array_tuples(data).T) + else: + # list of lists + content = list(lib.to_object_array(data).T) + # gh-26429 do not raise user-facing AssertionError + try: + result = _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) + except AssertionError as e: + raise ValueError(e) from e + return result + + +def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + # We know pass_data is non-empty because data[0] is a Series + pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] + columns = get_objs_combined_axis(pass_data, sort=False) + + indexer_cache = {} + + aligned_values = [] + for s in data: + index = getattr(s, "index", None) + if index is None: + index = ibase.default_index(len(s)) + + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + + values = com.values_from_object(s) + aligned_values.append(algorithms.take_1d(values, indexer)) + + values = np.vstack(aligned_values) + + if values.dtype == np.object_: + content = list(values.T) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) + else: + return values.T, columns + + +def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): + """Convert list of dicts to numpy arrays + + if `columns` is not passed, column names are inferred from the records + - for OrderedDict and dicts, the column names match + the key insertion-order from the first record to the last. + - For other kinds of dict-likes, the keys are lexically sorted. 
+ + Parameters + ---------- + data : iterable + collection of records (OrderedDict, dict) + columns: iterables or None + coerce_float : bool + dtype : np.dtype + + Returns + ------- + tuple + arrays, columns + """ + + if columns is None: + gen = (list(x.keys()) for x in data) + sort = not any(isinstance(d, dict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) + + # assure that they are of the base dict class and not of derived + # classes + data = [(type(d) is dict) and d or dict(d) for d in data] + + content = list(lib.dicts_to_array(data, list(columns)).T) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) + + +def _convert_object_array(content, columns, coerce_float=False, dtype=None): + if columns is None: + columns = ibase.default_index(len(content)) + else: + if len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... + raise AssertionError( + f"{len(columns)} columns passed, passed data had " + f"{len(content)} columns" + ) + + # provide soft conversion of object dtypes + def convert(arr): + if dtype != object and dtype != np.object: + arr = lib.maybe_convert_objects(arr, try_float=coerce_float) + arr = maybe_cast_to_datetime(arr, dtype) + return arr + + arrays = [convert(arr) for arr in content] + + return arrays, columns + + +# --------------------------------------------------------------------- +# Series-Based + + +def sanitize_index(data, index, copy=False): + """ + Sanitize an index type to return an ndarray of the underlying, pass + through a non-Index. + """ + + if index is None: + return data + + if len(data) != len(index): + raise ValueError("Length of values does not match length of index") + + if isinstance(data, ABCIndexClass) and not copy: + pass + elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)): + data = data._values + if copy: + data = data.copy() + + elif isinstance(data, np.ndarray): + + # coerce datetimelike types + if data.dtype.kind in ["M", "m"]: + data = sanitize_array(data, index, copy=copy) + + return data diff --git a/venv/Lib/site-packages/pandas/core/internals/managers.py b/venv/Lib/site-packages/pandas/core/internals/managers.py new file mode 100644 index 0000000..67afd06 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/internals/managers.py @@ -0,0 +1,2027 @@ +from collections import defaultdict +from functools import partial +import itertools +import operator +import re +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np + +from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + maybe_convert_objects, + maybe_promote, +) +from pandas.core.dtypes.common import ( + _NS_DTYPE, + is_datetimelike_v_numeric, + is_extension_array_dtype, + is_list_like, + is_numeric_v_string_like, + is_scalar, + is_sparse, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.missing import isna + +import pandas.core.algorithms as algos +from pandas.core.base import PandasObject +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.internals.blocks import ( + Block, + CategoricalBlock, + DatetimeTZBlock, + 
ExtensionBlock, + ObjectValuesExtensionBlock, + _extend_blocks, + _merge_blocks, + _safe_reshape, + get_block_type, + make_block, +) +from pandas.core.internals.concat import ( # all for concatenate_block_managers + combine_concat_plans, + concatenate_join_units, + get_mgr_concatenation_plan, + is_uniform_join_units, +) + +from pandas.io.formats.printing import pprint_thing + +# TODO: flexible with index=None and/or items=None + + +class BlockManager(PandasObject): + """ + Core internal data structure to implement DataFrame, Series, etc. + + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_dtypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + + Parameters + ---------- + + + Notes + ----- + This is *not* a public API class + """ + + __slots__ = [ + "axes", + "blocks", + "_ndim", + "_shape", + "_known_consolidated", + "_is_consolidated", + "_blknos", + "_blklocs", + ] + + def __init__( + self, + blocks: Sequence[Block], + axes: Sequence[Index], + do_integrity_check: bool = True, + ): + self.axes = [ensure_index(ax) for ax in axes] + self.blocks: Tuple[Block, ...] = tuple(blocks) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + + if do_integrity_check: + self._verify_integrity() + + self._consolidate_check() + + self._rebuild_blknos_and_blklocs() + + def make_empty(self, axes=None): + """ return an empty BlockManager with the items axis of len 0 """ + if axes is None: + axes = [ensure_index([])] + [ensure_index(a) for a in self.axes[1:]] + + # preserve dtype if possible + if self.ndim == 1: + blocks = np.array([], dtype=self.array_dtype) + else: + blocks = [] + return type(self)(blocks, axes) + + def __nonzero__(self): + return True + + # Python3 compat + __bool__ = __nonzero__ + + @property + def shape(self): + return tuple(len(ax) for ax in self.axes) + + @property + def ndim(self) -> int: + return len(self.axes) + + def set_axis(self, axis, new_labels): + new_labels = ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + + self.axes[axis] = new_labels + + def rename_axis(self, mapper, axis, copy=True, level=None): + """ + Rename one of axes. + + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True + level : int, default None + """ + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) + return obj + + @property + def _is_single_block(self): + if self.ndim == 1: + return True + + if len(self.blocks) != 1: + return False + + blk = self.blocks[0] + return blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice == slice( + 0, len(self), 1 + ) + + def _rebuild_blknos_and_blklocs(self): + """ + Update mgr._blknos / mgr._blklocs. 
+ """ + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) + + if (new_blknos == -1).any(): + raise AssertionError("Gaps in blk ref_locs") + + self._blknos = new_blknos + self._blklocs = new_blklocs + + @property + def items(self): + return self.axes[0] + + def _get_counts(self, f): + """ return a dict of the counts of the function in BlockManager """ + self._consolidate_inplace() + counts = dict() + for b in self.blocks: + v = f(b) + counts[v] = counts.get(v, 0) + b.shape[0] + return counts + + def get_dtype_counts(self): + return self._get_counts(lambda b: b.dtype.name) + + def get_dtypes(self): + dtypes = np.array([blk.dtype for blk in self.blocks]) + return algos.take_1d(dtypes, self._blknos, allow_fill=False) + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = list(self.axes) + + extra_state = { + "0.14.1": { + "axes": axes_array, + "blocks": [ + dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + for b in self.blocks + ], + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state): + def unpickle_block(values, mgr_locs): + return make_block(values, placement=mgr_locs) + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] + ) + else: + # discard anything after 3rd, support beta pickling format for a + # little while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [ensure_index(ax) for ax in ax_arrays] + + if len(bitems) == 1 and self.axes[0].equals(bitems[0]): + # This is a workaround for pre-0.14.1 pickles that didn't + # support unpickling multi-block frames/panels with non-unique + # columns/items, because given a manager with items ["a", "b", + # "a"] there's no way of knowing which block's "a" is where. + # + # Single-block case can be supported under the assumption that + # block items corresponded to manager items 1-to-1. 
+ all_mgr_locs = [slice(0, len(bitems[0]))] + else: + all_mgr_locs = [ + self.axes[0].get_indexer(blk_items) for blk_items in bitems + ] + + self.blocks = tuple( + unpickle_block(values, mgr_locs) + for values, mgr_locs in zip(bvalues, all_mgr_locs) + ) + + self._post_setstate() + + def _post_setstate(self): + self._is_consolidated = False + self._known_consolidated = False + self._rebuild_blknos_and_blklocs() + + def __len__(self) -> int: + return len(self.items) + + def __repr__(self) -> str: + output = type(self).__name__ + for i, ax in enumerate(self.axes): + if i == 0: + output += f"\nItems: {ax}" + else: + output += f"\nAxis {i}: {ax}" + + for block in self.blocks: + output += f"\n{pprint_thing(block)}" + return output + + def _verify_integrity(self): + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError( + "Number of manager items must equal union of " + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" + ) + + def reduce(self, func, *args, **kwargs): + # If 2D, we assume that we're operating column-wise + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values, *args, **kwargs) + + res = {} + for blk in self.blocks: + bres = func(blk.values, *args, **kwargs) + + if np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1 + new_res = zip(blk.mgr_locs.as_array, [bres]) + else: + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) + assert not any(key in res for key in nr) + res.update(nr) + + return res + + def apply(self, f, filter=None, **kwargs): + """ + Iterate over the blocks, collect and create a new BlockManager. + + Parameters + ---------- + f : str or callable + Name of the Block method to apply. + filter : list, if supplied, only call the block if the filter is in + the block + + Returns + ------- + BlockManager + """ + + result_blocks = [] + + # filter kwarg is used in replace-* family of methods + if filter is not None: + filter_locs = set(self.items.get_indexer_for(filter)) + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs["filter"] = filter_locs + + self._consolidate_inplace() + + if f == "where": + align_copy = True + if kwargs.get("align", True): + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + elif f == "putmask": + align_copy = False + if kwargs.get("align", True): + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + elif f == "fillna": + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ["value"] + else: + align_keys = [] + + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. 
+ aligned_args = { + k: kwargs[k] + for k in align_keys + if not isinstance(kwargs[k], ABCExtensionArray) + and hasattr(kwargs[k], "values") + } + + for b in self.blocks: + if filter is not None: + if not b.mgr_locs.isin(filter_locs).any(): + result_blocks.append(b) + continue + + if aligned_args: + b_items = self.items[b.mgr_locs.indexer] + + for k, obj in aligned_args.items(): + axis = obj._info_axis_number + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + result_blocks = _extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + return self.make_empty(self.axes) + bm = type(self)(result_blocks, self.axes, do_integrity_check=False) + return bm + + def quantile( + self, + axis=0, + consolidate=True, + transposed=False, + interpolation="linear", + qs=None, + numeric_only=None, + ): + """ + Iterate over blocks applying quantile reduction. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + axis: reduction axis, default 0 + consolidate: boolean, default True. Join together blocks having same + dtype + transposed: boolean, default False + we are holding transposed data + interpolation : type of interpolation, default 'linear' + qs : a scalar or list of the quantiles to be computed + numeric_only : ignored + + Returns + ------- + Block Manager (new object) + """ + + # Series dispatches to DataFrame for quantile, which allows us to + # simplify some of the code here and in the blocks + assert self.ndim >= 2 + + if consolidate: + self._consolidate_inplace() + + def get_axe(block, qs, axes): + # Because Series dispatches to DataFrame, we will always have + # block.ndim == 2 + from pandas import Float64Index + + if is_list_like(qs): + ax = Float64Index(qs) + else: + ax = axes[0] + return ax + + axes, blocks = [], [] + for b in self.blocks: + block = b.quantile(axis=axis, qs=qs, interpolation=interpolation) + + axe = get_axe(b, qs, axes=self.axes) + + axes.append(axe) + blocks.append(block) + + # note that some DatetimeTZ, Categorical are always ndim==1 + ndim = {b.ndim for b in blocks} + assert 0 not in ndim, ndim + + if 2 in ndim: + + new_axes = list(self.axes) + + # multiple blocks that are reduced + if len(blocks) > 1: + new_axes[1] = axes[0] + + # reset the placement to the original + for b, sb in zip(blocks, self.blocks): + b.mgr_locs = sb.mgr_locs + + else: + new_axes[axis] = Index(np.concatenate([ax.values for ax in axes])) + + if transposed: + new_axes = new_axes[::-1] + blocks = [ + b.make_block(b.values.T, placement=np.arange(b.shape[1])) + for b in blocks + ] + + return type(self)(blocks, new_axes) + + # single block, i.e. 
ndim == {1} + values = concat_compat([b.values for b in blocks]) + + # compute the orderings of our original data + if len(self.blocks) > 1: + + indexer = np.empty(len(self.axes[0]), dtype=np.intp) + i = 0 + for b in self.blocks: + for j in b.mgr_locs: + indexer[j] = i + i = i + 1 + + values = values.take(indexer) + + return SingleBlockManager( + [make_block(values, ndim=1, placement=np.arange(len(values)))], axes[0] + ) + + def isna(self, func): + return self.apply("apply", func=func) + + def where(self, **kwargs): + return self.apply("where", **kwargs) + + def setitem(self, **kwargs): + return self.apply("setitem", **kwargs) + + def putmask(self, **kwargs): + return self.apply("putmask", **kwargs) + + def diff(self, **kwargs): + return self.apply("diff", **kwargs) + + def interpolate(self, **kwargs): + return self.apply("interpolate", **kwargs) + + def shift(self, **kwargs): + return self.apply("shift", **kwargs) + + def fillna(self, **kwargs): + return self.apply("fillna", **kwargs) + + def downcast(self, **kwargs): + return self.apply("downcast", **kwargs) + + def astype(self, dtype, copy: bool = False, errors: str = "raise"): + return self.apply("astype", dtype=dtype, copy=copy, errors=errors) + + def convert(self, **kwargs): + return self.apply("convert", **kwargs) + + def replace(self, value, **kwargs): + assert np.ndim(value) == 0, value + return self.apply("replace", value=value, **kwargs) + + def replace_list(self, src_list, dest_list, inplace=False, regex=False): + """ do a list replace """ + + inplace = validate_bool_kwarg(inplace, "inplace") + + # figure out our mask a-priori to avoid repeated replacements + values = self.as_array() + + def comp(s, regex=False): + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ + if isna(s): + return isna(values) + if isinstance(s, (Timedelta, Timestamp)) and getattr(s, "tz", None) is None: + + return _compare_or_regex_search( + maybe_convert_objects(values), s.asm8, regex + ) + return _compare_or_regex_search(values, s, regex) + + masks = [comp(s, regex) for i, s in enumerate(src_list)] + + result_blocks = [] + src_len = len(src_list) - 1 + for blk in self.blocks: + + # its possible to get multiple result blocks here + # replace ALWAYS will return a list + rb = [blk if inplace else blk.copy()] + for i, (s, d) in enumerate(zip(src_list, dest_list)): + # TODO: assert/validate that `d` is always a scalar? 
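# Standalone numpy sketch of the mask-first strategy in replace_list above:
# one boolean mask per source value is computed against the whole array up
# front (comp()), then each replacement is applied; the real code additionally
# slices every mask per block through b.mgr_locs.
import numpy as np

values = np.array([[1, 2, 3], [2, 4, 2]], dtype=object)
src_list, dest_list = [2, 4], [20, 40]

masks = [values == s for s in src_list]       # computed a priori
result = values.copy()
for mask, d in zip(masks, dest_list):
    result[mask] = d

print(result)     # [[1 20 3]
                  #  [20 40 20]]
# roughly what pd.DataFrame(values).replace([2, 4], [20, 40]) produces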
+ new_rb = [] + for b in rb: + m = masks[i][b.mgr_locs.indexer] + convert = i == src_len + result = b._replace_coerce( + mask=m, + to_replace=s, + value=d, + inplace=inplace, + convert=convert, + regex=regex, + ) + if m.any() or convert: + new_rb = _extend_blocks(result, new_rb) + else: + new_rb.append(b) + rb = new_rb + result_blocks.extend(rb) + + bm = type(self)(result_blocks, self.axes) + bm._consolidate_inplace() + return bm + + def is_consolidated(self): + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self): + ftypes = [blk.ftype for blk in self.blocks] + self._is_consolidated = len(ftypes) == len(set(ftypes)) + self._known_consolidated = True + + @property + def is_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return len(self.blocks) > 1 + + @property + def is_numeric_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return all(block.is_numeric for block in self.blocks) + + @property + def is_datelike_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return any(block.is_datelike for block in self.blocks) + + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self): + """ return a boolean if we are a single block and are a view """ + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks w/o causing a SettingWithCopy raise/warn. 
But this is a bit + # complicated + + return False + + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) + + def get_numeric_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) + + def combine(self, blocks, copy=True): + """ return a new manager with the blocks """ + if len(blocks) == 0: + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks = [] + for b in blocks: + b = b.copy(deep=copy) + b.mgr_locs = algos.take_1d( + inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False + ) + new_blocks.append(b) + + axes = list(self.axes) + axes[0] = self.items.take(indexer) + + return type(self)(new_blocks, axes, do_integrity_check=False) + + def get_slice(self, slobj: slice, axis: int = 0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + else: + _slicer = [slice(None)] * (axis + 1) + _slicer[axis] = slobj + slicer = tuple(_slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + bm = type(self)(new_blocks, new_axes, do_integrity_check=False) + bm._consolidate_inplace() + return bm + + def __contains__(self, item) -> bool: + return item in self.items + + @property + def nblocks(self) -> int: + return len(self.blocks) + + def copy(self, deep=True): + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : bool or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. tests.io.json.test_pandas + + def copy_func(ax): + if deep == "all": + return ax.copy(deep=True) + else: + return ax.view() + + new_axes = [copy_func(ax) for ax in self.axes] + else: + new_axes = list(self.axes) + + res = self.apply("copy", deep=deep) + res.axes = new_axes + return res + + def as_array(self, transpose=False, items=None): + """Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : boolean, default False + If True, transpose the return array + items : list of strings or None + Names of block items that will be included in the returned + array. ``None`` means that all block items will be used + + Returns + ------- + arr : ndarray + """ + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + if items is not None: + mgr = self.reindex_axis(items, axis=0) + else: + mgr = self + + if self._is_single_block and mgr.blocks[0].is_datetimetz: + # TODO(Block.get_values): Make DatetimeTZBlock.get_values + # always be object dtype. 
Some callers seem to want the + # DatetimeArray (previously DTI) + arr = mgr.blocks[0].get_values(dtype=object) + elif self._is_single_block or not self.is_mixed_type: + arr = np.asarray(mgr.blocks[0].get_values()) + else: + arr = mgr._interleave() + + return arr.transpose() if transpose else arr + + def _interleave(self): + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + dtype = _interleaved_dtype(self.blocks) + + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. + if is_sparse(dtype): + dtype = dtype.subtype + elif is_extension_array_dtype(dtype): + dtype = "object" + + result = np.empty(self.shape, dtype=dtype) + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + + def to_dict(self, copy=True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : boolean, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + + Notes + ----- + This consolidates based on str(dtype) + """ + self._consolidate_inplace() + + bd = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} + + def fast_xs(self, loc): + """ + get a cross sectional for a given location in the + items ; handle dups + + return the result, is *could* be a view in the case of a + single block + """ + if len(self.blocks) == 1: + return self.blocks[0].iget((slice(None), loc)) + + items = self.items + + # non-unique (GH4726) + if not items.is_unique: + result = self._interleave() + if self.ndim == 2: + result = result.T + return result[loc] + + # unique + dtype = _interleaved_dtype(self.blocks) + + n = len(items) + if is_extension_array_dtype(dtype): + # we'll eventually construct an ExtensionArray. + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk.iget((i, loc)) + + if is_extension_array_dtype(dtype): + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + + return result + + def consolidate(self): + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = type(self)(self.blocks, self.axes) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self): + if not self.is_consolidated(): + self.blocks = tuple(_consolidate(self.blocks)) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + def get(self, item): + """ + Return values for selected item (ndarray or BlockManager). 
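# User-side sketch of what _interleave/as_array above imply: flattening a
# mixed-dtype manager into a single ndarray has to pick one common dtype for
# all blocks.
import pandas as pd

df = pd.DataFrame({"i": [1, 2], "f": [0.5, 1.5]})
print(df.values.dtype)       # float64: common dtype of int64 and float64

df["s"] = ["a", "b"]
print(df.values.dtype)       # object: no narrower dtype can hold all blocks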
+ """ + if self.items.is_unique: + + if not isna(item): + loc = self.items.get_loc(item) + else: + indexer = np.arange(len(self.items))[isna(self.items)] + + # allow a single nan location indexer + if not is_scalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc) + else: + + if isna(item): + raise TypeError("cannot label index with a null key") + + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer( + new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True + ) + + def iget(self, i): + """ + Return the data as a SingleBlockManager if possible + + Otherwise return as a ndarray + """ + block = self.blocks[self._blknos[i]] + values = block.iget(self._blklocs[i]) + + # shortcut for select a single-dim from a 2-dim BM + return SingleBlockManager( + [ + block.make_block_same_class( + values, placement=slice(0, len(values)), ndim=1 + ) + ], + self.axes[1], + ) + + def delete(self, item): + """ + Delete selected item (items if non-unique) in-place. + """ + indexer = self.items.get_loc(item) + + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = -is_deleted.cumsum() + + is_blk_deleted = [False] * len(self.blocks) + + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] + + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] + + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs + + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) + + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple( + b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] + ) + self._shape = None + self._rebuild_blknos_and_blklocs() + + def set(self, item, value): + """ + Set new item in-place. Does not consolidate. 
Adds new Block if not + contained in the current set of items + """ + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + + value_is_extension_type = is_extension_array_dtype(value) + + # categorical/sparse/datetimetz + if value_is_extension_type: + + def value_getitem(placement): + return value + + else: + if value.ndim == self.ndim - 1: + value = _safe_reshape(value, (1,) + value.shape) + + def value_getitem(placement): + return value + + else: + + def value_getitem(placement): + return value[placement.indexer] + + if value.shape[1:] != self.shape[1:]: + raise AssertionError( + "Shape of new values must be compatible with manager shape" + ) + + try: + loc = self.items.get_loc(item) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + if isinstance(loc, int): + loc = [loc] + + blknos = self._blknos[loc] + blklocs = self._blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs)) + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int64) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = algos.take_1d( + new_blknos, self._blknos, axis=0, allow_fill=False + ) + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) + + if unfit_val_locs: + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_mgr_locs) + + new_blocks = [] + if value_is_extension_type: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. + new_blocks.extend( + make_block( + values=value.copy(), + ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1), + ) + for mgr_loc in unfit_mgr_locs + ) + + self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks) + self._blklocs[unfit_mgr_locs] = 0 + + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + make_block( + values=value_getitem(unfit_val_items), + ndim=self.ndim, + placement=unfit_mgr_locs, + ) + ) + + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) + + # Newly created block's dtype may already be present. + self._known_consolidated = False + + def insert(self, loc: int, item, value, allow_duplicates: bool = False): + """ + Insert item at selected position. 
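# Sketch of the two paths in set() above, seen from the public API: writing a
# compatible dtype lands in the existing block (blk.should_store(value) is
# True), while an incompatible dtype evicts the column into a new block.
# `_data` and nblocks are internals of this pandas version, shown only for
# illustration.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})    # a single int64 block
print(df._data.nblocks)                          # 1

df["a"] = np.array([10, 20])                     # still integer: stored in place
print(df._data.nblocks)                          # 1

df["a"] = np.array([0.1, 0.2])                   # float can't live in the int block
print(df._data.nblocks)                          # 2: "b" stays, "a" moves to a new block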
+ + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {item}, already exists") + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) + + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster, let's use it if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) + + self.axes[0] = new_axis + self.blocks += (block,) + self._shape = None + + self._known_consolidated = False + + if len(self.blocks) > 100: + self._consolidate_inplace() + + def reindex_axis( + self, new_index, axis, method=None, limit=None, fill_value=None, copy=True + ): + """ + Conform block manager to new index. + """ + new_index = ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit + ) + + return self.reindex_indexer( + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + ) + + def reindex_indexer( + self, new_axis, indexer, axis, fill_value=None, allow_dups=False, copy=True + ): + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + self._consolidate_inplace() + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) + else: + new_blocks = [ + blk.take_nd( + indexer, + axis=axis, + fill_tuple=( + fill_value if fill_value is not None else blk.fill_value, + ), + ) + for blk in self.blocks + ] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return type(self)(new_blocks, new_axes) + + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + """ + Slice/take blocks along axis=0. 
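# Sketch of the fragmentation behaviour implied by insert() above: each new
# column is appended as its own block, and same-dtype blocks are only merged
# again when the manager consolidates (forced above once there are more than
# 100 blocks, and also reached through operations such as copy() in this
# version). Internals shown for illustration only.
import numpy as np
import pandas as pd

df = pd.DataFrame({"c0": np.zeros(4)})
for i in range(1, 10):
    df[f"c{i}"] = np.zeros(4)      # every insert appends one more float block

print(df._data.nblocks)            # 10
df = df.copy()                     # copying consolidates same-dtype blocks
print(df._data.nblocks)            # 1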
+ + Overloaded for SingleBlock + + Returns + ------- + new_blocks : list of Block + """ + + allow_fill = fill_tuple is not None + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill + ) + + if self._is_single_block: + blk = self.blocks[0] + + if sl_type in ("slice", "mask"): + return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = maybe_promote(blk.dtype) + fill_tuple = (fill_value,) + + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple, + ) + ] + + if sl_type in ("slice", "mask"): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = algos.take_1d( + self._blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_1d( + self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append( + self._make_na_block(placement=mgr_locs, fill_value=fill_value) + ) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate: + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) + + else: + blocks.append( + blk.take_nd( + blklocs[mgr_locs.indexer], + axis=0, + new_mgr_locs=mgr_locs, + fill_tuple=None, + ) + ) + + return blocks + + def _make_na_block(self, placement, fill_value=None): + # TODO: infer dtypes other than float64 from fill_value + + if fill_value is None: + fill_value = np.nan + block_shape = list(self.shape) + block_shape[0] = len(placement) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = np.empty(block_shape, dtype=dtype) + block_values.fill(fill_value) + return make_block(block_values, placement=placement) + + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. 
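# Sketch of _make_na_block above from the user side: reindexing onto a label
# that does not exist yet creates a block built around the fill value; with the
# default NaN fill, infer_dtype_from_scalar makes the new column float64.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
out = df.reindex(columns=["a", "z"])
print(out["z"].dtype)            # float64
print(out["z"].isna().all())     # True
# passing fill_value=... instead lets infer_dtype_from_scalar pick the block dtype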
+ """ + self._consolidate_inplace() + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) + + n = self.shape[axis] + if convert: + indexer = maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception("Indices must be nonzero and less than the axis length") + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) + + def equals(self, other): + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + self._consolidate_inplace() + other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the mgr_locs + # then type name because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.mgr_locs.as_array.tolist(), block.dtype.name) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) + return all( + block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks) + ) + + def unstack(self, unstacker_func, fill_value): + """Return a blockmanager with all blocks unstacked. + + Parameters + ---------- + unstacker_func : callable + A (partially-applied) ``pd.core.reshape._Unstacker`` class. + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + n_rows = self.shape[-1] + dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) + new_columns = dummy.get_new_columns() + new_index = dummy.get_new_index() + new_blocks = [] + columns_mask = [] + + for blk in self.blocks: + blocks, mask = blk._unstack( + partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), + new_columns, + n_rows, + fill_value, + ) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index]) + return bm + + +class SingleBlockManager(BlockManager): + """ manage a single block with """ + + ndim = 1 + _is_consolidated = True + _known_consolidated = True + __slots__ = () + + def __init__( + self, + block: Block, + axis: Union[Index, List[Index]], + do_integrity_check: bool = False, + fastpath: bool = False, + ): + if isinstance(axis, list): + if len(axis) != 1: + raise ValueError( + "cannot create SingleBlockManager with more than 1 axis" + ) + axis = axis[0] + + # passed from constructor, single block, single axis + if fastpath: + self.axes = [axis] + if isinstance(block, list): + + # empty block + if len(block) == 0: + block = [np.array([])] + elif len(block) != 1: + raise ValueError( + "Cannot create SingleBlockManager with more than 1 block" + ) + block = block[0] + else: + self.axes = [ensure_index(axis)] + + # create the block here + if isinstance(block, list): + + # provide consolidation to the interleaved_dtype + if len(block) > 1: + dtype = _interleaved_dtype(block) + block = [b.astype(dtype) for b in block] + block = _consolidate(block) + + if len(block) != 1: + raise ValueError( + "Cannot create SingleBlockManager with more than 1 block" + ) + block = block[0] + + if not isinstance(block, Block): + 
block = make_block(block, placement=slice(0, len(axis)), ndim=1) + + self.blocks = tuple([block]) + + def _post_setstate(self): + pass + + @property + def _block(self): + return self.blocks[0] + + @property + def _values(self): + return self._block.values + + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True,) + + @property + def index(self): + return self.axes[0] + + @property + def dtype(self): + return self._block.dtype + + @property + def array_dtype(self): + return self._block.array_dtype + + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_dtypes(self): + return np.array([self._block.dtype]) + + def external_values(self): + return self._block.external_values() + + def internal_values(self): + return self._block.internal_values() + + def get_values(self): + """ return a dense type view """ + return np.array(self._block.to_dense(), copy=False) + + @property + def _can_hold_na(self): + return self._block._can_hold_na + + def is_consolidated(self): + return True + + def _consolidate_check(self): + pass + + def _consolidate_inplace(self): + pass + + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) + + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + return self._block.values[loc] + + def concat(self, to_concat, new_axis): + """ + Concatenate a list of SingleBlockManagers into a single + SingleBlockManager. + + Used for pd.concat of Series objects with axis=0. + + Parameters + ---------- + to_concat : list of SingleBlockManagers + new_axis : Index of the result + + Returns + ------- + SingleBlockManager + + """ + non_empties = [x for x in to_concat if len(x) > 0] + + # check if all series are of the same block type: + if len(non_empties) > 0: + blocks = [obj.blocks[0] for obj in non_empties] + if len({b.dtype for b in blocks}) == 1: + new_block = blocks[0].concat_same_type(blocks) + else: + values = [x.values for x in blocks] + values = concat_compat(values) + new_block = make_block(values, placement=slice(0, len(values), 1)) + else: + values = [x._block.values for x in to_concat] + values = concat_compat(values) + new_block = make_block(values, placement=slice(0, len(values), 1)) + + mgr = SingleBlockManager(new_block, new_axis) + return mgr + + +# -------------------------------------------------------------------- +# Constructor Helpers + + +def create_block_manager_from_blocks(blocks, axes): + try: + if len(blocks) == 1 and not isinstance(blocks[0], Block): + # if blocks[0] is of length 0, return empty blocks + if not len(blocks[0]): + blocks = [] + else: + # It's OK if a single block is passed as values, its placement + # is basically "all items", but if there're many, don't bother + # converting, it's an error anyway. 
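# Sketch of SingleBlockManager.concat above through pd.concat of Series along
# axis 0: when every input shares one block dtype the block type is preserved;
# otherwise the values go through concat_compat and a common dtype is found.
import pandas as pd

a = pd.Series([1, 2])               # int64
b = pd.Series([3, 4])               # int64
print(pd.concat([a, b]).dtype)      # int64

c = pd.Series([0.5])                # float64
print(pd.concat([a, c]).dtype)      # float64: common dtype of int64 and float64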
+ blocks = [ + make_block(values=blocks[0], placement=slice(0, len(axes[0]))) + ] + + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + + except ValueError as e: + blocks = [getattr(b, "values", b) for b in blocks] + tot_items = sum(b.shape[0] for b in blocks) + construction_error(tot_items, blocks[0].shape[1:], axes, e) + + +def create_block_manager_from_arrays(arrays, names, axes): + + try: + blocks = form_blocks(arrays, names, axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + except ValueError as e: + construction_error(len(arrays), arrays[0].shape, axes, e) + + +def construction_error(tot_items, block_shape, axes, e=None): + """ raise a helpful message about our construction """ + passed = tuple(map(int, [tot_items] + list(block_shape))) + # Correcting the user facing error message during dataframe construction + if len(passed) <= 2: + passed = passed[::-1] + + implied = tuple(len(ax) for ax in axes) + # Correcting the user facing error message during dataframe construction + if len(implied) <= 2: + implied = implied[::-1] + + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + + +# ----------------------------------------------------------------------- + + +def form_blocks(arrays, names, axes): + # put "leftover" items in float bucket, where else? + # generalize? + items_dict = defaultdict(list) + extra_locs = [] + + names_idx = ensure_index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) + + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue + + k = names[name_idx] + v = arrays[name_idx] + + block_type = get_block_type(v) + items_dict[block_type.__name__].append((i, k, v)) + + blocks = [] + if len(items_dict["FloatBlock"]): + float_blocks = _multi_blockify(items_dict["FloatBlock"]) + blocks.extend(float_blocks) + + if len(items_dict["ComplexBlock"]): + complex_blocks = _multi_blockify(items_dict["ComplexBlock"]) + blocks.extend(complex_blocks) + + if len(items_dict["TimeDeltaBlock"]): + timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) + blocks.extend(timedelta_blocks) + + if len(items_dict["IntBlock"]): + int_blocks = _multi_blockify(items_dict["IntBlock"]) + blocks.extend(int_blocks) + + if len(items_dict["DatetimeBlock"]): + datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], _NS_DTYPE) + blocks.extend(datetime_blocks) + + if len(items_dict["DatetimeTZBlock"]): + dttz_blocks = [ + make_block(array, klass=DatetimeTZBlock, placement=[i]) + for i, _, array in items_dict["DatetimeTZBlock"] + ] + blocks.extend(dttz_blocks) + + if len(items_dict["BoolBlock"]): + bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_) + blocks.extend(bool_blocks) + + if len(items_dict["ObjectBlock"]) > 0: + object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_) + blocks.extend(object_blocks) + + if len(items_dict["CategoricalBlock"]) > 0: + cat_blocks = [ + make_block(array, klass=CategoricalBlock, placement=[i]) + for i, _, array in items_dict["CategoricalBlock"] + ] + blocks.extend(cat_blocks) + + if len(items_dict["ExtensionBlock"]): + + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, 
array in items_dict["ExtensionBlock"] + ] + + blocks.extend(external_blocks) + + if len(items_dict["ObjectValuesExtensionBlock"]): + external_blocks = [ + make_block(array, klass=ObjectValuesExtensionBlock, placement=[i]) + for i, _, array in items_dict["ObjectValuesExtensionBlock"] + ] + + blocks.extend(external_blocks) + + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) + + # empty items -> dtype object + block_values = np.empty(shape, dtype=object) + block_values.fill(np.nan) + + na_block = make_block(block_values, placement=extra_locs) + blocks.append(na_block) + + return blocks + + +def _simple_blockify(tuples, dtype): + """ return a single array of a block that has a single dtype; if dtype is + not None, coerce to this dtype + """ + values, placement = _stack_arrays(tuples, dtype) + + # TODO: CHECK DTYPE? + if dtype is not None and values.dtype != dtype: # pragma: no cover + values = values.astype(dtype) + + block = make_block(values, placement=placement) + return [block] + + +def _multi_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes """ + + # group by dtype + grouper = itertools.groupby(tuples, lambda x: x[2].dtype) + + new_blocks = [] + for dtype, tup_block in grouper: + + values, placement = _stack_arrays(list(tup_block), dtype) + + block = make_block(values, placement=placement) + new_blocks.append(block) + + return new_blocks + + +def _stack_arrays(tuples, dtype): + + # fml + def _asarray_compat(x): + if isinstance(x, ABCSeries): + return x._values + else: + return np.asarray(x) + + def _shape_compat(x): + if isinstance(x, ABCSeries): + return (len(x),) + else: + return x.shape + + placement, names, arrays = zip(*tuples) + + first = arrays[0] + shape = (len(arrays),) + _shape_compat(first) + + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) + + return stacked, placement + + +def _interleaved_dtype( + blocks: List[Block], +) -> Optional[Union[np.dtype, ExtensionDtype]]: + """Find the common dtype for `blocks`. + + Parameters + ---------- + blocks : List[Block] + + Returns + ------- + dtype : Optional[Union[np.dtype, ExtensionDtype]] + None is returned when `blocks` is empty. + """ + if not len(blocks): + return None + + return find_common_type([b.dtype for b in blocks]) + + +def _consolidate(blocks): + """ + Merge blocks having same dtype, exclude non-consolidating blocks + """ + + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks = _merge_blocks( + list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate + ) + new_blocks = _extend_blocks(merged_blocks, new_blocks) + return new_blocks + + +def _compare_or_regex_search(a, b, regex=False): + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. 
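# Sketch of the regex branch described above: the element-wise matcher is a
# vectorized re.search that only ever matches string elements, mirroring the
# op defined just below in _compare_or_regex_search.
import re
import numpy as np

b = r"^ba"                                        # plays the role of `b`
op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) else False)

a = np.array(["bar", "foo", 3, "baz"], dtype=object)
print(op(a))                                      # [ True False False  True]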
+ + Parameters + ---------- + a : array_like or scalar + b : array_like or scalar + regex : bool, default False + + Returns + ------- + mask : array_like of bool + """ + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + ) + + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + result = False + else: + result = op(a) + + if is_scalar(result) and (is_a_array or is_b_array): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = f"ndarray(dtype={a.dtype})" + + if is_b_array: + type_names[1] = f"ndarray(dtype={b.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + return result + + +def _transform_index(index, func, level=None): + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + + """ + if isinstance(index, MultiIndex): + if level is not None: + items = [ + tuple(func(y) if i == level else y for i, y in enumerate(x)) + for x in index + ] + else: + items = [tuple(func(y) for y in x) for x in index] + return MultiIndex.from_tuples(items, names=index.names) + else: + items = [func(x) for x in index] + return Index(items, name=index.name, tupleize_cols=False) + + +def _fast_count_smallints(arr): + """Faster version of set(arr) for sequences of small numbers.""" + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] + + +def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): + if isinstance(slice_or_indexer, slice): + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + elif ( + isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_ + ): + return "mask", slice_or_indexer, slice_or_indexer.sum() + else: + indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return "fancy", indexer, len(indexer) + + +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): + """ + Concatenate block managers into one. 
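# Sketch of _fast_count_smallints above: np.bincount counts every distinct
# small integer, and pairing the nonzero counts with their values gives
# (blkno, count) rows.
import numpy as np

arr = np.array([0, 2, 2, 5, 0, 2])
counts = np.bincount(arr)            # [2 0 3 0 0 1]
nz = counts.nonzero()[0]             # [0 2 5]
print(np.c_[nz, counts[nz]])         # [[0 2]
                                     #  [2 3]
                                     #  [5 1]]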
+ + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + """ + concat_plans = [ + get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] + concat_plan = combine_concat_plans(concat_plans, concat_axis) + blocks = [] + + for placement, join_units in concat_plan: + + if len(join_units) == 1 and not join_units[0].indexers: + b = join_units[0].block + values = b.values + if copy: + values = values.copy() + else: + values = values.view() + b = b.make_block_same_class(values, placement=placement) + elif is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type( + [ju.block for ju in join_units], placement=placement + ) + else: + b = make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement, + ) + blocks.append(b) + + return BlockManager(blocks, axes) diff --git a/venv/Lib/site-packages/pandas/core/missing.py b/venv/Lib/site-packages/pandas/core/missing.py new file mode 100644 index 0000000..b30a7a2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/missing.py @@ -0,0 +1,684 @@ +""" +Routines for filling missing data. +""" + +import numpy as np + +from pandas._libs import algos, lib +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import infer_dtype_from_array +from pandas.core.dtypes.common import ( + ensure_float64, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_numeric_v_string_like, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import isna + + +def mask_missing(arr, values_to_mask): + """ + Return a masking array of same size/shape as arr + with entries equaling any member of values_to_mask set to True + """ + dtype, values_to_mask = infer_dtype_from_array(values_to_mask) + + try: + values_to_mask = np.array(values_to_mask, dtype=dtype) + + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) + + na_mask = isna(values_to_mask) + nonna = values_to_mask[~na_mask] + + mask = None + for x in nonna: + if mask is None: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask = False + else: + mask = arr == x + + # if x is a string and arr is not, then we get False and we must + # expand the mask to size arr.shape + if is_scalar(mask): + mask = np.zeros(arr.shape, dtype=bool) + else: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask |= False + else: + mask |= arr == x + + if na_mask.any(): + if mask is None: + mask = isna(arr) + else: + mask |= isna(arr) + + # GH 21977 + if mask is None: + mask = np.zeros(arr.shape, dtype=bool) + + return mask + + +def clean_fill_method(method, allow_nearest=False): + # asfreq is compat for resampling + if method in [None, "asfreq"]: + return None + + if isinstance(method, str): + method = method.lower() + if method == "ffill": + method = "pad" + elif method == "bfill": + method = "backfill" + + valid_methods = ["pad", "backfill"] + expecting = "pad (ffill) or backfill (bfill)" + if allow_nearest: + valid_methods.append("nearest") + expecting = "pad (ffill), backfill (bfill) or nearest" + if method not in valid_methods: + raise ValueError(f"Invalid fill method. Expecting {expecting}. 
Got {method}") + return method + + +def clean_interp_method(method, **kwargs): + order = kwargs.get("order") + valid = [ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "polynomial", + "krogh", + "piecewise_polynomial", + "pchip", + "akima", + "spline", + "from_derivatives", + ] + if method in ("spline", "polynomial") and order is None: + raise ValueError("You must specify the order of the spline or polynomial.") + if method not in valid: + raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") + + return method + + +def find_valid_index(values, how: str): + """ + Retrieves the index of the first valid value. + + Parameters + ---------- + values : ndarray or ExtensionArray + how : {'first', 'last'} + Use this parameter to change between the first or last valid index. + + Returns + ------- + int or None + """ + assert how in ["first", "last"] + + if len(values) == 0: # early stop + return None + + is_valid = ~isna(values) + + if values.ndim == 2: + is_valid = is_valid.any(1) # reduce axis 1 + + if how == "first": + idxpos = is_valid[::].argmax() + + if how == "last": + idxpos = len(values) - 1 - is_valid[::-1].argmax() + + chk_notna = is_valid[idxpos] + + if not chk_notna: + return None + return idxpos + + +def interpolate_1d( + xvalues, + yvalues, + method="linear", + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + bounds_error=False, + order=None, + **kwargs, +): + """ + Logic for the 1-d interpolation. The result should be 1-d, inputs + xvalues and yvalues will each be 1-d arrays of the same length. + + Bounds_error is currently hardcoded to False since non-scipy ones don't + take it as an argument. + """ + # Treat the original, non-scipy methods first. + + invalid = isna(yvalues) + valid = ~invalid + + if not valid.any(): + # have to call np.asarray(xvalues) since xvalues could be an Index + # which can't be mutated + result = np.empty_like(np.asarray(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if valid.all(): + return yvalues + + if method == "time": + if not getattr(xvalues, "is_all_dates", None): + # if not issubclass(xvalues.dtype.type, np.datetime64): + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a " + "DatetimeIndex" + ) + method = "values" + + valid_limit_directions = ["forward", "backward", "both"] + limit_direction = limit_direction.lower() + if limit_direction not in valid_limit_directions: + raise ValueError( + "Invalid limit_direction: expecting one of " + f"{valid_limit_directions}, got '{limit_direction}'." + ) + + if limit_area is not None: + valid_limit_areas = ["inside", "outside"] + limit_area = limit_area.lower() + if limit_area not in valid_limit_areas: + raise ValueError( + f"Invalid limit_area: expecting one of {valid_limit_areas}, got " + f"{limit_area}." + ) + + # default limit is unlimited GH #16282 + limit = algos._validate_limit(nobs=None, limit=limit) + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(find_valid_index(yvalues, "first"))) + end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. 
+ + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + else: + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == "inside": + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == "outside": + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and covert to list + preserve_nans = sorted(preserve_nans) + + xvalues = getattr(xvalues, "values", xvalues) + yvalues = getattr(yvalues, "values", yvalues) + result = yvalues.copy() + + if method in ["linear", "time", "index", "values"]: + if method in ("values", "index"): + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if needs_i8_conversion(inds.dtype.type): + inds = inds.view(np.int64) + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + else: + inds = xvalues + # np.interp requires sorted X values, #21037 + indexer = np.argsort(inds[valid]) + result[invalid] = np.interp( + inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + ) + result[preserve_nans] = np.nan + return result + + sp_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ] + + if method in sp_methods: + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if issubclass(inds.dtype.type, np.datetime64): + inds = inds.view(np.int64) + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs, + ) + result[preserve_nans] = np.nan + return result + + +def _interpolate_scipy_wrapper( + x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs +): + """ + Passed off to scipy.interpolate.interp1d. method is scipy's kind. + Returns an array interpolated at new_x. Add any new methods to + the list in _clean_interp_method. + """ + extra = f"{method} interpolation requires SciPy." + import_optional_dependency("scipy", extra=extra) + from scipy import interpolate + + new_x = np.asarray(new_x) + + # ignores some kwargs that could be passed along. + alt_methods = { + "barycentric": interpolate.barycentric_interpolate, + "krogh": interpolate.krogh_interpolate, + "from_derivatives": _from_derivatives, + "piecewise_polynomial": _from_derivatives, + } + + if getattr(x, "is_all_dates", False): + # GH 5975, scipy.interp1d can't handle datetime64s + x, new_x = x._values.astype("i8"), new_x.astype("i8") + + if method == "pchip": + try: + alt_methods["pchip"] = interpolate.pchip_interpolate + except AttributeError: + raise ImportError( + "Your version of Scipy does not support PCHIP interpolation." 
+ ) + elif method == "akima": + alt_methods["akima"] = _akima_interpolate + + interp1d_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "polynomial", + ] + if method in interp1d_methods: + if method == "polynomial": + method = order + terp = interpolate.interp1d( + x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error + ) + new_y = terp(new_x) + elif method == "spline": + # GH #10633, #24014 + if isna(order) or (order <= 0): + raise ValueError( + f"order needs to be specified and greater than 0; got order: {order}" + ) + terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) + new_y = terp(new_x) + else: + # GH 7295: need to be able to write for some reason + # in some circumstances: check all three + if not x.flags.writeable: + x = x.copy() + if not y.flags.writeable: + y = y.copy() + if not new_x.flags.writeable: + new_x = new_x.copy() + method = alt_methods[method] + new_y = method(x, y, new_x, **kwargs) + return new_y + + +def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): + """ + Convenience function for interpolate.BPoly.from_derivatives. + + Construct a piecewise polynomial in the Bernstein basis, compatible + with the specified values and derivatives at breakpoints. + + Parameters + ---------- + xi : array_like + sorted 1D array of x-coordinates + yi : array_like or list of array-likes + yi[i][j] is the j-th derivative known at xi[i] + order: None or int or array_like of ints. Default: None. + Specifies the degree of local polynomials. If not None, some + derivatives are ignored. + der : int or list + How many derivatives to extract; None for all potentially nonzero + derivatives (that is a number equal to the number of points), or a + list of derivatives to extract. This numberincludes the function + value as 0th derivative. + extrapolate : bool, optional + Whether to extrapolate to ouf-of-bounds points based on first and last + intervals, or to return NaNs. Default: True. + + See Also + -------- + scipy.interpolate.BPoly.from_derivatives + + Returns + ------- + y : scalar or array_like + The result, of length R or length M or M by R. + """ + from scipy import interpolate + + # return the method for compat with scipy version & backwards compat + method = interpolate.BPoly.from_derivatives + m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate) + + return m(x) + + +def _akima_interpolate(xi, yi, x, der=0, axis=0): + """ + Convenience function for akima interpolation. + xi and yi are arrays of values used to approximate some function f, + with ``yi = f(xi)``. + + See `Akima1DInterpolator` for details. + + Parameters + ---------- + xi : array_like + A sorted list of x-coordinates, of length N. + yi : array_like + A 1-D array of real values. `yi`'s length along the interpolation + axis must be equal to the length of `xi`. If N-D array, use axis + parameter to select correct axis. + x : scalar or array_like + Of length M. + der : int or list, optional + How many derivatives to extract; None for all potentially + nonzero derivatives (that is a number equal to the number + of points), or a list of derivatives to extract. This number + includes the function value as 0th derivative. + axis : int, optional + Axis in the yi array corresponding to the x-coordinate values. 
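# User-side sketch of the scipy-backed path above: these methods need scipy to
# be importable, and "spline"/"polynomial" additionally require an explicit
# order (clean_interp_method raises otherwise). The expected numbers assume the
# unique parabola through the three known points.
import numpy as np
import pandas as pd

s = pd.Series([0.0, np.nan, 4.0, np.nan, 16.0])

# forwarded to scipy.interpolate.interp1d with kind=order
print(s.interpolate(method="polynomial", order=2).tolist())
# approximately [0.0, 1.0, 4.0, 9.0, 16.0]

try:
    s.interpolate(method="polynomial")        # no order given
except ValueError as err:
    print(err)   # "You must specify the order of the spline or polynomial."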
+ + See Also + -------- + scipy.interpolate.Akima1DInterpolator + + Returns + ------- + y : scalar or array_like + The result, of length R or length M or M by R, + + """ + from scipy import interpolate + + P = interpolate.Akima1DInterpolator(xi, yi, axis=axis) + + if der == 0: + return P(x) + elif interpolate._isscalar(der): + return P(x, der=der) + else: + return [P(x, nu) for nu in der] + + +def interpolate_2d( + values, method="pad", axis=0, limit=None, fill_value=None, dtype=None +): + """ + Perform an actual interpolation of values, values will be make 2-d if + needed fills inplace, returns the result. + """ + orig_values = values + + transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + + # reshape a 1 dim if needed + ndim = values.ndim + if values.ndim == 1: + if axis != 0: # pragma: no cover + raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") + values = values.reshape(tuple((1,) + values.shape)) + + if fill_value is None: + mask = None + else: # todo create faster fill func without masking + mask = mask_missing(transf(values), fill_value) + + method = clean_fill_method(method) + if method == "pad": + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + else: + values = transf( + backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) + ) + + # reshape back + if ndim == 1: + values = values[0] + + if orig_values.dtype.kind == "M": + # convert float back to datetime64 + values = values.astype(orig_values.dtype) + + return values + + +def _cast_values_for_fillna(values, dtype): + """ + Cast values to a dtype that algos.pad and algos.backfill can handle. + """ + # TODO: for int-dtypes we make a copy, but for everything else this + # alters the values in-place. Is this intentional? + + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): + values = values.view(np.int64) + + elif is_integer_dtype(values): + # NB: this check needs to come after the datetime64 check above + values = ensure_float64(values) + + return values + + +def _fillna_prep(values, mask=None, dtype=None): + # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d + if dtype is None: + dtype = values.dtype + + if mask is None: + # This needs to occur before datetime/timedeltas are cast to int64 + mask = isna(values) + + values = _cast_values_for_fillna(values, dtype) + + mask = mask.view(np.uint8) + return values, mask + + +def pad_1d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) + algos.pad_inplace(values, mask, limit=limit) + return values + + +def backfill_1d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) + algos.backfill_inplace(values, mask, limit=limit) + return values + + +def pad_2d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) + + if np.all(values.shape): + algos.pad_2d_inplace(values, mask, limit=limit) + else: + # for test coverage + pass + return values + + +def backfill_2d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) + + if np.all(values.shape): + algos.backfill_2d_inplace(values, mask, limit=limit) + else: + # for test coverage + pass + return values + + +_fill_methods = {"pad": pad_1d, "backfill": backfill_1d} + + +def get_fill_func(method): + method = clean_fill_method(method) + return _fill_methods[method] + + +def clean_reindex_fill_method(method): + return clean_fill_method(method, 
allow_nearest=True) + + +def _interp_limit(invalid, fw_limit, bw_limit): + """ + Get indexers of values that won't be filled + because they exceed the limits. + + Parameters + ---------- + invalid : boolean ndarray + fw_limit : int or None + forward limit to index + bw_limit : int or None + backward limit to index + + Returns + ------- + set of indexers + + Notes + ----- + This is equivalent to the more readable, but slower + + .. code-block:: python + + def _interp_limit(invalid, fw_limit, bw_limit): + for x in np.where(invalid)[0]: + if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + yield x + """ + # handle forward first; the backward direction is the same except + # 1. operate on the reversed array + # 2. subtract the returned indices from N - 1 + N = len(invalid) + f_idx = set() + b_idx = set() + + def inner(invalid, limit): + limit = min(limit, N) + windowed = _rolling_window(invalid, limit + 1).all(1) + idx = set(np.where(windowed)[0] + limit) | set( + np.where((~invalid[: limit + 1]).cumsum() == 0)[0] + ) + return idx + + if fw_limit is not None: + + if fw_limit == 0: + f_idx = set(np.where(invalid)[0]) + else: + f_idx = inner(invalid, fw_limit) + + if bw_limit is not None: + + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx = list(inner(invalid[::-1], bw_limit)) + b_idx = set(N - 1 - np.asarray(b_idx)) + if fw_limit == 0: + return b_idx + + return f_idx & b_idx + + +def _rolling_window(a, window): + """ + [True, True, False, True, False], 2 -> + + [ + [True, True], + [True, False], + [False, True], + [True, False], + ] + """ + # https://stackoverflow.com/a/6811241 + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) diff --git a/venv/Lib/site-packages/pandas/core/nanops.py b/venv/Lib/site-packages/pandas/core/nanops.py new file mode 100644 index 0000000..6b03e76 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/nanops.py @@ -0,0 +1,1424 @@ +import functools +import itertools +import operator +from typing import Any, Optional, Tuple, Union + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask +from pandas.core.dtypes.common import ( + _get_dtype, + is_any_int_dtype, + is_bool_dtype, + is_complex, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna + +bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") +_BOTTLENECK_INSTALLED = bn is not None +_USE_BOTTLENECK = False + + +def set_use_bottleneck(v=True): + # set/unset to use bottleneck + global _USE_BOTTLENECK + if _BOTTLENECK_INSTALLED: + _USE_BOTTLENECK = v + + +set_use_bottleneck(get_option("compute.use_bottleneck")) + + +class disallow: + def __init__(self, *dtypes): + super().__init__() + self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes) + + def check(self, obj) -> bool: + return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes) + + def 
__call__(self, f): + @functools.wraps(f) + def _f(*args, **kwargs): + obj_iter = itertools.chain(args, kwargs.values()) + if any(self.check(obj) for obj in obj_iter): + f_name = f.__name__.replace("nan", "") + raise TypeError( + f"reduction operation '{f_name}' not allowed for this dtype" + ) + try: + with np.errstate(invalid="ignore"): + return f(*args, **kwargs) + except ValueError as e: + # we want to transform an object array + # ValueError message to the more typical TypeError + # e.g. this is normally a disallowed function on + # object arrays that contain strings + if is_object_dtype(args[0]): + raise TypeError(e) + raise + + return _f + + +class bottleneck_switch: + def __init__(self, name=None, **kwargs): + self.name = name + self.kwargs = kwargs + + def __call__(self, alt): + bn_name = self.name or alt.__name__ + + try: + bn_func = getattr(bn, bn_name) + except (AttributeError, NameError): # pragma: no cover + bn_func = None + + @functools.wraps(alt) + def f(values, axis=None, skipna=True, **kwds): + if len(self.kwargs) > 0: + for k, v in self.kwargs.items(): + if k not in kwds: + kwds[k] = v + + if values.size == 0 and kwds.get("min_count") is None: + # We are empty, returning NA for our type + # Only applies for the default `min_count` of None + # since that affects how empty arrays are handled. + # TODO(GH-18976) update all the nanops methods to + # correctly handle empty inputs and remove this check. + # It *may* just be `var` + return _na_for_min_count(values, axis) + + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name): + if kwds.get("mask", None) is None: + # `mask` is not recognised by bottleneck, would raise + # TypeError if called + kwds.pop("mask", None) + result = bn_func(values, axis=axis, **kwds) + + # prefer to treat inf/-inf as NA, but must compute the func + # twice :( + if _has_infs(result): + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + + return result + + return f + + +def _bn_ok_dtype(dt, name: str) -> bool: + # Bottleneck chokes on datetime64 + if not is_object_dtype(dt) and not ( + is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) + ): + + # GH 15507 + # bottleneck does not properly upcast during the sum + # so can overflow + + # GH 9422 + # further we also want to preserve NaN when all elements + # are NaN, unlinke bottleneck/numpy which consider this + # to be 0 + if name in ["nansum", "nanprod"]: + return False + + return True + return False + + +def _has_infs(result) -> bool: + if isinstance(result, np.ndarray): + if result.dtype == "f8": + return lib.has_infs_f8(result.ravel()) + elif result.dtype == "f4": + return lib.has_infs_f4(result.ravel()) + try: + return np.isinf(result).any() + except (TypeError, NotImplementedError): + # if it doesn't support infs, then it can't have infs + return False + + +def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): + """ return the correct fill value for the dtype of the values """ + if fill_value is not None: + return fill_value + if _na_ok_dtype(dtype): + if fill_value_typ is None: + return np.nan + else: + if fill_value_typ == "+inf": + return np.inf + else: + return -np.inf + else: + if fill_value_typ is None: + return iNaT + else: + if fill_value_typ == "+inf": + # need the max int here + return _int64_max + else: + return iNaT + + +def _maybe_get_mask( + values: np.ndarray, skipna: bool, mask: Optional[np.ndarray] +) -> 
Optional[np.ndarray]: + """ + Compute a mask if and only if necessary. + + This function will compute a mask iff it is necessary. Otherwise, + return the provided mask (potentially None) when a mask does not need to be + computed. + + A mask is never necessary if the values array is of boolean or integer + dtypes, as these are incapable of storing NaNs. If passing a NaN-capable + dtype that is interpretable as either boolean or integer data (eg, + timedelta64), a mask must be provided. + + If the skipna parameter is False, a new mask will not be computed. + + The mask is computed using isna() by default. Setting invert=True selects + notna() as the masking function. + + Parameters + ---------- + values : ndarray + input array to potentially compute mask for + skipna : bool + boolean for whether NaNs should be skipped + mask : Optional[ndarray] + nan-mask if known + + Returns + ------- + Optional[np.ndarray] + """ + + if mask is None: + if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype): + # Boolean data cannot contain nulls, so signal via mask being None + return None + + if skipna: + mask = isna(values) + + return mask + + +def _get_values( + values: np.ndarray, + skipna: bool, + fill_value: Any = None, + fill_value_typ: Optional[str] = None, + mask: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: + """ + Utility to get the values view, mask, dtype, dtype_max, and fill_value. + + If both mask and fill_value/fill_value_typ are not None and skipna is True, + the values array will be copied. + + For input arrays of boolean or integer dtypes, copies will only occur if a + precomputed mask, a fill_value/fill_value_typ, and skipna=True are + provided. + + Parameters + ---------- + values : ndarray + input array to potentially compute mask for + skipna : bool + boolean for whether NaNs should be skipped + fill_value : Any + value to fill NaNs with + fill_value_typ : str + Set to '+inf' or '-inf' to handle dtype-specific infinities + mask : Optional[np.ndarray] + nan-mask if known + + Returns + ------- + values : ndarray + Potential copy of input value array + mask : Optional[ndarray[bool]] + Mask for values, if deemed necessary to compute + dtype : dtype + dtype for values + dtype_max : dtype + platform independent dtype + fill_value : Any + fill value used + """ + + # In _get_values is only called from within nanops, and in all cases + # with scalar fill_value. 
This guarantee is important for the + # maybe_upcast_putmask call below + assert is_scalar(fill_value) + + mask = _maybe_get_mask(values, skipna, mask) + + if is_datetime64tz_dtype(values): + # lib.values_from_object returns M8[ns] dtype instead of tz-aware, + # so this case must be handled separately from the rest + dtype = values.dtype + values = getattr(values, "_values", values) + else: + values = lib.values_from_object(values) + dtype = values.dtype + + if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): + # changing timedelta64/datetime64 to int64 needs to happen after + # finding `mask` above + values = getattr(values, "asi8", values) + values = values.view(np.int64) + + dtype_ok = _na_ok_dtype(dtype) + + # get our fill value (in case we need to provide an alternative + # dtype for it) + fill_value = _get_fill_value( + dtype, fill_value=fill_value, fill_value_typ=fill_value_typ + ) + + copy = (mask is not None) and (fill_value is not None) + + if skipna and copy: + values = values.copy() + if dtype_ok: + np.putmask(values, mask, fill_value) + + # promote if needed + else: + values, _ = maybe_upcast_putmask(values, mask, fill_value) + + # return a platform independent precision dtype + dtype_max = dtype + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + dtype_max = np.int64 + elif is_float_dtype(dtype): + dtype_max = np.float64 + + return values, mask, dtype, dtype_max, fill_value + + +def _na_ok_dtype(dtype): + # TODO: what about datetime64tz? PeriodDtype? + return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) + + +def _wrap_results(result, dtype, fill_value=None): + """ wrap our results if needed """ + + if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + if fill_value is None: + # GH#24293 + fill_value = iNaT + if not isinstance(result, np.ndarray): + tz = getattr(dtype, "tz", None) + assert not isna(fill_value), "Expected non-null fill_value" + if result == fill_value: + result = np.nan + result = Timestamp(result, tz=tz) + else: + result = result.view(dtype) + elif is_timedelta64_dtype(dtype): + if not isinstance(result, np.ndarray): + if result == fill_value: + result = np.nan + + # raise if we have a timedelta64[ns] which is too large + if np.fabs(result) > _int64_max: + raise ValueError("overflow in timedelta operation") + + result = Timedelta(result, unit="ns") + else: + result = result.astype("m8[ns]").view(dtype) + + return result + + +def _na_for_min_count(values, axis: Optional[int]): + """ + Return the missing value for `values`. + + Parameters + ---------- + values : ndarray + axis : int or None + axis for the reduction, required if values.ndim > 1. + + Returns + ------- + result : scalar or ndarray + For 1-D values, returns a scalar of the correct missing type. + For 2-D values, returns a 1-D array where each element is missing. + """ + # we either return np.nan or pd.NaT + if is_numeric_dtype(values): + values = values.astype("float64") + fill_value = na_value_for_dtype(values.dtype) + + if values.ndim == 1: + return fill_value + else: + assert axis is not None # assertion to make mypy happy + result_shape = values.shape[:axis] + values.shape[axis + 1 :] + result = np.empty(result_shape, dtype=values.dtype) + result.fill(fill_value) + return result + + +def nanany(values, axis=None, skipna: bool = True, mask=None): + """ + Check if any elements along an axis evaluate to True. 
+ + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2]) + >>> nanops.nanany(s) + True + + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([np.nan]) + >>> nanops.nanany(s) + False + """ + values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask) + return values.any(axis) + + +def nanall(values, axis=None, skipna: bool = True, mask=None): + """ + Check if all elements along an axis evaluate to True. + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanall(s) + True + + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 0]) + >>> nanops.nanall(s) + False + """ + values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask) + return values.all(axis) + + +@disallow("M8") +def nansum(values, axis=None, skipna=True, min_count=0, mask=None): + """ + Sum the elements along an axis ignoring NaNs + + Parameters + ---------- + values : ndarray[dtype] + axis: int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : dtype + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nansum(s) + 3.0 + """ + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) + dtype_sum = dtype_max + if is_float_dtype(dtype): + dtype_sum = dtype + elif is_timedelta64_dtype(dtype): + dtype_sum = np.float64 + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) + + return _wrap_results(the_sum, dtype) + + +@disallow("M8", DatetimeTZDtype) +@bottleneck_switch() +def nanmean(values, axis=None, skipna=True, mask=None): + """ + Compute the mean of the element along an axis ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. 
+ + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanmean(s) + 1.5 + """ + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) + dtype_sum = dtype_max + dtype_count = np.float64 + if ( + is_integer_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): + dtype_sum = np.float64 + elif is_float_dtype(dtype): + dtype_sum = dtype + dtype_count = dtype + count = _get_counts(values.shape, mask, axis, dtype=dtype_count) + the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + + if axis is not None and getattr(the_sum, "ndim", False): + with np.errstate(all="ignore"): + # suppress division by zero warnings + the_mean = the_sum / count + ct_mask = count == 0 + if ct_mask.any(): + the_mean[ct_mask] = np.nan + else: + the_mean = the_sum / count if count > 0 else np.nan + + return _wrap_results(the_mean, dtype) + + +@disallow("M8") +@bottleneck_switch() +def nanmedian(values, axis=None, skipna=True, mask=None): + """ + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 2]) + >>> nanops.nanmedian(s) + 2.0 + """ + + def get_median(x): + mask = notna(x) + if not skipna and not mask.all(): + return np.nan + return np.nanmedian(x[mask]) + + values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) + if not is_float_dtype(values): + values = values.astype("f8") + if mask is not None: + values[mask] = np.nan + + if axis is None: + values = values.ravel() + + notempty = values.size + + # an array from a frame + if values.ndim > 1: + + # there's a non-empty array to apply over otherwise numpy raises + if notempty: + if not skipna: + return _wrap_results( + np.apply_along_axis(get_median, axis, values), dtype + ) + + # fastpath for the skipna case + return _wrap_results(np.nanmedian(values, axis), dtype) + + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + shp = np.array(values.shape) + dims = np.arange(values.ndim) + ret = np.empty(shp[dims != axis]) + ret.fill(np.nan) + return _wrap_results(ret, dtype) + + # otherwise return a scalar value + return _wrap_results(get_median(values) if notempty else np.nan, dtype) + + +def _get_counts_nanvar( + value_counts: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + ddof: int, + dtype=float, +) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]: + """ Get the count of non-null values along an axis, accounting + for degrees of freedom. 
+ + Parameters + ---------- + values_shape : Tuple[int] + shape tuple from values ndarray, used if mask is None + mask : Optional[ndarray[bool]] + locations in values that should be considered missing + axis : Optional[int] + axis to count along + ddof : int + degrees of freedom + dtype : type, optional + type to use for count + + Returns + ------- + count : scalar or array + d : scalar or array + """ + dtype = _get_dtype(dtype) + count = _get_counts(value_counts, mask, axis, dtype=dtype) + d = count - dtype.type(ddof) + + # always return NaN, never inf + if is_scalar(count): + if count <= ddof: + count = np.nan + d = np.nan + else: + mask2: np.ndarray = count <= ddof + if mask2.any(): + np.putmask(d, mask2, np.nan) + np.putmask(count, mask2, np.nan) + return count, d + + +@disallow("M8") +@bottleneck_switch(ddof=1) +def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the standard deviation along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanstd(s) + 1.0 + """ + orig_dtype = values.dtype + values, mask, dtype, dtype_max, fill_value = _get_values(values, skipna, mask=mask) + + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) + return _wrap_results(result, orig_dtype) + + +@disallow("M8", "m8") +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the variance along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanvar(s) + 1.0 + """ + values = lib.values_from_object(values) + dtype = values.dtype + mask = _maybe_get_mask(values, skipna, mask) + if is_any_int_dtype(values): + values = values.astype("f8") + if mask is not None: + values[mask] = np.nan + + if is_float_dtype(values): + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) + else: + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + + # xref GH10242 + # Compute variance via two-pass algorithm, which is stable against + # cancellation errors and relatively accurate for small numbers of + # observations. 
+ # + # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count + if axis is not None: + avg = np.expand_dims(avg, axis) + sqr = _ensure_numeric((avg - values) ** 2) + if mask is not None: + np.putmask(sqr, mask, 0) + result = sqr.sum(axis=axis, dtype=np.float64) / d + + # Return variance as np.float64 (the datatype used in the accumulator), + # unless we were dealing with a float array, in which case use the same + # precision as the original values array. + if is_float_dtype(dtype): + result = result.astype(dtype) + return _wrap_results(result, values.dtype) + + +@disallow("M8", "m8") +def nansem(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the standard error in the mean along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nansem(s) + 0.5773502691896258 + """ + + # This checks if non-numeric-like data is passed with numeric_only=False + # and raises a TypeError otherwise + nanvar(values, axis, skipna, ddof=ddof, mask=mask) + + mask = _maybe_get_mask(values, skipna, mask) + if not is_float_dtype(values.dtype): + values = values.astype("f8") + + count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) + var = nanvar(values, axis, skipna, ddof=ddof) + + return np.sqrt(var) / np.sqrt(count) + + +def _nanminmax(meth, fill_value_typ): + @bottleneck_switch(name="nan" + meth) + def reduction(values, axis=None, skipna=True, mask=None): + + values, mask, dtype, dtype_max, fill_value = _get_values( + values, skipna, fill_value_typ=fill_value_typ, mask=mask + ) + + if (axis is not None and values.shape[axis] == 0) or values.size == 0: + try: + result = getattr(values, meth)(axis, dtype=dtype_max) + result.fill(np.nan) + except (AttributeError, TypeError, ValueError): + result = np.nan + else: + result = getattr(values, meth)(axis) + + result = _wrap_results(result, dtype, fill_value) + return _maybe_null_out(result, axis, mask, values.shape) + + return reduction + + +nanmin = _nanminmax("min", fill_value_typ="+inf") +nanmax = _nanminmax("max", fill_value_typ="-inf") + + +@disallow("O") +def nanargmax(values, axis=None, skipna=True, mask=None): + """ + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : int + The index of max value in specified axis or -1 in the NA case + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmax(s) + 4 + """ + values, mask, dtype, _, _ = _get_values( + values, True, fill_value_typ="-inf", mask=mask + ) + result = values.argmax(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + + +@disallow("O") +def nanargmin(values, axis=None, skipna=True, mask=None): + """ + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + 
nan-mask if known + + Returns + ------- + result : int + The index of min value in specified axis or -1 in the NA case + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmin(s) + 0 + """ + values, mask, dtype, _, _ = _get_values( + values, True, fill_value_typ="+inf", mask=mask + ) + result = values.argmin(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + + +@disallow("M8", "m8") +def nanskew(values, axis=None, skipna=True, mask=None): + """ Compute the sample skewness. + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G1. The algorithm computes this coefficient directly + from the second and third central moment. + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1,np.nan, 1, 2]) + >>> nanops.nanskew(s) + 1.7320508075688787 + """ + values = lib.values_from_object(values) + mask = _maybe_get_mask(values, skipna, mask) + if not is_float_dtype(values.dtype): + values = values.astype("f8") + count = _get_counts(values.shape, mask, axis) + else: + count = _get_counts(values.shape, mask, axis, dtype=values.dtype) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna and mask is not None: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted ** 2 + adjusted3 = adjusted2 * adjusted + m2 = adjusted2.sum(axis, dtype=np.float64) + m3 = adjusted3.sum(axis, dtype=np.float64) + + # floating point error + # + # #18044 in _libs/windows.pyx calc_skew follow this behavior + # to fix the fperr to treat m2 <1e-14 as zero + m2 = _zero_out_fperr(m2) + m3 = _zero_out_fperr(m3) + + with np.errstate(invalid="ignore", divide="ignore"): + result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) + + dtype = values.dtype + if is_float_dtype(dtype): + result = result.astype(dtype) + + if isinstance(result, np.ndarray): + result = np.where(m2 == 0, 0, result) + result[count < 3] = np.nan + return result + else: + result = 0 if m2 == 0 else result + if count < 3: + return np.nan + return result + + +@disallow("M8", "m8") +def nankurt(values, axis=None, skipna=True, mask=None): + """ + Compute the sample excess kurtosis + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G2, computed directly from the second and fourth + central moment. + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. 
+ + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1,np.nan, 1, 3, 2]) + >>> nanops.nankurt(s) + -1.2892561983471076 + """ + values = lib.values_from_object(values) + mask = _maybe_get_mask(values, skipna, mask) + if not is_float_dtype(values.dtype): + values = values.astype("f8") + count = _get_counts(values.shape, mask, axis) + else: + count = _get_counts(values.shape, mask, axis, dtype=values.dtype) + + if skipna and mask is not None: + values = values.copy() + np.putmask(values, mask, 0) + + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna and mask is not None: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted ** 2 + adjusted4 = adjusted2 ** 2 + m2 = adjusted2.sum(axis, dtype=np.float64) + m4 = adjusted4.sum(axis, dtype=np.float64) + + with np.errstate(invalid="ignore", divide="ignore"): + adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + numer = count * (count + 1) * (count - 1) * m4 + denom = (count - 2) * (count - 3) * m2 ** 2 + + # floating point error + # + # #18044 in _libs/windows.pyx calc_kurt follow this behavior + # to fix the fperr to treat denom <1e-14 as zero + numer = _zero_out_fperr(numer) + denom = _zero_out_fperr(denom) + + if not isinstance(denom, np.ndarray): + # if ``denom`` is a scalar, check these corner cases first before + # doing division + if count < 4: + return np.nan + if denom == 0: + return 0 + + with np.errstate(invalid="ignore", divide="ignore"): + result = numer / denom - adj + + dtype = values.dtype + if is_float_dtype(dtype): + result = result.astype(dtype) + + if isinstance(result, np.ndarray): + result = np.where(denom == 0, 0, result) + result[count < 4] = np.nan + + return result + + +@disallow("M8", "m8") +def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): + """ + Parameters + ---------- + values : ndarray[dtype] + axis: int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : dtype + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan]) + >>> nanops.nanprod(s) + 6.0 + + Returns + ------- + The product of all elements on a given axis. 
( NaNs are treated as 1) + """ + mask = _maybe_get_mask(values, skipna, mask) + + if skipna and mask is not None: + values = values.copy() + values[mask] = 1 + result = values.prod(axis) + return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count) + + +def _maybe_arg_null_out( + result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool +) -> Union[np.ndarray, int]: + # helper function for nanargmin/nanargmax + if mask is None: + return result + + if axis is None or not getattr(result, "ndim", False): + if skipna: + if mask.all(): + result = -1 + else: + if mask.any(): + result = -1 + else: + if skipna: + na_mask = mask.all(axis) + else: + na_mask = mask.any(axis) + if na_mask.any(): + result[na_mask] = -1 + return result + + +def _get_counts( + values_shape: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + dtype=float, +) -> Union[int, np.ndarray]: + """ Get the count of non-null values along an axis + + Parameters + ---------- + values_shape : Tuple[int] + shape tuple from values ndarray, used if mask is None + mask : Optional[ndarray[bool]] + locations in values that should be considered missing + axis : Optional[int] + axis to count along + dtype : type, optional + type to use for count + + Returns + ------- + count : scalar or array + """ + dtype = _get_dtype(dtype) + if axis is None: + if mask is not None: + n = mask.size - mask.sum() + else: + n = np.prod(values_shape) + return dtype.type(n) + + if mask is not None: + count = mask.shape[axis] - mask.sum(axis) + else: + count = values_shape[axis] + + if is_scalar(count): + return dtype.type(count) + try: + return count.astype(dtype) + except AttributeError: + return np.array(count, dtype=dtype) + + +def _maybe_null_out( + result: np.ndarray, + axis: Optional[int], + mask: Optional[np.ndarray], + shape: Tuple, + min_count: int = 1, +) -> np.ndarray: + if mask is not None and axis is not None and getattr(result, "ndim", False): + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 + if np.any(null_mask): + if is_numeric_dtype(result): + if np.iscomplexobj(result): + result = result.astype("c16") + else: + result = result.astype("f8") + result[null_mask] = np.nan + else: + # GH12941, use None to auto cast null + result[null_mask] = None + elif result is not NaT: + if mask is not None: + null_mask = mask.size - mask.sum() + else: + null_mask = np.prod(shape) + if null_mask < min_count: + result = np.nan + + return result + + +def _zero_out_fperr(arg): + # #18044 reference this behavior to fix rolling skew/kurt issue + if isinstance(arg, np.ndarray): + with np.errstate(invalid="ignore"): + return np.where(np.abs(arg) < 1e-14, 0, arg) + else: + return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg + + +@disallow("M8", "m8") +def nancorr(a, b, method="pearson", min_periods=None): + """ + a, b: ndarrays + """ + if len(a) != len(b): + raise AssertionError("Operands to nancorr must have same size") + + if min_periods is None: + min_periods = 1 + + valid = notna(a) & notna(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + f = get_corr_func(method) + return f(a, b) + + +def get_corr_func(method): + if method in ["kendall", "spearman"]: + from scipy.stats import kendalltau, spearmanr + elif method in ["pearson"]: + pass + elif callable(method): + return method + else: + raise ValueError( + f"Unkown method '{method}', expected one of 'kendall', 'spearman'" + ) + + def _pearson(a, b): + return np.corrcoef(a, b)[0, 1] + + 
def _kendall(a, b): + # kendallttau returns a tuple of the tau statistic and pvalue + rs = kendalltau(a, b) + return rs[0] + + def _spearman(a, b): + return spearmanr(a, b)[0] + + _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman} + return _cor_methods[method] + + +@disallow("M8", "m8") +def nancov(a, b, min_periods=None): + if len(a) != len(b): + raise AssertionError("Operands to nancov must have same size") + + if min_periods is None: + min_periods = 1 + + valid = notna(a) & notna(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + return np.cov(a, b)[0, 1] + + +def _ensure_numeric(x): + if isinstance(x, np.ndarray): + if is_integer_dtype(x) or is_bool_dtype(x): + x = x.astype(np.float64) + elif is_object_dtype(x): + try: + x = x.astype(np.complex128) + except (TypeError, ValueError): + x = x.astype(np.float64) + else: + if not np.any(np.imag(x)): + x = x.real + elif not (is_float(x) or is_integer(x) or is_complex(x)): + try: + x = float(x) + except ValueError: + # e.g. "1+1j" or "foo" + try: + x = complex(x) + except ValueError: + # e.g. "foo" + raise TypeError(f"Could not convert {x} to numeric") + return x + + +# NA-friendly array comparisons + + +def make_nancomp(op): + def f(x, y): + xmask = isna(x) + ymask = isna(y) + mask = xmask | ymask + + with np.errstate(all="ignore"): + result = op(x, y) + + if mask.any(): + if is_bool_dtype(result): + result = result.astype("O") + np.putmask(result, mask, np.nan) + + return result + + return f + + +nangt = make_nancomp(operator.gt) +nange = make_nancomp(operator.ge) +nanlt = make_nancomp(operator.lt) +nanle = make_nancomp(operator.le) +naneq = make_nancomp(operator.eq) +nanne = make_nancomp(operator.ne) + + +def _nanpercentile_1d(values, mask, q, na_value, interpolation): + """ + Wrapper for np.percentile that skips missing values, specialized to + 1-dimensional case. + + Parameters + ---------- + values : array over which to find quantiles + mask : ndarray[bool] + locations in values that should be considered missing + q : scalar or array of quantile indices to find + na_value : scalar + value to return for empty or all-null values + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ + # mask is Union[ExtensionArray, ndarray] + values = values[~mask] + + if len(values) == 0: + if lib.is_scalar(q): + return na_value + else: + return np.array([na_value] * len(q), dtype=values.dtype) + + return np.percentile(values, q, interpolation=interpolation) + + +def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): + """ + Wrapper for np.percentile that skips missing values. 
+ + Parameters + ---------- + values : array over which to find quantiles + q : scalar or array of quantile indices to find + axis : {0, 1} + na_value : scalar + value to return for empty or all-null values + mask : ndarray[bool] + locations in values that should be considered missing + ndim : {1, 2} + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ + if values.dtype.kind in ["m", "M"]: + # need to cast to integer to avoid rounding errors in numpy + result = nanpercentile( + values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation + ) + + # Note: we have to do do `astype` and not view because in general we + # have float result at this point, not i8 + return result.astype(values.dtype) + + if not lib.is_scalar(mask) and mask.any(): + if ndim == 1: + return _nanpercentile_1d( + values, mask, q, na_value, interpolation=interpolation + ) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [ + _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation) + for (val, m) in zip(list(values), list(mask)) + ] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return np.percentile(values, q, axis=axis, interpolation=interpolation) diff --git a/venv/Lib/site-packages/pandas/core/ops/__init__.py b/venv/Lib/site-packages/pandas/core/ops/__init__.py new file mode 100644 index 0000000..f51d71d --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/__init__.py @@ -0,0 +1,793 @@ +""" +Arithmetic operations for PandasObjects + +This is not a public API. +""" +import datetime +import operator +from typing import Set, Tuple, Union + +import numpy as np + +from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCExtensionArray, + ABCIndexClass, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.construction import extract_array +from pandas.core.ops.array_ops import ( + arithmetic_op, + comparison_op, + define_na_arithmetic_op, + get_array_op, + logical_op, +) +from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.ops.dispatch import should_series_dispatch +from pandas.core.ops.docstrings import ( + _arith_doc_FRAME, + _flex_comp_doc_FRAME, + _make_flex_doc, + _op_descriptions, +) +from pandas.core.ops.invalid import invalid_comparison # noqa:F401 +from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 +from pandas.core.ops.methods import ( # noqa:F401 + add_flex_arithmetic_methods, + add_special_arithmetic_methods, +) +from pandas.core.ops.roperator import ( # noqa:F401 + radd, + rand_, + rdiv, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + +# ----------------------------------------------------------------------------- +# constants +ARITHMETIC_BINOPS: Set[str] = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "radd", + "rsub", + "rmul", + "rpow", + "rmod", + "rfloordiv", + "rtruediv", + "rdivmod", +} + + +COMPARISON_BINOPS: Set[str] = { + "eq", + "ne", + "lt", + "gt", + "le", + 
"ge", +} + +# ----------------------------------------------------------------------------- +# Ops Wrapping Utilities + + +def get_op_result_name(left, right): + """ + Find the appropriate name to pin to an operation result. This result + should always be either an Index or a Series. + + Parameters + ---------- + left : {Series, Index} + right : object + + Returns + ------- + name : object + Usually a string + """ + # `left` is always a Series when called from within ops + if isinstance(right, (ABCSeries, ABCIndexClass)): + name = _maybe_match_name(left, right) + else: + name = left.name + return name + + +def _maybe_match_name(a, b): + """ + Try to find a name to attach to the result of an operation between + a and b. If only one of these has a `name` attribute, return that + name. Otherwise return a consensus name if they match of None if + they have different names. + + Parameters + ---------- + a : object + b : object + + Returns + ------- + name : str or None + + See Also + -------- + pandas.core.common.consensus_name_attr + """ + a_has = hasattr(a, "name") + b_has = hasattr(b, "name") + if a_has and b_has: + if a.name == b.name: + return a.name + else: + # TODO: what if they both have np.nan for their names? + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None + + +def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): + """ + Cast non-pandas objects to pandas types to unify behavior of arithmetic + and comparison operations. + + Parameters + ---------- + obj: object + shape : tuple[int] + + Returns + ------- + out : object + + Notes + ----- + Be careful to call this *after* determining the `name` attribute to be + attached to the result of the arithmetic operation. + """ + from pandas.core.arrays import DatetimeArray, TimedeltaArray + + if type(obj) is datetime.timedelta: + # GH#22390 cast up to Timedelta to rely on Timedelta + # implementation; otherwise operation against numeric-dtype + # raises TypeError + return Timedelta(obj) + elif isinstance(obj, np.datetime64): + # GH#28080 numpy casts integer-dtype to datetime64 when doing + # array[int] + datetime64, which we do not allow + if isna(obj): + # Avoid possible ambiguities with pd.NaT + obj = obj.astype("datetime64[ns]") + right = np.broadcast_to(obj, shape) + return DatetimeArray(right) + + return Timestamp(obj) + + elif isinstance(obj, np.timedelta64): + if isna(obj): + # wrapping timedelta64("NaT") in Timedelta returns NaT, + # which would incorrectly be treated as a datetime-NaT, so + # we broadcast and wrap in a TimedeltaArray + obj = obj.astype("timedelta64[ns]") + right = np.broadcast_to(obj, shape) + return TimedeltaArray(right) + + # In particular non-nanosecond timedelta64 needs to be cast to + # nanoseconds, or else we get undesired behavior like + # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') + return Timedelta(obj) + + elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): + # GH#22390 Unfortunately we need to special-case right-hand + # timedelta64 dtypes because numpy casts integer dtypes to + # timedelta64 when operating with timedelta64 + return TimedeltaArray._from_sequence(obj) + return obj + + +# ----------------------------------------------------------------------------- + + +def _get_frame_op_default_axis(name): + """ + Only DataFrame cares about default_axis, specifically: + special methods have default_axis=None and flex methods + have default_axis='columns'. 
+ + Parameters + ---------- + name : str + + Returns + ------- + default_axis: str or None + """ + if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: + # bool methods + return "columns" + elif name.startswith("__"): + # __add__, __mul__, ... + return None + else: + # add, mul, ... + return "columns" + + +def _get_opstr(op): + """ + Find the operation string, if any, to pass to numexpr for this + operation. + + Parameters + ---------- + op : binary operator + + Returns + ------- + op_str : string or None + """ + + return { + operator.add: "+", + radd: "+", + operator.mul: "*", + rmul: "*", + operator.sub: "-", + rsub: "-", + operator.truediv: "/", + rtruediv: "/", + operator.floordiv: "//", + rfloordiv: "//", + operator.mod: None, # TODO: Why None for mod but '%' for rmod? + rmod: "%", + operator.pow: "**", + rpow: "**", + operator.eq: "==", + operator.ne: "!=", + operator.le: "<=", + operator.lt: "<", + operator.ge: ">=", + operator.gt: ">", + operator.and_: "&", + rand_: "&", + operator.or_: "|", + ror_: "|", + operator.xor: "^", + rxor: "^", + divmod: None, + rdivmod: None, + }[op] + + +def _get_op_name(op, special): + """ + Find the name to attach to this method according to conventions + for special and non-special methods. + + Parameters + ---------- + op : binary operator + special : bool + + Returns + ------- + op_name : str + """ + opname = op.__name__.strip("_") + if special: + opname = f"__{opname}__" + return opname + + +# ----------------------------------------------------------------------------- +# Masking NA values and fallbacks for operations numpy does not support + + +def fill_binop(left, right, fill_value): + """ + If a non-None fill_value is given, replace null entries in left and right + with this value, but only in positions where _one_ of left/right is null, + not both. + + Parameters + ---------- + left : array-like + right : array-like + fill_value : object + + Returns + ------- + left : array-like + right : array-like + + Notes + ----- + Makes copies if fill_value is not None + """ + # TODO: can we make a no-copy implementation? + if fill_value is not None: + left_mask = isna(left) + right_mask = isna(right) + left = left.copy() + right = right.copy() + + # one but not both + mask = left_mask ^ right_mask + left[left_mask & mask] = fill_value + right[right_mask & mask] = fill_value + return left, right + + +# ----------------------------------------------------------------------------- +# Dispatch logic + + +def dispatch_to_series(left, right, func, str_rep=None, axis=None): + """ + Evaluate the frame operation func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + left : DataFrame + right : scalar or DataFrame + func : arithmetic or comparison operator + str_rep : str or None, default None + axis : {None, 0, 1, "index", "columns"} + + Returns + ------- + DataFrame + """ + # Note: we use iloc to access columns for compat with cases + # with non-unique columns. + import pandas.core.computation.expressions as expressions + + right = lib.item_from_zerodim(right) + if lib.is_scalar(right) or np.ndim(right) == 0: + + # Get the appropriate array-op to apply to each block's values. 
+ array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=right) + return type(left)(bm) + + elif isinstance(right, ABCDataFrame): + assert right._indexed_same(left) + + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + + elif isinstance(right, ABCSeries) and axis == "columns": + # We only get here if called via _combine_series_frame, + # in which case we specifically want to operate row-by-row + assert right.index.equals(left.columns) + + if right.dtype == "timedelta64[ns]": + # ensure we treat NaT values as the correct dtype + # Note: we do not do this unconditionally as it may be lossy or + # expensive for EA dtypes. + right = np.asarray(right) + + def column_op(a, b): + return {i: func(a.iloc[:, i], b[i]) for i in range(len(a.columns))} + + else: + + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} + + elif isinstance(right, ABCSeries): + assert right.index.equals(left.index) # Handle other cases later + + def column_op(a, b): + return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} + + else: + # Remaining cases have less-obvious dispatch rules + raise NotImplementedError(right) + + new_data = expressions.evaluate(column_op, str_rep, left, right) + return new_data + + +# ----------------------------------------------------------------------------- +# Series + + +def _align_method_SERIES(left, right, align_asobject=False): + """ align lhs and rhs Series """ + + # ToDo: Different from _align_method_FRAME, list, tuple and ndarray + # are not coerced here + # because Series has inconsistencies described in #13637 + + if isinstance(right, ABCSeries): + # avoid repeated alignment + if not left.index.equals(right.index): + + if align_asobject: + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) + + left, right = left.align(right, copy=False) + + return left, right + + +def _construct_result( + left: ABCSeries, + result: Union[np.ndarray, ABCExtensionArray], + index: ABCIndexClass, + name, +): + """ + Construct an appropriately-labelled Series from the result of an op. + + Parameters + ---------- + left : Series + result : ndarray or ExtensionArray + index : Index + name : object + + Returns + ------- + Series + In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. + """ + if isinstance(result, tuple): + # produced by divmod or rdivmod + return ( + _construct_result(left, result[0], index=index, name=name), + _construct_result(left, result[1], index=index, name=name), + ) + + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. + out = left._constructor(result, index=index) + out = out.__finalize__(left) + + # Set the result's name after __finalize__ is called because __finalize__ + # would set it back to self.name + out.name = name + return out + + +def _arith_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
+ """ + str_rep = _get_opstr(op) + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(left, right): + + left, right = _align_method_SERIES(left, right) + res_name = get_op_result_name(left, right) + + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) + result = arithmetic_op(lvalues, rvalues, op, str_rep) + + return _construct_result(left, result, index=left.index, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _comp_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(self, other): + + res_name = get_op_result_name(self, other) + + if isinstance(other, ABCSeries) and not self._indexed_same(other): + raise ValueError("Can only compare identically-labeled Series objects") + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = comparison_op(lvalues, rvalues, op) + + return _construct_result(self, res_values, index=self.index, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _bool_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(self, other): + self, other = _align_method_SERIES(self, other, align_asobject=True) + res_name = get_op_result_name(self, other) + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = logical_op(lvalues, rvalues, op) + return _construct_result(self, res_values, index=self.index, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _flex_method_SERIES(cls, op, special): + name = _get_op_name(op, special) + doc = _make_flex_doc(name, "series") + + @Appender(doc) + def flex_wrapper(self, other, level=None, fill_value=None, axis=0): + # validate axis + if axis is not None: + self._get_axis_number(axis) + + if isinstance(other, ABCSeries): + return self._binop(other, op, level=level, fill_value=fill_value) + elif isinstance(other, (np.ndarray, list, tuple)): + if len(other) != len(self): + raise ValueError("Lengths must be equal") + other = self._constructor(other, self.index) + return self._binop(other, op, level=level, fill_value=fill_value) + else: + if fill_value is not None: + self = self.fillna(fill_value) + + return op(self, other) + + flex_wrapper.__name__ = name + return flex_wrapper + + +# ----------------------------------------------------------------------------- +# DataFrame + + +def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=None): + """ + Apply binary operator `func` to self, other using alignment and fill + conventions determined by the fill_value, axis, and level kwargs. 
+ + Parameters + ---------- + self : DataFrame + other : Series + func : binary operator + fill_value : object, default None + axis : {0, 1, 'columns', 'index', None}, default None + level : int or None, default None + + Returns + ------- + result : DataFrame + """ + if fill_value is not None: + raise NotImplementedError(f"fill_value {fill_value} not supported.") + + if axis is None: + # default axis is columns + axis = 1 + + axis = self._get_axis_number(axis) + left, right = self.align(other, join="outer", axis=axis, level=level, copy=False) + if axis == 0: + new_data = left._combine_match_index(right, func) + else: + new_data = dispatch_to_series(left, right, func, axis="columns") + + return left._construct_result(new_data) + + +def _align_method_FRAME(left, right, axis): + """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ + + def to_series(right): + msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" + if axis is not None and left._get_axis_name(axis) == "index": + if len(left.index) != len(right): + raise ValueError( + msg.format(req_len=len(left.index), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.index) + else: + if len(left.columns) != len(right): + raise ValueError( + msg.format(req_len=len(left.columns), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.columns) + return right + + if isinstance(right, np.ndarray): + + if right.ndim == 1: + right = to_series(right) + + elif right.ndim == 2: + if right.shape == left.shape: + right = left._constructor(right, index=left.index, columns=left.columns) + + elif right.shape[0] == left.shape[0] and right.shape[1] == 1: + # Broadcast across columns + right = np.broadcast_to(right, left.shape) + right = left._constructor(right, index=left.index, columns=left.columns) + + elif right.shape[1] == left.shape[1] and right.shape[0] == 1: + # Broadcast along rows + right = to_series(right[0, :]) + + else: + raise ValueError( + "Unable to coerce to DataFrame, shape " + f"must be {left.shape}: given {right.shape}" + ) + + elif right.ndim > 2: + raise ValueError( + "Unable to coerce to Series/DataFrame, dim " + f"must be <= 2: {right.shape}" + ) + + elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): + # GH17901 + right = to_series(right) + + return right + + +def _arith_method_FRAME(cls, op, special): + str_rep = _get_opstr(op) + op_name = _get_op_name(op, special) + default_axis = _get_frame_op_default_axis(op_name) + + na_op = define_na_arithmetic_op(op, str_rep) + is_logical = str_rep in ["&", "|", "^"] + + if op_name in _op_descriptions: + # i.e. include "add" but not "__add__" + doc = _make_flex_doc(op_name, "dataframe") + else: + doc = _arith_doc_FRAME % op_name + + @Appender(doc) + def f(self, other, axis=default_axis, level=None, fill_value=None): + + other = _align_method_FRAME(self, other, axis) + + if isinstance(other, ABCDataFrame): + # Another DataFrame + pass_op = op if should_series_dispatch(self, other, op) else na_op + pass_op = pass_op if not is_logical else op + + left, right = self.align(other, join="outer", level=level, copy=False) + new_data = left._combine_frame(right, pass_op, fill_value) + return left._construct_result(new_data) + + elif isinstance(other, ABCSeries): + # For these values of `axis`, we end up dispatching to Series op, + # so do not want the masked op. 
+ pass_op = op if axis in [0, "columns", None] else na_op + pass_op = pass_op if not is_logical else op + return _combine_series_frame( + self, other, pass_op, fill_value=fill_value, axis=axis, level=level + ) + else: + # in this case we always have `np.ndim(other) == 0` + if fill_value is not None: + self = self.fillna(fill_value) + + new_data = dispatch_to_series(self, other, op, str_rep) + return self._construct_result(new_data) + + f.__name__ = op_name + + return f + + +def _flex_comp_method_FRAME(cls, op, special): + str_rep = _get_opstr(op) + op_name = _get_op_name(op, special) + default_axis = _get_frame_op_default_axis(op_name) + + doc = _flex_comp_doc_FRAME.format( + op_name=op_name, desc=_op_descriptions[op_name]["desc"] + ) + + @Appender(doc) + def f(self, other, axis=default_axis, level=None): + + other = _align_method_FRAME(self, other, axis) + + if isinstance(other, ABCDataFrame): + # Another DataFrame + if not self._indexed_same(other): + self, other = self.align(other, "outer", level=level, copy=False) + new_data = dispatch_to_series(self, other, op, str_rep) + return self._construct_result(new_data) + + elif isinstance(other, ABCSeries): + return _combine_series_frame( + self, other, op, fill_value=None, axis=axis, level=level + ) + else: + # in this case we always have `np.ndim(other) == 0` + new_data = dispatch_to_series(self, other, op) + return self._construct_result(new_data) + + f.__name__ = op_name + + return f + + +def _comp_method_FRAME(cls, op, special): + str_rep = _get_opstr(op) + op_name = _get_op_name(op, special) + + @Appender(f"Wrapper for comparison method {op_name}") + def f(self, other): + + other = _align_method_FRAME(self, other, axis=None) + + if isinstance(other, ABCDataFrame): + # Another DataFrame + if not self._indexed_same(other): + raise ValueError( + "Can only compare identically-labeled DataFrame objects" + ) + new_data = dispatch_to_series(self, other, op, str_rep) + return self._construct_result(new_data) + + elif isinstance(other, ABCSeries): + return _combine_series_frame( + self, other, op, fill_value=None, axis=None, level=None + ) + else: + + # straight boolean comparisons we want to allow all columns + # (regardless of dtype to pass thru) See #4537 for discussion. + new_data = dispatch_to_series(self, other, op) + return self._construct_result(new_data) + + f.__name__ = op_name + + return f diff --git a/venv/Lib/site-packages/pandas/core/ops/array_ops.py b/venv/Lib/site-packages/pandas/core/ops/array_ops.py new file mode 100644 index 0000000..b84d468 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/array_ops.py @@ -0,0 +1,392 @@ +""" +Functions for arithmetic and comparison operations on NumPy arrays and +ExtensionArrays. 
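+
+A minimal sketch of the lowest-level entry point, assuming plain float64
+ndarrays that require no extension-array dispatch:
+
+>>> import operator
+>>> import numpy as np
+>>> from pandas.core.ops.array_ops import arithmetic_op
+>>> arithmetic_op(np.array([1.0, 2.0]), np.array([3.0, 4.0]), operator.add, "+")
+array([4., 6.])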
+""" +from functools import partial +import operator +from typing import Any, Optional, Union + +import numpy as np + +from pandas._libs import Timestamp, lib, ops as libops + +from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, + find_common_type, + maybe_upcast_putmask, +) +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.generic import ( + ABCDatetimeArray, + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, + ABCTimedeltaArray, +) +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.ops import missing +from pandas.core.ops.dispatch import dispatch_to_extension_op, should_extension_dispatch +from pandas.core.ops.invalid import invalid_comparison +from pandas.core.ops.roperator import rpow + + +def comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = construct_1d_object_array_from_listlike(y) + + # TODO: Should the checks below be ABCIndexClass? + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + # TODO: should this be ABCIndexClass?? + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = libops.vec_compare(x.ravel(), y, op) + else: + result = libops.scalar_compare(x.ravel(), y, op) + return result.reshape(x.shape) + + +def masked_arith_op(x, y, op): + """ + If the given arithmetic operation fails, attempt it again on + only the non-null elements of the input array(s). + + Parameters + ---------- + x : np.ndarray + y : np.ndarray, Series, Index + op : binary operator + """ + # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes + # the logic valid for both Series and DataFrame ops. + xrav = x.ravel() + assert isinstance(x, np.ndarray), type(x) + if isinstance(y, np.ndarray): + dtype = find_common_type([x.dtype, y.dtype]) + result = np.empty(x.size, dtype=dtype) + + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() + mask = notna(xrav) & notna(yrav) + + if yrav.shape != mask.shape: + # FIXME: GH#5284, GH#5035, GH#19448 + # Without specifically raising here we get mismatched + # errors in Py3 (TypeError) vs Py2 (ValueError) + # Note: Only = an issue in DataFrame case + raise ValueError("Cannot broadcast operands together.") + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], yrav[mask]) + + else: + if not is_scalar(y): + raise TypeError(type(y)) + + # mask is only meaningful for x + result = np.empty(x.size, dtype=x.dtype) + mask = notna(xrav) + + # 1 ** np.nan is 1. So we have to unmask those. + if op is pow: + mask = np.where(x == 1, False, mask) + elif op is rpow: + mask = np.where(y == 1, False, mask) + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], y) + + result, _ = maybe_upcast_putmask(result, ~mask, np.nan) + result = result.reshape(x.shape) # 2D compat + return result + + +def define_na_arithmetic_op(op, str_rep: str): + def na_op(x, y): + return na_arithmetic_op(x, y, op, str_rep) + + return na_op + + +def na_arithmetic_op(left, right, op, str_rep: str): + """ + Return the result of evaluating op on the passed in values. + + If native types are not compatible, try coersion to object dtype. 
+
+    Parameters
+    ----------
+    left : np.ndarray
+    right : np.ndarray or scalar
+    op : binary operator
+    str_rep : str or None
+
+    Returns
+    -------
+    array-like
+
+    Raises
+    ------
+    TypeError : invalid operation
+    """
+    import pandas.core.computation.expressions as expressions
+
+    try:
+        result = expressions.evaluate(op, str_rep, left, right)
+    except TypeError:
+        result = masked_arith_op(left, right, op)
+
+    return missing.dispatch_fill_zeros(op, left, right, result)
+
+
+def arithmetic_op(
+    left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep: str
+):
+    """
+    Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...
+
+    Parameters
+    ----------
+    left : np.ndarray or ExtensionArray
+    right : object
+        Cannot be a DataFrame or Index. Series is *not* excluded.
+    op : {operator.add, operator.sub, ...}
+        Or one of the reversed variants from roperator.
+    str_rep : str
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        Or a 2-tuple of these in the case of divmod or rdivmod.
+    """
+
+    from pandas.core.ops import maybe_upcast_for_op
+
+    # NB: We assume that extract_array has already been called
+    # on `left` and `right`.
+    lvalues = left
+    rvalues = right
+
+    rvalues = maybe_upcast_for_op(rvalues, lvalues.shape)
+
+    if should_extension_dispatch(left, rvalues) or isinstance(
+        rvalues, (ABCTimedeltaArray, ABCDatetimeArray, Timestamp)
+    ):
+        # TimedeltaArray, DatetimeArray, and Timestamp are included here
+        # because they have `freq` attribute which is handled correctly
+        # by dispatch_to_extension_op.
+        res_values = dispatch_to_extension_op(op, lvalues, rvalues)
+
+    else:
+        with np.errstate(all="ignore"):
+            res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep)
+
+    return res_values
+
+
+def comparison_op(
+    left: Union[np.ndarray, ABCExtensionArray], right: Any, op
+) -> Union[np.ndarray, ABCExtensionArray]:
+    """
+    Evaluate a comparison operation `==`, `!=`, `>=`, `>`, `<=`, or `<`.
+
+    Parameters
+    ----------
+    left : np.ndarray or ExtensionArray
+    right : object
+        Cannot be a DataFrame, Series, or Index.
+    op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le}
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+    """
+
+    # NB: We assume extract_array has already been called on left and right
+    lvalues = left
+    rvalues = right
+
+    rvalues = lib.item_from_zerodim(rvalues)
+    if isinstance(rvalues, list):
+        # TODO: same for tuples?
+        rvalues = np.asarray(rvalues)
+
+    if isinstance(rvalues, (np.ndarray, ABCExtensionArray, ABCIndexClass)):
+        # TODO: make this treatment consistent across ops and classes.
+        # We are not catching all listlikes here (e.g. frozenset, tuple)
+        # The ambiguous case is object-dtype.
See GH#27803 + if len(lvalues) != len(rvalues): + raise ValueError("Lengths must match to compare") + + if should_extension_dispatch(lvalues, rvalues): + res_values = dispatch_to_extension_op(op, lvalues, rvalues) + + elif is_scalar(rvalues) and isna(rvalues): + # numpy does not like comparisons vs None + if op is operator.ne: + res_values = np.ones(lvalues.shape, dtype=bool) + else: + res_values = np.zeros(lvalues.shape, dtype=bool) + + elif is_object_dtype(lvalues.dtype): + res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) + + else: + op_name = f"__{op.__name__}__" + method = getattr(lvalues, op_name) + with np.errstate(all="ignore"): + res_values = method(rvalues) + + if res_values is NotImplemented: + res_values = invalid_comparison(lvalues, rvalues, op) + if is_scalar(res_values): + typ = type(rvalues) + raise TypeError(f"Could not compare {typ} type with Series") + + return res_values + + +def na_logical_op(x: np.ndarray, y, op): + try: + # For exposition, write: + # yarr = isinstance(y, np.ndarray) + # yint = is_integer(y) or (yarr and y.dtype.kind == "i") + # ybool = is_bool(y) or (yarr and y.dtype.kind == "b") + # xint = x.dtype.kind == "i" + # xbool = x.dtype.kind == "b" + # Then Cases where this goes through without raising include: + # (xint or xbool) and (yint or bool) + result = op(x, y) + except TypeError: + if isinstance(y, np.ndarray): + # bool-bool dtype operations should be OK, should not get here + assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)) + x = ensure_object(x) + y = ensure_object(y) + result = libops.vec_binop(x, y, op) + else: + # let null fall thru + assert lib.is_scalar(y) + if not isna(y): + y = bool(y) + try: + result = libops.scalar_binop(x, y, op) + except ( + TypeError, + ValueError, + AttributeError, + OverflowError, + NotImplementedError, + ): + typ = type(y).__name__ + raise TypeError( + f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " + f"and scalar of type [{typ}]" + ) + + return result + + +def logical_op( + left: Union[np.ndarray, ABCExtensionArray], right: Any, op +) -> Union[np.ndarray, ABCExtensionArray]: + """ + Evaluate a logical operation `|`, `&`, or `^`. + + Parameters + ---------- + left : np.ndarray or ExtensionArray + right : object + Cannot be a DataFrame, Series, or Index. + op : {operator.and_, operator.or_, operator.xor} + Or one of the reversed variants from roperator. + + Returns + ------- + ndarrray or ExtensionArray + """ + + fill_int = lambda x: x + + def fill_bool(x, left=None): + # if `left` is specifically not-boolean, we do not cast to bool + if x.dtype.kind in ["c", "f", "O"]: + # dtypes that can hold NA + mask = isna(x) + if mask.any(): + x = x.astype(object) + x[mask] = False + + if left is None or is_bool_dtype(left.dtype): + x = x.astype(bool) + return x + + is_self_int_dtype = is_integer_dtype(left.dtype) + + right = lib.item_from_zerodim(right) + if is_list_like(right) and not hasattr(right, "dtype"): + # e.g. list, tuple + right = construct_1d_object_array_from_listlike(right) + + # NB: We assume extract_array has already been called on left and right + lvalues = left + rvalues = right + + if should_extension_dispatch(lvalues, rvalues): + res_values = dispatch_to_extension_op(op, lvalues, rvalues) + + else: + if isinstance(rvalues, np.ndarray): + is_other_int_dtype = is_integer_dtype(rvalues.dtype) + rvalues = rvalues if is_other_int_dtype else fill_bool(rvalues, lvalues) + + else: + # i.e. 
scalar + is_other_int_dtype = lib.is_integer(rvalues) + + # For int vs int `^`, `|`, `&` are bitwise operators and return + # integer dtypes. Otherwise these are boolean ops + filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool + + res_values = na_logical_op(lvalues, rvalues, op) + res_values = filler(res_values) # type: ignore + + return res_values + + +def get_array_op(op, str_rep: Optional[str] = None): + """ + Return a binary array operation corresponding to the given operator op. + + Parameters + ---------- + op : function + Binary operator from operator or roperator module. + str_rep : str or None, default None + str_rep to pass to arithmetic_op + + Returns + ------- + function + """ + op_name = op.__name__.strip("_") + if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: + return partial(comparison_op, op=op) + elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: + return partial(logical_op, op=op) + else: + return partial(arithmetic_op, op=op, str_rep=str_rep) diff --git a/venv/Lib/site-packages/pandas/core/ops/common.py b/venv/Lib/site-packages/pandas/core/ops/common.py new file mode 100644 index 0000000..f4b16cf --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/common.py @@ -0,0 +1,66 @@ +""" +Boilerplate functions used in defining binary operations. +""" +from functools import wraps + +from pandas._libs.lib import item_from_zerodim + +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +def unpack_zerodim_and_defer(name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Parameters + ---------- + name : str + + Returns + ------- + decorator + """ + + def wrapper(method): + return _unpack_zerodim_and_defer(method, name) + + return wrapper + + +def _unpack_zerodim_and_defer(method, name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. + + Parameters + ---------- + method : binary method + name : str + + Returns + ------- + method + """ + + is_cmp = name.strip("__") in {"eq", "ne", "lt", "le", "gt", "ge"} + + @wraps(method) + def new_method(self, other): + + if is_cmp and isinstance(self, ABCIndexClass) and isinstance(other, ABCSeries): + # For comparison ops, Index does *not* defer to Series + pass + else: + for cls in [ABCDataFrame, ABCSeries, ABCIndexClass]: + if isinstance(self, cls): + break + if isinstance(other, cls): + return NotImplemented + + other = item_from_zerodim(other) + + return method(self, other) + + return new_method diff --git a/venv/Lib/site-packages/pandas/core/ops/dispatch.py b/venv/Lib/site-packages/pandas/core/ops/dispatch.py new file mode 100644 index 0000000..61a3032 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/dispatch.py @@ -0,0 +1,126 @@ +""" +Functions for defining unary operations. +""" +from typing import Any, Union + +import numpy as np + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_extension_array_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries + +from pandas.core.construction import array + + +def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: + """ + Identify cases where Series operation should use dispatch_to_extension_op. 
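+
+    This is the case when ``left`` is backed by an ExtensionArray or holds
+    datetime64/timedelta64 values, or when ``right`` is a non-scalar object
+    with an extension dtype.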
+ + Parameters + ---------- + left : Series + right : object + + Returns + ------- + bool + """ + if ( + is_extension_array_dtype(left.dtype) + or is_datetime64_dtype(left.dtype) + or is_timedelta64_dtype(left.dtype) + ): + return True + + if not is_scalar(right) and is_extension_array_dtype(right): + # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + return True + + return False + + +def should_series_dispatch(left, right, op): + """ + Identify cases where a DataFrame operation should dispatch to its + Series counterpart. + + Parameters + ---------- + left : DataFrame + right : DataFrame or Series + op : binary operator + + Returns + ------- + override : bool + """ + if left._is_mixed_type or right._is_mixed_type: + return True + + if op.__name__.strip("_") in ["and", "or", "xor", "rand", "ror", "rxor"]: + # TODO: GH references for what this fixes + # Note: this check must come before the check for nonempty columns. + return True + + if right.ndim == 1: + # operating with Series, short-circuit checks that would fail + # with AttributeError. + return False + + if not len(left.columns) or not len(right.columns): + # ensure obj.dtypes[0] exists for each obj + return False + + ldtype = left.dtypes.iloc[0] + rdtype = right.dtypes.iloc[0] + + if (is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or ( + is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype) + ): + # numpy integer dtypes as timedelta64 dtypes in this scenario + return True + + if is_datetime64_dtype(ldtype) and is_object_dtype(rdtype): + # in particular case where right is an array of DateOffsets + return True + + return False + + +def dispatch_to_extension_op( + op, left: Union[ABCExtensionArray, np.ndarray], right: Any, +): + """ + Assume that left or right is a Series backed by an ExtensionArray, + apply the operator defined by op. + + Parameters + ---------- + op : binary operator + left : ExtensionArray or np.ndarray + right : object + + Returns + ------- + ExtensionArray or np.ndarray + 2-tuple of these if op is divmod or rdivmod + """ + # NB: left and right should already be unboxed, so neither should be + # a Series or Index. + + if left.dtype.kind in "mM" and isinstance(left, np.ndarray): + # We need to cast datetime64 and timedelta64 ndarrays to + # DatetimeArray/TimedeltaArray. But we avoid wrapping others in + # PandasArray as that behaves poorly with e.g. IntegerArray. + left = array(left) + + # The op calls will raise TypeError if the op is not defined + # on the ExtensionArray + res_values = op(left, right) + return res_values diff --git a/venv/Lib/site-packages/pandas/core/ops/docstrings.py b/venv/Lib/site-packages/pandas/core/ops/docstrings.py new file mode 100644 index 0000000..e3db65f --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/docstrings.py @@ -0,0 +1,675 @@ +""" +Templating for ops docstrings +""" +from typing import Dict, Optional + + +def _make_flex_doc(op_name, typ): + """ + Make the appropriate substitutions for the given operation and class-typ + into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring + to attach to a generated method. + + Parameters + ---------- + op_name : str {'__add__', '__sub__', ... 
'__eq__', '__ne__', ...} + typ : str {series, 'dataframe']} + + Returns + ------- + doc : str + """ + op_name = op_name.replace("__", "") + op_desc = _op_descriptions[op_name] + + if op_name.startswith("r"): + equiv = "other " + op_desc["op"] + " " + typ + else: + equiv = typ + " " + op_desc["op"] + " other" + + if typ == "series": + base_doc = _flex_doc_SERIES + doc_no_examples = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + reverse=op_desc["reverse"], + ) + if op_desc["series_examples"]: + doc = doc_no_examples + op_desc["series_examples"] + else: + doc = doc_no_examples + elif typ == "dataframe": + base_doc = _flex_doc_FRAME + doc = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + reverse=op_desc["reverse"], + ) + else: + raise AssertionError("Invalid typ argument.") + return doc + + +_add_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" + +_sub_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.subtract(b, fill_value=0) +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" + +_mul_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.multiply(b, fill_value=0) +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: float64 +""" + +_div_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.divide(b, fill_value=0) +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" + +_floordiv_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.floordiv(b, fill_value=0) +a 1.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + +_mod_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.mod(b, fill_value=0) +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" +_pow_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.pow(b, fill_value=0) +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" + +_op_descriptions: Dict[str, Dict[str, Optional[str]]] = { + # 
Arithmetic Operators + "add": { + "op": "+", + "desc": "Addition", + "reverse": "radd", + "series_examples": _add_example_SERIES, + }, + "sub": { + "op": "-", + "desc": "Subtraction", + "reverse": "rsub", + "series_examples": _sub_example_SERIES, + }, + "mul": { + "op": "*", + "desc": "Multiplication", + "reverse": "rmul", + "series_examples": _mul_example_SERIES, + "df_examples": None, + }, + "mod": { + "op": "%", + "desc": "Modulo", + "reverse": "rmod", + "series_examples": _mod_example_SERIES, + }, + "pow": { + "op": "**", + "desc": "Exponential power", + "reverse": "rpow", + "series_examples": _pow_example_SERIES, + "df_examples": None, + }, + "truediv": { + "op": "/", + "desc": "Floating division", + "reverse": "rtruediv", + "series_examples": _div_example_SERIES, + "df_examples": None, + }, + "floordiv": { + "op": "//", + "desc": "Integer division", + "reverse": "rfloordiv", + "series_examples": _floordiv_example_SERIES, + "df_examples": None, + }, + "divmod": { + "op": "divmod", + "desc": "Integer division and modulo", + "reverse": "rdivmod", + "series_examples": None, + "df_examples": None, + }, + # Comparison Operators + "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None}, + "ne": { + "op": "!=", + "desc": "Not equal to", + "reverse": None, + "series_examples": None, + }, + "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None}, + "le": { + "op": "<=", + "desc": "Less than or equal to", + "reverse": None, + "series_examples": None, + }, + "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None}, + "ge": { + "op": ">=", + "desc": "Greater than or equal to", + "reverse": None, + "series_examples": None, + }, +} + +_op_names = list(_op_descriptions.keys()) +for key in _op_names: + reverse_op = _op_descriptions[key]["reverse"] + if reverse_op is not None: + _op_descriptions[reverse_op] = _op_descriptions[key].copy() + _op_descriptions[reverse_op]["reverse"] = key + +_flex_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series or scalar value +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing. +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + +Returns +------- +Series + The result of the operation. + +See Also +-------- +Series.{reverse} +""" + +_arith_doc_FRAME = """ +Binary operator %s with support to substitute a fill_value for missing data in +one of the inputs + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. 
+ If data in both corresponding DataFrame locations is missing + the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Returns +------- +result : DataFrame + +Notes +----- +Mismatched indices will be unioned together +""" + +_flex_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. + +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. +level : int or label + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing. + +Returns +------- +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. + +Notes +----- +Mismatched indices will be unioned together. + +Examples +-------- +>>> df = pd.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> df + 1 + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1) + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10) + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10) + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> df - [1, 2] + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns') + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index') + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other + angles +circle 0 +triangle 3 +rectangle 4 + +>>> df * other + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0) + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 
'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0) + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_comp_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +(rows or columns) and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +----- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... index=['A', 'B', 'C']) +>>> df + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Comparison with a scalar, using either the operator or method: + +>>> df == 100 + cost revenue +A False True +B False False +C True False + +>>> df.eq(100) + cost revenue +A False True +B False False +C True False + +When `other` is a :class:`Series`, the columns of a DataFrame are aligned +with the index of `other` and broadcast: + +>>> df != pd.Series([100, 250], index=["cost", "revenue"]) + cost revenue +A True True +B True False +C False True + +Use the method to control the broadcast axis: + +>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index') + cost revenue +A True False +B True True +C True True +D True True + +When comparing to an arbitrary sequence, the number of columns must +match the number elements in `other`: + +>>> df == [250, 100] + cost revenue +A True True +B False False +C False False + +Use the method to control the axis: + +>>> df.eq([250, 250, 100], axis='index') + cost revenue +A True False +B False True +C True False + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other) + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... 
index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B', 'C']]) +>>> df_multindex + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1) + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False +""" diff --git a/venv/Lib/site-packages/pandas/core/ops/invalid.py b/venv/Lib/site-packages/pandas/core/ops/invalid.py new file mode 100644 index 0000000..cc4a1f1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/invalid.py @@ -0,0 +1,56 @@ +""" +Templates for invalid operations. +""" +import operator + +import numpy as np + + +def invalid_comparison(left, right, op): + """ + If a comparison has mismatched types and is not necessarily meaningful, + follow python3 conventions by: + + - returning all-False for equality + - returning all-True for inequality + - raising TypeError otherwise + + Parameters + ---------- + left : array-like + right : scalar, array-like + op : operator.{eq, ne, lt, le, gt} + + Raises + ------ + TypeError : on inequality comparisons + """ + if op is operator.eq: + res_values = np.zeros(left.shape, dtype=bool) + elif op is operator.ne: + res_values = np.ones(left.shape, dtype=bool) + else: + typ = type(right).__name__ + raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}") + return res_values + + +def make_invalid_op(name: str): + """ + Return a binary method that always raises a TypeError. + + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + + def invalid_op(self, other=None): + typ = type(self).__name__ + raise TypeError(f"cannot perform {name} with this index type: {typ}") + + invalid_op.__name__ = name + return invalid_op diff --git a/venv/Lib/site-packages/pandas/core/ops/mask_ops.py b/venv/Lib/site-packages/pandas/core/ops/mask_ops.py new file mode 100644 index 0000000..8fb81fa --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/mask_ops.py @@ -0,0 +1,178 @@ +""" +Ops for masked arrays. +""" +from typing import Optional, Union + +import numpy as np + +from pandas._libs import lib, missing as libmissing + + +def kleene_or( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``or`` using Kleene logic. + + Values are NA where we have ``NA | NA`` or ``NA | False``. + ``NA | True`` is considered True. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical or, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, since because + # A | B == B | A + if left_mask is None: + return kleene_or(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + + raise_for_nan(right, method="or") + + if right is libmissing.NA: + result = left.copy() + else: + result = left | right + + if right_mask is not None: + # output is unknown where (False & NA), (NA & False), (NA & NA) + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = ( + (left_false & right_mask) + | (right_false & left_mask) + | (left_mask & right_mask) + ) + else: + if right is True: + mask = np.zeros_like(left_mask) + elif right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + # False + mask = left_mask.copy() + + return result, mask + + +def kleene_xor( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``xor`` using Kleene logic. + + This is the same as ``or``, with the following adjustments + + * True, True -> False + * True, NA -> NA + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + if left_mask is None: + return kleene_xor(right, left, right_mask, left_mask) + + raise_for_nan(right, method="xor") + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left ^ right + + if right_mask is None: + if right is libmissing.NA: + mask = np.ones_like(left_mask) + else: + mask = left_mask.copy() + else: + mask = left_mask | right_mask + + return result, mask + + +def kleene_and( + left: Union[bool, libmissing.NAType, np.ndarray], + right: Union[bool, libmissing.NAType, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``and`` using Kleene logic. + + Values are ``NA`` for ``NA & NA`` or ``True & NA``. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, since because + # A | B == B | A + if left_mask is None: + return kleene_and(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + raise_for_nan(right, method="and") + + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left & right + + if right_mask is None: + # Scalar `right` + if right is libmissing.NA: + mask = (left & ~left_mask) | left_mask + + else: + mask = left_mask.copy() + if right is False: + # unmask everything + mask[:] = False + else: + # unmask where either left or right is False + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = (left_mask & ~right_false) | (right_mask & ~left_false) + + return result, mask + + +def raise_for_nan(value, method): + if lib.is_float(value) and np.isnan(value): + raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/venv/Lib/site-packages/pandas/core/ops/methods.py b/venv/Lib/site-packages/pandas/core/ops/methods.py new file mode 100644 index 0000000..c046585 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/methods.py @@ -0,0 +1,233 @@ +""" +Functions to generate methods and pin them to the appropriate classes. +""" +import operator + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCSparseArray + +from pandas.core.ops.roperator import ( + radd, + rand_, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + + +def _get_method_wrappers(cls): + """ + Find the appropriate operation-wrappers to use when defining flex/special + arithmetic, boolean, and comparison operations with the given class. + + Parameters + ---------- + cls : class + + Returns + ------- + arith_flex : function or None + comp_flex : function or None + arith_special : function + comp_special : function + bool_special : function + + Notes + ----- + None is only returned for SparseArray + """ + # TODO: make these non-runtime imports once the relevant functions + # are no longer in __init__ + from pandas.core.ops import ( + _arith_method_FRAME, + _arith_method_SERIES, + _bool_method_SERIES, + _comp_method_FRAME, + _comp_method_SERIES, + _flex_comp_method_FRAME, + _flex_method_SERIES, + ) + + if issubclass(cls, ABCSeries): + # Just Series + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SERIES + comp_special = _comp_method_SERIES + bool_special = _bool_method_SERIES + elif issubclass(cls, ABCDataFrame): + arith_flex = _arith_method_FRAME + comp_flex = _flex_comp_method_FRAME + arith_special = _arith_method_FRAME + comp_special = _comp_method_FRAME + bool_special = _arith_method_FRAME + return arith_flex, comp_flex, arith_special, comp_special, bool_special + + +def add_special_arithmetic_methods(cls): + """ + Adds the full suite of special arithmetic methods (``__add__``, + ``__sub__``, etc.) to the class. 
+ + Parameters + ---------- + cls : class + special methods will be defined and pinned to this class + """ + _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) + # inplace operators (I feel like these should get passed an `inplace=True` + # or just be removed + + def _wrap_inplace_method(method): + """ + return an inplace wrapper for this method + """ + + def f(self, other): + result = method(self, other) + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False)._data, verify_is_copy=False + ) + + return self + + name = method.__name__.strip("__") + f.__name__ = f"__i{name}__" + return f + + new_methods.update( + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) + + new_methods.update( + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) + + _add_methods(cls, new_methods=new_methods) + + +def add_flex_arithmetic_methods(cls): + """ + Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) + to the class. + + Parameters + ---------- + cls : class + flex methods will be defined and pinned to this class + """ + flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, special=False + ) + new_methods.update( + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) + ) + # opt out of bool flex methods for now + assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) + + _add_methods(cls, new_methods=new_methods) + + +def _create_methods(cls, arith_method, comp_method, bool_method, special): + # creates actual methods based upon arithmetic, comp and bool method + # constructors. 
+ + have_divmod = issubclass(cls, ABCSeries) + # divmod is available for Series + + new_methods = dict( + add=arith_method(cls, operator.add, special), + radd=arith_method(cls, radd, special), + sub=arith_method(cls, operator.sub, special), + mul=arith_method(cls, operator.mul, special), + truediv=arith_method(cls, operator.truediv, special), + floordiv=arith_method(cls, operator.floordiv, special), + # Causes a floating point exception in the tests when numexpr enabled, + # so for now no speedup + mod=arith_method(cls, operator.mod, special), + pow=arith_method(cls, operator.pow, special), + # not entirely sure why this is necessary, but previously was included + # so it's here to maintain compatibility + rmul=arith_method(cls, rmul, special), + rsub=arith_method(cls, rsub, special), + rtruediv=arith_method(cls, rtruediv, special), + rfloordiv=arith_method(cls, rfloordiv, special), + rpow=arith_method(cls, rpow, special), + rmod=arith_method(cls, rmod, special), + ) + new_methods["div"] = new_methods["truediv"] + new_methods["rdiv"] = new_methods["rtruediv"] + if have_divmod: + # divmod doesn't have an op that is supported by numexpr + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) + + new_methods.update( + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) + ) + + if bool_method: + new_methods.update( + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + # For some reason ``^`` wasn't used in original. + xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) + + if special: + dunderize = lambda x: f"__{x.strip('_')}__" + else: + dunderize = lambda x: x + new_methods = {dunderize(k): v for k, v in new_methods.items()} + return new_methods + + +def _add_methods(cls, new_methods): + for name, method in new_methods.items(): + # For most methods, if we find that the class already has a method + # of the same name, it is OK to over-write it. The exception is + # inplace methods (__iadd__, __isub__, ...) for SparseArray, which + # retain the np.ndarray versions. + force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) + if force or name not in cls.__dict__: + setattr(cls, name, method) diff --git a/venv/Lib/site-packages/pandas/core/ops/missing.py b/venv/Lib/site-packages/pandas/core/ops/missing.py new file mode 100644 index 0000000..5039ffa --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/missing.py @@ -0,0 +1,179 @@ +""" +Missing data handling for arithmetic operations. + +In particular, pandas conventions regarding division by zero differ +from numpy in the following ways: + 1) np.array([-1, 0, 1], dtype=dtype1) // np.array([0, 0, 0], dtype=dtype2) + gives [nan, nan, nan] for most dtype combinations, and [0, 0, 0] for + the remaining pairs + (the remaining being dtype1==dtype2==intN and dtype==dtype2==uintN). + + pandas convention is to return [-inf, nan, inf] for all dtype + combinations. + + Note: the numpy behavior described here is py3-specific. + + 2) np.array([-1, 0, 1], dtype=dtype1) % np.array([0, 0, 0], dtype=dtype2) + gives precisely the same results as the // operation. 
+ + pandas convention is to return [nan, nan, nan] for all dtype + combinations. + + 3) divmod behavior consistent with 1) and 2). +""" +import operator + +import numpy as np + +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_scalar + +from pandas.core.ops.roperator import rdivmod, rfloordiv, rmod + + +def fill_zeros(result, x, y): + """ + If this is a reversed op, then flip x,y + + If we have an integer value (or array in y) + and we have 0's, fill them with np.nan, + return the result. + + Mask the nan's from x. + """ + if is_float_dtype(result.dtype): + return result + + is_variable_type = hasattr(y, "dtype") or hasattr(y, "type") + is_scalar_type = is_scalar(y) + + if not is_variable_type and not is_scalar_type: + return result + + if is_scalar_type: + y = np.array(y) + + if is_integer_dtype(y.dtype): + + if (y == 0).any(): + + # GH#7325, mask and nans must be broadcastable (also: GH#9308) + # Raveling and then reshaping makes np.putmask faster + mask = ((y == 0) & ~np.isnan(result)).ravel() + + shape = result.shape + result = result.astype("float64", copy=False).ravel() + + np.putmask(result, mask, np.nan) + + result = result.reshape(shape) + + return result + + +def mask_zero_div_zero(x, y, result): + """ + Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes + of the numerator or the denominator. + + Parameters + ---------- + x : ndarray + y : ndarray + result : ndarray + + Returns + ------- + filled_result : ndarray + + Examples + -------- + >>> x = np.array([1, 0, -1], dtype=np.int64) + >>> y = 0 # int 0; numpy behavior is different with float + >>> result = x / y + >>> result # raw numpy result does not fill division by zero + array([0, 0, 0]) + >>> mask_zero_div_zero(x, y, result) + array([ inf, nan, -inf]) + """ + if not isinstance(result, np.ndarray): + # FIXME: SparseArray would raise TypeError with np.putmask + return result + + if is_scalar(y): + y = np.array(y) + + zmask = y == 0 + + if isinstance(zmask, bool): + # FIXME: numpy did not evaluate pointwise, seen in docs build + return result + + if zmask.any(): + shape = result.shape + + # Flip sign if necessary for -0.0 + zneg_mask = zmask & np.signbit(y) + zpos_mask = zmask & ~zneg_mask + + nan_mask = (zmask & (x == 0)).ravel() + with np.errstate(invalid="ignore"): + neginf_mask = ((zpos_mask & (x < 0)) | (zneg_mask & (x > 0))).ravel() + posinf_mask = ((zpos_mask & (x > 0)) | (zneg_mask & (x < 0))).ravel() + + if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): + # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN + result = result.astype("float64", copy=False).ravel() + + np.putmask(result, nan_mask, np.nan) + np.putmask(result, posinf_mask, np.inf) + np.putmask(result, neginf_mask, -np.inf) + + result = result.reshape(shape) + + return result + + +def dispatch_fill_zeros(op, left, right, result): + """ + Call fill_zeros with the appropriate fill value depending on the operation, + with special logic for divmod and rdivmod. + + Parameters + ---------- + op : function (operator.add, operator.div, ...) + left : object (np.ndarray for non-reversed ops) + right : object (np.ndarray for reversed ops) + result : ndarray + + Returns + ------- + result : np.ndarray + + Notes + ----- + For divmod and rdivmod, the `result` parameter and returned `result` + is a 2-tuple of ndarray objects. 
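+
+    Examples
+    --------
+    A minimal sketch, assuming 1-D int64 operands:
+
+    >>> import operator
+    >>> import numpy as np
+    >>> from pandas.core.ops.missing import dispatch_fill_zeros
+    >>> x = np.array([-1, 0, 1], dtype=np.int64)
+    >>> y = np.array([0, 0, 0], dtype=np.int64)
+    >>> with np.errstate(all="ignore"):
+    ...     raw = x // y
+    >>> dispatch_fill_zeros(operator.floordiv, x, y, raw)
+    array([-inf,  nan,  inf])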
+ """ + if op is divmod: + result = ( + mask_zero_div_zero(left, right, result[0]), + fill_zeros(result[1], left, right), + ) + elif op is rdivmod: + result = ( + mask_zero_div_zero(right, left, result[0]), + fill_zeros(result[1], right, left), + ) + elif op is operator.floordiv: + # Note: no need to do this for truediv; in py3 numpy behaves the way + # we want. + result = mask_zero_div_zero(left, right, result) + elif op is rfloordiv: + # Note: no need to do this for rtruediv; in py3 numpy behaves the way + # we want. + result = mask_zero_div_zero(right, left, result) + elif op is operator.mod: + result = fill_zeros(result, left, right) + elif op is rmod: + result = fill_zeros(result, right, left) + return result diff --git a/venv/Lib/site-packages/pandas/core/ops/roperator.py b/venv/Lib/site-packages/pandas/core/ops/roperator.py new file mode 100644 index 0000000..e6691dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/ops/roperator.py @@ -0,0 +1,60 @@ +""" +Reversed Operations not available in the stdlib operator module. +Defining these instead of using lambdas allows us to reference them by name. +""" +import operator + + +def radd(left, right): + return right + left + + +def rsub(left, right): + return right - left + + +def rmul(left, right): + return right * left + + +def rdiv(left, right): + return right / left + + +def rtruediv(left, right): + return right / left + + +def rfloordiv(left, right): + return right // left + + +def rmod(left, right): + # check if right is a string as % is the string + # formatting operation; this is a TypeError + # otherwise perform the op + if isinstance(right, str): + typ = type(left).__name__ + raise TypeError(f"{typ} cannot perform the operation mod") + + return right % left + + +def rdivmod(left, right): + return divmod(right, left) + + +def rpow(left, right): + return right ** left + + +def rand_(left, right): + return operator.and_(right, left) + + +def ror_(left, right): + return operator.or_(right, left) + + +def rxor(left, right): + return operator.xor(right, left) diff --git a/venv/Lib/site-packages/pandas/core/resample.py b/venv/Lib/site-packages/pandas/core/resample.py new file mode 100644 index 0000000..0e43880 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/resample.py @@ -0,0 +1,1799 @@ +import copy +from datetime import timedelta +from textwrap import dedent +from typing import Dict, no_type_check + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import NaT, Period, Timestamp +from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod +from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +import pandas.core.algorithms as algos +from pandas.core.base import DataError, ShallowMixin +from pandas.core.generic import _shared_docs +from pandas.core.groupby.base import GroupByMixin +from pandas.core.groupby.generic import SeriesGroupBy +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby +from pandas.core.groupby.grouper import Grouper +from pandas.core.groupby.ops import BinGrouper +from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range + +from pandas.tseries.frequencies 
import to_offset +from pandas.tseries.offsets import DateOffset, Day, Nano, Tick + +_shared_docs_kwargs: Dict[str, str] = dict() + + +class Resampler(_GroupBy, ShallowMixin): + """ + Class for resampling datetimelike data, a groupby-like operation. + See aggregate, transform, and apply functions on this object. + + It's easiest to use obj.resample(...) to use Resampler. + + Parameters + ---------- + obj : pandas object + groupby : a TimeGrouper object + axis : int, default 0 + kind : str or None + 'period', 'timestamp' to override default index treatment + + Returns + ------- + a Resampler of the appropriate type + + Notes + ----- + After resampling, see aggregate, apply, and transform functions. + """ + + # to the groupby descriptor + _attributes = [ + "freq", + "axis", + "closed", + "label", + "convention", + "loffset", + "base", + "kind", + ] + + def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): + self.groupby = groupby + self.keys = None + self.sort = True + self.axis = axis + self.kind = kind + self.squeeze = False + self.group_keys = True + self.as_index = True + self.exclusions = set() + self.binner = None + self.grouper = None + + if self.groupby is not None: + self.groupby._set_grouper(self._convert_obj(obj), sort=True) + + def __str__(self) -> str: + """ + Provide a nice str repr of our rolling object. + """ + attrs = ( + f"{k}={getattr(self.groupby, k)}" + for k in self._attributes + if getattr(self.groupby, k, None) is not None + ) + return f"{type(self).__name__} [{', '.join(attrs)}]" + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self._attributes: + return getattr(self.groupby, attr) + if attr in self.obj: + return self[attr] + + return object.__getattribute__(self, attr) + + def __iter__(self): + """ + Resampler iterator. + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group. + + See Also + -------- + GroupBy.__iter__ + """ + self._set_binner() + return super().__iter__() + + @property + def obj(self): + return self.groupby.obj + + @property + def ax(self): + return self.groupby.ax + + @property + def _typ(self) -> str: + """ + Masquerade for compat as a Series or a DataFrame. + """ + if isinstance(self._selected_obj, ABCSeries): + return "series" + return "dataframe" + + @property + def _from_selection(self) -> bool: + """ + Is the resampling from a DataFrame column or MultiIndex level. + """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise an error + return self.groupby is not None and ( + self.groupby.key is not None or self.groupby.level is not None + ) + + def _convert_obj(self, obj): + """ + Provide any conversions for the object in order to correctly handle. + + Parameters + ---------- + obj : the object to be resampled + + Returns + ------- + obj : converted object + """ + obj = obj._consolidate() + return obj + + def _get_binner_for_time(self): + raise AbstractMethodError(self) + + def _set_binner(self): + """ + Setup our binners. + + Cache these as we are an immutable object + """ + if self.binner is None: + self.binner, self.grouper = self._get_binner() + + def _get_binner(self): + """ + Create the BinGrouper, assume that self.set_grouper(obj) + has already been called. 
+ """ + + binner, bins, binlabels = self._get_binner_for_time() + assert len(bins) == len(binlabels) + bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) + return binner, bin_grouper + + def _assure_grouper(self): + """ + Make sure that we are creating our binner & grouper. + """ + self._set_binner() + + @Substitution( + klass="Resampler", + versionadded=".. versionadded:: 0.23.0", + examples=""" + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1""", + ) + @Appender(_pipe_template) + def pipe(self, func, *args, **kwargs): + return super().pipe(func, *args, **kwargs) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.groupby.aggregate + DataFrame.resample.transform + DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1,2,3,4,5], + index=pd.date_range('20130101', periods=5,freq='s')) + 2013-01-01 00:00:00 1 + 2013-01-01 00:00:01 2 + 2013-01-01 00:00:02 3 + 2013-01-01 00:00:03 4 + 2013-01-01 00:00:04 5 + Freq: S, dtype: int64 + + >>> r = s.resample('2s') + DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left, + label=left, convention=start, base=0] + + >>> r.agg(np.sum) + 2013-01-01 00:00:00 3 + 2013-01-01 00:00:02 7 + 2013-01-01 00:00:04 5 + Freq: 2S, dtype: int64 + + >>> r.agg(['sum','mean','max']) + sum mean max + 2013-01-01 00:00:00 3 1.5 2 + 2013-01-01 00:00:02 7 3.5 4 + 2013-01-01 00:00:04 5 5.0 5 + + >>> r.agg({'result' : lambda x: x.mean() / x.std(), + 'total' : np.sum}) + total result + 2013-01-01 00:00:00 3 2.121320 + 2013-01-01 00:00:02 7 4.949747 + 2013-01-01 00:00:04 5 NaN + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + + self._set_binner() + result, how = self._aggregate(func, *args, **kwargs) + if result is None: + how = func + grouper = None + result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) + + result = self._apply_loffset(result) + return result + + agg = aggregate + apply = aggregate + + def transform(self, arg, *args, **kwargs): + """ + Call function producing a like-indexed Series on each group and return + a Series with the transformed values. + + Parameters + ---------- + arg : function + To apply to each group. Should return a Series with the same index. + + Returns + ------- + transformed : Series + + Examples + -------- + >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) + """ + return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs) + + def _downsample(self, f): + raise AbstractMethodError(self) + + def _upsample(self, f, limit=None, fill_value=None): + raise AbstractMethodError(self) + + def _gotitem(self, key, ndim: int, subset=None): + """ + Sub-classes to define. Return a sliced object. 
+ + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + self._set_binner() + grouper = self.grouper + if subset is None: + subset = self.obj + grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) + + # try the key selection + try: + return grouped[key] + except KeyError: + return grouped + + def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): + """ + Re-evaluate the obj with a groupby aggregation. + """ + + if grouper is None: + self._set_binner() + grouper = self.grouper + + obj = self._selected_obj + + grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) + + try: + if isinstance(obj, ABCDataFrame) and callable(how): + # Check if the function is reducing or not. + result = grouped._aggregate_item_by_item(how, *args, **kwargs) + else: + result = grouped.aggregate(how, *args, **kwargs) + except DataError: + # we have a non-reducing function; try to evaluate + result = grouped.apply(how, *args, **kwargs) + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + pass + elif "len(index) != len(labels)" in str(err): + # raised in libgroupby validation + pass + elif "No objects to concatenate" in str(err): + # raised in concat call + # In tests this is reached via either + # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique + pass + else: + raise + + # we have a non-reducing function + # try to evaluate + result = grouped.apply(how, *args, **kwargs) + + result = self._apply_loffset(result) + return self._wrap_result(result) + + def _apply_loffset(self, result): + """ + If loffset is set, offset the result index. + + This is NOT an idempotent routine, it will be applied + exactly once to the result. + + Parameters + ---------- + result : Series or DataFrame + the result of resample + """ + + needs_offset = ( + isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64)) + and isinstance(result.index, DatetimeIndex) + and len(result.index) > 0 + ) + + if needs_offset: + result.index = result.index + self.loffset + + self.loffset = None + return result + + def _get_resampler_for_grouping(self, groupby, **kwargs): + """ + Return the correct class for resampling with groupby. + """ + return self._resampler_for_grouping(self, groupby=groupby, **kwargs) + + def _wrap_result(self, result): + """ + Potentially wrap any results. + """ + if isinstance(result, ABCSeries) and self._selection is not None: + result.name = self._selection + + if isinstance(result, ABCSeries) and result.empty: + obj = self.obj + if isinstance(obj.index, PeriodIndex): + result.index = obj.index.asfreq(self.freq) + else: + result.index = obj.index._shallow_copy(freq=self.freq) + result.name = getattr(obj, "name", None) + + return result + + def pad(self, limit=None): + """ + Forward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + An upsampled Series. + + See Also + -------- + Series.fillna + DataFrame.fillna + """ + return self._upsample("pad", limit=limit) + + ffill = pad + + def nearest(self, limit=None): + """ + Resample by using the nearest value. + + When resampling data, missing values may appear (e.g., when the + resampling frequency is higher than the original frequency). 
+ The `nearest` method will replace ``NaN`` values that appeared in + the resampled data with the value from the nearest member of the + sequence, based on the index value. + Missing values that existed in the original data will not be modified. + If `limit` is given, fill only this many values in each direction for + each of the original values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + .. versionadded:: 0.21.0 + + Returns + ------- + Series or DataFrame + An upsampled Series or DataFrame with ``NaN`` values filled with + their nearest value. + + See Also + -------- + backfill : Backward fill the new missing values in the resampled data. + pad : Forward fill ``NaN`` values. + + Examples + -------- + >>> s = pd.Series([1, 2], + ... index=pd.date_range('20180101', + ... periods=2, + ... freq='1h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + Freq: H, dtype: int64 + + >>> s.resample('15min').nearest() + 2018-01-01 00:00:00 1 + 2018-01-01 00:15:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 00:45:00 2 + 2018-01-01 01:00:00 2 + Freq: 15T, dtype: int64 + + Limit the number of upsampled values imputed by the nearest: + + >>> s.resample('15min').nearest(limit=1) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + Freq: 15T, dtype: float64 + """ + return self._upsample("nearest", limit=limit) + + def backfill(self, limit=None): + """ + Backward fill the new missing values in the resampled data. + + In statistics, imputation is the process of replacing missing data with + substituted values [1]_. When resampling data, missing values may + appear (e.g., when the resampling frequency is higher than the original + frequency). The backward fill will replace NaN values that appeared in + the resampled data with the next value in the original sequence. + Missing values that existed in the original data will not be modified. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series, DataFrame + An upsampled Series or DataFrame with backward filled NaN values. + + See Also + -------- + bfill : Alias of backfill. + fillna : Fill NaN values using the specified method, which can be + 'backfill'. + nearest : Fill NaN values with nearest neighbor starting from center. + pad : Forward fill NaN values. + Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'backfill'. + DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'backfill'. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + + Examples + -------- + + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... 
index=pd.date_range('20180101', periods=3, freq='h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: H, dtype: int64 + + >>> s.resample('30min').backfill() + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + >>> s.resample('15min').backfill(limit=2) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 NaN + 2018-01-01 00:30:00 2.0 + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:15:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 01:45:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 15T, dtype: float64 + + Resampling a DataFrame that has missing values: + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('30min').backfill() + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('15min').backfill(limit=2) + a b + 2018-01-01 00:00:00 2.0 1.0 + 2018-01-01 00:15:00 NaN NaN + 2018-01-01 00:30:00 NaN 3.0 + 2018-01-01 00:45:00 NaN 3.0 + 2018-01-01 01:00:00 NaN 3.0 + 2018-01-01 01:15:00 NaN NaN + 2018-01-01 01:30:00 6.0 5.0 + 2018-01-01 01:45:00 6.0 5.0 + 2018-01-01 02:00:00 6.0 5.0 + """ + return self._upsample("backfill", limit=limit) + + bfill = backfill + + def fillna(self, method, limit=None): + """ + Fill missing values introduced by upsampling. + + In statistics, imputation is the process of replacing missing data with + substituted values [1]_. When resampling data, missing values may + appear (e.g., when the resampling frequency is higher than the original + frequency). + + Missing values that existed in the original data will + not be modified. + + Parameters + ---------- + method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'} + Method to use for filling holes in resampled data + + * 'pad' or 'ffill': use previous valid observation to fill gap + (forward fill). + * 'backfill' or 'bfill': use next valid observation to fill gap. + * 'nearest': use nearest valid observation to fill gap. + + limit : int, optional + Limit of how many consecutive missing values to fill. + + Returns + ------- + Series or DataFrame + An upsampled Series or DataFrame with missing values filled. + + See Also + -------- + backfill : Backward fill NaN values in the resampled data. + pad : Forward fill NaN values in the resampled data. + nearest : Fill NaN values in the resampled data + with nearest neighbor starting from center. + interpolate : Fill NaN values using interpolation. + Series.fillna : Fill NaN values in the Series using the + specified method, which can be 'bfill' and 'ffill'. + DataFrame.fillna : Fill NaN values in the DataFrame using the + specified method, which can be 'bfill' and 'ffill'. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + + Examples + -------- + Resampling a Series: + + >>> s = pd.Series([1, 2, 3], + ... 
index=pd.date_range('20180101', periods=3, freq='h')) + >>> s + 2018-01-01 00:00:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 02:00:00 3 + Freq: H, dtype: int64 + + Without filling the missing values you get: + + >>> s.resample("30min").asfreq() + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> s.resample('30min').fillna("backfill") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + >>> s.resample('15min').fillna("backfill", limit=2) + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:15:00 NaN + 2018-01-01 00:30:00 2.0 + 2018-01-01 00:45:00 2.0 + 2018-01-01 01:00:00 2.0 + 2018-01-01 01:15:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 01:45:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 15T, dtype: float64 + + >>> s.resample('30min').fillna("pad") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 1 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 2 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + >>> s.resample('30min').fillna("nearest") + 2018-01-01 00:00:00 1 + 2018-01-01 00:30:00 2 + 2018-01-01 01:00:00 2 + 2018-01-01 01:30:00 3 + 2018-01-01 02:00:00 3 + Freq: 30T, dtype: int64 + + Missing values present before the upsampling are not affected. + + >>> sm = pd.Series([1, None, 3], + ... index=pd.date_range('20180101', periods=3, freq='h')) + >>> sm + 2018-01-01 00:00:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: H, dtype: float64 + + >>> sm.resample('30min').fillna('backfill') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> sm.resample('30min').fillna('pad') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 1.0 + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 NaN + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + >>> sm.resample('30min').fillna('nearest') + 2018-01-01 00:00:00 1.0 + 2018-01-01 00:30:00 NaN + 2018-01-01 01:00:00 NaN + 2018-01-01 01:30:00 3.0 + 2018-01-01 02:00:00 3.0 + Freq: 30T, dtype: float64 + + DataFrame resampling is done column-wise. All the same options are + available. + + >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]}, + ... index=pd.date_range('20180101', periods=3, + ... freq='h')) + >>> df + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 02:00:00 6.0 5 + + >>> df.resample('30min').fillna("bfill") + a b + 2018-01-01 00:00:00 2.0 1 + 2018-01-01 00:30:00 NaN 3 + 2018-01-01 01:00:00 NaN 3 + 2018-01-01 01:30:00 6.0 5 + 2018-01-01 02:00:00 6.0 5 + """ + return self._upsample(method, limit=limit) + + @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs, + ): + """ + Interpolate values according to different methods. + """ + result = self._upsample(None) + return result.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, + ) + + def asfreq(self, fill_value=None): + """ + Return the values at the new freq, essentially a reindex. + + Parameters + ---------- + fill_value : scalar, optional + Value to use for missing values, applied during upsampling (note + this does not fill NaNs that already were present). 
+ + Returns + ------- + DataFrame or Series + Values at the specified freq. + + See Also + -------- + Series.asfreq + DataFrame.asfreq + """ + return self._upsample("asfreq", fill_value=fill_value) + + def std(self, ddof=1, *args, **kwargs): + """ + Compute standard deviation of groups, excluding missing values. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + Returns + ------- + DataFrame or Series + Standard deviation of values within each group. + """ + nv.validate_resampler_func("std", args, kwargs) + return self._downsample("std", ddof=ddof) + + def var(self, ddof=1, *args, **kwargs): + """ + Compute variance of groups, excluding missing values. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + Returns + ------- + DataFrame or Series + Variance of values within each group. + """ + nv.validate_resampler_func("var", args, kwargs) + return self._downsample("var", ddof=ddof) + + @Appender(GroupBy.size.__doc__) + def size(self): + result = self._downsample("size") + if not len(self.ax): + from pandas import Series + + if self._selected_obj.ndim == 1: + name = self._selected_obj.name + else: + name = None + result = Series([], index=result.index, dtype="int64", name=name) + return result + + @Appender(GroupBy.count.__doc__) + def count(self): + result = self._downsample("count") + if not len(self.ax): + if self._selected_obj.ndim == 1: + result = type(self._selected_obj)( + [], index=result.index, dtype="int64", name=self._selected_obj.name + ) + else: + from pandas import DataFrame + + result = DataFrame( + [], index=result.index, columns=result.columns, dtype="int64" + ) + + return result + + def quantile(self, q=0.5, **kwargs): + """ + Return value at the given quantile. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + + Returns + ------- + DataFrame or Series + Quantile of values within each group. + + See Also + -------- + Series.quantile + DataFrame.quantile + DataFrameGroupBy.quantile + """ + return self._downsample("quantile", q=q, **kwargs) + + +# downsample methods +for method in ["sum", "prod"]: + + def f(self, _method=method, min_count=0, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) + return self._downsample(_method, min_count=min_count) + + f.__doc__ = getattr(GroupBy, method).__doc__ + setattr(Resampler, method, f) + + +# downsample methods +for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: + + def g(self, _method=method, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) + return self._downsample(_method) + + g.__doc__ = getattr(GroupBy, method).__doc__ + setattr(Resampler, method, g) + + +# series only methods +for method in ["nunique"]: + + def h(self, _method=method): + return self._downsample(_method) + + h.__doc__ = getattr(SeriesGroupBy, method).__doc__ + setattr(Resampler, method, h) + + +class _GroupByMixin(GroupByMixin): + """ + Provide the groupby facilities. 
+ """ + + def __init__(self, obj, *args, **kwargs): + + parent = kwargs.pop("parent", None) + groupby = kwargs.pop("groupby", None) + if parent is None: + parent = obj + + # initialize our GroupByMixin object with + # the resampler attributes + for attr in self._attributes: + setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) + + super().__init__(None) + self._groupby = groupby + self._groupby.mutated = True + self._groupby.grouper.mutated = True + self.groupby = copy.copy(parent.groupby) + + @no_type_check + def _apply(self, f, grouper=None, *args, **kwargs): + """ + Dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object. + """ + + def func(x): + x = self._shallow_copy(x, groupby=self.groupby) + + if isinstance(f, str): + return getattr(x, f)(**kwargs) + + return x.apply(f, *args, **kwargs) + + result = self._groupby.apply(func) + return self._wrap_result(result) + + _upsample = _apply + _downsample = _apply + _groupby_and_aggregate = _apply + + +class DatetimeIndexResampler(Resampler): + @property + def _resampler_for_grouping(self): + return DatetimeIndexResamplerGroupby + + def _get_binner_for_time(self): + + # this is how we are actually creating the bins + if self.kind == "period": + return self.groupby._get_time_period_bins(self.ax) + return self.groupby._get_time_bins(self.ax) + + def _downsample(self, how, **kwargs): + """ + Downsample the cython defined function. + + Parameters + ---------- + how : string / cython mapped function + **kwargs : kw args passed to how function + """ + self._set_binner() + how = self._get_cython_func(how) or how + ax = self.ax + obj = self._selected_obj + + if not len(ax): + # reset to the new freq + obj = obj.copy() + obj.index._set_freq(self.freq) + return obj + + # do we have a regular frequency + if ax.freq is not None or ax.inferred_freq is not None: + + if len(self.grouper.binlabels) > len(ax) and how is None: + + # let's do an asfreq + return self.asfreq() + + # we are downsampling + # we want to call the actual grouper method here + result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs) + + result = self._apply_loffset(result) + return self._wrap_result(result) + + def _adjust_binner_for_upsample(self, binner): + """ + Adjust our binner when upsampling. + + The range of a new index should not be outside specified range + """ + if self.closed == "right": + binner = binner[1:] + else: + binner = binner[:-1] + return binner + + def _upsample(self, method, limit=None, fill_value=None): + """ + Parameters + ---------- + method : string {'backfill', 'bfill', 'pad', + 'ffill', 'asfreq'} method for upsampling + limit : int, default None + Maximum size gap to fill when reindexing + fill_value : scalar, default None + Value to use for missing values + + See Also + -------- + .fillna + + """ + self._set_binner() + if self.axis: + raise AssertionError("axis must be 0") + if self._from_selection: + raise ValueError( + "Upsampling from level= or on= selection " + "is not supported, use .set_index(...) 
" + "to explicitly set index to datetime-like" + ) + + ax = self.ax + obj = self._selected_obj + binner = self.binner + res_index = self._adjust_binner_for_upsample(binner) + + # if we have the same frequency as our axis, then we are equal sampling + if limit is None and to_offset(ax.inferred_freq) == self.freq: + result = obj.copy() + result.index = res_index + else: + result = obj.reindex( + res_index, method=method, limit=limit, fill_value=fill_value + ) + + result = self._apply_loffset(result) + return self._wrap_result(result) + + def _wrap_result(self, result): + result = super()._wrap_result(result) + + # we may have a different kind that we were asked originally + # convert if needed + if self.kind == "period" and not isinstance(result.index, PeriodIndex): + result.index = result.index.to_period(self.freq) + return result + + +class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): + """ + Provides a resample of a groupby implementation + """ + + @property + def _constructor(self): + return DatetimeIndexResampler + + +class PeriodIndexResampler(DatetimeIndexResampler): + @property + def _resampler_for_grouping(self): + return PeriodIndexResamplerGroupby + + def _get_binner_for_time(self): + if self.kind == "timestamp": + return super()._get_binner_for_time() + return self.groupby._get_period_bins(self.ax) + + def _convert_obj(self, obj): + obj = super()._convert_obj(obj) + + if self._from_selection: + # see GH 14008, GH 12871 + msg = ( + "Resampling from level= or on= selection " + "with a PeriodIndex is not currently supported, " + "use .set_index(...) to explicitly set index" + ) + raise NotImplementedError(msg) + + if self.loffset is not None: + # Cannot apply loffset/timedelta to PeriodIndex -> convert to + # timestamps + self.kind = "timestamp" + + # convert to timestamp + if self.kind == "timestamp": + obj = obj.to_timestamp(how=self.convention) + + return obj + + def _downsample(self, how, **kwargs): + """ + Downsample the cython defined function. + + Parameters + ---------- + how : string / cython mapped function + **kwargs : kw args passed to how function + """ + + # we may need to actually resample as if we are timestamps + if self.kind == "timestamp": + return super()._downsample(how, **kwargs) + + how = self._get_cython_func(how) or how + ax = self.ax + + if is_subperiod(ax.freq, self.freq): + # Downsampling + return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs) + elif is_superperiod(ax.freq, self.freq): + if how == "ohlc": + # GH #13083 + # upsampling to subperiods is handled as an asfreq, which works + # for pure aggregating/reducing methods + # OHLC reduces along the time dimension, but creates multiple + # values for each period -> handle by _groupby_and_aggregate() + return self._groupby_and_aggregate(how, grouper=self.grouper) + return self.asfreq() + elif ax.freq == self.freq: + return self.asfreq() + + raise IncompatibleFrequency( + f"Frequency {ax.freq} cannot be resampled to {self.freq}, " + "as they are not sub or super periods" + ) + + def _upsample(self, method, limit=None, fill_value=None): + """ + Parameters + ---------- + method : string {'backfill', 'bfill', 'pad', 'ffill'} + Method for upsampling. + limit : int, default None + Maximum size gap to fill when reindexing. + fill_value : scalar, default None + Value to use for missing values. 
+ + See Also + -------- + .fillna + + """ + + # we may need to actually resample as if we are timestamps + if self.kind == "timestamp": + return super()._upsample(method, limit=limit, fill_value=fill_value) + + self._set_binner() + ax = self.ax + obj = self.obj + new_index = self.binner + + # Start vs. end of period + memb = ax.asfreq(self.freq, how=self.convention) + + # Get the fill indexer + indexer = memb.get_indexer(new_index, method=method, limit=limit) + return self._wrap_result( + _take_new_index(obj, indexer, new_index, axis=self.axis) + ) + + +class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): + """ + Provides a resample of a groupby implementation. + """ + + @property + def _constructor(self): + return PeriodIndexResampler + + +class TimedeltaIndexResampler(DatetimeIndexResampler): + @property + def _resampler_for_grouping(self): + return TimedeltaIndexResamplerGroupby + + def _get_binner_for_time(self): + return self.groupby._get_time_delta_bins(self.ax) + + def _adjust_binner_for_upsample(self, binner): + """ + Adjust our binner when upsampling. + + The range of a new index is allowed to be greater than original range + so we don't need to change the length of a binner, GH 13022 + """ + return binner + + +class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): + """ + Provides a resample of a groupby implementation. + """ + + @property + def _constructor(self): + return TimedeltaIndexResampler + + +def resample(obj, kind=None, **kwds): + """ + Create a TimeGrouper and return our resampler. + """ + tg = TimeGrouper(**kwds) + return tg._get_resampler(obj, kind=kind) + + +resample.__doc__ = Resampler.__doc__ + + +def get_resampler_for_grouping( + groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs +): + """ + Return our appropriate resampler when grouping as well. + """ + + # .resample uses 'on' similar to how .groupby uses 'key' + kwargs["key"] = kwargs.pop("on", None) + + tg = TimeGrouper(freq=rule, **kwargs) + resampler = tg._get_resampler(groupby.obj, kind=kind) + return resampler._get_resampler_for_grouping(groupby=groupby) + + +class TimeGrouper(Grouper): + """ + Custom groupby class for time-interval grouping. 
+ + Parameters + ---------- + freq : pandas date offset or offset alias for identifying bin edges + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' + convention : {'start', 'end', 'e', 's'} + If axis is PeriodIndex + """ + + _attributes = Grouper._attributes + ( + "closed", + "label", + "how", + "loffset", + "kind", + "convention", + "base", + ) + + def __init__( + self, + freq="Min", + closed=None, + label=None, + how="mean", + axis=0, + fill_method=None, + limit=None, + loffset=None, + kind=None, + convention=None, + base=0, + **kwargs, + ): + # Check for correctness of the keyword arguments which would + # otherwise silently use the default if misspelled + if label not in {None, "left", "right"}: + raise ValueError(f"Unsupported value {label} for `label`") + if closed not in {None, "left", "right"}: + raise ValueError(f"Unsupported value {closed} for `closed`") + if convention not in {None, "start", "end", "e", "s"}: + raise ValueError(f"Unsupported value {convention} for `convention`") + + freq = to_offset(freq) + + end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} + rule = freq.rule_code + if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): + if closed is None: + closed = "right" + if label is None: + label = "right" + else: + if closed is None: + closed = "left" + if label is None: + label = "left" + + self.closed = closed + self.label = label + self.kind = kind + + self.convention = convention or "E" + self.convention = self.convention.lower() + + if isinstance(loffset, str): + loffset = to_offset(loffset) + self.loffset = loffset + + self.how = how + self.fill_method = fill_method + self.limit = limit + self.base = base + + # always sort time groupers + kwargs["sort"] = True + + super().__init__(freq=freq, axis=axis, **kwargs) + + def _get_resampler(self, obj, kind=None): + """ + Return my resampler or raise if we have an invalid axis. 
+ + Parameters + ---------- + obj : input object + kind : string, optional + 'period','timestamp','timedelta' are valid + + Returns + ------- + a Resampler + + Raises + ------ + TypeError if incompatible axis + + """ + self._set_grouper(obj) + + ax = self.ax + if isinstance(ax, DatetimeIndex): + return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + elif isinstance(ax, PeriodIndex) or kind == "period": + return PeriodIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + elif isinstance(ax, TimedeltaIndex): + return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis) + + raise TypeError( + "Only valid with DatetimeIndex, " + "TimedeltaIndex or PeriodIndex, " + f"but got an instance of '{type(ax).__name__}'" + ) + + def _get_grouper(self, obj, validate: bool = True): + # create the resampler and return our binner + r = self._get_resampler(obj) + r._set_binner() + return r.binner, r.grouper, r.obj + + def _get_time_bins(self, ax): + if not isinstance(ax, DatetimeIndex): + raise TypeError( + "axis must be a DatetimeIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + if len(ax) == 0: + binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + first, last = _get_timestamp_range_edges( + ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base + ) + # GH #12037 + # use first/last directly instead of call replace() on them + # because replace() will swallow the nanosecond part + # thus last bin maybe slightly before the end if the end contains + # nanosecond part and lead to `Values falls after last bin` error + binner = labels = date_range( + freq=self.freq, + start=first, + end=last, + tz=ax.tz, + name=ax.name, + ambiguous="infer", + nonexistent="shift_forward", + ) + + ax_values = ax.asi8 + binner, bin_edges = self._adjust_bin_edges(binner, ax_values) + + # general version, knowing nothing about relative frequencies + bins = lib.generate_bins_dt64( + ax_values, bin_edges, self.closed, hasnans=ax.hasnans + ) + + if self.closed == "right": + labels = binner + if self.label == "right": + labels = labels[1:] + elif self.label == "right": + labels = labels[1:] + + if ax.hasnans: + binner = binner.insert(0, NaT) + labels = labels.insert(0, NaT) + + # if we end up with more labels than bins + # adjust the labels + # GH4076 + if len(bins) < len(labels): + labels = labels[: len(bins)] + + return binner, bins, labels + + def _adjust_bin_edges(self, binner, ax_values): + # Some hacks for > daily data, see #1471, #1458, #1483 + + if self.freq != "D" and is_superperiod(self.freq, "D"): + if self.closed == "right": + # GH 21459, GH 9119: Adjust the bins relative to the wall time + bin_edges = binner.tz_localize(None) + bin_edges = bin_edges + timedelta(1) - Nano(1) + bin_edges = bin_edges.tz_localize(binner.tz).asi8 + else: + bin_edges = binner.asi8 + + # intraday values on last day + if bin_edges[-2] > ax_values.max(): + bin_edges = bin_edges[:-1] + binner = binner[:-1] + else: + bin_edges = binner.asi8 + return binner, bin_edges + + def _get_time_delta_bins(self, ax): + if not isinstance(ax, TimedeltaIndex): + raise TypeError( + "axis must be a TimedeltaIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + if not len(ax): + binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + start, end = ax.min(), ax.max() + labels = binner = timedelta_range( + start=start, end=end, freq=self.freq, name=ax.name + ) + + end_stamps = labels + self.freq + bins 
= ax.searchsorted(end_stamps, side="left") + + # Addresses GH #10530 + if self.base > 0: + labels += type(self.freq)(self.base) + + return binner, bins, labels + + def _get_time_period_bins(self, ax): + if not isinstance(ax, DatetimeIndex): + raise TypeError( + "axis must be a DatetimeIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + freq = self.freq + + if not len(ax): + binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name) + return binner, [], labels + + labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name) + + end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() + if ax.tzinfo: + end_stamps = end_stamps.tz_localize(ax.tzinfo) + bins = ax.searchsorted(end_stamps, side="left") + + return binner, bins, labels + + def _get_period_bins(self, ax): + if not isinstance(ax, PeriodIndex): + raise TypeError( + "axis must be a PeriodIndex, but got " + f"an instance of {type(ax).__name__}" + ) + + memb = ax.asfreq(self.freq, how=self.convention) + + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + nat_count = 0 + if memb.hasnans: + nat_count = np.sum(memb._isnan) + memb = memb[~memb._isnan] + + # if index contains no valid (non-NaT) values, return empty index + if not len(memb): + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + freq_mult = self.freq.n + + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how="end") + bin_shift = 0 + + # GH 23882 + if self.base: + # get base adjusted bin edge labels + p_start, end = _get_period_range_edges( + start, end, self.freq, closed=self.closed, base=self.base + ) + + # Get offset for bin edge (not label edge) adjustment + start_offset = Period(start, self.freq) - Period(p_start, self.freq) + bin_shift = start_offset.n % freq_mult + start = p_start + + labels = binner = period_range( + start=start, end=end, freq=self.freq, name=ax.name + ) + + i8 = memb.asi8 + + # when upsampling to subperiods, we need to generate enough bins + expected_bins_count = len(binner) * freq_mult + i8_extend = expected_bins_count - (i8[-1] - i8[0]) + rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) + rng += freq_mult + # adjust bin edge indexes to account for base + rng -= bin_shift + + # Wrap in PeriodArray for PeriodArray.searchsorted + prng = type(memb._data)(rng, dtype=memb.dtype) + bins = memb.searchsorted(prng, side="left") + + if nat_count > 0: + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + # shift bins by the number of NaT + bins += nat_count + bins = np.insert(bins, 0, nat_count) + binner = binner.insert(0, NaT) + labels = labels.insert(0, NaT) + + return binner, bins, labels + + +def _take_new_index(obj, indexer, new_index, axis=0): + + if isinstance(obj, ABCSeries): + new_values = algos.take_1d(obj.values, indexer) + return obj._constructor(new_values, index=new_index, name=obj.name) + elif isinstance(obj, ABCDataFrame): + if axis == 1: + raise NotImplementedError("axis 1 is not supported") + return obj._constructor( + obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) + ) + else: + raise ValueError("'obj' should be either a Series or a DataFrame") + + +def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): + """ + Adjust the `first` Timestamp to the preceding Timestamp that resides on + the provided offset. Adjust the `last` Timestamp to the following + Timestamp that resides on the provided offset. 
Input Timestamps that + already reside on the offset will be adjusted depending on the type of + offset and the `closed` parameter. + + Parameters + ---------- + first : pd.Timestamp + The beginning Timestamp of the range to be adjusted. + last : pd.Timestamp + The ending Timestamp of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Timestamps will be adjusted. + closed : {'right', 'left'}, default None + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Timestamps. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Timestamp objects. + """ + if isinstance(offset, Tick): + if isinstance(offset, Day): + # _adjust_dates_anchored assumes 'D' means 24H, but first/last + # might contain a DST transition (23H, 24H, or 25H). + # So "pretend" the dates are naive when adjusting the endpoints + tz = first.tz + first = first.tz_localize(None) + last = last.tz_localize(None) + + first, last = _adjust_dates_anchored( + first, last, offset, closed=closed, base=base + ) + if isinstance(offset, Day): + first = first.tz_localize(tz) + last = last.tz_localize(tz) + return first, last + + else: + first = first.normalize() + last = last.normalize() + + if closed == "left": + first = Timestamp(offset.rollback(first)) + else: + first = Timestamp(first - offset) + + last = Timestamp(last + offset) + + return first, last + + +def _get_period_range_edges(first, last, offset, closed="left", base=0): + """ + Adjust the provided `first` and `last` Periods to the respective Period of + the given offset that encompasses them. + + Parameters + ---------- + first : pd.Period + The beginning Period of the range to be adjusted. + last : pd.Period + The ending Period of the range to be adjusted. + offset : pd.DateOffset + The dateoffset to which the Periods will be adjusted. + closed : {'right', 'left'}, default None + Which side of bin interval is closed. + base : int, default 0 + The "origin" of the adjusted Periods. + + Returns + ------- + A tuple of length 2, containing the adjusted pd.Period objects. + """ + if not all(isinstance(obj, Period) for obj in [first, last]): + raise TypeError("'first' and 'last' must be instances of type Period") + + # GH 23882 + first = first.to_timestamp() + last = last.to_timestamp() + adjust_first = not offset.is_on_offset(first) + adjust_last = offset.is_on_offset(last) + + first, last = _get_timestamp_range_edges( + first, last, offset, closed=closed, base=base + ) + + first = (first + adjust_first * offset).to_period(offset) + last = (last - adjust_last * offset).to_period(offset) + return first, last + + +def _adjust_dates_anchored(first, last, offset, closed="right", base=0): + # First and last offsets should be calculated from the start day to fix an + # error cause by resampling across multiple days when a one day period is + # not a multiple of the frequency. + # + # See https://github.com/pandas-dev/pandas/issues/8683 + + # GH 10117 & GH 19375. If first and last contain timezone information, + # Perform the calculation in UTC in order to avoid localizing on an + # Ambiguous or Nonexistent time. 
+ first_tzinfo = first.tzinfo + last_tzinfo = last.tzinfo + start_day_nanos = first.normalize().value + if first_tzinfo is not None: + first = first.tz_convert("UTC") + if last_tzinfo is not None: + last = last.tz_convert("UTC") + + base_nanos = (base % offset.n) * offset.nanos // offset.n + start_day_nanos += base_nanos + + foffset = (first.value - start_day_nanos) % offset.nanos + loffset = (last.value - start_day_nanos) % offset.nanos + + if closed == "right": + if foffset > 0: + # roll back + fresult = first.value - foffset + else: + fresult = first.value - offset.nanos + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + # already the end of the road + lresult = last.value + else: # closed == 'left' + if foffset > 0: + fresult = first.value - foffset + else: + # start of the road + fresult = first.value + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + lresult = last.value + offset.nanos + fresult = Timestamp(fresult) + lresult = Timestamp(lresult) + if first_tzinfo is not None: + fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) + if last_tzinfo is not None: + lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) + return fresult, lresult + + +def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): + """ + Utility frequency conversion method for Series/DataFrame. + """ + if isinstance(obj.index, PeriodIndex): + if method is not None: + raise NotImplementedError("'method' argument is not supported") + + if how is None: + how = "E" + + new_obj = obj.copy() + new_obj.index = obj.index.asfreq(freq, how=how) + + elif len(obj.index) == 0: + new_obj = obj.copy() + new_obj.index = obj.index._shallow_copy(freq=to_offset(freq)) + + else: + dti = date_range(obj.index[0], obj.index[-1], freq=freq) + dti.name = obj.index.name + new_obj = obj.reindex(dti, method=method, fill_value=fill_value) + if normalize: + new_obj.index = new_obj.index.normalize() + + return new_obj diff --git a/venv/Lib/site-packages/pandas/core/reshape/__init__.py b/venv/Lib/site-packages/pandas/core/reshape/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/reshape/api.py b/venv/Lib/site-packages/pandas/core/reshape/api.py new file mode 100644 index 0000000..3c76eef --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/api.py @@ -0,0 +1,8 @@ +# flake8: noqa + +from pandas.core.reshape.concat import concat +from pandas.core.reshape.melt import lreshape, melt, wide_to_long +from pandas.core.reshape.merge import merge, merge_asof, merge_ordered +from pandas.core.reshape.pivot import crosstab, pivot, pivot_table +from pandas.core.reshape.reshape import get_dummies +from pandas.core.reshape.tile import cut, qcut diff --git a/venv/Lib/site-packages/pandas/core/reshape/concat.py b/venv/Lib/site-packages/pandas/core/reshape/concat.py new file mode 100644 index 0000000..502b8d1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/concat.py @@ -0,0 +1,701 @@ +""" +concat routines +""" + +from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload + +import numpy as np + +from pandas._typing import FrameOrSeriesUnion + +from pandas import DataFrame, Index, MultiIndex, Series +from pandas.core.arrays.categorical import ( + factorize_from_iterable, + factorize_from_iterables, +) +import pandas.core.common as com +from pandas.core.generic import NDFrame +from pandas.core.indexes.api import ( + all_indexes_same, + 
ensure_index, + get_consensus_names, + get_objs_combined_axis, +) +import pandas.core.indexes.base as ibase +from pandas.core.internals import concatenate_block_managers + +# --------------------------------------------------------------------- +# Concatenate DataFrame objects + + +@overload +def concat( + objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]], + axis=0, + join: str = "outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> "DataFrame": + ... + + +@overload +def concat( + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], + axis=0, + join: str = "outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> FrameOrSeriesUnion: + ... + + +def concat( + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], + axis=0, + join="outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> FrameOrSeriesUnion: + """ + Concatenate pandas objects along a particular axis with optional set logic + along the other axes. + + Can also add a layer of hierarchical indexing on the concatenation axis, + which may be useful if the labels are the same (or overlapping) on + the passed axis number. + + Parameters + ---------- + objs : a sequence or mapping of Series or DataFrame objects + If a dict is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case a ValueError will be raised. + axis : {0/'index', 1/'columns'}, default 0 + The axis to concatenate along. + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis (or axes). + ignore_index : bool, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + names : list, default None + Names for the levels in the resulting hierarchical index. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + sort : bool, default False + Sort non-concatenation axis if it is not already aligned when `join` + is 'outer'. + This has no effect when ``join='inner'``, which already preserves + the order of the non-concatenation axis. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Changed to not sort by default. + + copy : bool, default True + If False, do not copy data unnecessarily. + + Returns + ------- + object, type of objs + When concatenating all ``Series`` along the index (axis=0), a + ``Series`` is returned. 
When ``objs`` contains at least one + ``DataFrame``, a ``DataFrame`` is returned. When concatenating along + the columns (axis=1), a ``DataFrame`` is returned. + + See Also + -------- + Series.append : Concatenate Series. + DataFrame.append : Concatenate DataFrames. + DataFrame.join : Join DataFrames using indexes. + DataFrame.merge : Merge DataFrames by indexes or columns. + + Notes + ----- + The keys, levels, and names arguments are all optional. + + A walkthrough of how this method fits in with other tools for combining + pandas objects can be found `here + `__. + + Examples + -------- + Combine two ``Series``. + + >>> s1 = pd.Series(['a', 'b']) + >>> s2 = pd.Series(['c', 'd']) + >>> pd.concat([s1, s2]) + 0 a + 1 b + 0 c + 1 d + dtype: object + + Clear the existing index and reset it in the result + by setting the ``ignore_index`` option to ``True``. + + >>> pd.concat([s1, s2], ignore_index=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + Add a hierarchical index at the outermost level of + the data with the ``keys`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2']) + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Label the index keys you create with the ``names`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2'], + ... names=['Series name', 'Row ID']) + Series name Row ID + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Combine two ``DataFrame`` objects with identical columns. + + >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], + ... columns=['letter', 'number']) + >>> df1 + letter number + 0 a 1 + 1 b 2 + >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], + ... columns=['letter', 'number']) + >>> df2 + letter number + 0 c 3 + 1 d 4 + >>> pd.concat([df1, df2]) + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects with overlapping columns + and return everything. Columns outside the intersection will + be filled with ``NaN`` values. + + >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], + ... columns=['letter', 'number', 'animal']) + >>> df3 + letter number animal + 0 c 3 cat + 1 d 4 dog + >>> pd.concat([df1, df3], sort=False) + letter number animal + 0 a 1 NaN + 1 b 2 NaN + 0 c 3 cat + 1 d 4 dog + + Combine ``DataFrame`` objects with overlapping columns + and return only those that are shared by passing ``inner`` to + the ``join`` keyword argument. + + >>> pd.concat([df1, df3], join="inner") + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects horizontally along the x axis by + passing in ``axis=1``. + + >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], + ... columns=['animal', 'name']) + >>> pd.concat([df1, df4], axis=1) + letter number animal name + 0 a 1 bird polly + 1 b 2 monkey george + + Prevent the result from including duplicate index values with the + ``verify_integrity`` option. + + >>> df5 = pd.DataFrame([1], index=['a']) + >>> df5 + 0 + a 1 + >>> df6 = pd.DataFrame([2], index=['a']) + >>> df6 + 0 + a 2 + >>> pd.concat([df5, df6], verify_integrity=True) + Traceback (most recent call last): + ... 
+ ValueError: Indexes have overlapping values: ['a'] + """ + op = _Concatenator( + objs, + axis=axis, + ignore_index=ignore_index, + join=join, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + copy=copy, + sort=sort, + ) + + return op.get_result() + + +class _Concatenator: + """ + Orchestrates a concatenation operation for BlockManagers + """ + + def __init__( + self, + objs, + axis=0, + join: str = "outer", + keys=None, + levels=None, + names=None, + ignore_index: bool = False, + verify_integrity: bool = False, + copy: bool = True, + sort=False, + ): + if isinstance(objs, (NDFrame, str)): + raise TypeError( + "first argument must be an iterable of pandas " + "objects, you passed an object of type " + '"{name}"'.format(name=type(objs).__name__) + ) + + if join == "outer": + self.intersect = False + elif join == "inner": + self.intersect = True + else: # pragma: no cover + raise ValueError( + "Only can inner (intersect) or outer (union) join the other axis" + ) + + if isinstance(objs, dict): + if keys is None: + keys = list(objs.keys()) + objs = [objs[k] for k in keys] + else: + objs = list(objs) + + if len(objs) == 0: + raise ValueError("No objects to concatenate") + + if keys is None: + objs = list(com.not_none(*objs)) + else: + # #1649 + clean_keys = [] + clean_objs = [] + for k, v in zip(keys, objs): + if v is None: + continue + clean_keys.append(k) + clean_objs.append(v) + objs = clean_objs + name = getattr(keys, "name", None) + keys = Index(clean_keys, name=name) + + if len(objs) == 0: + raise ValueError("All objects passed were None") + + # consolidate data & figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, (Series, DataFrame)): + msg = ( + "cannot concatenate object of type '{typ}'; " + "only Series and DataFrame objs are valid".format(typ=type(obj)) + ) + raise TypeError(msg) + + # consolidate + obj._consolidate(inplace=True) + ndims.add(obj.ndim) + + # get the sample + # want the highest ndim that we have, and must be non-empty + # unless all objs are empty + sample = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and np.sum(obj.shape): + sample = obj + break + + else: + # filter out the empties if we have not multi-index possibilities + # note to keep empty Series as it affect to result columns / name + non_empties = [ + obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series) + ] + + if len(non_empties) and ( + keys is None and names is None and levels is None and not self.intersect + ): + objs = non_empties + sample = objs[0] + + if sample is None: + sample = objs[0] + self.objs = objs + + # Standardize axis parameter to int + if isinstance(sample, Series): + axis = DataFrame._get_axis_number(axis) + else: + axis = sample._get_axis_number(axis) + + # Need to flip BlockManager axis in the DataFrame special case + self._is_frame = isinstance(sample, DataFrame) + if self._is_frame: + axis = 1 if axis == 0 else 0 + + self._is_series = isinstance(sample, Series) + if not 0 <= axis <= sample.ndim: + raise AssertionError( + "axis must be between 0 and {ndim}, input was " + "{axis}".format(ndim=sample.ndim, axis=axis) + ) + + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + current_column = 0 + max_ndim = sample.ndim + self.objs, objs = [], self.objs + for obj in objs: + + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim - 1: + raise ValueError( + 
"cannot concatenate unaligned mixed " + "dimensional NDFrame objects" + ) + + else: + name = getattr(obj, "name", None) + if ignore_index or name is None: + name = current_column + current_column += 1 + + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 + obj = sample._constructor({name: obj}) + + self.objs.append(obj) + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.axis = axis + self.keys = keys + self.names = names or getattr(keys, "names", None) + self.levels = levels + self.sort = sort + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + self.copy = copy + + self.new_axes = self._get_new_axes() + + def get_result(self): + + # series only + if self._is_series: + + # stack blocks + if self.axis == 0: + name = com.consensus_name_attr(self.objs) + + mgr = self.objs[0]._data.concat( + [x._data for x in self.objs], self.new_axes + ) + cons = self.objs[0]._constructor + return cons(mgr, name=name).__finalize__(self, method="concat") + + # combine as columns in a frame + else: + data = dict(zip(range(len(self.objs)), self.objs)) + cons = DataFrame + + index, columns = self.new_axes + df = cons(data, index=index) + df.columns = columns + return df.__finalize__(self, method="concat") + + # combine block managers + else: + mgrs_indexers = [] + for obj in self.objs: + mgr = obj._data + indexers = {} + for ax, new_labels in enumerate(self.new_axes): + if ax == self.axis: + # Suppress reindexing on concat axis + continue + + obj_labels = mgr.axes[ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.reindex(new_labels)[1] + + mgrs_indexers.append((obj._data, indexers)) + + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy + ) + if not self.copy: + new_data._consolidate_inplace() + + cons = self.objs[0]._constructor + return cons._from_axes(new_data, self.new_axes).__finalize__( + self, method="concat" + ) + + def _get_result_dim(self) -> int: + if self._is_series and self.axis == 1: + return 2 + else: + return self.objs[0].ndim + + def _get_new_axes(self) -> List[Index]: + ndim = self._get_result_dim() + return [ + self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + for i in range(ndim) + ] + + def _get_comb_axis(self, i: int) -> Index: + data_axis = self.objs[0]._get_block_manager_axis(i) + return get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) + + def _get_concat_axis(self) -> Index: + """ + Return index to be used along concatenation axis. 
+ """ + if self._is_series: + if self.axis == 0: + indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = ibase.default_index(len(self.objs)) + return idx + elif self.keys is None: + names: List[Optional[Hashable]] = [None] * len(self.objs) + num = 0 + has_names = False + for i, x in enumerate(self.objs): + if not isinstance(x, Series): + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + names[i] = x.name + has_names = True + else: + names[i] = num + num += 1 + if has_names: + return Index(names) + else: + return ibase.default_index(len(self.objs)) + else: + return ensure_index(self.keys).set_names(self.names) + else: + indexes = [x._data.axes[self.axis] for x in self.objs] + + if self.ignore_index: + idx = ibase.default_index(sum(len(i) for i in indexes)) + return idx + + if self.keys is None: + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex( + indexes, self.keys, self.levels, self.names + ) + + self._maybe_check_integrity(concat_axis) + + return concat_axis + + def _maybe_check_integrity(self, concat_index: Index): + if self.verify_integrity: + if not concat_index.is_unique: + overlap = concat_index[concat_index.duplicated()].unique() + raise ValueError( + "Indexes have overlapping values: " + "{overlap!s}".format(overlap=overlap) + ) + + +def _concat_indexes(indexes) -> Index: + return indexes[0].append(indexes[1:]) + + +def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: + + if (levels is None and isinstance(keys[0], tuple)) or ( + levels is not None and len(levels) > 1 + ): + zipped = list(zip(*keys)) + if names is None: + names = [None] * len(zipped) + + if levels is None: + _, levels = factorize_from_iterables(zipped) + else: + levels = [ensure_index(x) for x in levels] + else: + zipped = [keys] + if names is None: + names = [None] + + if levels is None: + levels = [ensure_index(keys)] + else: + levels = [ensure_index(x) for x in levels] + + if not all_indexes_same(indexes): + codes_list = [] + + # things are potentially different sizes, so compute the exact codes + # for each level and pass those to MultiIndex.from_arrays + + for hlevel, level in zip(zipped, levels): + to_concat = [] + for key, index in zip(hlevel, indexes): + try: + i = level.get_loc(key) + except KeyError: + raise ValueError( + "Key {key!s} not in level {level!s}".format( + key=key, level=level + ) + ) + + to_concat.append(np.repeat(i, len(index))) + codes_list.append(np.concatenate(to_concat)) + + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + levels.extend(concat_index.levels) + codes_list.extend(concat_index.codes) + else: + codes, categories = factorize_from_iterable(concat_index) + levels.append(categories) + codes_list.append(codes) + + if len(names) == len(levels): + names = list(names) + else: + # make sure that all of the passed indices have the same nlevels + if not len({idx.nlevels for idx in indexes}) == 1: + raise AssertionError( + "Cannot concat indices that do " + "not have the same number of levels" + ) + + # also copies + names = names + get_consensus_names(indexes) + + return MultiIndex( + levels=levels, codes=codes_list, names=names, verify_integrity=False + ) + + new_index = indexes[0] + n = len(new_index) + kpieces = len(indexes) + + # also copies + new_names = list(names) + new_levels = list(levels) + + # construct codes + new_codes = [] + + # do something a 
bit more speedy + + for hlevel, level in zip(zipped, levels): + hlevel = ensure_index(hlevel) + mapped = level.get_indexer(hlevel) + + mask = mapped == -1 + if mask.any(): + raise ValueError( + "Values not found in passed level: {hlevel!s}".format( + hlevel=hlevel[mask] + ) + ) + + new_codes.append(np.repeat(mapped, n)) + + if isinstance(new_index, MultiIndex): + new_levels.extend(new_index.levels) + new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + else: + new_levels.append(new_index) + new_codes.append(np.tile(np.arange(n), kpieces)) + + if len(new_names) < len(new_levels): + new_names.extend(new_index.names) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) diff --git a/venv/Lib/site-packages/pandas/core/reshape/melt.py b/venv/Lib/site-packages/pandas/core/reshape/melt.py new file mode 100644 index 0000000..d4ccb19 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/melt.py @@ -0,0 +1,474 @@ +import re +from typing import List + +import numpy as np + +from pandas.util._decorators import Appender, deprecate_kwarg + +from pandas.core.dtypes.common import is_extension_array_dtype, is_list_like +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.generic import ABCMultiIndex +from pandas.core.dtypes.missing import notna + +from pandas.core.arrays import Categorical +import pandas.core.common as com +from pandas.core.frame import DataFrame, _shared_docs +from pandas.core.indexes.base import Index +from pandas.core.reshape.concat import concat +from pandas.core.tools.numeric import to_numeric + + +@Appender( + _shared_docs["melt"] + % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") +) +def melt( + frame: DataFrame, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, +) -> DataFrame: + # TODO: what about the existing index? + # If multiindex, gather names of columns on all level for checking presence + # of `id_vars` and `value_vars` + if isinstance(frame.columns, ABCMultiIndex): + cols = [x for c in frame.columns for x in c] + else: + cols = list(frame.columns) + + if id_vars is not None: + if not is_list_like(id_vars): + id_vars = [id_vars] + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list): + raise ValueError( + "id_vars must be a list of tuples when columns are a MultiIndex" + ) + else: + # Check that `id_vars` are in frame + id_vars = list(id_vars) + missing = Index(com.flatten(id_vars)).difference(cols) + if not missing.empty: + raise KeyError( + "The following 'id_vars' are not present " + "in the DataFrame: {missing}" + "".format(missing=list(missing)) + ) + else: + id_vars = [] + + if value_vars is not None: + if not is_list_like(value_vars): + value_vars = [value_vars] + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance( + value_vars, list + ): + raise ValueError( + "value_vars must be a list of tuples when columns are a MultiIndex" + ) + else: + value_vars = list(value_vars) + # Check that `value_vars` are in frame + missing = Index(com.flatten(value_vars)).difference(cols) + if not missing.empty: + raise KeyError( + "The following 'value_vars' are not present in " + "the DataFrame: {missing}" + "".format(missing=list(missing)) + ) + frame = frame.loc[:, id_vars + value_vars] + else: + frame = frame.copy() + + if col_level is not None: # allow list or other? 
+ # frame is a copy + frame.columns = frame.columns.get_level_values(col_level) + + if var_name is None: + if isinstance(frame.columns, ABCMultiIndex): + if len(frame.columns.names) == len(set(frame.columns.names)): + var_name = frame.columns.names + else: + var_name = [ + "variable_{i}".format(i=i) for i in range(len(frame.columns.names)) + ] + else: + var_name = [ + frame.columns.name if frame.columns.name is not None else "variable" + ] + if isinstance(var_name, str): + var_name = [var_name] + + N, K = frame.shape + K -= len(id_vars) + + mdata = {} + for col in id_vars: + id_data = frame.pop(col) + if is_extension_array_dtype(id_data): + id_data = concat([id_data] * K, ignore_index=True) + else: + id_data = np.tile(id_data.values, K) + mdata[col] = id_data + + mcolumns = id_vars + var_name + [value_name] + + mdata[value_name] = frame.values.ravel("F") + for i, col in enumerate(var_name): + # asanyarray will keep the columns as an Index + mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) + + return frame._constructor(mdata, columns=mcolumns) + + +@deprecate_kwarg(old_arg_name="label", new_arg_name=None) +def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame: + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : DataFrame + groups : dict + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], + ... 'team': ['Red Sox', 'Yankees'], + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = list(groups.keys()) + values = list(groups.values()) + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.difference(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError("All column lists must be same length") + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + to_concat = [data[col].values for col in names] + + mdata[target] = concat_compat(to_concat) + pivot_cols.append(target) + + for col in id_cols: + mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notna(mdata[c]) + if not mask.all(): + mdata = {k: v[mask] for k, v in mdata.items()} + + return data._constructor(mdata, columns=id_cols + pivot_cols) + + +def wide_to_long( + df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> DataFrame: + r""" + Wide panel to long format. Less flexible but more user-friendly than melt. + + With stubnames ['A', 'B'], this function expects to find one or more + group of columns with format + A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,... + You specify what you want to call this suffix in the resulting long format + with `j` (for example `j='year'`) + + Each row of these wide variables are assumed to be uniquely identified by + `i` (can be a single column name or a list of column names) + + All remaining variables in the data frame are left intact. 
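# --- Hedged usage sketch (editorial example, not part of the patch) ---------
# Shows the melt() construction above: id_vars are tiled K times and the
# value columns are raveled in column-major ("F") order into a single
# value column. Example data is illustrative only.
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "a": [10, 20], "b": [30, 40]})
print(
    pd.melt(df, id_vars=["id"], value_vars=["a", "b"],
            var_name="measure", value_name="reading")
)
#    id measure  reading
# 0   1       a       10
# 1   2       a       20
# 2   1       b       30
# 3   2       b       40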
+ + Parameters + ---------- + df : DataFrame + The wide-format DataFrame. + stubnames : str or list-like + The stub name(s). The wide format variables are assumed to + start with the stub names. + i : str or list-like + Column(s) to use as id variable(s). + j : str + The name of the sub-observation variable. What you wish to name your + suffix in the long format. + sep : str, default "" + A character indicating the separation of the variable names + in the wide format, to be stripped from the names in the long format. + For example, if your column names are A-suffix1, A-suffix2, you + can strip the hyphen by specifying `sep='-'`. + suffix : str, default '\\d+' + A regular expression capturing the wanted suffixes. '\\d+' captures + numeric suffixes. Suffixes with no numbers could be specified with the + negated character class '\\D+'. You can also further disambiguate + suffixes, for example, if your wide variables are of the form + A-one, B-two,.., and you have an unrelated column A-rating, you can + ignore the last one by specifying `suffix='(!?one|two)'`. + + .. versionchanged:: 0.23.0 + When all suffixes are numeric, they are cast to int64/float64. + + Returns + ------- + DataFrame + A DataFrame that contains each stub name as a variable, with new index + (i, j). + + Notes + ----- + All extra variables are left untouched. This simply uses + `pandas.melt` under the hood, but is hard-coded to "do the right thing" + in a typical case. + + Examples + -------- + >>> np.random.seed(123) + >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + ... "X" : dict(zip(range(3), np.random.randn(3))) + ... }) + >>> df["id"] = df.index + >>> df + A1970 A1980 B1970 B1980 X id + 0 a d 2.5 3.2 -1.085631 0 + 1 b e 1.2 1.3 0.997345 1 + 2 c f 0.7 0.1 0.282978 2 + >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... # doctest: +NORMALIZE_WHITESPACE + X A B + id year + 0 1970 -1.085631 a 2.5 + 1 1970 0.997345 b 1.2 + 2 1970 0.282978 c 0.7 + 0 1980 -1.085631 d 3.2 + 1 1980 0.997345 e 1.3 + 2 1980 0.282978 f 0.1 + + With multiple id columns + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l + ... # doctest: +NORMALIZE_WHITESPACE + ht + famid birth age + 1 1 1 2.8 + 2 3.4 + 2 1 2.9 + 2 3.8 + 3 1 2.2 + 2 2.9 + 2 1 1 2.0 + 2 3.2 + 2 1 1.8 + 2 2.8 + 3 1 1.9 + 2 2.4 + 3 1 1 2.2 + 2 3.3 + 2 1 2.3 + 2 3.4 + 3 1 2.1 + 2 2.9 + + Going from long back to wide just takes some creative use of `unstack` + + >>> w = l.unstack() + >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format) + >>> w.reset_index() + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + Less wieldy column names are also handled + + >>> np.random.seed(0) + >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3), + ... 'A(weekly)-2011': np.random.rand(3), + ... 'B(weekly)-2010': np.random.rand(3), + ... 'B(weekly)-2011': np.random.rand(3), + ... 
'X' : np.random.randint(3, size=3)}) + >>> df['id'] = df.index + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id + 0 0.548814 0.544883 0.437587 0.383442 0 0 + 1 0.715189 0.423655 0.891773 0.791725 1 1 + 2 0.602763 0.645894 0.963663 0.528895 1 2 + + >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(weekly) B(weekly) + id year + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 + + If we have many columns, we could also use a regex to find our + stubnames and pass that list on to wide_to_long + + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != []]) + ... ) + >>> list(stubnames) + ['A(weekly)', 'B(weekly)'] + + All of the above examples have integers as suffixes. It is possible to + have non-integers as suffixes. + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + famid birth ht_one ht_two + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', + ... sep='_', suffix='\w+') + >>> l + ... # doctest: +NORMALIZE_WHITESPACE + ht + famid birth age + 1 1 one 2.8 + two 3.4 + 2 one 2.9 + two 3.8 + 3 one 2.2 + two 2.9 + 2 1 one 2.0 + two 3.2 + 2 one 1.8 + two 2.8 + 3 one 1.9 + two 2.4 + 3 1 one 2.2 + two 3.3 + 2 one 2.3 + two 3.4 + 3 one 2.1 + two 2.9 + """ + + def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]: + regex = r"^{stub}{sep}{suffix}$".format( + stub=re.escape(stub), sep=re.escape(sep), suffix=suffix + ) + pattern = re.compile(regex) + return [col for col in df.columns if pattern.match(col)] + + def melt_stub(df, stub: str, i, j, value_vars, sep: str): + newdf = melt( + df, + id_vars=i, + value_vars=value_vars, + value_name=stub.rstrip(sep), + var_name=j, + ) + newdf[j] = Categorical(newdf[j]) + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + + # GH17627 Cast numerics suffixes to int/float + newdf[j] = to_numeric(newdf[j], errors="ignore") + + return newdf.set_index(i + [j]) + + if not is_list_like(stubnames): + stubnames = [stubnames] + else: + stubnames = list(stubnames) + + if any(col in stubnames for col in df.columns): + raise ValueError("stubname can't be identical to a column name") + + if not is_list_like(i): + i = [i] + else: + i = list(i) + + if df[i].duplicated().any(): + raise ValueError("the id variables need to uniquely identify each row") + + value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames] + + value_vars_flattened = [e for sublist in value_vars for e in sublist] + id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + + _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] + melted = _melted[0].join(_melted[1:], how="outer") + + if len(i) == 1: + new = df[id_vars].set_index(i).join(melted) + return new + + new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) + + return new diff --git a/venv/Lib/site-packages/pandas/core/reshape/merge.py 
b/venv/Lib/site-packages/pandas/core/reshape/merge.py new file mode 100644 index 0000000..5f92e4a --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/merge.py @@ -0,0 +1,2052 @@ +""" +SQL-style merge routines +""" + +import copy +import datetime +from functools import partial +import string +from typing import TYPE_CHECKING, Optional, Tuple, Union +import warnings + +import numpy as np + +from pandas._libs import Timedelta, hashtable as libhashtable, lib +import pandas._libs.join as libjoin +from pandas._typing import FrameOrSeries +from pandas.errors import MergeError +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_object, + is_array_like, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna, na_value_for_dtype + +from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby +import pandas.core.algorithms as algos +from pandas.core.arrays.categorical import _recode_for_categories +import pandas.core.common as com +from pandas.core.frame import _merge_doc +from pandas.core.internals import _transform_index, concatenate_block_managers +from pandas.core.sorting import is_int64_overflow_possible + +if TYPE_CHECKING: + from pandas import DataFrame, Series # noqa:F401 + + +@Substitution("\nleft : DataFrame") +@Appender(_merge_doc, indents=0) +def merge( + left, + right, + how: str = "inner", + on=None, + left_on=None, + right_on=None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes=("_x", "_y"), + copy: bool = True, + indicator: bool = False, + validate=None, +) -> "DataFrame": + op = _MergeOperation( + left, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) + return op.get_result() + + +if __debug__: + merge.__doc__ = _merge_doc % "\nleft : DataFrame" + + +def _groupby_and_merge( + by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True +): + """ + groupby & merge; we are always performing a left-by type operation + + Parameters + ---------- + by: field to group + on: duplicates field + left: left frame + right: right frame + _merge_pieces: function for merging + check_duplicates: bool, default True + should we check & clean duplicates + """ + + pieces = [] + if not isinstance(by, (list, tuple)): + by = [by] + + lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None + + # if we can groupby the rhs + # then we can get vastly better perf + + # we will check & remove duplicates if indicated + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(by + on).any(): + _right = right.drop_duplicates(by + on, keep="last") + # TODO: use overload to refine return type of drop_duplicates + assert _right is not None # needed for mypy + right = _right + try: + rby = right.groupby(by, sort=False) + except KeyError: + pass + + for key, lhs in lby: + + if rby is None: + rhs = right + else: + try: + rhs = right.take(rby.indices[key]) + 
except KeyError: + # key doesn't exist in left + lcols = lhs.columns.tolist() + cols = lcols + [r for r in right.columns if r not in set(lcols)] + merged = lhs.reindex(columns=cols) + merged.index = range(len(merged)) + pieces.append(merged) + continue + + merged = _merge_pieces(lhs, rhs) + + # make sure join keys are in the merged + # TODO, should _merge_pieces do this? + for k in by: + try: + if k in merged: + merged[k] = key + except KeyError: + pass + + pieces.append(merged) + + # preserve the original order + # if we have a missing piece this can be reset + from pandas.core.reshape.concat import concat + + result = concat(pieces, ignore_index=True) + result = result.reindex(columns=pieces[0].columns, copy=False) + return result, lby + + +def merge_ordered( + left, + right, + on=None, + left_on=None, + right_on=None, + left_by=None, + right_by=None, + fill_method=None, + suffixes=("_x", "_y"), + how: str = "outer", +) -> "DataFrame": + """ + Perform merge with optional filling/interpolation. + + Designed for ordered data like time series data. Optionally + perform group-wise merge (see examples). + + Parameters + ---------- + left : DataFrame + right : DataFrame + on : label or list + Field names to join on. Must be found in both DataFrames. + left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns. + right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs. + left_by : column name or list of column names + Group left DataFrame by group columns and merge piece by piece with + right DataFrame. + right_by : column name or list of column names + Group right DataFrame by group columns and merge piece by piece with + left DataFrame. + fill_method : {'ffill', None}, default None + Interpolation method for data. + suffixes : Sequence, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + + .. versionchanged:: 0.25.0 + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join). + + Returns + ------- + DataFrame + The merged DataFrame output type will the be same as + 'left', if it is a subclass of DataFrame. 
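# --- Hedged usage sketch (editorial example, not part of the patch) ---------
# The public merge() wrapper above just builds a _MergeOperation and returns
# op.get_result(); indicator=True adds a '_merge' categorical column via the
# _indicator_pre_merge / _indicator_post_merge steps defined later in this
# module. Example data is illustrative only.
import pandas as pd

left = pd.DataFrame({"key": ["a", "b", "c"], "lval": [1, 2, 3]})
right = pd.DataFrame({"key": ["b", "c", "d"], "rval": [4, 5, 6]})

print(pd.merge(left, right, on="key", how="outer", indicator=True))
#   key  lval  rval      _merge
# 0   a   1.0   NaN   left_only
# 1   b   2.0   4.0        both
# 2   c   3.0   5.0        both
# 3   d   NaN   6.0  right_only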
+ + See Also + -------- + merge + merge_asof + + Examples + -------- + >>> A + key lvalue group + 0 a 1 a + 1 c 2 a + 2 e 3 a + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> B + Key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + + >>> merge_ordered(A, B, fill_method='ffill', left_by='group') + group key lvalue rvalue + 0 a a 1 NaN + 1 a b 1 1.0 + 2 a c 2 2.0 + 3 a d 2 3.0 + 4 a e 3 3.0 + 5 b a 1 NaN + 6 b b 1 1.0 + 7 b c 2 2.0 + 8 b d 2 3.0 + 9 b e 3 3.0 + """ + + def _merger(x, y): + # perform the ordered merge operation + op = _OrderedMerge( + x, + y, + on=on, + left_on=left_on, + right_on=right_on, + suffixes=suffixes, + fill_method=fill_method, + how=how, + ) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError("Can only group either left or right frames") + elif left_by is not None: + result, _ = _groupby_and_merge( + left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False + ) + elif right_by is not None: + result, _ = _groupby_and_merge( + right_by, + on, + right, + left, + lambda x, y: _merger(y, x), + check_duplicates=False, + ) + else: + result = _merger(left, right) + return result + + +def merge_asof( + left, + right, + on=None, + left_on=None, + right_on=None, + left_index: bool = False, + right_index: bool = False, + by=None, + left_by=None, + right_by=None, + suffixes=("_x", "_y"), + tolerance=None, + allow_exact_matches: bool = True, + direction: str = "backward", +) -> "DataFrame": + """ + Perform an asof merge. This is similar to a left-join except that we + match on nearest key rather than equal keys. + + Both DataFrames must be sorted by the key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + 'on' key is less than or equal to the left's key. + + - A "forward" search selects the first row in the right DataFrame whose + 'on' key is greater than or equal to the left's key. + + - A "nearest" search selects the row in the right DataFrame whose 'on' + key is closest in absolute distance to the left's key. + + The default is "backward" and is compatible in versions below 0.20.0. + The direction parameter was added in version 0.20.0 and introduces + "forward" and "nearest". + + Optionally match on equivalent keys with 'by' before searching with 'on'. + + Parameters + ---------- + left : DataFrame + right : DataFrame + on : label + Field name to join on. Must be found in both DataFrames. + The data MUST be ordered. Furthermore this must be a numeric column, + such as datetimelike, integer, or float. On or left_on/right_on + must be given. + left_on : label + Field name to join on in left DataFrame. + right_on : label + Field name to join on in right DataFrame. + left_index : bool + Use the index of the left DataFrame as the join key. + right_index : bool + Use the index of the right DataFrame as the join key. + by : column name or list of column names + Match on these columns before performing merge operation. + left_by : column name + Field names to match on in the left DataFrame. + right_by : column name + Field names to match on in the right DataFrame. + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively. + tolerance : int or Timedelta, optional, default None + Select asof tolerance within this range; must be compatible + with the merge index. + allow_exact_matches : bool, default True + + - If True, allow matching with the same 'on' value + (i.e. 
less-than-or-equal-to / greater-than-or-equal-to) + - If False, don't match the same 'on' value + (i.e., strictly less-than / strictly greater-than). + + direction : 'backward' (default), 'forward', or 'nearest' + Whether to search for prior, subsequent, or closest matches. + + Returns + ------- + merged : DataFrame + + See Also + -------- + merge + merge_ordered + + Examples + -------- + >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) + >>> left + a left_val + 0 1 a + 1 5 b + 2 10 c + + >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + ... 'right_val': [1, 2, 3, 6, 7]}) + >>> right + a right_val + 0 1 1 + 1 2 2 + 2 3 3 + 3 6 6 + 4 7 7 + + >>> pd.merge_asof(left, right, on='a') + a left_val right_val + 0 1 a 1 + 1 5 b 3 + 2 10 c 7 + + >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + a left_val right_val + 0 1 a NaN + 1 5 b 3.0 + 2 10 c 7.0 + + >>> pd.merge_asof(left, right, on='a', direction='forward') + a left_val right_val + 0 1 a 1.0 + 1 5 b 6.0 + 2 10 c NaN + + >>> pd.merge_asof(left, right, on='a', direction='nearest') + a left_val right_val + 0 1 a 1 + 1 5 b 6 + 2 10 c 7 + + We can use indexed DataFrames as well. + + >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10]) + >>> left + left_val + 1 a + 5 b + 10 c + + >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]}, + ... index=[1, 2, 3, 6, 7]) + >>> right + right_val + 1 1 + 2 2 + 3 3 + 6 6 + 7 7 + + >>> pd.merge_asof(left, right, left_index=True, right_index=True) + left_val right_val + 1 a 1 + 5 b 3 + 10 c 7 + + Here is a real-world times-series example + + >>> quotes + time ticker bid ask + 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 + 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 + 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 + 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 + 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 + 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 + 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 + 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + + >>> trades + time ticker price quantity + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 + + By default we are taking the asof of the quotes + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker') + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 2ms between the quote time and the trade time + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('2ms')) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 10ms between the quote time and the trade time + and we exclude exact matches on time. However *prior* data will + propagate forward + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('10ms'), + ... 
allow_exact_matches=False) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + """ + op = _AsOfMerge( + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + by=by, + left_by=left_by, + right_by=right_by, + suffixes=suffixes, + how="asof", + tolerance=tolerance, + allow_exact_matches=allow_exact_matches, + direction=direction, + ) + return op.get_result() + + +# TODO: transformations?? +# TODO: only copy DataFrames when modification necessary +class _MergeOperation: + """ + Perform a database (SQL) merge operation between two DataFrame or Series + objects using either columns as keys or their row indexes + """ + + _merge_type = "merge" + + def __init__( + self, + left: Union["Series", "DataFrame"], + right: Union["Series", "DataFrame"], + how: str = "inner", + on=None, + left_on=None, + right_on=None, + axis=1, + left_index: bool = False, + right_index: bool = False, + sort: bool = True, + suffixes=("_x", "_y"), + copy: bool = True, + indicator: bool = False, + validate=None, + ): + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _left + self.right = self.orig_right = _right + self.how = how + self.axis = axis + + self.on = com.maybe_make_list(on) + self.left_on = com.maybe_make_list(left_on) + self.right_on = com.maybe_make_list(right_on) + + self.copy = copy + self.suffixes = suffixes + self.sort = sort + + self.left_index = left_index + self.right_index = right_index + + self.indicator = indicator + + self.indicator_name: Optional[str] + if isinstance(self.indicator, str): + self.indicator_name = self.indicator + elif isinstance(self.indicator, bool): + self.indicator_name = "_merge" if self.indicator else None + else: + raise ValueError( + "indicator option can only accept boolean or string arguments" + ) + + if not is_bool(left_index): + raise ValueError( + "left_index parameter must be of type bool, not " + "{left_index}".format(left_index=type(left_index)) + ) + if not is_bool(right_index): + raise ValueError( + "right_index parameter must be of type bool, not " + "{right_index}".format(right_index=type(right_index)) + ) + + # warn user when merging between different levels + if _left.columns.nlevels != _right.columns.nlevels: + msg = ( + "merging between different levels can give an unintended " + "result ({left} levels on the left, {right} on the right)" + ).format(left=_left.columns.nlevels, right=_right.columns.nlevels) + warnings.warn(msg, UserWarning) + + self._validate_specification() + + # note this function has side effects + ( + self.left_join_keys, + self.right_join_keys, + self.join_names, + ) = self._get_merge_keys() + + # validate the merge keys dtypes. We may need to coerce + # to avoid incompat dtypes + self._maybe_coerce_merge_keys() + + # If argument passed to validate, + # check if columns specified as unique + # are in fact unique. 
+ if validate is not None: + self._validate(validate) + + def get_result(self): + if self.indicator: + self.left, self.right = self._indicator_pre_merge(self.left, self.right) + + join_index, left_indexer, right_indexer = self._get_join_info() + + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) + + lindexers = {1: left_indexer} if left_indexer is not None else {} + rindexers = {1: right_indexer} if right_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, + copy=self.copy, + ) + + typ = self.left._constructor + result = typ(result_data).__finalize__(self, method=self._merge_type) + + if self.indicator: + result = self._indicator_post_merge(result) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + self._maybe_restore_index_levels(result) + + return result + + def _indicator_pre_merge( + self, left: "DataFrame", right: "DataFrame" + ) -> Tuple["DataFrame", "DataFrame"]: + + columns = left.columns.union(right.columns) + + for i in ["_left_indicator", "_right_indicator"]: + if i in columns: + raise ValueError( + "Cannot use `indicator=True` option when " + "data contains a column named {name}".format(name=i) + ) + if self.indicator_name in columns: + raise ValueError( + "Cannot use name of an existing column for indicator column" + ) + + left = left.copy() + right = right.copy() + + left["_left_indicator"] = 1 + left["_left_indicator"] = left["_left_indicator"].astype("int8") + + right["_right_indicator"] = 2 + right["_right_indicator"] = right["_right_indicator"].astype("int8") + + return left, right + + def _indicator_post_merge(self, result): + + result["_left_indicator"] = result["_left_indicator"].fillna(0) + result["_right_indicator"] = result["_right_indicator"].fillna(0) + + result[self.indicator_name] = Categorical( + (result["_left_indicator"] + result["_right_indicator"]), + categories=[1, 2, 3], + ) + result[self.indicator_name] = result[self.indicator_name].cat.rename_categories( + ["left_only", "right_only", "both"] + ) + + result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) + return result + + def _maybe_restore_index_levels(self, result): + """ + Restore index levels specified as `on` parameters + + Here we check for cases where `self.left_on` and `self.right_on` pairs + each reference an index level in their respective DataFrames. The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. 
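# --- Hedged usage sketch (editorial example, not part of the patch) ---------
# _maybe_restore_index_levels(): when left_on/right_on each name an index
# level present on both frames, the joined key is moved back into the
# result's index rather than left as a column. Example data is illustrative.
import pandas as pd

left = pd.DataFrame({"v1": [1, 2]}, index=pd.Index(["a", "b"], name="key"))
right = pd.DataFrame({"v2": [3, 4]}, index=pd.Index(["a", "b"], name="key"))

merged = pd.merge(left, right, left_on="key", right_on="key")
print(merged.index.name)  # 'key' is restored to the index of the result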
It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + + Returns + ------- + None + """ + names_to_restore = [] + for name, left_key, right_key in zip( + self.join_names, self.left_on, self.right_on + ): + if ( + self.orig_left._is_level_reference(left_key) + and self.orig_right._is_level_reference(right_key) + and name not in result.index.names + ): + + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): + + left_has_missing = None + right_has_missing = None + + keys = zip(self.join_names, self.left_on, self.right_on) + for i, (name, lname, rname) in enumerate(keys): + if not _should_fill(lname, rname): + continue + + take_left, take_right = None, None + + if name in result: + + if left_indexer is not None and right_indexer is not None: + if name in self.left: + + if left_has_missing is None: + left_has_missing = (left_indexer == -1).any() + + if left_has_missing: + take_right = self.right_join_keys[i] + + if not is_dtype_equal( + result[name].dtype, self.left[name].dtype + ): + take_left = self.left[name]._values + + elif name in self.right: + + if right_has_missing is None: + right_has_missing = (right_indexer == -1).any() + + if right_has_missing: + take_left = self.left_join_keys[i] + + if not is_dtype_equal( + result[name].dtype, self.right[name].dtype + ): + take_right = self.right[name]._values + + elif left_indexer is not None and is_array_like(self.left_join_keys[i]): + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] + + if take_left is not None or take_right is not None: + + if take_left is None: + lvals = result[name]._values + else: + lfill = na_value_for_dtype(take_left.dtype) + lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) + + if take_right is None: + rvals = result[name]._values + else: + rfill = na_value_for_dtype(take_right.dtype) + rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): + key_col = rvals + else: + key_col = Index(lvals).where(~mask, rvals) + + if result._is_label_reference(name): + result[name] = key_col + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + key_col.name = name + idx_list = [ + result.index.get_level_values(level_name) + if level_name != name + else key_col + for level_name in result.index.names + ] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) + else: + result.insert(i, name or "key_{i}".format(i=i), key_col) + + def _get_join_indexers(self): + """ return the join indexers """ + return _get_join_indexers( + self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how + ) + + def _get_join_info(self): + left_ax = self.left._data.axes[self.axis] + right_ax = self.right._data.axes[self.axis] + + if self.left_index and self.right_index and self.how != "asof": + join_index, left_indexer, right_indexer = left_ax.join( + right_ax, how=self.how, return_indexers=True, sort=self.sort + ) + elif self.right_index and self.how == "left": + join_index, left_indexer, right_indexer = _left_join_on_index( + left_ax, right_ax, self.left_join_keys, sort=self.sort + ) + + elif self.left_index and self.how == "right": + join_index, right_indexer, left_indexer = _left_join_on_index( + right_ax, left_ax, 
self.right_join_keys, sort=self.sort + ) + else: + (left_indexer, right_indexer) = self._get_join_indexers() + + if self.right_index: + if len(self.left) > 0: + join_index = self._create_join_index( + self.left.index, + self.right.index, + left_indexer, + right_indexer, + how="right", + ) + else: + join_index = self.right.index.take(right_indexer) + left_indexer = np.array([-1] * len(join_index)) + elif self.left_index: + if len(self.right) > 0: + join_index = self._create_join_index( + self.right.index, + self.left.index, + right_indexer, + left_indexer, + how="left", + ) + else: + join_index = self.left.index.take(left_indexer) + right_indexer = np.array([-1] * len(join_index)) + else: + join_index = Index(np.arange(len(left_indexer))) + + if len(join_index) == 0: + join_index = join_index.astype(object) + return join_index, left_indexer, right_indexer + + def _create_join_index( + self, + index: Index, + other_index: Index, + indexer, + other_indexer, + how: str = "left", + ): + """ + Create a join index by rearranging one index to match another + + Parameters + ---------- + index: Index being rearranged + other_index: Index used to supply values not found in index + indexer: how to rearrange index + how: replacement is only necessary if indexer based on other_index + + Returns + ------- + join_index + """ + if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index. So, we set the last + # element to be the desired fill value. We do not use allow_fill + # and fill_value because it throws a ValueError on integer indices + mask = indexer == -1 + if np.any(mask): + fill_value = na_value_for_dtype(index.dtype, compat=False) + index = index.append(Index([fill_value])) + return index.take(indexer) + + def _get_merge_keys(self): + """ + Note: has side effects (copy/delete key columns) + + Parameters + ---------- + left + right + on + + Returns + ------- + left_keys, right_keys + """ + left_keys = [] + right_keys = [] + join_names = [] + right_drop = [] + left_drop = [] + + left, right = self.left, self.right + + is_lkey = lambda x: is_array_like(x) and len(x) == len(left) + is_rkey = lambda x: is_array_like(x) and len(x) == len(right) + + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A + # user could, for example, request 'left_index' and 'left_by'. In a + # regular pd.merge(), users cannot specify both 'left_index' and + # 'left_on'. (Instead, users have a MultiIndex). That means the + # self.left_on in this function is always empty in a pd.merge(), but + # a pd.merge_asof(left_index=True, left_by=...) will result in a + # self.left_on array with a None in the middle of it. This requires + # a work-around as designated in the code below. + # See _validate_specification() for where this happens. + + # ugh, spaghetti re #733 + if _any(self.left_on) and _any(self.right_on): + for lk, rk in zip(self.left_on, self.right_on): + if is_lkey(lk): + left_keys.append(lk) + if is_rkey(rk): + right_keys.append(rk) + join_names.append(None) # what to do? 
+ else: + if rk is not None: + right_keys.append(right._get_label_or_level_values(rk)) + join_names.append(rk) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + join_names.append(right.index.name) + else: + if not is_rkey(rk): + if rk is not None: + right_keys.append(right._get_label_or_level_values(rk)) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index) + if lk is not None and lk == rk: + # avoid key upcast in corner case (length-0) + if len(left) > 0: + right_drop.append(rk) + else: + left_drop.append(lk) + else: + right_keys.append(rk) + if lk is not None: + left_keys.append(left._get_label_or_level_values(lk)) + join_names.append(lk) + else: + # work-around for merge_asof(left_index=True) + left_keys.append(left.index) + join_names.append(left.index.name) + elif _any(self.left_on): + for k in self.left_on: + if is_lkey(k): + left_keys.append(k) + join_names.append(None) + else: + left_keys.append(left._get_label_or_level_values(k)) + join_names.append(k) + if isinstance(self.right.index, MultiIndex): + right_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.right.index.levels, self.right.index.codes + ) + ] + else: + right_keys = [self.right.index._values] + elif _any(self.right_on): + for k in self.right_on: + if is_rkey(k): + right_keys.append(k) + join_names.append(None) + else: + right_keys.append(right._get_label_or_level_values(k)) + join_names.append(k) + if isinstance(self.left.index, MultiIndex): + left_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.left.index.levels, self.left.index.codes + ) + ] + else: + left_keys = [self.left.index._values] + + if left_drop: + self.left = self.left._drop_labels_or_levels(left_drop) + + if right_drop: + self.right = self.right._drop_labels_or_levels(right_drop) + + return left_keys, right_keys, join_names + + def _maybe_coerce_merge_keys(self): + # we have valid mergees but we may have to further + # coerce these if they are originally incompatible types + # + # for example if these are categorical, but are not dtype_equal + # or if we have object and integer dtypes + + for lk, rk, name in zip( + self.left_join_keys, self.right_join_keys, self.join_names + ): + if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): + continue + + lk_is_cat = is_categorical_dtype(lk) + rk_is_cat = is_categorical_dtype(rk) + lk_is_object = is_object_dtype(lk) + rk_is_object = is_object_dtype(rk) + + # if either left or right is a categorical + # then the must match exactly in categories & ordered + if lk_is_cat and rk_is_cat: + if lk.is_dtype_equal(rk): + continue + + elif lk_is_cat or rk_is_cat: + pass + + elif is_dtype_equal(lk.dtype, rk.dtype): + continue + + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype) + ) + + # if we are numeric, then allow differing + # kinds to proceed, eg. 
int64 and int8, int and float + # further if we are object, but we infer to + # the same, then proceed + if is_numeric_dtype(lk) and is_numeric_dtype(rk): + if lk.dtype.kind == rk.dtype.kind: + continue + + # check whether ints and floats + elif is_integer_dtype(rk) and is_float_dtype(lk): + if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) + continue + + elif is_float_dtype(rk) and is_integer_dtype(lk): + if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) + continue + + # let's infer and see if we are ok + elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype( + rk, skipna=False + ): + continue + + # Check if we are trying to merge on obviously + # incompatible dtypes GH 9780, GH 15800 + + # bool values are coerced to object + elif (lk_is_object and is_bool_dtype(rk)) or ( + is_bool_dtype(lk) and rk_is_object + ): + pass + + # object values are allowed to be merged + elif (lk_is_object and is_numeric_dtype(rk)) or ( + is_numeric_dtype(lk) and rk_is_object + ): + inferred_left = lib.infer_dtype(lk, skipna=False) + inferred_right = lib.infer_dtype(rk, skipna=False) + bool_types = ["integer", "mixed-integer", "boolean", "empty"] + string_types = ["string", "unicode", "mixed", "bytes", "empty"] + + # inferred bool + if inferred_left in bool_types and inferred_right in bool_types: + pass + + # unless we are merging non-string-like with string-like + elif ( + inferred_left in string_types and inferred_right not in string_types + ) or ( + inferred_right in string_types and inferred_left not in string_types + ): + raise ValueError(msg) + + # datetimelikes must match exactly + elif needs_i8_conversion(lk) and not needs_i8_conversion(rk): + raise ValueError(msg) + elif not needs_i8_conversion(lk) and needs_i8_conversion(rk): + raise ValueError(msg) + elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk): + raise ValueError(msg) + elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): + raise ValueError(msg) + + elif lk_is_object and rk_is_object: + continue + + # Houston, we have a problem! + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the category + # dtype. If we coerced categories to object, + # then we would lose type information on some + # columns, and end up trying to merge + # incompatible dtypes. See GH 16900. + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign(**{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign(**{name: self.right[name].astype(typ)}) + + def _validate_specification(self): + # Hm, any way to make this logic less complicated?? 
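# --- Hedged usage sketch (editorial example, not part of the patch) ---------
# _maybe_coerce_merge_keys() above rejects clearly incompatible key dtypes
# (GH 9780 / GH 15800), e.g. numeric keys on one side and string keys on the
# other. Example data is illustrative only.
import pandas as pd

left = pd.DataFrame({"k": [1, 2, 3], "x": list("abc")})         # int64 key
right = pd.DataFrame({"k": ["1", "2", "3"], "y": list("def")})  # object key

try:
    pd.merge(left, right, on="k")
except ValueError as err:
    print(err)  # "You are trying to merge on int64 and object columns..."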
+ if self.on is None and self.left_on is None and self.right_on is None: + + if self.left_index and self.right_index: + self.left_on, self.right_on = (), () + elif self.left_index: + if self.right_on is None: + raise MergeError("Must pass right_on or right_index=True") + elif self.right_index: + if self.left_on is None: + raise MergeError("Must pass left_on or left_index=True") + else: + # use the common columns + common_cols = self.left.columns.intersection(self.right.columns) + if len(common_cols) == 0: + raise MergeError( + "No common columns to perform merge on. " + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=self.left_on, + ron=self.right_on, + lidx=self.left_index, + ridx=self.right_index, + ) + ) + if not common_cols.is_unique: + raise MergeError(f"Data columns not unique: {repr(common_cols)}") + self.left_on = self.right_on = common_cols + elif self.on is not None: + if self.left_on is not None or self.right_on is not None: + raise MergeError( + 'Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.' + ) + self.left_on = self.right_on = self.on + elif self.left_on is not None: + n = len(self.left_on) + if self.right_index: + if len(self.left_on) != self.right.index.nlevels: + raise ValueError( + "len(left_on) must equal the number " + 'of levels in the index of "right"' + ) + self.right_on = [None] * n + elif self.right_on is not None: + n = len(self.right_on) + if self.left_index: + if len(self.right_on) != self.left.index.nlevels: + raise ValueError( + "len(right_on) must equal the number " + 'of levels in the index of "left"' + ) + self.left_on = [None] * n + if len(self.right_on) != len(self.left_on): + raise ValueError("len(right_on) must equal len(left_on)") + + def _validate(self, validate: str): + + # Check uniqueness of each + if self.left_index: + left_unique = self.orig_left.index.is_unique + else: + left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique + + if self.right_index: + right_unique = self.orig_right.index.is_unique + else: + right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique + + # Check data integrity + if validate in ["one_to_one", "1:1"]: + if not left_unique and not right_unique: + raise MergeError( + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" + ) + elif not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; " + "not a one-to-one merge" + ) + elif not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; " + "not a one-to-one merge" + ) + + elif validate in ["one_to_many", "1:m"]: + if not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; " + "not a one-to-many merge" + ) + + elif validate in ["many_to_one", "m:1"]: + if not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" + ) + + elif validate in ["many_to_many", "m:m"]: + pass + + else: + raise ValueError("Not a valid argument for validate") + + +def _get_join_indexers( + left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs +): + """ + + Parameters + ---------- + left_keys: ndarray, Index, Series + right_keys: ndarray, Index, Series + sort: bool, default False + how: string {'inner', 'outer', 'left', 'right'}, default 'inner' + + Returns + ------- + tuple of (left_indexer, right_indexer) + indexers into the left_keys, right_keys + + """ + assert len(left_keys) == len( + 
right_keys + ), "left_key and right_keys must be the same length" + + # get left & right join labels and num. of levels at each location + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = [list(x) for x in zipped] + + # get flat i8 keys from label lists + lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + + # factorize keys to a dense i8 space + # `count` is the num. of unique keys + # set(lkey) | set(rkey) == range(count) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + + # preserve left frame order if how == 'left' and sort == False + kwargs = copy.copy(kwargs) + if how == "left": + kwargs["sort"] = sort + join_func = _join_functions[how] + + return join_func(lkey, rkey, count, **kwargs) + + +def _restore_dropped_levels_multijoin( + left: MultiIndex, + right: MultiIndex, + dropped_level_names, + join_index, + lindexer, + rindexer, +): + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multi-index to multi-index join. + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. + The method relies on lidx, rindexer which hold the index positions of + left and right, where a join was feasible + + Parameters + ---------- + left : MultiIndex + left index + right : MultiIndex + right index + dropped_level_names : str array + list of non-common level names + join_index : MultiIndex + the index of the join between the + common levels of left and right + lindexer : intp array + left indexer + rindexer : intp array + right indexer + + Returns + ------- + levels : list of Index + levels of combined multiindexes + labels : intp array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + def _convert_to_mulitindex(index) -> MultiIndex: + if isinstance(index, MultiIndex): + return index + else: + return MultiIndex.from_arrays([index.values], names=[index.name]) + + # For multi-multi joins with one overlapping level, + # the returned index if of type Index + # Assure that join_index is of type MultiIndex + # so that dropped levels can be appended + join_index = _convert_to_mulitindex(join_index) + + join_levels = join_index.levels + join_codes = join_index.codes + join_names = join_index.names + + # lindexer and rindexer hold the indexes where the join occurred + # for left and right respectively. 
If left/right is None then + # the join occurred on all indices of left/right + if lindexer is None: + lindexer = range(left.size) + + if rindexer is None: + rindexer = range(right.size) + + # Iterate through the levels that must be restored + for dropped_level_name in dropped_level_names: + if dropped_level_name in left.names: + idx = left + indexer = lindexer + else: + idx = right + indexer = rindexer + + # The index of the level name to be restored + name_idx = idx.names.index(dropped_level_name) + + restore_levels = idx.levels[name_idx] + # Inject -1 in the codes list where a join was not possible + # IOW indexer[i]=-1 + codes = idx.codes[name_idx] + restore_codes = algos.take_nd(codes, indexer, fill_value=-1) + + join_levels = join_levels + [restore_levels] + join_codes = join_codes + [restore_codes] + join_names = join_names + [dropped_level_name] + + return join_levels, join_codes, join_names + + +class _OrderedMerge(_MergeOperation): + _merge_type = "ordered_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index: bool = False, + right_index: bool = False, + axis=1, + suffixes=("_x", "_y"), + copy: bool = True, + fill_method=None, + how: str = "outer", + ): + + self.fill_method = fill_method + _MergeOperation.__init__( + self, + left, + right, + on=on, + left_on=left_on, + left_index=left_index, + right_index=right_index, + right_on=right_on, + axis=axis, + how=how, + suffixes=suffixes, + sort=True, # factorize sorts + ) + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) + + if self.fill_method == "ffill": + left_join_indexer = libjoin.ffill_indexer(left_indexer) + right_join_indexer = libjoin.ffill_indexer(right_indexer) + else: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, + copy=self.copy, + ) + + typ = self.left._constructor + result = typ(result_data).__finalize__(self, method=self._merge_type) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + +def _asof_function(direction: str): + name = "asof_join_{dir}".format(dir=direction) + return getattr(libjoin, name, None) + + +def _asof_by_function(direction: str): + name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) + return getattr(libjoin, name, None) + + +_type_casters = { + "int64_t": ensure_int64, + "double": ensure_float64, + "object": ensure_object, +} + + +def _get_cython_type_upcast(dtype): + """ Upcast a dtype to 'int64_t', 'double', or 'object' """ + if is_integer_dtype(dtype): + return "int64_t" + elif is_float_dtype(dtype): + return "double" + else: + return "object" + + +class _AsOfMerge(_OrderedMerge): + _merge_type = "asof_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index: bool = False, + right_index: bool = False, + by=None, + left_by=None, + right_by=None, + axis=1, + suffixes=("_x", "_y"), + copy: bool = True, + fill_method=None, + how: str = "asof", + tolerance=None, + allow_exact_matches: 
bool = True, + direction: str = "backward", + ): + + self.by = by + self.left_by = left_by + self.right_by = right_by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches + self.direction = direction + + _OrderedMerge.__init__( + self, + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + axis=axis, + how=how, + suffixes=suffixes, + fill_method=fill_method, + ) + + def _validate_specification(self): + super()._validate_specification() + + # we only allow on to be a single item for on + if len(self.left_on) != 1 and not self.left_index: + raise MergeError("can only asof on a key for left") + + if len(self.right_on) != 1 and not self.right_index: + raise MergeError("can only asof on a key for right") + + if self.left_index and isinstance(self.left.index, MultiIndex): + raise MergeError("left can only have one index") + + if self.right_index and isinstance(self.right.index, MultiIndex): + raise MergeError("right can only have one index") + + # set 'by' columns + if self.by is not None: + if self.left_by is not None or self.right_by is not None: + raise MergeError("Can only pass by OR left_by and right_by") + self.left_by = self.right_by = self.by + if self.left_by is None and self.right_by is not None: + raise MergeError("missing left_by") + if self.left_by is not None and self.right_by is None: + raise MergeError("missing right_by") + + # add 'by' to our key-list so we can have it in the + # output as a key + if self.left_by is not None: + if not is_list_like(self.left_by): + self.left_by = [self.left_by] + if not is_list_like(self.right_by): + self.right_by = [self.right_by] + + if len(self.left_by) != len(self.right_by): + raise MergeError("left_by and right_by must be same length") + + self.left_on = self.left_by + list(self.left_on) + self.right_on = self.right_by + list(self.right_on) + + # check 'direction' is valid + if self.direction not in ["backward", "forward", "nearest"]: + raise MergeError( + "direction invalid: {direction}".format(direction=self.direction) + ) + + @property + def _asof_key(self): + """ This is our asof key, the 'on' """ + return self.left_on[-1] + + def _get_merge_keys(self): + + # note this function has side effects + (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() + + # validate index types are the same + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + if not is_dtype_equal(lk.dtype, rk.dtype): + if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): + # The generic error message is confusing for categoricals. + # + # In this function, the join keys include both the original + # ones of the merge_asof() call, and also the keys passed + # to its by= argument. Unordered but equal categories + # are not supported for the former, but will fail + # later with a ValueError, so we don't *need* to check + # for them here. 
+ msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, both sides category, but not equal ones".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) + else: + msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) + raise MergeError(msg) + + # validate tolerance; datetime.timedelta or Timedelta if we have a DTI + if self.tolerance is not None: + + if self.left_index: + lt = self.left.index + else: + lt = left_join_keys[-1] + + msg = ( + "incompatible tolerance {tolerance}, must be compat " + "with type {lkdtype}".format( + tolerance=type(self.tolerance), lkdtype=repr(lt.dtype) + ) + ) + + if needs_i8_conversion(lt): + if not isinstance(self.tolerance, datetime.timedelta): + raise MergeError(msg) + if self.tolerance < Timedelta(0): + raise MergeError("tolerance must be positive") + + elif is_integer_dtype(lt): + if not is_integer(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + elif is_float_dtype(lt): + if not is_number(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + else: + raise MergeError("key must be integer, timestamp or float") + + # validate allow_exact_matches + if not is_bool(self.allow_exact_matches): + msg = "allow_exact_matches must be boolean, passed {passed}" + raise MergeError(msg.format(passed=self.allow_exact_matches)) + + return left_join_keys, right_join_keys, join_names + + def _get_join_indexers(self): + """ return the join indexers """ + + def flip(xs): + """ unlike np.transpose, this returns an array of tuples """ + xs = [ + x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs + ] + labels = list(string.ascii_lowercase[: len(xs)]) + dtypes = [x.dtype for x in xs] + labeled_dtypes = list(zip(labels, dtypes)) + return np.array(list(zip(*xs)), labeled_dtypes) + + # values to compare + left_values = ( + self.left.index.values if self.left_index else self.left_join_keys[-1] + ) + right_values = ( + self.right.index.values if self.right_index else self.right_join_keys[-1] + ) + tolerance = self.tolerance + + # we require sortedness and non-null values in the join keys + msg_sorted = "{side} keys must be sorted" + msg_missings = "Merge keys contain null values on {side} side" + + if not Index(left_values).is_monotonic: + if isna(left_values).any(): + raise ValueError(msg_missings.format(side="left")) + else: + raise ValueError(msg_sorted.format(side="left")) + + if not Index(right_values).is_monotonic: + if isna(right_values).any(): + raise ValueError(msg_missings.format(side="right")) + else: + raise ValueError(msg_sorted.format(side="right")) + + # initial type conversion as needed + if needs_i8_conversion(left_values): + left_values = left_values.view("i8") + right_values = right_values.view("i8") + if tolerance is not None: + tolerance = Timedelta(tolerance) + tolerance = tolerance.value + + # a "by" parameter requires special handling + if self.left_by is not None: + # remove 'on' parameter from values if one existed + if self.left_index and self.right_index: + left_by_values = self.left_join_keys + right_by_values = self.right_join_keys + else: + left_by_values = self.left_join_keys[0:-1] + right_by_values = self.right_join_keys[0:-1] + + # get tuple representation of values if more than one + if len(left_by_values) == 1: + left_by_values = left_by_values[0] + 
right_by_values = right_by_values[0] + else: + left_by_values = flip(left_by_values) + right_by_values = flip(right_by_values) + + # upcast 'by' parameter because HashTable is limited + by_type = _get_cython_type_upcast(left_by_values.dtype) + by_type_caster = _type_casters[by_type] + left_by_values = by_type_caster(left_by_values) + right_by_values = by_type_caster(right_by_values) + + # choose appropriate function by type + func = _asof_by_function(self.direction) + return func( + left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance, + ) + else: + # choose appropriate function by type + func = _asof_function(self.direction) + return func(left_values, right_values, self.allow_exact_matches, tolerance) + + +def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): + + # left & right join labels and num. of levels at each location + mapped = ( + _factorize_keys(index.levels[n], join_keys[n], sort=sort) + for n in range(index.nlevels) + ) + zipped = zip(*mapped) + rcodes, lcodes, shape = [list(x) for x in zipped] + if sort: + rcodes = list(map(np.take, rcodes, index.codes)) + else: + i8copy = lambda a: a.astype("i8", subok=False, copy=True) + rcodes = list(map(i8copy, index.codes)) + + # fix right labels if there were any nulls + for i in range(len(join_keys)): + mask = index.codes[i] == -1 + if mask.any(): + # check if there already was any nulls at this location + # if there was, it is factorized to `shape[i] - 1` + a = join_keys[i][lcodes[i] == shape[i] - 1] + if a.size == 0 or not a[0] != a[0]: + shape[i] += 1 + + rcodes[i][mask] = shape[i] - 1 + + # get flat i8 join keys + lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) + + # factorize keys to a dense i8 space + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + + return libjoin.left_outer_join(lkey, rkey, count, sort=sort) + + +def _get_single_indexer(join_key, index, sort: bool = False): + left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) + + left_indexer, right_indexer = libjoin.left_outer_join( + ensure_int64(left_key), ensure_int64(right_key), count, sort=sort + ) + + return left_indexer, right_indexer + + +def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): + if len(join_keys) > 1: + if not ( + (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) + ): + raise AssertionError( + "If more than one join key is given then " + "'right_ax' must be a MultiIndex and the " + "number of join keys must be the number of " + "levels in right_ax" + ) + + left_indexer, right_indexer = _get_multiindex_indexer( + join_keys, right_ax, sort=sort + ) + else: + jkey = join_keys[0] + + left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) + + if sort or len(left_ax) != len(left_indexer): + # if asked to sort or there are 1-to-many matches + join_index = left_ax.take(left_indexer) + return join_index, left_indexer, right_indexer + + # left frame preserves order & length of its index + return left_ax, None, right_indexer + + +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + + +_join_functions = { + "inner": libjoin.inner_join, + "left": libjoin.left_outer_join, + "right": _right_outer_join, + "outer": libjoin.full_outer_join, +} + + +def _factorize_keys(lk, rk, sort=True): + # Some pre-processing for non-ndarray lk / rk + if is_datetime64tz_dtype(lk) and 
is_datetime64tz_dtype(rk): + lk = getattr(lk, "_values", lk)._data + rk = getattr(rk, "_values", rk)._data + + elif ( + is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + ): + if lk.categories.equals(rk.categories): + # if we exactly match in categories, allow us to factorize on codes + rk = rk.codes + else: + # Same categories in different orders -> recode + rk = _recode_for_categories(rk.codes, rk.categories, lk.categories) + + lk = ensure_int64(lk.codes) + rk = ensure_int64(rk) + + elif ( + is_extension_array_dtype(lk.dtype) + and is_extension_array_dtype(rk.dtype) + and lk.dtype == rk.dtype + ): + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() + + if is_integer_dtype(lk) and is_integer_dtype(rk): + # GH#23917 TODO: needs tests for case where lk is integer-dtype + # and rk is datetime-dtype + klass = libhashtable.Int64Factorizer + lk = ensure_int64(com.values_from_object(lk)) + rk = ensure_int64(com.values_from_object(rk)) + elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( + rk.dtype.type, (np.timedelta64, np.datetime64) + ): + # GH#23917 TODO: Needs tests for non-matching dtypes + klass = libhashtable.Int64Factorizer + lk = ensure_int64(com.values_from_object(lk)) + rk = ensure_int64(com.values_from_object(rk)) + else: + klass = libhashtable.Factorizer + lk = ensure_object(lk) + rk = ensure_object(rk) + + rizer = klass(max(len(lk), len(rk))) + + llab = rizer.factorize(lk) + rlab = rizer.factorize(rk) + + count = rizer.get_count() + + if sort: + uniques = rizer.uniques.to_array() + llab, rlab = _sort_labels(uniques, llab, rlab) + + # NA group + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + + return llab, rlab, count + + +def _sort_labels(uniques: np.ndarray, left, right): + if not isinstance(uniques, np.ndarray): + # tuplesafe + uniques = Index(uniques).values + + llength = len(left) + labels = np.concatenate([left, right]) + + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = ensure_int64(new_labels) + new_left, new_right = new_labels[:llength], new_labels[llength:] + + return new_left, new_right + + +def _get_join_keys(llab, rlab, shape, sort: bool): + + # how many levels can be done without overflow + pred = lambda i: not is_int64_overflow_possible(shape[:i]) + nlev = next(filter(pred, range(len(shape), 0, -1))) + + # get keys for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype="i8") + lkey = stride * llab[0].astype("i8", subok=False, copy=False) + rkey = stride * rlab[0].astype("i8", subok=False, copy=False) + + for i in range(1, nlev): + with np.errstate(divide="ignore"): + stride //= shape[i] + lkey += llab[i] * stride + rkey += rlab[i] * stride + + if nlev == len(shape): # all done! 
+ return lkey, rkey + + # densify current keys to avoid overflow + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + + llab = [lkey] + llab[nlev:] + rlab = [rkey] + rlab[nlev:] + shape = [count] + shape[nlev:] + + return _get_join_keys(llab, rlab, shape, sort) + + +def _should_fill(lname, rname) -> bool: + if not isinstance(lname, str) or not isinstance(rname, str): + return True + return lname == rname + + +def _any(x) -> bool: + return x is not None and com.any_not_none(*x) + + +def _validate_operand(obj: FrameOrSeries) -> "DataFrame": + if isinstance(obj, ABCDataFrame): + return obj + elif isinstance(obj, ABCSeries): + if obj.name is None: + raise ValueError("Cannot merge a Series without a name") + else: + return obj.to_frame() + else: + raise TypeError( + "Can only merge Series or DataFrame objects, " + "a {obj} was passed".format(obj=type(obj)) + ) + + +def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + + if not lsuffix and not rsuffix: + raise ValueError( + "columns overlap but no suffix specified: " + "{rename}".format(rename=to_rename) + ) + + def renamer(x, suffix): + """ + Rename the left and right indices. + + If there is overlap, and suffix is not None, add + suffix, otherwise, leave it as-is. + + Parameters + ---------- + x : original column name + suffix : str or None + + Returns + ------- + x : renamed column name + """ + if x in to_rename and suffix is not None: + return "{x}{suffix}".format(x=x, suffix=suffix) + return x + + lrenamer = partial(renamer, suffix=lsuffix) + rrenamer = partial(renamer, suffix=rsuffix) + + return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) diff --git a/venv/Lib/site-packages/pandas/core/reshape/pivot.py b/venv/Lib/site-packages/pandas/core/reshape/pivot.py new file mode 100644 index 0000000..b443ba1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/pivot.py @@ -0,0 +1,705 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Union + +import numpy as np + +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +import pandas.core.common as com +from pandas.core.frame import _shared_docs +from pandas.core.groupby import Grouper +from pandas.core.indexes.api import Index, MultiIndex, get_objs_combined_axis +from pandas.core.reshape.concat import concat +from pandas.core.reshape.util import cartesian_product +from pandas.core.series import Series + +if TYPE_CHECKING: + from pandas import DataFrame + + +# Note: We need to make sure `frame` is imported before `pivot`, otherwise +# _shared_docs['pivot_table'] will not yet exist. 
TODO: Fix this dependency +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot_table"], indents=1) +def pivot_table( + data, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, +) -> "DataFrame": + index = _convert_by(index) + columns = _convert_by(columns) + + if isinstance(aggfunc, list): + pieces: List[DataFrame] = [] + keys = [] + for func in aggfunc: + table = pivot_table( + data, + values=values, + index=index, + columns=columns, + fill_value=fill_value, + aggfunc=func, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) + pieces.append(table) + keys.append(getattr(func, "__name__", func)) + + return concat(pieces, keys=keys, axis=1) + + keys = index + columns + + values_passed = values is not None + if values_passed: + if is_list_like(values): + values_multi = True + values = list(values) + else: + values_multi = False + values = [values] + + # GH14938 Make sure value labels are in data + for i in values: + if i not in data: + raise KeyError(i) + + to_filter = [] + for x in keys + values: + if isinstance(x, Grouper): + x = x.key + try: + if x in data: + to_filter.append(x) + except TypeError: + pass + if len(to_filter) < len(data.columns): + data = data[to_filter] + + else: + values = data.columns + for key in keys: + try: + values = values.drop(key) + except (TypeError, ValueError, KeyError): + pass + values = list(values) + + grouped = data.groupby(keys, observed=observed) + agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how="all") + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in values: + if ( + v in data + and is_integer_dtype(data[v]) + and v in agged + and not is_integer_dtype(agged[v]) + ): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) + + table = agged + if table.index.nlevels > 1: + # Related GH #17123 + # If index_names are integers, determine whether the integers refer + # to the level position or name. 
+ index_names = agged.index.names[: len(index)] + to_unstack = [] + for i in range(len(index), len(keys)): + name = agged.index.names[i] + if name is None or name in index_names: + to_unstack.append(i) + else: + to_unstack.append(name) + table = agged.unstack(to_unstack) + + if not dropna: + if table.index.nlevels > 1: + m = MultiIndex.from_arrays( + cartesian_product(table.index.levels), names=table.index.names + ) + table = table.reindex(m, axis=0) + + if table.columns.nlevels > 1: + m = MultiIndex.from_arrays( + cartesian_product(table.columns.levels), names=table.columns.names + ) + table = table.reindex(m, axis=1) + + if isinstance(table, ABCDataFrame): + table = table.sort_index(axis=1) + + if fill_value is not None: + table = table._ensure_type(table.fillna(fill_value, downcast="infer")) + + if margins: + if dropna: + data = data[data.notna().all(axis=1)] + table = _add_margins( + table, + data, + values, + rows=index, + cols=columns, + aggfunc=aggfunc, + observed=dropna, + margins_name=margins_name, + fill_value=fill_value, + ) + + # discard the top level + if ( + values_passed + and not values_multi + and not table.empty + and (table.columns.nlevels > 1) + ): + table = table[values[0]] + + if len(index) == 0 and len(columns) > 0: + table = table.T + + # GH 15193 Make sure empty columns are removed if dropna=True + if isinstance(table, ABCDataFrame) and dropna: + table = table.dropna(how="all", axis=1) + + return table + + +def _add_margins( + table: Union["Series", "DataFrame"], + data, + values, + rows, + cols, + aggfunc, + observed=None, + margins_name: str = "All", + fill_value=None, +): + if not isinstance(margins_name, str): + raise ValueError("margins_name argument must be a string") + + msg = 'Conflicting name "{name}" in margins'.format(name=margins_name) + for level in table.index.names: + if margins_name in table.index.get_level_values(level): + raise ValueError(msg) + + grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + + if table.ndim == 2: + # i.e. DataFramae + for level in table.columns.names[1:]: + if margins_name in table.columns.get_level_values(level): + raise ValueError(msg) + + key: Union[str, Tuple[str, ...]] + if len(rows) > 1: + key = (margins_name,) + ("",) * (len(rows) - 1) + else: + key = margins_name + + if not values and isinstance(table, ABCSeries): + # If there are no values and the table is a series, then there is only + # one column in the data. Compute grand margin and return it. 
+ return table.append(Series({key: grand_margin[margins_name]})) + + elif values: + marginal_result_set = _generate_marginal_results( + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name, + ) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + else: + # no values, and table is a DataFrame + assert isinstance(table, ABCDataFrame) + marginal_result_set = _generate_marginal_results_without_values( + table, data, rows, cols, aggfunc, observed, margins_name + ) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + + row_margin = row_margin.reindex(result.columns, fill_value=fill_value) + # populate grand margin + for k in margin_keys: + if isinstance(k, str): + row_margin[k] = grand_margin[k] + else: + row_margin[k] = grand_margin[k[0]] + + from pandas import DataFrame + + margin_dummy = DataFrame(row_margin, columns=[key]).T + + row_names = result.index.names + try: + # check the result column and leave floats + for dtype in set(result.dtypes): + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result.append(margin_dummy) + except TypeError: + + # we cannot reshape, so coerce the axis + result.index = result.index._to_safe_for_reshape() + result = result.append(margin_dummy) + result.index.names = row_names + + return result + + +def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): + + if values: + grand_margin = {} + for k, v in data[values].items(): + try: + if isinstance(aggfunc, str): + grand_margin[k] = getattr(v, aggfunc)() + elif isinstance(aggfunc, dict): + if isinstance(aggfunc[k], str): + grand_margin[k] = getattr(v, aggfunc[k])() + else: + grand_margin[k] = aggfunc[k](v) + else: + grand_margin[k] = aggfunc(v) + except TypeError: + pass + return grand_margin + else: + return {margins_name: aggfunc(data.index)} + + +def _generate_marginal_results( + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name: str = "All", +): + if len(cols) > 0: + # need to "interleave" the margins + table_pieces = [] + margin_keys = [] + + def _all_key(key): + return (key, margins_name) + ("",) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + cat_axis = 1 + + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): + all_key = _all_key(key) + + # we are going to mutate this, so need to copy! 
+ piece = piece.copy() + try: + piece[all_key] = margin[key] + except TypeError: + + # we cannot reshape, so coerce the axis + piece.set_axis( + piece._get_axis(cat_axis)._to_safe_for_reshape(), + axis=cat_axis, + inplace=True, + ) + piece[all_key] = margin[key] + + table_pieces.append(piece) + margin_keys.append(all_key) + else: + margin = grand_margin + cat_axis = 0 + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): + all_key = _all_key(key) + table_pieces.append(piece) + table_pieces.append(Series(margin[key], index=[all_key])) + margin_keys.append(all_key) + + result = concat(table_pieces, axis=cat_axis) + + if len(rows) == 0: + return result + else: + result = table + margin_keys = table.columns + + if len(cols) > 0: + row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) + row_margin = row_margin.stack() + + # slight hack + new_order = [len(cols)] + list(range(len(cols))) + row_margin.index = row_margin.index.reorder_levels(new_order) + else: + row_margin = Series(np.nan, index=result.columns) + + return result, margin_keys, row_margin + + +def _generate_marginal_results_without_values( + table: "DataFrame", data, rows, cols, aggfunc, observed, margins_name: str = "All" +): + if len(cols) > 0: + # need to "interleave" the margins + margin_keys = [] + + def _all_key(): + if len(cols) == 1: + return margins_name + return (margins_name,) + ("",) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + + else: + margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + return result + else: + result = table + margin_keys = table.columns + + if len(cols): + row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + else: + row_margin = Series(np.nan, index=result.columns) + + return result, margin_keys, row_margin + + +def _convert_by(by): + if by is None: + by = [] + elif ( + is_scalar(by) + or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) + or hasattr(by, "__call__") + ): + by = [by] + else: + by = list(by) + return by + + +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot"], indents=1) +def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": + if values is None: + cols = [columns] if index is None else [index, columns] + append = index is None + indexed = data.set_index(cols, append=append) + else: + if index is None: + index = data.index + else: + index = data[index] + index = MultiIndex.from_arrays([index, data[columns]]) + + if is_list_like(values) and not isinstance(values, tuple): + # Exclude tuple because it is seen as a single column name + indexed = data._constructor( + data[values].values, index=index, columns=values + ) + else: + indexed = data._constructor_sliced(data[values].values, index=index) + return indexed.unstack(columns) + + +def crosstab( + index, + columns, + values=None, + rownames=None, + colnames=None, + aggfunc=None, + margins=False, + margins_name: str = "All", + dropna: bool = True, + normalize=False, +) -> "DataFrame": + """ + Compute a simple cross tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed. 
+ + Parameters + ---------- + index : array-like, Series, or list of arrays/Series + Values to group by in the rows. + columns : array-like, Series, or list of arrays/Series + Values to group by in the columns. + values : array-like, optional + Array of values to aggregate according to the factors. + Requires `aggfunc` be specified. + rownames : sequence, default None + If passed, must match number of row arrays passed. + colnames : sequence, default None + If passed, must match number of column arrays passed. + aggfunc : function, optional + If specified, requires `values` be specified as well. + margins : bool, default False + Add row/column margins (subtotals). + margins_name : str, default 'All' + Name of the row/column that will contain the totals + when margins is True. + + .. versionadded:: 0.21.0 + + dropna : bool, default True + Do not include columns whose entries are all NaN. + normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False + Normalize by dividing all values by the sum of values. + + - If passed 'all' or `True`, will normalize over all values. + - If passed 'index' will normalize over each row. + - If passed 'columns' will normalize over each column. + - If margins is `True`, will also normalize margin values. + + Returns + ------- + DataFrame + Cross tabulation of the data. + + See Also + -------- + DataFrame.pivot : Reshape data based on column values. + pivot_table : Create a pivot table as a DataFrame. + + Notes + ----- + Any Series passed will have their name attributes used unless row or column + names for the cross-tabulation are specified. + + Any input passed containing Categorical data will have **all** of its + categories included in the cross-tabulation, even if the actual data does + not contain any instances of a particular category. + + In the event that there aren't overlapping indexes an empty DataFrame will + be returned. + + Examples + -------- + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 + + Here 'c' and 'f' are not represented in the data and will not be + shown in the output because dropna is True by default. Set + dropna=False to preserve categories with no data. 
+ + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> pd.crosstab(foo, bar) + col_0 d e + row_0 + a 1 0 + b 0 1 + >>> pd.crosstab(foo, bar, dropna=False) + col_0 d e f + row_0 + a 1 0 0 + b 0 1 0 + c 0 0 0 + """ + + index = com.maybe_make_list(index) + columns = com.maybe_make_list(columns) + + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") + + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] + if pass_objs: + common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) + + data: Dict = {} + data.update(zip(rownames, index)) + data.update(zip(colnames, columns)) + + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + + from pandas import DataFrame + + df = DataFrame(data, index=common_idx) + if values is None: + df["__dummy__"] = 0 + kwargs = {"aggfunc": len, "fill_value": 0} + else: + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + table = df.pivot_table( + "__dummy__", + index=rownames, + columns=colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + **kwargs, + ) + + # Post-process + if normalize is not False: + table = _normalize( + table, normalize=normalize, margins=margins, margins_name=margins_name + ) + + return table + + +def _normalize(table, normalize, margins: bool, margins_name="All"): + + if not isinstance(normalize, (bool, str)): + axis_subs = {0: "index", 1: "columns"} + try: + normalize = axis_subs[normalize] + except KeyError: + raise ValueError("Not a valid normalize argument") + + if margins is False: + + # Actual Normalizations + normalizers: Dict[Union[bool, str], Callable] = { + "all": lambda x: x / x.sum(axis=1).sum(axis=0), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis=0), + } + + normalizers[True] = normalizers["all"] + + try: + f = normalizers[normalize] + except KeyError: + raise ValueError("Not a valid normalize argument") + + table = f(table) + table = table.fillna(0) + + elif margins is True: + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + + # check if margin name is in (for MI cases) or equal to last + # index/column and save the column and index margin + if (margins_name not in table.iloc[-1, :].name) | ( + margins_name != table.iloc[:, -1].name + ): + raise ValueError( + "{mname} not in pivoted DataFrame".format(mname=margins_name) + ) + column_margin = table.iloc[:-1, -1] + index_margin = table.iloc[-1, :-1] + + # keep the core table + table = table.iloc[:-1, :-1] + + # Normalize core + table = _normalize(table, normalize=normalize, margins=False) + + # Fix Margins + if normalize == "columns": + column_margin = column_margin / column_margin.sum() + table = concat([table, column_margin], axis=1) + table = table.fillna(0) + table.columns = table_columns + + elif normalize == "index": + index_margin = index_margin / index_margin.sum() + table = table.append(index_margin) + table = table.fillna(0) + table.index = table_index + + elif normalize == "all" or normalize is True: + column_margin = column_margin / column_margin.sum() + index_margin = index_margin / index_margin.sum() + index_margin.loc[margins_name] = 1 + table = concat([table, column_margin], 
axis=1) + table = table.append(index_margin) + + table = table.fillna(0) + table.index = table_index + table.columns = table_columns + + else: + raise ValueError("Not a valid normalize argument") + + else: + raise ValueError("Not a valid margins argument") + + return table + + +def _get_names(arrs, names, prefix: str = "row"): + if names is None: + names = [] + for i, arr in enumerate(arrs): + if isinstance(arr, ABCSeries) and arr.name is not None: + names.append(arr.name) + else: + names.append("{prefix}_{i}".format(prefix=prefix, i=i)) + else: + if len(names) != len(arrs): + raise AssertionError("arrays and names must have the same length") + if not isinstance(names, list): + names = list(names) + + return names diff --git a/venv/Lib/site-packages/pandas/core/reshape/reshape.py b/venv/Lib/site-packages/pandas/core/reshape/reshape.py new file mode 100644 index 0000000..97f416e --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/reshape.py @@ -0,0 +1,1080 @@ +from functools import partial +import itertools +from typing import List + +import numpy as np + +import pandas._libs.algos as libalgos +import pandas._libs.reshape as libreshape +from pandas._libs.sparse import IntIndex + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_bool_dtype, + is_extension_array_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import notna + +import pandas.core.algorithms as algos +from pandas.core.arrays import SparseArray +from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.construction import extract_array +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_compressed_ids, + get_group_index, +) + + +class _Unstacker: + """ + Helper class to unstack data / pivot with multi-level index + + Parameters + ---------- + values : ndarray + Values of DataFrame to "Unstack" + index : object + Pandas ``Index`` + level : int or str, default last level + Level to "unstack". Accepts a name for the level. + value_columns : Index, optional + Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame + fill_value : scalar, optional + Default value to fill in missing values if subgroups do not have the + same set of labels. By default, missing values will be replaced with + the default fill value for that data type, NaN for float, NaT for + datetimelike, etc. For integer types, by default data will converted to + float and missing values will be set to NaN. + constructor : object + Pandas ``DataFrame`` or subclass used to create unstacked + response. If None, DataFrame will be used. + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... 
('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 3 + b 2 4 + + Returns + ------- + unstacked : DataFrame + """ + + def __init__( + self, + values: np.ndarray, + index, + level=-1, + value_columns=None, + fill_value=None, + constructor=None, + ): + + if values.ndim == 1: + values = values[:, np.newaxis] + self.values = values + self.value_columns = value_columns + self.fill_value = fill_value + + if constructor is None: + constructor = DataFrame + self.constructor = constructor + + if value_columns is None and values.shape[1] != 1: # pragma: no cover + raise ValueError("must pass column labels for multi-column data") + + self.index = index.remove_unused_levels() + + self.level = self.index._get_level_number(level) + + # when index includes `nan`, need to lift levels/strides by 1 + self.lift = 1 if -1 in self.index.codes[self.level] else 0 + + self.new_index_levels = list(self.index.levels) + self.new_index_names = list(self.index.names) + + self.removed_name = self.new_index_names.pop(self.level) + self.removed_level = self.new_index_levels.pop(self.level) + self.removed_level_full = index.levels[self.level] + + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combination + # will cause int32 overflow on windows environments. + # We want to check and raise an error before this happens + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) + + if num_rows > 0 and num_columns > 0 and num_cells <= 0: + raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") + + self._make_sorted_values_labels() + self._make_selectors() + + def _make_sorted_values_labels(self): + v = self.level + + codes = list(self.index.codes) + levs = list(self.index.levels) + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] + sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] + + comp_index, obs_ids = get_compressed_ids(to_sort, sizes) + ngroups = len(obs_ids) + + indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0] + indexer = ensure_platform_int(indexer) + + self.sorted_values = algos.take_nd(self.values, indexer, axis=0) + self.sorted_labels = [l.take(indexer) for l in to_sort] + + def _make_selectors(self): + new_levels = self.new_index_levels + + # make the mask + remaining_labels = self.sorted_labels[:-1] + level_sizes = [len(x) for x in new_levels] + + comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) + ngroups = len(obs_ids) + + comp_index = ensure_platform_int(comp_index) + stride = self.index.levshape[self.level] + self.lift + self.full_shape = ngroups, stride + + selector = self.sorted_labels[-1] + stride * comp_index + self.lift + mask = np.zeros(np.prod(self.full_shape), dtype=bool) + mask.put(selector, True) + + if mask.sum() < len(self.index): + raise ValueError("Index contains duplicate entries, cannot reshape") + + self.group_index = comp_index + self.mask = mask + self.unique_groups = obs_ids + self.compressor = comp_index.searchsorted(np.arange(ngroups)) + + def get_result(self): + values, _ = self.get_new_values() + columns = self.get_new_columns() + index = self.get_new_index() + + return self.constructor(values, 
index=index, columns=columns) + + def get_new_values(self): + values = self.values + + # place the values + length, width = self.full_shape + stride = values.shape[1] + result_width = width * stride + result_shape = (length, result_width) + mask = self.mask + mask_all = mask.all() + + # we can simply reshape if we don't have a mask + if mask_all and len(values): + new_values = ( + self.sorted_values.reshape(length, width, stride) + .swapaxes(1, 2) + .reshape(result_shape) + ) + new_mask = np.ones(result_shape, dtype=bool) + return new_values, new_mask + + # if our mask is all True, then we can use our existing dtype + if mask_all: + dtype = values.dtype + new_values = np.empty(result_shape, dtype=dtype) + else: + dtype, fill_value = maybe_promote(values.dtype, self.fill_value) + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) + + new_mask = np.zeros(result_shape, dtype=bool) + + name = np.dtype(dtype).name + sorted_values = self.sorted_values + + # we need to convert to a basic dtype + # and possibly coerce an input to our output dtype + # e.g. ints -> floats + if needs_i8_conversion(values): + sorted_values = sorted_values.view("i8") + new_values = new_values.view("i8") + elif is_bool_dtype(values): + sorted_values = sorted_values.astype("object") + new_values = new_values.astype("object") + else: + sorted_values = sorted_values.astype(name, copy=False) + + # fill in our values & mask + libreshape.unstack( + sorted_values, + mask.view("u1"), + stride, + length, + width, + new_values, + new_mask.view("u1"), + ) + + # reconstruct dtype if needed + if needs_i8_conversion(values): + new_values = new_values.view(values.dtype) + + return new_values, new_mask + + def get_new_columns(self): + if self.value_columns is None: + if self.lift == 0: + return self.removed_level._shallow_copy(name=self.removed_name) + + lev = self.removed_level.insert(0, item=self.removed_level._na_value) + return lev.rename(self.removed_name) + + stride = len(self.removed_level) + self.lift + width = len(self.value_columns) + propagator = np.repeat(np.arange(width), stride) + if isinstance(self.value_columns, MultiIndex): + new_levels = self.value_columns.levels + (self.removed_level_full,) + new_names = self.value_columns.names + (self.removed_name,) + + new_codes = [lab.take(propagator) for lab in self.value_columns.codes] + else: + new_levels = [self.value_columns, self.removed_level_full] + new_names = [self.value_columns.name, self.removed_name] + new_codes = [propagator] + + # The two indices differ only if the unstacked level had unused items: + if len(self.removed_level_full) != len(self.removed_level): + # In this case, we remap the new codes to the original level: + repeater = self.removed_level_full.get_indexer(self.removed_level) + if self.lift: + repeater = np.insert(repeater, 0, -1) + else: + # Otherwise, we just use each level item exactly once: + repeater = np.arange(stride) - self.lift + + # The entire level is then just a repetition of the single chunk: + new_codes.append(np.tile(repeater, width)) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def get_new_index(self): + result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + + # construct the new index + if len(self.new_index_levels) == 1: + level, level_codes = self.new_index_levels[0], result_codes[0] + if (level_codes == -1).any(): + level = level.insert(len(level), level._na_value) + return 
level.take(level_codes).rename(self.new_index_names[0]) + + return MultiIndex( + levels=self.new_index_levels, + codes=result_codes, + names=self.new_index_names, + verify_integrity=False, + ) + + +def _unstack_multiple(data, clocs, fill_value=None): + if len(clocs) == 0: + return data + + # NOTE: This doesn't deal with hierarchical columns yet + + index = data.index + + clocs = [index._get_level_number(i) for i in clocs] + + rlocs = [i for i in range(index.nlevels) if i not in clocs] + + clevels = [index.levels[i] for i in clocs] + ccodes = [index.codes[i] for i in clocs] + cnames = [index.names[i] for i in clocs] + rlevels = [index.levels[i] for i in rlocs] + rcodes = [index.codes[i] for i in rlocs] + rnames = [index.names[i] for i in rlocs] + + shape = [len(x) for x in clevels] + group_index = get_group_index(ccodes, shape, sort=False, xnull=False) + + comp_ids, obs_ids = compress_group_index(group_index, sort=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) + + if rlocs == []: + # Everything is in clocs, so the dummy df has a regular index + dummy_index = Index(obs_ids, name="__placeholder__") + else: + dummy_index = MultiIndex( + levels=rlevels + [obs_ids], + codes=rcodes + [comp_ids], + names=rnames + ["__placeholder__"], + verify_integrity=False, + ) + + if isinstance(data, Series): + dummy = data.copy() + dummy.index = dummy_index + + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) + new_levels = clevels + new_names = cnames + new_codes = recons_codes + else: + if isinstance(data.columns, MultiIndex): + result = data + for i in range(len(clocs)): + val = clocs[i] + result = result.unstack(val, fill_value=fill_value) + clocs = [v if i > v else v - 1 for v in clocs] + + return result + + dummy = data.copy() + dummy.index = dummy_index + + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) + if isinstance(unstacked, Series): + unstcols = unstacked.index + else: + unstcols = unstacked.columns + new_levels = [unstcols.levels[0]] + clevels + new_names = [data.columns.name] + cnames + + new_codes = [unstcols.codes[0]] + for rec in recons_codes: + new_codes.append(rec.take(unstcols.codes[-1])) + + new_columns = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + if isinstance(unstacked, Series): + unstacked.index = new_columns + else: + unstacked.columns = new_columns + + return unstacked + + +def unstack(obj, level, fill_value=None): + if isinstance(level, (tuple, list)): + if len(level) != 1: + # _unstack_multiple only handles MultiIndexes, + # and isn't needed for a single level + return _unstack_multiple(obj, level, fill_value=fill_value) + else: + level = level[0] + + # Prioritize integer interpretation (GH #21677): + if not is_integer(level) and not level == "__placeholder__": + level = obj.index._get_level_number(level) + + if isinstance(obj, DataFrame): + if isinstance(obj.index, MultiIndex): + return _unstack_frame(obj, level, fill_value=fill_value) + else: + return obj.T.stack(dropna=False) + else: + if is_extension_array_dtype(obj.dtype): + return _unstack_extension_series(obj, level, fill_value) + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + fill_value=fill_value, + constructor=obj._constructor_expanddim, + ) + return unstacker.get_result() + + +def _unstack_frame(obj, level, fill_value=None): + if obj._is_mixed_type: + unstacker = partial( + _Unstacker, index=obj.index, level=level, fill_value=fill_value + ) + blocks = 
obj._data.unstack(unstacker, fill_value=fill_value) + return obj._constructor(blocks) + else: + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + value_columns=obj.columns, + fill_value=fill_value, + constructor=obj._constructor, + ) + return unstacker.get_result() + + +def _unstack_extension_series(series, level, fill_value): + """ + Unstack an ExtensionArray-backed Series. + + The ExtensionDtype is preserved. + + Parameters + ---------- + series : Series + A Series with an ExtensionArray for values + level : Any + The level name or number. + fill_value : Any + The user-level (not physical storage) fill value to use for + missing values introduced by the reshape. Passed to + ``series.values.take``. + + Returns + ------- + DataFrame + Each column of the DataFrame will have the same dtype as + the input Series. + """ + # Implementation note: the basic idea is to + # 1. Do a regular unstack on a dummy array of integers + # 2. Followup with a columnwise take. + # We use the dummy take to discover newly-created missing values + # introduced by the reshape. + from pandas.core.reshape.concat import concat + + dummy_arr = np.arange(len(series)) + # fill_value=-1, since we will do a series.values.take later + result = _Unstacker( + dummy_arr, series.index, level=level, fill_value=-1 + ).get_result() + + out = [] + values = extract_array(series, extract_numpy=False) + + for col, indices in result.items(): + out.append( + Series( + values.take(indices.values, allow_fill=True, fill_value=fill_value), + name=col, + index=result.index, + ) + ) + return concat(out, axis="columns", copy=False, keys=result.columns) + + +def stack(frame, level=-1, dropna=True): + """ + Convert DataFrame to Series with multi-level Index. Columns become the + second level of the resulting hierarchical index + + Returns + ------- + stacked : Series + """ + + def factorize(index): + if index.is_unique: + return index, np.arange(len(index)) + codes, categories = factorize_from_iterable(index) + return categories, codes + + N, K = frame.shape + + # Will also convert negative level numbers and check if out of bounds. + level_num = frame.columns._get_level_number(level) + + if isinstance(frame.columns, MultiIndex): + return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) + elif isinstance(frame.index, MultiIndex): + new_levels = list(frame.index.levels) + new_codes = [lab.repeat(K) for lab in frame.index.codes] + + clev, clab = factorize(frame.columns) + new_levels.append(clev) + new_codes.append(np.tile(clab, N).ravel()) + + new_names = list(frame.index.names) + new_names.append(frame.columns.name) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + else: + levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) + codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[frame.index.name, frame.columns.name], + verify_integrity=False, + ) + + if frame._is_homogeneous_type: + # For homogeneous EAs, frame.values will coerce to object. So + # we concatenate instead. 
+ dtypes = list(frame.dtypes.values) + dtype = dtypes[0] + + if is_extension_array_dtype(dtype): + arr = dtype.construct_array_type() + new_values = arr._concat_same_type( + [col._values for _, col in frame.items()] + ) + new_values = _reorder_for_extension_array_stack(new_values, N, K) + else: + # homogeneous, non-EA + new_values = frame.values.ravel() + + else: + # non-homogeneous + new_values = frame.values.ravel() + + if dropna: + mask = notna(new_values) + new_values = new_values[mask] + new_index = new_index[mask] + + return frame._constructor_sliced(new_values, index=new_index) + + +def stack_multiple(frame, level, dropna=True): + # If all passed levels match up to column names, no + # ambiguity about what to do + if all(lev in frame.columns.names for lev in level): + result = frame + for lev in level: + result = stack(result, lev, dropna=dropna) + + # Otherwise, level numbers may change as each successive level is stacked + elif all(isinstance(lev, int) for lev in level): + # As each stack is done, the level numbers decrease, so we need + # to account for that when level is a sequence of ints + result = frame + # _get_level_number() checks level numbers are in range and converts + # negative numbers to positive + level = [frame.columns._get_level_number(lev) for lev in level] + + # Can't iterate directly through level as we might need to change + # values as we go + for index in range(len(level)): + lev = level[index] + result = stack(result, lev, dropna=dropna) + # Decrement all level numbers greater than current, as these + # have now shifted down by one + updated_level = [] + for other in level: + if other > lev: + updated_level.append(other - 1) + else: + updated_level.append(other) + level = updated_level + + else: + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." 
+ ) + + return result + + +def _stack_multi_columns(frame, level_num=-1, dropna=True): + def _convert_level_number(level_num, columns): + """ + Logic for converting the level number to something we can safely pass + to swaplevel: + + We generally want to convert the level number into a level name, except + when columns do not have names, in which case we must leave as a level + number + """ + if level_num in columns.names: + return columns.names[level_num] + else: + if columns.names[level_num] is None: + return level_num + else: + return columns.names[level_num] + + this = frame.copy() + + # this makes life much simpler + if level_num != frame.columns.nlevels - 1: + # roll levels to put selected level at end + roll_columns = this.columns + for i in range(level_num, frame.columns.nlevels - 1): + # Need to check if the ints conflict with level names + lev1 = _convert_level_number(i, roll_columns) + lev2 = _convert_level_number(i + 1, roll_columns) + roll_columns = roll_columns.swaplevel(lev1, lev2) + this.columns = roll_columns + + if not this.columns.is_lexsorted(): + # Workaround the edge case where 0 is one of the column names, + # which interferes with trying to sort based on the first + # level + level_to_sort = _convert_level_number(0, this.columns) + this = this.sort_index(level=level_to_sort, axis=1) + + # tuple list excluding level for grouping columns + if len(frame.columns.levels) > 2: + tuples = list( + zip( + *[ + lev.take(level_codes) + for lev, level_codes in zip( + this.columns.levels[:-1], this.columns.codes[:-1] + ) + ] + ) + ) + unique_groups = [key for key, _ in itertools.groupby(tuples)] + new_names = this.columns.names[:-1] + new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) + else: + new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) + unique_groups = new_columns + + # time to ravel the values + new_data = {} + level_vals = this.columns.levels[-1] + level_codes = sorted(set(this.columns.codes[-1])) + level_vals_used = level_vals[level_codes] + levsize = len(level_codes) + drop_cols = [] + for key in unique_groups: + try: + loc = this.columns.get_loc(key) + except KeyError: + drop_cols.append(key) + continue + + # can make more efficient? + # we almost always return a slice + # but if unsorted can get a boolean + # indexer + if not isinstance(loc, slice): + slice_len = len(loc) + else: + slice_len = loc.stop - loc.start + + if slice_len != levsize: + chunk = this.loc[:, this.columns[loc]] + chunk.columns = level_vals.take(chunk.columns.codes[-1]) + value_slice = chunk.reindex(columns=level_vals_used).values + else: + if frame._is_homogeneous_type and is_extension_array_dtype( + frame.dtypes.iloc[0] + ): + dtype = this[this.columns[loc]].dtypes.iloc[0] + subset = this[this.columns[loc]] + + value_slice = dtype.construct_array_type()._concat_same_type( + [x._values for _, x in subset.items()] + ) + N, K = this.shape + idx = np.arange(N * K).reshape(K, N).T.ravel() + value_slice = value_slice.take(idx) + + elif frame._is_mixed_type: + value_slice = this[this.columns[loc]].values + else: + value_slice = this.values[:, loc] + + if value_slice.ndim > 1: + # i.e. 
not extension + value_slice = value_slice.ravel() + + new_data[key] = value_slice + + if len(drop_cols) > 0: + new_columns = new_columns.difference(drop_cols) + + N = len(this) + + if isinstance(this.index, MultiIndex): + new_levels = list(this.index.levels) + new_names = list(this.index.names) + new_codes = [lab.repeat(levsize) for lab in this.index.codes] + else: + old_codes, old_levels = factorize_from_iterable(this.index) + new_levels = [old_levels] + new_codes = [old_codes.repeat(levsize)] + new_names = [this.index.name] # something better? + + new_levels.append(level_vals) + new_codes.append(np.tile(level_codes, N)) + new_names.append(frame.columns.names[level_num]) + + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + result = frame._constructor(new_data, index=new_index, columns=new_columns) + + # more efficient way to go about this? can do the whole masking biz but + # will only save a small amount of time... + if dropna: + result = result.dropna(axis=0, how="all") + + return result + + +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +) -> "DataFrame": + """ + Convert categorical variable into dummy/indicator variables. + + Parameters + ---------- + data : array-like, Series, or DataFrame + Data of which to get dummy indicators. + prefix : str, list of str, or dict of str, default None + String to append DataFrame column names. + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix`. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. + + .. versionadded:: 0.23.0 + + Returns + ------- + DataFrame + Dummy-coded data. + + See Also + -------- + Series.str.get_dummies : Convert Series to dummy codes. + + Examples + -------- + >>> s = pd.Series(list('abca')) + + >>> pd.get_dummies(s) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = ['a', 'b', np.nan] + + >>> pd.get_dummies(s1) + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> pd.get_dummies(s1, dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + ... 
'C': [1, 2, 3]}) + + >>> pd.get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 1 0 0 1 0 + 1 2 0 1 1 0 0 + 2 3 1 0 0 0 1 + + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 + + >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas.core.reshape.concat import concat + + dtypes_to_encode = ["object", "category"] + + if isinstance(data, DataFrame): + # determine columns being encoded + if columns is None: + data_to_encode = data.select_dtypes(include=dtypes_to_encode) + elif not is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") + else: + data_to_encode = data[columns] + + # validate prefixes and separator to avoid silently dropping cols + def check_len(item, name): + len_msg = ( + "Length of '{name}' ({len_item}) did not match the " + "length of the columns being encoded ({len_enc})." + ) + + if is_list_like(item): + if not len(item) == data_to_encode.shape[1]: + len_msg = len_msg.format( + name=name, len_item=len(item), len_enc=data_to_encode.shape[1] + ) + raise ValueError(len_msg) + + check_len(prefix, "prefix") + check_len(prefix_sep, "prefix_sep") + + if isinstance(prefix, str): + prefix = itertools.cycle([prefix]) + if isinstance(prefix, dict): + prefix = [prefix[col] for col in data_to_encode.columns] + + if prefix is None: + prefix = data_to_encode.columns + + # validate separators + if isinstance(prefix_sep, str): + prefix_sep = itertools.cycle([prefix_sep]) + elif isinstance(prefix_sep, dict): + prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] + + if data_to_encode.shape == data.shape: + # Encoding the entire df, do not prepend any dropped columns + with_dummies: List[DataFrame] = [] + elif columns is not None: + # Encoding only cols specified in columns. Get all cols not in + # columns to prepend to result. + with_dummies = [data.drop(columns, axis=1)] + else: + # Encoding only object and category dtype columns. Get remaining + # columns to prepend to result. 
+ with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] + + for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): + # col is (column_name, column), use just column data here + dummy = _get_dummies_1d( + col[1], + prefix=pre, + prefix_sep=sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) + with_dummies.append(dummy) + result = concat(with_dummies, axis=1) + else: + result = _get_dummies_1d( + data, + prefix, + prefix_sep, + dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) + return result + + +def _get_dummies_1d( + data, + prefix, + prefix_sep="_", + dummy_na=False, + sparse=False, + drop_first=False, + dtype=None, +): + from pandas.core.reshape.concat import concat + + # Series avoids inconsistent NaN handling + codes, levels = factorize_from_iterable(Series(data)) + + if dtype is None: + dtype = np.uint8 + dtype = np.dtype(dtype) + + if is_object_dtype(dtype): + raise ValueError("dtype=object is not a valid dtype for get_dummies") + + def get_empty_frame(data) -> DataFrame: + if isinstance(data, Series): + index = data.index + else: + index = np.arange(len(data)) + return DataFrame(index=index) + + # if all NaN + if not dummy_na and len(levels) == 0: + return get_empty_frame(data) + + codes = codes.copy() + if dummy_na: + codes[codes == -1] = len(levels) + levels = np.append(levels, np.nan) + + # if dummy_na, we just fake a nan level. drop_first will drop it again + if drop_first and len(levels) == 1: + return get_empty_frame(data) + + number_of_cols = len(levels) + + if prefix is None: + dummy_cols = levels + else: + + # PY2 embedded unicode, gh-22084 + def _make_col_name(prefix, prefix_sep, level) -> str: + fstr = "{prefix}{prefix_sep}{level}" + return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) + + dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] + + if isinstance(data, Series): + index = data.index + else: + index = None + + if sparse: + + if is_integer_dtype(dtype): + fill_value = 0 + elif dtype == bool: + fill_value = False + else: + fill_value = 0.0 + + sparse_series = [] + N = len(data) + sp_indices = [[] for _ in range(len(dummy_cols))] + mask = codes != -1 + codes = codes[mask] + n_idx = np.arange(N)[mask] + + for ndx, code in zip(n_idx, codes): + sp_indices[code].append(ndx) + + if drop_first: + # remove first categorical level to avoid perfect collinearity + # GH12042 + sp_indices = sp_indices[1:] + dummy_cols = dummy_cols[1:] + for col, ixs in zip(dummy_cols, sp_indices): + sarr = SparseArray( + np.ones(len(ixs), dtype=dtype), + sparse_index=IntIndex(N, ixs), + fill_value=fill_value, + dtype=dtype, + ) + sparse_series.append(Series(data=sarr, index=index, name=col)) + + out = concat(sparse_series, axis=1, copy=False) + return out + + else: + dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) + + if not dummy_na: + # reset NaN GH4446 + dummy_mat[codes == -1] = 0 + + if drop_first: + # remove first GH12042 + dummy_mat = dummy_mat[:, 1:] + dummy_cols = dummy_cols[1:] + return DataFrame(dummy_mat, index=index, columns=dummy_cols) + + +def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int): + """ + Re-orders the values when stacking multiple extension-arrays. + + The indirect stacking method used for EAs requires a followup + take to get the order correct. + + Parameters + ---------- + arr : ExtensionArray + n_rows, n_columns : int + The number of rows and columns in the original DataFrame. 
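# A minimal sketch (not from the pandas source, plain numpy) of the take-order
# built in the extension-array stacking path above: column-major positions of
# an N x K frame are mapped back to row-major order before the final take.
import numpy as np

N, K = 2, 3  # rows, columns of the original frame (made-up sizes)
idx = np.arange(N * K).reshape(K, N).T.ravel()
print(idx)  # [0 2 4 1 3 5]: row 0 of every column first, then row 1, and so on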
+ + Returns + ------- + taken : ExtensionArray + The original `arr` with elements re-ordered appropriately + + Examples + -------- + >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> _reorder_for_extension_array_stack(arr, 2, 3) + array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='>> _reorder_for_extension_array_stack(arr, 3, 2) + array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) + ... # doctest: +ELLIPSIS + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) + ... # doctest: +ELLIPSIS + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. ])) + + Discovers the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and is ordered. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]) + [bad, good, medium, medium, good, bad] + Categories (3, object): [bad < medium < good] + + ``labels=False`` implies you just want the bins back. + + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, 3) + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + + Passing a Series as an input returns a Series with mapping value. + It is used to map numerically to intervals based on bins. + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e 4.0 + dtype: float64, array([0, 2, 4, 6, 8])) + + Use `drop` optional when bins is not unique + + >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, + ... right=False, duplicates='drop') + ... # doctest: +ELLIPSIS + (a 0.0 + b 1.0 + c 2.0 + d 3.0 + e 3.0 + dtype: float64, array([0, 2, 4, 6, 8])) + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN. 0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) + [NaN, (0, 1], NaN, (2, 3], (4, 5]] + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + """ + # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + + # for handling the cut for datetime and timedelta objects + original = x + x = _preprocess_for_cut(x) + x, dtype = _coerce_to_type(x) + + # To support cut(IntegerArray), we convert to object dtype with NaN + # Will properly support in the future. 
+ # https://github.com/pandas-dev/pandas/pull/31290 + if is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype): + x = x.to_numpy(dtype=object, na_value=np.nan) + + if not np.iterable(bins): + if is_scalar(bins) and bins < 1: + raise ValueError("`bins` should be a positive integer.") + + try: # for array-like + sz = x.size + except AttributeError: + x = np.asarray(x) + sz = x.size + + if sz == 0: + raise ValueError("Cannot cut empty array") + + rng = (nanops.nanmin(x), nanops.nanmax(x)) + mn, mx = [mi + 0.0 for mi in rng] + + if np.isinf(mn) or np.isinf(mx): + # GH 24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + elif mn == mx: # adjust end points before binning + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + elif isinstance(bins, IntervalIndex): + if bins.is_overlapping: + raise ValueError("Overlapping IntervalIndex is not accepted.") + + else: + if is_datetime64tz_dtype(bins): + bins = np.asarray(bins, dtype=_NS_DTYPE) + else: + bins = np.asarray(bins) + bins = _convert_bin_to_numeric_type(bins, dtype) + + # GH 26045: cast to float64 to avoid an overflow + if (np.diff(bins.astype("float64")) < 0).any(): + raise ValueError("bins must increase monotonically.") + + fac, bins = _bins_to_cuts( + x, + bins, + right=right, + labels=labels, + precision=precision, + include_lowest=include_lowest, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut(fac, bins, retbins, dtype, original) + + +def qcut( + x, + q, + labels=None, + retbins: bool = False, + precision: int = 3, + duplicates: str = "raise", +): + """ + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Parameters + ---------- + x : 1d ndarray or Series + q : int or list-like of int + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels : array or False, default None + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + retbins : bool, optional + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. + precision : int, optional + The precision at which to store and display the bins labels. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns + ------- + out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + bins : ndarray of floats + Returned only if `retbins` is True. + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + >>> pd.qcut(range(5), 4) + ... 
# doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... + + >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + + >>> pd.qcut(range(5), 4, labels=False) + array([0, 0, 1, 2, 3]) + """ + original = x + x = _preprocess_for_cut(x) + x, dtype = _coerce_to_type(x) + + if is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + else: + quantiles = q + bins = algos.quantile(x, quantiles) + fac, bins = _bins_to_cuts( + x, + bins, + labels=labels, + precision=precision, + include_lowest=True, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut(fac, bins, retbins, dtype, original) + + +def _bins_to_cuts( + x, + bins, + right: bool = True, + labels=None, + precision: int = 3, + include_lowest: bool = False, + dtype=None, + duplicates: str = "raise", +): + + if duplicates not in ["raise", "drop"]: + raise ValueError( + "invalid value for 'duplicates' parameter, " + "valid options are: raise, drop" + ) + + if isinstance(bins, IntervalIndex): + # we have a fast-path here + ids = bins.get_indexer(x) + result = Categorical.from_codes(ids, categories=bins, ordered=True) + return result, bins + + unique_bins = algos.unique(bins) + if len(unique_bins) < len(bins) and len(bins) != 2: + if duplicates == "raise": + raise ValueError( + f"Bin edges must be unique: {repr(bins)}.\n" + f"You can drop duplicate edges by setting the 'duplicates' kwarg" + ) + else: + bins = unique_bins + + side = "left" if right else "right" + ids = ensure_int64(bins.searchsorted(x, side=side)) + + if include_lowest: + ids[x == bins[0]] = 1 + + na_mask = isna(x) | (ids == len(bins)) | (ids == 0) + has_nas = na_mask.any() + + if labels is not False: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + elif labels is None: + labels = _format_labels( + bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + ) + + else: + if len(labels) != len(bins) - 1: + raise ValueError( + "Bin labels must be one fewer than the number of bin edges" + ) + + if not is_categorical_dtype(labels): + labels = Categorical(labels, categories=labels, ordered=True) + + np.putmask(ids, na_mask, 0) + result = algos.take_nd(labels, ids - 1) + + else: + result = ids - 1 + if has_nas: + result = result.astype(np.float64) + np.putmask(result, na_mask, np.nan) + + return result, bins + + +def _coerce_to_type(x): + """ + if the passed data is of datetime/timedelta or bool type, + this method converts it to numeric so that cut or qcut method can + handle it + """ + dtype = None + + if is_datetime64tz_dtype(x): + dtype = x.dtype + elif is_datetime64_dtype(x): + x = to_datetime(x) + dtype = np.dtype("datetime64[ns]") + elif is_timedelta64_dtype(x): + x = to_timedelta(x) + dtype = np.dtype("timedelta64[ns]") + elif is_bool_dtype(x): + # GH 20303 + x = x.astype(np.int64) + + if dtype is not None: + # GH 19768: force NaT to NaN during integer conversion + x = np.where(x.notna(), x.view(np.int64), np.nan) + + return x, dtype + + +def _convert_bin_to_numeric_type(bins, dtype): + """ + if the passed bin is of datetime/timedelta type, + this method converts it to integer + + Parameters + ---------- + bins : list-like of bins + dtype : dtype of data + + Raises + ------ + ValueError if bins are not of a compat dtype to 
dtype + """ + bins_dtype = infer_dtype(bins, skipna=False) + if is_timedelta64_dtype(dtype): + if bins_dtype in ["timedelta", "timedelta64"]: + bins = to_timedelta(bins).view(np.int64) + else: + raise ValueError("bins must be of timedelta64 dtype") + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + if bins_dtype in ["datetime", "datetime64"]: + bins = to_datetime(bins).view(np.int64) + else: + raise ValueError("bins must be of datetime64 dtype") + + return bins + + +def _convert_bin_to_datelike_type(bins, dtype): + """ + Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is + datelike + + Parameters + ---------- + bins : list-like of bins + dtype : dtype of data + + Returns + ------- + bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is + datelike + """ + if is_datetime64tz_dtype(dtype): + bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) + elif is_datetime_or_timedelta_dtype(dtype): + bins = Index(bins.astype(np.int64), dtype=dtype) + return bins + + +def _format_labels( + bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None +): + """ based on the dtype, return our labels """ + + closed = "right" if right else "left" + + if is_datetime64tz_dtype(dtype): + formatter = lambda x: Timestamp(x, tz=dtype.tz) + adjust = lambda x: x - Timedelta("1ns") + elif is_datetime64_dtype(dtype): + formatter = Timestamp + adjust = lambda x: x - Timedelta("1ns") + elif is_timedelta64_dtype(dtype): + formatter = Timedelta + adjust = lambda x: x - Timedelta("1ns") + else: + precision = _infer_precision(precision, bins) + formatter = lambda x: _round_frac(x, precision) + adjust = lambda x: x - 10 ** (-precision) + + breaks = [formatter(b) for b in bins] + if right and include_lowest: + # adjust lhs of first interval by precision to account for being right closed + breaks[0] = adjust(breaks[0]) + + return IntervalIndex.from_breaks(breaks, closed=closed) + + +def _preprocess_for_cut(x): + """ + handles preprocessing for cut where we convert passed + input to array, strip the index information and store it + separately + """ + + # Check that the passed array is a Pandas or Numpy object + # We don't want to strip away a Pandas data-type here (e.g. 
datetimetz) + ndim = getattr(x, "ndim", None) + if ndim is None: + x = np.asarray(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + + return x + + +def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original): + """ + handles post processing for the cut method where + we combine the index information if the originally passed + datatype was a series + """ + if isinstance(original, ABCSeries): + fac = original._constructor(fac, index=original.index, name=original.name) + + if not retbins: + return fac + + bins = _convert_bin_to_datelike_type(bins, dtype) + + return fac, bins + + +def _round_frac(x, precision: int): + """ + Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x + else: + frac, whole = np.modf(x) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision + else: + digits = precision + return np.around(x, digits) + + +def _infer_precision(base_precision: int, bins) -> int: + """Infer an appropriate precision for _round_frac + """ + for precision in range(base_precision, 20): + levels = [_round_frac(b, precision) for b in bins] + if algos.unique(levels).size == bins.size: + return precision + return base_precision # default diff --git a/venv/Lib/site-packages/pandas/core/reshape/util.py b/venv/Lib/site-packages/pandas/core/reshape/util.py new file mode 100644 index 0000000..d8652c9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/reshape/util.py @@ -0,0 +1,59 @@ +import numpy as np + +from pandas.core.dtypes.common import is_list_like + +import pandas.core.common as com + + +def cartesian_product(X): + """ + Numpy version of itertools.product. + Sometimes faster (for large inputs)... + + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list('ABC'), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), + array([1, 2, 1, 2, 1, 2])] + + See Also + -------- + itertools.product : Cartesian product of input iterables. Equivalent to + nested for-loops. 
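# A minimal sketch (not from the pandas source) of the repeat/tile counts that
# cartesian_product computes below, for the docstring input [list('ABC'), [1, 2]].
import numpy as np

lenX = np.array([3, 2])
cumprodX = np.cumprod(lenX)            # [3, 6]
a = np.roll(cumprodX, 1); a[0] = 1     # tile counts:   [1, 3]
b = cumprodX[-1] / cumprodX            # repeat counts: [2., 1.]
# factor 0: repeat each of A, B, C twice, tile once  -> A A B B C C
# factor 1: repeat 1, 2 once, tile three times       -> 1 2 1 2 1 2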
+ """ + msg = "Input must be a list-like of list-likes" + if not is_list_like(X): + raise TypeError(msg) + for x in X: + if not is_list_like(x): + raise TypeError(msg) + + if len(X) == 0: + return [] + + lenX = np.fromiter((len(x) for x in X), dtype=np.intp) + cumprodX = np.cumproduct(lenX) + + a = np.roll(cumprodX, 1) + a[0] = 1 + + if cumprodX[-1] != 0: + b = cumprodX[-1] / cumprodX + else: + # if any factor is empty, the cartesian product is empty + b = np.zeros_like(cumprodX) + + return [ + np.tile( + np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) + ) + for i, x in enumerate(X) + ] diff --git a/venv/Lib/site-packages/pandas/core/series.py b/venv/Lib/site-packages/pandas/core/series.py new file mode 100644 index 0000000..6a2a30a --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/series.py @@ -0,0 +1,4577 @@ +""" +Data structure for 1-dimensional cross-sectional and time series data +""" +from io import StringIO +from shutil import get_terminal_size +from textwrap import dedent +from typing import IO, Any, Callable, Hashable, List, Optional +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import index as libindex, lib, reshape, tslibs +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_bool_kwarg, validate_percentile + +from pandas.core.dtypes.cast import convert_dtypes +from pandas.core.dtypes.common import ( + _is_unorderable_exception, + ensure_platform_int, + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_dict_like, + is_extension_array_dtype, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeIndex, + ABCSeries, + ABCSparseArray, +) +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, + remove_na_arraylike, +) + +import pandas as pd +from pandas.core import algorithms, base, generic, nanops, ops +from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray, try_cast_to_ea +from pandas.core.arrays.categorical import Categorical, CategoricalAccessor +from pandas.core.arrays.sparse import SparseAccessor +import pandas.core.common as com +from pandas.core.construction import ( + create_series_with_explicit_dtype, + extract_array, + is_empty_data, + sanitize_array, +) +from pandas.core.groupby import generic as groupby_generic +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.accessors import CombinedDatetimelikeProperties +from pandas.core.indexes.api import ( + Float64Index, + Index, + InvalidIndexError, + MultiIndex, + ensure_index, +) +import pandas.core.indexes.base as ibase +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.indexing import check_bool_indexer +from pandas.core.internals import SingleBlockManager +from pandas.core.strings import StringMethods +from pandas.core.tools.datetimes import to_datetime + +import pandas.io.formats.format as fmt +import pandas.plotting + +__all__ = ["Series"] + +_shared_doc_kwargs = dict( + axes="index", + klass="Series", + axes_single_arg="{0 or 'index'}", + axis="""axis : {0 or 'index'} + Parameter needed for compatibility with DataFrame.""", + 
inplace="""inplace : boolean, default False + If True, performs operation inplace and returns None.""", + unique="np.ndarray", + duplicated="Series", + optional_by="", + optional_mapper="", + optional_labels="", + optional_axis="", + versionadded_to_excel="\n .. versionadded:: 0.20.0\n", +) + + +def _coerce_method(converter): + """ + Install the scalar coercion methods. + """ + + def wrapper(self): + if len(self) == 1: + return converter(self.iloc[0]) + raise TypeError(f"cannot convert the series to {converter}") + + wrapper.__name__ = f"__{converter.__name__}__" + return wrapper + + +# ---------------------------------------------------------------------- +# Series class + + +class Series(base.IndexOpsMixin, generic.NDFrame): + """ + One-dimensional ndarray with axis labels (including time series). + + Labels need not be unique but must be a hashable type. The object + supports both integer- and label-based indexing and provides a host of + methods for performing operations involving the index. Statistical + methods from ndarray have been overridden to automatically exclude + missing data (currently represented as NaN). + + Operations between Series (+, -, /, *, **) align values based on their + associated index values-- they need not be the same length. The result + index will be the sorted union of the two indexes. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. + + .. versionchanged:: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + + index : array-like or Index (1d) + Values must be hashable and have the same length as `data`. + Non-unique index values are allowed. Will default to + RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index + sequence are used, the index will override the keys found in the + dict. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, this will be + inferred from `data`. + See the :ref:`user guide ` for more usages. + name : str, optional + The name to give to the Series. + copy : bool, default False + Copy input data. + """ + + _typ = "series" + + _name: Optional[Hashable] + _metadata: List[str] = ["name"] + _accessors = {"dt", "cat", "str", "sparse"} + _deprecations = ( + base.IndexOpsMixin._deprecations + | generic.NDFrame._deprecations + | frozenset(["compress", "ptp"]) + ) + + # Override cache_readonly bc Series is mutable + hasnans = property( + base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ + ) + _data: SingleBlockManager + div: Callable[["Series", Any], "Series"] + rdiv: Callable[["Series", Any], "Series"] + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + ): + + # we are called internally, so short-circuit + if fastpath: + + # data is an ndarray, index is defined + if not isinstance(data, SingleBlockManager): + data = SingleBlockManager(data, index, fastpath=True) + if copy: + data = data.copy() + if index is None: + index = data.index + + else: + + name = ibase.maybe_extract_name(name, data, type(self)) + + if is_empty_data(data) and dtype is None: + # gh-17261 + warnings.warn( + "The default dtype for empty Series will be 'object' instead " + "of 'float64' in a future version. 
Specify a dtype explicitly " + "to silence this warning.", + DeprecationWarning, + stacklevel=2, + ) + # uncomment the line below when removing the DeprecationWarning + # dtype = np.dtype(object) + + if index is not None: + index = ensure_index(index) + + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, MultiIndex): + raise NotImplementedError( + "initializing a Series from a MultiIndex is not supported" + ) + elif isinstance(data, Index): + + if dtype is not None: + # astype copies + data = data.astype(dtype) + else: + # need to copy to avoid aliasing issues + data = data._values.copy() + if isinstance(data, ABCDatetimeIndex) and data.tz is not None: + # GH#24096 need copy to be deep for datetime64tz case + # TODO: See if we can avoid these copies + data = data._values.copy(deep=True) + copy = False + + elif isinstance(data, np.ndarray): + if len(data.dtype): + # GH#13296 we are dealing with a compound dtype, which + # should be treated as 2D + raise ValueError( + "Cannot construct a Series from an ndarray with " + "compound dtype. Use DataFrame instead." + ) + pass + elif isinstance(data, ABCSeries): + if index is None: + index = data.index + else: + data = data.reindex(index, copy=copy) + data = data._data + elif is_dict_like(data): + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False + elif isinstance(data, SingleBlockManager): + if index is None: + index = data.index + elif not data.index.equals(index) or copy: + # GH#19275 SingleBlockManager input should only be called + # internally + raise AssertionError( + "Cannot pass both SingleBlockManager " + "`data` argument and a different " + "`index` argument. `copy` must be False." + ) + + elif is_extension_array_dtype(data): + pass + elif isinstance(data, (set, frozenset)): + raise TypeError(f"'{type(data).__name__}' type is unordered") + elif isinstance(data, ABCSparseArray): + # handle sparse passed here (and force conversion) + data = data.to_dense() + else: + data = com.maybe_iterable_to_list(data) + + if index is None: + if not is_list_like(data): + data = [data] + index = ibase.default_index(len(data)) + elif is_list_like(data): + + # a scalar numpy array is list-like but doesn't + # have a proper length + try: + if len(index) != len(data): + raise ValueError( + f"Length of passed values is {len(data)}, " + f"index implies {len(index)}." + ) + except TypeError: + pass + + # create/copy the manager + if isinstance(data, SingleBlockManager): + if dtype is not None: + data = data.astype(dtype=dtype, errors="ignore", copy=copy) + elif copy: + data = data.copy() + else: + data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True) + + data = SingleBlockManager(data, index, fastpath=True) + + generic.NDFrame.__init__(self, data, fastpath=True) + self.name = name + self._set_axis(0, index, fastpath=True) + + def _init_dict(self, data, index=None, dtype=None): + """ + Derive the "_data" and "index" attributes of a new Series from a + dictionary input. + + Parameters + ---------- + data : dict or dict-like + Data used to populate the new Series. + index : Index or index-like, default None + Index for the new Series: if None, use dict keys. + dtype : dtype, default None + The dtype for the new Series: if None, infer from data. 
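# A minimal sketch (not from the pandas source) of the dict-plus-index behaviour
# handled by _init_dict here: the passed index wins, and keys missing from the
# dict come out as NaN after the reindex step.
import pandas as pd

s = pd.Series({"a": 1, "b": 2}, index=["b", "c"])
print(s)  # b    2.0 / c    NaN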
+ + Returns + ------- + _data : BlockManager for the new Series + index : index for the new Series + """ + # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] + # raises KeyError), so we iterate the entire dict, and align + if data: + keys, values = zip(*data.items()) + values = list(values) + elif index is not None: + # fastpath for Series(data=None). Just use broadcasting a scalar + # instead of reindexing. + values = na_value_for_dtype(dtype) + keys = index + else: + keys, values = [], [] + + # Input is now list-like, so rely on "standard" construction: + + # TODO: passing np.float64 to not break anything yet. See GH-17261 + s = create_series_with_explicit_dtype( + values, index=keys, dtype=dtype, dtype_if_empty=np.float64 + ) + + # Now we just make sure the order is respected, if any + if data and index is not None: + s = s.reindex(index, copy=False) + return s._data, s.index + + # ---------------------------------------------------------------------- + + @property + def _constructor(self): + return Series + + @property + def _constructor_expanddim(self): + from pandas.core.frame import DataFrame + + return DataFrame + + # types + @property + def _can_hold_na(self): + return self._data._can_hold_na + + _index = None + + def _set_axis(self, axis, labels, fastpath=False): + """ + Override generic, we want to set the _typ here. + """ + + if not fastpath: + labels = ensure_index(labels) + + is_all_dates = labels.is_all_dates + if is_all_dates: + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + try: + labels = DatetimeIndex(labels) + # need to set here because we changed the index + if fastpath: + self._data.set_axis(axis, labels) + except (tslibs.OutOfBoundsDatetime, ValueError): + # labels may exceeds datetime bounds, + # or not be a DatetimeIndex + pass + + self._set_subtyp(is_all_dates) + + object.__setattr__(self, "_index", labels) + if not fastpath: + self._data.set_axis(axis, labels) + + def _set_subtyp(self, is_all_dates): + if is_all_dates: + object.__setattr__(self, "_subtyp", "time_series") + else: + object.__setattr__(self, "_subtyp", "series") + + def _update_inplace(self, result, **kwargs): + # we want to call the generic version and not the IndexOpsMixin + return generic.NDFrame._update_inplace(self, result, **kwargs) + + # ndarray compatibility + @property + def dtype(self): + """ + Return the dtype object of the underlying data. + """ + return self._data.dtype + + @property + def dtypes(self): + """ + Return the dtype object of the underlying data. + """ + return self._data.dtype + + @property + def name(self) -> Optional[Hashable]: + return self._name + + @name.setter + def name(self, value: Optional[Hashable]) -> None: + if not is_hashable(value): + raise TypeError("Series.name must be a hashable type") + object.__setattr__(self, "_name", value) + + @property + def values(self): + """ + Return Series as ndarray or ndarray-like depending on the dtype. + + .. warning:: + + We recommend using :attr:`Series.array` or + :meth:`Series.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + + Returns + ------- + numpy.ndarray or ndarray-like + + See Also + -------- + Series.array : Reference to the underlying data. + Series.to_numpy : A NumPy array representing the underlying data. 
+ + Examples + -------- + >>> pd.Series([1, 2, 3]).values + array([1, 2, 3]) + + >>> pd.Series(list('aabc')).values + array(['a', 'a', 'b', 'c'], dtype=object) + + >>> pd.Series(list('aabc')).astype('category').values + [a, a, b, c] + Categories (3, object): [a, b, c] + + Timezone aware datetime data is converted to UTC: + + >>> pd.Series(pd.date_range('20130101', periods=3, + ... tz='US/Eastern')).values + array(['2013-01-01T05:00:00.000000000', + '2013-01-02T05:00:00.000000000', + '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') + """ + return self._data.external_values() + + @property + def _values(self): + """ + Return the internal repr of this data (defined by Block.interval_values). + This are the values as stored in the Block (ndarray or ExtensionArray + depending on the Block class). + + Differs from the public ``.values`` for certain data types, because of + historical backwards compatibility of the public attribute (e.g. period + returns object ndarray and datetimetz a datetime64[ns] ndarray for + ``.values`` while it returns an ExtensionArray for ``._values`` in those + cases). + + Differs from ``.array`` in that this still returns the numpy array if + the Block is backed by a numpy array, while ``.array`` ensures to always + return an ExtensionArray. + + Differs from ``._ndarray_values``, as that ensures to always return a + numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if + the Series was backed by an ExtensionArray). + + Overview: + + dtype | values | _values | array | _ndarray_values | + ----------- | ------------- | ------------- | ------------- | --------------- | + Numeric | ndarray | ndarray | PandasArray | ndarray | + Category | Categorical | Categorical | Categorical | ndarray[int] | + dt64[ns] | ndarray[M8ns] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | + Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | + Nullable | EA | EA | EA | ndarray | + + """ + return self._data.internal_values() + + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore + @property + def array(self) -> ExtensionArray: + return self._data._block.array_values() + + def _internal_get_values(self): + """ + Same as values (but handles sparseness conversions); is a view. + + Returns + ------- + numpy.ndarray + Data of the Series. + """ + + return self._data.get_values() + + # ops + def ravel(self, order="C"): + """ + Return the flattened underlying data as an ndarray. + + Returns + ------- + numpy.ndarray or ndarray-like + Flattened data of the Series. + + See Also + -------- + numpy.ndarray.ravel + """ + return self._values.ravel(order=order) + + def __len__(self) -> int: + """ + Return the length of the Series. + """ + return len(self._data) + + def view(self, dtype=None): + """ + Create a new view of the Series. + + This function will return a new Series with a view of the same + underlying values in memory, optionally reinterpreted with a new data + type. The new data type must preserve the same size in bytes as to not + cause index misalignment. + + Parameters + ---------- + dtype : data type + Data type object or one of their string representations. + + Returns + ------- + Series + A new Series object as a view of the same data in memory. + + See Also + -------- + numpy.ndarray.view : Equivalent numpy function to create a new view of + the same data in memory. + + Notes + ----- + Series are instantiated with ``dtype=float64`` by default. 
While + ``numpy.ndarray.view()`` will return a view with the same data type as + the original array, ``Series.view()`` (without specified dtype) + will try using ``float64`` and may fail if the original data type size + in bytes is not the same. + + Examples + -------- + >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') + >>> s + 0 -2 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + + The 8 bit signed integer representation of `-1` is `0b11111111`, but + the same bytes represent 255 if read as an 8 bit unsigned integer: + + >>> us = s.view('uint8') + >>> us + 0 254 + 1 255 + 2 0 + 3 1 + 4 2 + dtype: uint8 + + The views share the same underlying values: + + >>> us[0] = 128 + >>> s + 0 -128 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + """ + return self._constructor( + self._values.view(dtype), index=self.index + ).__finalize__(self) + + # ---------------------------------------------------------------------- + # NDArray Compat + _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) + + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + # TODO: handle DataFrame + cls = type(self) + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + types = tuple(type(x) for x in inputs) + # TODO: dataframe + alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + index = alignable[0].index + for s in alignable[1:]: + index |= s.index + inputs = tuple( + x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types) + ) + else: + index = self.index + + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + + name: Optional[Hashable] + if len(set(names)) == 1: + name = names[0] + else: + name = None + + def construct_return(result): + if lib.is_scalar(result): + return result + elif result.ndim > 1: + # e.g. np.subtract.outer + if method == "outer": + # GH#27198 + raise NotImplementedError + return result + return self._constructor(result, index=index, name=name, copy=False) + + if type(result) is tuple: + # multiple return values + return tuple(construct_return(x) for x in result) + elif method == "at": + # no return value + return None + else: + return construct_return(result) + + def __array__(self, dtype=None) -> np.ndarray: + """ + Return the values as a NumPy array. + + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to use for the resulting NumPy array. By default, + the dtype is inferred from the data. 
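# A minimal sketch (not from the pandas source) of the alignment performed in
# __array_ufunc__ above: a binary ufunc over two Series reindexes both operands
# to the union of their indexes before computing.
import numpy as np
import pandas as pd

a = pd.Series([1, 2], index=["x", "y"])
b = pd.Series([10, 20], index=["y", "z"])
print(np.add(a, b))  # x    NaN / y    12.0 / z    NaN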
+ + Returns + ------- + numpy.ndarray + The values in the series converted to a :class:`numpy.ndarary` + with the specified `dtype`. + + See Also + -------- + array : Create a new array from data. + Series.array : Zero-copy view to the array backing the Series. + Series.to_numpy : Series method for similar behavior. + + Examples + -------- + >>> ser = pd.Series([1, 2, 3]) + >>> np.asarray(ser) + array([1, 2, 3]) + + For timezone-aware data, the timezones may be retained with + ``dtype='object'`` + + >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> np.asarray(tzser, dtype="object") + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], + dtype=object) + + Or the values may be localized to UTC and the tzinfo discarded with + ``dtype='datetime64[ns]'`` + + >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS + array(['1999-12-31T23:00:00.000000000', ...], + dtype='datetime64[ns]') + """ + return np.asarray(self.array, dtype) + + # ---------------------------------------------------------------------- + # Unary Methods + + # coercion + __float__ = _coerce_method(float) + __long__ = _coerce_method(int) + __int__ = _coerce_method(int) + + # ---------------------------------------------------------------------- + + def _unpickle_series_compat(self, state): + if isinstance(state, dict): + self._data = state["_data"] + self.name = state["name"] + self.index = self._data.index + + elif isinstance(state, tuple): + + # < 0.12 series pickle + + nd_state, own_state = state + + # recreate the ndarray + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + + # backwards compat + index, name = own_state[0], None + if len(own_state) > 1: + name = own_state[1] + + # recreate + self._data = SingleBlockManager(data, index, fastpath=True) + self._index = index + self.name = name + + else: + raise Exception(f"cannot unpickle legacy formats -> [{state}]") + + # indexers + @property + def axes(self): + """ + Return a list of the row axis labels. + """ + return [self.index] + + # ---------------------------------------------------------------------- + # Indexing Methods + + @Appender(generic.NDFrame.take.__doc__) + def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series": + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "'take' always returns a copy, so there is no need to specify this.", + FutureWarning, + stacklevel=2, + ) + nv.validate_take(tuple(), kwargs) + + indices = ensure_platform_int(indices) + new_index = self.index.take(indices) + + if is_categorical_dtype(self): + # https://github.com/pandas-dev/pandas/issues/20664 + # TODO: remove when the default Categorical.take behavior changes + indices = maybe_convert_indices(indices, len(self._get_axis(axis))) + kwargs = {"allow_fill": False} + else: + kwargs = {} + new_values = self._values.take(indices, **kwargs) + + return self._constructor( + new_values, index=new_index, fastpath=True + ).__finalize__(self) + + def _take_with_is_copy(self, indices, axis=0, **kwargs): + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). For Series this does the same + as the public take (it never sets `_is_copy`). + + See the docstring of `take` for full explanation of the parameters. 
+ """ + return self.take(indices=indices, axis=axis, **kwargs) + + def _ixs(self, i: int, axis: int = 0): + """ + Return the i-th value or values in the Series by location. + + Parameters + ---------- + i : int + + Returns + ------- + scalar (int) or Series (slice, sequence) + """ + + # dispatch to the values if we need + values = self._values + if isinstance(values, np.ndarray): + return libindex.get_value_at(values, i) + else: + return values[i] + + def _slice(self, slobj: slice, axis: int = 0, kind=None): + slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") + return self._get_values(slobj) + + def __getitem__(self, key): + key = com.apply_if_callable(key, self) + try: + result = self.index.get_value(self, key) + + if not is_scalar(result): + if is_list_like(result) and not isinstance(result, Series): + + # we need to box if loc of the key isn't scalar here + # otherwise have inline ndarray/lists + try: + if not is_scalar(self.index.get_loc(key)): + result = self._constructor( + result, index=[key] * len(result), dtype=self.dtype + ).__finalize__(self) + except KeyError: + pass + return result + except InvalidIndexError: + pass + except (KeyError, ValueError): + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # kludge + pass + elif key is Ellipsis: + return self + elif com.is_bool_indexer(key): + pass + else: + + # we can try to coerce the indexer (or this will raise) + new_key = self.index._convert_scalar_indexer(key, kind="getitem") + if type(new_key) != type(key): + return self.__getitem__(new_key) + raise + + if is_iterator(key): + key = list(key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + + return self._get_with(key) + + def _get_with(self, key): + # other: fancy integer or otherwise + if isinstance(key, slice): + return self._slice(key) + elif isinstance(key, ABCDataFrame): + raise TypeError( + "Indexing a Series with DataFrame is not " + "supported, use the appropriate DataFrame column" + ) + elif isinstance(key, tuple): + try: + return self._get_values_tuple(key) + except ValueError: + # if we don't have a MultiIndex, we may still be able to handle + # a 1-tuple. see test_1tuple_without_multiindex + if len(key) == 1: + key = key[0] + if isinstance(key, slice): + return self._get_values(key) + raise + + if not isinstance(key, (list, np.ndarray, Series, Index)): + key = list(key) + + if isinstance(key, Index): + key_type = key.inferred_type + else: + key_type = lib.infer_dtype(key, skipna=False) + + if key_type == "integer": + if self.index.is_integer() or self.index.is_floating(): + return self.loc[key] + else: + return self._get_values(key) + elif key_type == "boolean": + return self._get_values(key) + + if isinstance(key, (list, tuple)): + # TODO: de-dup with tuple case handled above? + # handle the dup indexing case GH#4246 + if len(key) == 1 and isinstance(key[0], slice): + # [slice(0, 5, None)] will break if you convert to ndarray, + # e.g. as requested by np.median + # FIXME: hack + return self._get_values(key) + + return self.loc[key] + + return self.reindex(key) + + def _get_values_tuple(self, key): + # mpl hackaround + if com.any_none(*key): + # suppress warning from slicing the index with a 2d indexer. + # eventually we'll want Series itself to warn. 
+ with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Support for multi-dim", DeprecationWarning + ) + return self._get_values(key) + + if not isinstance(self.index, MultiIndex): + raise ValueError("Can only tuple-index with a MultiIndex") + + # If key is contained, would have returned by now + indexer, new_index = self.index.get_loc_level(key) + return self._constructor(self._values[indexer], index=new_index).__finalize__( + self + ) + + def _get_values(self, indexer): + try: + return self._constructor( + self._data.get_slice(indexer), fastpath=True + ).__finalize__(self) + except ValueError: + # mpl compat if we look up e.g. ser[:, np.newaxis]; + # see tests.series.timeseries.test_mpl_compat_hack + return self._values[indexer] + + def _get_value(self, label, takeable: bool = False): + """ + Quickly retrieve single value at passed index label. + + Parameters + ---------- + label : object + takeable : interpret the index as indexers, default False + + Returns + ------- + scalar value + """ + if takeable: + return com.maybe_box_datetimelike(self._values[label]) + return self.index.get_value(self._values, label) + + def __setitem__(self, key, value): + key = com.apply_if_callable(key, self) + cacher_needs_updating = self._check_is_chained_assignment_possible() + + try: + self._set_with_engine(key, value) + except com.SettingWithCopyError: + raise + except (KeyError, ValueError): + values = self._values + if is_integer(key) and not self.index.inferred_type == "integer": + values[key] = value + elif key is Ellipsis: + self[:] = value + else: + self.loc[key] = value + + except TypeError as e: + if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): + raise ValueError("Can only tuple-index with a MultiIndex") + + # python 3 type errors should be raised + if _is_unorderable_exception(e): + raise IndexError(key) + + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + try: + self._where(~key, value, inplace=True) + return + except InvalidIndexError: + pass + + self._set_with(key, value) + + if cacher_needs_updating: + self._maybe_update_cacher() + + def _set_with_engine(self, key, value): + values = self._values + if is_extension_array_dtype(values.dtype): + # The cython indexing engine does not support ExtensionArrays. 
+ values[self.index.get_loc(key)] = value + return + try: + self.index._engine.set_value(values, key, value) + return + except KeyError: + values[self.index.get_loc(key)] = value + return + + def _set_with(self, key, value): + # other: fancy integer or otherwise + if isinstance(key, slice): + indexer = self.index._convert_slice_indexer(key, kind="getitem") + return self._set_values(indexer, value) + + elif is_scalar(key) and not is_integer(key) and key not in self.index: + # GH#12862 adding an new key to the Series + # Note: have to exclude integers because that is ambiguously + # position-based + self.loc[key] = value + return + + else: + if isinstance(key, tuple): + try: + # TODO: no test cases that get here + self._set_values(key, value) + except Exception: + pass + + if is_scalar(key): + key = [key] + + if isinstance(key, Index): + key_type = key.inferred_type + key = key._values + else: + key_type = lib.infer_dtype(key, skipna=False) + + if key_type == "integer": + if self.index.inferred_type == "integer": + self._set_labels(key, value) + else: + return self._set_values(key, value) + elif key_type == "boolean": + self._set_values(key.astype(np.bool_), value) + else: + self._set_labels(key, value) + + def _set_labels(self, key, value): + key = com.asarray_tuplesafe(key) + indexer = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise ValueError(f"{key[mask]} not contained in the index") + self._set_values(indexer, value) + + def _set_values(self, key, value): + if isinstance(key, Series): + key = key._values + self._data = self._data.setitem(indexer=key, value=value) + self._maybe_update_cacher() + + def _set_value(self, label, value, takeable: bool = False): + """ + Quickly set single value at passed label. + + If label is not contained, a new object is created with the label + placed at the end of the result index. + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed. + value : object + Scalar value. + takeable : interpret the index as indexers, default False + + Returns + ------- + Series + If label is contained, will be reference to calling Series, + otherwise a new object. + """ + try: + if takeable: + self._values[label] = value + else: + self.index._engine.set_value(self._values, label, value) + except (KeyError, TypeError): + + # set using a non-recursive method + self.loc[label] = value + + return self + + # ---------------------------------------------------------------------- + # Unsorted + + @property + def _is_mixed_type(self): + return False + + def repeat(self, repeats, axis=None): + """ + Repeat elements of a Series. + + Returns a new Series where each element of the current Series + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + Series. + axis : None + Must be ``None``. Has no effect but is accepted for compatibility + with numpy. + + Returns + ------- + Series + Newly created Series with repeated elements. + + See Also + -------- + Index.repeat : Equivalent function for Index. + numpy.repeat : Similar method for :class:`numpy.ndarray`. 
+ + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.repeat(2) + 0 a + 0 a + 1 b + 1 b + 2 c + 2 c + dtype: object + >>> s.repeat([1, 2, 3]) + 0 a + 1 b + 1 b + 2 c + 2 c + 2 c + dtype: object + """ + nv.validate_repeat(tuple(), dict(axis=axis)) + new_index = self.index.repeat(repeats) + new_values = self._values.repeat(repeats) + return self._constructor(new_values, index=new_index).__finalize__(self) + + def reset_index(self, level=None, drop=False, name=None, inplace=False): + """ + Generate a new DataFrame or Series with the index reset. + + This is useful when the index needs to be treated as a column, or + when the index is meaningless and needs to be reset to the default + before another operation. + + Parameters + ---------- + level : int, str, tuple, or list, default optional + For a Series with a MultiIndex, only remove the specified levels + from the index. Removes all levels by default. + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + name : object, optional + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. This argument is ignored + when `drop` is True. + inplace : bool, default False + Modify the Series in place (do not create a new object). + + Returns + ------- + Series or DataFrame + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. + + See Also + -------- + DataFrame.reset_index: Analogous function for DataFrame. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4], name='foo', + ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) + + Generate a DataFrame with default index. + + >>> s.reset_index() + idx foo + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To specify the name of the new column use `name`. + + >>> s.reset_index(name='values') + idx values + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To generate a new Series with the default set `drop` to True. + + >>> s.reset_index(drop=True) + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + To update the Series in place, without generating a new one + set `inplace` to True. Note that it also requires ``drop=True``. + + >>> s.reset_index(inplace=True, drop=True) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + The `level` parameter is interesting for Series with a multi-level + index. + + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), + ... np.array(['one', 'two', 'one', 'two'])] + >>> s2 = pd.Series( + ... range(4), name='foo', + ... index=pd.MultiIndex.from_arrays(arrays, + ... names=['a', 'b'])) + + To remove a specific level from the Index, use `level`. + + >>> s2.reset_index(level='a') + a foo + b + one bar 0 + two bar 1 + one baz 2 + two baz 3 + + If `level` is not set, all levels are removed from the Index. 
+ + >>> s2.reset_index() + a b foo + 0 bar one 0 + 1 bar two 1 + 2 baz one 2 + 3 baz two 3 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if drop: + new_index = ibase.default_index(len(self)) + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) + + if inplace: + self.index = new_index + # set name if it was passed, otherwise, keep the previous name + self.name = name or self.name + else: + return self._constructor( + self._values.copy(), index=new_index + ).__finalize__(self) + elif inplace: + raise TypeError( + "Cannot reset_index inplace on a Series to create a DataFrame" + ) + else: + df = self.to_frame(name) + return df.reset_index(level=level, drop=drop) + + # ---------------------------------------------------------------------- + # Rendering Methods + + def __repr__(self) -> str: + """ + Return a string representation for a particular Series. + """ + buf = StringIO("") + width, height = get_terminal_size() + max_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.max_rows") + ) + min_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.min_rows") + ) + show_dimensions = get_option("display.show_dimensions") + + self.to_string( + buf=buf, + name=self.name, + dtype=self.dtype, + min_rows=min_rows, + max_rows=max_rows, + length=show_dimensions, + ) + result = buf.getvalue() + + return result + + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None, + min_rows=None, + ): + """ + Render a string representation of the Series. + + Parameters + ---------- + buf : StringIO-like, optional + Buffer to write to. + na_rep : str, optional + String representation of NaN to use, default 'NaN'. + float_format : one-parameter function, optional + Formatter function to apply to columns' elements if they are + floats, default None. + header : bool, default True + Add the Series header (index name). + index : bool, optional + Add index (row) labels, default True. + length : bool, default False + Add the Series length. + dtype : bool, default False + Add the Series dtype. + name : bool, default False + Add the Series name if not None. + max_rows : int, optional + Maximum number of rows to show before truncating. If None, show + all. + min_rows : int, optional + The number of rows to display in a truncated repr (when number + of rows is above `max_rows`). + + Returns + ------- + str or None + String representation of Series if ``buf=None``, otherwise None. 
+        """
+
+        formatter = fmt.SeriesFormatter(
+            self,
+            name=name,
+            length=length,
+            header=header,
+            index=index,
+            dtype=dtype,
+            na_rep=na_rep,
+            float_format=float_format,
+            min_rows=min_rows,
+            max_rows=max_rows,
+        )
+        result = formatter.to_string()
+
+        # catch contract violations
+        if not isinstance(result, str):
+            raise AssertionError(
+                "result must be of type str, type"
+                f" of result is {repr(type(result).__name__)}"
+            )
+
+        if buf is None:
+            return result
+        else:
+            try:
+                buf.write(result)
+            except AttributeError:
+                with open(buf, "w") as f:
+                    f.write(result)
+
+    @Appender(
+        """
+        Examples
+        --------
+        >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal")
+        >>> print(s.to_markdown())
+        |    | animal   |
+        |---:|:---------|
+        |  0 | elk      |
+        |  1 | pig      |
+        |  2 | dog      |
+        |  3 | quetzal  |
+        """
+    )
+    @Substitution(klass="Series")
+    @Appender(generic._shared_docs["to_markdown"])
+    def to_markdown(
+        self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
+    ) -> Optional[str]:
+        return self.to_frame().to_markdown(buf, mode, **kwargs)
+
+    # ----------------------------------------------------------------------
+
+    def items(self):
+        """
+        Lazily iterate over (index, value) tuples.
+
+        This method returns an iterable tuple (index, value). This is
+        convenient if you want to create a lazy iterator.
+
+        Returns
+        -------
+        iterable
+            Iterable of tuples containing the (index, value) pairs from a
+            Series.
+
+        See Also
+        --------
+        DataFrame.items : Iterate over (column name, Series) pairs.
+        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.
+
+        Examples
+        --------
+        >>> s = pd.Series(['A', 'B', 'C'])
+        >>> for index, value in s.items():
+        ...     print(f"Index : {index}, Value : {value}")
+        Index : 0, Value : A
+        Index : 1, Value : B
+        Index : 2, Value : C
+        """
+        return zip(iter(self.index), iter(self))
+
+    @Appender(items.__doc__)
+    def iteritems(self):
+        return self.items()
+
+    # ----------------------------------------------------------------------
+    # Misc public methods
+
+    def keys(self):
+        """
+        Return alias for index.
+
+        Returns
+        -------
+        Index
+            Index of the Series.
+        """
+        return self.index
+
+    def to_dict(self, into=dict):
+        """
+        Convert Series to {label -> value} dict or dict-like object.
+
+        Parameters
+        ----------
+        into : class, default dict
+            The collections.abc.Mapping subclass to use as the return
+            object. Can be the actual class or an empty
+            instance of the mapping type you want. If you want a
+            collections.defaultdict, you must pass it initialized.
+
+            .. versionadded:: 0.21.0
+
+        Returns
+        -------
+        collections.abc.Mapping
+            Key-value representation of Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s.to_dict()
+        {0: 1, 1: 2, 2: 3, 3: 4}
+        >>> from collections import OrderedDict, defaultdict
+        >>> s.to_dict(OrderedDict)
+        OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+        >>> dd = defaultdict(list)
+        >>> s.to_dict(dd)
+        defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+        """
+        # GH16122
+        into_c = com.standardize_mapping(into)
+        return into_c(self.items())
+
+    def to_frame(self, name=None):
+        """
+        Convert Series to DataFrame.
+
+        Parameters
+        ----------
+        name : object, default None
+            The passed name should substitute for the series name (if it has
+            one).
+
+        Returns
+        -------
+        DataFrame
+            DataFrame representation of Series.
+
+        Examples
+        --------
+        >>> s = pd.Series(["a", "b", "c"],
+        ...
name="vals") + >>> s.to_frame() + vals + 0 a + 1 b + 2 c + """ + if name is None: + df = self._constructor_expanddim(self) + else: + df = self._constructor_expanddim({name: self}) + + return df + + def _set_name(self, name, inplace=False): + """ + Set the Series name. + + Parameters + ---------- + name : str + inplace : bool + Whether to modify `self` directly or return a copy. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + ser = self if inplace else self.copy() + ser.name = name + return ser + + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + + # ---------------------------------------------------------------------- + # Statistics, overridden ndarray methods + + # TODO: integrate bottleneck + + def count(self, level=None): + """ + Return number of non-NA/null observations in the Series. + + Parameters + ---------- + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a smaller Series. + + Returns + ------- + int or Series (if level specified) + Number of non-null values in the Series. 
+
+        Examples
+        --------
+        >>> s = pd.Series([0.0, 1.0, np.nan])
+        >>> s.count()
+        2
+        """
+        if level is None:
+            return notna(self.array).sum()
+
+        if isinstance(level, str):
+            level = self.index._get_level_number(level)
+
+        lev = self.index.levels[level]
+        level_codes = np.array(self.index.codes[level], subok=False, copy=True)
+
+        mask = level_codes == -1
+        if mask.any():
+            level_codes[mask] = cnt = len(lev)
+            lev = lev.insert(cnt, lev._na_value)
+
+        obs = level_codes[notna(self.values)]
+        out = np.bincount(obs, minlength=len(lev) or None)
+        return self._constructor(out, index=lev, dtype="int64").__finalize__(self)
+
+    def mode(self, dropna=True):
+        """
+        Return the mode(s) of the dataset.
+
+        Always returns Series even if only one value is returned.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't consider counts of NaN/NaT.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        Series
+            Modes of the Series in sorted order.
+        """
+        # TODO: Add option for bins like value_counts()
+        return algorithms.mode(self, dropna=dropna)
+
+    def unique(self):
+        """
+        Return unique values of Series object.
+
+        Uniques are returned in order of appearance. Hash table-based unique,
+        therefore does NOT sort.
+
+        Returns
+        -------
+        ndarray or ExtensionArray
+            The unique values returned as a NumPy array. See Notes.
+
+        See Also
+        --------
+        unique : Top-level unique method for any 1-d array-like object.
+        Index.unique : Return Index with unique values from an Index object.
+
+        Notes
+        -----
+        Returns the unique values as a NumPy array. In case of an
+        extension-array backed Series, a new
+        :class:`~api.extensions.ExtensionArray` of that type with just
+        the unique values is returned. This includes
+
+            * Categorical
+            * Period
+            * Datetime with Timezone
+            * Interval
+            * Sparse
+            * IntegerNA
+
+        See Examples section.
+
+        Examples
+        --------
+        >>> pd.Series([2, 1, 3, 3], name='A').unique()
+        array([2, 1, 3])
+
+        >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
+        array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
+
+        >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
+        ...            for _ in range(3)]).unique()
+        <DatetimeArray>
+        ['2016-01-01 00:00:00-05:00']
+        Length: 1, dtype: datetime64[ns, US/Eastern]
+
+        An unordered Categorical will return categories in the order of
+        appearance.
+
+        >>> pd.Series(pd.Categorical(list('baabc'))).unique()
+        [b, a, c]
+        Categories (3, object): [b, a, c]
+
+        An ordered Categorical preserves the category ordering.
+
+        >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
+        ...                          ordered=True)).unique()
+        [b, a, c]
+        Categories (3, object): [a < b < c]
+        """
+        result = super().unique()
+        return result
+
+    def drop_duplicates(self, keep="first", inplace=False):
+        """
+        Return Series with duplicate values removed.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', ``False``}, default 'first'
+            Method to handle dropping duplicates:
+
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+
+        inplace : bool, default ``False``
+            If ``True``, performs operation inplace and returns None.
+
+        Returns
+        -------
+        Series
+            Series with duplicates dropped.
+
+        See Also
+        --------
+        Index.drop_duplicates : Equivalent method on Index.
+        DataFrame.drop_duplicates : Equivalent method on DataFrame.
+        Series.duplicated : Related method on Series, indicating duplicate
+            Series values.
+ + Examples + -------- + Generate a Series with duplicated entries. + + >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], + ... name='animal') + >>> s + 0 lama + 1 cow + 2 lama + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + With the 'keep' parameter, the selection behaviour of duplicated values + can be changed. The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> s.drop_duplicates() + 0 lama + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + + The value 'last' for parameter 'keep' keeps the last occurrence for + each set of duplicated entries. + + >>> s.drop_duplicates(keep='last') + 1 cow + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + The value ``False`` for parameter 'keep' discards all sets of + duplicated entries. Setting the value of 'inplace' to ``True`` performs + the operation inplace and returns ``None``. + + >>> s.drop_duplicates(keep=False, inplace=True) + >>> s + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + """ + return super().drop_duplicates(keep=keep, inplace=inplace) + + def duplicated(self, keep="first"): + """ + Indicate duplicate Series values. + + Duplicated values are indicated as ``True`` values in the resulting + Series. Either all duplicates, all except the first or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + Method to handle dropping duplicates: + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + Series + Series indicating whether each value has occurred in the + preceding values. + + See Also + -------- + Index.duplicated : Equivalent method on pandas.Index. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Series.drop_duplicates : Remove duplicate values from Series. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> animals.duplicated() + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + which is equivalent to + + >>> animals.duplicated(keep='first') + 0 False + 1 False + 2 True + 3 False + 4 True + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> animals.duplicated(keep='last') + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + By setting keep on ``False``, all duplicates are True: + + >>> animals.duplicated(keep=False) + 0 True + 1 False + 2 True + 3 False + 4 True + dtype: bool + """ + return super().duplicated(keep=keep) + + def idxmin(self, axis=0, skipna=True, *args, **kwargs): + """ + Return the row label of the minimum value. + + If multiple values equal the minimum, the first row label with that + value is returned. + + Parameters + ---------- + axis : int, default 0 + For compatibility with DataFrame.idxmin. Redundant for application + on Series. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + Index + Label of the minimum value. 
+ + Raises + ------ + ValueError + If the Series is empty. + + See Also + -------- + numpy.argmin : Return indices of the minimum values + along the given axis. + DataFrame.idxmin : Return index of first occurrence of minimum + over requested axis. + Series.idxmax : Return index *label* of the first occurrence + of maximum of values. + + Notes + ----- + This method is the Series version of ``ndarray.argmin``. This method + returns the label of the minimum, while ``ndarray.argmin`` returns + the position. To get the position, use ``series.values.argmin()``. + + Examples + -------- + >>> s = pd.Series(data=[1, None, 4, 1], + ... index=['A', 'B', 'C', 'D']) + >>> s + A 1.0 + B NaN + C 4.0 + D 1.0 + dtype: float64 + + >>> s.idxmin() + 'A' + + If `skipna` is False and there is an NA value in the data, + the function returns ``nan``. + + >>> s.idxmin(skipna=False) + nan + """ + skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) + i = nanops.nanargmin(com.values_from_object(self), skipna=skipna) + if i == -1: + return np.nan + return self.index[i] + + def idxmax(self, axis=0, skipna=True, *args, **kwargs): + """ + Return the row label of the maximum value. + + If multiple values equal the maximum, the first row label with that + value is returned. + + Parameters + ---------- + axis : int, default 0 + For compatibility with DataFrame.idxmax. Redundant for application + on Series. + skipna : bool, default True + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + Index + Label of the maximum value. + + Raises + ------ + ValueError + If the Series is empty. + + See Also + -------- + numpy.argmax : Return indices of the maximum values + along the given axis. + DataFrame.idxmax : Return index of first occurrence of maximum + over requested axis. + Series.idxmin : Return index *label* of the first occurrence + of minimum of values. + + Notes + ----- + This method is the Series version of ``ndarray.argmax``. This method + returns the label of the maximum, while ``ndarray.argmax`` returns + the position. To get the position, use ``series.values.argmax()``. + + Examples + -------- + >>> s = pd.Series(data=[1, None, 4, 3, 4], + ... index=['A', 'B', 'C', 'D', 'E']) + >>> s + A 1.0 + B NaN + C 4.0 + D 3.0 + E 4.0 + dtype: float64 + + >>> s.idxmax() + 'C' + + If `skipna` is False and there is an NA value in the data, + the function returns ``nan``. + + >>> s.idxmax(skipna=False) + nan + """ + skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) + i = nanops.nanargmax(com.values_from_object(self), skipna=skipna) + if i == -1: + return np.nan + return self.index[i] + + def round(self, decimals=0, *args, **kwargs): + """ + Round each value in a Series to the given number of decimals. + + Parameters + ---------- + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + + Returns + ------- + Series + Rounded values of the Series. + + See Also + -------- + numpy.around : Round values of an np.array. + DataFrame.round : Round values of a DataFrame. 
+ + Examples + -------- + >>> s = pd.Series([0.1, 1.3, 2.7]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + """ + nv.validate_round(args, kwargs) + result = com.values_from_object(self).round(decimals) + result = self._constructor(result, index=self.index).__finalize__(self) + + return result + + def quantile(self, q=0.5, interpolation="linear"): + """ + Return value at the given quantile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + + Returns + ------- + float or Series + If ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + + See Also + -------- + core.window.Rolling.quantile + numpy.percentile + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 + """ + + validate_percentile(q) + + # We dispatch to DataFrame so that core.internals only has to worry + # about 2D cases. + df = self.to_frame() + + result = df.quantile(q=q, interpolation=interpolation, numeric_only=False) + if result.ndim == 2: + result = result.iloc[:, 0] + + if is_list_like(q): + result.name = self.name + return self._constructor(result, index=Float64Index(q), name=self.name) + else: + # scalar + return result.iloc[0] + + def corr(self, other, method="pearson", min_periods=None): + """ + Compute correlation with `other` Series, excluding missing values. + + Parameters + ---------- + other : Series + Series with which to compute the correlation. + method : {'pearson', 'kendall', 'spearman'} or callable + Method used to compute correlation: + + - pearson : Standard correlation coefficient + - kendall : Kendall Tau correlation coefficient + - spearman : Spearman rank correlation + - callable: Callable with input two 1d ndarrays and returning a float. + + .. versionadded:: 0.24.0 + Note that the returned matrix from corr will have 1 along the + diagonals and will be symmetric regardless of the callable's + behavior. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + + Returns + ------- + float + Correlation with other. + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> s1 = pd.Series([.2, .0, .6, .2]) + >>> s2 = pd.Series([.3, .6, .0, .1]) + >>> s1.corr(s2, method=histogram_intersection) + 0.3 + """ + this, other = self.align(other, join="inner", copy=False) + if len(this) == 0: + return np.nan + + if method in ["pearson", "spearman", "kendall"] or callable(method): + return nanops.nancorr( + this.values, other.values, method=method, min_periods=min_periods + ) + + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + f"'{method}' was supplied" + ) + + def cov(self, other, min_periods=None): + """ + Compute covariance with Series, excluding missing values. 
+ + Parameters + ---------- + other : Series + Series with which to compute the covariance. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + + Returns + ------- + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). + + Examples + -------- + >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) + >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198]) + >>> s1.cov(s2) + -0.01685762652715874 + """ + this, other = self.align(other, join="inner", copy=False) + if len(this) == 0: + return np.nan + return nanops.nancov(this.values, other.values, min_periods=min_periods) + + def diff(self, periods=1): + """ + First discrete difference of element. + + Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative + values. + + Returns + ------- + Series + First differences of the Series. + + See Also + -------- + Series.pct_change: Percent change over given number of periods. + Series.shift: Shift index by desired number of periods with an + optional time freq. + DataFrame.diff: First discrete difference of object. + + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + + Examples + -------- + Difference with previous row + + >>> s = pd.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3) + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1) + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 + """ + result = algorithms.diff(self.array, periods) + return self._constructor(result, index=self.index).__finalize__(self) + + def autocorr(self, lag=1): + """ + Compute the lag-N autocorrelation. + + This method computes the Pearson correlation between + the Series and its shifted self. + + Parameters + ---------- + lag : int, default 1 + Number of lags to apply before performing autocorrelation. + + Returns + ------- + float + The Pearson correlation between self and self.shift(lag). + + See Also + -------- + Series.corr : Compute the correlation between two Series. + Series.shift : Shift index by desired number of periods. + DataFrame.corr : Compute pairwise correlation of columns. + DataFrame.corrwith : Compute pairwise correlation between rows or + columns of two DataFrame objects. + + Notes + ----- + If the Pearson correlation is not well defined return 'NaN'. + + Examples + -------- + >>> s = pd.Series([0.25, 0.5, 0.2, -0.05]) + >>> s.autocorr() # doctest: +ELLIPSIS + 0.10355... + >>> s.autocorr(lag=2) # doctest: +ELLIPSIS + -0.99999... + + If the Pearson correlation is not well defined, then 'NaN' is returned. + + >>> s = pd.Series([1, 0, 0, 0]) + >>> s.autocorr() + nan + """ + return self.corr(self.shift(lag)) + + def dot(self, other): + """ + Compute the dot product between the Series and the columns of other. + + This method computes the dot product between the Series and another + one, or the Series and each columns of a DataFrame, or the Series and + each columns of an array. + + It can also be called using `self @ other` in Python >= 3.5. + + Parameters + ---------- + other : Series, DataFrame or array-like + The other object to compute the dot product with its columns. 
+ + Returns + ------- + scalar, Series or numpy.ndarray + Return the dot product of the Series and other if other is a + Series, the Series of the dot product of Series and each rows of + other if other is a DataFrame or a numpy.ndarray between the Series + and each columns of the numpy array. + + See Also + -------- + DataFrame.dot: Compute the matrix product with the DataFrame. + Series.mul: Multiplication of series and other, element-wise. + + Notes + ----- + The Series and other has to share the same index if other is a Series + or a DataFrame. + + Examples + -------- + >>> s = pd.Series([0, 1, 2, 3]) + >>> other = pd.Series([-1, 2, -3, 4]) + >>> s.dot(other) + 8 + >>> s @ other + 8 + >>> df = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(df) + 0 24 + 1 14 + dtype: int64 + >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]]) + >>> s.dot(arr) + array([24, 14]) + """ + if isinstance(other, (Series, ABCDataFrame)): + common = self.index.union(other.index) + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(index=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right.values + else: + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[0] != rvals.shape[0]: + raise Exception( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, ABCDataFrame): + return self._constructor( + np.dot(lvals, rvals), index=other.columns + ).__finalize__(self) + elif isinstance(other, Series): + return np.dot(lvals, rvals) + elif isinstance(rvals, np.ndarray): + return np.dot(lvals, rvals) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + def __matmul__(self, other): + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + return self.dot(other) + + def __rmatmul__(self, other): + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ + return self.dot(np.transpose(other)) + + @Substitution(klass="Series") + @Appender(base._shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) + + # ------------------------------------------------------------------- + # Combination + + def append(self, to_append, ignore_index=False, verify_integrity=False): + """ + Concatenate two or more Series. + + Parameters + ---------- + to_append : Series or list/tuple of Series + Series to append with self. + ignore_index : bool, default False + If True, do not use the index labels. + verify_integrity : bool, default False + If True, raise Exception on creating index with duplicates. + + Returns + ------- + Series + Concatenated Series. + + See Also + -------- + concat : General function to concatenate DataFrame or Series objects. + + Notes + ----- + Iteratively appending to a Series can be more computationally intensive + than a single concatenate. A better solution is to append values to a + list and then concatenate the list with the original Series all at + once. 
+ + Examples + -------- + >>> s1 = pd.Series([1, 2, 3]) + >>> s2 = pd.Series([4, 5, 6]) + >>> s3 = pd.Series([4, 5, 6], index=[3, 4, 5]) + >>> s1.append(s2) + 0 1 + 1 2 + 2 3 + 0 4 + 1 5 + 2 6 + dtype: int64 + + >>> s1.append(s3) + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + + With `ignore_index` set to True: + + >>> s1.append(s2, ignore_index=True) + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + + With `verify_integrity` set to True: + + >>> s1.append(s2, verify_integrity=True) + Traceback (most recent call last): + ... + ValueError: Indexes have overlapping values: [0, 1, 2] + """ + from pandas.core.reshape.concat import concat + + if isinstance(to_append, (list, tuple)): + to_concat = [self] + to_concat.extend(to_append) + else: + to_concat = [self, to_append] + return concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) + + def _binop(self, other, func, level=None, fill_value=None): + """ + Perform generic binary operation with optional fill value. + + Parameters + ---------- + other : Series + func : binary operator + fill_value : float or object + Value to substitute for NA/null values. If both Series are NA in a + location, the result will be NA regardless of the passed fill value. + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level. + + Returns + ------- + Series + """ + + if not isinstance(other, Series): + raise AssertionError("Other operand must be Series") + + new_index = self.index + this = self + + if not self.index.equals(other.index): + this, other = self.align(other, level=level, join="outer", copy=False) + new_index = this.index + + this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) + + with np.errstate(all="ignore"): + result = func(this_vals, other_vals) + + name = ops.get_op_result_name(self, other) + ret = ops._construct_result(self, result, new_index, name) + return ret + + def combine(self, other, func, fill_value=None): + """ + Combine the Series with a Series or scalar according to `func`. + + Combine the Series and `other` using `func` to perform elementwise + selection for combined Series. + `fill_value` is assumed when value is missing at some index + from one of the two objects being combined. + + Parameters + ---------- + other : Series or scalar + The value(s) to be combined with the `Series`. + func : function + Function that takes two scalars as inputs and returns an element. + fill_value : scalar, optional + The value to assume when an index is missing from + one Series or the other. The default specifies to use the + appropriate NaN value for the underlying dtype of the Series. + + Returns + ------- + Series + The result of combining the Series with the other object. + + See Also + -------- + Series.combine_first : Combine Series values, choosing the calling + Series' values first. + + Examples + -------- + Consider 2 Datasets ``s1`` and ``s2`` containing + highest clocked speeds of different birds. 
+ + >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + >>> s1 + falcon 330.0 + eagle 160.0 + dtype: float64 + >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + >>> s2 + falcon 345.0 + eagle 200.0 + duck 30.0 + dtype: float64 + + Now, to combine the two datasets and view the highest speeds + of the birds across the two datasets + + >>> s1.combine(s2, max) + duck NaN + eagle 200.0 + falcon 345.0 + dtype: float64 + + In the previous example, the resulting value for duck is missing, + because the maximum of a NaN and a float is a NaN. + So, in the example, we set ``fill_value=0``, + so the maximum value returned will be the value from some dataset. + + >>> s1.combine(s2, max, fill_value=0) + duck 30.0 + eagle 200.0 + falcon 345.0 + dtype: float64 + """ + if fill_value is None: + fill_value = na_value_for_dtype(self.dtype, compat=False) + + if isinstance(other, Series): + # If other is a Series, result is based on union of Series, + # so do this element by element + new_index = self.index.union(other.index) + new_name = ops.get_op_result_name(self, other) + new_values = [] + for idx in new_index: + lv = self.get(idx, fill_value) + rv = other.get(idx, fill_value) + with np.errstate(all="ignore"): + new_values.append(func(lv, rv)) + else: + # Assume that other is a scalar, so apply the function for + # each element in the Series + new_index = self.index + with np.errstate(all="ignore"): + new_values = [func(lv, other) for lv in self._values] + new_name = self.name + + if is_categorical_dtype(self.values): + pass + elif is_extension_array_dtype(self.values): + # The function can return something of any type, so check + # if the type is compatible with the calling EA. + new_values = try_cast_to_ea(self._values, new_values) + return self._constructor(new_values, index=new_index, name=new_name) + + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values first. + + Parameters + ---------- + other : Series + The value(s) to be combined with the `Series`. + + Returns + ------- + Series + The result of combining the Series with the other object. + + See Also + -------- + Series.combine : Perform elementwise operation on two Series + using a given function. + + Notes + ----- + Result index will be the union of the two indexes. + + Examples + -------- + >>> s1 = pd.Series([1, np.nan]) + >>> s2 = pd.Series([3, 4]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + dtype: float64 + """ + new_index = self.index.union(other.index) + this = self.reindex(new_index, copy=False) + other = other.reindex(new_index, copy=False) + if this.dtype.kind == "M" and other.dtype.kind != "M": + other = to_datetime(other) + + return this.where(notna(this), other) + + def update(self, other): + """ + Modify Series in place using non-NA values from passed + Series. Aligns on index. + + Parameters + ---------- + other : Series + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + >>> s = pd.Series(['a', 'b', 'c']) + >>> s.update(pd.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. 
+ + >>> s = pd.Series([1, 2, 3]) + >>> s.update(pd.Series([4, np.nan, 6])) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + """ + other = other.reindex_like(self) + mask = notna(other) + + self._data = self._data.putmask(mask=mask, new=other, inplace=True) + self._maybe_update_cacher() + + # ---------------------------------------------------------------------- + # Reindexing, sorting + + def sort_values( + self, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """ + Sort by the values. + + Sort a Series in ascending or descending order by some + criterion. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + Axis to direct sorting. The value 'index' is accepted for + compatibility with DataFrame.sort_values. + ascending : bool, default True + If True, sort values in ascending order, otherwise descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort' or 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' is the only stable algorithm. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + Series + Series ordered by values. + + See Also + -------- + Series.sort_index : Sort by the Series indices. + DataFrame.sort_values : Sort DataFrame by the values along either axis. + DataFrame.sort_index : Sort DataFrame by indices. + + Examples + -------- + >>> s = pd.Series([np.nan, 1, 3, 10, 5]) + >>> s + 0 NaN + 1 1.0 + 2 3.0 + 3 10.0 + 4 5.0 + dtype: float64 + + Sort values ascending order (default behaviour) + + >>> s.sort_values(ascending=True) + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + 0 NaN + dtype: float64 + + Sort values descending order + + >>> s.sort_values(ascending=False) + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values inplace + + >>> s.sort_values(ascending=False, inplace=True) + >>> s + 3 10.0 + 4 5.0 + 2 3.0 + 1 1.0 + 0 NaN + dtype: float64 + + Sort values putting NAs first + + >>> s.sort_values(na_position='first') + 0 NaN + 1 1.0 + 2 3.0 + 4 5.0 + 3 10.0 + dtype: float64 + + Sort a series of strings + + >>> s = pd.Series(['z', 'b', 'd', 'a', 'c']) + >>> s + 0 z + 1 b + 2 d + 3 a + 4 c + dtype: object + + >>> s.sort_values() + 3 a + 1 b + 4 c + 2 d + 0 z + dtype: object + """ + inplace = validate_bool_kwarg(inplace, "inplace") + # Validate the axis parameter + self._get_axis_number(axis) + + # GH 5856/5853 + if inplace and self._is_cached: + raise ValueError( + "This Series is a view of some other array, to " + "sort in-place you must create a copy" + ) + + def _try_kind_sort(arr): + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind="quicksort") + + arr = self._values + sorted_index = np.empty(len(self), dtype=np.int32) + + bad = isna(arr) + + good = ~bad + idx = ibase.default_index(len(self)) + + argsorted = _try_kind_sort(arr[good]) + + if is_list_like(ascending): + if len(ascending) != 1: + raise ValueError( + f"Length of ascending ({len(ascending)}) must be 1 for Series" + ) + ascending = ascending[0] + + if not 
is_bool(ascending): + raise ValueError("ascending must be boolean") + + if not ascending: + argsorted = argsorted[::-1] + + if na_position == "last": + n = good.sum() + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] + elif na_position == "first": + n = bad.sum() + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] + else: + raise ValueError(f"invalid na_position: {na_position}") + + result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + + if ignore_index: + result.index = ibase.default_index(len(sorted_index)) + + if inplace: + self._update_inplace(result) + else: + return result.__finalize__(self) + + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ignore_index: bool = False, + ): + """ + Sort Series by index labels. + + Returns a new Series sorted by label if `inplace` argument is + ``False``, otherwise updates the original series and returns None. + + Parameters + ---------- + axis : int, default 0 + Axis to direct sorting. This can only be 0 for Series. + level : int, optional + If not None, sort on values in specified index level(s). + ascending : bool, default true + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. 'mergesort' is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + Series + The original Series sorted by the labels. + + See Also + -------- + DataFrame.sort_index: Sort DataFrame by the index. + DataFrame.sort_values: Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> s.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> s.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object + + Sort Inplace + + >>> s.sort_index(inplace=True) + >>> s + 1 c + 2 b + 3 a + 4 d + dtype: object + + By default NaNs are put at the end, but use `na_position` to place + them at the beginning + + >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan]) + >>> s.sort_index(na_position='first') + NaN d + 1.0 c + 2.0 b + 3.0 a + dtype: object + + Specify index level to sort + + >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo', + ... 'baz', 'baz', 'bar', 'bar']), + ... np.array(['two', 'one', 'two', 'one', + ... 
'two', 'one', 'two', 'one'])] + >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays) + >>> s.sort_index(level=1) + bar one 8 + baz one 6 + foo one 4 + qux one 2 + bar two 7 + baz two 5 + foo two 3 + qux two 1 + dtype: int64 + + Does not sort by remaining levels when sorting by levels + + >>> s.sort_index(level=1, sort_remaining=False) + qux one 2 + foo one 4 + baz one 6 + bar one 8 + qux two 1 + foo two 3 + baz two 5 + bar two 7 + dtype: int64 + """ + # TODO: this can be combined with DataFrame.sort_index impl as + # almost identical + inplace = validate_bool_kwarg(inplace, "inplace") + # Validate the axis parameter + self._get_axis_number(axis) + index = self.index + + if level is not None: + new_index, indexer = index.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + elif isinstance(index, MultiIndex): + from pandas.core.sorting import lexsort_indexer + + labels = index._sort_levels_monotonic() + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) + else: + from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if (ascending and index.is_monotonic_increasing) or ( + not ascending and index.is_monotonic_decreasing + ): + if inplace: + return + else: + return self.copy() + + indexer = nargsort( + index, kind=kind, ascending=ascending, na_position=na_position + ) + + indexer = ensure_platform_int(indexer) + new_index = index.take(indexer) + new_index = new_index._sort_levels_monotonic() + + new_values = self._values.take(indexer) + result = self._constructor(new_values, index=new_index) + + if ignore_index: + result.index = ibase.default_index(len(result)) + + if inplace: + self._update_inplace(result) + else: + return result.__finalize__(self) + + def argsort(self, axis=0, kind="quicksort", order=None): + """ + Override ndarray.argsort. Argsorts the value, omitting NA/null values, + and places the result in the same locations as the non-NA values. + + Parameters + ---------- + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 'mergesort' is the only stable algorithm. + order : None + Has no effect but is accepted for compatibility with numpy. + + Returns + ------- + Series + Positions of values within the sort order with -1 indicating + nan values. + + See Also + -------- + numpy.ndarray.argsort + """ + values = self._values + mask = isna(values) + + if mask.any(): + result = Series(-1, index=self.index, name=self.name, dtype="int64") + notmask = ~mask + result[notmask] = np.argsort(values[notmask], kind=kind) + return self._constructor(result, index=self.index).__finalize__(self) + else: + return self._constructor( + np.argsort(values, kind=kind), index=self.index, dtype="int64" + ).__finalize__(self) + + def nlargest(self, n=5, keep="first"): + """ + Return the largest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many descending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. 
+ + Returns + ------- + Series + The `n` largest values in the Series, sorted in decreasing order. + + See Also + -------- + Series.nsmallest: Get the `n` smallest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values(ascending=False).head(n)`` for small `n` + relative to the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Malta": 434000, "Maldives": 434000, + ... "Brunei": 434000, "Iceland": 337000, + ... "Nauru": 11300, "Tuvalu": 11300, + ... "Anguilla": 11300, "Monserat": 5200} + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Monserat 5200 + dtype: int64 + + The `n` largest elements where ``n=5`` by default. + + >>> s.nlargest() + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3``. Default `keep` value is 'first' + so Malta will be kept. + + >>> s.nlargest(3) + France 65000000 + Italy 59000000 + Malta 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` and keeping the last duplicates. + Brunei will be kept since it is the last with value 434000 based on + the index order. + + >>> s.nlargest(3, keep='last') + France 65000000 + Italy 59000000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has five elements due to the three duplicates. + + >>> s.nlargest(3, keep='all') + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + """ + return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() + + def nsmallest(self, n=5, keep="first"): + """ + Return the smallest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many ascending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. + + Returns + ------- + Series + The `n` smallest values in the Series, sorted in increasing order. + + See Also + -------- + Series.nlargest: Get the `n` largest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values().head(n)`` for small `n` relative to + the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Brunei": 434000, "Malta": 434000, + ... "Maldives": 434000, "Iceland": 337000, + ... "Nauru": 11300, "Tuvalu": 11300, + ... "Anguilla": 11300, "Monserat": 5200} + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Brunei 434000 + Malta 434000 + Maldives 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Monserat 5200 + dtype: int64 + + The `n` smallest elements where ``n=5`` by default. + + >>> s.nsmallest() + Monserat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Iceland 337000 + dtype: int64 + + The `n` smallest elements where ``n=3``. 
Default `keep` value is + 'first' so Nauru and Tuvalu will be kept. + + >>> s.nsmallest(3) + Monserat 5200 + Nauru 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` and keeping the last + duplicates. Anguilla and Tuvalu will be kept since they are the last + with value 11300 based on the index order. + + >>> s.nsmallest(3, keep='last') + Monserat 5200 + Anguilla 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has four elements due to the three duplicates. + + >>> s.nsmallest(3, keep='all') + Monserat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + dtype: int64 + """ + return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() + + def swaplevel(self, i=-2, j=-1, copy=True): + """ + Swap levels i and j in a :class:`MultiIndex`. + + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int, str + Level of the indices to be swapped. Can pass level name as string. + copy : bool, default True + Whether to copy underlying data. + + Returns + ------- + Series + Series with levels swapped in MultiIndex. + """ + new_index = self.index.swaplevel(i, j) + return self._constructor(self._values, index=new_index, copy=copy).__finalize__( + self + ) + + def reorder_levels(self, order): + """ + Rearrange index levels using input order. + + May not drop or duplicate levels. + + Parameters + ---------- + order : list of int representing new level order + Reference level by number or key. + + Returns + ------- + type of caller (new object) + """ + if not isinstance(self.index, MultiIndex): # pragma: no cover + raise Exception("Can only reorder levels on a hierarchical axis.") + + result = self.copy() + result.index = result.index.reorder_levels(order) + return result + + def explode(self) -> "Series": + """ + Transform each element of a list-like to a row, replicating the + index values. + + .. versionadded:: 0.25.0 + + Returns + ------- + Series + Exploded lists to rows; index will be duplicated for these rows. + + See Also + -------- + Series.str.split : Split string values on specified separator. + Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex + to produce DataFrame. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Notes + ----- + This routine will explode list-likes including lists, tuples, + Series, and np.ndarray. The result dtype of the subset rows will + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. + + Examples + -------- + >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]]) + >>> s + 0 [1, 2, 3] + 1 foo + 2 [] + 3 [3, 4] + dtype: object + + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 foo + 2 NaN + 3 3 + 3 4 + dtype: object + """ + if not len(self) or not is_object_dtype(self): + return self.copy() + + values, counts = reshape.explode(np.asarray(self.array)) + + result = Series(values, index=self.index.repeat(counts), name=self.name) + return result + + def unstack(self, level=-1, fill_value=None): + """ + Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + The level involved will automatically get sorted. + + Parameters + ---------- + level : int, str, or list of these, default last level + Level(s) to unstack, can pass level name. + fill_value : scalar value, default None + Value to use when replacing NaN values. 
+ + Returns + ------- + DataFrame + Unstacked Series. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4], + ... index=pd.MultiIndex.from_product([['one', 'two'], + ... ['a', 'b']])) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 3 + b 2 4 + """ + from pandas.core.reshape.reshape import unstack + + return unstack(self, level, fill_value) + + # ---------------------------------------------------------------------- + # function application + + def map(self, arg, na_action=None): + """ + Map values of Series according to input correspondence. + + Used for substituting each value in a Series with another value, + that may be derived from a function, a ``dict`` or + a :class:`Series`. + + Parameters + ---------- + arg : function, collections.abc.Mapping subclass or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to the + mapping correspondence. + + Returns + ------- + Series + Same index as caller. + + See Also + -------- + Series.apply : For applying more complex functions on a Series. + DataFrame.apply : Apply a function row-/column-wise. + DataFrame.applymap : Apply a function elementwise on a whole DataFrame. + + Notes + ----- + When ``arg`` is a dictionary, values in Series that are not in the + dictionary (as keys) are converted to ``NaN``. However, if the + dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. + provides a method for default values), then this default is used + rather than ``NaN``. + + Examples + -------- + >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit']) + >>> s + 0 cat + 1 dog + 2 NaN + 3 rabbit + dtype: object + + ``map`` accepts a ``dict`` or a ``Series``. Values that are not found + in the ``dict`` are converted to ``NaN``, unless the dict has a default + value (e.g. ``defaultdict``): + + >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) + 0 kitten + 1 puppy + 2 NaN + 3 NaN + dtype: object + + It also accepts a function: + + >>> s.map('I am a {}'.format) + 0 I am a cat + 1 I am a dog + 2 I am a nan + 3 I am a rabbit + dtype: object + + To avoid applying the function to missing values (and keep them as + ``NaN``) ``na_action='ignore'`` can be used: + + >>> s.map('I am a {}'.format, na_action='ignore') + 0 I am a cat + 1 I am a dog + 2 NaN + 3 I am a rabbit + dtype: object + """ + new_values = super()._map_values(arg, na_action=na_action) + return self._constructor(new_values, index=self.index).__finalize__(self) + + def _gotitem(self, key, ndim, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : string / list of selections + ndim : 1,2 + Requested ndim of result. + subset : object, default None + Subset to act on. + """ + return self + + _agg_see_also_doc = dedent( + """ + See Also + -------- + Series.apply : Invoke function on a Series. + Series.transform : Transform function producing a Series with like indexes. + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.agg('min') + 1 + + >>> s.agg(['min', 'max']) + min 1 + max 4 + dtype: int64 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. 
versionadded:: 0.20.0\n", + **_shared_doc_kwargs, + ) + @Appender(generic._shared_docs["aggregate"]) + def aggregate(self, func, axis=0, *args, **kwargs): + # Validate the axis parameter + self._get_axis_number(axis) + result, how = self._aggregate(func, *args, **kwargs) + if result is None: + + # we can be called from an inner function which + # passes this meta-data + kwargs.pop("_axis", None) + kwargs.pop("_level", None) + + # try a regular apply, this evaluates lambdas + # row-by-row; however if the lambda is expected a Series + # expression, e.g.: lambda x: x-x.quantile(0.25) + # this will fail, so we can try a vectorized evaluation + + # we cannot FIRST try the vectorized evaluation, because + # then .agg and .apply would have different semantics if the + # operation is actually defined on the Series, e.g. str + try: + result = self.apply(func, *args, **kwargs) + except (ValueError, AttributeError, TypeError): + result = func(self, *args, **kwargs) + + return result + + agg = aggregate + + @Appender(generic._shared_docs["transform"] % _shared_doc_kwargs) + def transform(self, func, axis=0, *args, **kwargs): + # Validate the axis parameter + self._get_axis_number(axis) + return super().transform(func, *args, **kwargs) + + def apply(self, func, convert_dtype=True, args=(), **kwds): + """ + Invoke function on values of Series. + + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. + + Parameters + ---------- + func : function + Python function or NumPy ufunc to apply. + convert_dtype : bool, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object. + args : tuple + Positional arguments passed to func after the series value. + **kwds + Additional keyword arguments passed to func. + + Returns + ------- + Series or DataFrame + If func returns a Series object the result will be a DataFrame. + + See Also + -------- + Series.map: For element-wise operations. + Series.agg: Only perform aggregating type operations. + Series.transform: Only perform transforming type operations. + + Examples + -------- + Create a series with typical summer temperatures for each city. + + >>> s = pd.Series([20, 21, 12], + ... index=['London', 'New York', 'Helsinki']) + >>> s + London 20 + New York 21 + Helsinki 12 + dtype: int64 + + Square the values by defining a function and passing it as an + argument to ``apply()``. + + >>> def square(x): + ... return x ** 2 + >>> s.apply(square) + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Square the values by passing an anonymous function as an + argument to ``apply()``. + + >>> s.apply(lambda x: x ** 2) + London 400 + New York 441 + Helsinki 144 + dtype: int64 + + Define a custom function that needs additional positional + arguments and pass these additional arguments using the + ``args`` keyword. + + >>> def subtract_custom_value(x, custom_value): + ... return x - custom_value + + >>> s.apply(subtract_custom_value, args=(5,)) + London 15 + New York 16 + Helsinki 7 + dtype: int64 + + Define a custom function that takes keyword arguments + and pass these arguments to ``apply``. + + >>> def add_custom_values(x, **kwargs): + ... for month in kwargs: + ... x += kwargs[month] + ... return x + + >>> s.apply(add_custom_values, june=30, july=20, august=25) + London 95 + New York 96 + Helsinki 87 + dtype: int64 + + Use a function from the Numpy library. 
+ + >>> s.apply(np.log) + London 2.995732 + New York 3.044522 + Helsinki 2.484907 + dtype: float64 + """ + if len(self) == 0: + return self._constructor(dtype=self.dtype, index=self.index).__finalize__( + self + ) + + # dispatch to agg + if isinstance(func, (list, dict)): + return self.aggregate(func, *args, **kwds) + + # if we are a string, try to dispatch + if isinstance(func, str): + return self._try_aggregate_string_function(func, *args, **kwds) + + # handle ufuncs and lambdas + if kwds or args and not isinstance(func, np.ufunc): + + def f(x): + return func(x, *args, **kwds) + + else: + f = func + + with np.errstate(all="ignore"): + if isinstance(f, np.ufunc): + return f(self) + + # row-wise access + if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): + # GH#23179 some EAs do not have `map` + mapped = self._values.map(f) + else: + values = self.astype(object).values + mapped = lib.map_infer(values, f, convert=convert_dtype) + + if len(mapped) and isinstance(mapped[0], Series): + # GH 25959 use pd.array instead of tolist + # so extension arrays can be used + return self._constructor_expanddim(pd.array(mapped), index=self.index) + else: + return self._constructor(mapped, index=self.index).__finalize__(self) + + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): + """ + Perform a reduction operation. + + If we have an ndarray as a value, then simply perform the operation, + otherwise delegate to the object. + """ + delegate = self._values + + if axis is not None: + self._get_axis_number(axis) + + if isinstance(delegate, Categorical): + return delegate._reduce(name, skipna=skipna, **kwds) + elif isinstance(delegate, ExtensionArray): + # dispatch to ExtensionArray interface + return delegate._reduce(name, skipna=skipna, **kwds) + elif is_datetime64_dtype(delegate): + # use DatetimeIndex implementation to handle skipna correctly + delegate = DatetimeIndex(delegate) + elif is_timedelta64_dtype(delegate) and hasattr(TimedeltaIndex, name): + # use TimedeltaIndex to handle skipna correctly + # TODO: remove hasattr check after TimedeltaIndex has `std` method + delegate = TimedeltaIndex(delegate) + + # dispatch to numpy arrays + elif isinstance(delegate, np.ndarray): + if numeric_only: + raise NotImplementedError( + f"Series.{name} does not implement numeric_only." + ) + with np.errstate(all="ignore"): + return op(delegate, skipna=skipna, **kwds) + + # TODO(EA) dispatch to Index + # remove once all internals extension types are + # moved to ExtensionArrays + return delegate._reduce( + op=op, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + filter_type=filter_type, + **kwds, + ) + + def _reindex_indexer(self, new_index, indexer, copy): + if indexer is None: + if copy: + return self.copy() + return self + + new_values = algorithms.take_1d( + self._values, indexer, allow_fill=True, fill_value=None + ) + return self._constructor(new_values, index=new_index) + + def _needs_reindex_multi(self, axes, method, level): + """ + Check if we do need a multi reindex; this is for compat with + higher dims. 
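The dispatch at the top of Series.apply above routes list/dict arguments and plain strings through the aggregation machinery, applies ufuncs to the Series as a whole, and only falls back to element-wise mapping for everything else. A minimal sketch of the four paths, with illustrative values only (not part of the upstream source):

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3])
s.apply(np.sqrt)           # ufunc: evaluated on the whole Series at once
s.apply('sum')             # string: dispatched to the aggregation string lookup
s.apply(['min', 'max'])    # list: dispatched to aggregate, one row per function
s.apply(lambda x: x + 1)   # anything else: mapped element-wise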
+ """ + return False + + @Appender(generic._shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + + def rename( + self, + index=None, + *, + axis=None, + copy=True, + inplace=False, + level=None, + errors="ignore", + ): + """ + Alter Series index labels or name. + + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. Extra labels listed don't throw an + error. + + Alternatively, change ``Series.name`` with a scalar value. + + See the :ref:`user guide ` for more. + + Parameters + ---------- + axis : {0 or "index"} + Unused. Accepted for compatability with DataFrame method only. + index : scalar, hashable sequence, dict-like or function, optional + Functions or dict-like are transformations to apply to + the index. + Scalar or hashable sequence-like will alter the ``Series.name`` + attribute. + + **kwargs + Additional keyword arguments passed to the function. Only the + "inplace" keyword is used. + + Returns + ------- + Series + Series with index labels or name altered. + + See Also + -------- + DataFrame.rename : Corresponding DataFrame method. + Series.rename_axis : Set the name of the axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.rename("my_name") # scalar, changes Series.name + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: int64 + >>> s.rename(lambda x: x ** 2) # function, changes labels + 0 1 + 1 2 + 4 3 + dtype: int64 + >>> s.rename({1: 3, 2: 5}) # mapping, changes labels + 0 1 + 3 2 + 5 3 + dtype: int64 + """ + if callable(index) or is_dict_like(index): + return super().rename( + index, copy=copy, inplace=inplace, level=level, errors=errors + ) + else: + return self._set_name(index, inplace=inplace) + + @Substitution(**_shared_doc_kwargs) + @Appender(generic.NDFrame.reindex.__doc__) + def reindex(self, index=None, **kwargs): + return super().reindex(index=index, **kwargs) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed + by specifying the level. + + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : 0, default 0 + Redundant for application on Series. + index : single label or list-like + Redundant for application on Series, but 'index' can be used instead + of 'labels'. + + .. versionadded:: 0.21.0 + columns : single label or list-like + No change is made to the Series; use 'index' or 'labels' instead. + + .. versionadded:: 0.21.0 + level : int or level name, optional + For MultiIndex, level for which the labels will be removed. + inplace : bool, default False + If True, do operation inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are dropped. + + Returns + ------- + Series + Series with specified index labels removed. + + Raises + ------ + KeyError + If none of the labels are found in the index. 
+ + See Also + -------- + Series.reindex : Return only specified index labels of Series. + Series.dropna : Return series without null values. + Series.drop_duplicates : Return Series with duplicate values removed. + DataFrame.drop : Drop specified labels from rows or columns. + + Examples + -------- + >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C']) + >>> s + A 0 + B 1 + C 2 + dtype: int64 + + Drop labels B en C + + >>> s.drop(labels=['B', 'C']) + A 0 + dtype: int64 + + Drop 2nd level label in MultiIndex Series + + >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], + ... index=midx) + >>> s + lama speed 45.0 + weight 200.0 + length 1.2 + cow speed 30.0 + weight 250.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + dtype: float64 + + >>> s.drop(labels='weight', level=1) + lama speed 45.0 + length 1.2 + cow speed 30.0 + length 1.5 + falcon speed 320.0 + length 0.3 + dtype: float64 + """ + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + @Substitution(**_shared_doc_kwargs) + @Appender(generic.NDFrame.fillna.__doc__) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ) -> Optional["Series"]: + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + + @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) + + def memory_usage(self, index=True, deep=False): + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. + + Parameters + ---------- + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. + + Returns + ------- + int + Bytes of memory consumed. + + See Also + -------- + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. + + Examples + -------- + >>> s = pd.Series(range(3)) + >>> s.memory_usage() + 152 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False) + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = pd.Series(["a", "b"]) + >>> s.values + array(['a', 'b'], dtype=object) + >>> s.memory_usage() + 144 + >>> s.memory_usage(deep=True) + 260 + """ + v = super().memory_usage(deep=deep) + if index: + v += self.index.memory_usage(deep=deep) + return v + + def isin(self, values): + """ + Check whether `values` are contained in Series. 
+ + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + Series + Series of booleans indicating if each element is in values. + + Raises + ------ + TypeError + * If `values` is a string + + See Also + -------- + DataFrame.isin : Equivalent method on DataFrame. + + Examples + -------- + >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo'], name='animal') + >>> s.isin(['cow', 'lama']) + 0 True + 1 True + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']) + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool + """ + result = algorithms.isin(self, values) + return self._constructor(result, index=self.index).__finalize__(self) + + def between(self, left, right, inclusive=True): + """ + Return boolean Series equivalent to left <= series <= right. + + This function returns a boolean vector containing `True` wherever the + corresponding Series element is between the boundary values `left` and + `right`. NA values are treated as `False`. + + Parameters + ---------- + left : scalar or list-like + Left boundary. + right : scalar or list-like + Right boundary. + inclusive : bool, default True + Include boundaries. + + Returns + ------- + Series + Series representing whether each element is between left and + right (inclusive). + + See Also + -------- + Series.gt : Greater than of series and other. + Series.lt : Less than of series and other. 
+ + Notes + ----- + This function is equivalent to ``(left <= ser) & (ser <= right)`` + + Examples + -------- + >>> s = pd.Series([2, 0, 4, 8, np.nan]) + + Boundary values are included by default: + + >>> s.between(1, 4) + 0 True + 1 False + 2 True + 3 False + 4 False + dtype: bool + + With `inclusive` set to ``False`` boundary values are excluded: + + >>> s.between(1, 4, inclusive=False) + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + `left` and `right` can be any scalar value: + + >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve']) + >>> s.between('Anna', 'Daniel') + 0 False + 1 True + 2 True + 3 False + dtype: bool + """ + if inclusive: + lmask = self >= left + rmask = self <= right + else: + lmask = self > left + rmask = self < right + + return lmask & rmask + + # ---------------------------------------------------------------------- + # Convert to types that support pd.NA + + def _convert_dtypes( + self: ABCSeries, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + ) -> "Series": + input_series = self + if infer_objects: + input_series = input_series.infer_objects() + if is_object_dtype(input_series): + input_series = input_series.copy() + + if convert_string or convert_integer or convert_boolean: + inferred_dtype = convert_dtypes( + input_series._values, convert_string, convert_integer, convert_boolean + ) + try: + result = input_series.astype(inferred_dtype) + except TypeError: + result = input_series.copy() + else: + result = input_series.copy() + return result + + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) + def isna(self): + return super().isna() + + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) + def isnull(self): + return super().isnull() + + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) + def notna(self): + return super().notna() + + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) + def notnull(self): + return super().notnull() + + def dropna(self, axis=0, inplace=False, how=None): + """ + Return a new Series with missing values removed. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + There is only one axis to drop values from. + inplace : bool, default False + If True, do operation inplace and return None. + how : str, optional + Not in use. Kept for compatibility. + + Returns + ------- + Series + Series with NA entries dropped from it. + + See Also + -------- + Series.isna: Indicate missing values. + Series.notna : Indicate existing (non-missing) values. + Series.fillna : Replace missing values. + DataFrame.dropna : Drop rows or columns which contain NA values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> ser = pd.Series([1., 2., np.nan]) + >>> ser + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + Drop NA values from a Series. + + >>> ser.dropna() + 0 1.0 + 1 2.0 + dtype: float64 + + Keep the Series with valid entries in the same variable. + + >>> ser.dropna(inplace=True) + >>> ser + 0 1.0 + 1 2.0 + dtype: float64 + + Empty strings are not considered NA values. ``None`` is considered an + NA value. 
+ + >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay']) + >>> ser + 0 NaN + 1 2 + 2 NaT + 3 + 4 None + 5 I stay + dtype: object + >>> ser.dropna() + 1 2 + 3 + 5 I stay + dtype: object + """ + inplace = validate_bool_kwarg(inplace, "inplace") + # Validate the axis parameter + self._get_axis_number(axis or 0) + + if self._can_hold_na: + result = remove_na_arraylike(self) + if inplace: + self._update_inplace(result) + else: + return result + else: + if inplace: + # do nothing + pass + else: + return self.copy() + + # ---------------------------------------------------------------------- + # Time series-oriented methods + + def to_timestamp(self, freq=None, how="start", copy=True): + """ + Cast to DatetimeIndex of Timestamps, at *beginning* of period. + + Parameters + ---------- + freq : str, default frequency of PeriodIndex + Desired frequency. + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end. + copy : bool, default True + Whether or not to return a copy. + + Returns + ------- + Series with DatetimeIndex + """ + new_values = self._values + if copy: + new_values = new_values.copy() + + new_index = self.index.to_timestamp(freq=freq, how=how) + return self._constructor(new_values, index=new_index).__finalize__(self) + + def to_period(self, freq=None, copy=True): + """ + Convert Series from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed). + + Parameters + ---------- + freq : str, default None + Frequency associated with the PeriodIndex. + copy : bool, default True + Whether or not to return a copy. + + Returns + ------- + Series + Series with index converted to PeriodIndex. + """ + new_values = self._values + if copy: + new_values = new_values.copy() + + new_index = self.index.to_period(freq=freq) + return self._constructor(new_values, index=new_index).__finalize__(self) + + # ---------------------------------------------------------------------- + # Accessor Methods + # ---------------------------------------------------------------------- + str = CachedAccessor("str", StringMethods) + dt = CachedAccessor("dt", CombinedDatetimelikeProperties) + cat = CachedAccessor("cat", CategoricalAccessor) + plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + sparse = CachedAccessor("sparse", SparseAccessor) + + # ---------------------------------------------------------------------- + # Add plotting methods to Series + hist = pandas.plotting.hist_series + + +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) +Series._add_numeric_operations() +Series._add_series_or_dataframe_operations() + +# Add arithmetic! 
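The two registration calls that follow attach the flex arithmetic methods (``add``, ``sub``, ..., which accept ``fill_value``) and the special methods (``__add__``, ``__sub__``, ...) to Series. A brief usage sketch with made-up values, showing why the flex form exists:

import pandas as pd

a = pd.Series([1, 2], index=['x', 'y'])
b = pd.Series([10], index=['y'])
a.add(b, fill_value=0)   # flex method: aligns on the union index, fills the gap -> x 1.0, y 12.0
a + b                    # special method: unmatched labels become NaN -> x NaN, y 12.0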
+ops.add_flex_arithmetic_methods(Series) +ops.add_special_arithmetic_methods(Series) diff --git a/venv/Lib/site-packages/pandas/core/sorting.py b/venv/Lib/site-packages/pandas/core/sorting.py new file mode 100644 index 0000000..51c154a --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/sorting.py @@ -0,0 +1,411 @@ +""" miscellaneous sorting / groupby utilities """ +import numpy as np + +from pandas._libs import algos, hashtable, lib +from pandas._libs.hashtable import unique_label_indices + +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_extension_array_dtype, +) +from pandas.core.dtypes.missing import isna + +import pandas.core.algorithms as algorithms +from pandas.core.construction import extract_array + +_INT64_MAX = np.iinfo(np.int64).max + + +def get_group_index(labels, shape, sort: bool, xnull: bool): + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations, *as long as* this space fits within int64 bounds; + otherwise, though group indices identify unique combinations of + labels, they cannot be deconstructed. + - If `sort`, rank of returned ids preserve lexical ranks of labels. + i.e. returned id's can be used to do lexical sort on labels; + - If `xnull` nulls (-1 labels) are passed through. + + Parameters + ---------- + labels : sequence of arrays + Integers identifying levels at each location + shape : sequence of ints + Number of unique levels at each location + sort : bool + If the ranks of returned ids should match lexical ranks of labels + xnull : bool + If true nulls are excluded. i.e. -1 values in the labels are + passed through. + + Returns + ------- + An array of type int64 where two elements are equal if their corresponding + labels are equal at all location. + + Notes + ----- + The length of `labels` and `shape` must be identical. + """ + + def _int64_cut_off(shape) -> int: + acc = 1 + for i, mul in enumerate(shape): + acc *= int(mul) + if not acc < _INT64_MAX: + return i + return len(shape) + + def maybe_lift(lab, size): + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk + while True: + # how many levels can be done without overflow: + nlev = _int64_cut_off(shape) + + # compute flat ids for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype="i8") + out = stride * labels[0].astype("i8", subok=False, copy=False) + + for i in range(1, nlev): + if shape[i] == 0: + stride = 0 + else: + stride //= shape[i] + out += labels[i] * stride + + if xnull: # exclude nulls + mask = labels[0] == -1 + for lab in labels[1:nlev]: + mask |= lab == -1 + out[mask] = -1 + + if nlev == len(shape): # all levels done! 
+ break + + # compress what has been done so far in order to avoid overflow + # to retain lexical ranks, obs_ids should be sorted + comp_ids, obs_ids = compress_group_index(out, sort=sort) + + labels = [comp_ids] + labels[nlev:] + shape = [len(obs_ids)] + shape[nlev:] + + return out + + +def get_compressed_ids(labels, sizes): + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + + Parameters + ---------- + labels : list of label arrays + sizes : list of size of the levels + + Returns + ------- + tuple of (comp_ids, obs_group_ids) + """ + ids = get_group_index(labels, sizes, sort=True, xnull=False) + return compress_group_index(ids, sort=True) + + +def is_int64_overflow_possible(shape) -> bool: + the_prod = 1 + for x in shape: + the_prod *= int(x) + + return the_prod >= _INT64_MAX + + +def decons_group_index(comp_labels, shape): + # reconstruct labels + if is_int64_overflow_possible(shape): + # at some point group indices are factorized, + # and may not be deconstructed here! wrong path! + raise ValueError("cannot deconstruct factorized group indices!") + + label_list = [] + factor = 1 + y = 0 + x = comp_labels + for i in reversed(range(len(shape))): + labels = (x - y) % (factor * shape[i]) // factor + np.putmask(labels, comp_labels < 0, -1) + label_list.append(labels) + y = labels * factor + factor *= shape[i] + return label_list[::-1] + + +def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): + """ + Reconstruct labels from observed group ids. + + Parameters + ---------- + xnull : bool + If nulls are excluded; i.e. -1 labels are passed through. + """ + if not xnull: + lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") + shape = np.asarray(shape, dtype="i8") + lift + + if not is_int64_overflow_possible(shape): + # obs ids are deconstructable! take the fast route! 
+ out = decons_group_index(obs_ids, shape) + return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] + + i = unique_label_indices(comp_ids) + i8copy = lambda a: a.astype("i8", subok=False, copy=True) + return [i8copy(lab[i]) for lab in labels] + + +def indexer_from_factorized(labels, shape, compress: bool = True): + ids = get_group_index(labels, shape, sort=True, xnull=False) + + if not compress: + ngroups = (ids.size and ids.max()) + 1 + else: + ids, obs = compress_group_index(ids, sort=True) + ngroups = len(obs) + + return get_group_index_sorter(ids, ngroups) + + +def lexsort_indexer(keys, orders=None, na_position: str = "last"): + """ + Parameters + ---------- + na_position : {'first', 'last'}, default 'last' + """ + from pandas.core.arrays import Categorical + + labels = [] + shape = [] + if isinstance(orders, bool): + orders = [orders] * len(keys) + elif orders is None: + orders = [True] * len(keys) + + for key, order in zip(keys, orders): + + # we are already a Categorical + if is_categorical_dtype(key): + cat = key + + # create the Categorical + else: + cat = Categorical(key, ordered=True) + + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {na_position}") + + n = len(cat.categories) + codes = cat.codes.copy() + + mask = cat.codes == -1 + if order: # ascending + if na_position == "last": + codes = np.where(mask, n, codes) + elif na_position == "first": + codes += 1 + else: # not order means descending + if na_position == "last": + codes = np.where(mask, n, n - codes - 1) + elif na_position == "first": + codes = np.where(mask, 0, n - codes) + if mask.any(): + n += 1 + + shape.append(n) + labels.append(codes) + + return indexer_from_factorized(labels, shape) + + +def nargsort( + items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last" +): + """ + Intended to be a drop-in replacement for np.argsort which handles NaNs. + + Adds ascending and na_position parameters. + + (GH #6399, #5231) + + Parameters + ---------- + kind : str, default 'quicksort' + ascending : bool, default True + na_position : {'first', 'last'}, default 'last' + """ + items = extract_array(items) + mask = np.asarray(isna(items)) + + if is_extension_array_dtype(items): + items = items._values_for_argsort() + else: + items = np.asanyarray(items) + + idx = np.arange(len(items)) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to + # na_position + if na_position == "last": + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == "first": + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError(f"invalid na_position: {na_position}") + return indexer + + +class _KeyMapper: + """ + Map compressed group id -> key tuple. 
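nargsort, defined above, is the NaN-aware counterpart of np.argsort used by the sorting machinery; a small sketch with illustrative values (expected results follow from the code above):

import numpy as np
from pandas.core.sorting import nargsort

arr = np.array([3.0, np.nan, 1.0, 2.0])
nargsort(arr)                      # array([2, 3, 0, 1]): non-NaN ascending, NaN index last
nargsort(arr, na_position='first') # array([1, 2, 3, 0]): NaN index leads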
+ """ + + def __init__(self, comp_ids, ngroups: int, levels, labels): + self.levels = levels + self.labels = labels + self.comp_ids = comp_ids.astype(np.int64) + + self.k = len(labels) + self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] + + self._populate_tables() + + def _populate_tables(self): + for labs, table in zip(self.labels, self.tables): + table.map(self.comp_ids, labs.astype(np.int64)) + + def get_key(self, comp_id): + return tuple( + level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels) + ) + + +def get_flattened_iterator(comp_ids, ngroups, levels, labels): + # provide "flattened" iterator for multi-group setting + mapper = _KeyMapper(comp_ids, ngroups, levels, labels) + return [mapper.get_key(i) for i in range(ngroups)] + + +def get_indexer_dict(label_list, keys): + """ + Returns + ------- + dict + Labels mapped to indexers. + """ + shape = [len(x) for x in keys] + + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + ngroups = ( + ((group_index.size and group_index.max()) + 1) + if is_int64_overflow_possible(shape) + else np.prod(shape, dtype="i8") + ) + + sorter = get_group_index_sorter(group_index, ngroups) + + sorted_labels = [lab.take(sorter) for lab in label_list] + group_index = group_index.take(sorter) + + return lib.indices_fast(sorter, group_index, keys, sorted_labels) + + +# ---------------------------------------------------------------------- +# sorting levels...cleverly? + + +def get_group_index_sorter(group_index, ngroups: int): + """ + algos.groupsort_indexer implements `counting sort` and it is at least + O(ngroups), where + ngroups = prod(shape) + shape = map(len, keys) + that is, linear in the number of combinations (cartesian product) of unique + values of groupby keys. This can be huge when doing multi-key groupby. + np.argsort(kind='mergesort') is O(count x log(count)) where count is the + length of the data-frame; + Both algorithms are `stable` sort and that is necessary for correctness of + groupby operations. e.g. consider: + df.groupby(key)[col].transform('first') + """ + count = len(group_index) + alpha = 0.0 # taking complexities literally; there may be + beta = 1.0 # some room for fine-tuning these parameters + do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count))) + if do_groupsort: + sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) + return ensure_platform_int(sorter) + else: + return group_index.argsort(kind="mergesort") + + +def compress_group_index(group_index, sort: bool = True): + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). 
+ """ + + size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) + table = hashtable.Int64HashTable(size_hint) + + group_index = ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return comp_ids, obs_group_ids + + +def _reorder_by_uniques(uniques, labels): + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = algorithms.take_nd(reverse_indexer, labels, allow_fill=False) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) + + return uniques, labels diff --git a/venv/Lib/site-packages/pandas/core/sparse/__init__.py b/venv/Lib/site-packages/pandas/core/sparse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/sparse/api.py b/venv/Lib/site-packages/pandas/core/sparse/api.py new file mode 100644 index 0000000..e7bf94c --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/sparse/api.py @@ -0,0 +1,3 @@ +from pandas.core.arrays.sparse import SparseArray, SparseDtype + +__all__ = ["SparseArray", "SparseDtype"] diff --git a/venv/Lib/site-packages/pandas/core/strings.py b/venv/Lib/site-packages/pandas/core/strings.py new file mode 100644 index 0000000..18c7504 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/strings.py @@ -0,0 +1,3566 @@ +import codecs +from functools import wraps +import re +import textwrap +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union +import warnings + +import numpy as np + +import pandas._libs.lib as lib +import pandas._libs.missing as libmissing +import pandas._libs.ops as libops +from pandas._typing import ArrayLike, Dtype +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_extension_array_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_re, + is_scalar, + is_string_dtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.algorithms import take_1d +from pandas.core.base import NoNewAttributesMixin +import pandas.core.common as com +from pandas.core.construction import extract_array + +if TYPE_CHECKING: + from pandas.arrays import StringArray + +_cpython_optimized_encoders = ( + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") + +_shared_docs: Dict[str, str] = dict() + + +def cat_core(list_of_columns: List, sep: str): + """ + Auxiliary function for :meth:`str.cat` + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. 
+ """ + if sep == "": + # no need to interleave sep if it is empty + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + arr_with_sep = np.asarray(list_with_sep, dtype=object) + return np.sum(arr_with_sep, axis=0) + + +def cat_safe(list_of_columns: List, sep: str): + """ + Auxiliary function for :meth:`str.cat`. + + Same signature as cat_core, but handles TypeErrors in concatenation, which + happen if the arrays in list_of columns have the wrong dtypes or content. + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + try: + result = cat_core(list_of_columns, sep) + except TypeError: + # if there are any non-string values (wrong dtype or hidden behind + # object dtype), np.sum will fail; catch and return with better message + for column in list_of_columns: + dtype = lib.infer_dtype(column, skipna=True) + if dtype not in ["string", "empty"]: + raise TypeError( + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + f"column {dtype}" + ) from None + return result + + +def _na_map(f, arr, na_result=None, dtype=object): + if is_extension_array_dtype(arr.dtype): + if na_result is None: + na_result = libmissing.NA + # just StringDtype + arr = extract_array(arr) + return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + if na_result is None: + na_result = np.nan + return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + + +def _map_stringarray( + func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype +) -> ArrayLike: + """ + Map a callable over valid elements of a StringArrray. + + Parameters + ---------- + func : Callable[[str], Any] + Apply to each valid element. + arr : StringArray + na_value : Any + The value to use for missing values. By default, this is + the original value (NA). + dtype : Dtype + The result dtype to use. Specifying this avoids an intermediate + object-dtype allocation. + + Returns + ------- + ArrayLike + An ExtensionArray for integer or string dtypes, otherwise + an ndarray. + + """ + from pandas.arrays import IntegerArray, StringArray, BooleanArray + + mask = isna(arr) + + assert isinstance(arr, StringArray) + arr = np.asarray(arr) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + func, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, func, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. 
`.get` can return anything. + return lib.map_infer_mask(arr, func, mask.view("uint8")) + + +def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): + if not len(arr): + return np.ndarray(0, dtype=dtype) + + if isinstance(arr, ABCSeries): + arr = arr.values + if not isinstance(arr, np.ndarray): + arr = np.asarray(arr, dtype=object) + if na_mask: + mask = isna(arr) + convert = not np.all(mask) + try: + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) + except (TypeError, AttributeError) as e: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + if len(e.args) >= 1 and re.search(p_err, e.args[0]): + # FIXME: this should be totally avoidable + raise e + + def g(x): + try: + return f(x) + except (TypeError, AttributeError): + return na_value + + return _map_object(g, arr, dtype=dtype) + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + else: + return lib.map_infer(arr, f) + + +def str_count(arr, pat, flags=0): + """ + Count occurrences of pattern in each string of the Series/Index. + + This function is used to count the number of times a particular regex + pattern is repeated in each of the string elements of the + :class:`~pandas.Series`. + + Parameters + ---------- + pat : str + Valid regular expression. + flags : int, default 0, meaning no flags + Flags for the `re` module. For a complete list, `see here + `_. + **kwargs + For compatibility with other string methods. Not used. + + Returns + ------- + Series or Index + Same type as the calling object containing the integer counts. + + See Also + -------- + re : Standard library module for regular expressions. + str.count : Standard library version, without regular expression support. + + Notes + ----- + Some characters need to be escaped when passing in `pat`. + eg. ``'$'`` has a special meaning in regex and must be escaped when + finding this literal character. + + Examples + -------- + >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) + >>> s.str.count('a') + 0 0.0 + 1 0.0 + 2 2.0 + 3 2.0 + 4 NaN + 5 0.0 + 6 1.0 + dtype: float64 + + Escape ``'$'`` to find the literal dollar sign. + + >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) + >>> s.str.count('\\$') + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 0 + dtype: int64 + + This is also available on Index + + >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + Int64Index([0, 0, 2, 1], dtype='int64') + """ + regex = re.compile(pat, flags=flags) + f = lambda x: len(regex.findall(x)) + return _na_map(f, arr, dtype="int64") + + +def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): + """ + Test if pattern or regex is contained within a string of a Series or Index. + + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + regex : bool, default True + If True, assumes the pat is a regular expression. + + If False, treats the pat as a literal string. 
+ + Returns + ------- + Series or Index of boolean values + A Series or Index of boolean values indicating whether the + given pattern is contained within the string of each element + of the Series or Index. + + See Also + -------- + match : Analogous, but stricter, relying on re.match instead of re.search. + Series.str.startswith : Test if the start of each string element matches a + pattern. + Series.str.endswith : Same as startswith, but tests the end of string. + + Examples + -------- + + Returning a Series of booleans using only a literal pattern. + + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1.str.contains('og', regex=False) + 0 False + 1 True + 2 False + 3 False + 4 NaN + dtype: object + + Returning an Index of booleans using only a literal pattern. + + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind.str.contains('23', regex=False) + Index([False, False, False, True, nan], dtype='object') + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True, regex=True) + 0 False + 1 False + 2 False + 3 False + 4 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN` replaces NaN values + with `False`. If Series or Index does not contain NaN values + the resultant dtype will be `bool`, otherwise, an `object` dtype. + + >>> s1.str.contains('og', na=False, regex=True) + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 NaN + dtype: object + + Ignoring case sensitivity using `flags` with regex. + + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 NaN + dtype: object + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 NaN + dtype: object + + Ensure `pat` is a not a literal pattern when `regex` is set to True. + Note in the following example one might expect only `s2[1]` and `s2[3]` to + return `True`. However, '.0' as a regex matches any character + followed by a 0. + + >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: bool + """ + if regex: + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + if regex.groups > 0: + warnings.warn( + "This pattern has match groups. To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, + ) + + f = lambda x: bool(regex.search(x)) + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x + uppered = _na_map(lambda x: x.upper(), arr) + return _na_map(f, uppered, na, dtype=bool) + return _na_map(f, arr, na, dtype=bool) + + +def str_startswith(arr, pat, na=np.nan): + """ + Test if the start of each string element matches a pattern. + + Equivalent to :meth:`str.startswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the start of each string element. + + See Also + -------- + str.startswith : Python standard library string method. 
+ Series.str.endswith : Same as startswith, but tests the end of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s + 0 bat + 1 Bear + 2 cat + 3 NaN + dtype: object + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.startswith('b', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + f = lambda x: x.startswith(pat) + return _na_map(f, arr, na, dtype=bool) + + +def str_endswith(arr, pat, na=np.nan): + """ + Test if the end of each string element matches a pattern. + + Equivalent to :meth:`str.endswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the end of each string element. + + See Also + -------- + str.endswith : Python standard library string method. + Series.str.startswith : Same as endswith, but tests the start of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s + 0 bat + 1 bear + 2 caT + 3 NaN + dtype: object + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.endswith('t', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + f = lambda x: x.endswith(pat) + return _na_map(f, arr, na, dtype=bool) + + +def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): + r""" + Replace occurrences of pattern/regex in the Series/Index with + some other string. Equivalent to :meth:`str.replace` or + :func:`re.sub`. + + Parameters + ---------- + pat : str or compiled regex + String can be a character sequence or regular expression. + repl : str or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + n : int, default -1 (all) + Number of replacements to make from start. + case : bool, default None + Determines if replace is case sensitive: + + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex. + + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled + regex. + regex : bool, default True + Determines if assumes the passed-in pattern is a regular expression: + + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + .. versionadded:: 0.23.0 + + Returns + ------- + Series or Index of object + A copy of the object with all matching occurrences of `pat` replaced by + `repl`. + + Raises + ------ + ValueError + * if `regex` is False and `repl` is a callable or `pat` is a compiled + regex + * if `pat` is a compiled regex and `case` or `flags` is set + + Notes + ----- + When `pat` is a compiled regex, all flags should be included in the + compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled + regex will raise an error. 
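A short sketch of the validation rules spelled out in the Notes above, with illustrative calls (the commented-out combinations raise, per the checks in the function body):

import re
import pandas as pd

s = pd.Series(['foo', 'fuz'])
pat = re.compile('f.')
s.str.replace(pat, 'ba')                          # fine: flags live inside the compiled regex
# s.str.replace(pat, 'ba', case=False)            # ValueError: case/flags with a compiled regex
# s.str.replace(pat, 'ba', regex=False)           # ValueError: compiled regex requires regex=True
# s.str.replace('f', lambda m: 'x', regex=False)  # ValueError: callable repl requires regex=True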
+ + Examples + -------- + When `pat` is a string and `regex` is True (the default), the given `pat` + is compiled as a regex. When `repl` is a string, it replaces matching + regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are + left as is: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 NaN + dtype: object + + When `pat` is a string and `regex` is False, every `pat` is replaced with + `repl` as with :meth:`str.replace`: + + >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz + 2 NaN + dtype: object + + When `repl` is a callable, it is called on every `pat` using + :func:`re.sub`. The callable should expect one positional argument + (a regex object) and return a string. + + To get the idea: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) + 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo + 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz + 2 NaN + dtype: object + + Reverse every lowercase alphabetic word: + + >>> repl = lambda m: m.group(0)[::-1] + >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) + 0 oof 123 + 1 rab zab + 2 NaN + dtype: object + + Using regex groups (extract second group and swap case): + + >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" + >>> repl = lambda m: m.group('two').swapcase() + >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) + 0 tWO + 1 bAR + dtype: object + + Using a compiled regex with flags + + >>> import re + >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') + 0 foo + 1 bar + 2 NaN + dtype: object + """ + + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + else: + if is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + if callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + f = lambda x: x.replace(pat, repl, n) + + return _na_map(f, arr, dtype=str) + + +def str_repeat(arr, repeats): + """ + Duplicate each string in the Series or Index. + + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. 
+ + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + if is_scalar(repeats): + + def scalar_rep(x): + try: + return bytes.__mul__(x, repeats) + except TypeError: + return str.__mul__(x, repeats) + + return _na_map(scalar_rep, arr, dtype=str) + else: + + def rep(x, r): + try: + return bytes.__mul__(x, r) + except TypeError: + return str.__mul__(x, r) + + repeats = np.asarray(repeats, dtype=object) + result = libops.vec_binop(com.values_from_object(arr), repeats, rep) + return result + + +def str_match(arr, pat, case=True, flags=0, na=np.nan): + """ + Determine if each string matches a regular expression. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + contains : Analogous, but less strict, relying on re.search instead of + re.match. + extract : Extract matched groups. + """ + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + dtype = bool + f = lambda x: bool(regex.match(x)) + + return _na_map(f, arr, na, dtype=dtype) + + +def _get_single_group_name(rx): + try: + return list(rx.groupindex.keys()).pop() + except IndexError: + return None + + +def _groups_or_na_fun(regex): + """Used in both extract_noexpand and extract_frame""" + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row + + return f + + +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. + if arr.dtype.name == "string": + return "string" + else: + return object + + +def _str_extract_noexpand(arr, pat, flags=0): + """ + Find groups in each string in the Series using passed regular + expression. This function is called from + str_extract(expand=False), and can return Series, DataFrame, or + Index. + + """ + from pandas import DataFrame + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + + if regex.groups == 1: + result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) + else: + if isinstance(arr, ABCIndexClass): + raise ValueError("only one regex group is supported with Index") + name = None + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + if arr.empty: + result = DataFrame(columns=columns, dtype=object) + else: + dtype = _result_dtype(arr) + result = DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=arr.index, + dtype=dtype, + ) + return result, name + + +def _str_extract_frame(arr, pat, flags=0): + """ + For each subject string in the Series, extract groups from the + first match of regular expression pat. 
This function is called from + str_extract(expand=True), and always returns a DataFrame. + + """ + from pandas import DataFrame + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + + if len(arr) == 0: + return DataFrame(columns=columns, dtype=object) + try: + result_index = arr.index + except AttributeError: + result_index = None + dtype = _result_dtype(arr) + return DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=result_index, + dtype=dtype, + ) + + +def str_extract(arr, pat, flags=0, expand=True): + r""" + Extract capture groups in the regex `pat` as columns in a DataFrame. + + For each subject string in the Series, extract groups from the + first match of regular expression `pat`. + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that + modify regular expression matching for things like case, + spaces, etc. For more details, see :mod:`re`. + expand : bool, default True + If True, return DataFrame with one column per capture group. + If False, return a Series/Index if there is one capture group + or DataFrame if there are multiple capture groups. + + Returns + ------- + DataFrame or Series or Index + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. The dtype of each result + column is always object, even when no match is found. If + ``expand=False`` and pat has only one capture group, then + return a Series (if subject is a Series) or Index (if subject + is an Index). + + See Also + -------- + extractall : Returns all matches (not just the first match). + + Examples + -------- + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be NaN. + + >>> s = pd.Series(['a1', 'b2', 'c3']) + >>> s.str.extract(r'([ab])(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern may contain optional groups. + + >>> s.str.extract(r'([ab])?(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN 3 + + Named groups will become column names in the result. + + >>> s.str.extract(r'(?P[ab])(?P\d)') + letter digit + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern with one group will return a DataFrame with one column + if expand=True. + + >>> s.str.extract(r'[ab](\d)', expand=True) + 0 + 0 1 + 1 2 + 2 NaN + + A pattern with one group will return a Series if expand=False. + + >>> s.str.extract(r'[ab](\d)', expand=False) + 0 1 + 1 2 + 2 NaN + dtype: object + """ + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + if expand: + return _str_extract_frame(arr._orig, pat, flags=flags) + else: + result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) + return arr._wrap_result(result, name=name, expand=expand) + + +def str_extractall(arr, pat, flags=0): + r""" + For each subject string in the Series, extract groups from all + matches of regular expression pat. When each subject string in the + Series has exactly one match, extractall(pat).xs(0, level='match') + is the same as extract(pat). + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + A ``re`` module flag, for example ``re.IGNORECASE``. 
These allow + to modify regular expression matching for things like case, spaces, + etc. Multiple flags can be combined with the bitwise OR operator, + for example ``re.IGNORECASE | re.MULTILINE``. + + Returns + ------- + DataFrame + A ``DataFrame`` with one row for each match, and one column for each + group. Its rows have a ``MultiIndex`` with first levels that come from + the subject ``Series``. The last level is named 'match' and indexes the + matches in each item of the ``Series``. Any capture group names in + regular expression pat will be used for column names; otherwise capture + group numbers will be used. + + See Also + -------- + extract : Returns first match only (not all matches). + + Examples + -------- + A pattern with one group will return a DataFrame with one column. + Indices with no matches will not appear in the result. + + >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) + >>> s.str.extractall(r"[ab](\d)") + 0 + match + A 0 1 + 1 2 + B 0 1 + + Capture group names are used for column names of the result. + + >>> s.str.extractall(r"[ab](?P\d)") + digit + match + A 0 1 + 1 2 + B 0 1 + + A pattern with two groups will return a DataFrame with two columns. + + >>> s.str.extractall(r"(?P[ab])(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + + Optional groups that do not match are NaN in the result. + + >>> s.str.extractall(r"(?P[ab])?(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + C 0 NaN 1 + """ + + regex = re.compile(pat, flags=flags) + # the regex must contain capture groups. + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if isinstance(arr, ABCIndexClass): + arr = arr.to_series().reset_index(drop=True) + + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + match_list = [] + index_list = [] + is_mi = arr.index.nlevels > 1 + + for subject_key, subject in arr.items(): + if isinstance(subject, str): + + if not is_mi: + subject_key = (subject_key,) + + for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, str): + match_tuple = (match_tuple,) + na_tuple = [np.NaN if group == "" else group for group in match_tuple] + match_list.append(na_tuple) + result_key = tuple(subject_key + (match_i,)) + index_list.append(result_key) + + from pandas import MultiIndex + + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + dtype = _result_dtype(arr) + + result = arr._constructor_expanddim( + match_list, index=index, columns=columns, dtype=dtype + ) + return result + + +def str_get_dummies(arr, sep="|"): + """ + Split each string in the Series by sep and return a DataFrame + of dummy/indicator variables. + + Parameters + ---------- + sep : str, default "|" + String to split on. + + Returns + ------- + DataFrame + Dummy variables corresponding to values of the Series. + + See Also + -------- + get_dummies : Convert categorical variable into dummy/indicator + variables. 
+ + Examples + -------- + >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 1 0 0 + 2 1 0 1 + + >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + """ + arr = arr.fillna("") + try: + arr = sep + arr + sep + except TypeError: + arr = sep + arr.astype(str) + sep + + tags = set() + for ts in arr.str.split(sep): + tags.update(ts) + tags = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + + for i, t in enumerate(tags): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) + return dummies, tags + + +def str_join(arr, sep): + """ + Join lists contained as elements in the Series/Index with passed delimiter. + + If the elements of a Series are lists themselves, join the content of these + lists using the delimiter passed to the function. + This function is an equivalent to :meth:`str.join`. + + Parameters + ---------- + sep : str + Delimiter to use between list entries. + + Returns + ------- + Series/Index: object + The list entries concatenated by intervening occurrences of the + delimiter. + + Raises + ------ + AttributeError + If the supplied Series contains neither strings nor lists. + + See Also + -------- + str.join : Standard library version of this method. + Series.str.split : Split strings around given separator/delimiter. + + Notes + ----- + If any of the list items is not a string object, the result of the join + will be `NaN`. + + Examples + -------- + Example with a list that contains non-string elements. + + >>> s = pd.Series([['lion', 'elephant', 'zebra'], + ... [1.1, 2.2, 3.3], + ... ['cat', np.nan, 'dog'], + ... ['cow', 4.5, 'goat'], + ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s + 0 [lion, elephant, zebra] + 1 [1.1, 2.2, 3.3] + 2 [cat, nan, dog] + 3 [cow, 4.5, goat] + 4 [duck, [swan, fish], guppy] + dtype: object + + Join all lists using a '-'. The lists containing object(s) of types other + than str will produce a NaN. + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: object + """ + return _na_map(sep.join, arr, dtype=str) + + +def str_findall(arr, pat, flags=0): + """ + Find all occurrences of pattern or regular expression in the Series/Index. + + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 + Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which + means no flags). + + Returns + ------- + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. + + See Also + -------- + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. + extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. 
+ + Examples + -------- + + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ + regex = re.compile(pat, flags=flags) + return _na_map(regex.findall, arr) + + +def str_find(arr, sub, start=0, end=None, side="left"): + """ + Return indexes in each strings in the Series/Index where the + substring is fully contained between [start:end]. Return -1 on failure. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. + side : {'left', 'right'}, default 'left' + Specifies a starting side, equivalent to ``find`` or ``rfind``. + + Returns + ------- + Series or Index + Indexes where substring is found. + """ + + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + if side == "left": + method = "find" + elif side == "right": + method = "rfind" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return _na_map(f, arr, dtype="int64") + + +def str_index(arr, sub, start=0, end=None, side="left"): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + if side == "left": + method = "index" + elif side == "right": + method = "rindex" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return _na_map(f, arr, dtype="int64") + + +def str_pad(arr, width, side="left", fillchar=" "): + """ + Pad strings in the Series/Index up to width. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with character defined in `fillchar`. + side : {'left', 'right', 'both'}, default 'left' + Side from which to fill resulting string. + fillchar : str, default ' ' + Additional character for filling, default is whitespace. + + Returns + ------- + Series or Index of object + Returns Series or Index with minimum number of char in object. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='left')``. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='right')``. 
+ Series.str.center : Fills boths sides of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='both')``. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. + + Examples + -------- + >>> s = pd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: object + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: object + """ + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + + if side == "left": + f = lambda x: x.rjust(width, fillchar) + elif side == "right": + f = lambda x: x.ljust(width, fillchar) + elif side == "both": + f = lambda x: x.center(width, fillchar) + else: # pragma: no cover + raise ValueError("Invalid side") + + return _na_map(f, arr, dtype=str) + + +def str_split(arr, pat=None, n=None): + + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if len(pat) == 1: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if n is None or n == -1: + n = 0 + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + res = _na_map(f, arr) + return res + + +def str_rsplit(arr, pat=None, n=None): + + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + res = _na_map(f, arr) + return res + + +def str_slice(arr, start=None, stop=None, step=None): + """ + Slice substrings from each element in the Series or Index. + + Parameters + ---------- + start : int, optional + Start position for slice operation. + stop : int, optional + Stop position for slice operation. + step : int, optional + Step size for slice operation. + + Returns + ------- + Series or Index of object + Series or Index from sliced substring from original string object. + + See Also + -------- + Series.str.slice_replace : Replace a slice with a string. + Series.str.get : Return element at position. + Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` + being the position. + + Examples + -------- + >>> s = pd.Series(["koala", "fox", "chameleon"]) + >>> s + 0 koala + 1 fox + 2 chameleon + dtype: object + + >>> s.str.slice(start=1) + 0 oala + 1 ox + 2 hameleon + dtype: object + + >>> s.str.slice(start=-1) + 0 a + 1 x + 2 n + dtype: object + + >>> s.str.slice(stop=2) + 0 ko + 1 fo + 2 ch + dtype: object + + >>> s.str.slice(step=2) + 0 kaa + 1 fx + 2 caeen + dtype: object + + >>> s.str.slice(start=0, stop=5, step=3) + 0 kl + 1 f + 2 cm + dtype: object + + Equivalent behaviour to: + + >>> s.str[0:5:3] + 0 kl + 1 f + 2 cm + dtype: object + """ + obj = slice(start, stop, step) + f = lambda x: x[obj] + return _na_map(f, arr, dtype=str) + + +def str_slice_replace(arr, start=None, stop=None, repl=None): + """ + Replace a positional slice of a string with another value. + + Parameters + ---------- + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. 
+ stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified (None), the sliced region + is replaced with an empty string. + + Returns + ------- + Series or Index + Same type as the original object. + + See Also + -------- + Series.str.slice : Just slicing without replacement. + + Examples + -------- + >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s + 0 a + 1 ab + 2 abc + 3 abdc + 4 abcde + dtype: object + + Specify just `start`, meaning replace `start` until the end of the + string with `repl`. + + >>> s.str.slice_replace(1, repl='X') + 0 aX + 1 aX + 2 aX + 3 aX + 4 aX + dtype: object + + Specify just `stop`, meaning the start of the string to `stop` is replaced + with `repl`, and the rest of the string is included. + + >>> s.str.slice_replace(stop=2, repl='X') + 0 X + 1 X + 2 Xc + 3 Xdc + 4 Xcde + dtype: object + + Specify `start` and `stop`, meaning the slice from `start` to `stop` is + replaced with `repl`. Everything before or after `start` and `stop` is + included as is. + + >>> s.str.slice_replace(start=1, stop=3, repl='X') + 0 aX + 1 aX + 2 aX + 3 aXc + 4 aXde + dtype: object + """ + if repl is None: + repl = "" + + def f(x): + if x[start:stop] == "": + local_stop = start + else: + local_stop = stop + y = "" + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return _na_map(f, arr, dtype=str) + + +def str_strip(arr, to_strip=None, side="both"): + """ + Strip whitespace (including newlines) from each string in the + Series/Index. + + Parameters + ---------- + to_strip : str or unicode + side : {'left', 'right', 'both'}, default 'both' + + Returns + ------- + Series or Index + """ + if side == "both": + f = lambda x: x.strip(to_strip) + elif side == "left": + f = lambda x: x.lstrip(to_strip) + elif side == "right": + f = lambda x: x.rstrip(to_strip) + else: # pragma: no cover + raise ValueError("Invalid side") + return _na_map(f, arr, dtype=str) + + +def str_wrap(arr, width, **kwargs): + r""" + Wrap long strings in the Series/Index to be formatted in + paragraphs with length less than a given width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. + + Parameters + ---------- + width : int + Maximum line width. + expand_tabs : bool, optional + If True, tab characters will be expanded to spaces (default: True). + replace_whitespace : bool, optional + If True, each whitespace character (as defined by string.whitespace) + remaining after tab expansion will be replaced by a single space + (default: True). + drop_whitespace : bool, optional + If True, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True). + break_long_words : bool, optional + If True, then words longer than width will be broken in order to ensure + that no lines are longer than width. If it is false, long words will + not be broken, and some lines may be longer than width (default: True). + break_on_hyphens : bool, optional + If True, wrapping will occur preferably on whitespace and right after + hyphens in compound words, as it is customary in English. If false, + only whitespaces will be considered as potentially good places for line + breaks, but you need to set break_long_words to false if you want truly + insecable words (default: True). 
+ + Returns + ------- + Series or Index + + Notes + ----- + Internally, this method uses a :class:`textwrap.TextWrapper` instance with + default settings. To achieve behavior matching R's stringr library str_wrap + function, use the arguments: + + - expand_tabs = False + - replace_whitespace = True + - drop_whitespace = True + - break_long_words = False + - break_on_hyphens = False + + Examples + -------- + + >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s.str.wrap(12) + 0 line to be\nwrapped + 1 another line\nto be\nwrapped + dtype: object + """ + kwargs["width"] = width + + tw = textwrap.TextWrapper(**kwargs) + + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) + + +def str_translate(arr, table): + """ + Map all characters in the string through the given mapping table. + Equivalent to standard :meth:`str.translate`. + + Parameters + ---------- + table : dict + Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or + None. Unmapped characters are left untouched. + Characters mapped to None are deleted. :meth:`str.maketrans` is a + helper function for making translation tables. + + Returns + ------- + Series or Index + """ + return _na_map(lambda x: x.translate(table), arr, dtype=str) + + +def str_get(arr, i): + """ + Extract element from each component at specified position. + + Extract element from lists, tuples, or strings in each element in the + Series/Index. + + Parameters + ---------- + i : int + Position of element to extract. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = pd.Series(["String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) + >>> s + 0 String + 1 (1, 2, 3) + 2 [a, b, c] + 3 123 + 4 -456 + 5 {1: 'Hello', '2': 'World'} + dtype: object + + >>> s.str.get(1) + 0 t + 1 2 + 2 b + 3 NaN + 4 NaN + 5 Hello + dtype: object + + >>> s.str.get(-1) + 0 g + 1 3 + 2 c + 3 NaN + 4 NaN + 5 None + dtype: object + """ + + def f(x): + if isinstance(x, dict): + return x.get(i) + elif len(x) > i >= -len(x): + return x[i] + return np.nan + + return _na_map(f, arr) + + +def str_decode(arr, encoding, errors="strict"): + """ + Decode character string in the Series/Index using indicated encoding. + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in + python3. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series or Index + """ + if encoding in _cpython_optimized_decoders: + # CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + return _na_map(f, arr) + + +def str_encode(arr, encoding, errors="strict"): + """ + Encode character string in the Series/Index using indicated encoding. + Equivalent to :meth:`str.encode`. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + encoded : Series/Index of objects + """ + if encoding in _cpython_optimized_encoders: + # CPython optimized implementation + f = lambda x: x.encode(encoding, errors) + else: + encoder = codecs.getencoder(encoding) + f = lambda x: encoder(x, errors)[0] + return _na_map(f, arr) + + +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. 
+ However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." 
+ ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return wrapper + + return _forbid_nonstring_types + + +def _noarg_wrapper( + f, + name=None, + docstring=None, + forbidden_types=["bytes"], + returns_string=True, + **kargs, +): + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper(self): + result = _na_map(f, self._parent, **kargs) + return self._wrap_result(result, returns_string=returns_string) + + wrapper.__name__ = f.__name__ if name is None else name + if docstring is not None: + wrapper.__doc__ = docstring + else: + raise ValueError("Provide docstring") + + return wrapper + + +def _pat_wrapper( + f, + flags=False, + na=False, + name=None, + forbidden_types=["bytes"], + returns_string=True, + **kwargs, +): + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper1(self, pat): + result = f(self._parent, pat) + return self._wrap_result(result, returns_string=returns_string) + + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper2(self, pat, flags=0, **kwargs): + result = f(self._parent, pat, flags=flags, **kwargs) + return self._wrap_result(result, returns_string=returns_string) + + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper3(self, pat, na=np.nan): + result = f(self._parent, pat, na=na) + return self._wrap_result(result, returns_string=returns_string) + + wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 + + wrapper.__name__ = f.__name__ if name is None else name + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + + +def copy(source): + "Copy a docstring from another source function (if present)" + + def do_copy(target): + if source.__doc__: + target.__doc__ = source.__doc__ + return target + + return do_copy + + +class StringMethods(NoNewAttributesMixin): + """ + Vectorized string functions for Series and Index. NAs stay NA unless + handled otherwise by a particular method. Patterned after Python's string + methods, with some inspiration from R's stringr package. + + Examples + -------- + >>> s.str.split('_') + >>> s.str.replace('_', '') + """ + + def __init__(self, data): + self._inferred_dtype = self._validate(data) + self._is_categorical = is_categorical_dtype(data) + self._is_string = data.dtype.name == "string" + + # .values.categories works for both Series/Index + self._parent = data.values.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data + self._freeze() + + @staticmethod + def _validate(data): + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). 
+ + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + from pandas import StringDtype + + if isinstance(data, ABCMultiIndex): + raise AttributeError( + "Can only use .str accessor with Index, not MultiIndex" + ) + + # see _libs/lib.pyx for list of inferred types + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] + + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal + + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): + return "string" + + try: + inferred_dtype = lib.infer_dtype(values, skipna=True) + except ValueError: + # GH#27571 mostly occurs with ExtensionArray + inferred_dtype = None + + if inferred_dtype not in allowed_types: + raise AttributeError("Can only use .str accessor with string values!") + return inferred_dtype + + def __getitem__(self, key): + if isinstance(key, slice): + return self.slice(start=key.start, stop=key.stop, step=key.step) + else: + return self.get(key) + + def __iter__(self): + warnings.warn( + "Columnar iteration over characters will be deprecated in future releases.", + FutureWarning, + stacklevel=2, + ) + i = 0 + g = self.get(i) + while g.notna().any(): + yield g + i += 1 + g = self.get(i) + + def _wrap_result( + self, + result, + use_codes=True, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, + ): + + from pandas import Index, Series, MultiIndex + + # for category, we do the stuff on the categories, so blow it up + # to the full series again + # But for some operations, we have to do the stuff on the full values, + # so make it possible to skip this step as the method already did this + # before the transformation... + if use_codes and self._is_categorical: + # if self._orig is a CategoricalIndex, there is no .cat-accessor + result = take_1d( + result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value + ) + + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): + return result + assert result.ndim < 3 + + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. 
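+ # For example, contains() and len() are wrapped with returns_string=False,
+ # so their boolean/numeric results keep dtype=None here instead of being
+ # cast to the nullable "string" dtype.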
+ if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + + if expand is None: + # infer from ndim if expand is not specified + expand = result.ndim != 1 + + elif expand is True and not isinstance(self._orig, ABCIndexClass): + # required when expand=True is explicitly specified + # not needed when inferred + + def cons_row(x): + if is_list_like(x): + return x + else: + return [x] + + result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + if expand is False: + # if expand is False, result should have the same name + # as the original otherwise specified + if name is None: + name = getattr(result, "name", None) + if name is None: + # do not use logical or, _orig may be a DataFrame + # which has "name" column + name = self._orig.name + + # Wait until we are sure result is a Series or Index before + # checking attributes (GH 12180) + if isinstance(self._orig, ABCIndexClass): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + + if expand: + result = list(result) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out + else: + return Index(result, name=name) + else: + index = self._orig.index + if expand: + cons = self._orig._constructor_expanddim + result = cons(result, columns=name, index=index, dtype=dtype) + else: + # Must be a Series + cons = self._orig._constructor + result = cons(result, name=name, index=index, dtype=dtype) + return result + + def _get_series_list(self, others): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into a list of Series (elements without an index must match the length + of the calling Series/Index). + + Parameters + ---------- + others : Series, DataFrame, np.ndarray, list-like or list-like of + Objects that are either Series, Index or np.ndarray (1-dim). + + Returns + ------- + list of Series + Others transformed into list of Series. + """ + from pandas import Series, DataFrame + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index + + # Generally speaking, all objects without an index inherit the index + # `idx` of the calling Series/Index - i.e. must have matching length. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. + if isinstance(others, ABCSeries): + return [others] + elif isinstance(others, ABCIndexClass): + return [Series(others.values, index=others)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return [others[x] for x in others] + elif is_list_like(others, allow_sets=False): + others = list(others) # ensure iterators do not get read twice etc + + # in case of list-like `others`, all elements must be + # either Series/Index/np.ndarray (1-dim)... 
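+ # e.g. (illustrative) others=[some_series, some_index, some_1d_array] is
+ # flattened element by element into a list of Series by the recursive
+ # calls below, before any alignment with the caller takes place.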
+ if all( + isinstance(x, (ABCSeries, ABCIndexClass)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): + los = [] + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings + elif all(not is_list_like(x) for x in others): + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) + def cat(self, others=None, sep=None, na_rep=None, join="left"): + """ + Concatenate strings in the Series/Index with given separator. + + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains a combination of Series, + Index or np.ndarray (1-dim), then all elements will be unpacked and + must satisfy the above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. + - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + Changed default of `join` from None to `'left'`. + + Returns + ------- + str, Series or Index + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. + + See Also + -------- + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. + + Examples + -------- + When not passing `others`, all values are concatenated into a single + string: + + >>> s = pd.Series(['a', 'b', np.nan, 'd']) + >>> s.str.cat(sep=' ') + 'a b d' + + By default, NA values in the Series are ignored. Using `na_rep`, they + can be given a representation: + + >>> s.str.cat(sep=' ', na_rep='?') + 'a b ? d' + + If `others` is specified, corresponding values are concatenated with + the separator. Result will be a Series of strings. 
+ + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + 0 a,A + 1 b,B + 2 NaN + 3 d,D + dtype: object + + Missing values will remain missing in the result, but can again be + represented using `na_rep` + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + 0 a,A + 1 b,B + 2 -,C + 3 d,D + dtype: object + + If `sep` is not specified, the values are concatenated without + separation. + + >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + 0 aA + 1 bB + 2 -C + 3 dD + dtype: object + + Series with different indexes can be aligned before concatenation. The + `join`-keyword works as in other methods. + + >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join='left', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='outer', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + 4 -e + dtype: object + >>> + >>> s.str.cat(t, join='inner', na_rep='-') + 0 aa + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='right', na_rep='-') + 3 dd + 0 aa + 4 -e + 2 -c + dtype: object + + For more examples, see :ref:`here `. + """ + from pandas import Index, Series, concat + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = "" + + if isinstance(self._orig, ABCIndexClass): + data = Series(self._orig, index=self._orig) + else: # Series + data = self._orig + + # concatenate Series/Index with itself if no "others" + if others is None: + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) + + try: + # turn anything in "others" into lists of Series + others = self._get_series_list(others) + except ValueError: # do not catch TypeError raised by _get_series_list + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index." + ) + + # align if required + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) + + if isinstance(self._orig, ABCIndexClass): + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) + else: # Series + if is_categorical_dtype(self._orig.dtype): + # We need to infer the new categories. 
+ dtype = None + else: + dtype = self._orig.dtype + result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) + return result + + _shared_docs[ + "str_split" + ] = r""" + Split strings around given separator/delimiter. + + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the splitted strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> s = pd.Series(["this is a regular sentence", + ... "https://docs.python.org/3/tutorial/index.html", + ... np.nan]) + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat = "/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. 
+ + >>> s.str.split(expand=True) + 0 1 2 3 + 0 this is a regular + 1 https://docs.python.org/3/tutorial/index.html None None None + 2 NaN NaN NaN NaN \ + 4 + 0 sentence + 1 None + 2 NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + + Remember to escape special characters when explicitly using regular + expressions. + + >>> s = pd.Series(["1+1=2"]) + + >>> s.str.split(r"\+|=", expand=True) + 0 1 2 + 0 1 1 2 + """ + + @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) + @forbid_nonstring_types(["bytes"]) + def split(self, pat=None, n=-1, expand=False): + result = str_split(self._parent, pat, n=n) + return self._wrap_result(result, expand=expand, returns_string=expand) + + @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) + @forbid_nonstring_types(["bytes"]) + def rsplit(self, pat=None, n=-1, expand=False): + result = str_rsplit(self._parent, pat, n=n) + return self._wrap_result(result, expand=expand, returns_string=expand) + + _shared_docs[ + "str_partition" + ] = """ + Split the string at the %(side)s occurrence of `sep`. + + This method splits the string at the %(side)s occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return %(return)s. + + Parameters + ---------- + sep : str, default whitespace + String to split on. + expand : bool, default True + If True, return DataFrame/MultiIndex expanding dimensionality. + If False, return Series/Index. + + Returns + ------- + DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + Series.str.split : Split strings around given separators. + str.partition : Standard library version. 
+ + Examples + -------- + + >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) + >>> s + 0 Linda van der Berg + 1 George Pitt-Rivers + dtype: object + + >>> s.str.partition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by the last space instead of the first one: + + >>> s.str.rpartition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by something different than a space: + + >>> s.str.partition('-') + 0 1 2 + 0 Linda van der Berg + 1 George Pitt - Rivers + + To return a Series containing tuples instead of a DataFrame: + + >>> s.str.partition('-', expand=False) + 0 (Linda van der Berg, , ) + 1 (George Pitt, -, Rivers) + dtype: object + + Also available on indices: + + >>> idx = pd.Index(['X 123', 'Y 999']) + >>> idx + Index(['X 123', 'Y 999'], dtype='object') + + Which will create a MultiIndex: + + >>> idx.str.partition() + MultiIndex([('X', ' ', '123'), + ('Y', ' ', '999')], + dtype='object') + + Or an index with tuples with ``expand=False``: + + >>> idx.str.partition(expand=False) + Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def partition(self, sep=" ", expand=True): + f = lambda x: x.partition(sep) + result = _na_map(f, self._parent) + return self._wrap_result(result, expand=expand, returns_string=expand) + + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep=" ", expand=True): + f = lambda x: x.rpartition(sep) + result = _na_map(f, self._parent) + return self._wrap_result(result, expand=expand, returns_string=expand) + + @copy(str_get) + def get(self, i): + result = str_get(self._parent, i) + return self._wrap_result(result) + + @copy(str_join) + @forbid_nonstring_types(["bytes"]) + def join(self, sep): + result = str_join(self._parent, sep) + return self._wrap_result(result) + + @copy(str_contains) + @forbid_nonstring_types(["bytes"]) + def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + result = str_contains( + self._parent, pat, case=case, flags=flags, na=na, regex=regex + ) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @copy(str_match) + @forbid_nonstring_types(["bytes"]) + def match(self, pat, case=True, flags=0, na=np.nan): + result = str_match(self._parent, pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @copy(str_replace) + @forbid_nonstring_types(["bytes"]) + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + result = str_replace( + self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex + ) + return self._wrap_result(result) + + @copy(str_repeat) + @forbid_nonstring_types(["bytes"]) + def repeat(self, repeats): + result = str_repeat(self._parent, repeats) + return self._wrap_result(result) + + @copy(str_pad) + @forbid_nonstring_types(["bytes"]) + def pad(self, width, side="left", fillchar=" "): + result = str_pad(self._parent, width, side=side, fillchar=fillchar) + return self._wrap_result(result) + + _shared_docs[ + 
"str_pad" + ] = """ + Filling %(side)s side of strings in the Series/Index with an + additional character. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with ``fillchar``. + fillchar : str + Additional character for filling, default is whitespace. + + Returns + ------- + filled : Series/Index of objects. + """ + + @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) + @forbid_nonstring_types(["bytes"]) + def center(self, width, fillchar=" "): + return self.pad(width, side="both", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width, fillchar=" "): + return self.pad(width, side="right", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width, fillchar=" "): + return self.pad(width, side="left", fillchar=fillchar) + + @forbid_nonstring_types(["bytes"]) + def zfill(self, width): + """ + Pad strings in the Series/Index by prepending '0' characters. + + Strings in the Series/Index are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the Series/Index with length greater or equal to `width` are + unchanged. + + Parameters + ---------- + width : int + Minimum length of resulting string; strings with length less + than `width` be prepended with '0' characters. + + Returns + ------- + Series/Index of objects. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.pad : Fills the specified sides of strings with an arbitrary + character. + Series.str.center : Fills boths sides of strings with an arbitrary + character. + + Notes + ----- + Differs from :meth:`str.zfill` which has special handling + for '+'/'-' in the string. + + Examples + -------- + >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 10 + 4 NaN + dtype: object + + Note that ``10`` and ``NaN`` are not strings, therefore they are + converted to ``NaN``. The minus sign in ``'-1'`` is treated as a + regular character and the zero is added to the left of it + (:meth:`str.zfill` would have moved it to the left). ``1000`` + remains unchanged as it is longer than `width`. + + >>> s.str.zfill(3) + 0 0-1 + 1 001 + 2 1000 + 3 NaN + 4 NaN + dtype: object + """ + result = str_pad(self._parent, width, side="left", fillchar="0") + return self._wrap_result(result) + + @copy(str_slice) + def slice(self, start=None, stop=None, step=None): + result = str_slice(self._parent, start, stop, step) + return self._wrap_result(result) + + @copy(str_slice_replace) + @forbid_nonstring_types(["bytes"]) + def slice_replace(self, start=None, stop=None, repl=None): + result = str_slice_replace(self._parent, start, stop, repl) + return self._wrap_result(result) + + @copy(str_decode) + def decode(self, encoding, errors="strict"): + # need to allow bytes here + result = str_decode(self._parent, encoding, errors) + # TODO: Not sure how to handle this. 
+ return self._wrap_result(result, returns_string=False) + + @copy(str_encode) + @forbid_nonstring_types(["bytes"]) + def encode(self, encoding, errors="strict"): + result = str_encode(self._parent, encoding, errors) + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "str_strip" + ] = r""" + Remove leading and trailing characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the Series/Index from %(side)s. + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. + All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. + + Examples + -------- + >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) + >>> s + 0 1. Ant. + 1 2. Bee!\n + 2 3. Cat?\t + 3 NaN + dtype: object + + >>> s.str.strip() + 0 1. Ant. + 1 2. Bee! + 2 3. Cat? + 3 NaN + dtype: object + + >>> s.str.lstrip('123.') + 0 Ant. + 1 Bee!\n + 2 Cat?\t + 3 NaN + dtype: object + + >>> s.str.rstrip('.!? \n\t') + 0 1. Ant + 1 2. Bee + 2 3. Cat + 3 NaN + dtype: object + + >>> s.str.strip('123.!? \n\t') + 0 Ant + 1 Bee + 2 Cat + 3 NaN + dtype: object + """ + + @Appender( + _shared_docs["str_strip"] % dict(side="left and right sides", method="strip") + ) + @forbid_nonstring_types(["bytes"]) + def strip(self, to_strip=None): + result = str_strip(self._parent, to_strip, side="both") + return self._wrap_result(result) + + @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip")) + @forbid_nonstring_types(["bytes"]) + def lstrip(self, to_strip=None): + result = str_strip(self._parent, to_strip, side="left") + return self._wrap_result(result) + + @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip")) + @forbid_nonstring_types(["bytes"]) + def rstrip(self, to_strip=None): + result = str_strip(self._parent, to_strip, side="right") + return self._wrap_result(result) + + @copy(str_wrap) + @forbid_nonstring_types(["bytes"]) + def wrap(self, width, **kwargs): + result = str_wrap(self._parent, width, **kwargs) + return self._wrap_result(result) + + @copy(str_get_dummies) + @forbid_nonstring_types(["bytes"]) + def get_dummies(self, sep="|"): + # we need to cast to Series of strings as only that has all + # methods available for making the dummies... 
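+ # str_get_dummies returns a (values, column_names) pair; the names are
+ # passed to _wrap_result as `name` below so they become the result columns.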
+ data = self._orig.astype(str) if self._is_categorical else self._parent + result, name = str_get_dummies(data, sep) + return self._wrap_result( + result, + use_codes=(not self._is_categorical), + name=name, + expand=True, + returns_string=False, + ) + + @copy(str_translate) + @forbid_nonstring_types(["bytes"]) + def translate(self, table): + result = str_translate(self._parent, table) + return self._wrap_result(result) + + count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) + startswith = _pat_wrapper( + str_startswith, na=True, name="startswith", returns_string=False + ) + endswith = _pat_wrapper( + str_endswith, na=True, name="endswith", returns_string=False + ) + findall = _pat_wrapper( + str_findall, flags=True, name="findall", returns_string=False + ) + + @copy(str_extract) + @forbid_nonstring_types(["bytes"]) + def extract(self, pat, flags=0, expand=True): + return str_extract(self, pat, flags=flags, expand=expand) + + @copy(str_extractall) + @forbid_nonstring_types(["bytes"]) + def extractall(self, pat, flags=0): + return str_extractall(self._orig, pat, flags=flags) + + _shared_docs[ + "find" + ] = """ + Return %(side)s indexes in each strings in the Series/Index + where the substring is fully contained between [start:end]. + Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. + + Returns + ------- + Series or Index of int. + + See Also + -------- + %(also)s + """ + + @Appender( + _shared_docs["find"] + % dict( + side="lowest", + method="find", + also="rfind : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) + def find(self, sub, start=0, end=None): + result = str_find(self._parent, sub, start=start, end=end, side="left") + return self._wrap_result(result, returns_string=False) + + @Appender( + _shared_docs["find"] + % dict( + side="highest", + method="rfind", + also="find : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) + def rfind(self, sub, start=0, end=None): + result = str_find(self._parent, sub, start=start, end=end, side="right") + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def normalize(self, form): + """ + Return the Unicode normal form for the strings in the Series/Index. + For more information on the forms, see the + :func:`unicodedata.normalize`. + + Parameters + ---------- + form : {'NFC', 'NFKC', 'NFD', 'NFKD'} + Unicode form. + + Returns + ------- + normalized : Series/Index of objects + """ + import unicodedata + + f = lambda x: unicodedata.normalize(form, x) + result = _na_map(f, self._parent, dtype=str) + return self._wrap_result(result) + + _shared_docs[ + "index" + ] = """ + Return %(side)s indexes in each strings where the substring is + fully contained between [start:end]. This is the same as + ``str.%(similar)s`` except instead of returning -1, it raises a ValueError + when the substring is not found. Equivalent to standard ``str.%(method)s``. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. 
+ + Returns + ------- + Series or Index of object + + See Also + -------- + %(also)s + """ + + @Appender( + _shared_docs["index"] + % dict( + side="lowest", + similar="find", + method="index", + also="rindex : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) + def index(self, sub, start=0, end=None): + result = str_index(self._parent, sub, start=start, end=end, side="left") + return self._wrap_result(result, returns_string=False) + + @Appender( + _shared_docs["index"] + % dict( + side="highest", + similar="rfind", + method="rindex", + also="index : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) + def rindex(self, sub, start=0, end=None): + result = str_index(self._parent, sub, start=start, end=end, side="right") + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "len" + ] = """ + Compute the length of each element in the Series/Index. The element may be + a sequence (such as a string, tuple or list) or a collection + (such as a dictionary). + + Returns + ------- + Series or Index of int + A Series or Index of integer values indicating the length of each + element in the Series or Index. + + See Also + -------- + str.len : Python built-in function returning the length of an object. + Series.size : Returns the length of the Series. + + Examples + -------- + Returns the length (number of characters) in a string. Returns the + number of entries for dictionaries, lists or tuples. + + >>> s = pd.Series(['dog', + ... '', + ... 5, + ... {'foo' : 'bar'}, + ... [2, 3, 5, 7], + ... ('one', 'two', 'three')]) + >>> s + 0 dog + 1 + 2 5 + 3 {'foo': 'bar'} + 4 [2, 3, 5, 7] + 5 (one, two, three) + dtype: object + >>> s.str.len() + 0 3.0 + 1 0.0 + 2 NaN + 3 1.0 + 4 4.0 + 5 3.0 + dtype: float64 + """ + len = _noarg_wrapper( + len, + docstring=_shared_docs["len"], + forbidden_types=None, + dtype="int64", + returns_string=False, + ) + + _shared_docs[ + "casemethods" + ] = """ + Convert strings in the Series/Index to %(type)s. + %(version)s + Equivalent to :meth:`str.%(method)s`. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.lower : Converts all characters to lowercase. + Series.str.upper : Converts all characters to uppercase. + Series.str.title : Converts first character of each word to uppercase and + remaining to lowercase. + Series.str.capitalize : Converts first character to uppercase and + remaining to lowercase. + Series.str.swapcase : Converts uppercase to lowercase and lowercase to + uppercase. + Series.str.casefold: Removes all case distinctions in the string. 
+ + Examples + -------- + >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) + >>> s + 0 lower + 1 CAPITALS + 2 this is a sentence + 3 SwApCaSe + dtype: object + + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: object + + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: object + + >>> s.str.title() + 0 Lower + 1 Capitals + 2 This Is A Sentence + 3 Swapcase + dtype: object + + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: object + + >>> s.str.swapcase() + 0 LOWER + 1 capitals + 2 THIS IS A SENTENCE + 3 sWaPcAsE + dtype: object + """ + + # _doc_args holds dict of strings to use in substituting casemethod docs + _doc_args: Dict[str, Dict[str, str]] = {} + _doc_args["lower"] = dict(type="lowercase", method="lower", version="") + _doc_args["upper"] = dict(type="uppercase", method="upper", version="") + _doc_args["title"] = dict(type="titlecase", method="title", version="") + _doc_args["capitalize"] = dict( + type="be capitalized", method="capitalize", version="" + ) + _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") + _doc_args["casefold"] = dict( + type="be casefolded", + method="casefold", + version="\n .. versionadded:: 0.25.0\n", + ) + lower = _noarg_wrapper( + lambda x: x.lower(), + name="lower", + docstring=_shared_docs["casemethods"] % _doc_args["lower"], + dtype=str, + ) + upper = _noarg_wrapper( + lambda x: x.upper(), + name="upper", + docstring=_shared_docs["casemethods"] % _doc_args["upper"], + dtype=str, + ) + title = _noarg_wrapper( + lambda x: x.title(), + name="title", + docstring=_shared_docs["casemethods"] % _doc_args["title"], + dtype=str, + ) + capitalize = _noarg_wrapper( + lambda x: x.capitalize(), + name="capitalize", + docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + dtype=str, + ) + swapcase = _noarg_wrapper( + lambda x: x.swapcase(), + name="swapcase", + docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + dtype=str, + ) + casefold = _noarg_wrapper( + lambda x: x.casefold(), + name="casefold", + docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + dtype=str, + ) + + _shared_docs[ + "ismethods" + ] = """ + Check whether all characters in each string are %(type)s. + + This is equivalent to running the Python string method + :meth:`str.%(method)s` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns + ------- + Series or Index of bool + Series or Index of boolean values with the same length as the original + Series/Index. + + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. 
+ + Examples + -------- + **Checks for Alphabetic and Numeric Characters** + + >>> s1 = pd.Series(['one', 'one1', '1', '']) + + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + Note that checks against characters mixed with any additional punctuation + or whitespace will evaluate to false for an alphanumeric check. + + >>> s2 = pd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: bool + + **More Detailed Checks for Numeric Characters** + + There are several different but overlapping sets of numeric characters that + can be checked for. + + >>> s3 = pd.Series(['23', '³', '⅕', '']) + + The ``s3.str.isdecimal`` method checks for characters used to form numbers + in base 10. + + >>> s3.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also + includes special digits, like superscripted and subscripted digits in + unicode. + + >>> s3.str.isdigit() + 0 True + 1 True + 2 False + 3 False + dtype: bool + + The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also + includes other characters that can represent quantities such as unicode + fractions. + + >>> s3.str.isnumeric() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + **Checks for Whitespace** + + >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) + >>> s4.str.isspace() + 0 True + 1 True + 2 False + dtype: bool + + **Checks for Character Case** + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + + >>> s5.str.islower() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s5.str.isupper() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + The ``s5.str.istitle`` method checks for whether all words are in title + case (whether only the first letter of each word is capitalized). Words are + assumed to be as any sequence of non-numeric characters separated by + whitespace characters. 
+ + >>> s5.str.istitle() + 0 False + 1 True + 2 False + 3 False + dtype: bool + """ + _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") + _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") + _doc_args["isdigit"] = dict(type="digits", method="isdigit") + _doc_args["isspace"] = dict(type="whitespace", method="isspace") + _doc_args["islower"] = dict(type="lowercase", method="islower") + _doc_args["isupper"] = dict(type="uppercase", method="isupper") + _doc_args["istitle"] = dict(type="titlecase", method="istitle") + _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") + _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + # force _noarg_wrapper return type with dtype=bool (GH 29624) + isalnum = _noarg_wrapper( + lambda x: x.isalnum(), + name="isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], + returns_string=False, + dtype=bool, + ) + isalpha = _noarg_wrapper( + lambda x: x.isalpha(), + name="isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], + returns_string=False, + dtype=bool, + ) + isdigit = _noarg_wrapper( + lambda x: x.isdigit(), + name="isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], + returns_string=False, + dtype=bool, + ) + isspace = _noarg_wrapper( + lambda x: x.isspace(), + name="isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"], + returns_string=False, + dtype=bool, + ) + islower = _noarg_wrapper( + lambda x: x.islower(), + name="islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"], + returns_string=False, + dtype=bool, + ) + isupper = _noarg_wrapper( + lambda x: x.isupper(), + name="isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"], + returns_string=False, + dtype=bool, + ) + istitle = _noarg_wrapper( + lambda x: x.istitle(), + name="istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"], + returns_string=False, + dtype=bool, + ) + isnumeric = _noarg_wrapper( + lambda x: x.isnumeric(), + name="isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], + returns_string=False, + dtype=bool, + ) + isdecimal = _noarg_wrapper( + lambda x: x.isdecimal(), + name="isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], + returns_string=False, + dtype=bool, + ) + + @classmethod + def _make_accessor(cls, data): + cls._validate(data) + return cls(data) diff --git a/venv/Lib/site-packages/pandas/core/tools/__init__.py b/venv/Lib/site-packages/pandas/core/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/tools/datetimes.py b/venv/Lib/site-packages/pandas/core/tools/datetimes.py new file mode 100644 index 0000000..8c2be70 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/tools/datetimes.py @@ -0,0 +1,1041 @@ +from collections import abc +from datetime import datetime, time +from functools import partial +from itertools import islice +from typing import Optional, TypeVar, Union + +import numpy as np + +from pandas._libs import tslib, tslibs +from pandas._libs.tslibs import Timestamp, conversion, parsing +from pandas._libs.tslibs.parsing import ( # noqa + DateParseError, + _format_is_iso, + _guess_datetime_format, + parse_time_string, +) +from pandas._libs.tslibs.strptime import array_strptime +from pandas._typing import ArrayLike + +from pandas.core.dtypes.common import ( + ensure_object, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + 
    is_integer_dtype,
+    is_list_like,
+    is_numeric_dtype,
+    is_scalar,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCDatetimeIndex,
+    ABCIndex,
+    ABCIndexClass,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import notna
+
+from pandas.arrays import IntegerArray
+from pandas.core import algorithms
+from pandas.core.algorithms import unique
+
+# ---------------------------------------------------------------------
+# types used in annotations
+
+ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
+Scalar = Union[int, float, str]
+DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime)
+DatetimeScalarOrArrayConvertible = Union[
+    DatetimeScalar, list, tuple, ArrayLike, ABCSeries
+]
+
+
+# ---------------------------------------------------------------------
+
+
+def _guess_datetime_format_for_array(arr, **kwargs):
+    # Try to guess the format based on the first non-NaN element
+    non_nan_elements = notna(arr).nonzero()[0]
+    if len(non_nan_elements):
+        return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
+
+
+def should_cache(
+    arg: ArrayConvertible, unique_share: float = 0.7, check_count: Optional[int] = None
+) -> bool:
+    """
+    Decides whether to do caching.
+
+    If the percent of unique elements among `check_count` elements is less
+    than `unique_share * 100` then we can do caching.
+
+    Parameters
+    ----------
+    arg: listlike, tuple, 1-d array, Series
+    unique_share: float, default=0.7, optional
+        0 < unique_share < 1
+    check_count: int, optional
+        0 <= check_count <= len(arg)
+
+    Returns
+    -------
+    do_caching: bool
+
+    Notes
+    -----
+    By default for a sequence of less than 50 items in size, we don't do
+    caching; for the number of elements less than 5000, we take ten percent of
+    all elements to check for a uniqueness share; if the sequence size is more
+    than 5000, then we check only the first 500 elements.
+    All constants were chosen empirically.
+    """
+    do_caching = True
+
+    # default realization
+    if check_count is None:
+        # in this case, the gain from caching is negligible
+        if len(arg) <= 50:
+            return False
+
+        if len(arg) <= 5000:
+            check_count = int(len(arg) * 0.1)
+        else:
+            check_count = 500
+    else:
+        assert (
+            0 <= check_count <= len(arg)
+        ), "check_count must be in next bounds: [0; len(arg)]"
+        if check_count == 0:
+            return False
+
+    assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"
+
+    unique_elements = set(islice(arg, check_count))
+    if len(unique_elements) > check_count * unique_share:
+        do_caching = False
+    return do_caching
+
+
+def _maybe_cache(arg, format, cache, convert_listlike):
+    """
+    Create a cache of unique dates from an array of dates
+
+    Parameters
+    ----------
+    arg : listlike, tuple, 1-d array, Series
+    format : string
+        Strftime format to parse time
+    cache : boolean
+        True attempts to create a cache of converted values
+    convert_listlike : function
+        Conversion function to apply on dates
+
+    Returns
+    -------
+    cache_array : Series
+        Cache of converted, unique dates.
Can be empty + """ + from pandas import Series + + cache_array = Series(dtype=object) + + if cache: + # Perform a quicker unique check + if not should_cache(arg): + return cache_array + + unique_dates = unique(arg) + if len(unique_dates) < len(arg): + cache_dates = convert_listlike(unique_dates, format) + cache_array = Series(cache_dates, index=unique_dates) + return cache_array + + +def _box_as_indexlike( + dt_array: ArrayLike, utc: Optional[bool] = None, name: Optional[str] = None +) -> Union[ABCIndex, ABCDatetimeIndex]: + """ + Properly boxes the ndarray of datetimes to DatetimeIndex + if it is possible or to generic Index instead + + Parameters + ---------- + dt_array: 1-d array + Array of datetimes to be wrapped in an Index. + tz : object + None or 'utc' + name : string, default None + Name for a resulting index + + Returns + ------- + result : datetime of converted dates + - DatetimeIndex if convertible to sole datetime64 type + - general Index otherwise + """ + from pandas import DatetimeIndex, Index + + if is_datetime64_dtype(dt_array): + tz = "utc" if utc else None + return DatetimeIndex(dt_array, tz=tz, name=name) + return Index(dt_array, name=name) + + +def _convert_and_box_cache( + arg: DatetimeScalarOrArrayConvertible, + cache_array: ABCSeries, + name: Optional[str] = None, +) -> ABCIndexClass: + """ + Convert array of dates with a cache and wrap the result in an Index. + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + cache_array : Series + Cache of converted, unique dates + name : string, default None + Name for a DatetimeIndex + + Returns + ------- + result : Index-like of converted dates + """ + from pandas import Series + + result = Series(arg).map(cache_array) + return _box_as_indexlike(result, utc=None, name=name) + + +def _return_parsed_timezone_results(result, timezones, tz, name): + """ + Return results from array_strptime if a %z or %Z directive was passed. + + Parameters + ---------- + result : ndarray + int64 date representations of the dates + timezones : ndarray + pytz timezone objects + tz : object + None or pytz timezone object + name : string, default None + Name for a DatetimeIndex + + Returns + ------- + tz_result : Index-like of parsed dates with timezone + """ + if tz is not None: + raise ValueError( + "Cannot pass a tz argument when " + "parsing strings with timezone " + "information." + ) + tz_results = np.array( + [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] + ) + from pandas import Index + + return Index(tz_results, name=name) + + +def _convert_listlike_datetimes( + arg, + format, + name=None, + tz=None, + unit=None, + errors=None, + infer_datetime_format=None, + dayfirst=None, + yearfirst=None, + exact=None, +): + """ + Helper function for to_datetime. 
Performs the conversions of 1D listlike + of dates + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be parced + name : object + None or string for the Index name + tz : object + None or 'utc' + unit : string + None or string of the frequency of the passed data + errors : string + error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + infer_datetime_format : boolean + inferring format behavior from to_datetime + dayfirst : boolean + dayfirst parsing behavior from to_datetime + yearfirst : boolean + yearfirst parsing behavior from to_datetime + exact : boolean + exact format matching behavior from to_datetime + + Returns + ------- + Index-like of parsed dates + """ + from pandas import DatetimeIndex + from pandas.core.arrays import DatetimeArray + from pandas.core.arrays.datetimes import ( + maybe_convert_dtype, + objects_to_datetime64ns, + ) + + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype="O") + + # these are shortcutable + if is_datetime64tz_dtype(arg): + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): + return DatetimeIndex(arg, tz=tz, name=name) + if tz == "utc": + arg = arg.tz_convert(None).tz_localize(tz) + return arg + + elif is_datetime64_ns_dtype(arg): + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): + try: + return DatetimeIndex(arg, tz=tz, name=name) + except ValueError: + pass + elif tz: + # DatetimeArray, DatetimeIndex + return arg.tz_localize(tz) + + return arg + + elif unit is not None: + if format is not None: + raise ValueError("cannot specify both format and unit") + arg = getattr(arg, "_values", arg) + + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + # Explicitly pass NaT mask to array_with_unit_to_datetime + mask = arg.isna() + arg = arg._ndarray_values + else: + mask = None + + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, mask, unit, errors=errors + ) + + if errors == "ignore": + from pandas import Index + + result = Index(result, name=name) + else: + result = DatetimeIndex(result, name=name) + # GH 23758: We may still need to localize the result with tz + # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) + # result will be naive but in UTC + try: + result = result.tz_localize("UTC").tz_convert(tz_parsed) + except AttributeError: + # Regular Index from 'ignore' path + return result + if tz is not None: + if result.tz is None: + result = result.tz_localize(tz) + else: + result = result.tz_convert(tz) + return result + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, 1-d array, or Series" + ) + + # warn if passing timedelta64, raise for PeriodDtype + # NB: this must come after unit transformation + orig_arg = arg + arg, _ = maybe_convert_dtype(arg, copy=False) + + arg = ensure_object(arg) + require_iso8601 = False + + if infer_datetime_format and format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + + if format is not None: + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + format_is_iso8601 = _format_is_iso(format) + if format_is_iso8601: + require_iso8601 = not infer_datetime_format + format = None + + tz_parsed = None + result = None + + if format is not None: + try: + # shortcut formatting here + if format == "%Y%m%d": + try: + # 
pass orig_arg as float-dtype may have been converted to + # datetime64[ns] + orig_arg = ensure_object(orig_arg) + result = _attempt_YYYYMMDD(orig_arg, errors=errors) + except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): + raise ValueError("cannot convert the input to '%Y%m%d' date format") + + # fallback + if result is None: + try: + result, timezones = array_strptime( + arg, format, exact=exact, errors=errors + ) + if "%Z" in format or "%z" in format: + return _return_parsed_timezone_results( + result, timezones, tz, name + ) + except tslibs.OutOfBoundsDatetime: + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(tslibs.iNaT) + else: + result = arg + except ValueError: + # if format was inferred, try falling back + # to array_to_datetime - terminate here + # for specified formats + if not infer_datetime_format: + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(tslibs.iNaT) + else: + result = arg + except ValueError as e: + # Fallback to try to convert datetime objects if timezone-aware + # datetime objects are found without passing `utc=True` + try: + values, tz = conversion.datetime_to_datetime64(arg) + return DatetimeIndex._simple_new(values, name=name, tz=tz) + except (ValueError, TypeError): + raise e + + if result is None: + assert format is None or infer_datetime_format + utc = tz == "utc" + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + require_iso8601=require_iso8601, + allow_object=True, + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) + + utc = tz == "utc" + return _box_as_indexlike(result, utc=utc, name=name) + + +def _adjust_to_origin(arg, origin, unit): + """ + Helper function for to_datetime. 
+ Adjust input argument to the specified origin + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be adjusted + origin : 'julian' or Timestamp + origin offset for the arg + unit : string + passed unit from to_datetime, must be 'D' + + Returns + ------- + ndarray or scalar of adjusted date(s) + """ + if origin == "julian": + original = arg + j0 = Timestamp(0).to_julian_date() + if unit != "D": + raise ValueError("unit must be 'D' for origin='julian'") + try: + arg = arg - j0 + except TypeError: + raise ValueError("incompatible 'arg' type for given 'origin'='julian'") + + # preemptively check this for a nice range + j_max = Timestamp.max.to_julian_date() - j0 + j_min = Timestamp.min.to_julian_date() - j0 + if np.any(arg > j_max) or np.any(arg < j_min): + raise tslibs.OutOfBoundsDatetime( + f"{original} is Out of Bounds for origin='julian'" + ) + else: + # arg must be numeric + if not ( + (is_scalar(arg) and (is_integer(arg) or is_float(arg))) + or is_numeric_dtype(np.asarray(arg)) + ): + raise ValueError( + f"'{arg}' is not compatible with origin='{origin}'; " + "it must be numeric with a unit specified" + ) + + # we are going to offset back to unix / epoch time + try: + offset = Timestamp(origin) + except tslibs.OutOfBoundsDatetime: + raise tslibs.OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") + except ValueError: + raise ValueError(f"origin {origin} cannot be converted to a Timestamp") + + if offset.tz is not None: + raise ValueError(f"origin offset {offset} must be tz-naive") + offset -= Timestamp(0) + + # convert the offset to the unit of the arg + # this should be lossless in terms of precision + offset = offset // tslibs.Timedelta(1, unit=unit) + + # scalars & ndarray-like can handle the addition + if is_list_like(arg) and not isinstance( + arg, (ABCSeries, ABCIndexClass, np.ndarray) + ): + arg = np.asarray(arg) + arg = arg + offset + return arg + + +def to_datetime( + arg, + errors="raise", + dayfirst=False, + yearfirst=False, + utc=None, + format=None, + exact=True, + unit=None, + infer_datetime_format=False, + origin="unix", + cache=True, +): + """ + Convert argument to datetime. + + Parameters + ---------- + arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + The object to convert to a datetime. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + dayfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + If True, parses dates with the day first, eg 10/11/12 is parsed as + 2012-11-10. + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug, based on dateutil behavior). + yearfirst : bool, default False + Specify a date parse order if `arg` is str or its list-likes. + + - If True parses dates with the year first, eg 10/11/12 is parsed as + 2010-11-12. + - If both dayfirst and yearfirst are True, yearfirst is preceded (same + as dateutil). + + Warning: yearfirst=True is not strict, but will prefer to parse + with year first (this is a known bug, based on dateutil behavior). + utc : bool, default None + Return UTC DatetimeIndex if True (converting any tz-aware + datetime.datetime objects as well). + format : str, default None + The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + all the way up to nanoseconds. 
+ See strftime documentation for more information on choices: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + exact : bool, True by default + Behaves as: + - If True, require an exact format match. + - If False, allow the format to match anywhere in the target string. + + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. + Example, with unit='ms' and origin='unix' (the default), this + would calculate the number of milliseconds to the unix epoch start. + infer_datetime_format : bool, default False + If True and no `format` is given, attempt to infer the format of the + datetime strings, and if it can be inferred, switch to a faster + method of parsing them. In some cases this can increase the parsing + speed by ~5-10x. + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If 'unix' (or POSIX) time; origin is set to 1970-01-01. + - If 'julian', unit must be 'D', and origin is set to beginning of + Julian Calendar. Julian day number 0 is assigned to the day starting + at noon on January 1, 4713 BC. + - If Timestamp convertible, origin is set to Timestamp identified by + origin. + cache : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.23.0 + + .. versionchanged:: 0.25.0 + - changed default value from False to True. + + Returns + ------- + datetime + If parsing succeeded. + Return type depends on input: + + - list-like: DatetimeIndex + - Series: Series of datetime64 dtype + - scalar: Timestamp + + In case when it is not possible to return designated types (e.g. when + any element of input is before Timestamp.min or after Timestamp.max) + return will have datetime.datetime type (or corresponding + array/Series). + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. + + Examples + -------- + Assembling a datetime from multiple columns of a DataFrame. The keys can be + common abbreviations like ['year', 'month', 'day', 'minute', 'second', + 'ms', 'us', 'ns']) or plurals of the same + + >>> df = pd.DataFrame({'year': [2015, 2016], + ... 'month': [2, 3], + ... 'day': [4, 5]}) + >>> pd.to_datetime(df) + 0 2015-02-04 + 1 2016-03-05 + dtype: datetime64[ns] + + If a date does not meet the `timestamp limitations + `_, passing errors='ignore' + will return the original input instead of raising any exception. + + Passing errors='coerce' will force an out-of-bounds date to NaT, + in addition to forcing non-dates (or non-parseable dates) to NaT. + + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') + datetime.datetime(1300, 1, 1, 0, 0) + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + NaT + + Passing infer_datetime_format=True can often-times speedup a parsing + if its not an ISO8601 format exactly, but in a regular format. 
+ + >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) + >>> s.head() + 0 3/11/2000 + 1 3/12/2000 + 2 3/13/2000 + 3 3/11/2000 + 4 3/12/2000 + dtype: object + + >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP + 100 loops, best of 3: 10.4 ms per loop + + >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP + 1 loop, best of 3: 471 ms per loop + + Using a unix epoch time + + >>> pd.to_datetime(1490195805, unit='s') + Timestamp('2017-03-22 15:16:45') + >>> pd.to_datetime(1490195805433502912, unit='ns') + Timestamp('2017-03-22 15:16:45.433502912') + + .. warning:: For float arg, precision rounding might happen. To prevent + unexpected behavior use a fixed-width exact type. + + Using a non-unix epoch origin + + >>> pd.to_datetime([1, 2, 3], unit='D', + ... origin=pd.Timestamp('1960-01-01')) + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], \ +dtype='datetime64[ns]', freq=None) + """ + if arg is None: + return None + + if origin != "unix": + arg = _adjust_to_origin(arg, origin, unit) + + tz = "utc" if utc else None + convert_listlike = partial( + _convert_listlike_datetimes, + tz=tz, + unit=unit, + dayfirst=dayfirst, + yearfirst=yearfirst, + errors=errors, + exact=exact, + infer_datetime_format=infer_datetime_format, + ) + + if isinstance(arg, Timestamp): + result = arg + if tz is not None: + if arg.tz is not None: + result = result.tz_convert(tz) + else: + result = result.tz_localize(tz) + elif isinstance(arg, ABCSeries): + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + if not cache_array.empty: + result = arg.map(cache_array) + else: + values = convert_listlike(arg._values, format) + result = arg._constructor(values, index=arg.index, name=arg.name) + elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): + result = _assemble_from_unit_mappings(arg, errors, tz) + elif isinstance(arg, ABCIndexClass): + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, name=arg.name) + else: + convert_listlike = partial(convert_listlike, name=arg.name) + result = convert_listlike(arg, format) + elif is_list_like(arg): + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array) + else: + result = convert_listlike(arg, format) + else: + result = convert_listlike(np.array([arg]), format)[0] + + return result + + +# mappings for assembling units +_unit_map = { + "year": "year", + "years": "year", + "month": "month", + "months": "month", + "day": "day", + "days": "day", + "hour": "h", + "hours": "h", + "minute": "m", + "minutes": "m", + "second": "s", + "seconds": "s", + "ms": "ms", + "millisecond": "ms", + "milliseconds": "ms", + "us": "us", + "microsecond": "us", + "microseconds": "us", + "ns": "ns", + "nanosecond": "ns", + "nanoseconds": "ns", +} + + +def _assemble_from_unit_mappings(arg, errors, tz): + """ + assemble the unit specified fields from the arg (DataFrame) + Return a Series for actual parsing + + Parameters + ---------- + arg : DataFrame + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + + - If 'raise', then invalid parsing will raise an exception + - If 'coerce', then invalid parsing will be set as NaT + - If 'ignore', then invalid parsing will return the input + tz : None or 'utc' + + Returns + ------- + Series + """ + from pandas import to_timedelta, to_numeric, DataFrame + + arg = DataFrame(arg) + if not 
arg.columns.is_unique: + raise ValueError("cannot assemble with duplicate keys") + + # replace passed unit with _unit_map + def f(value): + if value in _unit_map: + return _unit_map[value] + + # m is case significant + if value.lower() in _unit_map: + return _unit_map[value.lower()] + + return value + + unit = {k: f(k) for k in arg.keys()} + unit_rev = {v: k for k, v in unit.items()} + + # we require at least Ymd + required = ["year", "month", "day"] + req = sorted(set(required) - set(unit_rev.keys())) + if len(req): + required = ",".join(req) + raise ValueError( + "to assemble mappings requires at least that " + f"[year, month, day] be specified: [{required}] " + "is missing" + ) + + # keys we don't recognize + excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + if len(excess): + excess = ",".join(excess) + raise ValueError( + f"extra keys have been passed to the datetime assemblage: [{excess}]" + ) + + def coerce(values): + # we allow coercion to if errors allows + values = to_numeric(values, errors=errors) + + # prevent overflow in case of int8 or int16 + if is_integer_dtype(values): + values = values.astype("int64", copy=False) + return values + + values = ( + coerce(arg[unit_rev["year"]]) * 10000 + + coerce(arg[unit_rev["month"]]) * 100 + + coerce(arg[unit_rev["day"]]) + ) + try: + values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz) + except (TypeError, ValueError) as err: + raise ValueError(f"cannot assemble the datetimes: {err}") + + for u in ["h", "m", "s", "ms", "us", "ns"]: + value = unit_rev.get(u) + if value is not None and value in arg: + try: + values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) + except (TypeError, ValueError) as err: + raise ValueError(f"cannot assemble the datetimes [{value}]: {err}") + return values + + +def _attempt_YYYYMMDD(arg, errors): + """ + try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, + arg is a passed in as an object dtype, but could really be ints/strings + with nan-like/or floats (e.g. 
with nan) + + Parameters + ---------- + arg : passed value + errors : 'raise','ignore','coerce' + """ + + def calc(carg): + # calculate the actual result + carg = carg.astype(object) + parsed = parsing.try_parse_year_month_day( + carg / 10000, carg / 100 % 100, carg % 100 + ) + return tslib.array_to_datetime(parsed, errors=errors)[0] + + def calc_with_mask(carg, mask): + result = np.empty(carg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult[~mask] = tslibs.iNaT + + masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) + result[mask] = masked_result.astype("M8[ns]") + return result + + # try intlike / strings that are ints + try: + return calc(arg.astype(np.int64)) + except (ValueError, OverflowError, TypeError): + pass + + # a float with actual np.nan + try: + carg = arg.astype(np.float64) + return calc_with_mask(carg, notna(carg)) + except (ValueError, OverflowError, TypeError): + pass + + # string with NaN-like + try: + mask = ~algorithms.isin(arg, list(tslib.nat_strings)) + return calc_with_mask(arg, mask) + except (ValueError, OverflowError, TypeError): + pass + + return None + + +# Fixed time formats for time parsing +_time_formats = [ + "%H:%M", + "%H%M", + "%I:%M%p", + "%I%M%p", + "%H:%M:%S", + "%H%M%S", + "%I:%M:%S%p", + "%I%M%S%p", +] + + +def _guess_time_format_for_array(arr): + # Try to guess the format based on the first non-NaN element + non_nan_elements = notna(arr).nonzero()[0] + if len(non_nan_elements): + element = arr[non_nan_elements[0]] + for time_format in _time_formats: + try: + datetime.strptime(element, time_format) + return time_format + except ValueError: + pass + + return None + + +def to_time(arg, format=None, infer_time_format=False, errors="raise"): + """ + Parse time strings to time objects using fixed strptime formats ("%H:%M", + "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p") + + Use infer_time_format if all the strings are in the same format to speed + up conversion. + + Parameters + ---------- + arg : string in time format, datetime.time, list, tuple, 1-d array, Series + format : str, default None + Format used to convert arg into a time object. If None, fixed formats + are used. + infer_time_format: bool, default False + Infer the time format based on the first non-NaN element. If all + strings are in the same format, this will speed up conversion. 
+ errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception + - If 'coerce', then invalid parsing will be set as None + - If 'ignore', then invalid parsing will return the input + + Returns + ------- + datetime.time + """ + + def _convert_listlike(arg, format): + + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype="O") + + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, 1-d array, or Series" + ) + + arg = ensure_object(arg) + + if infer_time_format and format is None: + format = _guess_time_format_for_array(arg) + + times = [] + if format is not None: + for element in arg: + try: + times.append(datetime.strptime(element, format).time()) + except (ValueError, TypeError): + if errors == "raise": + msg = ( + f"Cannot convert {element} to a time with given " + f"format {format}" + ) + raise ValueError(msg) + elif errors == "ignore": + return arg + else: + times.append(None) + else: + formats = _time_formats[:] + format_found = False + for element in arg: + time_object = None + for time_format in formats: + try: + time_object = datetime.strptime(element, time_format).time() + if not format_found: + # Put the found format in front + fmt = formats.pop(formats.index(time_format)) + formats.insert(0, fmt) + format_found = True + break + except (ValueError, TypeError): + continue + + if time_object is not None: + times.append(time_object) + elif errors == "raise": + raise ValueError(f"Cannot convert arg {arg} to a time") + elif errors == "ignore": + return arg + else: + times.append(None) + + return times + + if arg is None: + return arg + elif isinstance(arg, time): + return arg + elif isinstance(arg, ABCSeries): + values = _convert_listlike(arg._values, format) + return arg._constructor(values, index=arg.index, name=arg.name) + elif isinstance(arg, ABCIndexClass): + return _convert_listlike(arg, format) + elif is_list_like(arg): + return _convert_listlike(arg, format) + + return _convert_listlike(np.array([arg]), format)[0] diff --git a/venv/Lib/site-packages/pandas/core/tools/numeric.py b/venv/Lib/site-packages/pandas/core/tools/numeric.py new file mode 100644 index 0000000..4939cbf --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/tools/numeric.py @@ -0,0 +1,194 @@ +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ( + ensure_object, + is_datetime_or_timedelta_dtype, + is_decimal, + is_number, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +import pandas as pd + + +def to_numeric(arg, errors="raise", downcast=None): + """ + Convert argument to a numeric type. + + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. 
+ + Parameters + ---------- + arg : scalar, list, tuple, 1-d array, or Series + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + - If 'ignore', then invalid parsing will return the input. + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + Returns + ------- + ret : numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise ndarray. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + numpy.ndarray.astype : Cast a numpy array to a specified type. + convert_dtypes : Convert dtypes. + + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> s = pd.Series(['1.0', '2', -3]) + >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = pd.Series(['apple', '1.0', '2', -3]) + >>> pd.to_numeric(s, errors='ignore') + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object + >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + """ + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") + + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") + + is_series = False + is_index = False + is_scalars = False + + if isinstance(arg, ABCSeries): + is_series = True + values = arg.values + elif isinstance(arg, ABCIndexClass): + is_index = True + values = arg.asi8 + if values is None: + values = arg.values + elif isinstance(arg, (list, tuple)): + values = np.array(arg, dtype="O") + elif is_scalar(arg): + if is_decimal(arg): + return float(arg) + if is_number(arg): + return arg + is_scalars = True + values = np.array([arg], dtype="O") + elif getattr(arg, "ndim", 1) > 1: + raise TypeError("arg must be a list, tuple, 1-d array, or Series") + else: + values = arg + + if is_numeric_dtype(values): + pass + elif is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = ensure_object(values) + coerce_numeric = errors not in ("ignore", "raise") + try: + values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=coerce_numeric + ) + except (ValueError, TypeError): + if errors == "raise": + raise + + # attempt downcast only if the data has been successfully converted + # to a 
numerical dtype and if a downcast method has been specified + if downcast is not None and is_numeric_dtype(values): + typecodes = None + + if downcast in ("integer", "signed"): + typecodes = np.typecodes["Integer"] + elif downcast == "unsigned" and np.min(values) >= 0: + typecodes = np.typecodes["UnsignedInteger"] + elif downcast == "float": + typecodes = np.typecodes["Float"] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for dtype in typecodes: + if np.dtype(dtype).itemsize <= values.dtype.itemsize: + values = maybe_downcast_to_dtype(values, dtype) + + # successful conversion + if values.dtype == dtype: + break + + if is_series: + return pd.Series(values, index=arg.index, name=arg.name) + elif is_index: + # because we want to coerce to numeric if possible, + # do not use _shallow_copy_with_infer + return pd.Index(values, name=arg.name) + elif is_scalars: + return values[0] + else: + return values diff --git a/venv/Lib/site-packages/pandas/core/tools/timedeltas.py b/venv/Lib/site-packages/pandas/core/tools/timedeltas.py new file mode 100644 index 0000000..3f0cfce --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/tools/timedeltas.py @@ -0,0 +1,157 @@ +""" +timedelta support tools +""" + +import numpy as np + +from pandas._libs.tslibs import NaT +from pandas._libs.tslibs.timedeltas import Timedelta, parse_timedelta_unit + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries + +from pandas.core.arrays.timedeltas import sequence_to_td64ns + + +def to_timedelta(arg, unit="ns", errors="raise"): + """ + Convert argument to timedelta. + + Timedeltas are absolute differences in times, expressed in difference + units (e.g. days, hours, minutes, seconds). This method converts + an argument from a recognized timedelta format / value into + a Timedelta type. + + Parameters + ---------- + arg : str, timedelta, list-like or Series + The data to be converted to timedelta. + unit : str, default 'ns' + Denotes the unit of the arg. Possible values: + ('Y', 'M', 'W', 'D', 'days', 'day', 'hours', hour', 'hr', + 'h', 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', + 'sec', 'second', 'ms', 'milliseconds', 'millisecond', + 'milli', 'millis', 'L', 'us', 'microseconds', 'microsecond', + 'micro', 'micros', 'U', 'ns', 'nanoseconds', 'nano', 'nanos', + 'nanosecond', 'N'). + + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + + Returns + ------- + timedelta64 or numpy.array of timedelta64 + Output type returned if parsing succeeded. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + convert_dtypes : Convert dtypes. 
+
+    Examples
+    --------
+
+    Parsing a single string to a Timedelta:
+
+    >>> pd.to_timedelta('1 days 06:05:01.00003')
+    Timedelta('1 days 06:05:01.000030')
+    >>> pd.to_timedelta('15.5us')
+    Timedelta('0 days 00:00:00.000015')
+
+    Parsing a list or array of strings:
+
+    >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan'])
+    TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015', NaT],
+                   dtype='timedelta64[ns]', freq=None)
+
+    Converting numbers by specifying the `unit` keyword argument:
+
+    >>> pd.to_timedelta(np.arange(5), unit='s')
+    TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02',
+                    '00:00:03', '00:00:04'],
+                   dtype='timedelta64[ns]', freq=None)
+    >>> pd.to_timedelta(np.arange(5), unit='d')
+    TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
+                   dtype='timedelta64[ns]', freq=None)
+    """
+    unit = parse_timedelta_unit(unit)
+
+    if errors not in ("ignore", "raise", "coerce"):
+        raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'")
+
+    if unit in {"Y", "y", "M"}:
+        raise ValueError(
+            "Units 'M' and 'Y' are no longer supported, as they do not "
+            "represent unambiguous timedelta durations."
+        )
+
+    if arg is None:
+        return arg
+    elif isinstance(arg, ABCSeries):
+        values = _convert_listlike(arg._values, unit=unit, errors=errors)
+        return arg._constructor(values, index=arg.index, name=arg.name)
+    elif isinstance(arg, ABCIndexClass):
+        return _convert_listlike(arg, unit=unit, errors=errors, name=arg.name)
+    elif isinstance(arg, np.ndarray) and arg.ndim == 0:
+        # extract array scalar and process below
+        arg = arg.item()
+    elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1:
+        return _convert_listlike(arg, unit=unit, errors=errors)
+    elif getattr(arg, "ndim", 1) > 1:
+        raise TypeError(
+            "arg must be a string, timedelta, list, tuple, 1-d array, or Series"
+        )
+
+    # ...so it must be a scalar value. Return scalar.
+    return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors)
+
+
+def _coerce_scalar_to_timedelta_type(r, unit="ns", errors="raise"):
+    """Convert string 'r' to a timedelta object."""
+
+    try:
+        result = Timedelta(r, unit)
+    except ValueError:
+        if errors == "raise":
+            raise
+        elif errors == "ignore":
+            return r
+
+        # coerce
+        result = NaT
+
+    return result
+
+
+def _convert_listlike(arg, unit="ns", errors="raise", name=None):
+    """Convert a list of objects to a timedelta index object."""
+
+    if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"):
+        # This is needed only to ensure that in the case where we end up
+        # returning arg (errors == "ignore"), and where the input is a
+        # generator, we return a useful list-like instead of a
+        # used-up generator
+        arg = np.array(list(arg), dtype=object)
+
+    try:
+        value = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0]
+    except ValueError:
+        if errors == "ignore":
+            return arg
+        else:
+            # This else-block accounts for the cases when errors='raise'
+            # and errors='coerce'. If errors == 'raise', these errors
+            # should be raised. If errors == 'coerce', we shouldn't
+            # expect any errors to be raised, since all parsing errors
+            # cause coercion to pd.NaT. However, if an error / bug is
+            # introduced that causes an Exception to be raised, we would
+            # like to surface it.
+ raise + + from pandas import TimedeltaIndex + + value = TimedeltaIndex(value, unit="ns", name=name) + return value diff --git a/venv/Lib/site-packages/pandas/core/util/__init__.py b/venv/Lib/site-packages/pandas/core/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/core/util/hashing.py b/venv/Lib/site-packages/pandas/core/util/hashing.py new file mode 100644 index 0000000..3366f10 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/util/hashing.py @@ -0,0 +1,359 @@ +""" +data hash pandas / numpy objects +""" +import itertools +from typing import Optional + +import numpy as np + +from pandas._libs import Timestamp +import pandas._libs.hashing as hashing + +from pandas.core.dtypes.cast import infer_dtype_from_scalar +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_extension_array_dtype, + is_list_like, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +# 16 byte long hashing key +_default_hash_key = "0123456789123456" + + +def _combine_hash_arrays(arrays, num_items: int): + """ + Parameters + ---------- + arrays : generator + num_items : int + + Should be the same as CPython's tupleobject.c + """ + try: + first = next(arrays) + except StopIteration: + return np.array([], dtype=np.uint64) + + arrays = itertools.chain([first], arrays) + + mult = np.uint64(1000003) + out = np.zeros_like(first) + np.uint64(0x345678) + for i, a in enumerate(arrays): + inverse_i = num_items - i + out ^= a + out *= mult + mult += np.uint64(82520 + inverse_i + inverse_i) + assert i + 1 == num_items, "Fed in wrong num_items" + out += np.uint64(97531) + return out + + +def hash_pandas_object( + obj, + index: bool = True, + encoding: str = "utf8", + hash_key: Optional[str] = _default_hash_key, + categorize: bool = True, +): + """ + Return a data hash of the Index/Series/DataFrame. + + Parameters + ---------- + index : bool, default True + Include the index in the hash (if Series/DataFrame). + encoding : str, default 'utf8' + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. 
+ + Returns + ------- + Series of uint64, same length as the object + """ + from pandas import Series + + if hash_key is None: + hash_key = _default_hash_key + + if isinstance(obj, ABCMultiIndex): + return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) + + elif isinstance(obj, ABCIndexClass): + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + h = Series(h, index=obj, dtype="uint64", copy=False) + + elif isinstance(obj, ABCSeries): + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + if index: + index_iter = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values + for _ in [None] + ) + arrays = itertools.chain([h], index_iter) + h = _combine_hash_arrays(arrays, 2) + + h = Series(h, index=obj.index, dtype="uint64", copy=False) + + elif isinstance(obj, ABCDataFrame): + hashes = (hash_array(series.values) for _, series in obj.items()) + num_items = len(obj.columns) + if index: + index_hash_generator = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values # noqa + for _ in [None] + ) + num_items += 1 + + # keep `hashes` specifically a generator to keep mypy happy + _hashes = itertools.chain(hashes, index_hash_generator) + hashes = (x for x in _hashes) + h = _combine_hash_arrays(hashes, num_items) + + h = Series(h, index=obj.index, dtype="uint64", copy=False) + else: + raise TypeError(f"Unexpected type for hashing {type(obj)}") + return h + + +def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): + """ + Hash an MultiIndex / list-of-tuples efficiently + + Parameters + ---------- + vals : MultiIndex, list-of-tuples, or single tuple + encoding : str, default 'utf8' + hash_key : str, default _default_hash_key + + Returns + ------- + ndarray of hashed values array + """ + is_tuple = False + if isinstance(vals, tuple): + vals = [vals] + is_tuple = True + elif not is_list_like(vals): + raise TypeError("must be convertible to a list-of-tuples") + + from pandas import Categorical, MultiIndex + + if not isinstance(vals, ABCMultiIndex): + vals = MultiIndex.from_tuples(vals) + + # create a list-of-Categoricals + vals = [ + Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) + for level in range(vals.nlevels) + ] + + # hash the list-of-ndarrays + hashes = ( + _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals + ) + h = _combine_hash_arrays(hashes, len(vals)) + if is_tuple: + h = h[0] + + return h + + +def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key): + """ + Hash a single tuple efficiently + + Parameters + ---------- + val : single tuple + encoding : str, default 'utf8' + hash_key : str, default _default_hash_key + + Returns + ------- + hash + + """ + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val) + + h = _combine_hash_arrays(hashes, len(val))[0] + + return h + + +def _hash_categorical(c, encoding: str, hash_key: str): + """ + Hash a Categorical by hashing its categories, and then mapping the codes + to the hashes + + Parameters + ---------- + c : Categorical + encoding : str + hash_key : str + + Returns + ------- + ndarray of hashed values array, same size as len(c) + """ + # Convert ExtensionArrays to ndarrays + values = np.asarray(c.categories.values) + hashed = hash_array(values, encoding, hash_key, 
categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construct the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH 15362 + + mask = c.isna() + if len(hashed): + result = hashed.take(c.codes) + else: + result = np.zeros(len(mask), dtype="uint64") + + if mask.any(): + result[mask] = np.iinfo(np.uint64).max + + return result + + +def hash_array( + vals, + encoding: str = "utf8", + hash_key: str = _default_hash_key, + categorize: bool = True, +): + """ + Given a 1d array, return an array of deterministic integers. + + Parameters + ---------- + vals : ndarray, Categorical + encoding : str, default 'utf8' + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + Returns + ------- + 1d uint64 numpy array of hash values, same length as the vals + """ + + if not hasattr(vals, "dtype"): + raise TypeError("must pass a ndarray-like") + dtype = vals.dtype + + # For categoricals, we hash the categories, then remap the codes to the + # hash values. (This check is above the complex check so that we don't ask + # numpy if categorical is a subdtype of complex, as it will choke). + if is_categorical_dtype(dtype): + return _hash_categorical(vals, encoding, hash_key) + elif is_extension_array_dtype(dtype): + vals, _ = vals._values_for_factorize() + dtype = vals.dtype + + # we'll be working with everything as 64-bit values, so handle this + # 128-bit value early + if np.issubdtype(dtype, np.complex128): + return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) + + # First, turn whatever array this is into unsigned 64-bit ints, if we can + # manage it. + elif isinstance(dtype, np.bool): + vals = vals.astype("u8") + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): + vals = vals.view("i8").astype("u8", copy=False) + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: + vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8") + else: + # With repeated values, its MUCH faster to categorize object dtypes, + # then hash and rename categories. We allow skipping the categorization + # when the values are known/likely to be unique. + if categorize: + from pandas import factorize, Categorical, Index + + codes, categories = factorize(vals, sort=False) + cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) + return _hash_categorical(cat, encoding, hash_key) + + try: + vals = hashing.hash_object_array(vals, hash_key, encoding) + except TypeError: + # we have mixed types + vals = hashing.hash_object_array( + vals.astype(str).astype(object), hash_key, encoding + ) + + # Then, redistribute these 64-bit ints within the space of 64-bit ints + vals ^= vals >> 30 + vals *= np.uint64(0xBF58476D1CE4E5B9) + vals ^= vals >> 27 + vals *= np.uint64(0x94D049BB133111EB) + vals ^= vals >> 31 + return vals + + +def _hash_scalar( + val, encoding: str = "utf8", hash_key: str = _default_hash_key +) -> np.ndarray: + """ + Hash scalar value. 
+ + Parameters + ---------- + val : scalar + encoding : str, default "utf8" + hash_key : str, default _default_hash_key + + Returns + ------- + 1d uint64 numpy array of hash value, of length 1 + """ + + if isna(val): + # this is to be consistent with the _hash_categorical implementation + return np.array([np.iinfo(np.uint64).max], dtype="u8") + + if getattr(val, "tzinfo", None) is not None: + # for tz-aware datetimes, we need the underlying naive UTC value and + # not the tz aware object or pd extension type (as + # infer_dtype_from_scalar would do) + if not isinstance(val, Timestamp): + val = Timestamp(val) + val = val.tz_convert(None) + + dtype, val = infer_dtype_from_scalar(val) + vals = np.array([val], dtype=dtype) + + return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False) diff --git a/venv/Lib/site-packages/pandas/core/window/__init__.py b/venv/Lib/site-packages/pandas/core/window/__init__.py new file mode 100644 index 0000000..dcf58a4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/__init__.py @@ -0,0 +1,3 @@ +from pandas.core.window.ewm import EWM # noqa:F401 +from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 +from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/venv/Lib/site-packages/pandas/core/window/common.py b/venv/Lib/site-packages/pandas/core/window/common.py new file mode 100644 index 0000000..64ec0e6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/common.py @@ -0,0 +1,326 @@ +"""Common utility functions for rolling operations""" +from collections import defaultdict +from typing import Callable, Optional +import warnings + +import numpy as np + +from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +import pandas.core.common as com +from pandas.core.generic import _shared_docs +from pandas.core.groupby.base import GroupByMixin +from pandas.core.indexes.api import MultiIndex + +_shared_docs = dict(**_shared_docs) +_doc_template = """ + Returns + ------- + Series or DataFrame + Return type is determined by the caller. + + See Also + -------- + Series.%(name)s : Series %(name)s. + DataFrame.%(name)s : DataFrame %(name)s. +""" + + +def _dispatch(name: str, *args, **kwargs): + """ + Dispatch to apply. + """ + + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer + + +class WindowGroupByMixin(GroupByMixin): + """ + Provide the groupby facilities. + """ + + def __init__(self, obj, *args, **kwargs): + kwargs.pop("parent", None) + groupby = kwargs.pop("groupby", None) + if groupby is None: + groupby, obj = obj, obj.obj + self._groupby = groupby + self._groupby.mutated = True + self._groupby.grouper.mutated = True + super().__init__(obj, *args, **kwargs) + + count = _dispatch("count") + corr = _dispatch("corr", other=None, pairwise=None) + cov = _dispatch("cov", other=None, pairwise=None) + + def _apply( + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, + ): + """ + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. + """ + kwargs.pop("floor", None) + + # TODO: can we de-duplicate with _dispatch? 
+ def f(x, name=name, *args): + x = self._shallow_copy(x) + + if isinstance(name, str): + return getattr(x, name)(*args, **kwargs) + + return x.apply(name, *args, **kwargs) + + return self._groupby.apply(f) + + +def _flex_binary_moment(arg1, arg2, f, pairwise=False): + + if not ( + isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) + and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) + ): + raise TypeError( + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" + ) + + if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( + arg2, (np.ndarray, ABCSeries) + ): + X, Y = prep_binary(arg1, arg2) + return f(X, Y) + + elif isinstance(arg1, ABCDataFrame): + from pandas import DataFrame + + def dataframe_from_int_dict(data, frame_template): + result = DataFrame(data, index=frame_template.index) + if len(result.columns) > 0: + result.columns = frame_template.columns[result.columns] + return result + + results = {} + if isinstance(arg2, ABCDataFrame): + if pairwise is False: + if arg1 is arg2: + # special case in order to handle duplicate column names + for i, col in enumerate(arg1.columns): + results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) + return dataframe_from_int_dict(results, arg1) + else: + if not arg1.columns.is_unique: + raise ValueError("'arg1' columns are not unique") + if not arg2.columns.is_unique: + raise ValueError("'arg2' columns are not unique") + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + X, Y = arg1.align(arg2, join="outer") + X = X + 0 * Y + Y = Y + 0 * X + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + return DataFrame(results, index=X.index, columns=res_columns) + elif pairwise is True: + results = defaultdict(dict) + for i, k1 in enumerate(arg1.columns): + for j, k2 in enumerate(arg2.columns): + if j < i and arg2 is arg1: + # Symmetric case + results[i][j] = results[j][i] + else: + results[i][j] = f( + *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) + + from pandas import concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + + # construct result frame + result = concat( + [ + concat( + [results[i][j] for j, c in enumerate(arg2.columns)], + ignore_index=True, + ) + for i, c in enumerate(arg1.columns) + ], + ignore_index=True, + axis=1, + ) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + result.index = MultiIndex.from_product( + arg2.columns.levels + [result_index] + ) + result = result.reorder_levels([2, 0, 1]).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), range(len(result_index))] + ) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + [result_index] + [arg2.columns] + ) + else: + + # empty result + result = DataFrame( + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), + columns=arg2.columns, + dtype="float64", + ) + + # reset our index names to arg1 names + # reset our column names to arg2 names + # careful not to mutate the original names + result.columns = result.columns.set_names(arg1.columns.names) + result.index = result.index.set_names( + result_index.names + arg2.columns.names + ) + + return result + + else: + raise ValueError("'pairwise' is not True/False") + else: + results = { + i: 
f(*prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns) + } + return dataframe_from_int_dict(results, arg1) + + else: + return _flex_binary_moment(arg2, arg1, f) + + +def _get_center_of_mass(comass, span, halflife, alpha): + valid_count = com.count_not_none(comass, span, halflife, alpha) + if valid_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + # Convert to center of mass; domain checks ensure 0 < alpha <= 1 + if comass is not None: + if comass < 0: + raise ValueError("comass must satisfy: comass >= 0") + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + comass = (span - 1) / 2.0 + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + decay = 1 - np.exp(np.log(0.5) / halflife) + comass = 1 / decay - 1 + elif alpha is not None: + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + comass = (1.0 - alpha) / alpha + else: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + + return float(comass) + + +def calculate_center_offset(window): + if not is_integer(window): + window = len(window) + return int((window - 1) / 2.0) + + +def calculate_min_periods( + window: int, + min_periods: Optional[int], + num_values: int, + required_min_periods: int, + floor: int, +) -> int: + """ + Calculates final minimum periods value for rolling aggregations. + + Parameters + ---------- + window : passed window value + min_periods : passed min periods value + num_values : total number of values + required_min_periods : required min periods per aggregation function + floor : required min periods per aggregation function + + Returns + ------- + min_periods : int + """ + if min_periods is None: + min_periods = window + else: + min_periods = max(required_min_periods, min_periods) + if min_periods > window: + raise ValueError(f"min_periods {min_periods} must be <= window {window}") + elif min_periods > num_values: + min_periods = num_values + 1 + elif min_periods < 0: + raise ValueError("min_periods must be >= 0") + return max(min_periods, floor) + + +def zsqrt(x): + with np.errstate(all="ignore"): + result = np.sqrt(x) + mask = x < 0 + + if isinstance(x, ABCDataFrame): + if mask.values.any(): + result[mask] = 0 + else: + if mask.any(): + result[mask] = 0 + + return result + + +def prep_binary(arg1, arg2): + if not isinstance(arg2, type(arg1)): + raise Exception("Input arrays must be of the same type!") + + # mask out values, this also makes a common index... 
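# Worked example for the _get_center_of_mass conversions above: span, halflife and
# alpha all map onto one center-of-mass value, so equivalent settings give the same
# EWM result (a sketch, not part of the pandas sources above).
import numpy as np
import pandas as pd

span = 5
comass = (span - 1) / 2.0                    # 2.0
alpha = 1.0 / (1.0 + comass)                 # 1/3
halflife = np.log(0.5) / np.log(1 - alpha)   # inverts alpha = 1 - exp(log(0.5) / halflife)

s = pd.Series([1.0, 2.0, 3.0, 4.0])
results = [
    s.ewm(span=span).mean(),
    s.ewm(com=comass).mean(),
    s.ewm(alpha=alpha).mean(),
    s.ewm(halflife=halflife).mean(),
]
assert all(np.allclose(results[0], r) for r in results[1:])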
+ X = arg1 + 0 * arg2 + Y = arg2 + 0 * arg1 + + return X, Y + + +def get_weighted_roll_func(cfunc: Callable) -> Callable: + def func(arg, window, min_periods=None): + if min_periods is None: + min_periods = len(window) + return cfunc(arg, window, min_periods) + + return func diff --git a/venv/Lib/site-packages/pandas/core/window/ewm.py b/venv/Lib/site-packages/pandas/core/window/ewm.py new file mode 100644 index 0000000..37e3cd4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/ewm.py @@ -0,0 +1,403 @@ +from textwrap import dedent + +import numpy as np + +import pandas._libs.window.aggregations as window_aggregations +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas.core.base import DataError +from pandas.core.window.common import ( + _doc_template, + _get_center_of_mass, + _shared_docs, + zsqrt, +) +from pandas.core.window.rolling import _flex_binary_moment, _Rolling + +_bias_template = """ + Parameters + ---------- + bias : bool, default False + Use a standard estimation bias correction. + *args, **kwargs + Arguments and keyword arguments to be passed into func. +""" + + +class EWM(_Rolling): + r""" + Provide exponential weighted functions. + + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass, + :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. + span : float, optional + Specify decay in terms of span, + :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + halflife : float, optional + Specify decay in terms of half-life, + :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + alpha : float, optional + Specify smoothing factor :math:`\alpha` directly, + :math:`0 < \alpha \leq 1`. + min_periods : int, default 0 + Minimum number of observations in window required to have a value + (otherwise result is NA). + adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings + (viewing EWMA as a moving average). + ignore_na : bool, default False + Ignore missing values when calculating weights; + specify True to reproduce pre-0.15.0 behavior. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. The value 0 identifies the rows, and 1 + identifies the columns. + + Returns + ------- + DataFrame + A Window sub-classed for the particular operation. + + See Also + -------- + rolling : Provides rolling window calculations. + expanding : Provides expanding transformations. + + Notes + ----- + Exactly one of center of mass, span, half-life, and alpha must be provided. + Allowed values and relationship between the parameters are specified in the + parameter descriptions above; see the link at the end of this section for + a detailed explanation. + + When adjust is True (default), weighted averages are calculated using + weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. + + When adjust is False, weighted averages are calculated recursively as: + weighted_average[0] = arg[0]; + weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. + + When ignore_na is False (default), weights are based on absolute positions. + For example, the weights of x and y used in calculating the final weighted + average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and + (1-alpha)**2 and alpha (if adjust is False). 
+ + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). + + More details can be found at + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + """ + _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] + + def __init__( + self, + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): + self.obj = obj + self.com = _get_center_of_mass(com, span, halflife, alpha) + self.min_periods = min_periods + self.adjust = adjust + self.ignore_na = ignore_na + self.axis = axis + self.on = None + + @property + def _constructor(self): + return EWM + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + def _apply(self, func, **kwargs): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. 
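# Minimal check of the adjust=True weighting described in the EWM notes above:
# y_t = sum((1 - alpha)**k * x_{t-k}) / sum((1 - alpha)**k). A sketch that skips
# NaN handling, so it is only compared on a NaN-free series (not pandas source code).
import numpy as np
import pandas as pd

alpha = 2.0 / 3.0                            # com=0.5  ->  alpha = 1 / (1 + com)
x = pd.Series([0.0, 1.0, 2.0, 4.0])

def ewm_adjusted(values, alpha):
    out = []
    for t in range(len(values)):
        weights = (1 - alpha) ** np.arange(t, -1, -1)    # oldest ... newest
        out.append(np.dot(weights, values[: t + 1]) / weights.sum())
    return np.array(out)

assert np.allclose(ewm_adjusted(x.to_numpy(), alpha), x.ewm(com=0.5).mean())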
+ + Parameters + ---------- + func : str/callable to apply + + Returns + ------- + y : same type as input argument + """ + blocks, obj = self._create_blocks() + block_list = list(blocks) + + results = [] + exclude = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") + + if values.size == 0: + results.append(values.copy()) + continue + + # if we have a string function name, wrap it + if isinstance(func, str): + cfunc = getattr(window_aggregations, func, None) + if cfunc is None: + raise ValueError( + f"we do not support this function in window_aggregations.{func}" + ) + + def func(arg): + return cfunc( + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + ) + + results.append(np.apply_along_axis(func, self.axis, values)) + + return self._wrap_results(results, block_list, obj, exclude) + + @Substitution(name="ewm") + @Appender(_doc_template) + def mean(self, *args, **kwargs): + """ + Exponential weighted moving average. + + Parameters + ---------- + *args, **kwargs + Arguments and keyword arguments to be passed into func. + """ + nv.validate_window_func("mean", args, kwargs) + return self._apply("ewma", **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def std(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving stddev. + """ + nv.validate_window_func("std", args, kwargs) + return zsqrt(self.var(bias=bias, **kwargs)) + + vol = std + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def var(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving variance. + """ + nv.validate_window_func("var", args, kwargs) + + def f(arg): + return window_aggregations.ewmcov( + arg, + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + + return self._apply(f, **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + def cov(self, other=None, pairwise=None, bias=False, **kwargs): + """ + Exponential weighted sample covariance. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction. + **kwargs + Keyword arguments to be passed into func. 
+ """ + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + def _get_cov(X, Y): + X = self._shallow_copy(X) + Y = self._shallow_copy(Y) + cov = window_aggregations.ewmcov( + X._prep_values(), + Y._prep_values(), + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + return X._wrap_result(cov) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) + + @Substitution(name="ewm") + @Appender(_doc_template) + def corr(self, other=None, pairwise=None, **kwargs): + """ + Exponential weighted sample correlation. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + **kwargs + Keyword arguments to be passed into func. + """ + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + def _get_corr(X, Y): + X = self._shallow_copy(X) + Y = self._shallow_copy(Y) + + def _cov(x, y): + return window_aggregations.ewmcov( + x, + y, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1, + ) + + x_values = X._prep_values() + y_values = Y._prep_values() + with np.errstate(all="ignore"): + cov = _cov(x_values, y_values) + x_var = _cov(x_values, x_values) + y_var = _cov(y_values, y_values) + corr = cov / zsqrt(x_var * y_var) + return X._wrap_result(corr) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) diff --git a/venv/Lib/site-packages/pandas/core/window/expanding.py b/venv/Lib/site-packages/pandas/core/window/expanding.py new file mode 100644 index 0000000..68c3514 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/expanding.py @@ -0,0 +1,259 @@ +from textwrap import dedent + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution + +from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs +from pandas.core.window.rolling import _Rolling_and_Expanding + + +class Expanding(_Rolling_and_Expanding): + """ + Provide expanding transformations. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). + center : bool, default False + Set the labels at the center of the window. + axis : int or str, default 0 + + Returns + ------- + a Window sub-classed for the particular operation + + See Also + -------- + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. 
+ + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + """ + + _attributes = ["min_periods", "center", "axis"] + + def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) + + @property + def _constructor(self): + return Expanding + + def _get_window(self, other=None, **kwargs): + """ + Get the window length over which to perform some operation. + + Parameters + ---------- + other : object, default None + The other object that is involved in the operation. + Such an object is involved for operations like covariance. + + Returns + ------- + window : int + The window length. + """ + axis = self.obj._get_axis(self.axis) + length = len(axis) + (other is not None) * len(axis) + + other = self.min_periods or -1 + return max(length, other) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.expanding.aggregate + DataFrame.rolling.aggregate + DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @Substitution(name="expanding") + @Appender(_shared_docs["count"]) + def count(self, **kwargs): + return super().count(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["apply"]) + def apply(self, func, raw=False, args=(), kwargs={}): + return super().apply(func, raw=raw, args=args, kwargs=kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["sum"]) + def sum(self, *args, **kwargs): + nv.validate_expanding_func("sum", args, kwargs) + return super().sum(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["max"]) + def max(self, *args, **kwargs): + nv.validate_expanding_func("max", args, kwargs) + return super().max(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["min"]) + def min(self, *args, **kwargs): + nv.validate_expanding_func("min", args, kwargs) + return super().min(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["mean"]) + def mean(self, *args, **kwargs): + nv.validate_expanding_func("mean", args, kwargs) + return super().mean(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["median"]) + def median(self, **kwargs): + return super().median(**kwargs) + + 
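# Quick sketch relating the Expanding API above to a cumulative calculation: an
# expanding mean with min_periods=1 is just the running mean of all rows seen so far,
# with NaNs skipped (illustrative, not part of the pandas sources above).
import numpy as np
import pandas as pd

s = pd.Series([0.0, 1.0, 2.0, np.nan, 4.0])
expanding_mean = s.expanding(min_periods=1).mean()
running_mean = s.fillna(0).cumsum() / s.notna().cumsum()

assert np.allclose(expanding_mean, running_mean)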
@Substitution(name="expanding", versionadded="") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("std", args, kwargs) + return super().std(ddof=ddof, **kwargs) + + @Substitution(name="expanding", versionadded="") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("var", args, kwargs) + return super().var(ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["skew"]) + def skew(self, **kwargs): + return super().skew(**kwargs) + + _agg_doc = dedent( + """ + Examples + -------- + + The example below will show an expanding calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") + -1.200000 + >>> print(f"{scipy.stats.kurtosis(arr, bias=False):.6f}") + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 + """ + ) + + @Appender(_agg_doc) + @Substitution(name="expanding") + @Appender(_shared_docs["kurt"]) + def kurt(self, **kwargs): + return super().kurt(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["cov"]) + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["corr"]) + def corr(self, other=None, pairwise=None, **kwargs): + return super().corr(other=other, pairwise=pairwise, **kwargs) + + +class ExpandingGroupby(WindowGroupByMixin, Expanding): + """ + Provide a expanding groupby implementation. + """ + + @property + def _constructor(self): + return Expanding diff --git a/venv/Lib/site-packages/pandas/core/window/indexers.py b/venv/Lib/site-packages/pandas/core/window/indexers.py new file mode 100644 index 0000000..0fa24a0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/indexers.py @@ -0,0 +1,122 @@ +"""Indexer objects for computing start/end window bounds for rolling operations""" +from typing import Optional, Tuple + +import numpy as np + +from pandas._libs.window.indexers import calculate_variable_window_bounds +from pandas.util._decorators import Appender + +get_window_bounds_doc = """ +Computes the bounds of a window. 
+ +Parameters +---------- +num_values : int, default 0 + number of values that will be aggregated over +window_size : int, default 0 + the number of rows in a window +min_periods : int, default None + min_periods passed from the top level rolling API +center : bool, default None + center passed from the top level rolling API +closed : str, default None + closed passed from the top level rolling API +win_type : str, default None + win_type passed from the top level rolling API + +Returns +------- +A tuple of ndarray[int64]s, indicating the boundaries of each +window +""" + + +class BaseIndexer: + """Base class for window bounds calculations""" + + def __init__( + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + ): + """ + Parameters + ---------- + **kwargs : + keyword arguments that will be available when get_window_bounds is called + """ + self.index_array = index_array + self.window_size = window_size + # Set user defined kwargs as attributes that can be used in get_window_bounds + for key, value in kwargs.items(): + setattr(self, key, value) + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer): + """Creates window boundaries that are of fixed length.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + start_s = np.zeros(self.window_size, dtype="int64") + start_e = ( + np.arange(self.window_size, num_values, dtype="int64") + - self.window_size + + 1 + ) + start = np.concatenate([start_s, start_e])[:num_values] + + end_s = np.arange(self.window_size, dtype="int64") + 1 + end_e = start_e + self.window_size + end = np.concatenate([end_s, end_e])[:num_values] + return start, end + + +class VariableWindowIndexer(BaseIndexer): + """Creates window boundaries that are of variable length, namely for time series.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return calculate_variable_window_bounds( + num_values, self.window_size, min_periods, center, closed, self.index_array, + ) + + +class ExpandingIndexer(BaseIndexer): + """Calculate expanding window bounds, mimicking df.expanding()""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return ( + np.zeros(num_values, dtype=np.int64), + np.arange(1, num_values + 1, dtype=np.int64), + ) diff --git a/venv/Lib/site-packages/pandas/core/window/numba_.py b/venv/Lib/site-packages/pandas/core/window/numba_.py new file mode 100644 index 0000000..1279579 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/numba_.py @@ -0,0 +1,127 @@ +import types +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + + +def make_rolling_apply( + func: Callable[..., Scalar], + args: Tuple, + nogil: 
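# Sketch of a custom window built on the BaseIndexer API above (BaseIndexer is exposed
# as pandas.api.indexers.BaseIndexer from pandas 1.0). get_window_bounds returns per-row
# [start, end) positions; here a forward-looking window of fixed size. Support for
# custom indexers in individual aggregations may vary across early 1.x releases, so
# this is an assumption-laden illustration, not pandas source code.
import numpy as np
import pandas as pd
from pandas.api.indexers import BaseIndexer

class ForwardIndexer(BaseIndexer):
    def get_window_bounds(self, num_values=0, min_periods=None, center=None, closed=None):
        start = np.arange(num_values, dtype=np.int64)
        end = np.minimum(start + self.window_size, num_values).astype(np.int64)
        return start, end

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
print(s.rolling(ForwardIndexer(window_size=2), min_periods=1).sum())
# expected: 3.0, 5.0, 7.0, 9.0, 5.0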
bool, + parallel: bool, + nopython: bool, +): + """ + Creates a JITted rolling apply function with a JITted version of + the user's function. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + args : tuple + *args to be passed into the function + nogil : bool + nogil parameter from engine_kwargs for numba.jit + parallel : bool + parallel parameter from engine_kwargs for numba.jit + nopython : bool + nopython parameter from engine_kwargs for numba.jit + + Returns + ------- + Numba function + """ + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + +def generate_numba_apply_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/venv/Lib/site-packages/pandas/core/window/rolling.py b/venv/Lib/site-packages/pandas/core/window/rolling.py new file mode 100644 index 0000000..c1e3475 --- /dev/null +++ b/venv/Lib/site-packages/pandas/core/window/rolling.py @@ -0,0 +1,2120 @@ +""" +Provide a generic structure to support window functions, +similar to how we have a Groupby object. 
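# Usage sketch for the numba-backed apply path above (requires the optional numba
# dependency and pandas >= 1.0; engine / engine_kwargs as documented for rolling.apply).
# This is illustrative only, not part of the pandas sources above.
import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype="float64"))

def window_mean(window):
    return np.mean(window)

# raw=True is required for engine="numba"; kwargs are rejected when nopython=True.
out = s.rolling(3).apply(
    window_mean,
    raw=True,
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)
print(out)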
+""" +from datetime import timedelta +from functools import partial +import inspect +from textwrap import dedent +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import numpy as np + +import pandas._libs.window.aggregations as window_aggregations +from pandas._typing import Axis, FrameOrSeries, Scalar +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution, cache_readonly + +from pandas.core.dtypes.common import ( + ensure_float64, + is_bool, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDateOffset, + ABCDatetimeIndex, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) + +from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin +import pandas.core.common as com +from pandas.core.indexes.api import Index, ensure_index +from pandas.core.window.common import ( + WindowGroupByMixin, + _doc_template, + _flex_binary_moment, + _shared_docs, + calculate_center_offset, + calculate_min_periods, + get_weighted_roll_func, + zsqrt, +) +from pandas.core.window.indexers import ( + BaseIndexer, + FixedWindowIndexer, + VariableWindowIndexer, +) +from pandas.core.window.numba_ import generate_numba_apply_func + + +class _Window(PandasObject, ShallowMixin, SelectionMixin): + _attributes: List[str] = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + ] + exclusions: Set[str] = set() + + def __init__( + self, + obj, + window=None, + min_periods: Optional[int] = None, + center: Optional[bool] = False, + win_type: Optional[str] = None, + axis: Axis = 0, + on: Optional[Union[str, Index]] = None, + closed: Optional[str] = None, + **kwargs, + ): + + self.__dict__.update(kwargs) + self.obj = obj + self.on = on + self.closed = closed + self.window = window + self.min_periods = min_periods + self.center = center + self.win_type = win_type + self.win_freq = None + self.axis = obj._get_axis_number(axis) if axis is not None else None + self.validate() + self._numba_func_cache: Dict[Optional[str], Callable] = dict() + + @property + def _constructor(self): + return Window + + @property + def is_datetimelike(self) -> Optional[bool]: + return None + + @property + def _on(self): + return None + + @property + def is_freq_type(self) -> bool: + return self.win_type == "freq" + + def validate(self) -> None: + if self.center is not None and not is_bool(self.center): + raise ValueError("center must be a boolean") + if self.min_periods is not None and not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") + if self.closed is not None and self.closed not in [ + "right", + "both", + "left", + "neither", + ]: + raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") + if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): + raise TypeError(f"invalid type: {type(self)}") + if isinstance(self.window, BaseIndexer): + self._validate_get_window_bounds_signature(self.window) + + @staticmethod + def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: + """ + Validate that the passed BaseIndexer subclass has + a get_window_bounds with the correct signature. 
+ """ + get_window_bounds_signature = inspect.signature( + window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(window).__name__} does not implement the correct signature for " + f"get_window_bounds" + ) + + def _create_blocks(self): + """ + Split data into blocks & return conformed data. + """ + + obj = self._selected_obj + + # filter out the on from the object + if self.on is not None and not isinstance(self.on, Index): + if obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) + blocks = obj._to_dict_of_blocks(copy=False).values() + + return blocks, obj + + def _gotitem(self, key, ndim, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : str / list of selections + ndim : 1,2 + requested ndim of result + subset : object, default None + subset to act on + """ + + # create a new object to prevent aliasing + if subset is None: + subset = self.obj + self = self._shallow_copy(subset) + self._reset_cache() + if subset.ndim == 2: + if is_scalar(key) and key in subset or is_list_like(key): + self._selection = key + return self + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + def _dir_additions(self): + return self.obj._dir_additions() + + def _get_win_type(self, kwargs: Dict): + """ + Exists for compatibility, overriden by subclass Window. + + Parameters + ---------- + kwargs : dict + ignored, exists for compatibility + + Returns + ------- + None + """ + return None + + def _get_window(self, other=None, win_type: Optional[str] = None) -> int: + """ + Return window length. + + Parameters + ---------- + other : + ignored, exists for compatibility + win_type : + ignored, exists for compatibility + + Returns + ------- + window : int + """ + if isinstance(self.window, BaseIndexer): + return self.min_periods or 0 + return self.window + + @property + def _window_type(self) -> str: + return type(self).__name__ + + def __repr__(self) -> str: + """ + Provide a nice str repr of our rolling object. 
+ """ + + attrs_list = ( + f"{attr_name}={getattr(self, attr_name)}" + for attr_name in self._attributes + if getattr(self, attr_name, None) is not None + ) + attrs = ",".join(attrs_list) + return f"{self._window_type} [{attrs}]" + + def __iter__(self): + url = "https://github.com/pandas-dev/pandas/issues/11704" + raise NotImplementedError(f"See issue #11704 {url}") + + def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: + """Convert input to numpy arrays for Cython routines""" + if values is None: + values = getattr(self._selected_obj, "values", self._selected_obj) + + # GH #12373 : rolling functions error on float32 data + # make sure the data is coerced to float64 + if is_float_dtype(values.dtype): + values = ensure_float64(values) + elif is_integer_dtype(values.dtype): + values = ensure_float64(values) + elif needs_i8_conversion(values.dtype): + raise NotImplementedError( + f"ops for {self._window_type} for this " + f"dtype {values.dtype} are not implemented" + ) + else: + try: + values = ensure_float64(values) + except (ValueError, TypeError): + raise TypeError(f"cannot handle this type -> {values.dtype}") + + # Convert inf to nan for C funcs + inf = np.isinf(values) + if inf.any(): + values = np.where(inf, np.nan, values) + + return values + + def _wrap_result(self, result, block=None, obj=None): + """ + Wrap a single result. + """ + + if obj is None: + obj = self._selected_obj + index = obj.index + + if isinstance(result, np.ndarray): + + if result.ndim == 1: + from pandas import Series + + return Series(result, index, name=obj.name) + + return type(obj)(result, index=index, columns=block.columns) + return result + + def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: + """ + Wrap the results. + + Parameters + ---------- + results : list of ndarrays + blocks : list of blocks + obj : conformed data (may be resampled) + exclude: list of columns to exclude, default to None + """ + + from pandas import Series, concat + + final = [] + for result, block in zip(results, blocks): + + result = self._wrap_result(result, block=block, obj=obj) + if result.ndim == 1: + return result + final.append(result) + + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns + if self.on is not None and not self._on.equals(obj.index): + + name = self._on.name + final.append(Series(self._on, index=obj.index, name=name)) + + if self._selection is not None: + + selection = ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + + # exclude nuisance columns so that they are not reindexed + if exclude is not None and exclude: + columns = [c for c in columns if c not in exclude] + + if not columns: + raise DataError("No numeric types to aggregate") + + if not len(final): + return obj.astype("float64") + return concat(final, axis=1).reindex(columns=columns, copy=False) + + def _center_window(self, result, window) -> np.ndarray: + """ + Center the result in the window. + """ + if self.axis > result.ndim - 1: + raise ValueError("Requested axis is larger then no. 
of argument dimensions") + + offset = calculate_center_offset(window) + if offset > 0: + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) + return result + + def _get_roll_func(self, func_name: str) -> Callable: + """ + Wrap rolling function to check values passed. + + Parameters + ---------- + func_name : str + Cython function used to calculate rolling statistics + + Returns + ------- + func : callable + """ + window_func = getattr(window_aggregations, func_name, None) + if window_func is None: + raise ValueError( + f"we do not support this function in window_aggregations.{func_name}" + ) + return window_func + + def _get_cython_func_type(self, func: str) -> Callable: + """ + Return a variable or fixed cython function type. + + Variable algorithms do not use window while fixed do. + """ + if self.is_freq_type or isinstance(self.window, BaseIndexer): + return self._get_roll_func(f"{func}_variable") + return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) + + def _get_window_indexer(self, window: int) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + if isinstance(self.window, BaseIndexer): + return self.window + if self.is_freq_type: + return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) + return FixedWindowIndexer(window_size=window) + + def _apply( + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, + ): + """ + Rolling statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. + + Parameters + ---------- + func : callable function to apply + center : bool + require_min_periods : int + floor : int + is_weighted : bool + name : str, + compatibility with groupby.rolling + use_numba_cache : bool + whether to cache a numba compiled function. 
Only available for numba + enabled methods (so far only apply) + **kwargs + additional arguments for rolling function and window function + + Returns + ------- + y : type of input + """ + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) + + blocks, obj = self._create_blocks() + block_list = list(blocks) + window_indexer = self._get_window_indexer(window) + + results = [] + exclude: List[Scalar] = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") + + if values.size == 0: + results.append(values.copy()) + continue + + # calculation function + offset = calculate_center_offset(window) if center else 0 + additional_nans = np.array([np.nan] * offset) + + if not is_weighted: + + def calc(x): + x = np.concatenate((x, additional_nans)) + if not isinstance(window, BaseIndexer): + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor + ) + else: + min_periods = calculate_min_periods( + self.min_periods or 1, + self.min_periods, + len(x), + require_min_periods, + floor, + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x), + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + ) + return func(x, start, end, min_periods) + + else: + + def calc(x): + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods) + + with np.errstate(all="ignore"): + if values.ndim > 1: + result = np.apply_along_axis(calc, self.axis, values) + else: + result = calc(values) + result = np.asarray(result) + + if use_numba_cache: + self._numba_func_cache[name] = func + + if center: + result = self._center_window(result, window) + + results.append(result) + + return self._wrap_results(results, block_list, obj, exclude) + + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) + if result is None: + return self.apply(func, raw=False, args=args, kwargs=kwargs) + return result + + agg = aggregate + + _shared_docs["sum"] = dedent( + """ + Calculate %(name)s sum of given DataFrame or Series. + + Parameters + ---------- + *args, **kwargs + For compatibility with other %(name)s methods. Has no effect + on the computed value. + + Returns + ------- + Series or DataFrame + Same type as the input, with the same index, containing the + %(name)s sum. + + See Also + -------- + Series.sum : Reducing sum for Series. + DataFrame.sum : Reducing sum for DataFrame. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4, 5]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + dtype: int64 + + >>> s.rolling(3).sum() + 0 NaN + 1 NaN + 2 6.0 + 3 9.0 + 4 12.0 + dtype: float64 + + >>> s.expanding(3).sum() + 0 NaN + 1 NaN + 2 6.0 + 3 10.0 + 4 15.0 + dtype: float64 + + >>> s.rolling(3, center=True).sum() + 0 NaN + 1 6.0 + 2 9.0 + 3 12.0 + 4 NaN + dtype: float64 + + For DataFrame, each %(name)s sum is computed column-wise. + + >>> df = pd.DataFrame({"A": s, "B": s ** 2}) + >>> df + A B + 0 1 1 + 1 2 4 + 2 3 9 + 3 4 16 + 4 5 25 + + >>> df.rolling(3).sum() + A B + 0 NaN NaN + 1 NaN NaN + 2 6.0 14.0 + 3 9.0 29.0 + 4 12.0 50.0 + """ + ) + + _shared_docs["mean"] = dedent( + """ + Calculate the %(name)s mean of the values. + + Parameters + ---------- + *args + Under Review. + **kwargs + Under Review. 
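# Small usage sketch for the aggregate/agg entry point above: string names dispatch to
# the corresponding rolling methods, and a list applies several aggregations at once
# (illustrative, not part of the pandas sources above).
import pandas as pd

df = pd.DataFrame({"A": [1.0, 2.0, 3.0, 4.0, 5.0]})
print(df.rolling(3).agg("mean"))            # same as df.rolling(3).mean()
print(df.rolling(3).agg(["sum", "mean"]))   # one column level per aggregation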
+ + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.mean : Equivalent method for Series. + DataFrame.mean : Equivalent method for DataFrame. + + Examples + -------- + The below examples will show rolling mean calculations with window sizes of + two and three, respectively. + + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).mean() + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> s.rolling(3).mean() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + dtype: float64 + """ + ) + + _shared_docs["var"] = dedent( + """ + Calculate unbiased %(name)s variance. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.var : Equivalent method for Series. + DataFrame.var : Equivalent method for DataFrame. + numpy.var : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in :meth:`Series.var` is different than the + default `ddof` of 0 in :func:`numpy.var`. + + A minimum of 1 period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 0.916667 + 4 0.800000 + 5 0.700000 + 6 0.619048 + dtype: float64 + """ + ) + + _shared_docs["std"] = dedent( + """ + Calculate %(name)s standard deviation. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.std : Equivalent method for Series. + DataFrame.std : Equivalent method for DataFrame. + numpy.std : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in Series.std is different than the default + `ddof` of 0 in numpy.std. + + A minimum of one period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 + """ + ) + + +class Window(_Window): + """ + Provide rolling window calculations. 
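# Worked check of the ddof note above: rolling var/std normalise by N - ddof with
# ddof=1 by default, unlike numpy's default ddof=0 (illustrative sketch).
import numpy as np
import pandas as pd

s = pd.Series([5.0, 5.0, 6.0, 7.0, 5.0])
window = s.to_numpy()[1:4]                  # the window ending at index 3: [5, 6, 7]

assert np.isclose(s.rolling(3).var()[3], np.var(window, ddof=1))   # 1.0
assert np.isclose(s.rolling(3).std()[3], np.std(window, ddof=1))   # 1.0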
+ + Parameters + ---------- + window : int, offset, or BaseIndexer subclass + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. + + If a BaseIndexer subclass is passed, calculates the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely `min_periods`, `center`, and + `closed` will be passed to `get_window_bounds`. + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). For a window that is specified by an offset, + `min_periods` will default to 1. Otherwise, `min_periods` will default + to the size of the window. + center : bool, default False + Set the labels at the center of the window. + win_type : str, default None + Provide a window type. If ``None``, all points are evenly weighted. + See the notes below for further information. + on : str, optional + For a DataFrame, a datetime-like column or MultiIndex level on which + to calculate the rolling window, rather than the DataFrame's index. + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + axis : int or str, default 0 + closed : str, default None + Make the interval closed on the 'right', 'left', 'both' or + 'neither' endpoints. + For offset-based windows, it defaults to 'right'. + For fixed windows, defaults to 'both'. Remaining cases not implemented + for fixed windows. + + Returns + ------- + a Window or Rolling sub-classed for the particular operation + + See Also + -------- + expanding : Provides expanding transformations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: + + * ``boxcar`` + * ``triang`` + * ``blackman`` + * ``hamming`` + * ``bartlett`` + * ``parzen`` + * ``bohman`` + * ``blackmanharris`` + * ``nuttall`` + * ``barthann`` + * ``kaiser`` (needs beta) + * ``gaussian`` (needs std) + * ``general_gaussian`` (needs power, width) + * ``slepian`` (needs width) + * ``exponential`` (needs tau), center is set to None. + + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum() + B + 0 NaN + 1 0.5 + 2 1.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, using the 'gaussian' + window type (note how we need to specify std). + + >>> df.rolling(2, win_type='gaussian').sum(std=3) + B + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. 
+ + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicitly set the min_periods + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ... index = [pd.Timestamp('20130101 09:00:00'), + ... pd.Timestamp('20130101 09:00:02'), + ... pd.Timestamp('20130101 09:00:03'), + ... pd.Timestamp('20130101 09:00:05'), + ... pd.Timestamp('20130101 09:00:06')]) + + >>> df + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. + + >>> df.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + """ + + def validate(self): + super().validate() + + window = self.window + if isinstance(window, BaseIndexer): + raise NotImplementedError( + "BaseIndexer subclasses not implemented with win_types." + ) + elif isinstance(window, (list, tuple, np.ndarray)): + pass + elif is_integer(window): + if window <= 0: + raise ValueError("window must be > 0 ") + import_optional_dependency( + "scipy", extra="Scipy is required to generate window weight." + ) + import scipy.signal as sig + + if not isinstance(self.win_type, str): + raise ValueError(f"Invalid win_type {self.win_type}") + if getattr(sig, self.win_type, None) is None: + raise ValueError(f"Invalid win_type {self.win_type}") + else: + raise ValueError(f"Invalid window {window}") + + def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]: + """ + Extract arguments for the window type, provide validation for it + and return the validated window type. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + win_type : str, or tuple + """ + # the below may pop from kwargs + def _validate_win_type(win_type, kwargs): + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } + + if win_type in arg_map: + win_args = _pop_args(win_type, arg_map[win_type], kwargs) + if win_type == "exponential": + # exponential window requires the first arg (center) + # to be set to None (necessary for symmetric window) + win_args.insert(0, None) + + return tuple([win_type] + win_args) + + return win_type + + def _pop_args(win_type, arg_names, kwargs): + all_args = [] + for n in arg_names: + if n not in kwargs: + raise ValueError(f"{win_type} window requires {n}") + all_args.append(kwargs.pop(n)) + return all_args + + return _validate_win_type(self.win_type, kwargs) + + def _get_window( + self, other=None, win_type: Optional[Union[str, Tuple]] = None + ) -> np.ndarray: + """ + Get the window, weights. + + Parameters + ---------- + other : + ignored, exists for compatibility + win_type : str, or tuple + type of window to create + + Returns + ------- + window : ndarray + the window, weights + """ + + window = self.window + if isinstance(window, (list, tuple, np.ndarray)): + return com.asarray_tuplesafe(window).astype(float) + elif is_integer(window): + import scipy.signal as sig + + # GH #15662. `False` makes symmetric window, rather than periodic. 
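# Sketch of how the weighted (win_type) path above builds its weights: they come from
# scipy.signal.get_window, and the weighted rolling sum is the dot product of those
# weights with the values in each window (requires scipy; illustrative only).
import numpy as np
import pandas as pd
import scipy.signal as sig

weights = sig.get_window("triang", 2, False)          # array([0.5, 0.5])
df = pd.DataFrame({"B": [0.0, 1.0, 2.0]})
rolled = df.rolling(2, win_type="triang").sum()

assert np.isclose(rolled["B"][1], np.dot(weights, [0.0, 1.0]))   # 0.5
assert np.isclose(rolled["B"][2], np.dot(weights, [1.0, 2.0]))   # 1.5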
+ return sig.get_window(win_type, window, False).astype(float) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + pandas.DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.rolling(3, win_type='boxcar').agg('mean') + A B C + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 -0.885035 0.212600 -0.711689 + 3 -0.323928 -0.200122 -1.093408 + 4 -0.071445 -0.431533 -1.075833 + 5 0.504739 0.676083 -0.996353 + 6 0.358206 1.903256 -0.774200 + 7 0.906020 1.283573 0.085482 + 8 -0.096361 0.818139 0.472290 + 9 0.070889 0.134399 -0.031308 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) + if result is None: + + # these must apply directly + result = func(self) + + return result + + agg = aggregate + + @Substitution(name="window") + @Appender(_shared_docs["sum"]) + def sum(self, *args, **kwargs): + nv.validate_window_func("sum", args, kwargs) + window_func = self._get_roll_func("roll_weighted_sum") + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="sum", **kwargs + ) + + @Substitution(name="window") + @Appender(_shared_docs["mean"]) + def mean(self, *args, **kwargs): + nv.validate_window_func("mean", args, kwargs) + window_func = self._get_roll_func("roll_weighted_mean") + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="mean", **kwargs + ) + + @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func("var", args, kwargs) + window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) + window_func = get_weighted_roll_func(window_func) + kwargs.pop("name", None) + return self._apply( + window_func, center=self.center, is_weighted=True, name="var", **kwargs + ) + + @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func("std", args, kwargs) + return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) + + +class _Rolling(_Window): + @property + def _constructor(self): + return Rolling + + +class _Rolling_and_Expanding(_Rolling): + + _shared_docs["count"] = dedent( + r""" + The %(name)s count of any non-NaN observations inside the window. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + DataFrame.count : Count of the full DataFrame. 
+ + Examples + -------- + >>> s = pd.Series([2, 3, np.nan, 10]) + >>> s.rolling(2).count() + 0 1.0 + 1 2.0 + 2 1.0 + 3 1.0 + dtype: float64 + >>> s.rolling(3).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 2.0 + dtype: float64 + >>> s.rolling(4).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + dtype: float64 + """ + ) + + def count(self): + + blocks, obj = self._create_blocks() + results = [] + for b in blocks: + result = b.notna().astype(int) + result = self._constructor( + result, + window=self._get_window(), + min_periods=self.min_periods or 0, + center=self.center, + axis=self.axis, + closed=self.closed, + ).sum() + results.append(result) + + return self._wrap_results(results, blocks, obj) + + _shared_docs["apply"] = dedent( + r""" + The %(name)s function's apply function. + + Parameters + ---------- + func : function + Must produce a single value from an ndarray input if ``raw=True`` + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + .. versionchanged:: 1.0.0 + + raw : bool, default None + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray + objects instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + engine : str, default 'cython' + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + + .. versionadded:: 1.0.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + + args : tuple, default None + Positional arguments to be passed into func. + kwargs : dict, default None + Keyword arguments to be passed into func. + + Returns + ------- + Series or DataFrame + Return type is determined by the caller. + + See Also + -------- + Series.%(name)s : Series %(name)s. + DataFrame.%(name)s : DataFrame %(name)s. + + Notes + ----- + See :ref:`stats.rolling_apply` for extended documentation and performance + considerations for the Numba engine. 
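+
+    Examples
+    --------
+    A minimal, illustrative sketch of applying a simple reduction with
+    ``raw=True`` (output assumes the default ``'cython'`` engine):
+
+    >>> s = pd.Series([1, 2, 3, 4])
+    >>> s.rolling(2).apply(lambda x: x.sum(), raw=True)
+    0    NaN
+    1    3.0
+    2    5.0
+    3    7.0
+    dtype: float64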
+ """ + ) + + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + ): + if args is None: + args = () + if kwargs is None: + kwargs = {} + kwargs.pop("_level", None) + kwargs.pop("floor", None) + window = self._get_window() + offset = calculate_center_offset(window) if self.center else 0 + if not is_bool(raw): + raise ValueError("raw parameter must be `True` or `False`") + + if engine == "cython": + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + elif engine == "numba": + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + if func in self._numba_func_cache: + # Return an already compiled version of roll_apply if available + apply_func = self._numba_func_cache[func] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + # TODO: Why do we always pass center=False? + # name=func for WindowGroupByMixin._apply + return self._apply( + apply_func, + center=False, + floor=0, + name=func, + use_numba_cache=engine == "numba", + ) + + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): + from pandas import Series + + window_func = partial( + self._get_cython_func_type("roll_generic"), + args=args, + kwargs=kwargs, + raw=raw, + offset=offset, + func=func, + ) + + def apply_func(values, begin, end, min_periods, raw=raw): + if not raw: + values = Series(values, index=self.obj.index) + return window_func(values, begin, end, min_periods) + + return apply_func + + def sum(self, *args, **kwargs): + nv.validate_window_func("sum", args, kwargs) + window_func = self._get_cython_func_type("roll_sum") + kwargs.pop("floor", None) + return self._apply( + window_func, center=self.center, floor=0, name="sum", **kwargs + ) + + _shared_docs["max"] = dedent( + """ + Calculate the %(name)s maximum. + + Parameters + ---------- + *args, **kwargs + Arguments and keyword arguments to be passed into func. + """ + ) + + def max(self, *args, **kwargs): + nv.validate_window_func("max", args, kwargs) + window_func = self._get_cython_func_type("roll_max") + return self._apply(window_func, center=self.center, name="max", **kwargs) + + _shared_docs["min"] = dedent( + """ + Calculate the %(name)s minimum. + + Parameters + ---------- + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with a Series. + DataFrame.%(name)s : Calling object with a DataFrame. + Series.min : Similar method for Series. + DataFrame.min : Similar method for DataFrame. + + Examples + -------- + Performing a rolling minimum with a window size of 3. 
+ + >>> s = pd.Series([4, 3, 5, 2, 6]) + >>> s.rolling(3).min() + 0 NaN + 1 NaN + 2 3.0 + 3 2.0 + 4 2.0 + dtype: float64 + """ + ) + + def min(self, *args, **kwargs): + nv.validate_window_func("min", args, kwargs) + window_func = self._get_cython_func_type("roll_min") + return self._apply(window_func, center=self.center, name="min", **kwargs) + + def mean(self, *args, **kwargs): + nv.validate_window_func("mean", args, kwargs) + window_func = self._get_cython_func_type("roll_mean") + return self._apply(window_func, center=self.center, name="mean", **kwargs) + + _shared_docs["median"] = dedent( + """ + Calculate the %(name)s median. + + Parameters + ---------- + **kwargs + For compatibility with other %(name)s methods. Has no effect + on the computed median. + + Returns + ------- + Series or DataFrame + Returned type is the same as the original object. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.median : Equivalent method for Series. + DataFrame.median : Equivalent method for DataFrame. + + Examples + -------- + Compute the rolling median of a series with a window size of 3. + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.rolling(3).median() + 0 NaN + 1 NaN + 2 1.0 + 3 2.0 + 4 3.0 + dtype: float64 + """ + ) + + def median(self, **kwargs): + window_func = self._get_roll_func("roll_median_c") + window_func = partial(window_func, win=self._get_window()) + return self._apply(window_func, center=self.center, name="median", **kwargs) + + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func("std", args, kwargs) + kwargs.pop("require_min_periods", None) + window_func = self._get_cython_func_type("roll_var") + + def zsqrt_func(values, begin, end, min_periods): + return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + + # ddof passed again for compat with groupby.rolling + return self._apply( + zsqrt_func, + center=self.center, + require_min_periods=1, + name="std", + ddof=ddof, + **kwargs, + ) + + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func("var", args, kwargs) + kwargs.pop("require_min_periods", None) + window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) + # ddof passed again for compat with groupby.rolling + return self._apply( + window_func, + center=self.center, + require_min_periods=1, + name="var", + ddof=ddof, + **kwargs, + ) + + _shared_docs[ + "skew" + ] = """ + Unbiased %(name)s skewness. + + Parameters + ---------- + **kwargs + Keyword arguments to be passed into func. + """ + + def skew(self, **kwargs): + window_func = self._get_cython_func_type("roll_skew") + kwargs.pop("require_min_periods", None) + return self._apply( + window_func, + center=self.center, + require_min_periods=3, + name="skew", + **kwargs, + ) + + _shared_docs["kurt"] = dedent( + """ + Calculate unbiased %(name)s kurtosis. + + This function uses Fisher's definition of kurtosis without bias. + + Parameters + ---------- + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.kurt : Equivalent method for Series. + DataFrame.kurt : Equivalent method for DataFrame. + scipy.stats.skew : Third moment of a probability density. + scipy.stats.kurtosis : Reference SciPy method. 
+ + Notes + ----- + A minimum of 4 periods is required for the %(name)s calculation. + """ + ) + + def kurt(self, **kwargs): + window_func = self._get_cython_func_type("roll_kurt") + kwargs.pop("require_min_periods", None) + return self._apply( + window_func, + center=self.center, + require_min_periods=4, + name="kurt", + **kwargs, + ) + + _shared_docs["quantile"] = dedent( + """ + Calculate the %(name)s quantile. + + Parameters + ---------- + quantile : float + Quantile to compute. 0 <= quantile <= 1. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + .. versionadded:: 0.23.0 + + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + **kwargs + For compatibility with other %(name)s methods. Has no effect on + the result. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.quantile : Computes value at the given quantile over all data + in Series. + DataFrame.quantile : Computes values at the given quantile over + requested axis in DataFrame. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).quantile(.4, interpolation='lower') + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + + >>> s.rolling(2).quantile(.4, interpolation='midpoint') + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + """ + ) + + def quantile(self, quantile, interpolation="linear", **kwargs): + if quantile == 1.0: + window_func = self._get_cython_func_type("roll_max") + elif quantile == 0.0: + window_func = self._get_cython_func_type("roll_min") + else: + window_func = partial( + self._get_roll_func("roll_quantile"), + win=self._get_window(), + quantile=quantile, + interpolation=interpolation, + ) + + # Pass through for groupby.rolling + kwargs["quantile"] = quantile + kwargs["interpolation"] = interpolation + return self._apply(window_func, center=self.center, name="quantile", **kwargs) + + _shared_docs[ + "cov" + ] = """ + Calculate the %(name)s sample covariance. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + **kwargs + Keyword arguments to be passed into func. 
+ """ + + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + # GH 16058: offset window + if self.is_freq_type: + window = self.win_freq + else: + window = self._get_window(other) + + def _get_cov(X, Y): + # GH #12373 : rolling functions error on float32 data + # to avoid potential overflow, cast the data to float64 + X = X.astype("float64") + Y = Y.astype("float64") + mean = lambda x: x.rolling( + window, self.min_periods, center=self.center + ).mean(**kwargs) + count = ( + (X + Y) + .rolling(window=window, min_periods=0, center=self.center) + .count(**kwargs) + ) + bias_adj = count / (count - ddof) + return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) + + _shared_docs["corr"] = dedent( + """ + Calculate %(name)s correlation. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self. + pairwise : bool, default None + Calculate pairwise combinations of columns within a + DataFrame. If `other` is not specified, defaults to `True`, + otherwise defaults to `False`. + Not relevant for :class:`~pandas.Series`. + **kwargs + Unused. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the + %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.corr : Equivalent method for Series. + DataFrame.corr : Equivalent method for DataFrame. + %(name)s.cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. + + Notes + ----- + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. + + Function will return ``NaN`` for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. + + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used. + + Examples + -------- + The below example shows a rolling calculation with a window size of + four matching the equivalent function call using :meth:`numpy.corrcoef`. + + >>> v1 = [3, 3, 3, 5, 8] + >>> v2 = [3, 4, 4, 4, 8] + >>> # numpy returns a 2X2 array, the correlation coefficient + >>> # is the number at entry [0][1] + >>> print(f"{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}") + 0.333333 + >>> print(f"{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}") + 0.916949 + >>> s1 = pd.Series(v1) + >>> s2 = pd.Series(v2) + >>> s1.rolling(4).corr(s2) + 0 NaN + 1 NaN + 2 NaN + 3 0.333333 + 4 0.916949 + dtype: float64 + + The below example shows a similar rolling calculation on a + DataFrame using the pairwise option. + + >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ + [46., 31.], [50., 36.]]) + >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) + [[1. 
0.6263001] + [0.6263001 1. ]] + >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) + [[1. 0.5553681] + [0.5553681 1. ]] + >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> df + X Y + 0 51.0 35.0 + 1 49.0 30.0 + 2 47.0 32.0 + 3 46.0 31.0 + 4 50.0 36.0 + >>> df.rolling(4).corr(pairwise=True) + X Y + 0 X NaN NaN + Y NaN NaN + 1 X NaN NaN + Y NaN NaN + 2 X NaN NaN + Y NaN NaN + 3 X 1.000000 0.626300 + Y 0.626300 1.000000 + 4 X 1.000000 0.555368 + Y 0.555368 1.000000 + """ + ) + + def corr(self, other=None, pairwise=None, **kwargs): + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + window = self._get_window(other) + + def _get_corr(a, b): + a = a.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) + b = b.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) + + return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) + + +class Rolling(_Rolling_and_Expanding): + @cache_readonly + def is_datetimelike(self) -> bool: + return isinstance( + self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) + ) + + @cache_readonly + def _on(self) -> Index: + if self.on is None: + if self.axis == 0: + return self.obj.index + else: + # i.e. self.axis == 1 + return self.obj.columns + elif isinstance(self.on, Index): + return self.on + elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: + return Index(self.obj[self.on]) + else: + raise ValueError( + f"invalid on specified as {self.on}, " + "must be a column (of DataFrame), an Index " + "or None" + ) + + def validate(self): + super().validate() + + # we allow rolling on a datetimelike index + if (self.obj.empty or self.is_datetimelike) and isinstance( + self.window, (str, ABCDateOffset, timedelta) + ): + + self._validate_monotonic() + freq = self._validate_freq() + + # we don't allow center + if self.center: + raise NotImplementedError( + "center is not implemented " + "for datetimelike and offset " + "based windows" + ) + + # this will raise ValueError on non-fixed freqs + self.win_freq = self.window + self.window = freq.nanos + self.win_type = "freq" + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + elif isinstance(self.window, BaseIndexer): + # Passed BaseIndexer subclass should handle all other rolling kwargs + return + elif not is_integer(self.window): + raise ValueError("window must be an integer") + elif self.window < 0: + raise ValueError("window must be non-negative") + + if not self.is_datetimelike and self.closed is not None: + raise ValueError( + "closed only implemented for datetimelike and offset based windows" + ) + + def _validate_monotonic(self): + """ + Validate monotonic (increasing or decreasing). + """ + if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): + formatted = self.on + if self.on is None: + formatted = "index" + raise ValueError(f"{formatted} must be monotonic") + + def _validate_freq(self): + """ + Validate & return window frequency. 
+ """ + from pandas.tseries.frequencies import to_offset + + try: + return to_offset(self.window) + except (TypeError, ValueError): + raise ValueError( + f"passed window {self.window} is not " + "compatible with a datetimelike " + "index" + ) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + Series.rolling + DataFrame.rolling + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.rolling(3).sum() + A B C + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 -2.655105 0.637799 -2.135068 + 3 -0.971785 -0.600366 -3.280224 + 4 -0.214334 -1.294599 -3.227500 + 5 1.514216 2.028250 -2.989060 + 6 1.074618 5.709767 -2.322600 + 7 2.718061 3.850718 0.256446 + 8 -0.289082 2.454418 1.416871 + 9 0.212668 0.403198 -0.093924 + + >>> df.rolling(3).agg({'A':'sum', 'B':'min'}) + A B + 0 NaN NaN + 1 NaN NaN + 2 -2.655105 -0.165272 + 3 -0.971785 -1.340923 + 4 -0.214334 -1.340923 + 5 1.514216 -1.340923 + 6 1.074618 0.211596 + 7 2.718061 -1.647453 + 8 -0.289082 -1.647453 + 9 0.212668 -1.647453 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @Substitution(name="rolling") + @Appender(_shared_docs["count"]) + def count(self): + + # different impl for freq counting + if self.is_freq_type: + window_func = self._get_roll_func("roll_count") + return self._apply(window_func, center=self.center, name="count") + + return super().count() + + @Substitution(name="rolling") + @Appender(_shared_docs["apply"]) + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + + @Substitution(name="rolling") + @Appender(_shared_docs["sum"]) + def sum(self, *args, **kwargs): + nv.validate_rolling_func("sum", args, kwargs) + return super().sum(*args, **kwargs) + + @Substitution(name="rolling") + @Appender(_doc_template) + @Appender(_shared_docs["max"]) + def max(self, *args, **kwargs): + nv.validate_rolling_func("max", args, kwargs) + return super().max(*args, **kwargs) + + @Substitution(name="rolling") + @Appender(_shared_docs["min"]) + def min(self, *args, **kwargs): + nv.validate_rolling_func("min", args, kwargs) + return super().min(*args, **kwargs) + + @Substitution(name="rolling") + @Appender(_shared_docs["mean"]) + def mean(self, *args, **kwargs): + nv.validate_rolling_func("mean", args, kwargs) + return super().mean(*args, **kwargs) + + @Substitution(name="rolling") + @Appender(_shared_docs["median"]) + def median(self, **kwargs): + return super().median(**kwargs) + + @Substitution(name="rolling", versionadded="") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func("std", args, kwargs) + return super().std(ddof=ddof, **kwargs) + + @Substitution(name="rolling", versionadded="") + @Appender(_shared_docs["var"]) + 
def var(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func("var", args, kwargs) + return super().var(ddof=ddof, **kwargs) + + @Substitution(name="rolling") + @Appender(_doc_template) + @Appender(_shared_docs["skew"]) + def skew(self, **kwargs): + return super().skew(**kwargs) + + _agg_doc = dedent( + """ + Examples + -------- + + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") + -1.200000 + >>> print(f"{scipy.stats.kurtosis(arr[1:], bias=False):.6f}") + 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 + """ + ) + + @Appender(_agg_doc) + @Substitution(name="rolling") + @Appender(_shared_docs["kurt"]) + def kurt(self, **kwargs): + return super().kurt(**kwargs) + + @Substitution(name="rolling") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) + + @Substitution(name="rolling") + @Appender(_doc_template) + @Appender(_shared_docs["cov"]) + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + + @Substitution(name="rolling") + @Appender(_shared_docs["corr"]) + def corr(self, other=None, pairwise=None, **kwargs): + return super().corr(other=other, pairwise=pairwise, **kwargs) + + +Rolling.__doc__ = Window.__doc__ + + +class RollingGroupby(WindowGroupByMixin, Rolling): + """ + Provide a rolling groupby implementation. + """ + + @property + def _constructor(self): + return Rolling + + def _gotitem(self, key, ndim, subset=None): + + # we are setting the index on the actual object + # here so our index is carried thru to the selected obj + # when we do the splitting for the groupby + if self.on is not None: + self._groupby.obj = self._groupby.obj.set_index(self._on) + self.on = None + return super()._gotitem(key, ndim, subset=subset) + + def _validate_monotonic(self): + """ + Validate that on is monotonic; + we don't care for groupby.rolling + because we have already validated at a higher + level. + """ + pass diff --git a/venv/Lib/site-packages/pandas/errors/__init__.py b/venv/Lib/site-packages/pandas/errors/__init__.py new file mode 100644 index 0000000..ebe9a3d --- /dev/null +++ b/venv/Lib/site-packages/pandas/errors/__init__.py @@ -0,0 +1,184 @@ +# flake8: noqa + +""" +Expose public exceptions & warnings +""" + +from pandas._libs.tslibs import NullFrequencyError, OutOfBoundsDatetime + + +class PerformanceWarning(Warning): + """ + Warning raised when there is a possible performance impact. + """ + + +class UnsupportedFunctionCall(ValueError): + """ + Exception raised when attempting to call a numpy function + on a pandas object, but that function is not supported by + the object e.g. ``np.cumsum(groupby_object)``. + """ + + +class UnsortedIndexError(KeyError): + """ + Error raised when attempting to get a slice of a MultiIndex, + and the index has not been lexsorted. Subclass of `KeyError`. + """ + + +class ParserError(ValueError): + """ + Exception that is raised by an error encountered in parsing file contents. + + This is a generic error raised for errors encountered when functions like + `read_csv` or `read_html` are parsing contents of a file. 
+
+    See Also
+    --------
+    read_csv : Read CSV (comma-separated) file into a DataFrame.
+    read_html : Read HTML table into a DataFrame.
+    """
+
+
+class DtypeWarning(Warning):
+    """
+    Warning raised when reading different dtypes in a column from a file.
+
+    Raised for a dtype incompatibility. This can happen whenever `read_csv`
+    or `read_table` encounter non-uniform dtypes in a column(s) of a given
+    CSV file.
+
+    See Also
+    --------
+    read_csv : Read CSV (comma-separated) file into a DataFrame.
+    read_table : Read general delimited file into a DataFrame.
+
+    Notes
+    -----
+    This warning is issued when dealing with larger files because the dtype
+    checking happens per chunk read.
+
+    Despite the warning, the CSV file is read with mixed types in a single
+    column which will be an object type. See the examples below to better
+    understand this issue.
+
+    Examples
+    --------
+    This example creates and reads a large CSV file with a column that contains
+    `int` and `str`.
+
+    >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 +
+    ...                          ['1'] * 100000),
+    ...                    'b': ['b'] * 300000})
+    >>> df.to_csv('test.csv', index=False)
+    >>> df2 = pd.read_csv('test.csv')
+    ... # DtypeWarning: Columns (0) have mixed types
+
+    Important to notice that ``df2`` will contain both `str` and `int` for the
+    same input, '1'.
+
+    >>> df2.iloc[262140, 0]
+    '1'
+    >>> type(df2.iloc[262140, 0])
+    <class 'str'>
+    >>> df2.iloc[262150, 0]
+    1
+    >>> type(df2.iloc[262150, 0])
+    <class 'int'>
+
+    One way to solve this issue is using the `dtype` parameter in the
+    `read_csv` and `read_table` functions to explicit the conversion:
+
+    >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str})
+
+    No warning was issued.
+
+    >>> import os
+    >>> os.remove('test.csv')
+    """
+
+
+class EmptyDataError(ValueError):
+    """
+    Exception that is thrown in `pd.read_csv` (by both the C and
+    Python engines) when empty data or header is encountered.
+    """
+
+
+class ParserWarning(Warning):
+    """
+    Warning raised when reading a file that doesn't use the default 'c' parser.
+
+    Raised by `pd.read_csv` and `pd.read_table` when it is necessary to change
+    parsers, generally from the default 'c' parser to 'python'.
+
+    It happens due to a lack of support or functionality for parsing a
+    particular attribute of a CSV file with the requested engine.
+
+    Currently, 'c' unsupported options include the following parameters:
+
+    1. `sep` other than a single character (e.g. regex separators)
+    2. `skipfooter` higher than 0
+    3. `sep=None` with `delim_whitespace=False`
+
+    The warning can be avoided by adding `engine='python'` as a parameter in
+    `pd.read_csv` and `pd.read_table` methods.
+
+    See Also
+    --------
+    pd.read_csv : Read CSV (comma-separated) file into DataFrame.
+    pd.read_table : Read general delimited file into DataFrame.
+
+    Examples
+    --------
+    Using a `sep` in `pd.read_csv` other than a single character:
+
+    >>> import io
+    >>> csv = '''a;b;c
+    ...           1;1,8
+    ...           1;2,1'''
+    >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]')  # doctest: +SKIP
+    ... # ParserWarning: Falling back to the 'python' engine...
+
+    Adding `engine='python'` to `pd.read_csv` removes the Warning:
+
+    >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python')
+    """
+
+
+class MergeError(ValueError):
+    """
+    Error raised when problems arise during merging due to problems
+    with input data. Subclass of `ValueError`.
+    """
+
+
+class AccessorRegistrationWarning(Warning):
+    """
+    Warning for attribute conflicts in accessor registration.
+    """
+
+
+class AbstractMethodError(NotImplementedError):
+    """
+    Raise this error instead of NotImplementedError for abstract methods
+    while keeping compatibility with Python 2 and Python 3.
+    """
+
+    def __init__(self, class_instance, methodtype="method"):
+        types = {"method", "classmethod", "staticmethod", "property"}
+        if methodtype not in types:
+            raise ValueError(
+                f"methodtype must be one of {types}, got {methodtype} instead."
+            )
+        self.methodtype = methodtype
+        self.class_instance = class_instance
+
+    def __str__(self) -> str:
+        if self.methodtype == "classmethod":
+            name = self.class_instance.__name__
+        else:
+            name = type(self.class_instance).__name__
+        return f"This {self.methodtype} must be defined in the concrete class {name}"
diff --git a/venv/Lib/site-packages/pandas/io/__init__.py b/venv/Lib/site-packages/pandas/io/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/venv/Lib/site-packages/pandas/io/api.py b/venv/Lib/site-packages/pandas/io/api.py
new file mode 100644
index 0000000..2d25ffe
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/io/api.py
@@ -0,0 +1,21 @@
+"""
+Data IO api
+"""
+
+# flake8: noqa
+
+from pandas.io.clipboards import read_clipboard
+from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
+from pandas.io.feather_format import read_feather
+from pandas.io.gbq import read_gbq
+from pandas.io.html import read_html
+from pandas.io.json import read_json
+from pandas.io.orc import read_orc
+from pandas.io.parquet import read_parquet
+from pandas.io.parsers import read_csv, read_fwf, read_table
+from pandas.io.pickle import read_pickle, to_pickle
+from pandas.io.pytables import HDFStore, read_hdf
+from pandas.io.sas import read_sas
+from pandas.io.spss import read_spss
+from pandas.io.sql import read_sql, read_sql_query, read_sql_table
+from pandas.io.stata import read_stata
diff --git a/venv/Lib/site-packages/pandas/io/clipboard/__init__.py b/venv/Lib/site-packages/pandas/io/clipboard/__init__.py
new file mode 100644
index 0000000..f808b7e
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/io/clipboard/__init__.py
@@ -0,0 +1,667 @@
+"""
+Pyperclip
+
+A cross-platform clipboard module for Python,
+with copy & paste functions for plain text.
+By Al Sweigart al@inventwithpython.com
+BSD License
+
+Usage:
+  import pyperclip
+  pyperclip.copy('The text to be copied to the clipboard.')
+  spam = pyperclip.paste()
+
+  if not pyperclip.is_available():
+    print("Copy functionality unavailable!")
+
+On Windows, no additional modules are needed.
+On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli
+    commands. (These commands should come with OS X.).
+On Linux, install xclip or xsel via package manager. For example, in Debian:
+    sudo apt-get install xclip
+    sudo apt-get install xsel
+
+Otherwise on Linux, you will need the PyQt5 modules installed.
+
+This module does not work with PyGObject yet.
+
+Cygwin is currently not supported.
+
+Security Note: This module runs programs with these names:
+    - which
+    - where
+    - pbcopy
+    - pbpaste
+    - xclip
+    - xsel
+    - klipper
+    - qdbus
+A malicious user could rename or add programs with these names, tricking
+Pyperclip into running them with whatever permissions the Python process has.
+
+"""
+__version__ = "1.7.0"
+
+import contextlib
+import ctypes
+from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof
+import os
+import platform
+import subprocess
+import time
+import warnings
+
+# `import PyQt4` sys.exit()s if DISPLAY is not in the environment.
+# Thus, we need to detect the presence of $DISPLAY manually +# and not load PyQt4 if it is absent. +HAS_DISPLAY = os.getenv("DISPLAY", False) + +EXCEPT_MSG = """ + Pyperclip could not find a copy/paste mechanism for your system. + For more information, please visit + https://pyperclip.readthedocs.io/en/latest/introduction.html#not-implemented-error + """ + +ENCODING = "utf-8" + +# The "which" unix command finds where a command is. +if platform.system() == "Windows": + WHICH_CMD = "where" +else: + WHICH_CMD = "which" + + +def _executable_exists(name): + return ( + subprocess.call( + [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + == 0 + ) + + +# Exceptions +class PyperclipException(RuntimeError): + pass + + +class PyperclipWindowsException(PyperclipException): + def __init__(self, message): + message += f" ({ctypes.WinError()})" + super().__init__(message) + + +def _stringifyText(text) -> str: + acceptedTypes = (str, int, float, bool) + if not isinstance(text, acceptedTypes): + raise PyperclipException( + f"only str, int, float, and bool values " + f"can be copied to the clipboard, not {type(text).__name__}" + ) + return str(text) + + +def init_osx_pbcopy_clipboard(): + def copy_osx_pbcopy(text): + text = _stringifyText(text) # Converts non-str values to str. + p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_osx_pbcopy(): + p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_osx_pbcopy, paste_osx_pbcopy + + +def init_osx_pyobjc_clipboard(): + def copy_osx_pyobjc(text): + """Copy string argument to clipboard""" + text = _stringifyText(text) # Converts non-str values to str. + newStr = Foundation.NSString.stringWithString_(text).nsstring() + newData = newStr.dataUsingEncoding_(Foundation.NSUTF8StringEncoding) + board = AppKit.NSPasteboard.generalPasteboard() + board.declareTypes_owner_([AppKit.NSStringPboardType], None) + board.setData_forType_(newData, AppKit.NSStringPboardType) + + def paste_osx_pyobjc(): + "Returns contents of clipboard" + board = AppKit.NSPasteboard.generalPasteboard() + content = board.stringForType_(AppKit.NSStringPboardType) + return content + + return copy_osx_pyobjc, paste_osx_pyobjc + + +def init_qt_clipboard(): + global QApplication + # $DISPLAY should exist + + # Try to import from qtpy, but if that fails try PyQt5 then PyQt4 + try: + from qtpy.QtWidgets import QApplication + except ImportError: + try: + from PyQt5.QtWidgets import QApplication + except ImportError: + from PyQt4.QtGui import QApplication + + app = QApplication.instance() + if app is None: + app = QApplication([]) + + def copy_qt(text): + text = _stringifyText(text) # Converts non-str values to str. + cb = app.clipboard() + cb.setText(text) + + def paste_qt() -> str: + cb = app.clipboard() + return str(cb.text()) + + return copy_qt, paste_qt + + +def init_xclip_clipboard(): + DEFAULT_SELECTION = "c" + PRIMARY_SELECTION = "p" + + def copy_xclip(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. 
+ selection = DEFAULT_SELECTION + if primary: + selection = PRIMARY_SELECTION + p = subprocess.Popen( + ["xclip", "-selection", selection], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode(ENCODING)) + + def paste_xclip(primary=False): + selection = DEFAULT_SELECTION + if primary: + selection = PRIMARY_SELECTION + p = subprocess.Popen( + ["xclip", "-selection", selection, "-o"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + # Intentionally ignore extraneous output on stderr when clipboard is empty + return stdout.decode(ENCODING) + + return copy_xclip, paste_xclip + + +def init_xsel_clipboard(): + DEFAULT_SELECTION = "-b" + PRIMARY_SELECTION = "-p" + + def copy_xsel(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. + selection_flag = DEFAULT_SELECTION + if primary: + selection_flag = PRIMARY_SELECTION + p = subprocess.Popen( + ["xsel", selection_flag, "-i"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode(ENCODING)) + + def paste_xsel(primary=False): + selection_flag = DEFAULT_SELECTION + if primary: + selection_flag = PRIMARY_SELECTION + p = subprocess.Popen( + ["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True + ) + stdout, stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_xsel, paste_xsel + + +def init_klipper_clipboard(): + def copy_klipper(text): + text = _stringifyText(text) # Converts non-str values to str. + p = subprocess.Popen( + [ + "qdbus", + "org.kde.klipper", + "/klipper", + "setClipboardContents", + text.encode(ENCODING), + ], + stdin=subprocess.PIPE, + close_fds=True, + ) + p.communicate(input=None) + + def paste_klipper(): + p = subprocess.Popen( + ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"], + stdout=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + + # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 + # TODO: https://github.com/asweigart/pyperclip/issues/43 + clipboardContents = stdout.decode(ENCODING) + # even if blank, Klipper will append a newline at the end + assert len(clipboardContents) > 0 + # make sure that newline is there + assert clipboardContents.endswith("\n") + if clipboardContents.endswith("\n"): + clipboardContents = clipboardContents[:-1] + return clipboardContents + + return copy_klipper, paste_klipper + + +def init_dev_clipboard_clipboard(): + def copy_dev_clipboard(text): + text = _stringifyText(text) # Converts non-str values to str. + if text == "": + warnings.warn( + "Pyperclip cannot copy a blank string to the clipboard on Cygwin." + "This is effectively a no-op." 
+ ) + if "\r" in text: + warnings.warn("Pyperclip cannot handle \\r characters on Cygwin.") + + with open("/dev/clipboard", "wt") as fo: + fo.write(text) + + def paste_dev_clipboard() -> str: + with open("/dev/clipboard", "rt") as fo: + content = fo.read() + return content + + return copy_dev_clipboard, paste_dev_clipboard + + +def init_no_clipboard(): + class ClipboardUnavailable: + def __call__(self, *args, **kwargs): + raise PyperclipException(EXCEPT_MSG) + + def __bool__(self) -> bool: + return False + + return ClipboardUnavailable(), ClipboardUnavailable() + + +# Windows-related clipboard functions: +class CheckedCall: + def __init__(self, f): + super().__setattr__("f", f) + + def __call__(self, *args): + ret = self.f(*args) + if not ret and get_errno(): + raise PyperclipWindowsException("Error calling " + self.f.__name__) + return ret + + def __setattr__(self, key, value): + setattr(self.f, key, value) + + +def init_windows_clipboard(): + global HGLOBAL, LPVOID, DWORD, LPCSTR, INT + global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE + from ctypes.wintypes import ( + HGLOBAL, + LPVOID, + DWORD, + LPCSTR, + INT, + HWND, + HINSTANCE, + HMENU, + BOOL, + UINT, + HANDLE, + ) + + windll = ctypes.windll + msvcrt = ctypes.CDLL("msvcrt") + + safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) + safeCreateWindowExA.argtypes = [ + DWORD, + LPCSTR, + LPCSTR, + DWORD, + INT, + INT, + INT, + INT, + HWND, + HMENU, + HINSTANCE, + LPVOID, + ] + safeCreateWindowExA.restype = HWND + + safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) + safeDestroyWindow.argtypes = [HWND] + safeDestroyWindow.restype = BOOL + + OpenClipboard = windll.user32.OpenClipboard + OpenClipboard.argtypes = [HWND] + OpenClipboard.restype = BOOL + + safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard) + safeCloseClipboard.argtypes = [] + safeCloseClipboard.restype = BOOL + + safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard) + safeEmptyClipboard.argtypes = [] + safeEmptyClipboard.restype = BOOL + + safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData) + safeGetClipboardData.argtypes = [UINT] + safeGetClipboardData.restype = HANDLE + + safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData) + safeSetClipboardData.argtypes = [UINT, HANDLE] + safeSetClipboardData.restype = HANDLE + + safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc) + safeGlobalAlloc.argtypes = [UINT, c_size_t] + safeGlobalAlloc.restype = HGLOBAL + + safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock) + safeGlobalLock.argtypes = [HGLOBAL] + safeGlobalLock.restype = LPVOID + + safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock) + safeGlobalUnlock.argtypes = [HGLOBAL] + safeGlobalUnlock.restype = BOOL + + wcslen = CheckedCall(msvcrt.wcslen) + wcslen.argtypes = [c_wchar_p] + wcslen.restype = UINT + + GMEM_MOVEABLE = 0x0002 + CF_UNICODETEXT = 13 + + @contextlib.contextmanager + def window(): + """ + Context that provides a valid Windows hwnd. + """ + # we really just need the hwnd, so setting "STATIC" + # as predefined lpClass is just fine. + hwnd = safeCreateWindowExA( + 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None + ) + try: + yield hwnd + finally: + safeDestroyWindow(hwnd) + + @contextlib.contextmanager + def clipboard(hwnd): + """ + Context manager that opens the clipboard and prevents + other applications from modifying the clipboard content. + """ + # We may not get the clipboard handle immediately because + # some other application is accessing it (?) 
+ # We try for at least 500ms to get the clipboard. + t = time.time() + 0.5 + success = False + while time.time() < t: + success = OpenClipboard(hwnd) + if success: + break + time.sleep(0.01) + if not success: + raise PyperclipWindowsException("Error calling OpenClipboard") + + try: + yield + finally: + safeCloseClipboard() + + def copy_windows(text): + # This function is heavily based on + # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard + + text = _stringifyText(text) # Converts non-str values to str. + + with window() as hwnd: + # http://msdn.com/ms649048 + # If an application calls OpenClipboard with hwnd set to NULL, + # EmptyClipboard sets the clipboard owner to NULL; + # this causes SetClipboardData to fail. + # => We need a valid hwnd to copy something. + with clipboard(hwnd): + safeEmptyClipboard() + + if text: + # http://msdn.com/ms649051 + # If the hMem parameter identifies a memory object, + # the object must have been allocated using the + # function with the GMEM_MOVEABLE flag. + count = wcslen(text) + 1 + handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar)) + locked_handle = safeGlobalLock(handle) + + ctypes.memmove( + c_wchar_p(locked_handle), + c_wchar_p(text), + count * sizeof(c_wchar), + ) + + safeGlobalUnlock(handle) + safeSetClipboardData(CF_UNICODETEXT, handle) + + def paste_windows(): + with clipboard(None): + handle = safeGetClipboardData(CF_UNICODETEXT) + if not handle: + # GetClipboardData may return NULL with errno == NO_ERROR + # if the clipboard is empty. + # (Also, it may return a handle to an empty buffer, + # but technically that's not empty) + return "" + return c_wchar_p(handle).value + + return copy_windows, paste_windows + + +def init_wsl_clipboard(): + def copy_wsl(text): + text = _stringifyText(text) # Converts non-str values to str. + p = subprocess.Popen(["clip.exe"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_wsl(): + p = subprocess.Popen( + ["powershell.exe", "-command", "Get-Clipboard"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + # WSL appends "\r\n" to the contents. + return stdout[:-2].decode(ENCODING) + + return copy_wsl, paste_wsl + + +# Automatic detection of clipboard mechanisms +# and importing is done in deteremine_clipboard(): +def determine_clipboard(): + """ + Determine the OS/platform and set the copy() and paste() functions + accordingly. 
+ """ + + global Foundation, AppKit, qtpy, PyQt4, PyQt5 + + # Setup for the CYGWIN platform: + if ( + "cygwin" in platform.system().lower() + ): # Cygwin has a variety of values returned by platform.system(), + # such as 'CYGWIN_NT-6.1' + # FIXME: pyperclip currently does not support Cygwin, + # see https://github.com/asweigart/pyperclip/issues/55 + if os.path.exists("/dev/clipboard"): + warnings.warn( + "Pyperclip's support for Cygwin is not perfect," + "see https://github.com/asweigart/pyperclip/issues/55" + ) + return init_dev_clipboard_clipboard() + + # Setup for the WINDOWS platform: + elif os.name == "nt" or platform.system() == "Windows": + return init_windows_clipboard() + + if platform.system() == "Linux": + with open("/proc/version", "r") as f: + if "Microsoft" in f.read(): + return init_wsl_clipboard() + + # Setup for the MAC OS X platform: + if os.name == "mac" or platform.system() == "Darwin": + try: + import Foundation # check if pyobjc is installed + import AppKit + except ImportError: + return init_osx_pbcopy_clipboard() + else: + return init_osx_pyobjc_clipboard() + + # Setup for the LINUX platform: + if HAS_DISPLAY: + if _executable_exists("xsel"): + return init_xsel_clipboard() + if _executable_exists("xclip"): + return init_xclip_clipboard() + if _executable_exists("klipper") and _executable_exists("qdbus"): + return init_klipper_clipboard() + + try: + # qtpy is a small abstraction layer that lets you write applications + # using a single api call to either PyQt or PySide. + # https://pypi.python.org/project/QtPy + import qtpy # check if qtpy is installed + except ImportError: + # If qtpy isn't installed, fall back on importing PyQt4. + try: + import PyQt5 # check if PyQt5 is installed + except ImportError: + try: + import PyQt4 # check if PyQt4 is installed + except ImportError: + pass # We want to fail fast for all non-ImportError exceptions. + else: + return init_qt_clipboard() + else: + return init_qt_clipboard() + else: + return init_qt_clipboard() + + return init_no_clipboard() + + +def set_clipboard(clipboard): + """ + Explicitly sets the clipboard mechanism. The "clipboard mechanism" is how + the copy() and paste() functions interact with the operating system to + implement the copy/paste feature. The clipboard parameter must be one of: + - pbcopy + - pbobjc (default on Mac OS X) + - qt + - xclip + - xsel + - klipper + - windows (default on Windows) + - no (this is what is set when no clipboard mechanism can be found) + """ + global copy, paste + + clipboard_types = { + "pbcopy": init_osx_pbcopy_clipboard, + "pyobjc": init_osx_pyobjc_clipboard, + "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5' + "xclip": init_xclip_clipboard, + "xsel": init_xsel_clipboard, + "klipper": init_klipper_clipboard, + "windows": init_windows_clipboard, + "no": init_no_clipboard, + } + + if clipboard not in clipboard_types: + allowed_clipboard_types = [repr(_) for _ in clipboard_types.keys()] + raise ValueError( + f"Argument must be one of {', '.join(allowed_clipboard_types)}" + ) + + # Sets pyperclip's copy() and paste() functions: + copy, paste = clipboard_types[clipboard]() + + +def lazy_load_stub_copy(text): + """ + A stub function for copy(), which will load the real copy() function when + called so that the real copy() function is used for later calls. + + This allows users to import pyperclip without having determine_clipboard() + automatically run, which will automatically select a clipboard mechanism. 
+ This could be a problem if it selects, say, the memory-heavy PyQt4 module + but the user was just going to immediately call set_clipboard() to use a + different clipboard mechanism. + + The lazy loading this stub function implements gives the user a chance to + call set_clipboard() to pick another clipboard mechanism. Or, if the user + simply calls copy() or paste() without calling set_clipboard() first, + will fall back on whatever clipboard mechanism that determine_clipboard() + automatically chooses. + """ + global copy, paste + copy, paste = determine_clipboard() + return copy(text) + + +def lazy_load_stub_paste(): + """ + A stub function for paste(), which will load the real paste() function when + called so that the real paste() function is used for later calls. + + This allows users to import pyperclip without having determine_clipboard() + automatically run, which will automatically select a clipboard mechanism. + This could be a problem if it selects, say, the memory-heavy PyQt4 module + but the user was just going to immediately call set_clipboard() to use a + different clipboard mechanism. + + The lazy loading this stub function implements gives the user a chance to + call set_clipboard() to pick another clipboard mechanism. Or, if the user + simply calls copy() or paste() without calling set_clipboard() first, + will fall back on whatever clipboard mechanism that determine_clipboard() + automatically chooses. + """ + global copy, paste + copy, paste = determine_clipboard() + return paste() + + +def is_available() -> bool: + return copy != lazy_load_stub_copy and paste != lazy_load_stub_paste + + +# Initially, copy() and paste() are set to lazy loading wrappers which will +# set `copy` and `paste` to real functions the first time they're used, unless +# set_clipboard() or determine_clipboard() is called first. +copy, paste = lazy_load_stub_copy, lazy_load_stub_paste + + +__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"] + +# pandas aliases +clipboard_get = paste +clipboard_set = copy diff --git a/venv/Lib/site-packages/pandas/io/clipboards.py b/venv/Lib/site-packages/pandas/io/clipboards.py new file mode 100644 index 0000000..34e8e03 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/clipboards.py @@ -0,0 +1,138 @@ +""" io on the clipboard """ +from io import StringIO +import warnings + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas import get_option, option_context + + +def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover + r""" + Read text from clipboard and pass to read_csv. + + Parameters + ---------- + sep : str, default '\s+' + A string or regex delimiter. The default of '\s+' denotes + one or more whitespace characters. + + **kwargs + See read_csv for the full argument list. + + Returns + ------- + DataFrame + A parsed DataFrame object. + """ + encoding = kwargs.pop("encoding", "utf-8") + + # only utf-8 is valid for passed value because that's what clipboard + # supports + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + + from pandas.io.clipboard import clipboard_get + from pandas.io.parsers import read_csv + + text = clipboard_get() + + # Try to decode (if needed, as "text" might already be a string here). 
+ try: + text = text.decode(kwargs.get("encoding") or get_option("display.encoding")) + except AttributeError: + pass + + # Excel copies into clipboard with \t separation + # inspect no more then the 10 first lines, if they + # all contain an equal number (>0) of tabs, infer + # that this came from excel and set 'sep' accordingly + lines = text[:10000].split("\n")[:-1][:10] + + # Need to remove leading white space, since read_csv + # accepts: + # a b + # 0 1 2 + # 1 3 4 + + counts = {x.lstrip().count("\t") for x in lines} + if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: + sep = "\t" + + # Edge case where sep is specified to be None, return to default + if sep is None and kwargs.get("delim_whitespace") is None: + sep = r"\s+" + + # Regex separator currently only works with python engine. + # Default to python if separator is multi-character (regex) + if len(sep) > 1 and kwargs.get("engine") is None: + kwargs["engine"] = "python" + elif len(sep) > 1 and kwargs.get("engine") == "c": + warnings.warn( + "read_clipboard with regex separator does not work " + "properly with c engine" + ) + + return read_csv(StringIO(text), sep=sep, **kwargs) + + +def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover + """ + Attempt to write text representation of object to the system clipboard + The clipboard can be then pasted into Excel for example. + + Parameters + ---------- + obj : the object to write to the clipboard + excel : boolean, defaults to True + if True, use the provided separator, writing in a csv + format for allowing easy pasting into excel. + if False, write a string representation of the object + to the clipboard + sep : optional, defaults to tab + other keywords are passed to to_csv + + Notes + ----- + Requirements for your platform + - Linux: xclip, or xsel (with PyQt4 modules) + - Windows: + - OS X: + """ + encoding = kwargs.pop("encoding", "utf-8") + + # testing if an invalid encoding is passed to clipboard + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise ValueError("clipboard only supports utf-8 encoding") + + from pandas.io.clipboard import clipboard_set + + if excel is None: + excel = True + + if excel: + try: + if sep is None: + sep = "\t" + buf = StringIO() + + # clipboard_set (pyperclip) expects unicode + obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs) + text = buf.getvalue() + + clipboard_set(text) + return + except TypeError: + warnings.warn( + "to_clipboard in excel mode requires a single character separator." 
+ ) + elif sep is not None: + warnings.warn("to_clipboard with excel=False ignores the sep argument") + + if isinstance(obj, ABCDataFrame): + # str(df) has various unhelpful defaults, like truncation + with option_context("display.max_colwidth", None): + objstr = obj.to_string(**kwargs) + else: + objstr = str(obj) + clipboard_set(objstr) diff --git a/venv/Lib/site-packages/pandas/io/common.py b/venv/Lib/site-packages/pandas/io/common.py new file mode 100644 index 0000000..771a302 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/common.py @@ -0,0 +1,528 @@ +"""Common IO api utilities""" + +import bz2 +from collections import abc +import gzip +from io import BufferedIOBase, BytesIO +import mmap +import os +import pathlib +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union +from urllib.parse import ( # noqa + urlencode, + urljoin, + urlparse as parse_url, + uses_netloc, + uses_params, + uses_relative, +) +import zipfile + +from pandas._typing import FilePathOrBuffer +from pandas.compat import _get_lzma_file, _import_lzma +from pandas.errors import ( # noqa + AbstractMethodError, + DtypeWarning, + EmptyDataError, + ParserError, + ParserWarning, +) + +from pandas.core.dtypes.common import is_file_like + +lzma = _import_lzma() + + +_VALID_URLS = set(uses_relative + uses_netloc + uses_params) +_VALID_URLS.discard("") + + +def is_url(url) -> bool: + """ + Check to see if a URL has a valid protocol. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + isurl : bool + If `url` has a valid protocol return True otherwise False. + """ + if not isinstance(url, str): + return False + return parse_url(url).scheme in _VALID_URLS + + +def _expand_user( + filepath_or_buffer: FilePathOrBuffer[AnyStr], +) -> FilePathOrBuffer[AnyStr]: + """Return the argument with an initial component of ~ or ~user + replaced by that user's home directory. + + Parameters + ---------- + filepath_or_buffer : object to be converted if possible + + Returns + ------- + expanded_filepath_or_buffer : an expanded filepath or the + input if not expandable + """ + if isinstance(filepath_or_buffer, str): + return os.path.expanduser(filepath_or_buffer) + return filepath_or_buffer + + +def validate_header_arg(header) -> None: + if isinstance(header, bool): + raise TypeError( + "Passing a bool to header is invalid. " + "Use header=None for no header or " + "header=int or list-like of ints to specify " + "the row(s) making up the column names" + ) + + +def stringify_path( + filepath_or_buffer: FilePathOrBuffer[AnyStr], +) -> FilePathOrBuffer[AnyStr]: + """Attempt to convert a path-like object to a string. + + Parameters + ---------- + filepath_or_buffer : object to be converted + + Returns + ------- + str_filepath_or_buffer : maybe a string version of the object + + Notes + ----- + Objects supporting the fspath protocol (python 3.6+) are coerced + according to its __fspath__ method. + + For backwards compatibility with older pythons, pathlib.Path and + py.path objects are specially coerced. + + Any other object is passed through unchanged, which includes bytes, + strings, buffers, or anything else that's not even path-like. 
+ """ + if hasattr(filepath_or_buffer, "__fspath__"): + # https://github.com/python/mypy/issues/1424 + return filepath_or_buffer.__fspath__() # type: ignore + elif isinstance(filepath_or_buffer, pathlib.Path): + return str(filepath_or_buffer) + return _expand_user(filepath_or_buffer) + + +def is_s3_url(url) -> bool: + """Check for an s3, s3n, or s3a url""" + if not isinstance(url, str): + return False + return parse_url(url).scheme in ["s3", "s3n", "s3a"] + + +def is_gcs_url(url) -> bool: + """Check for a gcs url""" + if not isinstance(url, str): + return False + return parse_url(url).scheme in ["gcs", "gs"] + + +def urlopen(*args, **kwargs): + """ + Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of + the stdlib. + """ + import urllib.request + + return urllib.request.urlopen(*args, **kwargs) + + +def get_filepath_or_buffer( + filepath_or_buffer: FilePathOrBuffer, + encoding: Optional[str] = None, + compression: Optional[str] = None, + mode: Optional[str] = None, +): + """ + If the filepath_or_buffer is a url, translate and return the buffer. + Otherwise passthrough. + + Parameters + ---------- + filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), + or buffer + compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional + encoding : the encoding to use to decode bytes, default is 'utf-8' + mode : str, optional + + Returns + ------- + tuple of ({a filepath_ or buffer or S3File instance}, + encoding, str, + compression, str, + should_close, bool) + """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): + req = urlopen(filepath_or_buffer) + content_encoding = req.headers.get("Content-Encoding", None) + if content_encoding == "gzip": + # Override compression based on Content-Encoding header + compression = "gzip" + reader = BytesIO(req.read()) + req.close() + return reader, encoding, compression, True + + if is_s3_url(filepath_or_buffer): + from pandas.io import s3 + + return s3.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) + + if is_gcs_url(filepath_or_buffer): + from pandas.io import gcs + + return gcs.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) + + if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): + return _expand_user(filepath_or_buffer), None, compression, False + + if not is_file_like(filepath_or_buffer): + msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" + raise ValueError(msg) + + return filepath_or_buffer, None, compression, False + + +def file_path_to_url(path: str) -> str: + """ + converts an absolute native path to a FILE URL. + + Parameters + ---------- + path : a path in native format + + Returns + ------- + a valid FILE URL + """ + # lazify expensive import (~30ms) + from urllib.request import pathname2url + + return urljoin("file:", pathname2url(path)) + + +_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} + + +def get_compression_method( + compression: Optional[Union[str, Mapping[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: + """ + Simplifies a compression argument to a compression method string and + a mapping containing additional arguments. + + Parameters + ---------- + compression : str or mapping + If string, specifies the compression method. If mapping, value at key + 'method' specifies compression method. 
+ + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) + + Raises + ------ + ValueError on mapping missing 'method' key + """ + if isinstance(compression, Mapping): + compression_args = dict(compression) + try: + compression = compression_args.pop("method") + except KeyError: + raise ValueError("If mapping, compression must have key 'method'") + else: + compression_args = {} + return compression, compression_args + + +def infer_compression( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] +) -> Optional[str]: + """ + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input + compression method is returned unchanged, unless it's invalid, in which + case an error is raised. + + Parameters + ---------- + filepath_or_buffer : str or file handle + File path or object. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + If 'infer' and `filepath_or_buffer` is path-like, then detect + compression from the following extensions: '.gz', '.bz2', '.zip', + or '.xz' (otherwise no compression). + + Returns + ------- + string or None + + Raises + ------ + ValueError on invalid compression specified. + """ + + # No compression has been explicitly specified + if compression is None: + return None + + # Infer compression + if compression == "infer": + # Convert all path types (e.g. pathlib.Path) to strings + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + # Cannot infer compression of a buffer, assume no compression + return None + + # Infer compression from the filename/URL extension + for compression, extension in _compression_to_extension.items(): + if filepath_or_buffer.endswith(extension): + return compression + return None + + # Compression has been specified. Check that it's valid + if compression in _compression_to_extension: + return compression + + msg = f"Unrecognized compression type: {compression}" + valid = ["infer", None] + sorted(_compression_to_extension) + msg += f"\nValid compression types are {valid}" + raise ValueError(msg) + + +def get_handle( + path_or_buf, + mode: str, + encoding=None, + compression: Optional[Union[str, Mapping[str, Any]]] = None, + memory_map: bool = False, + is_text: bool = True, +): + """ + Get file handle for given path/buffer and mode. + + Parameters + ---------- + path_or_buf : str or file handle + File path or object. + mode : str + Mode to open path_or_buf with. + encoding : str or None + Encoding to use. + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', other entries passed as additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + + memory_map : boolean, default False + See parsers._parser_params for more information. + is_text : boolean, default True + whether file/buffer is in text format (csv, json, etc.), or in binary + mode (pickle, etc.). 
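[Editor's note] A minimal sketch of calling this internal helper directly (not public API; "example.csv.gz" is a hypothetical path):

    f, handles = get_handle("example.csv.gz", "r", compression="infer")
    try:
        header = f.readline()      # text is transparently gunzipped and decoded
    finally:
        f.close()
        for h in handles:
            h.close()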
+ + Returns + ------- + f : file-like + A file-like object. + handles : list of file-like objects + A list of file-like object that were opened in this function. + """ + try: + from s3fs import S3File + + need_text_wrapping = (BufferedIOBase, S3File) + except ImportError: + need_text_wrapping = BufferedIOBase # type: ignore + + handles: List[IO] = list() + f = path_or_buf + + # Convert pathlib.Path/py.path.local or string + path_or_buf = stringify_path(path_or_buf) + is_path = isinstance(path_or_buf, str) + + compression, compression_args = get_compression_method(compression) + if is_path: + compression = infer_compression(path_or_buf, compression) + + if compression: + + # GZ Compression + if compression == "gzip": + if is_path: + f = gzip.open(path_or_buf, mode) + else: + f = gzip.GzipFile(fileobj=path_or_buf) + + # BZ Compression + elif compression == "bz2": + if is_path: + f = bz2.BZ2File(path_or_buf, mode) + else: + f = bz2.BZ2File(path_or_buf) + + # ZIP Compression + elif compression == "zip": + zf = _BytesZipFile(path_or_buf, mode, **compression_args) + # Ensure the container is closed as well. + handles.append(zf) + if zf.mode == "w": + f = zf + elif zf.mode == "r": + zip_names = zf.namelist() + if len(zip_names) == 1: + f = zf.open(zip_names.pop()) + elif len(zip_names) == 0: + raise ValueError(f"Zero files found in ZIP file {path_or_buf}") + else: + raise ValueError( + "Multiple files found in ZIP file." + f" Only one file per ZIP: {zip_names}" + ) + + # XZ Compression + elif compression == "xz": + f = _get_lzma_file(lzma)(path_or_buf, mode) + + # Unrecognized Compression + else: + msg = f"Unrecognized compression type: {compression}" + raise ValueError(msg) + + handles.append(f) + + elif is_path: + if encoding: + # Encoding + f = open(path_or_buf, mode, encoding=encoding, newline="") + elif is_text: + # No explicit encoding + f = open(path_or_buf, mode, errors="replace", newline="") + else: + # Binary mode + f = open(path_or_buf, mode) + handles.append(f) + + # Convert BytesIO or file objects passed with an encoding + if is_text and (compression or isinstance(f, need_text_wrapping)): + from io import TextIOWrapper + + g = TextIOWrapper(f, encoding=encoding, newline="") + if not isinstance(f, BufferedIOBase): + handles.append(g) + f = g + + if memory_map and hasattr(f, "fileno"): + try: + wrapped = _MMapWrapper(f) + f.close() + f = wrapped + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + pass + + return f, handles + + +class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore + """ + Wrapper for standard library class ZipFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and ZipFile.writestr writes + bytes strings into a member of the archive. 
+ """ + + # GH 17778 + def __init__( + self, + file: FilePathOrBuffer, + mode: str, + archive_name: Optional[str] = None, + **kwargs, + ): + if mode in ["wb", "rb"]: + mode = mode.replace("b", "") + self.archive_name = archive_name + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) + + def write(self, data): + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) + + @property + def closed(self): + return self.fp is None + + +class _MMapWrapper(abc.Iterator): + """ + Wrapper for the Python's mmap class so that it can be properly read in + by Python's csv.reader class. + + Parameters + ---------- + f : file object + File object to be mapped onto memory. Must support the 'fileno' + method or have an equivalent attribute + + """ + + def __init__(self, f: IO): + self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + + def __getattr__(self, name: str): + return getattr(self.mmap, name) + + def __iter__(self) -> "_MMapWrapper": + return self + + def __next__(self) -> str: + newbytes = self.mmap.readline() + + # readline returns bytes, not str, but Python's CSV reader + # expects str, so convert the output to str before continuing + newline = newbytes.decode("utf-8") + + # mmap doesn't raise if reading past the allocated + # data but instead returns an empty string, so raise + # if that is returned + if newline == "": + raise StopIteration + return newline diff --git a/venv/Lib/site-packages/pandas/io/date_converters.py b/venv/Lib/site-packages/pandas/io/date_converters.py new file mode 100644 index 0000000..7fdca2d --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/date_converters.py @@ -0,0 +1,64 @@ +"""This module is designed for community supported date conversion functions""" +import numpy as np + +from pandas._libs.tslibs import parsing + + +def parse_date_time(date_col, time_col): + date_col = _maybe_cast(date_col) + time_col = _maybe_cast(time_col) + return parsing.try_parse_date_and_time(date_col, time_col) + + +def parse_date_fields(year_col, month_col, day_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + return parsing.try_parse_year_month_day(year_col, month_col, day_col) + + +def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + hour_col = _maybe_cast(hour_col) + minute_col = _maybe_cast(minute_col) + second_col = _maybe_cast(second_col) + return parsing.try_parse_datetime_components( + year_col, month_col, day_col, hour_col, minute_col, second_col + ) + + +def generic_parser(parse_func, *cols): + N = _check_columns(cols) + results = np.empty(N, dtype=object) + + for i in range(N): + args = [c[i] for c in cols] + results[i] = parse_func(*args) + + return results + + +def _maybe_cast(arr): + if not arr.dtype.type == np.object_: + arr = np.array(arr, dtype=object) + return arr + + +def _check_columns(cols): + if not len(cols): + raise AssertionError("There must be at least 1 column") + + head, tail = cols[0], cols[1:] + + N = len(head) + + for i, n in enumerate(map(len, tail)): + if n != N: + raise AssertionError( + f"All columns must have the same length: {N}; " + f"column {i} has length {n}" + ) + + return N diff --git a/venv/Lib/site-packages/pandas/io/excel/__init__.py b/venv/Lib/site-packages/pandas/io/excel/__init__.py new file mode 100644 index 0000000..455abaa --- 
/dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/__init__.py @@ -0,0 +1,16 @@ +from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel +from pandas.io.excel._openpyxl import _OpenpyxlWriter +from pandas.io.excel._util import register_writer +from pandas.io.excel._xlsxwriter import _XlsxWriter +from pandas.io.excel._xlwt import _XlwtWriter + +__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] + + +register_writer(_OpenpyxlWriter) + + +register_writer(_XlwtWriter) + + +register_writer(_XlsxWriter) diff --git a/venv/Lib/site-packages/pandas/io/excel/_base.py b/venv/Lib/site-packages/pandas/io/excel/_base.py new file mode 100644 index 0000000..2a91381 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_base.py @@ -0,0 +1,919 @@ +import abc +from datetime import date, datetime, timedelta +from io import BytesIO +import os +from textwrap import fill + +from pandas._config import config + +from pandas._libs.parsers import STR_NA_VALUES +from pandas.errors import EmptyDataError +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like + +from pandas.core.frame import DataFrame + +from pandas.io.common import ( + get_filepath_or_buffer, + is_url, + stringify_path, + urlopen, + validate_header_arg, +) +from pandas.io.excel._util import ( + _fill_mi_header, + _get_default_writer, + _maybe_convert_usecols, + _pop_header_name, + get_writer, +) +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +_read_excel_doc = ( + """ +Read an Excel file into a pandas DataFrame. + +Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions +read from a local filesystem or URL. Supports an option to read +a single sheet or a list of sheets. + +Parameters +---------- +io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.xlsx``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. +sheet_name : str, int, list, or None, default 0 + Strings are used for sheet names. Integers are used in zero-indexed + sheet positions. Lists of strings/integers are used to request + multiple sheets. Specify None to get all sheets. + + Available cases: + + * Defaults to ``0``: 1st sheet as a `DataFrame` + * ``1``: 2nd sheet as a `DataFrame` + * ``"Sheet1"``: Load sheet with name "Sheet1" + * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" + as a dict of `DataFrame` + * None: All sheets. + +header : int, list of int, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use None if there is no header. +names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None. +index_col : int, list of int, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex``. If a + subset of data is selected with ``usecols``, index_col + is based on the subset. 
+usecols : int, str, list-like, or callable default None + * If None, then parse all columns. + * If str, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of + both sides. + * If list of int, then indicates list of column numbers to be parsed. + * If list of string, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + Returns a subset of the columns according to behavior above. + + .. versionadded:: 0.24.0 + +squeeze : bool, default False + If the parsed data only contains one column then return a Series. +dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use `object` to preserve data as stored in Excel and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. +engine : str, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None, "xlrd", "openpyxl" or "odf". +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. +true_values : list, default None + Values to consider as True. +false_values : list, default None + Values to consider as False. +skiprows : list-like + Rows to skip at the beginning (0-indexed). +nrows : int, default None + Number of rows to parse. + + .. versionadded:: 0.23.0 + +na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + + """'. +keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. +verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. +parse_dates : bool, list-like, or dict, default False + The behavior is as follows: + + * bool. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. 
{'foo' : [1, 3]} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index contains an unparseable date, the entire column or + index will be returned unaltered as an object data type. If you don`t want to + parse some cells as date just change their type in Excel to "Text". + For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + + Note: A fast-path exists for iso8601-formatted dates. +date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. +thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. +comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. +skipfooter : int, default 0 + Rows at the end to skip (0-indexed). +convert_float : bool, default True + Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally. +mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. +**kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. + +Returns +------- +DataFrame or dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheet_name + argument for more information on when a dict of DataFrames is returned. + +See Also +-------- +to_excel : Write DataFrame to an Excel file. +to_csv : Write DataFrame to a comma-separated values (csv) file. +read_csv : Read a comma-separated values (csv) file into DataFrame. +read_fwf : Read a table of fixed-width formatted lines into DataFrame. + +Examples +-------- +The file can be read using the file name as string or an open file object: + +>>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP + Name Value +0 string1 1 +1 string2 2 +2 #Comment 3 + +>>> pd.read_excel(open('tmp.xlsx', 'rb'), +... sheet_name='Sheet3') # doctest: +SKIP + Unnamed: 0 Name Value +0 0 string1 1 +1 1 string2 2 +2 2 #Comment 3 + +Index and header can be specified via the `index_col` and `header` arguments + +>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP + 0 1 2 +0 NaN Name Value +1 0.0 string1 1 +2 1.0 string2 2 +3 2.0 #Comment 3 + +Column types are inferred but can be explicitly specified + +>>> pd.read_excel('tmp.xlsx', index_col=0, +... 
dtype={'Name': str, 'Value': float}) # doctest: +SKIP + Name Value +0 string1 1.0 +1 string2 2.0 +2 #Comment 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.read_excel('tmp.xlsx', index_col=0, +... na_values=['string1', 'string2']) # doctest: +SKIP + Name Value +0 NaN 1 +1 NaN 2 +2 #Comment 3 + +Comment lines in the excel input file can be skipped using the `comment` kwarg + +>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP + Name Value +0 string1 1.0 +1 string2 2.0 +2 None NaN +""" +) + + +@Appender(_read_excel_doc) +def read_excel( + io, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + keep_default_na=True, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds, +): + + for arg in ("sheet", "sheetname", "parse_cols"): + if arg in kwds: + raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`") + + if not isinstance(io, ExcelFile): + io = ExcelFile(io, engine=engine) + elif engine and engine != io.engine: + raise ValueError( + "Engine should not be specified when passing " + "an ExcelFile - ExcelFile already has the engine set" + ) + + return io.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds, + ) + + +class _BaseExcelReader(metaclass=abc.ABCMeta): + def __init__(self, filepath_or_buffer): + # If filepath_or_buffer is a url, load the data into a BytesIO + if is_url(filepath_or_buffer): + filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + + if isinstance(filepath_or_buffer, self._workbook_class): + self.book = filepath_or_buffer + elif hasattr(filepath_or_buffer, "read"): + # N.B. xlrd.Book has a read attribute too + filepath_or_buffer.seek(0) + self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, str): + self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(filepath_or_buffer)) + else: + raise ValueError( + "Must explicitly set engine if not passing in buffer or path for io." 
+ ) + + @property + @abc.abstractmethod + def _workbook_class(self): + pass + + @abc.abstractmethod + def load_workbook(self, filepath_or_buffer): + pass + + @property + @abc.abstractmethod + def sheet_names(self): + pass + + @abc.abstractmethod + def get_sheet_by_name(self, name): + pass + + @abc.abstractmethod + def get_sheet_by_index(self, index): + pass + + @abc.abstractmethod + def get_sheet_data(self, sheet, convert_float): + pass + + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds, + ): + + validate_header_arg(header) + + ret_dict = False + + # Keep sheetname to maintain backwards compatibility. + if isinstance(sheet_name, list): + sheets = sheet_name + ret_dict = True + elif sheet_name is None: + sheets = self.sheet_names + ret_dict = True + else: + sheets = [sheet_name] + + # handle same-type duplicates. + sheets = list(dict.fromkeys(sheets).keys()) + + output = {} + + for asheetname in sheets: + if verbose: + print(f"Reading sheet {asheetname}") + + if isinstance(asheetname, str): + sheet = self.get_sheet_by_name(asheetname) + else: # assume an integer if not a string + sheet = self.get_sheet_by_index(asheetname) + + data = self.get_sheet_data(sheet, convert_float) + usecols = _maybe_convert_usecols(usecols) + + if not data: + output[asheetname] = DataFrame() + continue + + if is_list_like(header) and len(header) == 1: + header = header[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is not None and is_list_like(header): + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + row += skiprows + + data[row], control_row = _fill_mi_header(data[row], control_row) + + if index_col is not None: + header_name, _ = _pop_header_name(data[row], index_col) + header_names.append(header_name) + + if is_list_like(index_col): + # Forward fill values for MultiIndex index. + if not is_list_like(header): + offset = 1 + header + else: + offset = 1 + max(header) + + # Check if we have an empty dataset + # before trying to collect data. 
+ if offset < len(data): + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + has_index_names = is_list_like(header) and len(header) > 1 + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + squeeze=squeeze, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + mangle_dupe_cols=mangle_dupe_cols, + **kwds, + ) + + output[asheetname] = parser.read(nrows=nrows) + + if not squeeze or isinstance(output[asheetname], DataFrame): + if header_names: + output[asheetname].columns = output[ + asheetname + ].columns.set_names(header_names) + + except EmptyDataError: + # No Data, return an empty DataFrame + output[asheetname] = DataFrame() + + if ret_dict: + return output + else: + return output[asheetname] + + +class ExcelWriter(metaclass=abc.ABCMeta): + """ + Class for writing DataFrame objects into excel sheets. + + Default is to use xlwt for xls, openpyxl for xlsx. + See DataFrame.to_excel for typical usage. + + Parameters + ---------- + path : str + Path to xls or xlsx file. + engine : str (optional) + Engine to use for writing. If None, defaults to + ``io.excel..writer``. NOTE: can only be passed as a keyword + argument. + date_format : str, default None + Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). + datetime_format : str, default None + Format string for datetime objects written into Excel files. + (e.g. 'YYYY-MM-DD HH:MM:SS'). + mode : {'w', 'a'}, default 'w' + File mode to use (write or append). + + .. versionadded:: 0.24.0 + + Attributes + ---------- + None + + Methods + ------- + None + + Notes + ----- + None of the methods and properties are considered public. + + For compatibility with CSV writers, ExcelWriter serializes lists + and dicts to strings before writing. + + Examples + -------- + Default usage: + + >>> with ExcelWriter('path_to_file.xlsx') as writer: + ... df.to_excel(writer) + + To write to separate sheets in a single file: + + >>> with ExcelWriter('path_to_file.xlsx') as writer: + ... df1.to_excel(writer, sheet_name='Sheet1') + ... df2.to_excel(writer, sheet_name='Sheet2') + + You can set the date format or datetime format: + + >>> with ExcelWriter('path_to_file.xlsx', + date_format='YYYY-MM-DD', + datetime_format='YYYY-MM-DD HH:MM:SS') as writer: + ... df.to_excel(writer) + + You can also append to an existing Excel file: + + >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: + ... df.to_excel(writer, sheet_name='Sheet3') + """ + + # Defining an ExcelWriter implementation (see abstract methods for more...) + + # - Mandatory + # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)`` + # --> called to write additional DataFrames to disk + # - ``supported_extensions`` (tuple of supported extensions), used to + # check that engine supports the given extension. + # - ``engine`` - string that gives the engine name. Necessary to + # instantiate class directly and bypass ``ExcelWriterMeta`` engine + # lookup. + # - ``save(self)`` --> called to save file to disk + # - Mostly mandatory (i.e. 
should at least exist) + # - book, cur_sheet, path + + # - Optional: + # - ``__init__(self, path, engine=None, **kwargs)`` --> always called + # with path as first argument. + + # You also need to register the class with ``register_writer()``. + # Technically, ExcelWriter implementations don't need to subclass + # ExcelWriter. + def __new__(cls, path, engine=None, **kwargs): + # only switch class if generic(ExcelWriter) + + if cls is ExcelWriter: + if engine is None or (isinstance(engine, str) and engine == "auto"): + if isinstance(path, str): + ext = os.path.splitext(path)[-1][1:] + else: + ext = "xlsx" + + try: + engine = config.get_option(f"io.excel.{ext}.writer") + if engine == "auto": + engine = _get_default_writer(ext) + except KeyError: + raise ValueError(f"No engine for filetype: '{ext}'") + cls = get_writer(engine) + + return object.__new__(cls) + + # declare external properties you can count on + book = None + curr_sheet = None + path = None + + @property + @abc.abstractmethod + def supported_extensions(self): + """Extensions that writer engine supports.""" + pass + + @property + @abc.abstractmethod + def engine(self): + """Name of engine.""" + pass + + @abc.abstractmethod + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): + """ + Write given formatted cells into Excel an excel sheet + + Parameters + ---------- + cells : generator + cell of formatted data to save to Excel sheet + sheet_name : str, default None + Name of Excel sheet, if None, then use self.cur_sheet + startrow : upper left cell row to dump data frame + startcol : upper left cell column to dump data frame + freeze_panes: int tuple of length 2 + contains the bottom-most row and right-most column to freeze + """ + pass + + @abc.abstractmethod + def save(self): + """ + Save workbook to disk. + """ + pass + + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs, + ): + # validate that this engine can handle the extension + if isinstance(path, str): + ext = os.path.splitext(path)[-1] + else: + ext = "xls" if engine == "xlwt" else "xlsx" + + self.check_extension(ext) + + self.path = path + self.sheets = {} + self.cur_sheet = None + + if date_format is None: + self.date_format = "YYYY-MM-DD" + else: + self.date_format = date_format + if datetime_format is None: + self.datetime_format = "YYYY-MM-DD HH:MM:SS" + else: + self.datetime_format = datetime_format + + self.mode = mode + + def __fspath__(self): + return stringify_path(self.path) + + def _get_sheet_name(self, sheet_name): + if sheet_name is None: + sheet_name = self.cur_sheet + if sheet_name is None: # pragma: no cover + raise ValueError("Must pass explicit sheet_name or set cur_sheet property") + return sheet_name + + def _value_with_fmt(self, val): + """Convert numpy types to Python types for the Excel writers. 
+ + Parameters + ---------- + val : object + Value to be written into cells + + Returns + ------- + Tuple with the first element being the converted value and the second + being an optional format + """ + fmt = None + + if is_integer(val): + val = int(val) + elif is_float(val): + val = float(val) + elif is_bool(val): + val = bool(val) + elif isinstance(val, datetime): + fmt = self.datetime_format + elif isinstance(val, date): + fmt = self.date_format + elif isinstance(val, timedelta): + val = val.total_seconds() / float(86400) + fmt = "0" + else: + val = str(val) + + return val, fmt + + @classmethod + def check_extension(cls, ext): + """checks that path's extension against the Writer's supported + extensions. If it isn't supported, raises UnsupportedFiletypeError.""" + if ext.startswith("."): + ext = ext[1:] + if not any(ext in extension for extension in cls.supported_extensions): + msg = "Invalid extension for engine" + f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" + raise ValueError(msg) + else: + return True + + # Allow use as a contextmanager + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def close(self): + """synonym for save, to make it more file-like""" + return self.save() + + +class ExcelFile: + """ + Class for parsing tabular excel sheets into DataFrame objects. + Uses xlrd. See read_excel for more documentation + + Parameters + ---------- + io : str, path object (pathlib.Path or py._path.local.LocalPath), + a file-like object, xlrd workbook or openpypl workbook. + If a string or path object, expected to be a path to xls, xlsx or odf file. + engine : str, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None, ``xlrd``, ``openpyxl``, ``odf``, or ``pyxlsb``. + Note that ``odf`` reads tables out of OpenDocument formatted files. + """ + + from pandas.io.excel._odfreader import _ODFReader + from pandas.io.excel._openpyxl import _OpenpyxlReader + from pandas.io.excel._xlrd import _XlrdReader + from pandas.io.excel._pyxlsb import _PyxlsbReader + + _engines = { + "xlrd": _XlrdReader, + "openpyxl": _OpenpyxlReader, + "odf": _ODFReader, + "pyxlsb": _PyxlsbReader, + } + + def __init__(self, io, engine=None): + if engine is None: + engine = "xlrd" + if engine not in self._engines: + raise ValueError(f"Unknown engine: {engine}") + + self.engine = engine + # could be a str, ExcelFile, Book, etc. + self.io = io + # Always a string + self._io = stringify_path(io) + + self._reader = self._engines[engine](self._io) + + def __fspath__(self): + return self._io + + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds, + ): + """ + Parse specified sheet(s) into a DataFrame. + + Equivalent to read_excel(ExcelFile, ...) See the read_excel + docstring for more info on accepted parameters. + + Returns + ------- + DataFrame or dict of DataFrames + DataFrame from the passed in Excel file. 
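[Editor's note] Illustrative use of ExcelFile as a context manager, so the workbook is opened once and several sheets are parsed from it (editor's sketch; the workbook and sheet names are hypothetical):

    import pandas as pd
    with pd.ExcelFile("report.xlsx") as xls:
        summary = xls.parse("Summary", index_col=0)
        detail = xls.parse(1, usecols="A:D")   # second sheet, Excel-style column range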
+ """ + if "chunksize" in kwds: + raise NotImplementedError( + "chunksize keyword of read_excel is not implemented" + ) + + return self._reader.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds, + ) + + @property + def book(self): + return self._reader.book + + @property + def sheet_names(self): + return self._reader.sheet_names + + def close(self): + """close io if necessary""" + if self.engine == "openpyxl": + # https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + wb = self.book + wb._archive.close() + + if hasattr(self.io, "close"): + self.io.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __del__(self): + # Ensure we don't leak file descriptors, but put in try/except in case + # attributes are already deleted + try: + self.close() + except AttributeError: + pass diff --git a/venv/Lib/site-packages/pandas/io/excel/_odfreader.py b/venv/Lib/site-packages/pandas/io/excel/_odfreader.py new file mode 100644 index 0000000..ec5f6fc --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_odfreader.py @@ -0,0 +1,181 @@ +from typing import List + +from pandas._typing import FilePathOrBuffer, Scalar +from pandas.compat._optional import import_optional_dependency + +import pandas as pd + +from pandas.io.excel._base import _BaseExcelReader + + +class _ODFReader(_BaseExcelReader): + """ + Read tables out of OpenDocument formatted files. + + Parameters + ---------- + filepath_or_buffer: string, path to be parsed or + an open readable stream. 
+ """ + + def __init__(self, filepath_or_buffer: FilePathOrBuffer): + import_optional_dependency("odf") + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from odf.opendocument import OpenDocument + + return OpenDocument + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from odf.opendocument import load + + return load(filepath_or_buffer) + + @property + def empty_value(self) -> str: + """Property for compat with other readers.""" + return "" + + @property + def sheet_names(self) -> List[str]: + """Return a list of sheet names present in the document""" + from odf.table import Table + + tables = self.book.getElementsByType(Table) + return [t.getAttribute("name") for t in tables] + + def get_sheet_by_index(self, index: int): + from odf.table import Table + + tables = self.book.getElementsByType(Table) + return tables[index] + + def get_sheet_by_name(self, name: str): + from odf.table import Table + + tables = self.book.getElementsByType(Table) + + for table in tables: + if table.getAttribute("name") == name: + return table + + raise ValueError(f"sheet {name} not found") + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + """Parse an ODF Table into a list of lists + """ + from odf.table import CoveredTableCell, TableCell, TableRow + + covered_cell_name = CoveredTableCell().qname + table_cell_name = TableCell().qname + cell_names = {covered_cell_name, table_cell_name} + + sheet_rows = sheet.getElementsByType(TableRow) + empty_rows = 0 + max_row_len = 0 + + table: List[List[Scalar]] = [] + + for i, sheet_row in enumerate(sheet_rows): + sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] + empty_cells = 0 + table_row: List[Scalar] = [] + + for j, sheet_cell in enumerate(sheet_cells): + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell, convert_float) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) + + if max_row_len < len(table_row): + max_row_len = len(table_row) + + row_repeat = self._get_row_repeat(sheet_row) + if self._is_empty_row(sheet_row): + empty_rows += row_repeat + else: + # add blank rows to our table + table.extend([[self.empty_value]] * empty_rows) + empty_rows = 0 + for _ in range(row_repeat): + table.append(table_row) + + # Make our table square + for row in table: + if len(row) < max_row_len: + row.extend([self.empty_value] * (max_row_len - len(row))) + + return table + + def _get_row_repeat(self, row) -> int: + """Return number of times this row was repeated + Repeating an empty row appeared to be a common way + of representing sparse rows in the table. 
+ """ + from odf.namespaces import TABLENS + + return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1)) + + def _get_column_repeat(self, cell) -> int: + from odf.namespaces import TABLENS + + return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) + + def _is_empty_row(self, row) -> bool: + """Helper function to find empty rows + """ + for column in row.childNodes: + if len(column.childNodes) > 0: + return False + + return True + + def _get_cell_value(self, cell, convert_float: bool) -> Scalar: + from odf.namespaces import OFFICENS + + cell_type = cell.attributes.get((OFFICENS, "value-type")) + if cell_type == "boolean": + if str(cell) == "TRUE": + return True + return False + if cell_type is None: + return self.empty_value + elif cell_type == "float": + # GH5394 + cell_value = float(cell.attributes.get((OFFICENS, "value"))) + + if cell_value == 0.0: # NA handling + return str(cell) + + if convert_float: + val = int(cell_value) + if val == cell_value: + return val + return cell_value + elif cell_type == "percentage": + cell_value = cell.attributes.get((OFFICENS, "value")) + return float(cell_value) + elif cell_type == "string": + return str(cell) + elif cell_type == "currency": + cell_value = cell.attributes.get((OFFICENS, "value")) + return float(cell_value) + elif cell_type == "date": + cell_value = cell.attributes.get((OFFICENS, "date-value")) + return pd.to_datetime(cell_value) + elif cell_type == "time": + return pd.to_datetime(str(cell)).time() + else: + raise ValueError(f"Unrecognized type {cell_type}") diff --git a/venv/Lib/site-packages/pandas/io/excel/_openpyxl.py b/venv/Lib/site-packages/pandas/io/excel/_openpyxl.py new file mode 100644 index 0000000..be52523 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_openpyxl.py @@ -0,0 +1,537 @@ +from typing import List + +import numpy as np + +from pandas._typing import FilePathOrBuffer, Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.io.excel._base import ExcelWriter, _BaseExcelReader +from pandas.io.excel._util import _validate_freeze_panes + + +class _OpenpyxlWriter(ExcelWriter): + engine = "openpyxl" + supported_extensions = (".xlsx", ".xlsm") + + def __init__(self, path, engine=None, mode="w", **engine_kwargs): + # Use the openpyxl module as the Excel writer. + from openpyxl.workbook import Workbook + + super().__init__(path, mode=mode, **engine_kwargs) + + if self.mode == "a": # Load from existing workbook + from openpyxl import load_workbook + + book = load_workbook(self.path) + self.book = book + else: + # Create workbook object with default optimized_write=True. + self.book = Workbook() + + if self.book.worksheets: + try: + self.book.remove(self.book.worksheets[0]) + except AttributeError: + + # compat - for openpyxl <= 2.4 + self.book.remove_sheet(self.book.worksheets[0]) + + def save(self): + """ + Save workbook to disk. + """ + return self.book.save(self.path) + + @classmethod + def _convert_to_style(cls, style_dict): + """ + Converts a style_dict to an openpyxl style object. 
+ + Parameters + ---------- + style_dict : style dictionary to convert + """ + + from openpyxl.style import Style + + xls_style = Style() + for key, value in style_dict.items(): + for nk, nv in value.items(): + if key == "borders": + ( + xls_style.borders.__getattribute__(nk).__setattr__( + "border_style", nv + ) + ) + else: + xls_style.__getattribute__(key).__setattr__(nk, nv) + + return xls_style + + @classmethod + def _convert_to_style_kwargs(cls, style_dict): + """ + Convert a style_dict to a set of kwargs suitable for initializing + or updating-on-copy an openpyxl v2 style object. + + Parameters + ---------- + style_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'font' + 'fill' + 'border' ('borders') + 'alignment' + 'number_format' + 'protection' + + Returns + ------- + style_kwargs : dict + A dict with the same, normalized keys as ``style_dict`` but each + value has been replaced with a native openpyxl style object of the + appropriate class. + """ + + _style_key_map = {"borders": "border"} + + style_kwargs = {} + for k, v in style_dict.items(): + if k in _style_key_map: + k = _style_key_map[k] + _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None) + new_v = _conv_to_x(v) + if new_v: + style_kwargs[k] = new_v + + return style_kwargs + + @classmethod + def _convert_to_color(cls, color_spec): + """ + Convert ``color_spec`` to an openpyxl v2 Color object. + + Parameters + ---------- + color_spec : str, dict + A 32-bit ARGB hex string, or a dict with zero or more of the + following keys. + 'rgb' + 'indexed' + 'auto' + 'theme' + 'tint' + 'index' + 'type' + + Returns + ------- + color : openpyxl.styles.Color + """ + + from openpyxl.styles import Color + + if isinstance(color_spec, str): + return Color(color_spec) + else: + return Color(**color_spec) + + @classmethod + def _convert_to_font(cls, font_dict): + """ + Convert ``font_dict`` to an openpyxl v2 Font object. + + Parameters + ---------- + font_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'name' + 'size' ('sz') + 'bold' ('b') + 'italic' ('i') + 'underline' ('u') + 'strikethrough' ('strike') + 'color' + 'vertAlign' ('vertalign') + 'charset' + 'scheme' + 'family' + 'outline' + 'shadow' + 'condense' + + Returns + ------- + font : openpyxl.styles.Font + """ + + from openpyxl.styles import Font + + _font_key_map = { + "sz": "size", + "b": "bold", + "i": "italic", + "u": "underline", + "strike": "strikethrough", + "vertalign": "vertAlign", + } + + font_kwargs = {} + for k, v in font_dict.items(): + if k in _font_key_map: + k = _font_key_map[k] + if k == "color": + v = cls._convert_to_color(v) + font_kwargs[k] = v + + return Font(**font_kwargs) + + @classmethod + def _convert_to_stop(cls, stop_seq): + """ + Convert ``stop_seq`` to a list of openpyxl v2 Color objects, + suitable for initializing the ``GradientFill`` ``stop`` parameter. + + Parameters + ---------- + stop_seq : iterable + An iterable that yields objects suitable for consumption by + ``_convert_to_color``. + + Returns + ------- + stop : list of openpyxl.styles.Color + """ + + return map(cls._convert_to_color, stop_seq) + + @classmethod + def _convert_to_fill(cls, fill_dict): + """ + Convert ``fill_dict`` to an openpyxl v2 Fill object. 
+ + Parameters + ---------- + fill_dict : dict + A dict with one or more of the following keys (or their synonyms), + 'fill_type' ('patternType', 'patterntype') + 'start_color' ('fgColor', 'fgcolor') + 'end_color' ('bgColor', 'bgcolor') + or one or more of the following keys (or their synonyms). + 'type' ('fill_type') + 'degree' + 'left' + 'right' + 'top' + 'bottom' + 'stop' + + Returns + ------- + fill : openpyxl.styles.Fill + """ + + from openpyxl.styles import PatternFill, GradientFill + + _pattern_fill_key_map = { + "patternType": "fill_type", + "patterntype": "fill_type", + "fgColor": "start_color", + "fgcolor": "start_color", + "bgColor": "end_color", + "bgcolor": "end_color", + } + + _gradient_fill_key_map = {"fill_type": "type"} + + pfill_kwargs = {} + gfill_kwargs = {} + for k, v in fill_dict.items(): + pk = gk = None + if k in _pattern_fill_key_map: + pk = _pattern_fill_key_map[k] + if k in _gradient_fill_key_map: + gk = _gradient_fill_key_map[k] + if pk in ["start_color", "end_color"]: + v = cls._convert_to_color(v) + if gk == "stop": + v = cls._convert_to_stop(v) + if pk: + pfill_kwargs[pk] = v + elif gk: + gfill_kwargs[gk] = v + else: + pfill_kwargs[k] = v + gfill_kwargs[k] = v + + try: + return PatternFill(**pfill_kwargs) + except TypeError: + return GradientFill(**gfill_kwargs) + + @classmethod + def _convert_to_side(cls, side_spec): + """ + Convert ``side_spec`` to an openpyxl v2 Side object. + + Parameters + ---------- + side_spec : str, dict + A string specifying the border style, or a dict with zero or more + of the following keys (or their synonyms). + 'style' ('border_style') + 'color' + + Returns + ------- + side : openpyxl.styles.Side + """ + + from openpyxl.styles import Side + + _side_key_map = {"border_style": "style"} + + if isinstance(side_spec, str): + return Side(style=side_spec) + + side_kwargs = {} + for k, v in side_spec.items(): + if k in _side_key_map: + k = _side_key_map[k] + if k == "color": + v = cls._convert_to_color(v) + side_kwargs[k] = v + + return Side(**side_kwargs) + + @classmethod + def _convert_to_border(cls, border_dict): + """ + Convert ``border_dict`` to an openpyxl v2 Border object. + + Parameters + ---------- + border_dict : dict + A dict with zero or more of the following keys (or their synonyms). + 'left' + 'right' + 'top' + 'bottom' + 'diagonal' + 'diagonal_direction' + 'vertical' + 'horizontal' + 'diagonalUp' ('diagonalup') + 'diagonalDown' ('diagonaldown') + 'outline' + + Returns + ------- + border : openpyxl.styles.Border + """ + + from openpyxl.styles import Border + + _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"} + + border_kwargs = {} + for k, v in border_dict.items(): + if k in _border_key_map: + k = _border_key_map[k] + if k == "color": + v = cls._convert_to_color(v) + if k in ["left", "right", "top", "bottom", "diagonal"]: + v = cls._convert_to_side(v) + border_kwargs[k] = v + + return Border(**border_kwargs) + + @classmethod + def _convert_to_alignment(cls, alignment_dict): + """ + Convert ``alignment_dict`` to an openpyxl v2 Alignment object. + + Parameters + ---------- + alignment_dict : dict + A dict with zero or more of the following keys (or their synonyms). 
+ 'horizontal' + 'vertical' + 'text_rotation' + 'wrap_text' + 'shrink_to_fit' + 'indent' + Returns + ------- + alignment : openpyxl.styles.Alignment + """ + + from openpyxl.styles import Alignment + + return Alignment(**alignment_dict) + + @classmethod + def _convert_to_number_format(cls, number_format_dict): + """ + Convert ``number_format_dict`` to an openpyxl v2.1.0 number format + initializer. + Parameters + ---------- + number_format_dict : dict + A dict with zero or more of the following keys. + 'format_code' : str + Returns + ------- + number_format : str + """ + return number_format_dict["format_code"] + + @classmethod + def _convert_to_protection(cls, protection_dict): + """ + Convert ``protection_dict`` to an openpyxl v2 Protection object. + Parameters + ---------- + protection_dict : dict + A dict with zero or more of the following keys. + 'locked' + 'hidden' + Returns + ------- + """ + + from openpyxl.styles import Protection + + return Protection(**protection_dict) + + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): + # Write the frame cells using openpyxl. + sheet_name = self._get_sheet_name(sheet_name) + + _style_cache = {} + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.create_sheet() + wks.title = sheet_name + self.sheets[sheet_name] = wks + + if _validate_freeze_panes(freeze_panes): + wks.freeze_panes = wks.cell( + row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 + ) + + for cell in cells: + xcell = wks.cell( + row=startrow + cell.row + 1, column=startcol + cell.col + 1 + ) + xcell.value, fmt = self._value_with_fmt(cell.val) + if fmt: + xcell.number_format = fmt + + style_kwargs = {} + if cell.style: + key = str(cell.style) + style_kwargs = _style_cache.get(key) + if style_kwargs is None: + style_kwargs = self._convert_to_style_kwargs(cell.style) + _style_cache[key] = style_kwargs + + if style_kwargs: + for k, v in style_kwargs.items(): + setattr(xcell, k, v) + + if cell.mergestart is not None and cell.mergeend is not None: + + wks.merge_cells( + start_row=startrow + cell.row + 1, + start_column=startcol + cell.col + 1, + end_column=startcol + cell.mergeend + 1, + end_row=startrow + cell.mergestart + 1, + ) + + # When cells are merged only the top-left cell is preserved + # The behaviour of the other cells in a merged range is + # undefined + if style_kwargs: + first_row = startrow + cell.row + 1 + last_row = startrow + cell.mergestart + 1 + first_col = startcol + cell.col + 1 + last_col = startcol + cell.mergeend + 1 + + for row in range(first_row, last_row + 1): + for col in range(first_col, last_col + 1): + if row == first_row and col == first_col: + # Ignore first cell. It is already handled. + continue + xcell = wks.cell(column=col, row=row) + for k, v in style_kwargs.items(): + setattr(xcell, k, v) + + +class _OpenpyxlReader(_BaseExcelReader): + def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: + """Reader using openpyxl engine. + + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. 
+ """ + import_optional_dependency("openpyxl") + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from openpyxl import Workbook + + return Workbook + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from openpyxl import load_workbook + + return load_workbook( + filepath_or_buffer, read_only=True, data_only=True, keep_links=False + ) + + @property + def sheet_names(self) -> List[str]: + return self.book.sheetnames + + def get_sheet_by_name(self, name: str): + return self.book[name] + + def get_sheet_by_index(self, index: int): + return self.book.worksheets[index] + + def _convert_cell(self, cell, convert_float: bool) -> Scalar: + + # TODO: replace with openpyxl constants + if cell.is_date: + return cell.value + elif cell.data_type == "e": + return np.nan + elif cell.data_type == "b": + return bool(cell.value) + elif cell.value is None: + return "" # compat with xlrd + elif cell.data_type == "n": + # GH5394 + if convert_float: + val = int(cell.value) + if val == cell.value: + return val + else: + return float(cell.value) + + return cell.value + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + data: List[List[Scalar]] = [] + for row in sheet.rows: + data.append([self._convert_cell(cell, convert_float) for cell in row]) + + return data diff --git a/venv/Lib/site-packages/pandas/io/excel/_pyxlsb.py b/venv/Lib/site-packages/pandas/io/excel/_pyxlsb.py new file mode 100644 index 0000000..df6a380 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_pyxlsb.py @@ -0,0 +1,68 @@ +from typing import List + +from pandas._typing import FilePathOrBuffer, Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.io.excel._base import _BaseExcelReader + + +class _PyxlsbReader(_BaseExcelReader): + def __init__(self, filepath_or_buffer: FilePathOrBuffer): + """Reader using pyxlsb engine. + + Parameters + __________ + filepath_or_buffer: string, path object, or Workbook + Object to be parsed. 
+ """ + import_optional_dependency("pyxlsb") + # This will call load_workbook on the filepath or buffer + # And set the result to the book-attribute + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from pyxlsb import Workbook + + return Workbook + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from pyxlsb import open_workbook + + # Todo: hack in buffer capability + # This might need some modifications to the Pyxlsb library + # Actual work for opening it is in xlsbpackage.py, line 20-ish + + return open_workbook(filepath_or_buffer) + + @property + def sheet_names(self) -> List[str]: + return self.book.sheets + + def get_sheet_by_name(self, name: str): + return self.book.get_sheet(name) + + def get_sheet_by_index(self, index: int): + # pyxlsb sheets are indexed from 1 onwards + # There's a fix for this in the source, but the pypi package doesn't have it + return self.book.get_sheet(index + 1) + + def _convert_cell(self, cell, convert_float: bool) -> Scalar: + # Todo: there is no way to distinguish between floats and datetimes in pyxlsb + # This means that there is no way to read datetime types from an xlsb file yet + if cell.v is None: + return "" # Prevents non-named columns from not showing up as Unnamed: i + if isinstance(cell.v, float) and convert_float: + val = int(cell.v) + if val == cell.v: + return val + else: + return float(cell.v) + + return cell.v + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + return [ + [self._convert_cell(c, convert_float) for c in r] + for r in sheet.rows(sparse=False) + ] diff --git a/venv/Lib/site-packages/pandas/io/excel/_util.py b/venv/Lib/site-packages/pandas/io/excel/_util.py new file mode 100644 index 0000000..a084be5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_util.py @@ -0,0 +1,229 @@ +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.common import is_integer, is_list_like + +_writers = {} + + +def register_writer(klass): + """ + Add engine to the excel writer registry.io.excel. + + You must use this method to integrate with ``to_excel``. + + Parameters + ---------- + klass : ExcelWriter + """ + if not callable(klass): + raise ValueError("Can only register callables as engines") + engine_name = klass.engine + _writers[engine_name] = klass + + +def _get_default_writer(ext): + """ + Return the default writer for the given extension. + + Parameters + ---------- + ext : str + The excel file extension for which to get the default engine. + + Returns + ------- + str + The default engine for the extension. + """ + _default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"} + xlsxwriter = import_optional_dependency( + "xlsxwriter", raise_on_missing=False, on_version="warn" + ) + if xlsxwriter: + _default_writers["xlsx"] = "xlsxwriter" + return _default_writers[ext] + + +def get_writer(engine_name): + try: + return _writers[engine_name] + except KeyError: + raise ValueError(f"No Excel writer '{engine_name}'") + + +def _excel2num(x): + """ + Convert Excel column name like 'AB' to 0-based column index. + + Parameters + ---------- + x : str + The Excel column name to convert to a 0-based column index. + + Returns + ------- + num : int + The column index corresponding to the name. + + Raises + ------ + ValueError + Part of the Excel column name was invalid. 
+ """ + index = 0 + + for c in x.upper().strip(): + cp = ord(c) + + if cp < ord("A") or cp > ord("Z"): + raise ValueError(f"Invalid column name: {x}") + + index = index * 26 + cp - ord("A") + 1 + + return index - 1 + + +def _range2cols(areas): + """ + Convert comma separated list of column names and ranges to indices. + + Parameters + ---------- + areas : str + A string containing a sequence of column ranges (or areas). + + Returns + ------- + cols : list + A list of 0-based column indices. + + Examples + -------- + >>> _range2cols('A:E') + [0, 1, 2, 3, 4] + >>> _range2cols('A,C,Z:AB') + [0, 2, 25, 26, 27] + """ + cols = [] + + for rng in areas.split(","): + if ":" in rng: + rng = rng.split(":") + cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) + else: + cols.append(_excel2num(rng)) + + return cols + + +def _maybe_convert_usecols(usecols): + """ + Convert `usecols` into a compatible format for parsing in `parsers.py`. + + Parameters + ---------- + usecols : object + The use-columns object to potentially convert. + + Returns + ------- + converted : object + The compatible format of `usecols`. + """ + if usecols is None: + return usecols + + if is_integer(usecols): + raise ValueError( + "Passing an integer for `usecols` is no longer supported. " + "Please pass in a list of int from 0 to `usecols` " + "inclusive instead." + ) + + if isinstance(usecols, str): + return _range2cols(usecols) + + return usecols + + +def _validate_freeze_panes(freeze_panes): + if freeze_panes is not None: + if len(freeze_panes) == 2 and all( + isinstance(item, int) for item in freeze_panes + ): + return True + + raise ValueError( + "freeze_panes must be of form (row, column) " + "where row and column are integers" + ) + + # freeze_panes wasn't specified, return False so it won't be applied + # to output sheet + return False + + +def _trim_excel_header(row): + # trim header row so auto-index inference works + # xlrd uses '' , openpyxl None + while len(row) > 0 and (row[0] == "" or row[0] is None): + row = row[1:] + return row + + +def _fill_mi_header(row, control_row): + """Forward fill blank entries in row but only inside the same parent index. + + Used for creating headers in Multiindex. + Parameters + ---------- + row : list + List of items in a single row. + control_row : list of bool + Helps to determine if particular column is in same parent index as the + previous value. Used to stop propagation of empty cells between + different indexes. + + Returns + ------- + Returns changed row and control_row + """ + last = row[0] + for i in range(1, len(row)): + if not control_row[i]: + last = row[i] + + if row[i] == "" or row[i] is None: + row[i] = last + else: + control_row[i] = False + last = row[i] + + return row, control_row + + +def _pop_header_name(row, index_col): + """ + Pop the header name for MultiIndex parsing. + + Parameters + ---------- + row : list + The data row to parse for the header name. + index_col : int, list + The index columns for our data. Assumed to be non-null. + + Returns + ------- + header_name : str + The extracted header name. + trimmed_row : list + The original data row with the header name removed. + """ + # Pop out header name and fill w/blank. 
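A minimal sketch of the forward-fill that _fill_mi_header above performs, with hypothetical input values; blanks are only filled while they stay inside the same parent group tracked by control_row.

row = ["a", "", "b", ""]
control_row = [True, True, True, True]
row, control_row = _fill_mi_header(row, control_row)
# row is now ["a", "a", "b", "b"]: the blank after "a" is filled from the left,
# "b" starts a new parent group (control_row[2] flips to False), and the last
# blank is then filled with "b".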
+ i = index_col if not is_list_like(index_col) else max(index_col) + + header_name = row[i] + header_name = None if header_name == "" else header_name + + return header_name, row[:i] + [""] + row[i + 1 :] diff --git a/venv/Lib/site-packages/pandas/io/excel/_xlrd.py b/venv/Lib/site-packages/pandas/io/excel/_xlrd.py new file mode 100644 index 0000000..be1b78e --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_xlrd.py @@ -0,0 +1,106 @@ +from datetime import time + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.io.excel._base import _BaseExcelReader + + +class _XlrdReader(_BaseExcelReader): + def __init__(self, filepath_or_buffer): + """Reader using xlrd engine. + + Parameters + ---------- + filepath_or_buffer : string, path object or Workbook + Object to be parsed. + """ + err_msg = "Install xlrd >= 1.0.0 for Excel support" + import_optional_dependency("xlrd", extra=err_msg) + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from xlrd import Book + + return Book + + def load_workbook(self, filepath_or_buffer): + from xlrd import open_workbook + + if hasattr(filepath_or_buffer, "read"): + data = filepath_or_buffer.read() + return open_workbook(file_contents=data) + else: + return open_workbook(filepath_or_buffer) + + @property + def sheet_names(self): + return self.book.sheet_names() + + def get_sheet_by_name(self, name): + return self.book.sheet_by_name(name) + + def get_sheet_by_index(self, index): + return self.book.sheet_by_index(index) + + def get_sheet_data(self, sheet, convert_float): + from xlrd import ( + xldate, + XL_CELL_DATE, + XL_CELL_ERROR, + XL_CELL_BOOLEAN, + XL_CELL_NUMBER, + ) + + epoch1904 = self.book.datemode + + def _parse_cell(cell_contents, cell_typ): + """converts the contents of the cell into a pandas + appropriate object""" + + if cell_typ == XL_CELL_DATE: + + # Use the newer xlrd datetime handling. + try: + cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904) + except OverflowError: + return cell_contents + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. 
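A small illustration of the epoch rule noted above, assuming xlrd is installed: a time-only Excel value is a serial below 1.0, which xldate places on the epoch day, and which the cell parser then reduces to a plain time.

from xlrd import xldate

dt = xldate.xldate_as_datetime(0.5, 0)  # serial 0.5, datemode 0 (1900 epoch)
# dt == datetime(1899, 12, 31, 12, 0); because it falls on the epoch day,
# _parse_cell returns time(12, 0) rather than a datetime.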
+ year = (cell_contents.timetuple())[0:3] + if (not epoch1904 and year == (1899, 12, 31)) or ( + epoch1904 and year == (1904, 1, 1) + ): + cell_contents = time( + cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond, + ) + + elif cell_typ == XL_CELL_ERROR: + cell_contents = np.nan + elif cell_typ == XL_CELL_BOOLEAN: + cell_contents = bool(cell_contents) + elif convert_float and cell_typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less surprising + val = int(cell_contents) + if val == cell_contents: + cell_contents = val + return cell_contents + + data = [] + + for i in range(sheet.nrows): + row = [ + _parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) + ] + data.append(row) + + return data diff --git a/venv/Lib/site-packages/pandas/io/excel/_xlsxwriter.py b/venv/Lib/site-packages/pandas/io/excel/_xlsxwriter.py new file mode 100644 index 0000000..6d9ff9b --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_xlsxwriter.py @@ -0,0 +1,237 @@ +import pandas._libs.json as json + +from pandas.io.excel._base import ExcelWriter +from pandas.io.excel._util import _validate_freeze_panes + + +class _XlsxStyler: + # Map from openpyxl-oriented styles to flatter xlsxwriter representation + # Ordering necessary for both determinism and because some are keyed by + # prefixes of others. + STYLE_MAPPING = { + "font": [ + (("name",), "font_name"), + (("sz",), "font_size"), + (("size",), "font_size"), + (("color", "rgb"), "font_color"), + (("color",), "font_color"), + (("b",), "bold"), + (("bold",), "bold"), + (("i",), "italic"), + (("italic",), "italic"), + (("u",), "underline"), + (("underline",), "underline"), + (("strike",), "font_strikeout"), + (("vertAlign",), "font_script"), + (("vertalign",), "font_script"), + ], + "number_format": [(("format_code",), "num_format"), ((), "num_format")], + "protection": [(("locked",), "locked"), (("hidden",), "hidden")], + "alignment": [ + (("horizontal",), "align"), + (("vertical",), "valign"), + (("text_rotation",), "rotation"), + (("wrap_text",), "text_wrap"), + (("indent",), "indent"), + (("shrink_to_fit",), "shrink"), + ], + "fill": [ + (("patternType",), "pattern"), + (("patterntype",), "pattern"), + (("fill_type",), "pattern"), + (("start_color", "rgb"), "fg_color"), + (("fgColor", "rgb"), "fg_color"), + (("fgcolor", "rgb"), "fg_color"), + (("start_color",), "fg_color"), + (("fgColor",), "fg_color"), + (("fgcolor",), "fg_color"), + (("end_color", "rgb"), "bg_color"), + (("bgColor", "rgb"), "bg_color"), + (("bgcolor", "rgb"), "bg_color"), + (("end_color",), "bg_color"), + (("bgColor",), "bg_color"), + (("bgcolor",), "bg_color"), + ], + "border": [ + (("color", "rgb"), "border_color"), + (("color",), "border_color"), + (("style",), "border"), + (("top", "color", "rgb"), "top_color"), + (("top", "color"), "top_color"), + (("top", "style"), "top"), + (("top",), "top"), + (("right", "color", "rgb"), "right_color"), + (("right", "color"), "right_color"), + (("right", "style"), "right"), + (("right",), "right"), + (("bottom", "color", "rgb"), "bottom_color"), + (("bottom", "color"), "bottom_color"), + (("bottom", "style"), "bottom"), + (("bottom",), "bottom"), + (("left", "color", "rgb"), "left_color"), + (("left", "color"), "left_color"), + (("left", "style"), "left"), + (("left",), "left"), + ], + } + + @classmethod + def convert(cls, style_dict, num_format_str=None): + """ + converts a style_dict to an xlsxwriter format dict + + 
Parameters + ---------- + style_dict : style dictionary to convert + num_format_str : optional number format string + """ + + # Create a XlsxWriter format object. + props = {} + + if num_format_str is not None: + props["num_format"] = num_format_str + + if style_dict is None: + return props + + if "borders" in style_dict: + style_dict = style_dict.copy() + style_dict["border"] = style_dict.pop("borders") + + for style_group_key, style_group in style_dict.items(): + for src, dst in cls.STYLE_MAPPING.get(style_group_key, []): + # src is a sequence of keys into a nested dict + # dst is a flat key + if dst in props: + continue + v = style_group + for k in src: + try: + v = v[k] + except (KeyError, TypeError): + break + else: + props[dst] = v + + if isinstance(props.get("pattern"), str): + # TODO: support other fill patterns + props["pattern"] = 0 if props["pattern"] == "none" else 1 + + for k in ["border", "top", "right", "bottom", "left"]: + if isinstance(props.get(k), str): + try: + props[k] = [ + "none", + "thin", + "medium", + "dashed", + "dotted", + "thick", + "double", + "hair", + "mediumDashed", + "dashDot", + "mediumDashDot", + "dashDotDot", + "mediumDashDotDot", + "slantDashDot", + ].index(props[k]) + except ValueError: + props[k] = 2 + + if isinstance(props.get("font_script"), str): + props["font_script"] = ["baseline", "superscript", "subscript"].index( + props["font_script"] + ) + + if isinstance(props.get("underline"), str): + props["underline"] = { + "none": 0, + "single": 1, + "double": 2, + "singleAccounting": 33, + "doubleAccounting": 34, + }[props["underline"]] + + return props + + +class _XlsxWriter(ExcelWriter): + engine = "xlsxwriter" + supported_extensions = (".xlsx",) + + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs, + ): + # Use the xlsxwriter module as the Excel writer. + import xlsxwriter + + if mode == "a": + raise ValueError("Append mode is not supported with xlsxwriter!") + + super().__init__( + path, + engine=engine, + date_format=date_format, + datetime_format=datetime_format, + mode=mode, + **engine_kwargs, + ) + + self.book = xlsxwriter.Workbook(path, **engine_kwargs) + + def save(self): + """ + Save workbook to disk. + """ + + return self.book.close() + + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): + # Write the frame cells using xlsxwriter. 
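A short sketch of what the mapping above produces for a typical header style; the exact keys come from STYLE_MAPPING, so the result shown is approximate.

style = {
    "font": {"bold": True},
    "borders": {"top": "thin", "bottom": "thin"},
    "alignment": {"horizontal": "center"},
}
props = _XlsxStyler.convert(style, num_format_str="0.00")
# props comes out roughly as:
# {"num_format": "0.00", "bold": True, "top": 1, "bottom": 1, "align": "center"}
# (border style names are translated to xlsxwriter's integer codes).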
+ sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.add_worksheet(sheet_name) + self.sheets[sheet_name] = wks + + style_dict = {"null": None} + + if _validate_freeze_panes(freeze_panes): + wks.freeze_panes(*(freeze_panes)) + + for cell in cells: + val, fmt = self._value_with_fmt(cell.val) + + stylekey = json.dumps(cell.style) + if fmt: + stylekey += fmt + + if stylekey in style_dict: + style = style_dict[stylekey] + else: + style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt)) + style_dict[stylekey] = style + + if cell.mergestart is not None and cell.mergeend is not None: + wks.merge_range( + startrow + cell.row, + startcol + cell.col, + startrow + cell.mergestart, + startcol + cell.mergeend, + val, + style, + ) + else: + wks.write(startrow + cell.row, startcol + cell.col, val, style) diff --git a/venv/Lib/site-packages/pandas/io/excel/_xlwt.py b/venv/Lib/site-packages/pandas/io/excel/_xlwt.py new file mode 100644 index 0000000..d102a88 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/excel/_xlwt.py @@ -0,0 +1,138 @@ +import pandas._libs.json as json + +from pandas.io.excel._base import ExcelWriter +from pandas.io.excel._util import _validate_freeze_panes + + +class _XlwtWriter(ExcelWriter): + engine = "xlwt" + supported_extensions = (".xls",) + + def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): + # Use the xlwt module as the Excel writer. + import xlwt + + engine_kwargs["engine"] = engine + + if mode == "a": + raise ValueError("Append mode is not supported with xlwt!") + + super().__init__(path, mode=mode, **engine_kwargs) + + if encoding is None: + encoding = "ascii" + self.book = xlwt.Workbook(encoding=encoding) + self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format) + self.fm_date = xlwt.easyxf(num_format_str=self.date_format) + + def save(self): + """ + Save workbook to disk. + """ + return self.book.save(self.path) + + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): + # Write the frame cells using xlwt. 
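For context, both writer classes are normally reached through to_excel rather than instantiated directly; a minimal usage sketch follows (the file name is illustrative).

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.5, 4.5]})
with pd.ExcelWriter("out.xlsx", engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="Sheet1", freeze_panes=(1, 1))
# freeze_panes is checked by _validate_freeze_panes and applied inside
# write_cells; engine="xlwt" with a .xls path goes through _XlwtWriter instead.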
+ + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.add_sheet(sheet_name) + self.sheets[sheet_name] = wks + + if _validate_freeze_panes(freeze_panes): + wks.set_panes_frozen(True) + wks.set_horz_split_pos(freeze_panes[0]) + wks.set_vert_split_pos(freeze_panes[1]) + + style_dict = {} + + for cell in cells: + val, fmt = self._value_with_fmt(cell.val) + + stylekey = json.dumps(cell.style) + if fmt: + stylekey += fmt + + if stylekey in style_dict: + style = style_dict[stylekey] + else: + style = self._convert_to_style(cell.style, fmt) + style_dict[stylekey] = style + + if cell.mergestart is not None and cell.mergeend is not None: + wks.write_merge( + startrow + cell.row, + startrow + cell.mergestart, + startcol + cell.col, + startcol + cell.mergeend, + val, + style, + ) + else: + wks.write(startrow + cell.row, startcol + cell.col, val, style) + + @classmethod + def _style_to_xlwt( + cls, item, firstlevel: bool = True, field_sep=",", line_sep=";" + ) -> str: + """helper which recursively generate an xlwt easy style string + for example: + + hstyle = {"font": {"bold": True}, + "border": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "align": {"horiz": "center"}} + will be converted to + font: bold on; \ + border: top thin, right thin, bottom thin, left thin; \ + align: horiz center; + """ + if hasattr(item, "items"): + if firstlevel: + it = [ + f"{key}: {cls._style_to_xlwt(value, False)}" + for key, value in item.items() + ] + out = f"{(line_sep).join(it)} " + return out + else: + it = [ + f"{key} {cls._style_to_xlwt(value, False)}" + for key, value in item.items() + ] + out = f"{(field_sep).join(it)} " + return out + else: + item = f"{item}" + item = item.replace("True", "on") + item = item.replace("False", "off") + return item + + @classmethod + def _convert_to_style(cls, style_dict, num_format_str=None): + """ + converts a style_dict to an xlwt style object + + Parameters + ---------- + style_dict : style dictionary to convert + num_format_str : optional number format string + """ + import xlwt + + if style_dict: + xlwt_stylestr = cls._style_to_xlwt(style_dict) + style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";") + else: + style = xlwt.XFStyle() + if num_format_str is not None: + style.num_format_str = num_format_str + + return style diff --git a/venv/Lib/site-packages/pandas/io/feather_format.py b/venv/Lib/site-packages/pandas/io/feather_format.py new file mode 100644 index 0000000..eb05004 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/feather_format.py @@ -0,0 +1,103 @@ +""" feather-format compat """ + +from pandas.compat._optional import import_optional_dependency + +from pandas import DataFrame, Int64Index, RangeIndex + +from pandas.io.common import stringify_path + + +def to_feather(df: DataFrame, path): + """ + Write a DataFrame to the feather-format + + Parameters + ---------- + df : DataFrame + path : string file path, or file-like object + + """ + import_optional_dependency("pyarrow") + from pyarrow import feather + + path = stringify_path(path) + + if not isinstance(df, DataFrame): + raise ValueError("feather only support IO with DataFrames") + + valid_types = {"string", "unicode"} + + # validate index + # -------------- + + # validate that we have only a default index + # raise on anything else as we don't serialize the index + + if not isinstance(df.index, Int64Index): + typ = type(df.index) + raise ValueError( + f"feather does not support 
serializing {typ} " + "for the index; you can .reset_index() " + "to make the index into column(s)" + ) + + if not df.index.equals(RangeIndex.from_range(range(len(df)))): + raise ValueError( + "feather does not support serializing a " + "non-default index for the index; you " + "can .reset_index() to make the index " + "into column(s)" + ) + + if df.index.name is not None: + raise ValueError( + "feather does not serialize index meta-data on a default index" + ) + + # validate columns + # ---------------- + + # must have value column names (strings only) + if df.columns.inferred_type not in valid_types: + raise ValueError("feather must have string column names") + + feather.write_feather(df, path) + + +def read_feather(path, columns=None, use_threads: bool = True): + """ + Load a feather-format object from the file path. + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.feather``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + columns : sequence, default None + If not provided, all columns are read. + + .. versionadded:: 0.24.0 + use_threads : bool, default True + Whether to parallelize reading using multiple threads. + + .. versionadded:: 0.24.0 + + Returns + ------- + type of object stored in file + """ + import_optional_dependency("pyarrow") + from pyarrow import feather + + path = stringify_path(path) + + return feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/venv/Lib/site-packages/pandas/io/formats/__init__.py b/venv/Lib/site-packages/pandas/io/formats/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/io/formats/console.py b/venv/Lib/site-packages/pandas/io/formats/console.py new file mode 100644 index 0000000..bed29e1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/console.py @@ -0,0 +1,91 @@ +""" +Internal module for console introspection +""" + +from shutil import get_terminal_size + + +def get_console_size(): + """ + Return console size as tuple = (width, height). + + Returns (None,None) in non-interactive session. + """ + from pandas import get_option + + display_width = get_option("display.width") + display_height = get_option("display.max_rows") + + # Consider + # interactive shell terminal, can detect term size + # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term + # size non-interactive script, should disregard term size + + # in addition + # width,height have default values, but setting to 'None' signals + # should use Auto-Detection, But only in interactive shell-terminal. + # Simple. yeah. 
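A short sketch, assuming pyarrow is installed and with an illustrative path, of the index restriction enforced above: only a default RangeIndex round-trips, so any other index has to be reset into columns first.

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
# df.to_feather("t.feather")  # raises ValueError: non-default index
df.reset_index().to_feather("t.feather")  # index becomes the "index" column
back = pd.read_feather("t.feather")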
+ + if in_interactive_session(): + if in_ipython_frontend(): + # sane defaults for interactive non-shell terminal + # match default for width,height in config_init + from pandas._config.config import get_default_val + + terminal_width = get_default_val("display.width") + terminal_height = get_default_val("display.max_rows") + else: + # pure terminal + terminal_width, terminal_height = get_terminal_size() + else: + terminal_width, terminal_height = None, None + + # Note if the User sets width/Height to None (auto-detection) + # and we're in a script (non-inter), this will return (None,None) + # caller needs to deal. + return (display_width or terminal_width, display_height or terminal_height) + + +# ---------------------------------------------------------------------- +# Detect our environment + + +def in_interactive_session(): + """ + Check if we're running in an interactive shell. + + Returns + ------- + bool + True if running under python/ipython interactive shell. + """ + from pandas import get_option + + def check_main(): + try: + import __main__ as main + except ModuleNotFoundError: + return get_option("mode.sim_interactive") + return not hasattr(main, "__file__") or get_option("mode.sim_interactive") + + try: + return __IPYTHON__ or check_main() # noqa + except NameError: + return check_main() + + +def in_ipython_frontend(): + """ + Check if we're inside an an IPython zmq frontend. + + Returns + ------- + bool + """ + try: + ip = get_ipython() # noqa + return "zmq" in str(type(ip)).lower() + except NameError: + pass + + return False diff --git a/venv/Lib/site-packages/pandas/io/formats/css.py b/venv/Lib/site-packages/pandas/io/formats/css.py new file mode 100644 index 0000000..b40d2a5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/css.py @@ -0,0 +1,264 @@ +""" +Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. +""" + +import re +import warnings + + +class CSSWarning(UserWarning): + """ + This CSS syntax cannot currently be parsed. + """ + + pass + + +def _side_expander(prop_fmt: str): + def expand(self, prop, value: str): + tokens = value.split() + try: + mapping = self.SIDE_SHORTHANDS[len(tokens)] + except KeyError: + warnings.warn( + f'Could not expand "{prop}: {value}"', CSSWarning, + ) + return + for key, idx in zip(self.SIDES, mapping): + yield prop_fmt.format(key), tokens[idx] + + return expand + + +class CSSResolver: + """ + A callable for parsing and resolving CSS to atomic properties. + """ + + def __call__(self, declarations_str, inherited=None): + """ + The given declarations to atomic properties. + + Parameters + ---------- + declarations_str : str + A list of CSS declarations + inherited : dict, optional + Atomic properties indicating the inherited style context in which + declarations_str is to be resolved. ``inherited`` should already + be resolved, i.e. valid output of this method. + + Returns + ------- + dict + Atomic CSS 2.2 properties. + + Examples + -------- + >>> resolve = CSSResolver() + >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} + >>> out = resolve(''' + ... border-color: BLUE RED; + ... font-size: 1em; + ... font-size: 2em; + ... font-weight: normal; + ... font-weight: inherit; + ... 
''', inherited) + >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE + [('border-bottom-color', 'blue'), + ('border-left-color', 'red'), + ('border-right-color', 'red'), + ('border-top-color', 'blue'), + ('font-family', 'serif'), + ('font-size', '24pt'), + ('font-weight', 'bold')] + """ + props = dict(self.atomize(self.parse(declarations_str))) + if inherited is None: + inherited = {} + + # 1. resolve inherited, initial + for prop, val in inherited.items(): + if prop not in props: + props[prop] = val + + for prop, val in list(props.items()): + if val == "inherit": + val = inherited.get(prop, "initial") + if val == "initial": + val = None + + if val is None: + # we do not define a complete initial stylesheet + del props[prop] + else: + props[prop] = val + + # 2. resolve relative font size + if props.get("font-size"): + if "font-size" in inherited: + em_pt = inherited["font-size"] + assert em_pt[-2:] == "pt" + em_pt = float(em_pt[:-2]) + else: + em_pt = None + props["font-size"] = self.size_to_pt( + props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + ) + + font_size = float(props["font-size"][:-2]) + else: + font_size = None + + # 3. TODO: resolve other font-relative units + for side in self.SIDES: + prop = f"border-{side}-width" + if prop in props: + props[prop] = self.size_to_pt( + props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + ) + for prop in [ + f"margin-{side}", + f"padding-{side}", + ]: + if prop in props: + # TODO: support % + props[prop] = self.size_to_pt( + props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + ) + + return props + + UNIT_RATIOS = { + "rem": ("pt", 12), + "ex": ("em", 0.5), + # 'ch': + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), + } + + FONT_SIZE_RATIOS = UNIT_RATIOS.copy() + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) + + MARGIN_RATIOS = UNIT_RATIOS.copy() + MARGIN_RATIOS.update({"none": ("pt", 0)}) + + BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) + + def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): + def _error(): + warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) + return self.size_to_pt("1!!default", conversions=conversions) + + try: + val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() + except AttributeError: + return _error() + if val == "": + # hack for 'large' etc. 
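A small usage sketch of the resolver defined above, along the lines of its docstring example.

resolve = CSSResolver()
out = resolve("font-weight: inherit; font-size: 2em",
              inherited={"font-weight": "bold"})
# out == {"font-weight": "bold", "font-size": "24pt"}:
# "inherit" is replaced from the inherited context, and the relative font
# size is resolved against the 12pt root size via FONT_SIZE_RATIOS.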
+ val = 1 + else: + try: + val = float(val) + except ValueError: + return _error() + + while unit != "pt": + if unit == "em": + if em_pt is None: + unit = "rem" + else: + val *= em_pt + unit = "pt" + continue + + try: + unit, mul = conversions[unit] + except KeyError: + return _error() + val *= mul + + val = round(val, 5) + if int(val) == val: + size_fmt = f"{int(val):d}pt" + else: + size_fmt = f"{val:f}pt" + return size_fmt + + def atomize(self, declarations): + for prop, value in declarations: + attr = "expand_" + prop.replace("-", "_") + try: + expand = getattr(self, attr) + except AttributeError: + yield prop, value + else: + for prop, value in expand(prop, value): + yield prop, value + + SIDE_SHORTHANDS = { + 1: [0, 0, 0, 0], + 2: [0, 1, 0, 1], + 3: [0, 1, 2, 1], + 4: [0, 1, 2, 3], + } + SIDES = ("top", "right", "bottom", "left") + + expand_border_color = _side_expander("border-{:s}-color") + expand_border_style = _side_expander("border-{:s}-style") + expand_border_width = _side_expander("border-{:s}-width") + expand_margin = _side_expander("margin-{:s}") + expand_padding = _side_expander("padding-{:s}") + + def parse(self, declarations_str: str): + """ + Generates (prop, value) pairs from declarations. + + In a future version may generate parsed tokens from tinycss/tinycss2 + + Parameters + ---------- + declarations_str : str + """ + for decl in declarations_str.split(";"): + if not decl.strip(): + continue + prop, sep, val = decl.partition(":") + prop = prop.strip().lower() + # TODO: don't lowercase case sensitive parts of values (strings) + val = val.strip().lower() + if sep: + yield prop, val + else: + warnings.warn( + f"Ill-formatted attribute: expected a colon in {repr(decl)}", + CSSWarning, + ) diff --git a/venv/Lib/site-packages/pandas/io/formats/csvs.py b/venv/Lib/site-packages/pandas/io/formats/csvs.py new file mode 100644 index 0000000..0d581f3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/csvs.py @@ -0,0 +1,354 @@ +""" +Module for formatting output data into CSV files. 
+""" + +import csv as csvlib +from io import StringIO +import os +from typing import Hashable, List, Mapping, Optional, Sequence, Union +import warnings +from zipfile import ZipFile + +import numpy as np + +from pandas._libs import writers as libwriters +from pandas._typing import FilePathOrBuffer + +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, +) +from pandas.core.dtypes.missing import notna + +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + infer_compression, +) + + +class CSVFormatter: + def __init__( + self, + obj, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + cols=None, + header: Union[bool, Sequence[Hashable]] = True, + index: bool = True, + index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Union[str, Mapping[str, str], None] = "infer", + quoting: Optional[int] = None, + line_terminator="\n", + chunksize: Optional[int] = None, + quotechar='"', + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, + decimal=".", + ): + self.obj = obj + + if path_or_buf is None: + path_or_buf = StringIO() + + # Extract compression mode as given, if dict + compression, self.compression_args = get_compression_method(compression) + + self.path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, encoding=encoding, compression=compression, mode=mode + ) + self.sep = sep + self.na_rep = na_rep + self.float_format = float_format + self.decimal = decimal + + self.header = header + self.index = index + self.index_label = index_label + self.mode = mode + if encoding is None: + encoding = "utf-8" + self.encoding = encoding + self.compression = infer_compression(self.path_or_buf, compression) + + if quoting is None: + quoting = csvlib.QUOTE_MINIMAL + self.quoting = quoting + + if quoting == csvlib.QUOTE_NONE: + # prevents crash in _csv + quotechar = None + self.quotechar = quotechar + + self.doublequote = doublequote + self.escapechar = escapechar + + self.line_terminator = line_terminator or os.linesep + + self.date_format = date_format + + self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) + + # validate mi options + if self.has_mi_columns: + if cols is not None: + raise TypeError("cannot specify cols with a MultiIndex on the columns") + + if cols is not None: + if isinstance(cols, ABCIndexClass): + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) + else: + cols = list(cols) + self.obj = self.obj.loc[:, cols] + + # update columns to include possible multiplicity of dupes + # and make sure sure cols is just a list of labels + cols = self.obj.columns + if isinstance(cols, ABCIndexClass): + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) + else: + cols = list(cols) + + # save it + self.cols = cols + + # preallocate data 2d list + self.blocks = self.obj._data.blocks + ncols = sum(b.shape[0] for b in self.blocks) + self.data = [None] * ncols + + if chunksize is None: + chunksize = (100000 // (len(self.cols) or 1)) or 1 + self.chunksize = int(chunksize) + + self.data_index = obj.index + if ( + isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and date_format is not None + ): + from pandas 
import Index + + self.data_index = Index( + [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + ) + + self.nlevels = getattr(self.data_index, "nlevels", 1) + if not index: + self.nlevels = 0 + + def save(self) -> None: + """ + Create the writer & save. + """ + # GH21227 internal compression is not used when file-like passed. + if self.compression and hasattr(self.path_or_buf, "write"): + warnings.warn( + "compression has no effect when passing file-like object as input.", + RuntimeWarning, + stacklevel=2, + ) + + # when zip compression is called. + is_zip = isinstance(self.path_or_buf, ZipFile) or ( + not hasattr(self.path_or_buf, "write") and self.compression == "zip" + ) + + if is_zip: + # zipfile doesn't support writing string to archive. uses string + # buffer to receive csv writing and dump into zip compression + # file handle. GH21241, GH21118 + f = StringIO() + close = False + elif hasattr(self.path_or_buf, "write"): + f = self.path_or_buf + close = False + else: + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=dict(self.compression_args, method=self.compression), + ) + close = True + + try: + # Note: self.encoding is irrelevant here + self.writer = csvlib.writer( + f, + lineterminator=self.line_terminator, + delimiter=self.sep, + quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, + quotechar=self.quotechar, + ) + + self._save() + + finally: + if is_zip: + # GH17778 handles zip compression separately. + buf = f.getvalue() + if hasattr(self.path_or_buf, "write"): + self.path_or_buf.write(buf) + else: + compression = dict(self.compression_args, method=self.compression) + + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=compression, + ) + f.write(buf) + close = True + if close: + f.close() + for _fh in handles: + _fh.close() + + def _save_header(self): + writer = self.writer + obj = self.obj + index_label = self.index_label + cols = self.cols + has_mi_columns = self.has_mi_columns + header = self.header + encoded_labels: List[str] = [] + + has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) + if not (has_aliases or self.header): + return + if has_aliases: + if len(header) != len(cols): + raise ValueError( + f"Writing {len(cols)} cols but got {len(header)} aliases" + ) + else: + write_cols = header + else: + write_cols = cols + + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, ABCMultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = "" + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [""] + else: + index_label = [index_label] + elif not isinstance( + index_label, (list, tuple, np.ndarray, ABCIndexClass) + ): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + + if not has_mi_columns or has_aliases: + encoded_labels += list(write_cols) + writer.writerow(encoded_labels) + else: + # write out the mi + columns = obj.columns + + # write out the names for each level, then ALL of the values for + # each level + for i in range(columns.nlevels): + + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(index_label, 
list) and len(index_label) > 1: + col_line.extend([""] * (len(index_label) - 1)) + + col_line.extend(columns._get_level_values(i)) + + writer.writerow(col_line) + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if encoded_labels and set(encoded_labels) != {""}: + encoded_labels.extend([""] * len(columns)) + writer.writerow(encoded_labels) + + def _save(self) -> None: + self._save_header() + + nrows = len(self.data_index) + + # write in chunksize bites + chunksize = self.chunksize + chunks = int(nrows / chunksize) + 1 + + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self._save_chunk(start_i, end_i) + + def _save_chunk(self, start_i: int, end_i: int) -> None: + data_index = self.data_index + + # create the data for a chunk + slicer = slice(start_i, end_i) + for i in range(len(self.blocks)): + b = self.blocks[i] + d = b.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) + + for col_loc, col in zip(b.mgr_locs, d): + # self.data is a preallocated list + self.data[col_loc] = col + + ix = data_index.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) + + libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) diff --git a/venv/Lib/site-packages/pandas/io/formats/excel.py b/venv/Lib/site-packages/pandas/io/formats/excel.py new file mode 100644 index 0000000..9b0f100 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/excel.py @@ -0,0 +1,738 @@ +"""Utilities for conversion to writer-agnostic Excel representation +""" + +from functools import reduce +import itertools +import re +from typing import Callable, Dict, List, Optional, Sequence, Union +import warnings + +import numpy as np + +from pandas.core.dtypes import missing +from pandas.core.dtypes.common import is_float, is_scalar +from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex + +from pandas import Index +import pandas.core.common as com + +from pandas.io.common import stringify_path +from pandas.io.formats.css import CSSResolver, CSSWarning +from pandas.io.formats.format import get_level_lengths +from pandas.io.formats.printing import pprint_thing + + +class ExcelCell: + __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") + __slots__ = __fields__ + + def __init__( + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + ): + self.row = row + self.col = col + self.val = val + self.style = style + self.mergestart = mergestart + self.mergeend = mergeend + + +class CSSToExcelConverter: + """A callable for converting CSS declarations to ExcelWriter styles + + Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow), + focusing on font styling, backgrounds, borders and alignment. + + Operates by first computing CSS styles in a fairly generic + way (see :meth:`compute_css`) then determining Excel style + properties from CSS properties (see :meth:`build_xlstyle`). + + Parameters + ---------- + inherited : str, optional + CSS declarations understood to be the containing scope for the + CSS processed by :meth:`__call__`. 
+ """ + + # NB: Most of the methods here could be classmethods, as only __init__ + # and __call__ make use of instance attributes. We leave them as + # instancemethods so that users can easily experiment with extensions + # without monkey-patching. + + def __init__(self, inherited: Optional[str] = None): + if inherited is not None: + inherited = self.compute_css(inherited) + + self.inherited = inherited + + compute_css = CSSResolver() + + def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: + """ + Convert CSS declarations to ExcelWriter style. + + Parameters + ---------- + declarations_str : str + List of CSS declarations. + e.g. "font-weight: bold; background: blue" + + Returns + ------- + xlstyle : dict + A style as interpreted by ExcelWriter when found in + ExcelCell.style. + """ + # TODO: memoize? + properties = self.compute_css(declarations_str, self.inherited) + return self.build_xlstyle(properties) + + def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: + out = { + "alignment": self.build_alignment(props), + "border": self.build_border(props), + "fill": self.build_fill(props), + "font": self.build_font(props), + "number_format": self.build_number_format(props), + } + + # TODO: handle cell width and height: needs support in pandas.io.excel + + def remove_none(d: Dict[str, str]) -> None: + """Remove key where value is None, through nested dicts""" + for k, v in list(d.items()): + if v is None: + del d[k] + elif isinstance(v, dict): + remove_none(v) + if not v: + del d[k] + + remove_none(out) + return out + + VERTICAL_MAP = { + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", + # OpenXML also has 'justify', 'distributed' + } + + def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: + # TODO: text-indent, padding-left -> alignment.indent + return { + "horizontal": props.get("text-align"), + "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), + "wrap_text": ( + None + if props.get("white-space") is None + else props["white-space"] not in ("nowrap", "pre", "pre-line") + ), + } + + def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: + return { + side: { + "style": self._border_style( + props.get(f"border-{side}-style"), + props.get(f"border-{side}-width"), + ), + "color": self.color_to_excel(props.get(f"border-{side}-color")), + } + for side in ["top", "right", "bottom", "left"] + } + + def _border_style(self, style: Optional[str], width): + # convert styles and widths to openxml, one of: + # 'dashDot' + # 'dashDotDot' + # 'dashed' + # 'dotted' + # 'double' + # 'hair' + # 'medium' + # 'mediumDashDot' + # 'mediumDashDotDot' + # 'mediumDashed' + # 'slantDashDot' + # 'thick' + # 'thin' + if width is None and style is None: + return None + if style == "none" or style == "hidden": + return None + + if width is None: + width = "2pt" + width = float(width[:-2]) + if width < 1e-5: + return None + elif width < 1.3: + width_name = "thin" + elif width < 2.8: + width_name = "medium" + else: + width_name = "thick" + + if style in (None, "groove", "ridge", "inset", "outset"): + # not handled + style = "solid" + + if style == "double": + return "double" + if style == "solid": + return width_name + if style == "dotted": + if width_name in ("hair", "thin"): + return "dotted" + return "mediumDashDotDot" + if style == "dashed": + if width_name in ("hair", "thin"): + return "dashed" + return "mediumDashed" + + def build_fill(self, props: 
Dict[str, str]): + # TODO: perhaps allow for special properties + # -excel-pattern-bgcolor and -excel-pattern-type + fill_color = props.get("background-color") + if fill_color not in (None, "transparent", "none"): + return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} + + def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: + size = props.get("font-size") + if size is not None: + assert size.endswith("pt") + size = float(size[:-2]) + + font_names_tmp = re.findall( + r"""(?x) + ( + "(?:[^"]|\\")+" + | + '(?:[^']|\\')+' + | + [^'",]+ + )(?=,|\s*$) + """, + props.get("font-family", ""), + ) + font_names = [] + for name in font_names_tmp: + if name[:1] == '"': + name = name[1:-1].replace('\\"', '"') + elif name[:1] == "'": + name = name[1:-1].replace("\\'", "'") + else: + name = name.strip() + if name: + font_names.append(name) + + family = None + for name in font_names: + if name == "serif": + family = 1 # roman + break + elif name == "sans-serif": + family = 2 # swiss + break + elif name == "cursive": + family = 4 # script + break + elif name == "fantasy": + family = 5 # decorative + break + + decoration = props.get("text-decoration") + if decoration is not None: + decoration = decoration.split() + else: + decoration = () + + return { + "name": font_names[0] if font_names else None, + "family": family, + "size": size, + "bold": self.BOLD_MAP.get(props.get("font-weight")), + "italic": self.ITALIC_MAP.get(props.get("font-style")), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), + # shadow if nonzero digit before shadow color + "shadow": ( + bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + if "text-shadow" in props + else None + ), + # FIXME: dont leave commented-out + # 'vertAlign':, + # 'charset': , + # 'scheme': , + # 'outline': , + # 'condense': , + } + + NAMED_COLORS = { + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", + } + + def color_to_excel(self, val: Optional[str]): + if val is None: + return None + if val.startswith("#") and len(val) == 7: + return val[1:].upper() + if val.startswith("#") and len(val) == 4: + return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() + try: + return self.NAMED_COLORS[val] + except KeyError: + warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) + + def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: + return {"format_code": props.get("number-format")} + + +class ExcelFormatter: + """ + Class for formatting a DataFrame to a list of ExcelCells, + + Parameters + ---------- + df : DataFrame or Styler + na_rep: na representation + float_format : string, default None + Format string for floating point numbers + cols : sequence, optional + Columns to write + header : boolean or list of string, default 
True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + output row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + merge_cells : boolean, default False + Format MultiIndex and Hierarchical Rows as merged cells. + inf_rep : string, default `'inf'` + representation for np.inf values (which aren't representable in Excel) + A `'-'` sign will be added in front of -inf. + style_converter : callable, optional + This translates Styler styles (CSS) into ExcelWriter styles. + Defaults to ``CSSToExcelConverter()``. + It should have signature css_declarations string -> excel style. + This is only called for body cells. + """ + + max_rows = 2 ** 20 + max_cols = 2 ** 14 + + def __init__( + self, + df, + na_rep: str = "", + float_format: Optional[str] = None, + cols: Optional[Sequence] = None, + header: Union[bool, List[str]] = True, + index: bool = True, + index_label: Union[str, Sequence, None] = None, + merge_cells: bool = False, + inf_rep: str = "inf", + style_converter: Optional[Callable] = None, + ): + self.rowcounter = 0 + self.na_rep = na_rep + if hasattr(df, "render"): + self.styler = df + df = df.data + if style_converter is None: + style_converter = CSSToExcelConverter() + self.style_converter = style_converter + else: + self.styler = None + self.df = df + if cols is not None: + + # all missing, raise + if not len(Index(cols) & df.columns): + raise KeyError("passes columns are not ALL present dataframe") + + if len(Index(cols) & df.columns) != len(cols): + # Deprecated in GH#17295, enforced in 1.0.0 + raise KeyError("Not all names specified in 'columns' are found") + + self.df = df + + self.columns = self.df.columns + self.float_format = float_format + self.index = index + self.index_label = index_label + self.header = header + self.merge_cells = merge_cells + self.inf_rep = inf_rep + + @property + def header_style(self): + return { + "font": {"bold": True}, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } + + def _format_value(self, val): + if is_scalar(val) and missing.isna(val): + val = self.na_rep + elif is_float(val): + if missing.isposinf_scalar(val): + val = self.inf_rep + elif missing.isneginf_scalar(val): + val = f"-{self.inf_rep}" + elif self.float_format is not None: + val = float(self.float_format % val) + if getattr(val, "tzinfo", None) is not None: + raise ValueError( + "Excel does not support datetimes with " + "timezones. Please ensure that datetimes " + "are timezone unaware before writing to Excel." + ) + return val + + def _format_header_mi(self): + if self.columns.nlevels > 1: + if not self.index: + raise NotImplementedError( + "Writing to Excel with MultiIndex columns and no " + "index ('index'=False) is not yet implemented." 
+ ) + + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) + if not (has_aliases or self.header): + return + + columns = self.columns + level_strs = columns.format( + sparsify=self.merge_cells, adjoin=False, names=False + ) + level_lengths = get_level_lengths(level_strs) + coloffset = 0 + lnum = 0 + + if self.index and isinstance(self.df.index, ABCMultiIndex): + coloffset = len(self.df.index[0]) - 1 + + if self.merge_cells: + # Format multi-index as a merged cells. + for lnum in range(len(level_lengths)): + name = columns.names[lnum] + yield ExcelCell(lnum, coloffset, name, self.header_style) + + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): + values = levels.take(level_codes) + for i in spans: + if spans[i] > 1: + yield ExcelCell( + lnum, + coloffset + i + 1, + values[i], + self.header_style, + lnum, + coloffset + i + spans[i], + ) + else: + yield ExcelCell( + lnum, coloffset + i + 1, values[i], self.header_style + ) + else: + # Format in legacy format with dots to indicate levels. + for i, values in enumerate(zip(*level_strs)): + v = ".".join(map(pprint_thing, values)) + yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style) + + self.rowcounter = lnum + + def _format_header_regular(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) + if has_aliases or self.header: + coloffset = 0 + + if self.index: + coloffset = 1 + if isinstance(self.df.index, ABCMultiIndex): + coloffset = len(self.df.index[0]) + + colnames = self.columns + if has_aliases: + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols but got {len(self.header)} " + "aliases" + ) + else: + colnames = self.header + + for colindex, colname in enumerate(colnames): + yield ExcelCell( + self.rowcounter, colindex + coloffset, colname, self.header_style + ) + + def _format_header(self): + if isinstance(self.columns, ABCMultiIndex): + gen = self._format_header_mi() + else: + gen = self._format_header_regular() + + gen2 = () + if self.df.index.names: + row = [x if x is not None else "" for x in self.df.index.names] + [ + "" + ] * len(self.columns) + if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): + gen2 = ( + ExcelCell(self.rowcounter, colindex, val, self.header_style) + for colindex, val in enumerate(row) + ) + self.rowcounter += 1 + return itertools.chain(gen, gen2) + + def _format_body(self): + if isinstance(self.df.index, ABCMultiIndex): + return self._format_hierarchical_rows() + else: + return self._format_regular_rows() + + def _format_regular_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) + if has_aliases or self.header: + self.rowcounter += 1 + + # output index and index_label? 
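ExcelFormatter is internal API and subject to change, but a quick sketch of the cell stream it produces for a plain frame helps connect it to the write_cells methods of the engines above.

import pandas as pd
from pandas.io.formats.excel import ExcelFormatter

df = pd.DataFrame({"a": [1, 2]})
cells = list(ExcelFormatter(df).get_formatted_cells())
# The first cell is the column label "a" at (row=0, col=1) carrying
# header_style; the index values and data follow as further ExcelCell objects.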
+ if self.index: + # check aliases + # if list only take first as this is not a MultiIndex + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): + index_label = self.index_label[0] + # if string good to go + elif self.index_label and isinstance(self.index_label, str): + index_label = self.index_label + else: + index_label = self.df.index.names[0] + + if isinstance(self.columns, ABCMultiIndex): + self.rowcounter += 1 + + if index_label and self.header is not False: + yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style) + + # write index_values + index_values = self.df.index + if isinstance(self.df.index, ABCPeriodIndex): + index_values = self.df.index.to_timestamp() + + for idx, idxval in enumerate(index_values): + yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style) + + coloffset = 1 + else: + coloffset = 0 + + for cell in self._generate_body(coloffset): + yield cell + + def _format_hierarchical_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) + if has_aliases or self.header: + self.rowcounter += 1 + + gcolidx = 0 + + if self.index: + index_labels = self.df.index.names + # check for aliases + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): + index_labels = self.index_label + + # MultiIndex columns require an extra row + # with index names (blank if None) for + # unambiguous round-trip, unless not merging, + # in which case the names all go on one row Issue #11328 + if isinstance(self.columns, ABCMultiIndex) and self.merge_cells: + self.rowcounter += 1 + + # if index labels are not empty go ahead and dump + if com.any_not_none(*index_labels) and self.header is not False: + + for cidx, name in enumerate(index_labels): + yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style) + + if self.merge_cells: + # Format hierarchical rows as merged cells. + level_strs = self.df.index.format( + sparsify=True, adjoin=False, names=False + ) + level_lengths = get_level_lengths(level_strs) + + for spans, levels, level_codes in zip( + level_lengths, self.df.index.levels, self.df.index.codes + ): + + values = levels.take( + level_codes, allow_fill=levels._can_hold_na, fill_value=True + ) + + for i in spans: + if spans[i] > 1: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx, + ) + else: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + ) + gcolidx += 1 + + else: + # Format hierarchical rows with non-merged values. + for indexcolvals in zip(*self.df.index): + for idx, indexcolval in enumerate(indexcolvals): + yield ExcelCell( + self.rowcounter + idx, + gcolidx, + indexcolval, + self.header_style, + ) + gcolidx += 1 + + for cell in self._generate_body(gcolidx): + yield cell + + def _generate_body(self, coloffset: int): + if self.styler is None: + styles = None + else: + styles = self.styler._compute().ctx + if not styles: + styles = None + xlstyle = None + + # Write the body of the frame data series by series. 
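A usage sketch of the hierarchical-row path above, with an illustrative output file name: merge_cells=True, the to_excel default, emits repeated outer index labels as merged ranges, while merge_cells=False writes one label per row.

import pandas as pd

idx = pd.MultiIndex.from_product([["x", "y"], [1, 2]], names=["grp", "n"])
df = pd.DataFrame({"val": range(4)}, index=idx)
df.to_excel("mi.xlsx", merge_cells=True)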
+ for colidx in range(len(self.columns)): + series = self.df.iloc[:, colidx] + for i, val in enumerate(series): + if styles is not None: + xlstyle = self.style_converter(";".join(styles[i, colidx])) + yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) + + def get_formatted_cells(self): + for cell in itertools.chain(self._format_header(), self._format_body()): + cell.val = self._format_value(cell.val) + yield cell + + def write( + self, + writer, + sheet_name="Sheet1", + startrow=0, + startcol=0, + freeze_panes=None, + engine=None, + ): + """ + writer : string or ExcelWriter object + File path or existing ExcelWriter + sheet_name : string, default 'Sheet1' + Name of sheet which will contain DataFrame + startrow : + upper left cell row to dump data frame + startcol : + upper left cell column to dump data frame + freeze_panes : tuple of integer (length 2), default None + Specifies the one-based bottommost row and rightmost column that + is to be frozen + engine : string, default None + write engine to use if writer is a path - you can also set this + via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, + and ``io.excel.xlsm.writer``. + """ + from pandas.io.excel import ExcelWriter + + num_rows, num_cols = self.df.shape + if num_rows > self.max_rows or num_cols > self.max_cols: + raise ValueError( + f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} " + f"Max sheet size is: {self.max_rows}, {self.max_cols}" + ) + + if isinstance(writer, ExcelWriter): + need_save = False + else: + writer = ExcelWriter(stringify_path(writer), engine=engine) + need_save = True + + formatted_cells = self.get_formatted_cells() + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + if need_save: + writer.save() diff --git a/venv/Lib/site-packages/pandas/io/formats/format.py b/venv/Lib/site-packages/pandas/io/formats/format.py new file mode 100644 index 0000000..6adf69a --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/format.py @@ -0,0 +1,1994 @@ +""" +Internal module for formatting output data in csv, html, +and latex files. This module also applies to display formatting. 
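For reference, the ExcelFormatter.write entry point documented above is what DataFrame.to_excel ultimately drives; a minimal sketch of the public call path (the sheet name, offsets and frozen panes below are illustrative, and an xlsx engine such as openpyxl is assumed to be installed):

import pandas as pd

df = pd.DataFrame({"confirm": [41, 62, 121], "heal": [2, 5, 8]})
with pd.ExcelWriter("ncov_report.xlsx") as writer:
    # startrow/startcol/freeze_panes are forwarded down to ExcelFormatter.write()
    df.to_excel(writer, sheet_name="Sheet1", startrow=1, startcol=0, freeze_panes=(2, 1))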
+""" + +from contextlib import contextmanager +from datetime import tzinfo +import decimal +from functools import partial +from io import StringIO +import math +import re +from shutil import get_terminal_size +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) +from unicodedata import east_asian_width + +import numpy as np + +from pandas._config.config import get_option, set_option + +from pandas._libs import lib +from pandas._libs.missing import NA +from pandas._libs.tslib import format_array_from_datetime +from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT +from pandas._libs.tslibs.nattype import NaTType +from pandas._typing import FilePathOrBuffer +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ( + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCSparseArray, +) +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.timedeltas import TimedeltaArray +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.indexes.api import Index, ensure_index +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + +from pandas.io.common import stringify_path +from pandas.io.formats.printing import adjoin, justify, pprint_thing + +if TYPE_CHECKING: + from pandas import Series, DataFrame, Categorical + +formatters_type = Union[ + List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] +] +float_format_type = Union[str, Callable, "EngFormatter"] + +common_docstring = """ + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + columns : sequence, optional, default None + The subset of columns to write. Writes all columns by default. + col_space : %(col_space_type)s, optional + %(col_space)s. + header : %(header_type)s, optional + %(header)s. + index : bool, optional, default True + Whether to print index (row) labels. + na_rep : str, optional, default 'NaN' + String representation of NAN to use. + formatters : list, tuple or dict of one-param. functions, optional + Formatter functions to apply to columns' elements by position or + name. + The result of each function must be a unicode string. + List/tuple must be of length equal to the number of columns. + float_format : one-parameter function, optional, default None + Formatter function to apply to columns' elements if they are + floats. The result of this function must be a unicode string. + sparsify : bool, optional, default True + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. + index_names : bool, optional, default True + Prints the names of the indexes. + justify : str, default None + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. 
Valid values are + + * left + * right + * center + * justify + * justify-all + * start + * end + * inherit + * match-parent + * initial + * unset. + max_rows : int, optional + Maximum number of rows to display in the console. + min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_cols : int, optional + Maximum number of columns to display in the console. + show_dimensions : bool, default False + Display DataFrame dimensions (number of rows by number of columns). + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + """ + +_VALID_JUSTIFY_PARAMETERS = ( + "left", + "right", + "center", + "justify", + "justify-all", + "start", + "end", + "inherit", + "match-parent", + "initial", + "unset", +) + +return_docstring = """ + Returns + ------- + str or None + If buf is None, returns the result as a string. Otherwise returns + None. + """ + + +class CategoricalFormatter: + def __init__( + self, + categorical: "Categorical", + buf: Optional[IO[str]] = None, + length: bool = True, + na_rep: str = "NaN", + footer: bool = True, + ): + self.categorical = categorical + self.buf = buf if buf is not None else StringIO("") + self.na_rep = na_rep + self.length = length + self.footer = footer + + def _get_footer(self) -> str: + footer = "" + + if self.length: + if footer: + footer += ", " + footer += "Length: {length}".format(length=len(self.categorical)) + + level_info = self.categorical._repr_categories_info() + + # Levels are added in a newline + if footer: + footer += "\n" + footer += level_info + + return str(footer) + + def _get_formatted_values(self) -> List[str]: + return format_array( + self.categorical._internal_get_values(), + None, + float_format=None, + na_rep=self.na_rep, + ) + + def to_string(self) -> str: + categorical = self.categorical + + if len(categorical) == 0: + if self.footer: + return self._get_footer() + else: + return "" + + fmt_values = self._get_formatted_values() + + fmt_values = ["{i}".format(i=i) for i in fmt_values] + fmt_values = [i.strip() for i in fmt_values] + values = ", ".join(fmt_values) + result = ["[" + values + "]"] + if self.footer: + footer = self._get_footer() + if footer: + result.append(footer) + + return str("\n".join(result)) + + +class SeriesFormatter: + def __init__( + self, + series: "Series", + buf: Optional[IO[str]] = None, + length: Union[bool, str] = True, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + name: bool = False, + float_format: Optional[str] = None, + dtype: bool = True, + max_rows: Optional[int] = None, + min_rows: Optional[int] = None, + ): + self.series = series + self.buf = buf if buf is not None else StringIO() + self.name = name + self.na_rep = na_rep + self.header = header + self.length = length + self.index = index + self.max_rows = max_rows + self.min_rows = min_rows + + if float_format is None: + float_format = get_option("display.float_format") + self.float_format = float_format + self.dtype = dtype + self.adj = _get_adjustment() + + self._chk_truncate() + + def _chk_truncate(self) -> None: + from pandas.core.reshape.concat import concat + + self.tr_row_num: Optional[int] + + min_rows = self.min_rows + max_rows = self.max_rows + # truncation determined by max_rows, actual truncated number of rows + # used below by min_rows + truncate_v = max_rows and (len(self.series) > max_rows) + series = self.series + if truncate_v: + max_rows = cast(int, max_rows) + if min_rows: + # if min_rows 
is set (not None or 0), set max_rows to minimum + # of both + max_rows = min(min_rows, max_rows) + if max_rows == 1: + row_num = max_rows + series = series.iloc[:max_rows] + else: + row_num = max_rows // 2 + series = series._ensure_type( + concat((series.iloc[:row_num], series.iloc[-row_num:])) + ) + self.tr_row_num = row_num + else: + self.tr_row_num = None + self.tr_series = series + self.truncate_v = truncate_v + + def _get_footer(self) -> str: + name = self.series.name + footer = "" + + if getattr(self.series.index, "freq", None) is not None: + footer += "Freq: {freq}".format(freq=self.series.index.freqstr) + + if self.name is not False and name is not None: + if footer: + footer += ", " + + series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n")) + footer += ( + ("Name: {sname}".format(sname=series_name)) if name is not None else "" + ) + + if self.length is True or (self.length == "truncate" and self.truncate_v): + if footer: + footer += ", " + footer += "Length: {length}".format(length=len(self.series)) + + if self.dtype is not False and self.dtype is not None: + name = getattr(self.tr_series.dtype, "name", None) + if name: + if footer: + footer += ", " + footer += "dtype: {typ}".format(typ=pprint_thing(name)) + + # level infos are added to the end and in a new line, like it is done + # for Categoricals + if is_categorical_dtype(self.tr_series.dtype): + level_info = self.tr_series._values._repr_categories_info() + if footer: + footer += "\n" + footer += level_info + + return str(footer) + + def _get_formatted_index(self) -> Tuple[List[str], bool]: + index = self.tr_series.index + is_multi = isinstance(index, ABCMultiIndex) + + if is_multi: + have_header = any(name for name in index.names) + fmt_index = index.format(names=True) + else: + have_header = index.name is not None + fmt_index = index.format(name=True) + return fmt_index, have_header + + def _get_formatted_values(self) -> List[str]: + return format_array( + self.tr_series._values, + None, + float_format=self.float_format, + na_rep=self.na_rep, + ) + + def to_string(self) -> str: + series = self.tr_series + footer = self._get_footer() + + if len(series) == 0: + return "{name}([], {footer})".format( + name=type(self.series).__name__, footer=footer + ) + + fmt_index, have_header = self._get_formatted_index() + fmt_values = self._get_formatted_values() + + if self.truncate_v: + n_header_rows = 0 + row_num = self.tr_row_num + row_num = cast(int, row_num) + width = self.adj.len(fmt_values[row_num - 1]) + if width > 3: + dot_str = "..." + else: + dot_str = ".." 
+ # Series uses mode=center because it has single value columns + # DataFrame uses mode=left + dot_str = self.adj.justify([dot_str], width, mode="center")[0] + fmt_values.insert(row_num + n_header_rows, dot_str) + fmt_index.insert(row_num + 1, "") + + if self.index: + result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values]) + else: + result = self.adj.adjoin(3, fmt_values) + + if self.header and have_header: + result = fmt_index[0] + "\n" + result + + if footer: + result += "\n" + footer + + return str("".join(result)) + + +class TextAdjustment: + def __init__(self): + self.encoding = get_option("display.encoding") + + def len(self, text: str) -> int: + return len(text) + + def justify(self, texts: Any, max_len: int, mode: str = "right") -> List[str]: + return justify(texts, max_len, mode=mode) + + def adjoin(self, space: int, *lists, **kwargs) -> str: + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) + + +class EastAsianTextAdjustment(TextAdjustment): + def __init__(self): + super().__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + # Definition of East Asian Width + # http://unicode.org/reports/tr11/ + # Ambiguous width can be changed by option + self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} + + def len(self, text: str) -> int: + """ + Calculate display width considering unicode East Asian Width + """ + if not isinstance(text, str): + return len(text) + + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) + + def justify( + self, texts: Iterable[str], max_len: int, mode: str = "right" + ) -> List[str]: + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == "left": + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == "center": + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] + + +def _get_adjustment() -> TextAdjustment: + use_east_asian_width = get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return EastAsianTextAdjustment() + else: + return TextAdjustment() + + +class TableFormatter: + + show_dimensions: Union[bool, str] + is_truncated: bool + formatters: formatters_type + columns: Index + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) + + @contextmanager + def get_buffer( + self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None + ): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. 
+ """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + + def write_result(self, buf: IO[str]) -> None: + """ + Write the result of serialization to buf. + """ + raise AbstractMethodError(self) + + def get_result( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with self.get_buffer(buf, encoding=encoding) as f: + self.write_result(buf=f) + if buf is None: + return f.getvalue() + return None + + +class DataFrameFormatter(TableFormatter): + """ + Render a DataFrame + + self.to_string() : console-friendly tabular output + self.to_html() : html table + self.to_latex() : LaTeX tabular environment table + + """ + + __doc__ = __doc__ if __doc__ else "" + __doc__ += common_docstring + return_docstring + + def __init__( + self, + frame: "DataFrame", + columns: Optional[Sequence[str]] = None, + col_space: Optional[Union[str, int]] = None, + header: Union[bool, Sequence[str]] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: Optional[formatters_type] = None, + justify: Optional[str] = None, + float_format: Optional[float_format_type] = None, + sparsify: Optional[bool] = None, + index_names: bool = True, + line_width: Optional[int] = None, + max_rows: Optional[int] = None, + min_rows: Optional[int] = None, + max_cols: Optional[int] = None, + show_dimensions: Union[bool, str] = False, + decimal: str = ".", + table_id: Optional[str] = None, + render_links: bool = False, + bold_rows: bool = False, + escape: bool = True, + ): + self.frame = frame + self.show_index_names = index_names + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + self.sparsify = sparsify + + self.float_format = float_format + if formatters is None: + self.formatters = {} + elif len(frame.columns) == len(formatters) or isinstance(formatters, dict): + self.formatters = formatters + else: + raise ValueError( + ( + "Formatters length({flen}) should match " + "DataFrame number of columns({dlen})" + ).format(flen=len(formatters), dlen=len(frame.columns)) + ) + self.na_rep = na_rep + self.decimal = decimal + self.col_space = col_space + self.header = header + self.index = index + self.line_width = line_width + self.max_rows = max_rows + self.min_rows = min_rows + self.max_cols = max_cols + self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) + self.show_dimensions = show_dimensions + self.table_id = table_id + self.render_links = render_links + + if justify is None: + self.justify = get_option("display.colheader_justify") + else: + self.justify = justify + + self.bold_rows = bold_rows + self.escape = escape + + if columns is not None: + self.columns = ensure_index(columns) + self.frame = self.frame[self.columns] + else: + self.columns = frame.columns + + self._chk_truncate() + self.adj = _get_adjustment() + + def _chk_truncate(self) -> None: + """ 
+ Checks whether the frame should be truncated. If so, slices + the frame up. + """ + from pandas.core.reshape.concat import concat + + # Cut the data to the information actually printed + max_cols = self.max_cols + max_rows = self.max_rows + self.max_rows_adj: Optional[int] + max_rows_adj: Optional[int] + + if max_cols == 0 or max_rows == 0: # assume we are in the terminal + (w, h) = get_terminal_size() + self.w = w + self.h = h + if self.max_rows == 0: + dot_row = 1 + prompt_row = 1 + if self.show_dimensions: + show_dimension_rows = 3 + # assume we only get here if self.header is boolean. + # i.e. not to_latex() where self.header may be List[str] + self.header = cast(bool, self.header) + n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row + # rows available to fill with actual data + max_rows_adj = self.h - n_add_rows + self.max_rows_adj = max_rows_adj + + # Format only rows and columns that could potentially fit the + # screen + if max_cols == 0 and len(self.frame.columns) > w: + max_cols = w + if max_rows == 0 and len(self.frame) > h: + max_rows = h + + if not hasattr(self, "max_rows_adj"): + if max_rows: + if (len(self.frame) > max_rows) and self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) + self.max_rows_adj = max_rows + if not hasattr(self, "max_cols_adj"): + self.max_cols_adj = max_cols + + max_cols_adj = self.max_cols_adj + max_rows_adj = self.max_rows_adj + + truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj) + truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj) + + frame = self.frame + if truncate_h: + # cast here since if truncate_h is True, max_cols_adj is not None + max_cols_adj = cast(int, max_cols_adj) + if max_cols_adj == 0: + col_num = len(frame.columns) + elif max_cols_adj == 1: + max_cols = cast(int, max_cols) + frame = frame.iloc[:, :max_cols] + col_num = max_cols + else: + col_num = max_cols_adj // 2 + frame = concat( + (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 + ) + # truncate formatter + if isinstance(self.formatters, (list, tuple)): + truncate_fmt = self.formatters + self.formatters = [ + *truncate_fmt[:col_num], + *truncate_fmt[-col_num:], + ] + self.tr_col_num = col_num + if truncate_v: + # cast here since if truncate_v is True, max_rows_adj is not None + max_rows_adj = cast(int, max_rows_adj) + if max_rows_adj == 1: + row_num = max_rows + frame = frame.iloc[:max_rows, :] + else: + row_num = max_rows_adj // 2 + frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) + self.tr_row_num = row_num + else: + self.tr_row_num = None + + self.tr_frame = frame + self.truncate_h = truncate_h + self.truncate_v = truncate_v + self.is_truncated = bool(self.truncate_h or self.truncate_v) + + def _to_str_columns(self) -> List[List[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). 
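The _chk_truncate logic above is what produces the "..." marker row and column in a truncated repr; the effect can be seen through the public API (a sketch with arbitrary sample data):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(200).reshape(20, 10))
# Only the head and tail rows/columns survive; the middle is replaced by a "..." marker.
print(df.to_string(max_rows=6, max_cols=4))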
+ """ + # this method is not used by to_html where self.col_space + # could be a string so safe to cast + self.col_space = cast(int, self.col_space) + + frame = self.tr_frame + # may include levels names also + + str_index = self._get_formatted_index(frame) + + if not is_list_like(self.header) and not self.header: + stringified = [] + for i, c in enumerate(frame): + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width( + fmt_values, + self.justify, + minimum=(self.col_space or 0), + adj=self.adj, + ) + stringified.append(fmt_values) + else: + if is_list_like(self.header): + # cast here since can't be bool if is_list_like + self.header = cast(List[str], self.header) + if len(self.header) != len(self.columns): + raise ValueError( + ( + "Writing {ncols} cols but got {nalias} " + "aliases".format( + ncols=len(self.columns), nalias=len(self.header) + ) + ) + ) + str_columns = [[label] for label in self.header] + else: + str_columns = self._get_formatted_column_labels(frame) + + if self.show_row_idx_names: + for x in str_columns: + x.append("") + + stringified = [] + for i, c in enumerate(frame): + cheader = str_columns[i] + header_colwidth = max( + self.col_space or 0, *(self.adj.len(x) for x in cheader) + ) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) + + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) + cheader = self.adj.justify(cheader, max_len, mode=self.justify) + stringified.append(cheader + fmt_values) + + strcols = stringified + if self.index: + strcols.insert(0, str_index) + + # Add ... to signal truncated + truncate_h = self.truncate_h + truncate_v = self.truncate_v + + if truncate_h: + col_num = self.tr_col_num + strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) + if truncate_v: + n_header_rows = len(str_index) - len(frame) + row_num = self.tr_row_num + # cast here since if truncate_v is True, self.tr_row_num is not None + row_num = cast(int, row_num) + for ix, col in enumerate(strcols): + # infer from above row + cwidth = self.adj.len(strcols[ix][row_num]) + is_dot_col = False + if truncate_h: + is_dot_col = ix == col_num + 1 + if cwidth > 3 or is_dot_col: + my_str = "..." + else: + my_str = ".." + + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] + strcols[ix].insert(row_num + n_header_rows, dot_str) + return strcols + + def write_result(self, buf: IO[str]) -> None: + """ + Render a DataFrame to a console-friendly tabular output. + """ + from pandas import Series + + frame = self.frame + + if len(frame.columns) == 0 or len(frame.index) == 0: + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=pprint_thing(frame.columns), + idx=pprint_thing(frame.index), + ) + text = info_line + else: + + strcols = self._to_str_columns() + if self.line_width is None: # no need to wrap around just print + # the whole frame + text = self.adj.adjoin(1, *strcols) + elif ( + not isinstance(self.max_cols, int) or self.max_cols > 0 + ): # need to wrap around + text = self._join_multiline(*strcols) + else: # max_cols == 0. 
Try to fit frame to terminal + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + dif = max_len - self.w + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + # subtract index column + max_cols_adj = n_cols - self.index + # GH-21180. Ensure that we print at least two. + max_cols_adj = max(max_cols_adj, 2) + self.max_cols_adj = max_cols_adj + + # Call again _chk_truncate to cut frame appropriately + # and then generate string representation + self._chk_truncate() + strcols = self._to_str_columns() + text = self.adj.adjoin(1, *strcols) + buf.writelines(text) + + if self.should_show_dimensions: + buf.write( + "\n\n[{nrows} rows x {ncols} columns]".format( + nrows=len(frame), ncols=len(frame.columns) + ) + ) + + def _join_multiline(self, *args) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(args) + if self.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.truncate_v: + # cast here since if truncate_v is True, max_rows_adj is not None + self.max_rows_adj = cast(int, self.max_rows_adj) + nrows = self.max_rows_adj + 1 + else: + nrows = len(self.frame) + + str_lst = [] + st = 0 + for i, ed in enumerate(col_bins): + row = strcols[st:ed] + if self.index: + row.insert(0, idx) + if nbins > 1: + if ed <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + st = ed + return "\n\n".join(str_lst) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + return self.get_result(buf=buf, encoding=encoding) + + def to_latex( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + column_format: Optional[str] = None, + longtable: bool = False, + encoding: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. 
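to_latex above simply delegates to LatexFormatter; a hedged usage sketch of the corresponding public DataFrame method (illustrative values; caption and label mirror the keyword arguments shown above):

import pandas as pd

df = pd.DataFrame({"city": ["Wuhan", "Huanggang"], "confirm": [2261, 496]})
print(
    df.to_latex(index=False, column_format="lr", caption="Confirmed cases", label="tab:cases")
)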
+ """ + + from pandas.io.formats.latex import LatexFormatter + + return LatexFormatter( + self, + column_format=column_format, + longtable=longtable, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + ).get_result(buf=buf, encoding=encoding) + + def _format_col(self, i: int) -> List[str]: + frame = self.tr_frame + formatter = self._get_formatter(i) + return format_array( + frame.iloc[:, i]._values, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space, + decimal=self.decimal, + ) + + def to_html( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + classes: Optional[Union[str, List, Tuple]] = None, + notebook: bool = False, + border: Optional[int] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a html table. + + Parameters + ---------- + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
<table>`` tag, in addition to the default "dataframe". + notebook : {True, False}, optional, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``<table>
`` tag. Default ``pd.options.display.html.border``. + """ + from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + + Klass = NotebookFormatter if notebook else HTMLFormatter + return Klass(self, classes=classes, border=border).get_result( + buf=buf, encoding=encoding + ) + + def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: + from pandas.core.indexes.multi import _sparsify + + columns = frame.columns + + if isinstance(columns, ABCMultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = list(zip(*fmt_columns)) + dtypes = self.frame.dtypes._values + + # if we have a Float level, they don't use leading space at all + restrict_formatting = any(l.is_floating for l in columns.levels) + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + + def space_format(x, y): + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y + return y + + str_columns = list( + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) + ) + if self.sparsify and len(str_columns): + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + else: + fmt_columns = columns.format() + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, (col, x) in enumerate(zip(columns, fmt_columns)) + ] + # self.str_columns = str_columns + return str_columns + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + def _get_formatted_index(self, frame: "DataFrame") -> List[str]: + # Note: this is only used by to_string() and to_latex(), not by + # to_html(). so safe to cast col_space here. 
+ self.col_space = cast(int, self.col_space) + index = frame.index + columns = frame.columns + fmt = self._get_formatter("__index__") + + if isinstance(index, ABCMultiIndex): + fmt_index = index.format( + sparsify=self.sparsify, + adjoin=False, + names=self.show_row_idx_names, + formatter=fmt, + ) + else: + fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] + + fmt_index = [ + tuple( + _make_fixed_width( + list(x), justify="left", minimum=(self.col_space or 0), adj=self.adj + ) + ) + for x in fmt_index + ] + + adjoined = self.adj.adjoin(1, *fmt_index).split("\n") + + # empty space for columns + if self.show_col_idx_names: + col_header = ["{x}".format(x=x) for x in self._get_column_name_list()] + else: + col_header = [""] * columns.nlevels + + if self.header: + return col_header + adjoined + else: + return adjoined + + def _get_column_name_list(self) -> List[str]: + names: List[str] = [] + columns = self.frame.columns + if isinstance(columns, ABCMultiIndex): + names.extend("" if name is None else name for name in columns.names) + else: + names.append("" if columns.name is None else columns.name) + return names + + +# ---------------------------------------------------------------------- +# Array formatters + + +def format_array( + values: Any, + formatter: Optional[Callable], + float_format: Optional[float_format_type] = None, + na_rep: str = "NaN", + digits: Optional[int] = None, + space: Optional[Union[str, int]] = None, + justify: str = "right", + decimal: str = ".", + leading_space: Optional[bool] = None, +) -> List[str]: + """ + Format an array for printing. + + Parameters + ---------- + values + formatter + float_format + na_rep + digits + space + justify + decimal + leading_space : bool, optional + Whether the array should be formatted with a leading space. + When an array as a column of a Series or DataFrame, we do want + the leading space to pad between columns. + + When formatting an Index subclass + (e.g. IntervalIndex._format_native_types), we don't want the + leading space since it should be left-aligned. 
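format_array below is the dispatch point all column formatters go through; since it is a private helper its import path may differ between pandas versions, but a direct call looks roughly like this:

import numpy as np
from pandas.io.formats.format import format_array

# Floats get a leading space so signs line up; missing values are replaced by na_rep.
print(format_array(np.array([1.5, np.nan, -2.25]), None, na_rep="NULL"))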
+ + Returns + ------- + List[str] + """ + + fmt_klass: Type[GenericArrayFormatter] + if is_datetime64_dtype(values.dtype): + fmt_klass = Datetime64Formatter + elif is_datetime64tz_dtype(values): + fmt_klass = Datetime64TZFormatter + elif is_timedelta64_dtype(values.dtype): + fmt_klass = Timedelta64Formatter + elif is_extension_array_dtype(values.dtype): + fmt_klass = ExtensionArrayFormatter + elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype): + fmt_klass = FloatArrayFormatter + elif is_integer_dtype(values.dtype): + fmt_klass = IntArrayFormatter + else: + fmt_klass = GenericArrayFormatter + + if space is None: + space = get_option("display.column_space") + + if float_format is None: + float_format = get_option("display.float_format") + + if digits is None: + digits = get_option("display.precision") + + fmt_obj = fmt_klass( + values, + digits=digits, + na_rep=na_rep, + float_format=float_format, + formatter=formatter, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + ) + + return fmt_obj.get_result() + + +class GenericArrayFormatter: + def __init__( + self, + values: Any, + digits: int = 7, + formatter: Optional[Callable] = None, + na_rep: str = "NaN", + space: Union[str, int] = 12, + float_format: Optional[float_format_type] = None, + justify: str = "right", + decimal: str = ".", + quoting: Optional[int] = None, + fixed_width: bool = True, + leading_space: Optional[bool] = None, + ): + self.values = values + self.digits = digits + self.na_rep = na_rep + self.space = space + self.formatter = formatter + self.float_format = float_format + self.justify = justify + self.decimal = decimal + self.quoting = quoting + self.fixed_width = fixed_width + self.leading_space = leading_space + + def get_result(self) -> List[str]: + fmt_values = self._format_strings() + return _make_fixed_width(fmt_values, self.justify) + + def _format_strings(self) -> List[str]: + if self.float_format is None: + float_format = get_option("display.float_format") + if float_format is None: + fmt_str = "{{x: .{prec:d}g}}".format( + prec=get_option("display.precision") + ) + float_format = lambda x: fmt_str.format(x=x) + else: + float_format = self.float_format + + formatter = ( + self.formatter + if self.formatter is not None + else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n"))) + ) + + def _format(x): + if self.na_rep is not None and is_scalar(x) and isna(x): + try: + # try block for np.isnat specifically + # determine na_rep if x is None or NaT-like + if x is None: + return "None" + elif x is NA: + return str(NA) + elif x is NaT or np.isnat(x): + return "NaT" + except (TypeError, ValueError): + # np.isnat only handles datetime or timedelta objects + pass + return self.na_rep + elif isinstance(x, PandasObject): + return "{x}".format(x=x) + else: + # object dtype + return "{x}".format(x=formatter(x)) + + vals = self.values + if isinstance(vals, Index): + vals = vals._values + elif isinstance(vals, ABCSparseArray): + vals = vals.values + + is_float_type = lib.map_infer(vals, is_float) & notna(vals) + leading_space = self.leading_space + if leading_space is None: + leading_space = is_float_type.any() + + fmt_values = [] + for i, v in enumerate(vals): + if not is_float_type[i] and leading_space: + fmt_values.append(" {v}".format(v=_format(v))) + elif is_float_type[i]: + fmt_values.append(float_format(v)) + else: + if leading_space is False: + # False specifically, so that the default is + # to include a space if we get here. 
+ tpl = "{v}" + else: + tpl = " {v}" + fmt_values.append(tpl.format(v=_format(v))) + + return fmt_values + + +class FloatArrayFormatter(GenericArrayFormatter): + """ + + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # float_format is expected to be a string + # formatter should be used to pass a function + if self.float_format is not None and self.formatter is None: + # GH21625, GH22270 + self.fixed_width = False + if callable(self.float_format): + self.formatter = self.float_format + self.float_format = None + + def _value_formatter( + self, + float_format: Optional[float_format_type] = None, + threshold: Optional[Union[float, int]] = None, + ) -> Callable: + """Returns a function to be applied on each value to format it + """ + + # the float_format parameter supersedes self.float_format + if float_format is None: + float_format = self.float_format + + # we are going to compose different functions, to first convert to + # a string, then replace the decimal symbol, and finally chop according + # to the threshold + + # when there is no float_format, we use str instead of '%g' + # because str(0.0) = '0.0' while '%g' % 0.0 = '0' + if float_format: + + def base_formatter(v): + return float_format(value=v) if notna(v) else self.na_rep + + else: + + def base_formatter(v): + return str(v) if notna(v) else self.na_rep + + if self.decimal != ".": + + def decimal_formatter(v): + return base_formatter(v).replace(".", self.decimal, 1) + + else: + decimal_formatter = base_formatter + + if threshold is None: + return decimal_formatter + + def formatter(value): + if notna(value): + if abs(value) > threshold: + return decimal_formatter(value) + else: + return decimal_formatter(0.0) + else: + return self.na_rep + + return formatter + + def get_result_as_array(self) -> np.ndarray: + """ + Returns the float values converted into strings using + the parameters given at initialisation, as a numpy array + """ + + if self.formatter is not None: + return np.array([self.formatter(x) for x in self.values]) + + if self.fixed_width: + threshold = get_option("display.chop_threshold") + else: + threshold = None + + # if we have a fixed_width, we'll need to try different float_format + def format_values_with(float_format): + formatter = self._value_formatter(float_format, threshold) + + # default formatter leaves a space to the left when formatting + # floats, must be consistent for left-justifying NaNs (GH #25061) + if self.justify == "left": + na_rep = " " + self.na_rep + else: + na_rep = self.na_rep + + # separate the wheat from the chaff + values = self.values + is_complex = is_complex_dtype(values) + mask = isna(values) + if hasattr(values, "to_dense"): # sparse numpy ndarray + values = values.to_dense() + values = np.array(values, dtype="object") + values[mask] = na_rep + imask = (~mask).ravel() + values.flat[imask] = np.array( + [formatter(val) for val in values.ravel()[imask]] + ) + + if self.fixed_width: + if is_complex: + result = _trim_zeros_complex(values, na_rep) + else: + result = _trim_zeros_float(values, na_rep) + return np.asarray(result, dtype="object") + + return values + + # There is a special default string when we are fixed-width + # The default is otherwise to use str instead of a formatting string + float_format: Optional[float_format_type] + if self.float_format is None: + if self.fixed_width: + float_format = partial( + "{value: .{digits:d}f}".format, digits=self.digits + ) + else: + float_format = self.float_format + else: + float_format = lambda value: 
self.float_format % value + + formatted_values = format_values_with(float_format) + + if not self.fixed_width: + return formatted_values + + # we need do convert to engineering format if some values are too small + # and would appear as 0, or if some values are too big and take too + # much space + + if len(formatted_values) > 0: + maxlen = max(len(x) for x in formatted_values) + too_long = maxlen > self.digits + 6 + else: + too_long = False + + with np.errstate(invalid="ignore"): + abs_vals = np.abs(self.values) + # this is pretty arbitrary for now + # large values: more that 8 characters including decimal symbol + # and first digit, hence > 1e6 + has_large_values = (abs_vals > 1e6).any() + has_small_values = ( + (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0) + ).any() + + if has_small_values or (too_long and has_large_values): + float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) + formatted_values = format_values_with(float_format) + + return formatted_values + + def _format_strings(self) -> List[str]: + # shortcut + if self.formatter is not None: + return [self.formatter(x) for x in self.values] + + return list(self.get_result_as_array()) + + +class IntArrayFormatter(GenericArrayFormatter): + def _format_strings(self) -> List[str]: + formatter = self.formatter or (lambda x: "{x: d}".format(x=x)) + fmt_values = [formatter(x) for x in self.values] + return fmt_values + + +class Datetime64Formatter(GenericArrayFormatter): + def __init__( + self, + values: Union[np.ndarray, "Series", DatetimeIndex, DatetimeArray], + nat_rep: str = "NaT", + date_format: None = None, + **kwargs, + ): + super().__init__(values, **kwargs) + self.nat_rep = nat_rep + self.date_format = date_format + + def _format_strings(self) -> List[str]: + """ we by definition have DO NOT have a TZ """ + + values = self.values + + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + if self.formatter is not None and callable(self.formatter): + return [self.formatter(x) for x in values] + + fmt_values = format_array_from_datetime( + values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep, + ).reshape(values.shape) + return fmt_values.tolist() + + +class ExtensionArrayFormatter(GenericArrayFormatter): + def _format_strings(self) -> List[str]: + values = self.values + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values + + formatter = values._formatter(boxed=True) + + if is_categorical_dtype(values.dtype): + # Categorical is special for now, so that we can preserve tzinfo + array = values._internal_get_values() + else: + array = np.asarray(values) + + fmt_values = format_array( + array, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + digits=self.digits, + space=self.space, + justify=self.justify, + leading_space=self.leading_space, + ) + return fmt_values + + +def format_percentiles( + percentiles: Union[ + np.ndarray, List[Union[int, float]], List[float], List[Union[str, float]] + ] +) -> List[str]: + """ + Outputs rounded and formatted percentiles. + + Parameters + ---------- + percentiles : list-like, containing floats from interval [0,1] + + Returns + ------- + formatted : list of strings + + Notes + ----- + Rounding precision is chosen so that: (1) if any two elements of + ``percentiles`` differ, they remain different after rounding + (2) no entry is *rounded* to 0% or 100%. + Any non-integer is always rounded to at least 1 decimal place. 
+ + Examples + -------- + Keeps all entries different after rounding: + + >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + + No element is rounded to 0% or 100% (unless already equal to it). + Duplicates are allowed: + + >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + """ + + percentiles = np.asarray(percentiles) + + # It checks for np.NaN as well + with np.errstate(invalid="ignore"): + if ( + not is_numeric_dtype(percentiles) + or not np.all(percentiles >= 0) + or not np.all(percentiles <= 1) + ): + raise ValueError("percentiles should all be in the interval [0,1]") + + percentiles = 100 * percentiles + int_idx = np.isclose(percentiles.astype(int), percentiles) + + if np.all(int_idx): + out = percentiles.astype(int).astype(str) + return [i + "%" for i in out] + + unique_pcts = np.unique(percentiles) + to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None + to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None + + # Least precision that keeps percentiles unique after rounding + prec = -np.floor( + np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end))) + ).astype(int) + prec = max(1, prec) + out = np.empty_like(percentiles, dtype=object) + out[int_idx] = percentiles[int_idx].astype(int).astype(str) + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) + return [i + "%" for i in out] + + +def _is_dates_only( + values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] +) -> bool: + # return a boolean if we are only dates (and don't have a timezone) + assert values.ndim == 1 + + values = DatetimeIndex(values) + if values.tz is not None: + return False + + values_int = values.asi8 + consider_values = values_int != iNaT + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % int(one_day_nanos) != 0).sum() == 0 + ) + if even_days: + return True + return False + + +def _format_datetime64( + x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" +) -> str: + if x is None or (is_scalar(x) and isna(x)): + return nat_rep + + if tz is not None or not isinstance(x, Timestamp): + if getattr(x, "tzinfo", None) is not None: + x = Timestamp(x).tz_convert(tz) + else: + x = Timestamp(x).tz_localize(tz) + + return str(x) + + +def _format_datetime64_dateonly( + x: Union[NaTType, Timestamp], nat_rep: str = "NaT", date_format: None = None +) -> str: + if x is None or (is_scalar(x) and isna(x)): + return nat_rep + + if not isinstance(x, Timestamp): + x = Timestamp(x) + + if date_format: + return x.strftime(date_format) + else: + return x._date_repr + + +def _get_format_datetime64( + is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None +) -> Callable: + + if is_dates_only: + return lambda x, tz=None: _format_datetime64_dateonly( + x, nat_rep=nat_rep, date_format=date_format + ) + else: + return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) + + +def _get_format_datetime64_from_values( + values: Union[np.ndarray, DatetimeArray, DatetimeIndex], date_format: Optional[str] +) -> Optional[str]: + """ given values and a date_format, return a string format """ + + if isinstance(values, np.ndarray) and values.ndim > 1: + # We don't actually care about the order of values, and DatetimeIndex + # only accepts 1D values + values = values.ravel() + + is_dates_only = _is_dates_only(values) + if is_dates_only: + return date_format or "%Y-%m-%d" + 
return date_format + + +class Datetime64TZFormatter(Datetime64Formatter): + def _format_strings(self) -> List[str]: + """ we by definition have a TZ """ + + values = self.values.astype(object) + is_dates_only = _is_dates_only(values) + formatter = self.formatter or _get_format_datetime64( + is_dates_only, date_format=self.date_format + ) + fmt_values = [formatter(x) for x in values] + + return fmt_values + + +class Timedelta64Formatter(GenericArrayFormatter): + def __init__( + self, + values: Union[np.ndarray, TimedeltaIndex], + nat_rep: str = "NaT", + box: bool = False, + **kwargs, + ): + super().__init__(values, **kwargs) + self.nat_rep = nat_rep + self.box = box + + def _format_strings(self) -> List[str]: + formatter = self.formatter or _get_format_timedelta64( + self.values, nat_rep=self.nat_rep, box=self.box + ) + return [formatter(x) for x in self.values] + + +def _get_format_timedelta64( + values: Union[np.ndarray, TimedeltaIndex, TimedeltaArray], + nat_rep: str = "NaT", + box: bool = False, +) -> Callable: + """ + Return a formatter function for a range of timedeltas. + These will all have the same format argument + + If box, then show the return in quotes + """ + + values_int = values.astype(np.int64) + + consider_values = values_int != iNaT + + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + ) + all_sub_day = ( + np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 + ) + + if even_days: + format = None + elif all_sub_day: + format = "sub_day" + else: + format = "long" + + def _formatter(x): + if x is None or (is_scalar(x) and isna(x)): + return nat_rep + + if not isinstance(x, Timedelta): + x = Timedelta(x) + result = x._repr_base(format=format) + if box: + result = "'{res}'".format(res=result) + return result + + return _formatter + + +def _make_fixed_width( + strings: List[str], + justify: str = "right", + minimum: Optional[int] = None, + adj: Optional[TextAdjustment] = None, +) -> List[str]: + + if len(strings) == 0 or justify == "all": + return strings + + if adj is None: + adj = _get_adjustment() + + max_len = max(adj.len(x) for x in strings) + + if minimum is not None: + max_len = max(minimum, max_len) + + conf_max = get_option("display.max_colwidth") + if conf_max is not None and max_len > conf_max: + max_len = conf_max + + def just(x): + if conf_max is not None: + if (conf_max > 3) & (adj.len(x) > max_len): + x = x[: max_len - 3] + "..." + return x + + strings = [just(x) for x in strings] + result = adj.justify(strings, max_len, mode=justify) + return result + + +def _trim_zeros_complex(str_complexes: np.ndarray, na_rep: str = "NaN") -> List[str]: + """ + Separates the real and imaginary parts from the complex number, and + executes the _trim_zeros_float method on each of those. + """ + return [ + "".join(_trim_zeros_float(re.split(r"([j+-])", x), na_rep)) + for x in str_complexes + ] + + +def _trim_zeros_float( + str_floats: Union[np.ndarray, List[str]], na_rep: str = "NaN" +) -> List[str]: + """ + Trims zeros, leaving just one before the decimal points if need be. 
+ """ + trimmed = str_floats + + def _is_number(x): + return x != na_rep and not x.endswith("inf") + + def _cond(values): + finite = [x for x in values if _is_number(x)] + return ( + len(finite) > 0 + and all(x.endswith("0") for x in finite) + and not (any(("e" in x) or ("E" in x) for x in finite)) + ) + + while _cond(trimmed): + trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] + + # leave one 0 after the decimal points if need be. + return [x + "0" if x.endswith(".") and _is_number(x) else x for x in trimmed] + + +def _has_names(index: Index) -> bool: + if isinstance(index, ABCMultiIndex): + return com.any_not_none(*index.names) + else: + return index.name is not None + + +class EngFormatter: + """ + Formats float values according to engineering format. + + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y", + } + + def __init__(self, accuracy: Optional[int] = None, use_eng_prefix: bool = False): + self.accuracy = accuracy + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num: Union[int, float]) -> str: + """ Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. Some examples: + + >>> format_eng(0) # for self.accuracy = 0 + ' 0' + + >>> format_eng(1000000) # for self.accuracy = 1, + # self.use_eng_prefix = True + ' 1.0M' + + >>> format_eng("-1e-6") # for self.accuracy = 2 + # self.use_eng_prefix = False + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + dnum = decimal.Decimal(str(num)) + + if decimal.Decimal.is_nan(dnum): + return "NaN" + + if decimal.Decimal.is_infinite(dnum): + return "inf" + + sign = 1 + + if dnum < 0: # pragma: no cover + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = decimal.Decimal(int(math.floor(dnum.log10() / 3) * 3)) + else: + pow10 = decimal.Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + else: + if int_pow10 < 0: + prefix = "E-{pow10:02d}".format(pow10=-int_pow10) + else: + prefix = "E+{pow10:02d}".format(pow10=int_pow10) + + mant = sign * dnum / (10 ** pow10) + + if self.accuracy is None: # pragma: no cover + format_str = "{mant: g}{prefix}" + else: + format_str = "{{mant: .{acc:d}f}}{{prefix}}".format(acc=self.accuracy) + + formatted = format_str.format(mant=mant, prefix=prefix) + + return formatted + + +def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None: + """ + Alter default behavior on how float is formatted in DataFrame. + Format float in engineering format. By accuracy, we mean the number of + decimal digits after the floating point. + + See also EngFormatter. 
+ """ + + set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix)) + set_option("display.column_space", max(12, accuracy + 9)) + + +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins + + +def get_level_lengths( + levels: Any, sentinel: Union[bool, object, str] = "" +) -> List[Dict[int, int]]: + """For each index in each level the function returns lengths of indexes. + + Parameters + ---------- + levels : list of lists + List of values on for level. + sentinel : string, optional + Value which states that no new index starts on there. + + Returns + ------- + Returns list of maps. For each level returns map of indexes (key is index + in row and value is length of index). + """ + if len(levels) == 0: + return [] + + control = [True] * len(levels[0]) + + result = [] + for level in levels: + last_index = 0 + + lengths = {} + for i, key in enumerate(level): + if control[i] and key == sentinel: + pass + else: + control[i] = False + lengths[last_index] = i - last_index + last_index = i + + lengths[last_index] = len(level) - last_index + + result.append(lengths) + + return result + + +def buffer_put_lines(buf: IO[str], lines: List[str]) -> None: + """ + Appends lines to a buffer. + + Parameters + ---------- + buf + The buffer to write to + lines + The lines to append. + """ + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + buf.write("\n".join(lines)) diff --git a/venv/Lib/site-packages/pandas/io/formats/html.py b/venv/Lib/site-packages/pandas/io/formats/html.py new file mode 100644 index 0000000..b46b2f6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/html.py @@ -0,0 +1,614 @@ +""" +Module for formatting output data in HTML. +""" + +from textwrap import dedent +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast + +from pandas._config import get_option + +from pandas._libs import lib + +from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas import option_context + +from pandas.io.common import is_url +from pandas.io.formats.format import ( + DataFrameFormatter, + TableFormatter, + buffer_put_lines, + get_level_lengths, +) +from pandas.io.formats.printing import pprint_thing + + +class HTMLFormatter(TableFormatter): + """ + Internal class for formatting output data in html. + This class is intended for shared functionality between + DataFrame.to_html() and DataFrame._repr_html_(). + Any logic in common with other output formatting methods + should ideally be inherited from classes in format.py + and this class responsible for only producing html markup. 
+ """ + + indent_delta = 2 + + def __init__( + self, + formatter: DataFrameFormatter, + classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, + border: Optional[int] = None, + ) -> None: + self.fmt = formatter + self.classes = classes + + self.frame = self.fmt.frame + self.columns = self.fmt.tr_frame.columns + self.elements: List[str] = [] + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape + self.show_dimensions = self.fmt.show_dimensions + if border is None: + border = cast(int, get_option("display.html.border")) + self.border = border + self.table_id = self.fmt.table_id + self.render_links = self.fmt.render_links + if isinstance(self.fmt.col_space, int): + self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space) + + @property + def show_row_idx_names(self) -> bool: + return self.fmt.show_row_idx_names + + @property + def show_col_idx_names(self) -> bool: + return self.fmt.show_col_idx_names + + @property + def row_levels(self) -> int: + if self.fmt.index: + # showing (row) index + return self.frame.index.nlevels + elif self.show_col_idx_names: + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # If the row index is not displayed a column of + # blank cells need to be included before the DataFrame values. + return 1 + # not showing (row) index + return 0 + + def _get_columns_formatted_values(self) -> Iterable: + return self.columns + + # https://github.com/python/mypy/issues/1237 + @property + def is_truncated(self) -> bool: # type: ignore + return self.fmt.is_truncated + + @property + def ncols(self) -> int: + return len(self.fmt.tr_frame.columns) + + def write(self, s: Any, indent: int = 0) -> None: + rs = pprint_thing(s) + self.elements.append(" " * indent + rs) + + def write_th( + self, s: Any, header: bool = False, indent: int = 0, tags: Optional[str] = None + ) -> None: + """ + Method for writting a formatted . This will + cause min-width to be set if there is one. + indent : int, default 0 + The indentation level of the cell. + tags : str, default None + Tags to include in the cell. + + Returns + ------- + A written ", indent) + else: + self.write(''.format(align=align), indent) + indent += indent_delta + + for i, s in enumerate(line): + val_tag = tags.get(i, None) + if header or (self.bold_rows and i < nindex_levels): + self.write_th(s, indent=indent, header=header, tags=val_tag) + else: + self.write_td(s, indent, tags=val_tag) + + indent -= indent_delta + self.write("", indent) + + def render(self) -> List[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × + self.write( + "

<p>{rows} rows {by} {cols} columns</p>

".format( + rows=len(self.frame), by=by, cols=len(self.frame.columns) + ) + ) + + return self.elements + + def write_result(self, buf: IO[str]) -> None: + buffer_put_lines(buf, self.render()) + + def _write_table(self, indent: int = 0) -> None: + _classes = ["dataframe"] # Default class. + use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + _classes.append("tex2jax_ignore") + if self.classes is not None: + if isinstance(self.classes, str): + self.classes = self.classes.split() + if not isinstance(self.classes, (list, tuple)): + raise TypeError( + "classes must be a string, list, or tuple, " + "not {typ}".format(typ=type(self.classes)) + ) + _classes.extend(self.classes) + + if self.table_id is None: + id_section = "" + else: + id_section = ' id="{table_id}"'.format(table_id=self.table_id) + + self.write( + '
cell. + + If col_space is set on the formatter then that is used for + the value of min-width. + + Parameters + ---------- + s : object + The data to be written inside the cell. + header : bool, default False + Set to True if the is for use inside
cell. + """ + if header and self.fmt.col_space is not None: + tags = tags or "" + tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space) + + self._write_cell(s, kind="th", indent=indent, tags=tags) + + def write_td(self, s: Any, indent: int = 0, tags: Optional[str] = None) -> None: + self._write_cell(s, kind="td", indent=indent, tags=tags) + + def _write_cell( + self, s: Any, kind: str = "td", indent: int = 0, tags: Optional[str] = None + ) -> None: + if tags is not None: + start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags) + else: + start_tag = "<{kind}>".format(kind=kind) + + if self.escape: + # escape & first to prevent double escaping of & + esc = {"&": r"&", "<": r"<", ">": r">"} + else: + esc = {} + + rs = pprint_thing(s, escape_chars=esc).strip() + + if self.render_links and is_url(rs): + rs_unescaped = pprint_thing(s, escape_chars={}).strip() + start_tag += ''.format(url=rs_unescaped) + end_a = "" + else: + end_a = "" + + self.write( + "{start}{rs}{end_a}".format( + start=start_tag, rs=rs, end_a=end_a, kind=kind + ), + indent, + ) + + def write_tr( + self, + line: Iterable, + indent: int = 0, + indent_delta: int = 0, + header: bool = False, + align: Optional[str] = None, + tags: Optional[Dict[int, str]] = None, + nindex_levels: int = 0, + ) -> None: + if tags is None: + tags = {} + + if align is None: + self.write("
'.format( + border=self.border, cls=" ".join(_classes), id_section=id_section + ), + indent, + ) + + if self.fmt.header or self.show_row_idx_names: + self._write_header(indent + self.indent_delta) + + self._write_body(indent + self.indent_delta) + + self.write("
", indent) + + def _write_col_header(self, indent: int) -> None: + truncate_h = self.fmt.truncate_h + if isinstance(self.columns, ABCMultiIndex): + template = 'colspan="{span:d}" halign="left"' + + if self.fmt.sparsify: + # GH3547 + sentinel = lib.no_default + else: + sentinel = False + levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + for lnum, (records, values) in enumerate(zip(level_lengths, levels)): + if truncate_h: + # modify the header lines + ins_col = self.fmt.tr_col_num + if self.fmt.sparsify: + recs_new = {} + # Increment tags after ... col. + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + elif tag + span > ins_col: + recs_new[tag] = span + 1 + if lnum == inner_lvl: + values = ( + values[:ins_col] + ("...",) + values[ins_col:] + ) + else: + # sparse col headers do not receive a ... + values = ( + values[:ins_col] + + (values[ins_col - 1],) + + values[ins_col:] + ) + else: + recs_new[tag] = span + # if ins_col lies between tags, all col headers + # get ... + if tag + span == ins_col: + recs_new[ins_col] = 1 + values = values[:ins_col] + ("...",) + values[ins_col:] + records = recs_new + inner_lvl = len(level_lengths) - 1 + if lnum == inner_lvl: + records[ins_col] = 1 + else: + recs_new = {} + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + else: + recs_new[tag] = span + recs_new[ins_col] = 1 + records = recs_new + values = values[:ins_col] + ["..."] + values[ins_col:] + + # see gh-22579 + # Column Offset Bug with to_html(index=False) with + # MultiIndex Columns and Index. + # Initially fill row with blank cells before column names. + # TODO: Refactor to remove code duplication with code + # block below for standard columns index. + row = [""] * (self.row_levels - 1) + if self.fmt.index or self.show_col_idx_names: + # see gh-22747 + # If to_html(index_names=False) do not show columns + # index names. + # TODO: Refactor to use _get_column_name_list from + # DataFrameFormatter class and create a + # _get_formatted_column_labels function for code + # parity with DataFrameFormatter class. + if self.fmt.show_index_names: + name = self.columns.names[lnum] + row.append(pprint_thing(name or "")) + else: + row.append("") + + tags = {} + j = len(row) + for i, v in enumerate(values): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + continue + j += 1 + row.append(v) + self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) + else: + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # Initially fill row with blank cells before column names. + # TODO: Refactor to remove code duplication with code block + # above for columns MultiIndex. + row = [""] * (self.row_levels - 1) + if self.fmt.index or self.show_col_idx_names: + # see gh-22747 + # If to_html(index_names=False) do not show columns + # index names. + # TODO: Refactor to use _get_column_name_list from + # DataFrameFormatter class. 
+ if self.fmt.show_index_names: + row.append(self.columns.name or "") + else: + row.append("") + row.extend(self._get_columns_formatted_values()) + align = self.fmt.justify + + if truncate_h: + ins_col = self.row_levels + self.fmt.tr_col_num + row.insert(ins_col, "...") + + self.write_tr(row, indent, self.indent_delta, header=True, align=align) + + def _write_row_header(self, indent: int) -> None: + truncate_h = self.fmt.truncate_h + row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( + self.ncols + (1 if truncate_h else 0) + ) + self.write_tr(row, indent, self.indent_delta, header=True) + + def _write_header(self, indent: int) -> None: + self.write("", indent) + + if self.fmt.header: + self._write_col_header(indent + self.indent_delta) + + if self.show_row_idx_names: + self._write_row_header(indent + self.indent_delta) + + self.write("", indent) + + def _get_formatted_values(self) -> Dict[int, List[str]]: + with option_context("display.max_colwidth", None): + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + return fmt_values + + def _write_body(self, indent: int) -> None: + self.write("", indent) + fmt_values = self._get_formatted_values() + + # write values + if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): + self._write_hierarchical_rows(fmt_values, indent + self.indent_delta) + else: + self._write_regular_rows(fmt_values, indent + self.indent_delta) + + self.write("", indent) + + def _write_regular_rows( + self, fmt_values: Mapping[int, List[str]], indent: int + ) -> None: + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + + nrows = len(self.fmt.tr_frame) + + if self.fmt.index: + fmt = self.fmt._get_formatter("__index__") + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + + row: List[str] = [] + for i in range(nrows): + + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + row = [] + if self.fmt.index: + row.append(index_values[i]) + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # Add blank cell before data cells. + elif self.show_col_idx_names: + row.append("") + row.extend(fmt_values[j][i] for j in range(self.ncols)) + + if truncate_h: + dot_col_ix = self.fmt.tr_col_num + self.row_levels + row.insert(dot_col_ix, "...") + self.write_tr( + row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels + ) + + def _write_hierarchical_rows( + self, fmt_values: Mapping[int, List[str]], indent: int + ) -> None: + template = 'rowspan="{span}" valign="top"' + + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + frame = self.fmt.tr_frame + nrows = len(frame) + + idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) + idx_values = list(zip(*idx_values)) + + if self.fmt.sparsify: + # GH3547 + sentinel = lib.no_default + levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) + + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + if truncate_v: + # Insert ... row and adjust idx_values and + # level_lengths to take this into account. 
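# Illustrative sketch (not part of the patch): the truncation branches above
# (truncate_v / tr_row_num) are what insert the "..." separator row when
# max_rows is smaller than the frame, including for a MultiIndex, e.g.:
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], range(5)])
df = pd.DataFrame({"val": np.arange(10)}, index=idx)

# Only a head and tail of the rows are written; a row of "..." cells marks
# the cut, and the hierarchical index labels keep their rowspan handling.
html = df.to_html(max_rows=4)
print("..." in html)  # True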
+ ins_row = self.fmt.tr_row_num + # cast here since if truncate_v is True, self.fmt.tr_row_num is not None + ins_row = cast(int, ins_row) + inserted = False + for lnum, records in enumerate(level_lengths): + rec_new = {} + for tag, span in list(records.items()): + if tag >= ins_row: + rec_new[tag + 1] = span + elif tag + span > ins_row: + rec_new[tag] = span + 1 + + # GH 14882 - Make sure insertion done once + if not inserted: + dot_row = list(idx_values[ins_row - 1]) + dot_row[-1] = "..." + idx_values.insert(ins_row, tuple(dot_row)) + inserted = True + else: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = "..." + idx_values[ins_row] = tuple(dot_row) + else: + rec_new[tag] = span + # If ins_row lies between tags, all cols idx cols + # receive ... + if tag + span == ins_row: + rec_new[ins_row] = 1 + if lnum == 0: + idx_values.insert( + ins_row, tuple(["..."] * len(level_lengths)) + ) + + # GH 14882 - Place ... in correct level + elif inserted: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = "..." + idx_values[ins_row] = tuple(dot_row) + level_lengths[lnum] = rec_new + + level_lengths[inner_lvl][ins_row] = 1 + for ix_col in range(len(fmt_values)): + fmt_values[ix_col].insert(ins_row, "...") + nrows += 1 + + for i in range(nrows): + row = [] + tags = {} + + sparse_offset = 0 + j = 0 + for records, v in zip(level_lengths, idx_values[i]): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + sparse_offset += 1 + continue + + j += 1 + row.append(v) + + row.extend(fmt_values[j][i] for j in range(self.ncols)) + if truncate_h: + row.insert( + self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." + ) + self.write_tr( + row, + indent, + self.indent_delta, + tags=tags, + nindex_levels=len(levels) - sparse_offset, + ) + else: + row = [] + for i in range(len(frame)): + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + idx_values = list( + zip(*frame.index.format(sparsify=False, adjoin=False, names=False)) + ) + row = [] + row.extend(idx_values[i]) + row.extend(fmt_values[j][i] for j in range(self.ncols)) + if truncate_h: + row.insert(self.row_levels + self.fmt.tr_col_num, "...") + self.write_tr( + row, + indent, + self.indent_delta, + tags=None, + nindex_levels=frame.index.nlevels, + ) + + +class NotebookFormatter(HTMLFormatter): + """ + Internal class for formatting output data in html for display in Jupyter + Notebooks. This class is intended for functionality specific to + DataFrame._repr_html_() and DataFrame.to_html(notebook=True) + """ + + def _get_formatted_values(self) -> Dict[int, List[str]]: + return {i: self.fmt._format_col(i) for i in range(self.ncols)} + + def _get_columns_formatted_values(self) -> List[str]: + return self.columns.format() + + def write_style(self) -> None: + # We use the "scoped" attribute here so that the desired + # style properties for the data frame are not then applied + # throughout the entire notebook. 
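# Illustrative sketch (not part of the patch): NotebookFormatter is the path
# taken by DataFrame._repr_html_() and by to_html(notebook=True); the scoped
# style block written by write_style() is emitted before the table markup and
# the whole thing is wrapped in a div.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
nb_html = df.to_html(notebook=True)
print(nb_html.lstrip().startswith("<div>"))  # True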
+ template_first = """\ + """ + template_select = """\ + .dataframe %s { + %s: %s; + }""" + element_props = [ + ("tbody tr th:only-of-type", "vertical-align", "middle"), + ("tbody tr th", "vertical-align", "top"), + ] + if isinstance(self.columns, ABCMultiIndex): + element_props.append(("thead tr th", "text-align", "left")) + if self.show_row_idx_names: + element_props.append( + ("thead tr:last-of-type th", "text-align", "right") + ) + else: + element_props.append(("thead th", "text-align", "right")) + template_mid = "\n\n".join(map(lambda t: template_select % t, element_props)) + template = dedent("\n".join((template_first, template_mid, template_last))) + self.write(template) + + def render(self) -> List[str]: + self.write("
") + self.write_style() + super().render() + self.write("
") + return self.elements diff --git a/venv/Lib/site-packages/pandas/io/formats/latex.py b/venv/Lib/site-packages/pandas/io/formats/latex.py new file mode 100644 index 0000000..008a994 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/latex.py @@ -0,0 +1,377 @@ +""" +Module for formatting output data in Latex. +""" +from typing import IO, List, Optional, Tuple + +import numpy as np + +from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas.io.formats.format import DataFrameFormatter, TableFormatter + + +class LatexFormatter(TableFormatter): + """ + Used to render a DataFrame to a LaTeX tabular/longtable environment output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + longtable : boolean, default False + Use a longtable environment instead of tabular. + + See Also + -------- + HTMLFormatter + """ + + def __init__( + self, + formatter: DataFrameFormatter, + column_format: Optional[str] = None, + longtable: bool = False, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + ): + self.fmt = formatter + self.frame = self.fmt.frame + self.bold_rows = self.fmt.bold_rows + self.column_format = column_format + self.longtable = longtable + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.label = label + self.escape = self.fmt.escape + + def write_result(self, buf: IO[str]) -> None: + """ + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. + """ + + # string representation of the columns + if len(self.frame.columns) == 0 or len(self.frame.index) == 0: + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=self.frame.columns, + idx=self.frame.index, + ) + strcols = [[info_line]] + else: + strcols = self.fmt._to_str_columns() + + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return "r" + else: + return "l" + + # reestablish the MultiIndex that has been joined by _to_str_column + if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): + out = self.frame.index.format( + adjoin=False, + sparsify=self.fmt.sparsify, + names=self.fmt.has_index_names, + na_rep=self.fmt.na_rep, + ) + + # index.format will sparsify repeated entries with empty strings + # so pad these with some empty space + def pad_empties(x): + for pad in reversed(x): + if pad: + break + return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] + + out = (pad_empties(i) for i in out) + + # Add empty spaces for each column level + clevels = self.frame.columns.nlevels + out = [[" " * len(i[-1])] * clevels + i for i in out] + + # Add the column names to the last index column + cnames = self.frame.columns.names + if any(cnames): + new_names = [i if i else "{}" for i in cnames] + out[self.frame.index.nlevels - 1][:clevels] = new_names + + # Get rid of old multiindex column and add new ones + strcols = out + strcols[1:] + + if self.column_format is None: + dtypes = self.frame.dtypes._values + column_format = "".join(map(get_col_type, dtypes)) + if self.fmt.index: + index_format = "l" * self.frame.index.nlevels + column_format = index_format + column_format + elif not isinstance(self.column_format, str): # pragma: no cover + raise AssertionError( + "column_format must 
be str or unicode, " + "not {typ}".format(typ=type(column_format)) + ) + else: + column_format = self.column_format + + if self.longtable: + self._write_longtable_begin(buf, column_format) + else: + self._write_tabular_begin(buf, column_format) + + buf.write("\\toprule\n") + + ilevels = self.frame.index.nlevels + clevels = self.frame.columns.nlevels + nlevels = clevels + if self.fmt.has_index_names and self.fmt.show_index_names: + nlevels += 1 + strrows = list(zip(*strcols)) + self.clinebuf: List[List[int]] = [] + + for i, row in enumerate(strrows): + if i == nlevels and self.fmt.header: + buf.write("\\midrule\n") # End of header + if self.longtable: + buf.write("\\endhead\n") + buf.write("\\midrule\n") + buf.write( + "\\multicolumn{{{n}}}{{r}}{{{{Continued on next " + "page}}}} \\\\\n".format(n=len(row)) + ) + buf.write("\\midrule\n") + buf.write("\\endfoot\n\n") + buf.write("\\bottomrule\n") + buf.write("\\endlastfoot\n") + if self.escape: + # escape backslashes first + crow = [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] + else: + crow = [x if x else "{}" for x in row] + if self.bold_rows and self.fmt.index: + # bold row labels + crow = [ + "\\textbf{{{x}}}".format(x=x) + if j < ilevels and x.strip() not in ["", "{}"] + else x + for j, x in enumerate(crow) + ] + if i < clevels and self.fmt.header and self.multicolumn: + # sum up columns to multicolumns + crow = self._format_multicolumn(crow, ilevels) + if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: + # sum up rows to multirows + crow = self._format_multirow(crow, ilevels, i, strrows) + buf.write(" & ".join(crow)) + buf.write(" \\\\\n") + if self.multirow and i < len(strrows) - 1: + self._print_cline(buf, i, len(strcols)) + + if self.longtable: + self._write_longtable_end(buf) + else: + self._write_tabular_end(buf) + + def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: + r""" + Combine columns belonging to a group to a single multicolumn entry + according to self.multicolumn_format + + e.g.: + a & & & b & c & + will become + \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} + """ + row2 = list(row[:ilevels]) + ncol = 1 + coltext = "" + + def append_col(): + # write multicolumn if needed + if ncol > 1: + row2.append( + "\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format( + ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip() + ) + ) + # don't modify where not needed + else: + row2.append(coltext) + + for c in row[ilevels:]: + # if next col has text, write the previous + if c.strip(): + if coltext: + append_col() + coltext = c + ncol = 1 + # if not, add it to the previous multicolumn + else: + ncol += 1 + # write last column name + if coltext: + append_col() + return row2 + + def _format_multirow( + self, row: List[str], ilevels: int, i: int, rows: List[Tuple[str, ...]] + ) -> List[str]: + r""" + Check following rows, whether row should be a multirow + + e.g.: becomes: + a & 0 & \multirow{2}{*}{a} & 0 & + & 1 & & 1 & + b & 0 & \cline{1-2} + b & 0 & + """ + for j in range(ilevels): + if row[j].strip(): + nrow = 1 + for r in rows[i + 1 :]: + if not r[j].strip(): + nrow += 1 + else: + break + if nrow > 1: + # overwrite non-multirow entry + row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format( + 
nrow=nrow, row=row[j].strip() + ) + # save when to end the current block with \cline + self.clinebuf.append([i + nrow - 1, j + 1]) + return row + + def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: + """ + Print clines after multirow-blocks are finished. + """ + for cl in self.clinebuf: + if cl[0] == i: + buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol)) + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i] + + def _write_tabular_begin(self, buf, column_format: str): + """ + Write the beginning of a tabular environment or + nested table/tabular environments including caption and label. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + """ + if self.caption is not None or self.label is not None: + # then write output in a nested table/tabular environment + if self.caption is None: + caption_ = "" + else: + caption_ = "\n\\caption{{{}}}".format(self.caption) + + if self.label is None: + label_ = "" + else: + label_ = "\n\\label{{{}}}".format(self.label) + + buf.write("\\begin{{table}}\n\\centering{}{}\n".format(caption_, label_)) + else: + # then write output only in a tabular environment + pass + + buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) + + def _write_tabular_end(self, buf): + """ + Write the end of a tabular environment or nested table/tabular + environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self.caption is not None or self.label is not None: + buf.write("\\end{table}\n") + else: + pass + + def _write_longtable_begin(self, buf, column_format: str): + """ + Write the beginning of a longtable environment including caption and + label if provided by user. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + """ + buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) + + if self.caption is not None or self.label is not None: + if self.caption is None: + pass + else: + buf.write("\\caption{{{}}}".format(self.caption)) + + if self.label is None: + pass + else: + buf.write("\\label{{{}}}".format(self.label)) + + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + buf.write("\\\\\n") + else: + pass + + @staticmethod + def _write_longtable_end(buf): + """ + Write the end of a longtable environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\end{longtable}\n") diff --git a/venv/Lib/site-packages/pandas/io/formats/printing.py b/venv/Lib/site-packages/pandas/io/formats/printing.py new file mode 100644 index 0000000..4b5b5e9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/printing.py @@ -0,0 +1,530 @@ +""" +Printing tools. 
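# Illustrative sketch (not part of the patch): the LatexFormatter options in
# latex.py above map onto the public DataFrame.to_latex keywords of this
# pandas version; caption/label wrap the tabular in a table environment as
# _write_tabular_begin shows.
import pandas as pd

df = pd.DataFrame({"name": ["a", "b"], "value": [1.0, 2.5]})

tex = df.to_latex(
    index=False,
    column_format="lr",       # passed through to \begin{tabular}{lr}
    caption="Example table",
    label="tab:example",
    bold_rows=False,
    escape=True,
)
print(tex)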
+""" + +import sys +from typing import ( + Any, + Callable, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) + +from pandas._config import get_option + +from pandas.core.dtypes.inference import is_sequence + +EscapeChars = Union[Mapping[str, str], Iterable[str]] + + +def adjoin(space: int, *lists: List[str], **kwargs) -> str: + """ + Glues together two sets of strings using the amount of space requested. + The idea is to prettify. + + ---------- + space : int + number of spaces for padding + lists : str + list of str which being joined + strlen : callable + function used to calculate the length of each str. Needed for unicode + handling. + justfunc : callable + function used to justify str. Needed for unicode handling. + """ + strlen = kwargs.pop("strlen", len) + justfunc = kwargs.pop("justfunc", justify) + + out_lines = [] + newLists = [] + lengths = [max(map(strlen, x)) + space for x in lists[:-1]] + # not the last one + lengths.append(max(map(len, lists[-1]))) + maxLen = max(map(len, lists)) + for i, lst in enumerate(lists): + nl = justfunc(lst, lengths[i], mode="left") + nl.extend([" " * lengths[i]] * (maxLen - len(lst))) + newLists.append(nl) + toJoin = zip(*newLists) + for lines in toJoin: + out_lines.append("".join(lines)) + return "\n".join(out_lines) + + +def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> List[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + +# Unicode consolidation +# --------------------- +# +# pprinting utility functions for generating Unicode text or +# bytes(3.x)/str(2.x) representations of objects. +# Try to use these as much as possible rather then rolling your own. +# +# When to use +# ----------- +# +# 1) If you're writing code internal to pandas (no I/O directly involved), +# use pprint_thing(). +# +# It will always return unicode text which can handled by other +# parts of the package without breakage. +# +# 2) if you need to write something out to file, use +# pprint_thing_encoded(encoding). +# +# If no encoding is specified, it defaults to utf-8. Since encoding pure +# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're +# working with straight ascii. + + +def _pprint_seq( + seq: Sequence, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds +) -> str: + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather then calling this directly. + + bounds length of printed sequence, depending on options + """ + if isinstance(seq, set): + fmt = "{{{body}}}" + else: + fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" + + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) + + s = iter(seq) + # handle sets, no slicing + r = [ + pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq))) + ] + body = ", ".join(r) + + if nitems < len(seq): + body += ", ..." + elif isinstance(seq, tuple) and len(seq) == 1: + body += "," + + return fmt.format(body=body) + + +def _pprint_dict( + seq: Mapping, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds +) -> str: + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather then calling this directly. 
+ """ + fmt = "{{{things}}}" + pairs = [] + + pfmt = "{key}: {val}" + + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) + + for k, v in list(seq.items())[:nitems]: + pairs.append( + pfmt.format( + key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + ) + ) + + if nitems < len(seq): + return fmt.format(things=", ".join(pairs) + ", ...") + else: + return fmt.format(things=", ".join(pairs)) + + +def pprint_thing( + thing: Any, + _nest_lvl: int = 0, + escape_chars: Optional[EscapeChars] = None, + default_escapes: bool = False, + quote_strings: bool = False, + max_seq_items: Optional[int] = None, +) -> str: + """ + This function is the sanctioned way of converting objects + to a string representation and properly handles nested sequences. + + Parameters + ---------- + thing : anything to be formatted + _nest_lvl : internal use only. pprint_thing() is mutually-recursive + with pprint_sequence, this argument is used to keep track of the + current nesting level, and limit it. + escape_chars : list or dict, optional + Characters to escape. If a dict is passed the values are the + replacements + default_escapes : bool, default False + Whether the input escape characters replaces or adds to the defaults + max_seq_items : int or None, default None + Pass through to other pretty printers to limit sequence printing + + Returns + ------- + str + """ + + def as_escaped_string( + thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + ) -> str: + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} + if isinstance(escape_chars, dict): + if default_escapes: + translate.update(escape_chars) + else: + translate = escape_chars + escape_chars = list(escape_chars.keys()) + else: + escape_chars = escape_chars or tuple() + + result = str(thing) + for c in escape_chars: + result = result.replace(c, translate[c]) + return result + + if hasattr(thing, "__next__"): + return str(thing) + elif isinstance(thing, dict) and _nest_lvl < get_option( + "display.pprint_nest_depth" + ): + result = _pprint_dict( + thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items + ) + elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"): + result = _pprint_seq( + thing, + _nest_lvl, + escape_chars=escape_chars, + quote_strings=quote_strings, + max_seq_items=max_seq_items, + ) + elif isinstance(thing, str) and quote_strings: + result = "'{thing}'".format(thing=as_escaped_string(thing)) + else: + result = as_escaped_string(thing) + + return result + + +def pprint_thing_encoded( + object, encoding: str = "utf-8", errors: str = "replace" +) -> bytes: + value = pprint_thing(object) # get unicode representation of object + return value.encode(encoding, errors) + + +def _enable_data_resource_formatter(enable: bool) -> None: + if "IPython" not in sys.modules: + # definitely not in IPython + return + from IPython import get_ipython + + ip = get_ipython() + if ip is None: + # still not in IPython + return + + formatters = ip.display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + if enable: + if mimetype not in formatters: + # define tableschema formatter + from IPython.core.formatters import BaseFormatter + + class TableSchemaFormatter(BaseFormatter): + print_method = "_repr_data_resource_" + _return_type = (dict,) + + # register it: + formatters[mimetype] = TableSchemaFormatter() + # enable it if it's been 
disabled: + formatters[mimetype].enabled = True + else: + # unregister tableschema mime-type + if mimetype in formatters: + formatters[mimetype].enabled = False + + +default_pprint = lambda x, max_seq_items=None: pprint_thing( + x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items +) + + +def format_object_summary( + obj, + formatter: Callable, + is_justify: bool = True, + name: Optional[str] = None, + indent_for_name: bool = True, + line_break_each_value: bool = False, +) -> str: + """ + Return the formatted obj as a unicode string + + Parameters + ---------- + obj : object + must be iterable and support __getitem__ + formatter : callable + string formatter for an element + is_justify : boolean + should justify the display + name : name, optional + defaults to the class name of the obj + indent_for_name : bool, default True + Whether subsequent lines should be be indented to + align with the name. + line_break_each_value : bool, default False + If True, inserts a line break for each value of ``obj``. + If False, only break lines when the a line of values gets wider + than the display width. + + .. versionadded:: 0.25.0 + + Returns + ------- + summary string + """ + from pandas.io.formats.console import get_console_size + from pandas.io.formats.format import _get_adjustment + + display_width, _ = get_console_size() + if display_width is None: + display_width = get_option("display.width") or 80 + if name is None: + name = type(obj).__name__ + + if indent_for_name: + name_len = len(name) + space1 = f'\n{(" " * (name_len + 1))}' + space2 = f'\n{(" " * (name_len + 2))}' + else: + space1 = "\n" + space2 = "\n " # space for the opening '[' + + n = len(obj) + if line_break_each_value: + # If we want to vertically align on each value of obj, we need to + # separate values by a line break and indent the values + sep = ",\n " + " " * len(name) + else: + sep = "," + max_seq_items = get_option("display.max_seq_items") or n + + # are we a truncated display + is_truncated = n > max_seq_items + + # adj can optionally handle unicode eastern asian width + adj = _get_adjustment() + + def _extend_line( + s: str, line: str, value: str, display_width: int, next_line_prefix: str + ) -> Tuple[str, str]: + + if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: + s += line.rstrip() + line = next_line_prefix + line += value + return s, line + + def best_len(values: List[str]) -> int: + if values: + return max(adj.len(x) for x in values) + else: + return 0 + + close = ", " + + if n == 0: + summary = f"[]{close}" + elif n == 1 and not line_break_each_value: + first = formatter(obj[0]) + summary = f"[{first}]{close}" + elif n == 2 and not line_break_each_value: + first = formatter(obj[0]) + last = formatter(obj[-1]) + summary = f"[{first}, {last}]{close}" + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in obj[:n]] + tail = [formatter(x) for x in obj[-n:]] + else: + head = [] + tail = [formatter(x) for x in obj] + + # adjust all values to max length if needed + if is_justify: + if line_break_each_value: + # Justify each string in the values of head and tail, so the + # strings will right align when head and tail are stacked + # vertically. 
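# Illustrative sketch (not part of the patch): pprint_thing (defined above) is
# the sanctioned way to stringify arbitrary objects, with optional quoting,
# escaping, and sequence truncation, e.g.:
from pandas.io.formats.printing import pprint_thing

print(pprint_thing(["a\nb", "c"], quote_strings=True, escape_chars=["\n"]))
# -> ['a\nb', 'c']      (the newline is rendered as a literal backslash-n)

print(pprint_thing(list(range(10)), max_seq_items=3))
# -> [0, 1, 2, ...]     (long sequences are truncated)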
+ head, tail = _justify(head, tail) + elif is_truncated or not ( + len(", ".join(head)) < display_width + and len(", ".join(tail)) < display_width + ): + # Each string in head and tail should align with each other + max_length = max(best_len(head), best_len(tail)) + head = [x.rjust(max_length) for x in head] + tail = [x.rjust(max_length) for x in tail] + # If we are not truncated and we are only a single + # line, then don't justify + + if line_break_each_value: + # Now head and tail are of type List[Tuple[str]]. Below we + # convert them into List[str], so there will be one string per + # value. Also truncate items horizontally if wider than + # max_space + max_space = display_width - len(space2) + value = tail[0] + for max_items in reversed(range(1, len(value) + 1)): + pprinted_seq = _pprint_seq(value, max_seq_items=max_items) + if len(pprinted_seq) < max_space: + break + head = [_pprint_seq(x, max_seq_items=max_items) for x in head] + tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail] + + summary = "" + line = space2 + + for max_items in range(len(head)): + word = head[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) + + if is_truncated: + # remove trailing space of last line + summary += line.rstrip() + space2 + "..." + line = space2 + + for max_items in range(len(tail) - 1): + word = tail[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) + + # last value: no sep added + 1 space of width used for trailing ',' + summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) + summary += line + + # right now close is either '' or ', ' + # Now we want to include the ']', but not the maybe space. + close = "]" + close.rstrip(" ") + summary += close + + if len(summary) > (display_width) or line_break_each_value: + summary += space1 + else: # one row + summary += " " + + # remove initial space + summary = "[" + summary[len(space2) :] + + return summary + + +def _justify( + head: List[Sequence[str]], tail: List[Sequence[str]] +) -> Tuple[List[Tuple[str, ...]], List[Tuple[str, ...]]]: + """ + Justify items in head and tail, so they are right-aligned when stacked. + + Parameters + ---------- + head : list-like of list-likes of strings + tail : list-like of list-likes of strings + + Returns + ------- + tuple of list of tuples of strings + Same as head and tail, but items are right aligned when stacked + vertically. + + Examples + -------- + >>> _justify([['a', 'b']], [['abc', 'abcd']]) + ([(' a', ' b')], [('abc', 'abcd')]) + """ + combined = head + tail + + # For each position for the sequences in ``combined``, + # find the length of the largest string. 
+ max_length = [0] * len(combined[0]) + for inner_seq in combined: + length = [len(item) for item in inner_seq] + max_length = [max(x, y) for x, y in zip(max_length, length)] + + # justify each item in each list-like in head and tail using max_length + head = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head + ] + tail = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail + ] + # https://github.com/python/mypy/issues/4975 + # error: Incompatible return value type (got "Tuple[List[Sequence[str]], + # List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]], + # List[Tuple[str, ...]]]") + return head, tail # type: ignore + + +def format_object_attrs( + obj: Sequence, include_dtype: bool = True +) -> List[Tuple[str, Union[str, int]]]: + """ + Return a list of tuples of the (attr, formatted_value) + for common attrs, including dtype, name, length + + Parameters + ---------- + obj : object + must be iterable + include_dtype : bool + If False, dtype won't be in the returned list + + Returns + ------- + list of 2-tuple + + """ + attrs: List[Tuple[str, Union[str, int]]] = [] + if hasattr(obj, "dtype") and include_dtype: + # error: "Sequence[Any]" has no attribute "dtype" + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore + if getattr(obj, "name", None) is not None: + # error: "Sequence[Any]" has no attribute "name" + attrs.append(("name", default_pprint(obj.name))) # type: ignore + # error: "Sequence[Any]" has no attribute "names" + elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore + # error: "Sequence[Any]" has no attribute "names" + attrs.append(("names", default_pprint(obj.names))) # type: ignore + max_seq_items = get_option("display.max_seq_items") or len(obj) + if len(obj) > max_seq_items: + attrs.append(("length", len(obj))) + return attrs diff --git a/venv/Lib/site-packages/pandas/io/formats/style.py b/venv/Lib/site-packages/pandas/io/formats/style.py new file mode 100644 index 0000000..8570875 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/formats/style.py @@ -0,0 +1,1528 @@ +""" +Module for applying conditional formatting to +DataFrames and Series. +""" + +from collections import defaultdict +from contextlib import contextmanager +import copy +from functools import partial +from itertools import product +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple +from uuid import uuid1 + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_float + +import pandas as pd +from pandas.api.types import is_dict_like, is_list_like +import pandas.core.common as com +from pandas.core.generic import _shared_docs +from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice + +jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") + + +try: + import matplotlib.pyplot as plt + from matplotlib import colors + + has_mpl = True +except ImportError: + has_mpl = False + no_mpl_message = "{0} requires matplotlib." + + +@contextmanager +def _mpl(func): + if has_mpl: + yield plt, colors + else: + raise ImportError(no_mpl_message.format(func.__name__)) + + +class Styler: + """ + Helps style a DataFrame or Series according to the data with HTML and CSS. 
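# Illustrative sketch (not part of the patch): format_object_attrs (defined in
# printing.py above) collects the (attr, formatted value) pairs that Index
# reprs append after the data, e.g.:
import pandas as pd
from pandas.io.formats.printing import format_object_attrs

idx = pd.Index([1, 2, 3], name="x")
print(format_object_attrs(idx))
# -> [('dtype', "'int64'"), ('name', "'x'")]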
+ + Parameters + ---------- + data : Series or DataFrame + Data to be styled - either a Series or DataFrame. + precision : int + Precision to round floats to, defaults to pd.options.display.precision. + table_styles : list-like, default None + List of {selector: (attr, value)} dicts; see Notes. + uuid : str, default None + A unique identifier to avoid CSS collisions; generated automatically. + caption : str, default None + Caption to attach to the table. + table_attributes : str, default None + Items that show up in the opening ```` tag + in addition to automatic (by default) id. + cell_ids : bool, default True + If True, each cell will have an ``id`` attribute in their HTML tag. + The ``id`` takes the form ``T__row_col`` + where ```` is the unique identifier, ```` is the row + number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 + + Attributes + ---------- + env : Jinja2 jinja2.Environment + template : Jinja2 Template + loader : Jinja2 Loader + + See Also + -------- + DataFrame.style : Return a Styler object containing methods for building + a styled HTML representation for the DataFrame. + + Notes + ----- + Most styling will be done by passing style functions into + ``Styler.apply`` or ``Styler.applymap``. Style functions should + return values with strings containing CSS ``'attr: value'`` that will + be applied to the indicated cells. + + If using in the Jupyter notebook, Styler has defined a ``_repr_html_`` + to automatically render itself. Otherwise call Styler.render to get + the generated HTML. + + CSS classes are attached to the generated HTML + + * Index and Column names include ``index_name`` and ``level`` + where `k` is its level in a MultiIndex + * Index label cells include + + * ``row_heading`` + * ``row`` where `n` is the numeric position of the row + * ``level`` where `k` is the level in a MultiIndex + + * Column label cells include + * ``col_heading`` + * ``col`` where `n` is the numeric position of the column + * ``evel`` where `k` is the level in a MultiIndex + + * Blank cells include ``blank`` + * Data cells include ``data`` + """ + + loader = jinja2.PackageLoader("pandas", "io/formats/templates") + env = jinja2.Environment(loader=loader, trim_blocks=True) + template = env.get_template("html.tpl") + + def __init__( + self, + data, + precision=None, + table_styles=None, + uuid=None, + caption=None, + table_attributes=None, + cell_ids=True, + na_rep: Optional[str] = None, + ): + self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) + self._todo: List[Tuple[Callable, Tuple, Dict]] = [] + + if not isinstance(data, (pd.Series, pd.DataFrame)): + raise TypeError("``data`` must be a Series or DataFrame") + if data.ndim == 1: + data = data.to_frame() + if not data.index.is_unique or not data.columns.is_unique: + raise ValueError("style is not supported for non-unique indices.") + + self.data = data + self.index = data.index + self.columns = data.columns + + self.uuid = uuid + self.table_styles = table_styles + self.caption = caption + if precision is None: + precision = get_option("display.precision") + self.precision = precision + self.table_attributes = table_attributes + self.hidden_index = False + self.hidden_columns: Sequence[int] = [] + self.cell_ids = cell_ids + self.na_rep = na_rep + + # display_funcs maps (row, col) -> formatting function + + def default_display_func(x): + if self.na_rep is not None and pd.isna(x): + return 
self.na_rep + elif is_float(x): + display_format = f"{x:.{self.precision}f}" + return display_format + else: + return x + + self._display_funcs: DefaultDict[ + Tuple[int, int], Callable[[Any], str] + ] = defaultdict(lambda: default_display_func) + + def _repr_html_(self): + """ + Hooks into Jupyter notebook rich display system. + """ + return self.render() + + @Appender( + _shared_docs["to_excel"] + % dict( + axes="index, columns", + klass="Styler", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + optional_by=""" + by : str or list of str + Name or list of names which refer to the axis items.""", + versionadded_to_excel="\n .. versionadded:: 0.20", + ) + ) + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): + + from pandas.io.formats.excel import ExcelFormatter + + formatter = ExcelFormatter( + self, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) + + def _translate(self): + """ + Convert the DataFrame in `self.data` and the attrs from `_build_styles` + into a dictionary of {head, body, uuid, cellstyle}. + """ + table_styles = self.table_styles or [] + caption = self.caption + ctx = self.ctx + precision = self.precision + hidden_index = self.hidden_index + hidden_columns = self.hidden_columns + uuid = self.uuid or str(uuid1()).replace("-", "_") + ROW_HEADING_CLASS = "row_heading" + COL_HEADING_CLASS = "col_heading" + INDEX_NAME_CLASS = "index_name" + + DATA_CLASS = "data" + BLANK_CLASS = "blank" + BLANK_VALUE = "" + + def format_attr(pair): + return f"{pair['key']}={pair['value']}" + + # for sparsifying a MultiIndex + idx_lengths = _get_level_lengths(self.index) + col_lengths = _get_level_lengths(self.columns, hidden_columns) + + cell_context = dict() + + n_rlvls = self.data.index.nlevels + n_clvls = self.data.columns.nlevels + rlabels = self.data.index.tolist() + clabels = self.data.columns.tolist() + + if n_rlvls == 1: + rlabels = [[x] for x in rlabels] + if n_clvls == 1: + clabels = [[x] for x in clabels] + clabels = list(zip(*clabels)) + + cellstyle = [] + head = [] + + for r in range(n_clvls): + # Blank for Index columns... + row_es = [ + { + "type": "th", + "value": BLANK_VALUE, + "display_value": BLANK_VALUE, + "is_visible": not hidden_index, + "class": " ".join([BLANK_CLASS]), + } + ] * (n_rlvls - 1) + + # ... 
except maybe the last for columns.names + name = self.data.columns.names[r] + cs = [ + BLANK_CLASS if name is None else INDEX_NAME_CLASS, + f"level{r}", + ] + name = BLANK_VALUE if name is None else name + row_es.append( + { + "type": "th", + "value": name, + "display_value": name, + "class": " ".join(cs), + "is_visible": not hidden_index, + } + ) + + if clabels: + for c, value in enumerate(clabels[r]): + cs = [ + COL_HEADING_CLASS, + f"level{r}", + f"col{c}", + ] + cs.extend( + cell_context.get("col_headings", {}).get(r, {}).get(c, []) + ) + es = { + "type": "th", + "value": value, + "display_value": value, + "class": " ".join(cs), + "is_visible": _is_visible(c, r, col_lengths), + } + colspan = col_lengths.get((r, c), 0) + if colspan > 1: + es["attributes"] = [ + format_attr({"key": "colspan", "value": colspan}) + ] + row_es.append(es) + head.append(row_es) + + if ( + self.data.index.names + and com.any_not_none(*self.data.index.names) + and not hidden_index + ): + index_header_row = [] + + for c, name in enumerate(self.data.index.names): + cs = [INDEX_NAME_CLASS, f"level{c}"] + name = "" if name is None else name + index_header_row.append( + {"type": "th", "value": name, "class": " ".join(cs)} + ) + + index_header_row.extend( + [{"type": "th", "value": BLANK_VALUE, "class": " ".join([BLANK_CLASS])}] + * (len(clabels[0]) - len(hidden_columns)) + ) + + head.append(index_header_row) + + body = [] + for r, idx in enumerate(self.data.index): + row_es = [] + for c, value in enumerate(rlabels[r]): + rid = [ + ROW_HEADING_CLASS, + f"level{c}", + f"row{r}", + ] + es = { + "type": "th", + "is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index), + "value": value, + "display_value": value, + "id": "_".join(rid[1:]), + "class": " ".join(rid), + } + rowspan = idx_lengths.get((c, r), 0) + if rowspan > 1: + es["attributes"] = [ + format_attr({"key": "rowspan", "value": rowspan}) + ] + row_es.append(es) + + for c, col in enumerate(self.data.columns): + cs = [DATA_CLASS, f"row{r}", f"col{c}"] + cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) + formatter = self._display_funcs[(r, c)] + value = self.data.iloc[r, c] + row_dict = { + "type": "td", + "value": value, + "class": " ".join(cs), + "display_value": formatter(value), + "is_visible": (c not in hidden_columns), + } + # only add an id if the cell has a style + if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): + row_dict["id"] = "_".join(cs[1:]) + row_es.append(row_dict) + props = [] + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(x.split(":")) + else: + props.append(["", ""]) + cellstyle.append({"props": props, "selector": f"row{r}_col{c}"}) + body.append(row_es) + + table_attr = self.table_attributes + use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + table_attr = table_attr or "" + if 'class="' in table_attr: + table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') + else: + table_attr += ' class="tex2jax_ignore"' + + return dict( + head=head, + cellstyle=cellstyle, + body=body, + uuid=uuid, + precision=precision, + table_styles=table_styles, + caption=caption, + table_attributes=table_attr, + ) + + def format(self, formatter, subset=None, na_rep: Optional[str] = None): + """ + Format the text display value of cells. 
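# Illustrative sketch (not part of the patch): the default_display_func set up
# in Styler.__init__ above is what rounds floats to the configured precision
# and substitutes na_rep for missing values; constructing the Styler directly
# (as this vendored version allows) makes that visible.
import numpy as np
import pandas as pd
from pandas.io.formats.style import Styler

df = pd.DataFrame({"a": [1.23456, np.nan]})
styler = Styler(df, precision=3, na_rep="-")
html = styler.render()  # the cells render as "1.235" and "-"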
+ + Parameters + ---------- + formatter : str, callable, dict or None + If ``formatter`` is None, the default formatter is used + subset : IndexSlice + An argument to ``DataFrame.loc`` that restricts which elements + ``formatter`` is applied to. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 + + Returns + ------- + self : Styler + + Notes + ----- + + ``formatter`` is either an ``a`` or a dict ``{column name: a}`` where + ``a`` is one of + + - str: this will be wrapped in: ``a.format(x)`` + - callable: called with the value of an individual cell + + The default display value for numeric values is the "general" (``g``) + format with ``pd.options.display.precision`` precision. + + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) + >>> df.style.format("{:.2%}") + >>> df['c'] = ['a', 'b', 'c', 'd'] + >>> df.style.format({'c': str.upper}) + """ + if formatter is None: + assert self._display_funcs.default_factory is not None + formatter = self._display_funcs.default_factory() + + if subset is None: + row_locs = range(len(self.data)) + col_locs = range(len(self.data.columns)) + else: + subset = _non_reducing_slice(subset) + if len(subset) == 1: + subset = subset, self.data.columns + + sub_df = self.data.loc[subset] + row_locs = self.data.index.get_indexer_for(sub_df.index) + col_locs = self.data.columns.get_indexer_for(sub_df.columns) + + if is_dict_like(formatter): + for col, col_formatter in formatter.items(): + # formatter must be callable, so '{}' are converted to lambdas + col_formatter = _maybe_wrap_formatter(col_formatter, na_rep) + col_num = self.data.columns.get_indexer_for([col])[0] + + for row_num in row_locs: + self._display_funcs[(row_num, col_num)] = col_formatter + else: + # single scalar to format all cells with + formatter = _maybe_wrap_formatter(formatter, na_rep) + locs = product(*(row_locs, col_locs)) + for i, j in locs: + self._display_funcs[(i, j)] = formatter + return self + + def render(self, **kwargs): + """ + Render the built up styles to HTML. + + Parameters + ---------- + **kwargs + Any additional keyword arguments are passed + through to ``self.template.render``. + This is useful when you need to provide + additional variables for a custom template. + + Returns + ------- + rendered : str + The rendered HTML. + + Notes + ----- + ``Styler`` objects have defined the ``_repr_html_`` method + which automatically calls ``self.render()`` when it's the + last item in a Notebook cell. When calling ``Styler.render()`` + directly, wrap the result in ``IPython.display.HTML`` to view + the rendered HTML in the notebook. + + Pandas uses the following keys in render. Arguments passed + in ``**kwargs`` take precedence, so think carefully if you want + to override them: + + * head + * cellstyle + * body + * uuid + * precision + * table_styles + * caption + * table_attributes + """ + self._compute() + # TODO: namespace all the pandas keys + d = self._translate() + # filter out empty styles, every cell will have a class + # but the list of props may just be [['', '']]. + # so we have the neested anys below + trimmed = [x for x in d["cellstyle"] if any(any(y) for y in x["props"])] + d["cellstyle"] = trimmed + d.update(kwargs) + return self.template.render(**d) + + def _update_ctx(self, attrs): + """ + Update the state of the Styler. + + Collects a mapping of {index_label: [': ']}. 
+ + attrs : Series or DataFrame + should contain strings of ': ;: ' + Whitespace shouldn't matter and the final trailing ';' shouldn't + matter. + """ + for row_label, v in attrs.iterrows(): + for col_label, col in v.items(): + i = self.index.get_indexer([row_label])[0] + j = self.columns.get_indexer([col_label])[0] + for pair in col.rstrip(";").split(";"): + self.ctx[(i, j)].append(pair) + + def _copy(self, deepcopy=False): + styler = Styler( + self.data, + precision=self.precision, + caption=self.caption, + uuid=self.uuid, + table_styles=self.table_styles, + na_rep=self.na_rep, + ) + if deepcopy: + styler.ctx = copy.deepcopy(self.ctx) + styler._todo = copy.deepcopy(self._todo) + else: + styler.ctx = self.ctx + styler._todo = self._todo + return styler + + def __copy__(self): + """ + Deep copy by default. + """ + return self._copy(deepcopy=False) + + def __deepcopy__(self, memo): + return self._copy(deepcopy=True) + + def clear(self): + """ + Reset the styler, removing any previously applied styles. + + Returns None. + """ + self.ctx.clear() + self._todo = [] + + def _compute(self): + """ + Execute the style functions built up in `self._todo`. + + Relies on the conventions that all style functions go through + .apply or .applymap. The append styles to apply as tuples of + + (application method, *args, **kwargs) + """ + r = self + for func, args, kwargs in self._todo: + r = func(self)(*args, **kwargs) + return r + + def _apply(self, func, axis=0, subset=None, **kwargs): + subset = slice(None) if subset is None else subset + subset = _non_reducing_slice(subset) + data = self.data.loc[subset] + if axis is not None: + result = data.apply(func, axis=axis, result_type="expand", **kwargs) + result.columns = data.columns + else: + result = func(data, **kwargs) + if not isinstance(result, pd.DataFrame): + raise TypeError( + f"Function {repr(func)} must return a DataFrame when " + f"passed to `Styler.apply` with axis=None" + ) + if not ( + result.index.equals(data.index) and result.columns.equals(data.columns) + ): + raise ValueError( + f"Result of {repr(func)} must have identical " + f"index and columns as the input" + ) + + result_shape = result.shape + expected_shape = self.data.loc[subset].shape + if result_shape != expected_shape: + raise ValueError( + f"Function {repr(func)} returned the wrong shape.\n" + f"Result has shape: {result.shape}\n" + f"Expected shape: {expected_shape}" + ) + self._update_ctx(result) + return self + + def apply(self, func, axis=0, subset=None, **kwargs): + """ + Apply a function column-wise, row-wise, or table-wise. + + Updates the HTML representation with the result. + + Parameters + ---------- + func : function + ``func`` should take a Series or DataFrame (depending + on ``axis``), and return an object with the same shape. + Must return a DataFrame with identical index and + column labels when ``axis=None``. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. + subset : IndexSlice + A valid indexer to limit ``data`` to *before* applying the + function. Consider using a pandas.IndexSlice. + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + self : Styler + + Notes + ----- + The output shape of ``func`` should match the input, i.e. if + ``x`` is the input row, column, or table (depending on ``axis``), + then ``func(x).shape == x.shape`` should be true. 
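# Illustrative sketch (not part of the patch): the axis semantics described in
# the Notes above — axis=0/1 styles column- or row-wise, while axis=None hands
# the whole DataFrame to the function at once, which must return a same-shaped
# DataFrame of CSS strings.
import numpy as np
import pandas as pd

def highlight_global_max(data):
    # DataFrame in, DataFrame of CSS strings out (same index and columns)
    is_max = data == data.to_numpy().max()
    return pd.DataFrame(
        np.where(is_max, "background-color: yellow", ""),
        index=data.index,
        columns=data.columns,
    )

df = pd.DataFrame(np.random.randn(5, 2))
styled = df.style.apply(highlight_global_max, axis=None)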
+ + This is similar to ``DataFrame.apply``, except that ``axis=None`` + applies the function to the entire DataFrame at once, + rather than column-wise or row-wise. + + Examples + -------- + >>> def highlight_max(x): + ... return ['background-color: yellow' if v == x.max() else '' + for v in x] + ... + >>> df = pd.DataFrame(np.random.randn(5, 2)) + >>> df.style.apply(highlight_max) + """ + self._todo.append( + (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + ) + return self + + def _applymap(self, func, subset=None, **kwargs): + func = partial(func, **kwargs) # applymap doesn't take kwargs? + if subset is None: + subset = pd.IndexSlice[:] + subset = _non_reducing_slice(subset) + result = self.data.loc[subset].applymap(func) + self._update_ctx(result) + return self + + def applymap(self, func, subset=None, **kwargs): + """ + Apply a function elementwise. + + Updates the HTML representation with the result. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a scalar. + subset : IndexSlice + A valid indexer to limit ``data`` to *before* applying the + function. Consider using a pandas.IndexSlice. + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + self : Styler + + See Also + -------- + Styler.where + """ + self._todo.append( + (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) + ) + return self + + def where(self, cond, value, other=None, subset=None, **kwargs): + """ + Apply a function elementwise. + + Updates the HTML representation with a style which is + selected in accordance with the return value of a function. + + .. versionadded:: 0.21.0 + + Parameters + ---------- + cond : callable + ``cond`` should take a scalar and return a boolean. + value : str + Applied when ``cond`` returns true. + other : str + Applied when ``cond`` returns false. + subset : IndexSlice + A valid indexer to limit ``data`` to *before* applying the + function. Consider using a pandas.IndexSlice. + **kwargs : dict + Pass along to ``cond``. + + Returns + ------- + self : Styler + + See Also + -------- + Styler.applymap + """ + + if other is None: + other = "" + + return self.applymap( + lambda val: value if cond(val) else other, subset=subset, **kwargs + ) + + def set_precision(self, precision): + """ + Set the precision used to render. + + Parameters + ---------- + precision : int + + Returns + ------- + self : Styler + """ + self.precision = precision + return self + + def set_table_attributes(self, attributes): + """ + Set the table attributes. + + These are the items that show up in the opening ``
`` tag + in addition to to automatic (by default) id. + + Parameters + ---------- + attributes : str + + Returns + ------- + self : Styler + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(10, 4)) + >>> df.style.set_table_attributes('class="pure-table"') + # ...
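# Illustrative sketch -- not part of the pandas source in this diff.
# Styler.where(), defined above, is a thin wrapper around applymap(): `value`
# is applied wherever the predicate is True and `other` elsewhere.
import pandas as pd

df = pd.DataFrame({"x": [1, -2, 3], "y": [-4, 5, -6]})
styled = df.style.where(lambda v: v < 0, "color: red", other="color: black")
html = styled.render()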
... + """ + self.table_attributes = attributes + return self + + def export(self): + """ + Export the styles to applied to the current Styler. + + Can be applied to a second style with ``Styler.use``. + + Returns + ------- + styles : list + + See Also + -------- + Styler.use + """ + return self._todo + + def use(self, styles): + """ + Set the styles on the current Styler. + + Possibly uses styles from ``Styler.export``. + + Parameters + ---------- + styles : list + List of style functions. + + Returns + ------- + self : Styler + + See Also + -------- + Styler.export + """ + self._todo.extend(styles) + return self + + def set_uuid(self, uuid): + """ + Set the uuid for a Styler. + + Parameters + ---------- + uuid : str + + Returns + ------- + self : Styler + """ + self.uuid = uuid + return self + + def set_caption(self, caption): + """ + Set the caption on a Styler. + + Parameters + ---------- + caption : str + + Returns + ------- + self : Styler + """ + self.caption = caption + return self + + def set_table_styles(self, table_styles): + """ + Set the table styles on a Styler. + + These are placed in a `` +{%- endblock style %} +{%- block before_table %}{% endblock before_table %} +{%- block table %} +
+{%- block caption %} +{%- if caption -%} + +{%- endif -%} +{%- endblock caption %} +{%- block thead %} + + {%- block before_head_rows %}{% endblock %} + {%- for r in head %} + {%- block head_tr scoped %} + + {%- for c in r %} + {%- if c.is_visible != False %} + <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}} + {%- endif %} + {%- endfor %} + + {%- endblock head_tr %} + {%- endfor %} + {%- block after_head_rows %}{% endblock %} + +{%- endblock thead %} +{%- block tbody %} + + {% block before_rows %}{% endblock before_rows %} + {% for r in body %} + {% block tr scoped %} + + {% for c in r %} + {% if c.is_visible != False %} + <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }} + {% endif %} + {%- endfor %} + + {% endblock tr %} + {%- endfor %} + {%- block after_rows %}{%- endblock after_rows %} + +{%- endblock tbody %} +
+    <caption>{{caption}}</caption>
+{%- endblock table %} +{%- block after_table %}{% endblock after_table %} diff --git a/venv/Lib/site-packages/pandas/io/gbq.py b/venv/Lib/site-packages/pandas/io/gbq.py new file mode 100644 index 0000000..69ebc47 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/gbq.py @@ -0,0 +1,220 @@ +""" Google BigQuery support """ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from pandas.compat._optional import import_optional_dependency + +if TYPE_CHECKING: + from pandas import DataFrame + + +def _try_import(): + # since pandas is a dependency of pandas-gbq + # we need to import on first use + msg = ( + "pandas-gbq is required to load data from Google BigQuery. " + "See the docs: https://pandas-gbq.readthedocs.io." + ) + pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg) + return pandas_gbq + + +def read_gbq( + query: str, + project_id: Optional[str] = None, + index_col: Optional[str] = None, + col_order: Optional[List[str]] = None, + reauth: bool = False, + auth_local_webserver: bool = False, + dialect: Optional[str] = None, + location: Optional[str] = None, + configuration: Optional[Dict[str, Any]] = None, + credentials=None, + use_bqstorage_api: Optional[bool] = None, + private_key=None, + verbose=None, + progress_bar_type: Optional[str] = None, +) -> "DataFrame": + """ + Load data from Google BigQuery. + + This function requires the `pandas-gbq package + `__. + + See the `How to authenticate with Google BigQuery + `__ + guide for authentication instructions. + + Parameters + ---------- + query : str + SQL-Like Query to return data values. + project_id : str, optional + Google BigQuery Account project ID. Optional when available from + the environment. + index_col : str, optional + Name of result column to use for index in results DataFrame. + col_order : list(str), optional + List of BigQuery column names in the desired order for results + DataFrame. + reauth : bool, default False + Force Google BigQuery to re-authenticate the user. This is useful + if multiple accounts are used. + auth_local_webserver : bool, default False + Use the `local webserver flow`_ instead of the `console flow`_ + when getting user credentials. + + .. _local webserver flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. _console flow: + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + + *New in version 0.2.0 of pandas-gbq*. + dialect : str, default 'legacy' + Note: The default value is changing to 'standard' in a future version. + + SQL syntax dialect to use. Value can be one of: + + ``'legacy'`` + Use BigQuery's legacy SQL dialect. For more information see + `BigQuery Legacy SQL Reference + `__. + ``'standard'`` + Use BigQuery's standard SQL, which is + compliant with the SQL 2011 standard. For more information + see `BigQuery Standard SQL Reference + `__. + + .. versionchanged:: 0.24.0 + location : str, optional + Location where the query job should run. See the `BigQuery locations + documentation + `__ for a + list of available locations. The location must match that of any + datasets used in the query. + + *New in version 0.5.0 of pandas-gbq*. + configuration : dict, optional + Query config parameters for job processing. + For example: + + configuration = {'query': {'useQueryCache': False}} + + For more information see `BigQuery REST API Reference + `__. 
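# Illustrative sketch -- not part of the pandas source in this diff.
# A typical read_gbq() call using the query-config dict described above.
# The project id and query are placeholders, not values from this repository;
# the call requires the pandas-gbq package and valid Google credentials.
import pandas as pd

df = pd.read_gbq(
    "SELECT name, SUM(number) AS total "
    "FROM `bigquery-public-data.usa_names.usa_1910_2013` "
    "GROUP BY name ORDER BY total DESC LIMIT 10",
    project_id="my-project",  # placeholder project id
    dialect="standard",
    configuration={"query": {"useQueryCache": False}},
)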
+ credentials : google.auth.credentials.Credentials, optional + Credentials for accessing Google APIs. Use this parameter to override + default credentials, such as to use Compute Engine + :class:`google.auth.compute_engine.Credentials` or Service Account + :class:`google.oauth2.service_account.Credentials` directly. + + *New in version 0.8.0 of pandas-gbq*. + + .. versionadded:: 0.24.0 + use_bqstorage_api : bool, default False + Use the `BigQuery Storage API + `__ to + download query results quickly, but at an increased cost. To use this + API, first `enable it in the Cloud Console + `__. + You must also have the `bigquery.readsessions.create + `__ + permission on the project you are billing queries to. + + This feature requires version 0.10.0 or later of the ``pandas-gbq`` + package. It also requires the ``google-cloud-bigquery-storage`` and + ``fastavro`` packages. + + .. versionadded:: 0.25.0 + progress_bar_type : Optional, str + If set, use the `tqdm `__ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + Possible values of ``progress_bar_type`` include: + + ``None`` + No progress bar. + ``'tqdm'`` + Use the :func:`tqdm.tqdm` function to print a progress bar + to :data:`sys.stderr`. + ``'tqdm_notebook'`` + Use the :func:`tqdm.tqdm_notebook` function to display a + progress bar as a Jupyter notebook widget. + ``'tqdm_gui'`` + Use the :func:`tqdm.tqdm_gui` function to display a + progress bar as a graphical dialog box. + + Note that his feature requires version 0.12.0 or later of the + ``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly + different than ``pandas-gbq``, here the default is ``None``. + + .. versionadded:: 1.0.0 + + Returns + ------- + df: DataFrame + DataFrame representing results of query. + + See Also + -------- + pandas_gbq.read_gbq : This function in the pandas-gbq library. + DataFrame.to_gbq : Write a DataFrame to Google BigQuery. + """ + pandas_gbq = _try_import() + + kwargs: Dict[str, Union[str, bool]] = {} + + # START: new kwargs. Don't populate unless explicitly set. 
+ if use_bqstorage_api is not None: + kwargs["use_bqstorage_api"] = use_bqstorage_api + + if progress_bar_type is not None: + kwargs["progress_bar_type"] = progress_bar_type + # END: new kwargs + + return pandas_gbq.read_gbq( + query, + project_id=project_id, + index_col=index_col, + col_order=col_order, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + dialect=dialect, + location=location, + configuration=configuration, + credentials=credentials, + **kwargs, + ) + + +def to_gbq( + dataframe: "DataFrame", + destination_table: str, + project_id: Optional[str] = None, + chunksize: Optional[int] = None, + reauth: bool = False, + if_exists: str = "fail", + auth_local_webserver: bool = False, + table_schema: Optional[List[Dict[str, str]]] = None, + location: Optional[str] = None, + progress_bar: bool = True, + credentials=None, + verbose=None, + private_key=None, +) -> None: + pandas_gbq = _try_import() + pandas_gbq.to_gbq( + dataframe, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, + private_key=private_key, + ) diff --git a/venv/Lib/site-packages/pandas/io/gcs.py b/venv/Lib/site-packages/pandas/io/gcs.py new file mode 100644 index 0000000..1f5e0fa --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/gcs.py @@ -0,0 +1,18 @@ +""" GCS support for remote file interactivity """ +from pandas.compat._optional import import_optional_dependency + +gcsfs = import_optional_dependency( + "gcsfs", extra="The gcsfs library is required to handle GCS files" +) + + +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): + + if mode is None: + mode = "rb" + + fs = gcsfs.GCSFileSystem() + filepath_or_buffer = fs.open(filepath_or_buffer, mode) + return filepath_or_buffer, None, compression, True diff --git a/venv/Lib/site-packages/pandas/io/html.py b/venv/Lib/site-packages/pandas/io/html.py new file mode 100644 index 0000000..04f9f31 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/html.py @@ -0,0 +1,1101 @@ +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. 
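# Illustrative sketch -- not part of the pandas source in this diff.
# The gcs helper above lets the pandas readers open "gs://" paths through
# gcsfs, so a remote CSV can be read directly. Requires the gcsfs package and
# credentials; the bucket and object name below are placeholders.
import pandas as pd

df = pd.read_csv("gs://my-bucket/path/to/data.csv")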
+ +""" + +from collections import abc +import numbers +import os +import re + +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError, EmptyDataError + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.construction import create_series_with_explicit_dtype + +from pandas.io.common import is_url, urlopen, validate_header_arg +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +_IMPORTS = False +_HAS_BS4 = False +_HAS_LXML = False +_HAS_HTML5LIB = False + + +def _importers(): + # import things we need + # but make this done on a first use basis + + global _IMPORTS + if _IMPORTS: + return + + global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB + bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore") + _HAS_BS4 = bs4 is not None + + lxml = import_optional_dependency( + "lxml.etree", raise_on_missing=False, on_version="ignore" + ) + _HAS_LXML = lxml is not None + + html5lib = import_optional_dependency( + "html5lib", raise_on_missing=False, on_version="ignore" + ) + _HAS_HTML5LIB = html5lib is not None + + _IMPORTS = True + + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") + + +def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str: + """ + Replace extra whitespace inside of a string with a single space. + + Parameters + ---------- + s : str or unicode + The string from which to remove extra whitespace. + regex : re.Pattern + The regular expression to use to remove extra whitespace. + + Returns + ------- + subd : str or unicode + `s` with all extra whitespace replaced with a single space. + """ + return regex.sub(" ", s.strip()) + + +def _get_skiprows(skiprows): + """ + Get an iterator given an integer, slice or container. + + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. + """ + if isinstance(skiprows, slice): + start, step = skiprows.start or 0, skiprows.step or 1 + return list(range(start, skiprows.stop, step)) + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): + return skiprows + elif skiprows is None: + return 0 + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") + + +def _read(obj): + """ + Try to read from a url, file or string. + + Parameters + ---------- + obj : str, unicode, or file-like + + Returns + ------- + raw_text : str + """ + if is_url(obj): + with urlopen(obj) as url: + text = url.read() + elif hasattr(obj, "read"): + text = obj.read() + elif isinstance(obj, (str, bytes)): + text = obj + try: + if os.path.isfile(text): + with open(text, "rb") as f: + return f.read() + except (TypeError, ValueError): + pass + else: + raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") + return text + + +class _HtmlFrameParser: + """ + Base class for parsers that parse HTML into DataFrames. + + Parameters + ---------- + io : str or file-like + This can be either a string of raw HTML, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. 
+ + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_attr_getter` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` + * :func:`_parse_tables` + * :func:`_equals_tag` + See each method's respective documentation for details on their + functionality. + """ + + def __init__(self, io, match, attrs, encoding, displayed_only): + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + self.displayed_only = displayed_only + + def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) + + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + str or unicode + The attribute value. + """ + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) + + def _text_getter(self, obj): + """ + Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise AbstractMethodError(self) + + def _parse_td(self, obj): + """ + Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + list of node-like + These are the elements of each row, i.e., the columns. + """ + raise AbstractMethodError(self) + + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains zero or more thead elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. 
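# Illustrative sketch -- not part of the pandas source in this diff.
# How the parser classes documented above are driven: a concrete subclass is
# built with (io, match, attrs, encoding, displayed_only) and parse_tables()
# yields one (header, body, footer) triple of text rows per table found.
# _LxmlFrameParser is an internal class; this only illustrates the contract
# and assumes lxml is installed.
import re
from pandas.io.html import _LxmlFrameParser

html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
p = _LxmlFrameParser(html, re.compile(".+"), None, None, True)
for header, body, footer in p.parse_tables():
    print(header, body, footer)  # expected: [['a']] [['1']] []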
+ """ + raise AbstractMethodError(self) + + def _parse_tables(self, doc, match, attrs): + """ + Return all tables from the parsed DOM. + + Parameters + ---------- + doc : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. + + Returns + ------- + list of node-like + HTML
elements to be parsed into raw data. + """ + raise AbstractMethodError(self) + + def _equals_tag(self, obj, tag): + """ + Return whether an individual DOM node matches a tag + + Parameters + ---------- + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. + + Returns + ------- + boolean + Whether `obj`'s tag name is `tag` + """ + raise AbstractMethodError(self) + + def _build_doc(self): + """ + Return a tree-like object that can be used to iterate over the DOM. + + Returns + ------- + node-like + The DOM from which to parse the table element. + """ + raise AbstractMethodError(self) + + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + + return header, body, footer + + def _expand_colspan_rowspan(self, rows): + """ + Given a list of s, return a list of text rows. + + Parameters + ---------- + rows : list of node-like + List of s + + Returns + ------- + list of list + Each returned row is a list of str text. + + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + + all_texts = [] # list of rows, each a list of str + remainder = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). + return row.xpath("./td|./th") + + def _parse_tables(self, doc, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # 2. go up the tree until we find a table + xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table" + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): + elem.getparent().remove(elem) + + if not tables: + raise ValueError(f"No tables found matching regex {repr(pattern)}") + return tables + + def _equals_tag(self, obj, tag): + return obj.tag == tag + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. 
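# Illustrative sketch -- not part of the pandas source in this diff.
# The XPath expression built above is what makes read_html's `match` and
# `attrs` arguments work with the lxml flavor: only tables whose text matches
# the regex and whose attributes match are kept. The table ids and cell
# contents below are made up for the example; lxml must be installed.
import pandas as pd

html = """
<table id="prices"><tr><th>item</th><th>price</th></tr>
<tr><td>apple</td><td>3</td></tr></table>
<table id="other"><tr><th>x</th></tr><tr><td>1</td></tr></table>
"""
dfs = pd.read_html(html, match="apple", attrs={"id": "prices"}, flavor="lxml")
print(dfs[0])  # only the "prices" table is returned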
For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. + + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.html import parse, fromstring, HTMLParser + from lxml.etree import XMLSyntaxError + + parser = HTMLParser(recover=True, encoding=self.encoding) + + try: + if is_url(self.io): + with urlopen(self.io) as f: + r = parse(f, parser=parser) + else: + # try to parse the input in the simplest way + r = parse(self.io, parser=parser) + try: + r = r.getroot() + except AttributeError: + pass + except (UnicodeDecodeError, IOError) as e: + # if the input is a blob of html goop + if not is_url(self.io): + r = fromstring(self.io, parser=parser) + + try: + r = r.getroot() + except AttributeError: + pass + else: + raise e + else: + if not hasattr(r, "text_content"): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + return r + + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. + elements_at_root = thead.xpath("./td|./th") + if elements_at_root: + rows.append(thead) + + return rows + + def _parse_tbody_tr(self, table): + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.xpath(".//tfoot//tr") + + +def _expand_elements(body): + data = [len(elem) for elem in body] + lens = create_series_with_explicit_dtype(data, dtype_if_empty=object) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [""] + for ind, length in not_max.items(): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) + if head: + body = head + body + + # Infer header when there is a or top ") + + result1 = self.read_html(data1)[0] + result2 = self.read_html(data2)[0] + + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) + + def test_parse_header_of_non_string_column(self): + # GH5048: if header is specified explicitly, an int column should be + # parsed as int while its header is parsed as str + result = self.read_html( + """ +
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts + + def _handle_hidden_tables(self, tbl_list, attr_name): + """ + Return list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of node-like + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of node-like + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] + + +class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses BeautifulSoup under the hood. + + See Also + -------- + pandas.io.html._HtmlFrameParser + pandas.io.html._LxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`pandas.io.html._HtmlFrameParser`. 
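# Illustrative sketch -- not part of the pandas source in this diff.
# The colspan/rowspan expansion above copies a spanning cell's text into every
# grid position it covers, so each parsed row stays rectangular. A small
# end-to-end example through read_html (assumes lxml is installed):
import pandas as pd

html = """
<table>
  <tr><th>a</th><th>b</th></tr>
  <tr><td colspan="2">wide</td></tr>
  <tr><td rowspan="2">tall</td><td>1</td></tr>
  <tr><td>2</td></tr>
</table>
"""
df = pd.read_html(html)[0]
# 'wide' fills both columns of its row; 'tall' is repeated into the next row,
# so the frame has rows (wide, wide), (tall, 1), (tall, 2).
print(df)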
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + from bs4 import SoupStrainer + + self._strainer = SoupStrainer("table") + + def _parse_tables(self, doc, match, attrs): + element_name = self._strainer.name + tables = doc.find_all(element_name, attrs=attrs) + + if not tables: + raise ValueError("No tables found") + + result = [] + unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") + + for table in tables: + if self.displayed_only: + for elem in table.find_all(style=re.compile(r"display:\s*none")): + elem.decompose() + + if table not in unique_tables and table.find(text=match) is not None: + result.append(table) + unique_tables.add(table) + + if not result: + raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") + return result + + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag): + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(("td", "th"), recursive=False) + + def _parse_thead_tr(self, table): + return table.select("thead tr") + + def _parse_tbody_tr(self, table): + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select("tfoot tr") + + def _setup_build_doc(self): + raw_text = _read(self.io) + if not raw_text: + raise ValueError(f"No text parsed from document: {self.io}") + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + +def _build_xpath_expr(attrs) -> str: + """Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") + + s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) + return f"[{s}]" + + +_re_namespace = {"re": "http://exslt.org/regular-expressions"} +_valid_schemes = "http", "file", "ftp" + + +class _LxmlFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + # Look for direct children only: the "row" element here may be a + #
<thead> or <tfoot> (see _parse_thead_tr).
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) if any(text for text in row)] + + if foot: + body += foot + + # fill out elements of body that are "ragged" + _expand_elements(body) + tp = TextParser(body, header=header, **kwargs) + df = tp.read() + return df + + +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} + + +def _parser_dispatch(flavor): + """ + Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : str + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError( + f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" + ) + + if flavor in ("bs4", "html5lib"): + if not _HAS_HTML5LIB: + raise ImportError("html5lib not found, please install it") + if not _HAS_BS4: + raise ImportError("BeautifulSoup4 (bs4) not found, please install it") + # Although we call this above, we want to raise here right before use. + bs4 = import_optional_dependency("bs4") # noqa:F841 + + else: + if not _HAS_LXML: + raise ImportError("lxml not found, please install it") + return _valid_parsers[flavor] + + +def _print_as_set(s) -> str: + arg = ", ".join(pprint_thing(el) for el in s) + return f"{{{arg}}}" + + +def _validate_flavor(flavor): + if flavor is None: + flavor = "lxml", "bs4" + elif isinstance(flavor, str): + flavor = (flavor,) + elif isinstance(flavor, abc.Iterable): + if not all(isinstance(flav, str) for flav in flavor): + raise TypeError( + f"Object of type {repr(type(flavor).__name__)} " + f"is not an iterable of strings" + ) + else: + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) + + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError( + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" + ) + return flavor + + +def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser(io, compiled_match, attrs, encoding, displayed_only) + + try: + tables = p.parse_tables() + except ValueError as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, "seekable") and io.seekable(): + io.seek(0) + elif hasattr(io, "seekable") and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError( + f"The flavor {flav} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a " + "different flavor." 
+ ) + + retained = caught + else: + break + else: + raise retained + + ret = [] + for table in tables: + try: + ret.append(_data_to_frame(data=table, **kwargs)) + except EmptyDataError: # empty table + continue + return ret + + +def read_html( + io, + match=".+", + flavor=None, + header=None, + index_col=None, + skiprows=None, + attrs=None, + parse_dates=False, + thousands=",", + encoding=None, + decimal=".", + converters=None, + na_values=None, + keep_default_na=True, + displayed_only=True, +): + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. + + Parameters + ---------- + io : str, path object or file-like object + A URL, a file-like object, or a raw string containing HTML. Note that + lxml only accepts the http, ftp and file url protocols. If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. + + match : str or compiled regular expression, optional + The set of tables containing text matching this regex or string will be + returned. Unless the HTML is extremely simple you will probably need to + pass a non-empty string here. Defaults to '.+' (match any non-empty + string). The default value will return all tables contained on a page. + This value is converted to a regular expression so that there is + consistent behavior between Beautiful Soup and lxml. + + flavor : str or None + The parsing engine to use. 'bs4' and 'html5lib' are synonymous with + each other, they are both there for backwards compatibility. The + default of ``None`` tries to use ``lxml`` to parse and if that fails it + falls back on ``bs4`` + ``html5lib``. + + header : int or list-like or None, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. + + index_col : int or list-like or None, optional + The column (or list of columns) to use to create the index. + + skiprows : int or list-like or slice or None, optional + Number of rows to skip after parsing the column integer. 0-based. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. + + attrs : dict or None, optional + This is a dictionary of attributes that you can pass to use to identify + the table in the HTML. These are not checked for validity before being + passed to lxml or Beautiful Soup. However, these attributes must be + valid HTML table attributes to work correctly. For example, :: + + attrs = {'id': 'table'} + + is a valid attribute dictionary because the 'id' HTML tag attribute is + a valid HTML attribute for *any* HTML tag as per `this document + `__. :: + + attrs = {'asdf': 'table'} + + is *not* a valid attribute dictionary because 'asdf' is not a valid + HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 + table attributes can be found `here + `__. A + working draft of the HTML 5 spec can be found `here + `__. It contains the + latest information on table attributes for the modern web. + + parse_dates : bool, optional + See :func:`~read_csv` for more details. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + + encoding : str or None, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). + + decimal : str, default '.' 
+ Character to recognize as decimal point (e.g. use ',' for European + data). + + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + na_values : iterable, default None + Custom NA values. + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. + + displayed_only : bool, default True + Whether elements with "display: none" should be parsed. + + Returns + ------- + dfs + A list of DataFrames. + + See Also + -------- + read_csv + + Notes + ----- + Before using this function you should read the :ref:`gotchas about the + HTML parsing libraries `. + + Expect to do some cleanup after you call this function. For example, you + might need to manually assign column names if the column names are + converted to NaN when you pass the `header=0` argument. We try to assume as + little as possible about the structure of the table and push the + idiosyncrasies of the HTML contained in the table to the user. + + This function searches for ```` elements and only for ```` + and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``" not in df._repr_html_() + + with option_context("display.max_rows", 12, "display.min_rows", None): + # when set to None, follow value of max_rows + assert "5 5" in repr(df) + assert "" in df._repr_html_() + + with option_context("display.max_rows", 10, "display.min_rows", 12): + # when set value higher as max_rows, use the minimum + assert "5 5" not in repr(df) + assert "" not in df._repr_html_() + + with option_context("display.max_rows", None, "display.min_rows", 12): + # max_rows of None -> never truncate + assert ".." not in repr(df) + assert ".." not in df._repr_html_() + + def test_str_max_colwidth(self): + # GH 7856 + df = pd.DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "uncomfortably long line with lots of stuff", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably long line with lots of stuff 1\n" + "1 foo bar stuff 1" + ) + with option_context("max_colwidth", 20): + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably lo... 1\n" + "1 foo bar stuff 1" + ) + + def test_to_string_truncate(self): + # GH 9784 - dont truncate when calling DataFrame.to_string + df = pd.DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "let's make this a very VERY long line that is longer " + "than the default 50 character limit", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + with option_context("max_colwidth", 20): + # the display option has no effect on the to_string method + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + assert df.to_string(max_colwidth=20) == ( + " a b c d\n" + "0 foo bar let's make this ... 
1\n" + "1 foo bar stuff 1" + ) + + def test_auto_detect(self): + term_width, term_height = get_terminal_size() + fac = 1.05 # Arbitrary large factor to exceed term width + cols = range(int(term_width * fac)) + index = range(10) + df = DataFrame(index=index, columns=cols) + with option_context("mode.sim_interactive", True): + with option_context("max_rows", None): + with option_context("max_columns", None): + # Wrap around with None + assert has_expanded_repr(df) + with option_context("max_rows", 0): + with option_context("max_columns", 0): + # Truncate with auto detection. + assert has_horizontally_truncated_repr(df) + + index = range(int(term_height * fac)) + df = DataFrame(index=index, columns=cols) + with option_context("max_rows", 0): + with option_context("max_columns", None): + # Wrap around with None + assert has_expanded_repr(df) + # Truncate vertically + assert has_vertically_truncated_repr(df) + + with option_context("max_rows", None): + with option_context("max_columns", 0): + assert has_horizontally_truncated_repr(df) + + def test_to_string_repr_unicode(self): + buf = StringIO() + + unicode_values = ["\u03c3"] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({"unicode": unicode_values}) + df.to_string(col_space=10, buf=buf) + + # it works! + repr(df) + + idx = Index(["abc", "\u03c3a", "aegdvg"]) + ser = Series(np.random.randn(len(idx)), idx) + rs = repr(ser).split("\n") + line_len = len(rs[0]) + for line in rs[1:]: + try: + line = line.decode(get_option("display.encoding")) + except AttributeError: + pass + if not line.startswith("dtype:"): + assert len(line) == line_len + + # it works even if sys.stdin in None + _stdin = sys.stdin + try: + sys.stdin = None + repr(df) + finally: + sys.stdin = _stdin + + def test_to_string_unicode_columns(self, float_frame): + df = DataFrame({"\u03c3": np.arange(10.0)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + + result = float_frame.to_string() + assert isinstance(result, str) + + def test_to_string_utf8_columns(self): + n = "\u05d0".encode("utf-8") + + with option_context("display.max_rows", 1): + df = DataFrame([1, 2], columns=[n]) + repr(df) + + def test_to_string_unicode_two(self): + dm = DataFrame({"c/\u03c3": []}) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_unicode_three(self): + dm = DataFrame(["\xc2"]) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_with_formatters(self): + df = DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], + }, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), + ] + result = df.to_string(formatters=dict(formatters)) + result2 = df.to_string(formatters=list(zip(*formatters))[1]) + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" + ) + assert result == result2 + + def test_to_string_with_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({"months": months}) + + def format_func(x): + return x.strftime("%Y-%m") + + result = x.to_string(formatters={"months": format_func}) + expected = "months\n0 2016-01\n1 2016-02" + assert result.strip() == expected + + def test_to_string_with_datetime64_hourformatter(self): + + x = DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], 
format="%H:%M:%S.%f" + ) + } + ) + + def format_func(x): + return x.strftime("%H:%M") + + result = x.to_string(formatters={"hod": format_func}) + expected = "hod\n0 10:10\n1 12:12" + assert result.strip() == expected + + def test_to_string_with_formatters_unicode(self): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": str}) + assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" + + def test_east_asian_unicode_false(self): + # not aligned properly because of east asian width + + # mid col + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # last col + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) + assert repr(df) == expected + + # all col + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あああああ あ\n" + "bb い いいい\nc う う\n" + "ddd えええ ええええええ" + ) + assert repr(df) == expected + + # column name + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # index + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\nあああ あああああ あ\n" + "いいいいいい い いいい\nうう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # index name + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # all + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) + assert repr(df) == expected + + # MultiIndex + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) + assert repr(df) == expected + + # truncate + with option_context("display.max_rows", 3, "display.max_columns", 3): + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n0 あああああ ... さ\n" + ".. ... ... ...\n3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\nあああ あああああ ... さ\n" + ".. ... ... ...\naaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + def test_east_asian_unicode_true(self): + # Enable Unicode option ----------------------------------------- + with option_context("display.unicode.east_asian_width", True): + + # mid col + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # last col + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) + assert repr(df) == expected + + # all col + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\n" + "a あああああ あ\n" + "bb い いいい\n" + "c う う\n" + "ddd えええ ええええええ" + ) + assert repr(df) == expected + + # column name + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c う 33333\n" + "ddd ええええええ 4" + ) + assert repr(df) == expected + + # index + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\n" + "あああ あああああ あ\n" + "いいいいいい い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # index name + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) + assert repr(df) == expected + + # all + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) + assert repr(df) == expected + + # MultiIndex + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) + assert repr(df) == expected + + # truncate + with option_context("display.max_rows", 3, "display.max_columns", 3): + + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n" + "0 あああああ ... さ\n" + ".. ... ... ...\n" + "3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\n" + "あああ あああああ ... さ\n" + "... ... ... ...\n" + "aaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) + assert repr(df) == expected + + # ambiguous unicode + df = DataFrame( + {"b": ["あ", "いいい", "¡¡", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "¡¡¡"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c ¡¡ 33333\n" + "¡¡¡ ええええええ 4" + ) + assert repr(df) == expected + + def test_to_string_buffer_all_unicode(self): + buf = StringIO() + + empty = DataFrame({"c/\u03c3": Series(dtype=object)}) + nonempty = DataFrame({"c/\u03c3": Series([1, 2, 3])}) + + print(empty, file=buf) + print(nonempty, file=buf) + + # this should work + buf.getvalue() + + def test_to_string_with_col_space(self): + df = DataFrame(np.random.random(size=(1, 3))) + c10 = len(df.to_string(col_space=10).split("\n")[1]) + c20 = len(df.to_string(col_space=20).split("\n")[1]) + c30 = len(df.to_string(col_space=30).split("\n")[1]) + assert c10 < c20 < c30 + + # GH 8230 + # col_space wasn't being applied with header=False + with_header = df.to_string(col_space=20) + with_header_row1 = with_header.splitlines()[1] + no_header = df.to_string(col_space=20, header=False) + assert len(with_header_row1) == len(no_header) + + def test_to_string_truncate_indices(self): + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: + for column in [tm.makeStringIndex]: + for h in [10, 20]: + for w in [10, 20]: + with option_context("display.expand_frame_repr", False): + df = DataFrame(index=index(h), columns=column(w)) + with option_context("display.max_rows", 15): + if h == 20: + assert has_vertically_truncated_repr(df) + else: + assert not has_vertically_truncated_repr(df) + with option_context("display.max_columns", 15): + if w == 20: + assert has_horizontally_truncated_repr(df) + else: + assert not (has_horizontally_truncated_repr(df)) + with option_context( + "display.max_rows", 15, "display.max_columns", 15 + ): + if h == 20 and w == 20: + assert has_doubly_truncated_repr(df) + else: + assert not has_doubly_truncated_repr(df) + + def test_to_string_truncate_multilevel(self): + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = DataFrame(index=arrays, columns=arrays) + with option_context("display.max_rows", 7, "display.max_columns", 7): + assert has_doubly_truncated_repr(df) + + def test_truncate_with_different_dtypes(self): + + # 11594, 12045 + # when truncated the dtypes of the splits can differ + + # 11594 + import datetime + + s = Series( + [datetime.datetime(2012, 1, 1)] * 10 + + [datetime.datetime(1012, 1, 2)] + + [datetime.datetime(2012, 1, 3)] * 10 + ) + + with pd.option_context("display.max_rows", 8): + result = str(s) + assert "object" in result + + # 12045 + df = DataFrame({"text": ["some words"] + [None] * 9}) + + with pd.option_context("display.max_rows", 8, "display.max_columns", 3): + result = str(df) + assert "None" in result + assert "NaN" not in result + + def test_truncate_with_different_dtypes_multiindex(self): + # GH#13000 + df = DataFrame({"Vals": range(100)}) + frame = pd.concat([df], keys=["Sweep"], names=["Sweep", "Index"]) + result = repr(frame) + + result2 = repr(frame.iloc[:5]) + assert result.startswith(result2) + + def test_datetimelike_frame(self): + + # GH 12211 + df = DataFrame( + {"date": [pd.Timestamp("20130101").tz_localize("UTC")] + [pd.NaT] * 5} + ) + + with option_context("display.max_rows", 5): + result = str(df) + assert "2013-01-01 00:00:00+00:00" in result + assert 
"NaT" in result + assert "..." in result + assert "[6 rows x 1 columns]" in result + + dts = [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [pd.NaT] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00-05:00 1\n" + "1 2011-01-01 00:00:00-05:00 2\n" + ".. ... ..\n" + "8 NaT 9\n" + "9 NaT 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + dts = [pd.NaT] * 5 + [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 NaT 1\n" + "1 NaT 2\n" + ".. ... ..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + dts = [pd.Timestamp("2011-01-01", tz="Asia/Tokyo")] * 5 + [ + pd.Timestamp("2011-01-01", tz="US/Eastern") + ] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00+09:00 1\n" + "1 2011-01-01 00:00:00+09:00 2\n" + ".. ... ..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) + assert repr(df) == expected + + @pytest.mark.parametrize( + "start_date", + [ + "2017-01-01 23:59:59.999999999", + "2017-01-01 23:59:59.99999999", + "2017-01-01 23:59:59.9999999", + "2017-01-01 23:59:59.999999", + "2017-01-01 23:59:59.99999", + "2017-01-01 23:59:59.9999", + ], + ) + def test_datetimeindex_highprecision(self, start_date): + # GH19030 + # Check that high-precision time values for the end of day are + # included in repr for DatetimeIndex + df = DataFrame({"A": date_range(start=start_date, freq="D", periods=5)}) + result = str(df) + assert start_date in result + + dti = date_range(start=start_date, freq="D", periods=5) + df = DataFrame({"A": range(5)}, index=dti) + result = str(df.index) + assert start_date in result + + def test_nonunicode_nonascii_alignment(self): + df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) + rep_str = df.to_string() + lines = rep_str.split("\n") + assert len(lines[1]) == len(lines[2]) + + def test_unicode_problem_decoding_as_ascii(self): + dm = DataFrame({"c/\u03c3": Series({"test": np.nan})}) + str(dm.to_string()) + + def test_string_repr_encoding(self, datapath): + filepath = datapath("io", "parser", "data", "unicode_series.csv") + df = pd.read_csv(filepath, header=None, encoding="latin1") + repr(df) + repr(df[1]) + + def test_repr_corner(self): + # representing infs poses no problems + df = DataFrame({"foo": [-np.inf, np.inf]}) + repr(df) + + def test_frame_info_encoding(self): + index = ["'Til There Was You (1997)", "ldum klaka (Cold Fever) (1994)"] + fmt.set_option("display.max_rows", 1) + df = DataFrame(columns=["a", "b", "c"], index=index) + repr(df) + repr(df.T) + fmt.set_option("display.max_rows", 200) + + def test_wide_repr(self): + with option_context( + "mode.sim_interactive", + True, + "display.show_dimensions", + True, + "display.max_columns", + 20, + ): + max_cols = get_option("display.max_columns") + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + set_option("display.expand_frame_repr", False) + rep_str = repr(df) + + assert f"10 rows x {max_cols - 1} columns" in rep_str + set_option("display.expand_frame_repr", True) + wide_repr = repr(df) + assert rep_str != wide_repr + + with 
option_context("display.width", 120): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + reset_option("display.expand_frame_repr") + + def test_wide_repr_wide_columns(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + df = DataFrame( + np.random.randn(5, 3), columns=["a" * 90, "b" * 90, "c" * 90] + ) + rep_str = repr(df) + + assert len(rep_str.splitlines()) == 20 + + def test_wide_repr_named(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + df.index.name = "DataFrame Index" + set_option("display.expand_frame_repr", False) + + rep_str = repr(df) + set_option("display.expand_frame_repr", True) + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + for line in wide_repr.splitlines()[1::13]: + assert "DataFrame Index" in line + + reset_option("display.expand_frame_repr") + + def test_wide_repr_multiindex(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) + max_cols = get_option("display.max_columns") + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx) + df.index.names = ["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) + rep_str = repr(df) + set_option("display.expand_frame_repr", True) + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + for line in wide_repr.splitlines()[1::13]: + assert "Level 0 Level 1" in line + + reset_option("display.expand_frame_repr") + + def test_wide_repr_multiindex_cols(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") + midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) + mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - 1))) + df = DataFrame( + tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols + ) + df.index.names = ["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) + rep_str = repr(df) + set_option("display.expand_frame_repr", True) + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150, "display.max_columns", 20): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + reset_option("display.expand_frame_repr") + + def test_wide_repr_unicode(self): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = 20 + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + set_option("display.expand_frame_repr", False) + rep_str = repr(df) + set_option("display.expand_frame_repr", True) + wide_repr = repr(df) + assert rep_str != wide_repr + + with option_context("display.width", 150): + wider_repr = repr(df) + assert len(wider_repr) < len(wide_repr) + + reset_option("display.expand_frame_repr") + + def test_wide_repr_wide_long_columns(self): + with option_context("mode.sim_interactive", True): + df = DataFrame({"a": ["a" * 30, "b" * 30], "b": ["c" * 70, "d" * 80]}) + + result = repr(df) + assert "ccccc" in result + assert "ddddd" in result + + def test_long_series(self): + n = 1000 + s = Series( + np.random.randint(-50, 50, n), + index=[f"s{x:04d}" for x 
in range(n)], + dtype="int64", + ) + + import re + + str_rep = str(s) + nmatches = len(re.findall("dtype", str_rep)) + assert nmatches == 1 + + def test_index_with_nan(self): + # GH 2850 + df = DataFrame( + { + "id1": {0: "1a3", 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: "78d", 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + # multi-index + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # index + y = df.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nd67 9h4 79d 64" + ) + assert result == expected + + # with append (this failed in 0.12) + y = df.set_index(["id1", "id2"]).set_index("id3", append=True) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # all-nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nNaN 9h4 79d 64" + ) + assert result == expected + + # partial nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index(["id2", "id3"]) + result = y.to_string() + expected = ( + " id1 value\nid2 id3 \n" + "NaN 78d 1a3 123\n 79d 9h4 64" + ) + assert result == expected + + df = DataFrame( + { + "id1": {0: np.nan, 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: np.nan, 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "NaN NaN NaN 123\n9h4 d67 79d 64" + ) + assert result == expected + + def test_to_string(self): + + # big mixed + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) + + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan + s = biggie.to_string() + + buf = StringIO() + retval = biggie.to_string(buf=buf) + assert retval is None + assert buf.getvalue() == s + + assert isinstance(s, str) + + # print in right order + result = biggie.to_string( + columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ + ) + lines = result.split("\n") + header = lines[0].strip().split() + joined = "\n".join(re.sub(r"\s+", " ", x).strip() for x in lines[1:]) + recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") + tm.assert_series_equal(recons["B"], biggie["B"]) + assert recons["A"].count() == biggie["A"].count() + assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() + + # expected = ['B', 'A'] + # assert header == expected + + result = biggie.to_string(columns=["A"], col_space=17) + header = result.split("\n")[0].strip().split() + expected = ["A"] + assert header == expected + + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) + + biggie.to_string(columns=["B", "A"], float_format=str) + biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_string() + + def test_to_string_no_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(header=False) + expected = "0 1 4\n1 2 5\n2 3 6" + + assert df_s == expected + + def test_to_string_specified_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(header=["X", "Y"]) + expected = " X Y\n0 1 4\n1 2 5\n2 3 6" + + assert df_s == expected + + with 
pytest.raises(ValueError): + df.to_string(header=["X"]) + + def test_to_string_no_index(self): + # GH 16839, GH 13032 + df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) + + df_s = df.to_string(index=False) + # Leading space is expected for positive numbers. + expected = " x y z\n 11 33 AAA\n 22 -44 " + assert df_s == expected + + df_s = df[["y", "x", "z"]].to_string(index=False) + expected = " y x z\n 33 11 AAA\n-44 22 " + assert df_s == expected + + def test_to_string_line_width_no_index(self): + # GH 13998, GH 22505 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " + + assert df_s == expected + + def test_to_string_float_formatting(self): + tm.reset_display_options() + fmt.set_option( + "display.precision", + 5, + "display.column_space", + 12, + "display.notebook_repr_html", + False, + ) + + df = DataFrame( + {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} + ) + + df_s = df.to_string() + + if _three_digit_exp(): + expected = ( + " x\n0 0.00000e+000\n1 2.50000e-001\n" + "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" + "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" + "8 -1.00000e+006" + ) + else: + expected = ( + " x\n0 0.00000e+00\n1 2.50000e-01\n" + "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" + "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" + "8 -1.00000e+06" + ) + assert df_s == expected + + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string() + + expected = " x\n0 3234.000\n1 0.253" + assert df_s == expected + + tm.reset_display_options() + assert get_option("display.precision") == 6 + + df = DataFrame({"x": [1e9, 0.2512]}) + df_s = df.to_string() + + if _three_digit_exp(): + expected = " x\n0 1.000000e+009\n1 2.512000e-001" + else: + expected = " x\n0 1.000000e+09\n1 2.512000e-01" + assert df_s == expected + + def test_to_string_float_format_no_fixed_width(self): + + # GH 21625 + df = DataFrame({"x": [0.19999]}) + expected = " x\n0 0.200" + assert df.to_string(float_format="%.3f") == expected + + # GH 22270 + df = DataFrame({"x": [100.0]}) + expected = " x\n0 100" + assert df.to_string(float_format="%.0f") == expected + + def test_to_string_small_float_values(self): + df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) + + result = df.to_string() + # sadness per above + if _three_digit_exp(): + expected = ( + " a\n" + "0 1.500000e+000\n" + "1 1.000000e-017\n" + "2 -5.500000e-007" + ) + else: + expected = ( + " a\n" + "0 1.500000e+00\n" + "1 1.000000e-17\n" + "2 -5.500000e-07" + ) + assert result == expected + + # but not all exactly zero + df = df * 0 + result = df.to_string() + expected = " 0\n0 0\n1 0\n2 -0" + + def test_to_string_float_index(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.arange(5), index=index) + + result = df.to_string() + expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" + assert result == expected + + def test_to_string_complex_float_formatting(self): + # GH #25514, 25745 + with pd.option_context("display.precision", 5): + df = DataFrame( + { + "x": [ + (0.4467846931321966 + 0.0715185102060818j), 
+ (0.2739442392974528 + 0.23515228785438969j), + (0.26974928742135185 + 0.3250604054898979j), + (-1j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 0.44678+0.07152j\n" + "1 0.27394+0.23515j\n" + "2 0.26975+0.32506j\n" + "3 -0.00000-1.00000j" + ) + assert result == expected + + def test_to_string_ascii_error(self): + data = [ + ( + "0 ", + " .gitignore ", + " 5 ", + " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2\xe2\x80\xa2", + ) + ] + df = DataFrame(data) + + # it works! + repr(df) + + def test_to_string_int_formatting(self): + df = DataFrame({"x": [-15, 20, 25, -35]}) + assert issubclass(df["x"].dtype.type, np.integer) + + output = df.to_string() + expected = " x\n0 -15\n1 20\n2 25\n3 -35" + assert output == expected + + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + + xp = """\ + 0 1 2 3 4 +a 0 1 2 3 4 +b 5 6 7 8 9 +c 10 11 12 13 14\ +""" + + assert rs == xp + + def test_to_string_left_justify_cols(self): + tm.reset_display_options() + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string(justify="left") + expected = " x \n0 3234.000\n1 0.253" + assert df_s == expected + + def test_to_string_format_na(self): + tm.reset_display_options() + df = DataFrame( + { + "A": [np.nan, -1, -2.1234, 3, 4], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0000 foo\n" + "2 -2.1234 foooo\n" + "3 3.0000 fooooo\n" + "4 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [np.nan, -1.0, -2.0, 3.0, 4.0], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0 foo\n" + "2 -2.0 foooo\n" + "3 3.0 fooooo\n" + "4 4.0 bar" + ) + assert result == expected + + def test_to_string_format_inf(self): + # Issue #24861 + tm.reset_display_options() + df = DataFrame( + { + "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0000 foo\n" + "3 -2.1234 foooo\n" + "4 3.0000 fooooo\n" + "5 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0 foo\n" + "3 -2.0 foooo\n" + "4 3.0 fooooo\n" + "5 4.0 bar" + ) + assert result == expected + + def test_to_string_decimal(self): + # Issue #23614 + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + expected = " A\n0 6,0\n1 3,1\n2 2,2" + assert df.to_string(decimal=",") == expected + + def test_to_string_line_width(self): + df = DataFrame(123, index=range(10, 15), columns=range(30)) + s = df.to_string(line_width=80) + assert max(len(l) for l in s.split("\n")) == 80 + + def test_show_dimensions(self): + df = DataFrame(123, index=range(10, 15), columns=range(30)) + + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + True, + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + 
False, + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + with option_context( + "display.max_rows", + 2, + "display.max_columns", + 2, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + + def test_repr_html(self, float_frame): + df = float_frame + df._repr_html_() + + fmt.set_option("display.max_rows", 1, "display.max_columns", 1) + df._repr_html_() + + fmt.set_option("display.notebook_repr_html", False) + df._repr_html_() + + tm.reset_display_options() + + df = DataFrame([[1, 2], [3, 4]]) + fmt.set_option("display.show_dimensions", True) + assert "2 rows" in df._repr_html_() + fmt.set_option("display.show_dimensions", False) + assert "2 rows" not in df._repr_html_() + + tm.reset_display_options() + + def test_repr_html_mathjax(self): + df = DataFrame([[1, 2], [3, 4]]) + assert "tex2jax_ignore" not in df._repr_html_() + + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in df._repr_html_() + + def test_repr_html_wide(self): + max_cols = 20 + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() + + wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1))) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in wide_df._repr_html_() + + def test_repr_html_wide_multiindex_cols(self): + max_cols = 20 + + mcols = MultiIndex.from_product( + [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + mcols = MultiIndex.from_product( + (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() + + def test_repr_html_long(self): + with option_context("display.max_rows", 60): + max_rows = get_option("display.max_rows") + h = max_rows - 1 + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) + reg_repr = df._repr_html_() + assert ".." not in reg_repr + assert str(41 + max_rows // 2) in reg_repr + + h = max_rows + 1 + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) + long_repr = df._repr_html_() + assert ".." in long_repr + assert str(41 + max_rows // 2) not in long_repr + assert f"{h} rows " in long_repr + assert "2 columns" in long_repr + + def test_repr_html_float(self): + with option_context("display.max_rows", 60): + + max_rows = get_option("display.max_rows") + h = max_rows - 1 + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") + reg_repr = df._repr_html_() + assert ".." 
not in reg_repr + assert f"" in reg_repr + + h = max_rows + 1 + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") + long_repr = df._repr_html_() + assert ".." in long_repr + assert "" not in long_repr + assert f"{h} rows " in long_repr + assert "2 columns" in long_repr + + def test_repr_html_long_multiindex(self): + max_rows = 60 + max_L1 = max_rows // 2 + + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx, columns=["A", "B"]) + with option_context("display.max_rows", 60, "display.max_columns", 20): + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.randn((max_L1 + 1) * 2, 2), index=idx, columns=["A", "B"] + ) + long_repr = df._repr_html_() + assert "..." in long_repr + + def test_repr_html_long_and_wide(self): + max_cols = 20 + max_rows = 60 + + h, w = max_rows - 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() + + h, w = max_rows + 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() + + def test_info_repr(self): + # GH#21746 For tests inside a terminal (i.e. not CI) we need to detect + # the terminal size to ensure that we try to print something "too big" + term_width, term_height = get_terminal_size() + + max_rows = 60 + max_cols = 20 + (max(term_width, 80) - 80) // 4 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert has_vertically_truncated_repr(df) + with option_context("display.large_repr", "info"): + assert has_info_repr(df) + + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert has_horizontally_truncated_repr(df) + with option_context( + "display.large_repr", "info", "display.max_columns", max_cols + ): + assert has_info_repr(df) + + def test_info_repr_max_cols(self): + # GH #6939 + df = DataFrame(np.random.randn(10, 5)) + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 4, + ): + assert has_non_verbose_info_repr(df) + + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 5, + ): + assert not has_non_verbose_info_repr(df) + + # test verbose overrides + # fmt.set_option('display.max_info_columns', 4) # exceeded + + def test_info_repr_html(self): + max_rows = 60 + max_cols = 20 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() + + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert " never truncate + assert ".." 
not in repr(s) + + def test_to_string_name(self): + s = Series(range(100), dtype="int64") + s.name = "myser" + res = s.to_string(max_rows=2, name=True) + exp = "0 0\n ..\n99 99\nName: myser" + assert res == exp + res = s.to_string(max_rows=2, name=False) + exp = "0 0\n ..\n99 99" + assert res == exp + + def test_to_string_dtype(self): + s = Series(range(100), dtype="int64") + res = s.to_string(max_rows=2, dtype=True) + exp = "0 0\n ..\n99 99\ndtype: int64" + assert res == exp + res = s.to_string(max_rows=2, dtype=False) + exp = "0 0\n ..\n99 99" + assert res == exp + + def test_to_string_length(self): + s = Series(range(100), dtype="int64") + res = s.to_string(max_rows=2, length=True) + exp = "0 0\n ..\n99 99\nLength: 100" + assert res == exp + + def test_to_string_na_rep(self): + s = pd.Series(index=range(100), dtype=np.float64) + res = s.to_string(na_rep="foo", max_rows=2) + exp = "0 foo\n ..\n99 foo" + assert res == exp + + def test_to_string_float_format(self): + s = pd.Series(range(10), dtype="float64") + res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) + exp = "0 0.0\n ..\n9 9.0" + assert res == exp + + def test_to_string_header(self): + s = pd.Series(range(10), dtype="int64") + s.index.name = "foo" + res = s.to_string(header=True, max_rows=2) + exp = "foo\n0 0\n ..\n9 9" + assert res == exp + res = s.to_string(header=False, max_rows=2) + exp = "0 0\n ..\n9 9" + assert res == exp + + def test_to_string_multindex_header(self): + # GH 16718 + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + res = df.to_string(header=["r1", "r2"]) + exp = " r1 r2\na b \n0 1 2 3" + assert res == exp + + +def _three_digit_exp(): + return f"{1.7e8:.4g}" == "1.7e+008" + + +class TestFloatArrayFormatter: + def test_misc(self): + obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) + result = obj.get_result() + assert len(result) == 0 + + def test_format(self): + obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64)) + result = obj.get_result() + assert result[0] == " 12.0" + assert result[1] == " 0.0" + + def test_output_significant_digits(self): + # Issue #9764 + + # In case default display precision changes: + with pd.option_context("display.precision", 6): + # DataFrame example from issue #9764 + d = pd.DataFrame( + { + "col1": [ + 9.999e-8, + 1e-7, + 1.0001e-7, + 2e-7, + 4.999e-7, + 5e-7, + 5.0001e-7, + 6e-7, + 9.999e-7, + 1e-6, + 1.0001e-6, + 2e-6, + 4.999e-6, + 5e-6, + 5.0001e-6, + 6e-6, + ] + } + ) + + expected_output = { + (0, 6): " col1\n" + "0 9.999000e-08\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 6): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 8): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07\n" + "6 5.000100e-07\n" + "7 6.000000e-07", + (8, 16): " col1\n" + "8 9.999000e-07\n" + "9 1.000000e-06\n" + "10 1.000100e-06\n" + "11 2.000000e-06\n" + "12 4.999000e-06\n" + "13 5.000000e-06\n" + "14 5.000100e-06\n" + "15 6.000000e-06", + (9, 16): " col1\n" + "9 0.000001\n" + "10 0.000001\n" + "11 0.000002\n" + "12 0.000005\n" + "13 0.000005\n" + "14 0.000005\n" + "15 0.000006", + } + + for (start, stop), v in expected_output.items(): + assert str(d[start:stop]) == v + + def test_too_long(self): + # GH 10451 + with pd.option_context("display.precision", 4): + # need both a number > 1e6 and something that normally formats 
to + # having length > display.precision + 6 + df = pd.DataFrame(dict(x=[12345.6789])) + assert str(df) == " x\n0 12345.6789" + df = pd.DataFrame(dict(x=[2e6])) + assert str(df) == " x\n0 2000000.0" + df = pd.DataFrame(dict(x=[12345.6789, 2e6])) + assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" + + +class TestRepr_timedelta64: + def test_none(self): + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base() + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "0 days" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_sub_day(self): + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="sub_day") + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "00:00:00" + assert drepr(delta_1s) == "00:00:01" + assert drepr(delta_500ms) == "00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_long(self): + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="long") + assert drepr(delta_1d) == "1 days 00:00:00" + assert drepr(-delta_1d) == "-1 days +00:00:00" + assert drepr(delta_0d) == "0 days 00:00:00" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_all(self): + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1ns = pd.to_timedelta(1, unit="ns") + + drepr = lambda x: x._repr_base(format="all") + assert drepr(delta_1d) == "1 days 00:00:00.000000000" + assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" + assert drepr(delta_0d) == "0 days 00:00:00.000000000" + assert drepr(delta_1ns) == "0 days 00:00:00.000000001" + assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001" + + +class TestTimedelta64Formatter: + def test_days(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") + result = fmt.Timedelta64Formatter(x, box=True).get_result() + assert result[0].strip() == "'0 days'" + assert result[1].strip() == "'1 days'" + + result = fmt.Timedelta64Formatter(x[1:2], box=True).get_result() + assert result[0].strip() == "'1 days'" + + result = fmt.Timedelta64Formatter(x, box=False).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" + + result = fmt.Timedelta64Formatter(x[1:2], 
box=False).get_result() + assert result[0].strip() == "1 days" + + def test_days_neg(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") + result = fmt.Timedelta64Formatter(-x, box=True).get_result() + assert result[0].strip() == "'0 days'" + assert result[1].strip() == "'-1 days'" + + def test_subdays(self): + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") + result = fmt.Timedelta64Formatter(y, box=True).get_result() + assert result[0].strip() == "'00:00:00'" + assert result[1].strip() == "'00:00:01'" + + def test_subdays_neg(self): + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") + result = fmt.Timedelta64Formatter(-y, box=True).get_result() + assert result[0].strip() == "'00:00:00'" + assert result[1].strip() == "'-1 days +23:59:59'" + + def test_zero(self): + x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit="D") + result = fmt.Timedelta64Formatter(x, box=True).get_result() + assert result[0].strip() == "'0 days'" + + x = pd.to_timedelta(list(range(1)), unit="D") + result = fmt.Timedelta64Formatter(x, box=True).get_result() + assert result[0].strip() == "'0 days'" + + +class TestDatetime64Formatter: + def test_mixed(self): + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 00:00:00" + assert result[1].strip() == "2013-01-01 12:00:00" + + def test_dates(self): + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT]) + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01" + assert result[1].strip() == "2013-01-02" + + def test_date_nanos(self): + x = Series([Timestamp(200)]) + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "1970-01-01 00:00:00.000000200" + + def test_dates_display(self): + + # 10170 + # make sure that we are consistently display date formatting + x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 09:00:00" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-05 09:00:00" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 09:00:00" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:04" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.004" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.000004" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="N")) + x.iloc[1] = np.nan + result = fmt.Datetime64Formatter(x).get_result() + assert result[0].strip() == "2013-01-01 09:00:00.000000000" + assert result[1].strip() == "NaT" + assert result[4].strip() == "2013-01-01 09:00:00.000000004" + + def test_datetime64formatter_yearmonth(self): + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + + def format_func(x): + return x.strftime("%Y-%m") + + formatter = 
fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + assert result == ["2016-01", "2016-02"] + + def test_datetime64formatter_hoursecond(self): + + x = Series( + pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") + ) + + def format_func(x): + return x.strftime("%H:%M") + + formatter = fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + assert result == ["10:10", "12:12"] + + +class TestNaTFormatting: + def test_repr(self): + assert repr(pd.NaT) == "NaT" + + def test_str(self): + assert str(pd.NaT) == "NaT" + + +class TestDatetimeIndexFormat: + def test_datetime(self): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format() + assert formatted[0] == "2003-01-01 12:00:00" + assert formatted[1] == "NaT" + + def test_date(self): + formatted = pd.to_datetime([datetime(2003, 1, 1), pd.NaT]).format() + assert formatted[0] == "2003-01-01" + assert formatted[1] == "NaT" + + def test_date_tz(self): + formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + formatted = pd.to_datetime([datetime(2013, 1, 1), pd.NaT], utc=True).format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + def test_date_explicit_date_format(self): + formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( + date_format="%m-%d-%Y", na_rep="UT" + ) + assert formatted[0] == "02-01-2003" + assert formatted[1] == "UT" + + +class TestDatetimeIndexUnicode: + def test_dates(self): + text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) + assert "['2013-01-01'," in text + assert ", '2014-01-01']" in text + + def test_mixed(self): + text = str( + pd.to_datetime( + [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] + ) + ) + assert "'2013-01-01 00:00:00'," in text + assert "'2014-01-01 00:00:00']" in text + + +class TestStringRepTimestamp: + def test_no_tz(self): + dt_date = datetime(2013, 1, 2) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + ts_nanos_only = Timestamp(200) + assert str(ts_nanos_only) == "1970-01-01 00:00:00.000000200" + + ts_nanos_micros = Timestamp(1200) + assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" + + def test_tz_pytz(self): + dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + def test_tz_dateutil(self): + utc = dateutil.tz.tzutc() + + dt_date = datetime(2013, 1, 2, tzinfo=utc) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + def test_nat_representations(self): + for f in (str, repr, methodcaller("isoformat")): + assert f(pd.NaT) == "NaT" + + +def test_format_percentiles(): + result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ["1.999%", "2.001%", "50%", 
"66.667%", "99.99%"] + assert result == expected + + result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"] + assert result == expected + + msg = r"percentiles should all be in the interval \[0,1\]" + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles([0.1, np.nan, 0.5]) + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles([-0.001, 0.1, 0.5]) + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles([2, 0.1, 0.5]) + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles([0.1, 0.5, "a"]) + + +def test_format_percentiles_integer_idx(): + # Issue #26660 + result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] + assert result == expected + + +def test_repr_html_ipython_config(ip): + code = textwrap.dedent( + """\ + import pandas as pd + df = pd.DataFrame({"A": [1, 2]}) + df._repr_html_() + + cfg = get_ipython().config + cfg['IPKernelApp']['parent_appname'] + df._repr_html_() + """ + ) + result = ip.run_cell(code) + assert not result.error_in_exec + + +@pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) +@pytest.mark.parametrize( + "encoding, data", + [(None, "abc"), ("utf-8", "abc"), ("gbk", "造成输出中文显示乱码"), ("foo", "abc")], +) +def test_filepath_or_buffer_arg( + method, + filepath_or_buffer, + assert_filepath_or_buffer_equals, + encoding, + data, + filepath_or_buffer_id, +): + df = DataFrame([data]) + + if filepath_or_buffer_id not in ["string", "pathlike"] and encoding is not None: + with pytest.raises( + ValueError, match="buf is not a file name and encoding is specified." + ): + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + elif encoding == "foo": + with tm.assert_produces_warning(None): + with pytest.raises(LookupError, match="unknown encoding"): + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + else: + expected = getattr(df, method)() + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + assert_filepath_or_buffer_equals(expected) + + +@pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) +def test_filepath_or_buffer_bad_arg_raises(float_frame, method): + msg = "buf is not a file name and it has no write method" + with pytest.raises(TypeError, match=msg): + getattr(float_frame, method)(buf=object()) diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_printing.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_printing.py new file mode 100644 index 0000000..f0d5ef1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_printing.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest + +import pandas._config.config as cf + +import pandas as pd + +import pandas.io.formats.format as fmt +import pandas.io.formats.printing as printing + + +def test_adjoin(): + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" + + adjoined = printing.adjoin(2, *data) + + assert adjoined == expected + + +def test_repr_binary_type(): + import string + + letters = string.ascii_letters + try: + raw = bytes(letters, encoding=cf.get_option("display.encoding")) + except TypeError: + raw = bytes(letters) + b = str(raw.decode("utf-8")) + res = printing.pprint_thing(b, quote_strings=True) + assert res == repr(b) + res = printing.pprint_thing(b, quote_strings=False) + assert res == b 
+ + +class TestFormattBase: + def test_adjoin(self): + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" + + adjoined = printing.adjoin(2, *data) + + assert adjoined == expected + + def test_adjoin_unicode(self): + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]] + expected = "あ dd ggg\nb ええ hhh\nc ff いいい" + adjoined = printing.adjoin(2, *data) + assert adjoined == expected + + adj = fmt.EastAsianTextAdjustment() + + expected = """あ dd ggg +b ええ hhh +c ff いいい""" + + adjoined = adj.adjoin(2, *data) + assert adjoined == expected + cols = adjoined.split("\n") + assert adj.len(cols[0]) == 13 + assert adj.len(cols[1]) == 13 + assert adj.len(cols[2]) == 16 + + expected = """あ dd ggg +b ええ hhh +c ff いいい""" + + adjoined = adj.adjoin(7, *data) + assert adjoined == expected + cols = adjoined.split("\n") + assert adj.len(cols[0]) == 23 + assert adj.len(cols[1]) == 23 + assert adj.len(cols[2]) == 26 + + def test_justify(self): + adj = fmt.EastAsianTextAdjustment() + + def just(x, *args, **kwargs): + # wrapper to test single str + return adj.justify([x], *args, **kwargs)[0] + + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + + assert just("パンダ", 5, mode="left") == "パンダ" + assert just("パンダ", 5, mode="center") == "パンダ" + assert just("パンダ", 5, mode="right") == "パンダ" + + assert just("パンダ", 10, mode="left") == "パンダ " + assert just("パンダ", 10, mode="center") == " パンダ " + assert just("パンダ", 10, mode="right") == " パンダ" + + def test_east_asian_len(self): + adj = fmt.EastAsianTextAdjustment() + + assert adj.len("abc") == 3 + assert adj.len("abc") == 3 + + assert adj.len("パンダ") == 6 + assert adj.len("パンダ") == 5 + assert adj.len("パンダpanda") == 11 + assert adj.len("パンダpanda") == 10 + + def test_ambiguous_width(self): + adj = fmt.EastAsianTextAdjustment() + assert adj.len("¡¡ab") == 4 + + with cf.option_context("display.unicode.ambiguous_as_wide", True): + adj = fmt.EastAsianTextAdjustment() + assert adj.len("¡¡ab") == 6 + + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] + expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい" + adjoined = adj.adjoin(2, *data) + assert adjoined == expected + + +class TestTableSchemaRepr: + @classmethod + def setup_class(cls): + pytest.importorskip("IPython") + + from IPython.core.interactiveshell import InteractiveShell + + cls.display_formatter = InteractiveShell.instance().display_formatter + + def test_publishes(self): + + df = pd.DataFrame({"A": [1, 2]}) + objects = [df["A"], df, df] # dataframe / series + expected_keys = [ + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, + ] + + opt = pd.option_context("display.html.table_schema", True) + for obj, expected in zip(objects, expected_keys): + with opt: + formatted = self.display_formatter.format(obj) + assert set(formatted[0].keys()) == expected + + with_latex = pd.option_context("display.latex.repr", True) + + with opt, with_latex: + formatted = self.display_formatter.format(obj) + + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } + assert set(formatted[0].keys()) == expected + + def test_publishes_not_implemented(self): + # column MultiIndex + # GH 15996 + midx = 
pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx) + + opt = pd.option_context("display.html.table_schema", True) + + with opt: + formatted = self.display_formatter.format(df) + + expected = {"text/plain", "text/html"} + assert set(formatted[0].keys()) == expected + + def test_config_on(self): + df = pd.DataFrame({"A": [1, 2]}) + with pd.option_context("display.html.table_schema", True): + result = df._repr_data_resource_() + + assert result is not None + + def test_config_default_off(self): + df = pd.DataFrame({"A": [1, 2]}) + with pd.option_context("display.html.table_schema", False): + result = df._repr_data_resource_() + + assert result is None + + def test_enable_data_resource_formatter(self): + # GH 10491 + formatters = self.display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + + # still there, just disabled + assert "application/vnd.dataresource+json" in formatters + assert not formatters[mimetype].enabled + + # able to re-set + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + # smoke test that it works + self.display_formatter.format(cf) diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_style.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_style.py new file mode 100644 index 0000000..e5dac18 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_style.py @@ -0,0 +1,1789 @@ +import copy +import re +import textwrap + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +jinja2 = pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip + + +class TestStyler: + def setup_method(self, method): + np.random.seed(24) + self.s = DataFrame({"A": np.random.permutation(range(6))}) + self.df = DataFrame({"A": [0, 1], "B": np.random.randn(2)}) + self.f = lambda x: x + self.g = lambda x: x + + def h(x, foo="bar"): + return pd.Series(f"color: {foo}", index=x.index, name=x.name) + + self.h = h + self.styler = Styler(self.df) + self.attrs = pd.DataFrame({"A": ["color: red", "color: blue"]}) + self.dataframes = [ + self.df, + pd.DataFrame( + {"f": [1.0, 2.0], "o": ["a", "b"], "c": pd.Categorical(["a", "b"])} + ), + ] + + def test_init_non_pandas(self): + with pytest.raises(TypeError): + Styler([1, 2, 3]) + + def test_init_series(self): + result = Styler(pd.Series([1, 2])) + assert result.data.ndim == 2 + + def test_repr_html_ok(self): + self.styler._repr_html_() + + def test_repr_html_mathjax(self): + # gh-19824 + assert "tex2jax_ignore" not in self.styler._repr_html_() + + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in self.styler._repr_html_() + + def test_update_ctx(self): + self.styler._update_ctx(self.attrs) + expected = {(0, 0): ["color: red"], (1, 0): ["color: blue"]} + assert self.styler.ctx == expected + + def test_update_ctx_flatten_multi(self): + attrs = DataFrame({"A": ["color: red; foo: bar", "color: blue; foo: baz"]}) + self.styler._update_ctx(attrs) + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } + assert self.styler.ctx == 
expected + + def test_update_ctx_flatten_multi_traliing_semi(self): + attrs = DataFrame({"A": ["color: red; foo: bar;", "color: blue; foo: baz;"]}) + self.styler._update_ctx(attrs) + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } + assert self.styler.ctx == expected + + def test_copy(self): + s2 = copy.copy(self.styler) + assert self.styler is not s2 + assert self.styler.ctx is s2.ctx # shallow + assert self.styler._todo is s2._todo + + self.styler._update_ctx(self.attrs) + self.styler.highlight_max() + assert self.styler.ctx == s2.ctx + assert self.styler._todo == s2._todo + + def test_deepcopy(self): + s2 = copy.deepcopy(self.styler) + assert self.styler is not s2 + assert self.styler.ctx is not s2.ctx + assert self.styler._todo is not s2._todo + + self.styler._update_ctx(self.attrs) + self.styler.highlight_max() + assert self.styler.ctx != s2.ctx + assert s2._todo == [] + assert self.styler._todo != s2._todo + + def test_clear(self): + s = self.df.style.highlight_max()._compute() + assert len(s.ctx) > 0 + assert len(s._todo) > 0 + s.clear() + assert len(s.ctx) == 0 + assert len(s._todo) == 0 + + def test_render(self): + df = pd.DataFrame({"A": [0, 1]}) + style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) + s = Styler(df, uuid="AB").apply(style) + s.render() + # it worked? + + def test_render_empty_dfs(self): + empty_df = DataFrame() + es = Styler(empty_df) + es.render() + # An index but no columns + DataFrame(columns=["a"]).style.render() + # A column but no index + DataFrame(index=["a"]).style.render() + # No IndexError raised? + + def test_render_double(self): + df = pd.DataFrame({"A": [0, 1]}) + style = lambda x: pd.Series( + ["color: red; border: 1px", "color: blue; border: 2px"], name=x.name + ) + s = Styler(df, uuid="AB").apply(style) + s.render() + # it worked? 
+ + def test_set_properties(self): + df = pd.DataFrame({"A": [0, 1]}) + result = df.style.set_properties(color="white", size="10px")._compute().ctx + # order is deterministic + v = ["color: white", "size: 10px"] + expected = {(0, 0): v, (1, 0): v} + assert result.keys() == expected.keys() + for v1, v2 in zip(result.values(), expected.values()): + assert sorted(v1) == sorted(v2) + + def test_set_properties_subset(self): + df = pd.DataFrame({"A": [0, 1]}) + result = ( + df.style.set_properties(subset=pd.IndexSlice[0, "A"], color="white") + ._compute() + .ctx + ) + expected = {(0, 0): ["color: white"]} + assert result == expected + + def test_empty_index_name_doesnt_display(self): + # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.style._translate() + + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "class": "col_heading level0 col0", + "display_value": "A", + "type": "th", + "value": "A", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "display_value": "B", + "type": "th", + "value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col2", + "display_value": "C", + "type": "th", + "value": "C", + "is_visible": True, + }, + ] + ] + + assert result["head"] == expected + + def test_index_name(self): + # https://github.com/pandas-dev/pandas/issues/11655 + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index("A").style._translate() + + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "B", + "display_value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "blank", "type": "th", "value": ""}, + {"class": "blank", "type": "th", "value": ""}, + ], + ] + + assert result["head"] == expected + + def test_multiindex_name(self): + # https://github.com/pandas-dev/pandas/issues/11655 + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index(["A", "B"]).style._translate() + + expected = [ + [ + { + "class": "blank", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "index_name level1", "type": "th", "value": "B"}, + {"class": "blank", "type": "th", "value": ""}, + ], + ] + + assert result["head"] == expected + + def test_numeric_columns(self): + # https://github.com/pandas-dev/pandas/issues/12125 + # smoke test for _translate + df = pd.DataFrame({0: [1, 2, 3]}) + df.style._translate() + + def test_apply_axis(self): + df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) + f = lambda x: [f"val: {x.max()}" for v in x] + result = df.style.apply(f, axis=1) + assert len(result._todo) == 1 + assert len(result.ctx) == 0 + result._compute() + expected = { + (0, 0): ["val: 1"], + (0, 1): ["val: 1"], + (1, 0): ["val: 1"], + (1, 1): ["val: 1"], + } + assert result.ctx 
== expected + + result = df.style.apply(f, axis=0) + expected = { + (0, 0): ["val: 0"], + (0, 1): ["val: 1"], + (1, 0): ["val: 0"], + (1, 1): ["val: 1"], + } + result._compute() + assert result.ctx == expected + result = df.style.apply(f) # default + result._compute() + assert result.ctx == expected + + def test_apply_subset(self): + axes = [0, 1] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] + for ax in axes: + for slice_ in slices: + result = ( + self.df.style.apply(self.h, axis=ax, subset=slice_, foo="baz") + ._compute() + .ctx + ) + expected = { + (r, c): ["color: baz"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } + assert result == expected + + def test_applymap_subset(self): + def f(x): + return "foo: bar" + + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] + + for slice_ in slices: + result = self.df.style.applymap(f, subset=slice_)._compute().ctx + expected = { + (r, c): ["foo: bar"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } + assert result == expected + + def test_applymap_subset_multiindex(self): + # GH 19861 + # Smoke test for applymap + def color_negative_red(val): + """ + Takes a scalar and returns a string with + the css property `'color: red'` for negative + strings, black otherwise. + """ + color = "red" if val < 0 else "black" + return f"color: {color}" + + dic = { + ("a", "d"): [-1.12, 2.11], + ("a", "c"): [2.78, -2.88], + ("b", "c"): [-3.99, 3.77], + ("b", "d"): [4.21, -1.22], + } + + idx = pd.IndexSlice + df = pd.DataFrame(dic, index=[0, 1]) + + (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) + + def test_applymap_subset_multiindex_code(self): + # https://github.com/pandas-dev/pandas/issues/25858 + # Checks styler.applymap works with multindex when codes are provided + codes = np.array([[0, 0, 1, 1], [0, 1, 0, 1]]) + columns = pd.MultiIndex( + levels=[["a", "b"], ["%", "#"]], codes=codes, names=["", ""] + ) + df = DataFrame( + [[1, -1, 1, 1], [-1, 1, 1, 1]], index=["hello", "world"], columns=columns + ) + pct_subset = pd.IndexSlice[:, pd.IndexSlice[:, "%":"%"]] + + def color_negative_red(val): + color = "red" if val < 0 else "black" + return f"color: {color}" + + df.loc[pct_subset] + df.style.applymap(color_negative_red, subset=pct_subset) + + def test_where_with_one_style(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = "foo: bar" + + result = self.df.style.where(f, style1)._compute().ctx + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else ""] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + } + assert result == expected + + def test_where_subset(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = "foo: bar" + style2 = "baz: foo" + + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] + + for slice_ in slices: + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else style2] + for r, row in enumerate(self.df.index) + for c, col in 
enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } + assert result == expected + + def test_where_subset_compare_with_applymap(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = "foo: bar" + style2 = "baz: foo" + + def g(x): + return style1 if f(x) else style2 + + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] + + for slice_ in slices: + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) + expected = self.df.style.applymap(g, subset=slice_)._compute().ctx + assert result == expected + + def test_empty(self): + df = pd.DataFrame({"A": [1, 0]}) + s = df.style + s.ctx = {(0, 0): ["color: red"], (1, 0): [""]} + + result = s._translate()["cellstyle"] + expected = [ + {"props": [["color", " red"]], "selector": "row0_col0"}, + {"props": [["", ""]], "selector": "row1_col0"}, + ] + assert result == expected + + def test_bar_align_left(self): + df = pd.DataFrame({"A": [0, 1, 2]}) + result = df.style.bar()._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + result = df.style.bar(color="red", width=50)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,red 25.0%, transparent 25.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,red 50.0%, transparent 50.0%)", + ], + } + assert result == expected + + df["C"] = ["a"] * len(df) + result = df.style.bar(color="red", width=50)._compute().ctx + assert result == expected + df["C"] = df["C"].astype("category") + result = df.style.bar(color="red", width=50)._compute().ctx + assert result == expected + + def test_bar_align_left_0points(self): + df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.style.bar()._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): ["width: 10em", " height: 80%"], + (0, 2): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } + assert result == expected + + result = df.style.bar(axis=1)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", 
transparent 100.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 0): ["width: 10em", " height: 80%"], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_pos_and_neg(self): + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 10.0%, transparent 10.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #5fba7d 10.0%" + ", #5fba7d 30.0%, transparent 30.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, " + "#5fba7d 10.0%, #5fba7d 100.0%, " + "transparent 100.0%)", + ], + } + + assert result == expected + + def test_bar_align_mid_all_pos(self): + df = pd.DataFrame({"A": [10, 20, 50, 100]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 10.0%, transparent 10.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 20.0%, transparent 20.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 50.0%, transparent 50.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 100.0%, transparent 100.0%)", + ], + } + + assert result == expected + + def test_bar_align_mid_all_neg(self): + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, " + "#d65f5f 40.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 70.0%, " + "#d65f5f 70.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 80.0%, " + "#d65f5f 80.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_zero_pos_and_neg(self): + # See https://github.com/pandas-dev/pandas/pull/14757 + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = ( + df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90) + ._compute() + .ctx + ) + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 45.0%, transparent 45.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 
10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 55.0%, transparent 55.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 90.0%, transparent 90.0%)", + ], + } + assert result == expected + + def test_bar_align_left_axis_none(self): + df = pd.DataFrame({"A": [0, 1], "B": [2, 4]}) + result = df.style.bar(axis=None)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_zero_axis_none(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 62.5%, transparent 62.5%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_axis_none(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 33.3%, transparent 33.3%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_vmin(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_vmax(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 
20.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 20.0%, transparent 20.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 20.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_vmin_vmax_wide(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 40.0%, transparent 40.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #d65f5f 10.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_vmin_vmax_clipping(self): + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_mid_nans(self): + df = pd.DataFrame({"A": [1, None], "B": [-1, 3]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_align_zero_nans(self): + df = pd.DataFrame({"A": [1, None], "B": [-1, 2]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 75.0%, transparent 75.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], + } + assert result == expected + + def test_bar_bad_align_raises(self): + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + with pytest.raises(ValueError): + df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) + + def test_format_with_na_rep(self): + 
# GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + + ctx = df.style.format("{:.2%}", na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "110.00%" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate() + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_set_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + ctx = ( + df.style.set_na_rep("NA") + .format(None, na_rep="-", subset=["B"]) + ._translate() + ) + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "-" + + def test_format_non_numeric_na(self): + # GH 21527 28358 + df = pd.DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, pd.NaT, pd.Timestamp("20120101")], + } + ) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + assert ctx["body"][1][1]["display_value"] == "NA" + assert ctx["body"][1][2]["display_value"] == "NA" + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + def test_format_with_bad_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + with pytest.raises(TypeError): + df.style.format(None, na_rep=-1) + + def test_highlight_null(self, null_color="red"): + df = pd.DataFrame({"A": [0, np.nan]}) + result = df.style.highlight_null()._compute().ctx + expected = {(0, 0): [""], (1, 0): ["background-color: red"]} + assert result == expected + + def test_nonunique_raises(self): + df = pd.DataFrame([[1, 2]], columns=["A", "A"]) + with pytest.raises(ValueError): + df.style + + with pytest.raises(ValueError): + Styler(df) + + def test_caption(self): + styler = Styler(self.df, caption="foo") + result = styler.render() + assert all(["caption" in result, "foo" in result]) + + styler = self.df.style + result = styler.set_caption("baz") + assert styler is result + assert styler.caption == "baz" + + def test_uuid(self): + styler = Styler(self.df, uuid="abc123") + result = styler.render() + assert "abc123" in result + + styler = self.df.style + result = styler.set_uuid("aaa") + assert result is styler + assert result.uuid == "aaa" + + def test_unique_id(self): + # See https://github.com/pandas-dev/pandas/issues/16780 + df = pd.DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) + result = df.style.render(uuid="test") + assert "test" in result + ids = re.findall('id="(.*?)"', result) + assert 
np.unique(ids).size == len(ids) + + def test_table_styles(self): + style = [{"selector": "th", "props": [("foo", "bar")]}] + styler = Styler(self.df, table_styles=style) + result = " ".join(styler.render().split()) + assert "th { foo: bar; }" in result + + styler = self.df.style + result = styler.set_table_styles(style) + assert styler is result + assert styler.table_styles == style + + def test_table_attributes(self): + attributes = 'class="foo" data-bar' + styler = Styler(self.df, table_attributes=attributes) + result = styler.render() + assert 'class="foo" data-bar' in result + + result = self.df.style.set_table_attributes(attributes).render() + assert 'class="foo" data-bar' in result + + def test_precision(self): + with pd.option_context("display.precision", 10): + s = Styler(self.df) + assert s.precision == 10 + s = Styler(self.df, precision=2) + assert s.precision == 2 + + s2 = s.set_precision(4) + assert s is s2 + assert s.precision == 4 + + def test_apply_none(self): + def f(x): + return pd.DataFrame( + np.where(x == x.max(), "color: red", ""), + index=x.index, + columns=x.columns, + ) + + result = pd.DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx + assert result[(1, 1)] == ["color: red"] + + def test_trim(self): + result = self.df.style.render() # trim=True + assert result.count("#") == 0 + + result = self.df.style.highlight_max().render() + assert result.count("#") == len(self.df.columns) + + def test_highlight_max(self): + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + # max(df) = min(-df) + for max_ in [True, False]: + if max_: + attr = "highlight_max" + else: + df = -df + attr = "highlight_min" + result = getattr(df.style, attr)()._compute().ctx + assert result[(1, 1)] == ["background-color: yellow"] + + result = getattr(df.style, attr)(color="green")._compute().ctx + assert result[(1, 1)] == ["background-color: green"] + + result = getattr(df.style, attr)(subset="A")._compute().ctx + assert result[(1, 0)] == ["background-color: yellow"] + + result = getattr(df.style, attr)(axis=0)._compute().ctx + expected = { + (1, 0): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 1): [""], + (0, 0): [""], + } + assert result == expected + + result = getattr(df.style, attr)(axis=1)._compute().ctx + expected = { + (0, 1): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 0): [""], + (1, 0): [""], + } + assert result == expected + + # separate since we can't negate the strs + df["C"] = ["a", "b"] + result = df.style.highlight_max()._compute().ctx + expected = {(1, 1): ["background-color: yellow"]} + + result = df.style.highlight_min()._compute().ctx + expected = {(0, 0): ["background-color: yellow"]} + + def test_export(self): + f = lambda x: "color: red" if x > 0 else "color: blue" + g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" + style1 = self.styler + style1.applymap(f).applymap(g, z="b").highlight_max() + result = style1.export() + style2 = self.df.style + style2.use(result) + assert style1._todo == style2._todo + style2.render() + + def test_display_format(self): + df = pd.DataFrame(np.random.random(size=(2, 2))) + ctx = df.style.format("{:0.1f}")._translate() + + assert all(["display_value" in c for c in row] for row in ctx["body"]) + assert all( + [len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"] + ) + assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3 + + def test_display_format_raises(self): + df = pd.DataFrame(np.random.randn(2, 2)) + with 
pytest.raises(TypeError): + df.style.format(5) + with pytest.raises(TypeError): + df.style.format(True) + + def test_display_set_precision(self): + # Issue #13257 + df = pd.DataFrame(data=[[1.0, 2.0090], [3.2121, 4.566]], columns=["a", "b"]) + s = Styler(df) + + ctx = s.set_precision(1)._translate() + + assert s.precision == 1 + assert ctx["body"][0][1]["display_value"] == "1.0" + assert ctx["body"][0][2]["display_value"] == "2.0" + assert ctx["body"][1][1]["display_value"] == "3.2" + assert ctx["body"][1][2]["display_value"] == "4.6" + + ctx = s.set_precision(2)._translate() + assert s.precision == 2 + assert ctx["body"][0][1]["display_value"] == "1.00" + assert ctx["body"][0][2]["display_value"] == "2.01" + assert ctx["body"][1][1]["display_value"] == "3.21" + assert ctx["body"][1][2]["display_value"] == "4.57" + + ctx = s.set_precision(3)._translate() + assert s.precision == 3 + assert ctx["body"][0][1]["display_value"] == "1.000" + assert ctx["body"][0][2]["display_value"] == "2.009" + assert ctx["body"][1][1]["display_value"] == "3.212" + assert ctx["body"][1][2]["display_value"] == "4.566" + + def test_display_subset(self): + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + ctx = df.style.format( + {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=pd.IndexSlice[0, :] + )._translate() + expected = "0.1" + raw_11 = "1.123400" + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + assert ctx["body"][0][2]["display_value"] == "12.34%" + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice["a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][0][2]["display_value"] == "0.123400" + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, "a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format( + "{:0.1f}", subset=pd.IndexSlice[[0, 1], ["a"]] + )._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1" + assert ctx["body"][0][2]["display_value"] == "0.123400" + assert ctx["body"][1][2]["display_value"] == raw_11 + + def test_display_dict(self): + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"})._translate() + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][2]["display_value"] == "12.34%" + df["c"] = ["aaa", "bbb"] + ctx = df.style.format({"a": "{:0.1f}", "c": str.upper})._translate() + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][3]["display_value"] == "AAA" + + def test_bad_apply_shape(self): + df = pd.DataFrame([[1, 2], [3, 4]]) + with pytest.raises(ValueError): + df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) + + with pytest.raises(ValueError): + df.style._apply(lambda x: [""], subset=pd.IndexSlice[[0, 1], :]) + + with pytest.raises(ValueError): + df.style._apply(lambda x: ["", "", "", ""]) + + with pytest.raises(ValueError): + df.style._apply(lambda x: ["", "", ""], subset=1) + + with pytest.raises(ValueError): + df.style._apply(lambda x: ["", "", ""], axis=1) + + def test_apply_bad_return(self): + def f(x): + return "" + + df = pd.DataFrame([[1, 2], [3, 4]]) + 
with pytest.raises(TypeError): + df.style._apply(f, axis=None) + + def test_apply_bad_labels(self): + def f(x): + return pd.DataFrame(index=[1, 2], columns=["a", "b"]) + + df = pd.DataFrame([[1, 2], [3, 4]]) + with pytest.raises(ValueError): + df.style._apply(f, axis=None) + + def test_get_level_lengths(self): + index = pd.MultiIndex.from_product([["a", "b"], [0, 1, 2]]) + expected = { + (0, 0): 3, + (0, 3): 3, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_get_level_lengths_un_sorted(self): + index = pd.MultiIndex.from_arrays([[1, 1, 2, 1], ["a", "b", "b", "d"]]) + expected = { + (0, 0): 2, + (0, 2): 1, + (0, 3): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + } + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_mi_sparse(self): + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) + + result = df.style._translate() + body_0 = result["body"][0][0] + expected_0 = { + "value": "a", + "display_value": "a", + "is_visible": True, + "type": "th", + "attributes": ["rowspan=2"], + "class": "row_heading level0 row0", + "id": "level0_row0", + } + tm.assert_dict_equal(body_0, expected_0) + + body_1 = result["body"][0][1] + expected_1 = { + "value": 0, + "display_value": 0, + "is_visible": True, + "type": "th", + "class": "row_heading level1 row0", + "id": "level1_row0", + } + tm.assert_dict_equal(body_1, expected_1) + + body_10 = result["body"][1][0] + expected_10 = { + "value": "a", + "display_value": "a", + "is_visible": False, + "type": "th", + "class": "row_heading level0 row1", + "id": "level0_row1", + } + tm.assert_dict_equal(body_10, expected_10) + + head = result["head"][0] + expected = [ + { + "type": "th", + "class": "blank", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "blank level0", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "col_heading level0 col0", + "value": "A", + "is_visible": True, + "display_value": "A", + }, + ] + assert head == expected + + def test_mi_sparse_disabled(self): + with pd.option_context("display.multi_sparse", False): + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) + result = df.style._translate() + body = result["body"] + for row in body: + assert "attributes" not in row[0] + + def test_mi_sparse_index_names(self): + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), + ) + result = df.style._translate() + head = result["head"][1] + expected = [ + {"class": "index_name level0", "value": "idx_level_0", "type": "th"}, + {"class": "index_name level1", "value": "idx_level_1", "type": "th"}, + {"class": "blank", "value": "", "type": "th"}, + ] + + assert head == expected + + def test_mi_sparse_column_names(self): + df = pd.DataFrame( + np.arange(16).reshape(4, 4), + index=pd.MultiIndex.from_arrays( + [["a", "a", "b", "a"], [0, 1, 1, 2]], + names=["idx_level_0", "idx_level_1"], + ), + columns=pd.MultiIndex.from_arrays( + [["C1", "C1", "C2", "C2"], [1, 0, 1, 0]], names=["col_0", "col_1"] + ), + ) + result = df.style._translate() + head = result["head"][1] + expected = [ + { + "class": "blank", + "value": "", + "display_value": "", + "type": "th", + "is_visible": True, + }, + { + "class": "index_name level1", + "value": 
"col_1", + "display_value": "col_1", + "is_visible": True, + "type": "th", + }, + { + "class": "col_heading level1 col0", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col1", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, + { + "class": "col_heading level1 col2", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col3", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, + ] + assert head == expected + + def test_hide_single_index(self): + # GH 14194 + # single unnamed index + ctx = self.df.style._translate() + assert ctx["body"][0][0]["is_visible"] + assert ctx["head"][0][0]["is_visible"] + ctx2 = self.df.style.hide_index()._translate() + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["head"][0][0]["is_visible"] + + # single named index + ctx3 = self.df.set_index("A").style._translate() + assert ctx3["body"][0][0]["is_visible"] + assert len(ctx3["head"]) == 2 # 2 header levels + assert ctx3["head"][0][0]["is_visible"] + + ctx4 = self.df.set_index("A").style.hide_index()._translate() + assert not ctx4["body"][0][0]["is_visible"] + assert len(ctx4["head"]) == 1 # only 1 header levels + assert not ctx4["head"][0][0]["is_visible"] + + def test_hide_multiindex(self): + # GH 14194 + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), + ) + ctx1 = df.style._translate() + # tests for 'a' and '0' + assert ctx1["body"][0][0]["is_visible"] + assert ctx1["body"][0][1]["is_visible"] + # check for blank header rows + assert ctx1["head"][0][0]["is_visible"] + assert ctx1["head"][0][1]["is_visible"] + + ctx2 = df.style.hide_index()._translate() + # tests for 'a' and '0' + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["body"][0][1]["is_visible"] + # check for blank header rows + assert not ctx2["head"][0][0]["is_visible"] + assert not ctx2["head"][0][1]["is_visible"] + + def test_hide_columns_single_level(self): + # GH 14194 + # test hiding single column + ctx = self.df.style._translate() + assert ctx["head"][0][1]["is_visible"] + assert ctx["head"][0][1]["display_value"] == "A" + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][0][2]["display_value"] == "B" + assert ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + ctx = self.df.style.hide_columns("A")._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + # test hiding mulitiple columns + ctx = self.df.style.hide_columns(["A", "B"])._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["head"][0][2]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert not ctx["body"][1][2]["is_visible"] # col B, row 1 + + def test_hide_columns_mult_levels(self): + # GH 14194 + # setup dataframe with multiple column levels and indices + i1 = pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ) + i2 = pd.MultiIndex.from_arrays( + [["b", "b"], [0, 1]], names=["col_level_0", "col_level_1"] + ) + df = pd.DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) + ctx = df.style._translate() + # column headers + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][1][2]["is_visible"] + assert 
ctx["head"][1][3]["display_value"] == 1 + # indices + assert ctx["body"][0][0]["is_visible"] + # data + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 + + # hide top column level, which hides both columns + ctx = df.style.hide_columns("b")._translate() + assert not ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][0][0]["is_visible"] # index + + # hide first column only + ctx = df.style.hide_columns([("b", 0)])._translate() + assert ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 + + # hide second column and index + ctx = df.style.hide_columns([("b", 1)]).hide_index()._translate() + assert not ctx["body"][0][0]["is_visible"] # index + assert ctx["head"][0][2]["is_visible"] # b + assert ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["head"][1][3]["is_visible"] # 1 + assert not ctx["body"][1][3]["is_visible"] # 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 + + def test_pipe(self): + def set_caption_from_template(styler, a, b): + return styler.set_caption(f"Dataframe with a = {a} and b = {b}") + + styler = self.df.style.pipe(set_caption_from_template, "A", b="B") + assert "Dataframe with a = A and b = B" in styler.render() + + # Test with an argument that is a (callable, keyword_name) pair. + def f(a, b, styler): + return (a, b, styler) + + styler = self.df.style + result = styler.pipe((f, "styler"), a=1, b=2) + assert result == (1, 2, styler) + + +@td.skip_if_no_mpl +class TestStylerMatplotlibDep: + def test_background_gradient(self): + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + + for c_map in [None, "YlOrRd"]: + result = df.style.background_gradient(cmap=c_map)._compute().ctx + assert all("#" in x[0] for x in result.values()) + assert result[(0, 0)] == result[(0, 1)] + assert result[(1, 0)] == result[(1, 1)] + + result = ( + df.style.background_gradient(subset=pd.IndexSlice[1, "A"])._compute().ctx + ) + + assert result[(1, 0)] == ["background-color: #fff7fb", "color: #000000"] + + @pytest.mark.parametrize( + "c_map,expected", + [ + ( + None, + { + (0, 0): ["background-color: #440154", "color: #f1f1f1"], + (1, 0): ["background-color: #fde725", "color: #000000"], + }, + ), + ( + "YlOrRd", + { + (0, 0): ["background-color: #ffffcc", "color: #000000"], + (1, 0): ["background-color: #800026", "color: #f1f1f1"], + }, + ), + ], + ) + def test_text_color_threshold(self, c_map, expected): + df = pd.DataFrame([1, 2], columns=["A"]) + result = df.style.background_gradient(cmap=c_map)._compute().ctx + assert result == expected + + @pytest.mark.parametrize("text_color_threshold", [1.1, "1", -1, [2, 2]]) + def test_text_color_threshold_raises(self, text_color_threshold): + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + msg = "`text_color_threshold` must be a value from 0 to 1." 
+ with pytest.raises(ValueError, match=msg): + df.style.background_gradient( + text_color_threshold=text_color_threshold + )._compute() + + @td.skip_if_no_mpl + def test_background_gradient_axis(self): + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) + + low = ["background-color: #f7fbff", "color: #000000"] + high = ["background-color: #08306b", "color: #f1f1f1"] + mid = ["background-color: #abd0e6", "color: #000000"] + result = df.style.background_gradient(cmap="Blues", axis=0)._compute().ctx + assert result[(0, 0)] == low + assert result[(0, 1)] == low + assert result[(1, 0)] == high + assert result[(1, 1)] == high + + result = df.style.background_gradient(cmap="Blues", axis=1)._compute().ctx + assert result[(0, 0)] == low + assert result[(0, 1)] == high + assert result[(1, 0)] == low + assert result[(1, 1)] == high + + result = df.style.background_gradient(cmap="Blues", axis=None)._compute().ctx + assert result[(0, 0)] == low + assert result[(0, 1)] == mid + assert result[(1, 0)] == mid + assert result[(1, 1)] == high + + def test_background_gradient_vmin_vmax(self): + # GH 12145 + df = pd.DataFrame(range(5)) + ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx + assert ctx[(0, 0)] == ctx[(1, 0)] + assert ctx[(4, 0)] == ctx[(3, 0)] + + def test_background_gradient_int64(self): + # GH 28869 + df1 = pd.Series(range(3)).to_frame() + df2 = pd.Series(range(3), dtype="Int64").to_frame() + ctx1 = df1.style.background_gradient()._compute().ctx + ctx2 = df2.style.background_gradient()._compute().ctx + assert ctx2[(0, 0)] == ctx1[(0, 0)] + assert ctx2[(1, 0)] == ctx1[(1, 0)] + assert ctx2[(2, 0)] == ctx1[(2, 0)] + + +def test_block_names(): + # catch accidental removal of a block + expected = { + "before_style", + "style", + "table_styles", + "before_cellstyle", + "cellstyle", + "before_table", + "table", + "caption", + "thead", + "tbody", + "after_table", + "before_head_rows", + "head_tr", + "after_head_rows", + "before_rows", + "tr", + "after_rows", + } + result = set(Styler.template.blocks) + assert result == expected + + +def test_from_custom_template(tmpdir): + p = tmpdir.mkdir("templates").join("myhtml.tpl") + p.write( + textwrap.dedent( + """\ + {% extends "html.tpl" %} + {% block table %} +

<h1>{{ table_title|default("My Table") }}</h1>
+ {{ super() }} + {% endblock table %}""" + ) + ) + result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl") + assert issubclass(result, Styler) + assert result.env is not Styler.env + assert result.template is not Styler.template + styler = result(pd.DataFrame({"A": [1, 2]})) + assert styler.render() diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_to_csv.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_csv.py new file mode 100644 index 0000000..a211ac1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_csv.py @@ -0,0 +1,585 @@ +import io +import os +import sys + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, compat +import pandas._testing as tm + + +class TestToCSV: + @pytest.mark.xfail( + (3, 6, 5) > sys.version_info, + reason=("Python csv library bug (see https://bugs.python.org/issue32255)"), + ) + def test_to_csv_with_single_column(self): + # see gh-18676, https://bugs.python.org/issue32255 + # + # Python's CSV library adds an extraneous '""' + # before the newline when the NaN-value is in + # the first row. Otherwise, only the newline + # character is added. This behavior is inconsistent + # and was patched in https://bugs.python.org/pull_request4672. + df1 = DataFrame([None, 1]) + expected1 = """\ +"" +1.0 +""" + with tm.ensure_clean("test.csv") as path: + df1.to_csv(path, header=None, index=None) + with open(path, "r") as f: + assert f.read() == expected1 + + df2 = DataFrame([1, None]) + expected2 = """\ +1.0 +"" +""" + with tm.ensure_clean("test.csv") as path: + df2.to_csv(path, header=None, index=None) + with open(path, "r") as f: + assert f.read() == expected2 + + def test_to_csv_defualt_encoding(self): + # GH17097 + df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) + + with tm.ensure_clean("test.csv") as path: + # the default to_csv encoding is uft-8. 
+ df.to_csv(path) + tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + + def test_to_csv_quotechar(self): + df = DataFrame({"col": [1, 2]}) + expected = """\ +"","col" +"0","1" +"1","2" +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, "r") as f: + assert f.read() == expected + + expected = """\ +$$,$col$ +$0$,$1$ +$1$,$2$ +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1, quotechar="$") + with open(path, "r") as f: + assert f.read() == expected + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(TypeError, match="quotechar"): + df.to_csv(path, quoting=1, quotechar=None) + + def test_to_csv_doublequote(self): + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a""a" +"1","""bb""" +''' + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, "r") as f: + assert f.read() == expected + + from _csv import Error + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(Error, match="escapechar"): + df.to_csv(path, doublequote=False) # no escapechar set + + def test_to_csv_escapechar(self): + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = """\ +"","col" +"0","a\\"a" +"1","\\"bb\\"" +""" + + with tm.ensure_clean("test.csv") as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + with open(path, "r") as f: + assert f.read() == expected + + df = DataFrame({"col": ["a,a", ",bb,"]}) + expected = """\ +,col +0,a\\,a +1,\\,bb\\, +""" + + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + with open(path, "r") as f: + assert f.read() == expected + + def test_csv_to_string(self): + df = DataFrame({"col": [1, 2]}) + expected_rows = [",col", "0,1", "1,2"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv() == expected + + def test_to_csv_decimal(self): + # see gh-781 + df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) + + expected_rows = [",col1,col2,col3", "0,1,a,10.1"] + expected_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv() == expected_default + + expected_rows = [";col1;col2;col3", "0;1;a;10,1"] + expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(decimal=",", sep=";") == expected_european_excel + + expected_rows = [",col1,col2,col3", "0,1,a,10.10"] + expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(float_format="%.2f") == expected_float_format_default + + expected_rows = [";col1;col2;col3", "0;1;a;10,10"] + expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) + assert ( + df.to_csv(decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) + + # see gh-11553: testing if decimal is taken into account for '0.0' + df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) + + expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(index=False, decimal="^") == expected + + # same but for an index + assert df.set_index("a").to_csv(decimal="^") == expected + + # same for a multi-index + assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected + + def test_to_csv_float_format(self): + # testing if float_format is taken into account for the index + # GH 11553 + df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) + + expected_rows = 
["a,b,c", "0,2.20,1", "1,3.30,1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.set_index("a").to_csv(float_format="%.2f") == expected + + # same for a multi-index + assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected + + def test_to_csv_na_rep(self): + # see gh-11553 + # + # Testing if NaN values are correctly represented in the index. + df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # now with an index containing only NaNs + df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "_,0,2", "_,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # check if na_rep parameter does not break anything when no NaN + df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0,0,2", "0,1,3"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ") + assert expected == csv + + def test_to_csv_date_format(self): + # GH 10209 + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-01 00:00:01", + "2,2013-01-01 00:00:02", + "3,2013-01-01 00:00:03", + "4,2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_sec.to_csv() == expected_default_sec + + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-02 00:00:00", + "2,2013-01-03 00:00:00", + "3,2013-01-04 00:00:00", + "4,2013-01-05 00:00:00", + ] + expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-01", + "2,2013-01-01", + "3,2013-01-01", + "4,2013-01-01", + ] + expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-02", + "2,2013-01-03", + "3,2013-01-04", + "4,2013-01-05", + ] + expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) + assert df_day.to_csv() == expected_default_day + assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day + + # see gh-7791 + # + # Testing if date_format parameter is taken into account + # for multi-indexed DataFrames. 
+ df_sec["B"] = 0 + df_sec["C"] = 1 + + expected_rows = ["A,B,C", "2013-01-01,0,1"] + expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) + + df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) + assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + def test_to_csv_multi_index(self): + # see gh-6618 + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) + + exp_rows = [",1", ",2", "0,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["1", "2", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + df = DataFrame( + [1], + columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]]), + ) + + exp_rows = [",,1", ",,2", "1,2,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["1", "2", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) + + exp_rows = [",foo", ",bar", "0,1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv() == exp + + exp_rows = ["foo", "bar", "1"] + exp = tm.convert_rows_list_to_csv_str(exp_rows) + assert df.to_csv(index=False) == exp + + @pytest.mark.parametrize( + "ind,expected", + [ + ( + pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]), + "x,data\n1.0,1\n", + ), + ( + pd.MultiIndex( + levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"] + ), + "x,y,data\n1.0,2.0,1\n", + ), + ], + ) + @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series]) + def test_to_csv_single_level_multi_index(self, ind, expected, klass): + # see gh-19589 + result = klass(pd.Series([1], ind, name="data")).to_csv( + line_terminator="\n", header=True + ) + assert result == expected + + def test_to_csv_string_array_ascii(self): + # GH 10813 + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] + df = pd.DataFrame(str_array) + expected_ascii = """\ +,names +0,"['foo', 'bar']" +1,"['baz', 'qux']" +""" + with tm.ensure_clean("str_test.csv") as path: + df.to_csv(path, encoding="ascii") + with open(path, "r") as f: + assert f.read() == expected_ascii + + def test_to_csv_string_array_utf8(self): + # GH 10813 + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] + df = pd.DataFrame(str_array) + expected_utf8 = """\ +,names +0,"['foo', 'bar']" +1,"['baz', 'qux']" +""" + with tm.ensure_clean("unicode_test.csv") as path: + df.to_csv(path, encoding="utf-8") + with open(path, "r") as f: + assert f.read() == expected_utf8 + + def test_to_csv_string_with_lf(self): + # GH 20353 + data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} + df = pd.DataFrame(data) + with tm.ensure_clean("lf_test.csv") as path: + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_lf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\nef"' + + os_linesep + + b'3,"g\nh\n\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == expected_noarg + with tm.ensure_clean("lf_test.csv") as path: + # case 2: LF as line terminator + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + with tm.ensure_clean("lf_test.csv") as path: + # case 3: CRLF as line 
terminator + # 'line_terminator' should not change inner element + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf + + def test_to_csv_string_with_crlf(self): + # GH 20353 + data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} + df = pd.DataFrame(data) + with tm.ensure_clean("crlf_test.csv") as path: + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_crlf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\r\nef"' + + os_linesep + + b'3,"g\r\nh\r\n\r\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == expected_noarg + with tm.ensure_clean("crlf_test.csv") as path: + # case 2: LF as line terminator + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + with tm.ensure_clean("crlf_test.csv") as path: + # case 3: CRLF as line terminator + # 'line_terminator' should not change inner element + expected_crlf = ( + b"int,str_crlf\r\n" + b"1,abc\r\n" + b'2,"d\r\nef"\r\n' + b'3,"g\r\nh\r\n\r\ni"\r\n' + ) + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf + + def test_to_csv_stdout_file(self, capsys): + # GH 21561 + df = pd.DataFrame( + [["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"] + ) + expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] + expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) + + df.to_csv(sys.stdout, encoding="ascii") + captured = capsys.readouterr() + + assert captured.out == expected_ascii + assert not sys.stdout.closed + + @pytest.mark.xfail( + compat.is_platform_windows(), + reason=( + "Especially in Windows, file stream should not be passed" + "to csv writer without newline='' option." + "(https://docs.python.org/3.6/library/csv.html#csv.writer)" + ), + ) + def test_to_csv_write_to_open_file(self): + # GH 21696 + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected = """\ +manual header +x +y +z +""" + with tm.ensure_clean("test.txt") as path: + with open(path, "w") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + with open(path, "r") as f: + assert f.read() == expected + + def test_to_csv_write_to_open_file_with_newline_py3(self): + # see gh-21696 + # see gh-20353 + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected_rows = ["x", "y", "z"] + expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("test.txt") as path: + with open(path, "w", newline="") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + + with open(path, "rb") as f: + assert f.read() == bytes(expected, "utf-8") + + @pytest.mark.parametrize("to_infer", [True, False]) + @pytest.mark.parametrize("read_infer", [True, False]) + def test_to_csv_compression(self, compression_only, read_infer, to_infer): + # see gh-15008 + compression = compression_only + + if compression == "zip": + pytest.skip(f"{compression} is not supported for to_csv") + + # We'll complete file extension subsequently. + filename = "test." 
+ + if compression == "gzip": + filename += "gz" + else: + # xz --> .xz + # bz2 --> .bz2 + filename += compression + + df = DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression=to_compression) + result = pd.read_csv(path, index_col=0, compression=read_compression) + tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." + filename += "gz" if method == "gzip" else method + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + + def test_to_csv_compression_dict_no_method_raises(self): + # GH 26023 + df = DataFrame({"ABC": [1]}) + compression = {"some_option": True} + msg = "must have key 'method'" + + with tm.ensure_clean("out.zip") as path: + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + @pytest.mark.parametrize( + "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"] + ) + def test_to_csv_zip_arguments(self, compression, archive_name): + # GH 26023 + from zipfile import ZipFile + + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("to_csv_archive_name.zip") as path: + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) + zp = ZipFile(path) + expected_arcname = path if archive_name is None else archive_name + expected_arcname = os.path.basename(expected_arcname) + assert len(zp.filelist) == 1 + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname + + @pytest.mark.parametrize("df_new_type", ["Int64"]) + def test_to_csv_na_rep_long_string(self, df_new_type): + # see gh-25099 + df = pd.DataFrame({"c": [float("nan")] * 3}) + df = df.astype(df_new_type) + expected_rows = ["c", "mynull", "mynull", "mynull"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") + + assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_to_excel.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_excel.py new file mode 100644 index 0000000..883240b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_excel.py @@ -0,0 +1,315 @@ +"""Tests formatting as writer-agnostic ExcelCells + +ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py +""" + +import pytest + +import pandas._testing as tm + +from pandas.io.formats.css import CSSWarning +from pandas.io.formats.excel import CSSToExcelConverter + + +@pytest.mark.parametrize( + "css,expected", + [ + # FONT + # - name + ("font-family: foo,bar", {"font": {"name": "foo"}}), + ('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}), + ("font-family: foo,\nbar", {"font": {"name": "foo"}}), + ("font-family: foo, bar, baz", {"font": {"name": 
"foo"}}), + ("font-family: bar, foo", {"font": {"name": "bar"}}), + ("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}), + ("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}), + ('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}), + ('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}), + # - family + ("font-family: serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: Serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}), + ("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}), + ("font-family: roman, sans serif", {"font": {"name": "roman"}}), + ("font-family: roman, sansserif", {"font": {"name": "roman"}}), + ("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}), + ("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}), + # - size + ("font-size: 1em", {"font": {"size": 12}}), + ("font-size: xx-small", {"font": {"size": 6}}), + ("font-size: x-small", {"font": {"size": 7.5}}), + ("font-size: small", {"font": {"size": 9.6}}), + ("font-size: medium", {"font": {"size": 12}}), + ("font-size: large", {"font": {"size": 13.5}}), + ("font-size: x-large", {"font": {"size": 18}}), + ("font-size: xx-large", {"font": {"size": 24}}), + ("font-size: 50%", {"font": {"size": 6}}), + # - bold + ("font-weight: 100", {"font": {"bold": False}}), + ("font-weight: 200", {"font": {"bold": False}}), + ("font-weight: 300", {"font": {"bold": False}}), + ("font-weight: 400", {"font": {"bold": False}}), + ("font-weight: normal", {"font": {"bold": False}}), + ("font-weight: lighter", {"font": {"bold": False}}), + ("font-weight: bold", {"font": {"bold": True}}), + ("font-weight: bolder", {"font": {"bold": True}}), + ("font-weight: 700", {"font": {"bold": True}}), + ("font-weight: 800", {"font": {"bold": True}}), + ("font-weight: 900", {"font": {"bold": True}}), + # - italic + ("font-style: italic", {"font": {"italic": True}}), + ("font-style: oblique", {"font": {"italic": True}}), + # - underline + ("text-decoration: underline", {"font": {"underline": "single"}}), + ("text-decoration: overline", {}), + ("text-decoration: none", {}), + # - strike + ("text-decoration: line-through", {"font": {"strike": True}}), + ( + "text-decoration: underline line-through", + {"font": {"strike": True, "underline": "single"}}, + ), + ( + "text-decoration: underline; text-decoration: line-through", + {"font": {"strike": True}}, + ), + # - color + ("color: red", {"font": {"color": "FF0000"}}), + ("color: #ff0000", {"font": {"color": "FF0000"}}), + ("color: #f0a", {"font": {"color": "FF00AA"}}), + # - shadow + ("text-shadow: none", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}), + ("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em", {"font": {"shadow": True}}), + # FILL + # - color, fillType + ( + "background-color: red", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #ff0000", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #f0a", + {"fill": {"fgColor": 
"FF00AA", "patternType": "solid"}}, + ), + # BORDER + # - style + ( + "border-style: solid", + { + "border": { + "top": {"style": "medium"}, + "bottom": {"style": "medium"}, + "left": {"style": "medium"}, + "right": {"style": "medium"}, + } + }, + ), + ( + "border-style: solid; border-width: thin", + { + "border": { + "top": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + "right": {"style": "thin"}, + } + }, + ), + ( + "border-top-style: solid; border-top-width: thin", + {"border": {"top": {"style": "thin"}}}, + ), + ( + "border-top-style: solid; border-top-width: 1pt", + {"border": {"top": {"style": "thin"}}}, + ), + ("border-top-style: solid", {"border": {"top": {"style": "medium"}}}), + ( + "border-top-style: solid; border-top-width: medium", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: 2pt", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: thick", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: solid; border-top-width: 4pt", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: dotted", + {"border": {"top": {"style": "mediumDashDotDot"}}}, + ), + ( + "border-top-style: dotted; border-top-width: thin", + {"border": {"top": {"style": "dotted"}}}, + ), + ("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}), + ( + "border-top-style: dashed; border-top-width: thin", + {"border": {"top": {"style": "dashed"}}}, + ), + ("border-top-style: double", {"border": {"top": {"style": "double"}}}), + # - color + ( + "border-style: solid; border-color: #0000ff", + { + "border": { + "top": {"style": "medium", "color": "0000FF"}, + "right": {"style": "medium", "color": "0000FF"}, + "bottom": {"style": "medium", "color": "0000FF"}, + "left": {"style": "medium", "color": "0000FF"}, + } + }, + ), + ( + "border-top-style: double; border-top-color: blue", + {"border": {"top": {"style": "double", "color": "0000FF"}}}, + ), + ( + "border-top-style: solid; border-top-color: #06c", + {"border": {"top": {"style": "medium", "color": "0066CC"}}}, + ), + # ALIGNMENT + # - horizontal + ("text-align: center", {"alignment": {"horizontal": "center"}}), + ("text-align: left", {"alignment": {"horizontal": "left"}}), + ("text-align: right", {"alignment": {"horizontal": "right"}}), + ("text-align: justify", {"alignment": {"horizontal": "justify"}}), + # - vertical + ("vertical-align: top", {"alignment": {"vertical": "top"}}), + ("vertical-align: text-top", {"alignment": {"vertical": "top"}}), + ("vertical-align: middle", {"alignment": {"vertical": "center"}}), + ("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}), + ("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}), + # - wrap_text + ("white-space: nowrap", {"alignment": {"wrap_text": False}}), + ("white-space: pre", {"alignment": {"wrap_text": False}}), + ("white-space: pre-line", {"alignment": {"wrap_text": False}}), + ("white-space: normal", {"alignment": {"wrap_text": True}}), + # NUMBER FORMAT + ("number-format: 0%", {"number_format": {"format_code": "0%"}}), + ], +) +def test_css_to_excel(css, expected): + convert = CSSToExcelConverter() + assert expected == convert(css) + + +def test_css_to_excel_multiple(): + convert = CSSToExcelConverter() + actual = convert( + """ + font-weight: bold; + text-decoration: underline; + color: red; + border-width: thin; + text-align: center; + vertical-align: top; + unused: something; + """ + ) + 
assert { + "font": {"bold": True, "underline": "single", "color": "FF0000"}, + "border": { + "top": {"style": "thin"}, + "right": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } == actual + + +@pytest.mark.parametrize( + "css,inherited,expected", + [ + ("font-weight: bold", "", {"font": {"bold": True}}), + ("", "font-weight: bold", {"font": {"bold": True}}), + ( + "font-weight: bold", + "font-style: italic", + {"font": {"bold": True, "italic": True}}, + ), + ("font-style: normal", "font-style: italic", {"font": {"italic": False}}), + ("font-style: inherit", "", {}), + ( + "font-style: normal; font-style: inherit", + "font-style: italic", + {"font": {"italic": True}}, + ), + ], +) +def test_css_to_excel_inherited(css, inherited, expected): + convert = CSSToExcelConverter(inherited) + assert expected == convert(css) + + +@pytest.mark.parametrize( + "input_color,output_color", + ( + list(CSSToExcelConverter.NAMED_COLORS.items()) + + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] + ), +) +def test_css_to_excel_good_colors(input_color, output_color): + # see gh-18392 + css = ( + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) + + expected = dict() + + expected["fill"] = {"patternType": "solid", "fgColor": output_color} + + expected["font"] = {"color": output_color} + + expected["border"] = { + k: {"color": output_color} for k in ("top", "right", "bottom", "left") + } + + with tm.assert_produces_warning(None): + convert = CSSToExcelConverter() + assert expected == convert(css) + + +@pytest.mark.parametrize("input_color", [None, "not-a-color"]) +def test_css_to_excel_bad_colors(input_color): + # see gh-18392 + css = ( + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) + + expected = dict() + + if input_color is not None: + expected["fill"] = {"patternType": "solid"} + + with tm.assert_produces_warning(CSSWarning): + convert = CSSToExcelConverter() + assert expected == convert(css) diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_to_html.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_html.py new file mode 100644 index 0000000..d3f044a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_html.py @@ -0,0 +1,788 @@ +from datetime import datetime +from io import StringIO +import re + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, option_context +import pandas._testing as tm + +import pandas.io.formats.format as fmt + +lorem_ipsum = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex " + "ea commodo consequat. Duis aute irure dolor in reprehenderit in " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur " + "sint occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum." 
+) + + +def expected_html(datapath, name): + """ + Read HTML file from formats data directory. + + Parameters + ---------- + datapath : pytest fixture + The datapath fixture injected into a test by pytest. + name : str + The name of the HTML file without the suffix. + + Returns + ------- + str : contents of HTML file. + """ + filename = ".".join([name, "html"]) + filepath = datapath("io", "formats", "data", "html", filename) + with open(filepath, encoding="utf-8") as f: + html = f.read() + return html.rstrip() + + +@pytest.fixture(params=["mixed", "empty"]) +def biggie_df_fixture(request): + """Fixture for a big mixed Dataframe and an empty Dataframe""" + if request.param == "mixed": + df = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) + df.loc[:20, "A"] = np.nan + df.loc[:20, "B"] = np.nan + return df + elif request.param == "empty": + df = DataFrame(index=np.arange(200)) + return df + + +@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS) +def justify(request): + return request.param + + +@pytest.mark.parametrize("col_space", [30, 50]) +def test_to_html_with_col_space(col_space): + df = DataFrame(np.random.random(size=(1, 3))) + # check that col_space affects HTML generation + # and be very brittle about it. + result = df.to_html(col_space=col_space) + hdrs = [x for x in result.split(r"\n") if re.search(r"\s]", x)] + assert len(hdrs) > 0 + for h in hdrs: + assert "min-width" in h + assert str(col_space) in h + + +def test_to_html_with_empty_string_label(): + # GH 3547, to_html regards empty string labels as repeated labels + data = {"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]} + df = DataFrame(data).set_index(["c1", "c2"]) + result = df.to_html() + assert "rowspan" not in result + + +@pytest.mark.parametrize( + "df,expected", + [ + (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"), + (DataFrame({"A": ["\u03c3"]}), "unicode_2"), + ], +) +def test_to_html_unicode(df, expected, datapath): + expected = expected_html(datapath, expected) + result = df.to_html() + assert result == expected + + +def test_to_html_encoding(float_frame, tmp_path): + # GH 28663 + path = tmp_path / "test.html" + float_frame.to_html(path, encoding="gbk") + with open(str(path), "r", encoding="gbk") as f: + assert float_frame.to_html() == f.read() + + +def test_to_html_decimal(datapath): + # GH 12031 + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + result = df.to_html(decimal=",") + expected = expected_html(datapath, "gh12031_expected_output") + assert result == expected + + +@pytest.mark.parametrize( + "kwargs,string,expected", + [ + (dict(), "", "escaped"), + (dict(escape=False), "bold", "escape_disabled"), + ], +) +def test_to_html_escaped(kwargs, string, expected, datapath): + a = "strl2": {a: string, b: string}} + result = DataFrame(test_dict).to_html(**kwargs) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize("index_is_named", [True, False]) +def test_to_html_multiindex_index_false(index_is_named, datapath): + # GH 8452 + df = DataFrame( + {"a": range(2), "b": range(3, 5), "c": range(5, 7), "d": range(3, 5)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + if index_is_named: + df.index = Index(df.index.values, name="idx") + result = df.to_html(index=False) + expected = expected_html(datapath, "gh8452_expected_output") + assert result == expected + + +@pytest.mark.parametrize( + "multi_sparse,expected", + [ + (False, "multiindex_sparsify_false_multi_sparse_1"), + (False, 
"multiindex_sparsify_false_multi_sparse_2"), + (True, "multiindex_sparsify_1"), + (True, "multiindex_sparsify_2"), + ], +) +def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath): + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], names=["foo", None]) + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + if expected.endswith("2"): + df.columns = index[::2] + with option_context("display.multi_sparse", multi_sparse): + result = df.to_html() + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "max_rows,expected", + [ + (60, "gh14882_expected_output_1"), + # Test that ... appears in a middle level + (56, "gh14882_expected_output_2"), + ], +) +def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath): + # GH 14882 - Issue on truncation with odd length DataFrame + index = MultiIndex.from_product( + [[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], names=["a", "b", "c"] + ) + df = DataFrame({"n": range(len(index))}, index=index) + result = df.to_html(max_rows=max_rows) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "df,formatters,expected", + [ + ( + DataFrame( + [[0, 1], [2, 3], [4, 5], [6, 7]], + columns=["foo", None], + index=np.arange(4), + ), + {"__index__": lambda x: "abcd"[x]}, + "index_formatter", + ), + ( + DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), + {"months": lambda x: x.strftime("%Y-%m")}, + "datetime64_monthformatter", + ), + ( + DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ), + {"hod": lambda x: x.strftime("%H:%M")}, + "datetime64_hourformatter", + ), + ], +) +def test_to_html_formatters(df, formatters, expected, datapath): + expected = expected_html(datapath, expected) + result = df.to_html(formatters=formatters) + assert result == expected + + +def test_to_html_regression_GH6098(): + df = DataFrame( + { + "clé1": ["a", "a", "b", "b", "a"], + "clé2": ["1er", "2ème", "1er", "2ème", "1er"], + "données1": np.random.randn(5), + "données2": np.random.randn(5), + } + ) + + # it works + df.pivot_table(index=["clé1"], columns=["clé2"])._repr_html_() + + +def test_to_html_truncate(datapath): + index = pd.date_range(start="20010101", freq="D", periods=20) + df = DataFrame(index=index, columns=range(20)) + result = df.to_html(max_rows=8, max_cols=4) + expected = expected_html(datapath, "truncate") + assert result == expected + + +@pytest.mark.parametrize("size", [1, 5]) +def test_html_invalid_formatters_arg_raises(size): + # issue-28469 + df = DataFrame(columns=["a", "b", "c"]) + msg = "Formatters length({}) should match DataFrame number of columns(3)" + with pytest.raises(ValueError, match=re.escape(msg.format(size))): + df.to_html(formatters=["{}".format] * size) + + +def test_to_html_truncate_formatter(datapath): + # issue-25955 + data = [ + {"A": 1, "B": 2, "C": 3, "D": 4}, + {"A": 5, "B": 6, "C": 7, "D": 8}, + {"A": 9, "B": 10, "C": 11, "D": 12}, + {"A": 13, "B": 14, "C": 15, "D": 16}, + ] + + df = DataFrame(data) + fmt = lambda x: str(x) + "_mod" + formatters = [fmt, fmt, None, None] + result = df.to_html(formatters=formatters, max_cols=3) + expected = expected_html(datapath, "truncate_formatter") + assert result == expected + + +@pytest.mark.parametrize( + "sparsify,expected", + [(True, "truncate_multi_index"), (False, "truncate_multi_index_sparse_off")], +) +def test_to_html_truncate_multi_index(sparsify, expected, 
datapath): + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = DataFrame(index=arrays, columns=arrays) + result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "option,result,expected", + [ + (None, lambda df: df.to_html(), "1"), + (None, lambda df: df.to_html(border=0), "0"), + (0, lambda df: df.to_html(), "0"), + (0, lambda df: df._repr_html_(), "0"), + ], +) +def test_to_html_border(option, result, expected): + df = DataFrame({"A": [1, 2]}) + if option is None: + result = result(df) + else: + with option_context("display.html.border", option): + result = result(df) + expected = 'border="{}"'.format(expected) + assert expected in result + + +@pytest.mark.parametrize("biggie_df_fixture", ["mixed"], indirect=True) +def test_to_html(biggie_df_fixture): + # TODO: split this test + df = biggie_df_fixture + s = df.to_html() + + buf = StringIO() + retval = df.to_html(buf=buf) + assert retval is None + assert buf.getvalue() == s + + assert isinstance(s, str) + + df.to_html(columns=["B", "A"], col_space=17) + df.to_html(columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)}) + + df.to_html(columns=["B", "A"], float_format=str) + df.to_html(columns=["B", "A"], col_space=12, float_format=str) + + +@pytest.mark.parametrize("biggie_df_fixture", ["empty"], indirect=True) +def test_to_html_empty_dataframe(biggie_df_fixture): + df = biggie_df_fixture + df.to_html() + + +def test_to_html_filename(biggie_df_fixture, tmpdir): + df = biggie_df_fixture + expected = df.to_html() + path = tmpdir.join("test.html") + df.to_html(path) + result = path.read() + assert result == expected + + +def test_to_html_with_no_bold(): + df = DataFrame({"x": np.random.randn(5)}) + html = df.to_html(bold_rows=False) + result = html[html.find("
")] + assert "B" not in result + + +@pytest.mark.parametrize( + "columns,justify,expected", + [ + ( + MultiIndex.from_tuples( + list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), + names=["CL0", "CL1"], + ), + "left", + "multiindex_1", + ), + ( + MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))), + "right", + "multiindex_2", + ), + ], +) +def test_to_html_multiindex(columns, justify, expected, datapath): + df = DataFrame([list("abcd"), list("efgh")], columns=columns) + result = df.to_html(justify=justify) + expected = expected_html(datapath, expected) + assert result == expected + + +def test_to_html_justify(justify, datapath): + df = DataFrame( + {"A": [6, 30000, 2], "B": [1, 2, 70000], "C": [223442, 0, 1]}, + columns=["A", "B", "C"], + ) + result = df.to_html(justify=justify) + expected = expected_html(datapath, "justify").format(justify=justify) + assert result == expected + + +@pytest.mark.parametrize( + "justify", ["super-right", "small-left", "noinherit", "tiny", "pandas"] +) +def test_to_html_invalid_justify(justify): + # GH 17527 + df = DataFrame() + msg = "Invalid value for justify parameter" + + with pytest.raises(ValueError, match=msg): + df.to_html(justify=justify) + + +def test_to_html_index(datapath): + # TODO: split this test + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + expected_with_index = expected_html(datapath, "index_1") + assert df.to_html() == expected_with_index + + expected_without_index = expected_html(datapath, "index_2") + result = df.to_html(index=False) + for i in index: + assert i not in result + assert result == expected_without_index + df.index = Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples) + + expected_with_index = expected_html(datapath, "index_4") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in ["foo", "bar", "car", "bike"]: + assert i not in result + # must be the same result as normal index + assert result == expected_without_index + + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + +@pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) +def test_to_html_with_classes(classes, datapath): + df = DataFrame() + expected = expected_html(datapath, "with_classes") + result = df.to_html(classes=classes) + assert result == expected + + +def test_to_html_no_index_max_rows(datapath): + # GH 14998 + df = DataFrame({"A": [1, 2, 3, 4]}) + result = df.to_html(index=False, max_rows=1) + expected = expected_html(datapath, "gh14998_expected_output") + assert result == expected + + +def test_to_html_multiindex_max_cols(datapath): + # GH 6131 + index = MultiIndex( + levels=[["ba", "bb", "bc"], ["ca", "cb", "cc"]], + codes=[[0, 1, 2], [0, 1, 2]], + names=["b", "c"], + ) + columns = MultiIndex( + levels=[["d"], ["aa", "ab", "ac"]], + codes=[[0, 0, 0], [0, 1, 2]], + names=[None, "a"], + ) + data = np.array( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]] + ) + df = DataFrame(data, index, 
columns) + result = df.to_html(max_cols=2) + expected = expected_html(datapath, "gh6131_expected_output") + assert result == expected + + +def test_to_html_multi_indexes_index_false(datapath): + # GH 22579 + df = DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(10, 20), "d": range(10, 20)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + df.index = MultiIndex.from_product([["a", "b"], ["c", "d", "e", "f", "g"]]) + result = df.to_html(index=False) + expected = expected_html(datapath, "gh22579_expected_output") + assert result == expected + + +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="columns.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["columns.name.0", "columns.name.1"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="index.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["index.name.0", "index.name.1"] + ), + "named_multi", + ), + ], +) +def test_to_html_basic_alignment( + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): + # GH 22747, GH 22579 + df = DataFrame(np.zeros((2, 2), dtype=int), index=row_index, columns=column_index) + result = df.to_html(index=index, header=header, index_names=index_names) + + if not index: + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type + + if not header: + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type + + filename = "index_" + row_type + "_columns_" + column_type + expected = expected_html(datapath, filename) + assert result == expected + + +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="columns.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="index.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +def test_to_html_alignment_with_truncation( + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): + # GH 22747, GH 22579 + df = DataFrame(np.arange(64).reshape(8, 8), index=row_index, columns=column_index) + result = df.to_html( + max_rows=4, max_cols=4, index=index, header=header, index_names=index_names + ) + + if not index: + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = 
"un" + row_type + + if not header: + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type + + filename = "trunc_df_index_" + row_type + "_columns_" + column_type + expected = expected_html(datapath, filename) + assert result == expected + + +@pytest.mark.parametrize("index", [False, 0]) +def test_to_html_truncation_index_false_max_rows(datapath, index): + # GH 15019 + data = [ + [1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599], + ] + df = DataFrame(data) + result = df.to_html(max_rows=4, index=index) + expected = expected_html(datapath, "gh15019_expected_output") + assert result == expected + + +@pytest.mark.parametrize("index", [False, 0]) +@pytest.mark.parametrize( + "col_index_named, expected_output", + [(False, "gh22783_expected_output"), (True, "gh22783_named_columns_index")], +) +def test_to_html_truncation_index_false_max_cols( + datapath, index, col_index_named, expected_output +): + # GH 22783 + data = [ + [1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599], + ] + df = DataFrame(data) + if col_index_named: + df.columns.rename("columns.name", inplace=True) + result = df.to_html(max_cols=4, index=index) + expected = expected_html(datapath, expected_output) + assert result == expected + + +@pytest.mark.parametrize("notebook", [True, False]) +def test_to_html_notebook_has_style(notebook): + df = DataFrame({"A": [1, 2, 3]}) + result = df.to_html(notebook=notebook) + + if notebook: + assert "tbody tr th:only-of-type" in result + assert "vertical-align: middle;" in result + assert "thead th" in result + else: + assert "tbody tr th:only-of-type" not in result + assert "vertical-align: middle;" not in result + assert "thead th" not in result + + +def test_to_html_with_index_names_false(): + # GH 16493 + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) + result = df.to_html(index_names=False) + assert "myindexname" not in result + + +def test_to_html_with_id(): + # GH 8496 + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) + result = df.to_html(index_names=False, table_id="TEST_ID") + assert ' id="TEST_ID"' in result + + +@pytest.mark.parametrize( + "value,float_format,expected", + [ + (0.19999, "%.3f", "gh21625_expected_output"), + (100.0, "%.0f", "gh22270_expected_output"), + ], +) +def test_to_html_float_format_no_fixed_width(value, float_format, expected, datapath): + # GH 21625, GH 22270 + df = DataFrame({"x": [value]}) + expected = expected_html(datapath, expected) + result = df.to_html(float_format=float_format) + assert result == expected + + +@pytest.mark.parametrize( + "render_links,expected", + [(True, "render_links_true"), (False, "render_links_false")], +) +def test_to_html_render_links(render_links, expected, datapath): + # GH 2679 + data = [ + [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "www.pydata.org", "pydata.org"], + ] + df = DataFrame(data, columns=["foo", "bar", None]) + + result = df.to_html(render_links=render_links) + expected = expected_html(datapath, expected) + assert result == expected + + +@pytest.mark.parametrize( + "method,expected", + [ + ("to_html", lambda x: lorem_ipsum), + ("_repr_html_", lambda x: lorem_ipsum[: x - 4] + "..."), # regression case + ], +) +@pytest.mark.parametrize("max_colwidth", [10, 20, 50, 100]) +def test_ignore_display_max_colwidth(method, expected, max_colwidth): + # see 
gh-17004 + df = DataFrame([lorem_ipsum]) + with pd.option_context("display.max_colwidth", max_colwidth): + result = getattr(df, method)() + expected = expected(max_colwidth) + assert expected in result + + +@pytest.mark.parametrize("classes", [True, 0]) +def test_to_html_invalid_classes_type(classes): + # GH 25608 + df = DataFrame() + msg = "classes must be a string, list, or tuple" + + with pytest.raises(TypeError, match=msg): + df.to_html(classes=classes) + + +def test_to_html_round_column_headers(): + # GH 17280 + df = DataFrame([1], columns=[0.55555]) + with pd.option_context("display.precision", 3): + html = df.to_html(notebook=False) + notebook = df.to_html(notebook=True) + assert "0.55555" in html + assert "0.556" in notebook + + +@pytest.mark.parametrize("unit", ["100px", "10%", "5em", 150]) +def test_to_html_with_col_space_units(unit): + # GH 25941 + df = DataFrame(np.random.random(size=(1, 3))) + result = df.to_html(col_space=unit) + result = result.split("tbody")[0] + hdrs = [x for x in result.split("\n") if re.search(r"<th[>\s]", x)] + if isinstance(unit, int): + unit = str(unit) + "px" + for h in hdrs: + expected = f'<th style="min-width: {unit};">' + assert expected in h + within on malformed HTML. + """ + result = self.read_html( + """
``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>`` + element in the table. ``<td>`` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``<thead>`` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (putting rows with only ``<th>
`` elements into the header). + + .. versionadded:: 0.21.0 + + Similar to :func:`~read_csv` the `header` argument is applied + **after** `skiprows` is applied. + + This function will *always* return a list of :class:`DataFrame` *or* + it will fail, e.g., it will *not* return an empty list. + + Examples + -------- + See the :ref:`read_html documentation in the IO section of the docs + ` for some examples of reading in HTML tables. + """ + _importers() + + # Type check here. We don't want to parse only to fail because of an + # invalid value of an integer skiprows. + if isinstance(skiprows, numbers.Integral) and skiprows < 0: + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) + validate_header_arg(header) + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + ) diff --git a/venv/Lib/site-packages/pandas/io/json/__init__.py b/venv/Lib/site-packages/pandas/io/json/__init__.py new file mode 100644 index 0000000..48febb0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/json/__init__.py @@ -0,0 +1,13 @@ +from pandas.io.json._json import dumps, loads, read_json, to_json +from pandas.io.json._normalize import _json_normalize, json_normalize +from pandas.io.json._table_schema import build_table_schema + +__all__ = [ + "dumps", + "loads", + "read_json", + "to_json", + "_json_normalize", + "json_normalize", + "build_table_schema", +] diff --git a/venv/Lib/site-packages/pandas/io/json/_json.py b/venv/Lib/site-packages/pandas/io/json/_json.py new file mode 100644 index 0000000..12ce5e4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/json/_json.py @@ -0,0 +1,1180 @@ +from collections import abc +import functools +from io import StringIO +from itertools import islice +import os +from typing import Any, Callable, Optional, Type + +import numpy as np + +import pandas._libs.json as json +from pandas._libs.tslibs import iNaT +from pandas._typing import JSONSerializable +from pandas.errors import AbstractMethodError +from pandas.util._decorators import deprecate_kwarg + +from pandas.core.dtypes.common import ensure_str, is_period_dtype + +from pandas import DataFrame, MultiIndex, Series, isna, to_datetime +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.reshape.concat import concat + +from pandas.io.common import ( + get_filepath_or_buffer, + get_handle, + infer_compression, + stringify_path, +) +from pandas.io.json._normalize import convert_to_line_delimits +from pandas.io.json._table_schema import build_table_schema, parse_table_schema +from pandas.io.parsers import _validate_integer + +loads = json.loads +dumps = json.dumps + +TABLE_SCHEMA_VERSION = "0.20.0" + + +# interface to/from +def to_json( + path_or_buf, + obj, + orient: Optional[str] = None, + date_format: str = "epoch", + double_precision: int = 10, + force_ascii: bool = True, + date_unit: str = "ms", + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + lines: bool = False, + compression: Optional[str] = "infer", + index: bool = True, + indent: int = 0, +): + + if not index and orient not in ["split", "table"]: + raise ValueError( + "'index=False' is only valid when 'orient' is 'split' or 'table'" + ) + + path_or_buf = 
stringify_path(path_or_buf) + if lines and orient != "records": + raise ValueError("'lines' keyword only valid when 'orient' is records") + + if orient == "table" and isinstance(obj, Series): + obj = obj.to_frame(name=obj.name or "values") + + writer: Type["Writer"] + if orient == "table" and isinstance(obj, DataFrame): + writer = JSONTableWriter + elif isinstance(obj, Series): + writer = SeriesWriter + elif isinstance(obj, DataFrame): + writer = FrameWriter + else: + raise NotImplementedError("'obj' should be a Series or a DataFrame") + + s = writer( + obj, + orient=orient, + date_format=date_format, + double_precision=double_precision, + ensure_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + index=index, + indent=indent, + ).write() + + if lines: + s = convert_to_line_delimits(s) + + if isinstance(path_or_buf, str): + fh, handles = get_handle(path_or_buf, "w", compression=compression) + try: + fh.write(s) + finally: + fh.close() + elif path_or_buf is None: + return s + else: + path_or_buf.write(s) + + +class Writer: + def __init__( + self, + obj, + orient: Optional[str], + date_format: str, + double_precision: int, + ensure_ascii: bool, + date_unit: str, + index: bool, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + indent: int = 0, + ): + self.obj = obj + + if orient is None: + orient = self._default_orient # type: ignore + + self.orient = orient + self.date_format = date_format + self.double_precision = double_precision + self.ensure_ascii = ensure_ascii + self.date_unit = date_unit + self.default_handler = default_handler + self.index = index + self.indent = indent + + self.is_copy = None + self._format_axes() + + def _format_axes(self): + raise AbstractMethodError(self) + + def write(self): + return self._write( + self.obj, + self.orient, + self.double_precision, + self.ensure_ascii, + self.date_unit, + self.date_format == "iso", + self.default_handler, + self.indent, + ) + + def _write( + self, + obj, + orient: Optional[str], + double_precision: int, + ensure_ascii: bool, + date_unit: str, + iso_dates: bool, + default_handler: Optional[Callable[[Any], JSONSerializable]], + indent: int, + ): + return dumps( + obj, + orient=orient, + double_precision=double_precision, + ensure_ascii=ensure_ascii, + date_unit=date_unit, + iso_dates=iso_dates, + default_handler=default_handler, + indent=indent, + ) + + +class SeriesWriter(Writer): + _default_orient = "index" + + def _format_axes(self): + if not self.obj.index.is_unique and self.orient == "index": + raise ValueError(f"Series index must be unique for orient='{self.orient}'") + + def _write( + self, + obj, + orient: Optional[str], + double_precision: int, + ensure_ascii: bool, + date_unit: str, + iso_dates: bool, + default_handler: Optional[Callable[[Any], JSONSerializable]], + indent: int, + ): + if not self.index and orient == "split": + obj = {"name": obj.name, "data": obj.values} + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + indent, + ) + + +class FrameWriter(Writer): + _default_orient = "columns" + + def _format_axes(self): + """ + Try to format axes if they are datelike. + """ + if not self.obj.index.is_unique and self.orient in ("index", "columns"): + raise ValueError( + f"DataFrame index must be unique for orient='{self.orient}'." 
+ ) + if not self.obj.columns.is_unique and self.orient in ( + "index", + "columns", + "records", + ): + raise ValueError( + f"DataFrame columns must be unique for orient='{self.orient}'." + ) + + def _write( + self, + obj, + orient: Optional[str], + double_precision: int, + ensure_ascii: bool, + date_unit: str, + iso_dates: bool, + default_handler: Optional[Callable[[Any], JSONSerializable]], + indent: int, + ): + if not self.index and orient == "split": + obj = obj.to_dict(orient="split") + del obj["index"] + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + indent, + ) + + +class JSONTableWriter(FrameWriter): + _default_orient = "records" + + def __init__( + self, + obj, + orient: Optional[str], + date_format: str, + double_precision: int, + ensure_ascii: bool, + date_unit: str, + index: bool, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, + indent: int = 0, + ): + """ + Adds a `schema` attribute with the Table Schema, resets + the index (can't do in caller, because the schema inference needs + to know what the index is, forces orient to records, and forces + date_format to 'iso'. + """ + + super().__init__( + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=default_handler, + indent=indent, + ) + + if date_format != "iso": + msg = ( + "Trying to write with `orient='table'` and " + f"`date_format='{date_format}'`. Table Schema requires dates " + "to be formatted with `date_format='iso'`" + ) + raise ValueError(msg) + + self.schema = build_table_schema(obj, index=self.index) + + # NotImplemented on a column MultiIndex + if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): + raise NotImplementedError("orient='table' is not supported for MultiIndex") + + # TODO: Do this timedelta properly in objToJSON.c See GH #15137 + if ( + (obj.ndim == 1) + and (obj.name in set(obj.index.names)) + or len(obj.columns & obj.index.names) + ): + msg = "Overlapping names between the index and columns" + raise ValueError(msg) + + obj = obj.copy() + timedeltas = obj.select_dtypes(include=["timedelta"]).columns + if len(timedeltas): + obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) + # Convert PeriodIndex to datetimes before serializing + if is_period_dtype(obj.index): + obj.index = obj.index.to_timestamp() + + # exclude index from obj if index=False + if not self.index: + self.obj = obj.reset_index(drop=True) + else: + self.obj = obj.reset_index(drop=False) + self.date_format = "iso" + self.orient = "records" + self.index = index + + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + indent, + ): + table_obj = {"schema": self.schema, "data": obj} + serialized = super()._write( + table_obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + indent, + ) + + return serialized + + +@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) +def read_json( + path_or_buf=None, + orient=None, + typ="frame", + dtype=None, + convert_axes=None, + convert_dates=True, + keep_default_dates=True, + numpy=False, + precise_float=False, + date_unit=None, + encoding=None, + lines=False, + chunksize=None, + compression="infer", +): + """ + Convert a JSON string to pandas object. + + Parameters + ---------- + path_or_buf : a valid JSON str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. 
Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.json``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + orient : str + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{index -> [index], columns -> [columns], data -> [values]}`` + - ``'records'`` : list like + ``[{column -> value}, ... , {column -> value}]`` + - ``'index'`` : dict like ``{index -> {column -> value}}`` + - ``'columns'`` : dict like ``{column -> {index -> value}}`` + - ``'values'`` : just the values array + + The allowed and default values depend on the value + of the `typ` parameter. + + * when ``typ == 'series'``, + + - allowed orients are ``{'split','records','index'}`` + - default is ``'index'`` + - The Series index must be unique for orient ``'index'``. + + * when ``typ == 'frame'``, + + - allowed orients are ``{'split','records','index', + 'columns','values', 'table'}`` + - default is ``'columns'`` + - The DataFrame index must be unique for orients ``'index'`` and + ``'columns'``. + - The DataFrame columns must be unique for orients ``'index'``, + ``'columns'``, and ``'records'``. + + .. versionadded:: 0.23.0 + 'table' as an allowed value for the ``orient`` argument + + typ : {'frame', 'series'}, default 'frame' + The type of object to recover. + + dtype : bool or dict, default None + If True, infer dtypes; if a dict of column to dtype, then use those; + if False, then don't infer dtypes at all, applies only to the data. + + For all ``orient`` values except ``'table'``, default is True. + + .. versionchanged:: 0.25.0 + + Not applicable for ``orient='table'``. + + convert_axes : bool, default None + Try to convert the axes to the proper dtypes. + + For all ``orient`` values except ``'table'``, default is True. + + .. versionchanged:: 0.25.0 + + Not applicable for ``orient='table'``. + + convert_dates : bool or list of str, default True + List of columns to parse for dates. If True, then try to parse + datelike columns. A column label is datelike if + + * it ends with ``'_at'``, + + * it ends with ``'_time'``, + + * it begins with ``'timestamp'``, + + * it is ``'modified'``, or + + * it is ``'date'``. + + keep_default_dates : bool, default True + If parsing dates, then parse the default datelike columns. + + numpy : bool, default False + Direct decoding to numpy arrays. Supports numeric data only, but + non-numeric column and index labels are supported. Note also that the + JSON ordering MUST be the same for each term if numpy=True. + + .. deprecated:: 1.0.0 + + precise_float : bool, default False + Set to enable usage of higher precision (strtod) function when + decoding string to double values. Default (False) is to use fast but + less precise builtin functionality. + + date_unit : str, default None + The timestamp unit to detect if converting dates. The default behaviour + is to try and detect the correct precision, but if this is not desired + then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, + milliseconds, microseconds or nanoseconds respectively. + + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. 
+ + lines : bool, default False + Read the file as a json object per line. + + chunksize : int, optional + Return JsonReader object for iteration. + See the `line-delimited json docs + `_ + for more information on ``chunksize``. + This can only be passed if `lines=True`. + If this is None, the file will be read into memory all at once. + + .. versionadded:: 0.21.0 + + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buf is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + + .. versionadded:: 0.21.0 + + Returns + ------- + Series or DataFrame + The type returned depends on the value of `typ`. + + See Also + -------- + DataFrame.to_json : Convert a DataFrame to a JSON string. + Series.to_json : Convert a Series to a JSON string. + + Notes + ----- + Specific to ``orient='table'``, if a :class:`DataFrame` with a literal + :class:`Index` name of `index` gets written with :func:`to_json`, the + subsequent read operation will incorrectly set the :class:`Index` name to + ``None``. This is because `index` is also used by :func:`DataFrame.to_json` + to denote a missing :class:`Index` name, and the subsequent + :func:`read_json` operation cannot distinguish between the two. The same + limitation is encountered with a :class:`MultiIndex` and any names + beginning with ``'level_'``. + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + + Encoding/decoding a Dataframe using ``'split'`` formatted JSON: + + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + >>> pd.read_json(_, orient='split') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> pd.read_json(_, orient='index') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. 
+ + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> pd.read_json(_, orient='records') + col 1 col 2 + 0 a b + 1 c d + + Encoding with Table Schema + + >>> df.to_json(orient='table') + '{"schema": {"fields": [{"name": "index", "type": "string"}, + {"name": "col 1", "type": "string"}, + {"name": "col 2", "type": "string"}], + "primaryKey": "index", + "pandas_version": "0.20.0"}, + "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, + {"index": "row 2", "col 1": "c", "col 2": "d"}]}' + """ + + if orient == "table" and dtype: + raise ValueError("cannot pass both dtype and orient='table'") + if orient == "table" and convert_axes: + raise ValueError("cannot pass both convert_axes and orient='table'") + + if dtype is None and orient != "table": + dtype = True + if convert_axes is None and orient != "table": + convert_axes = True + if encoding is None: + encoding = "utf-8" + + compression = infer_compression(path_or_buf, compression) + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( + path_or_buf, encoding=encoding, compression=compression + ) + + json_reader = JsonReader( + filepath_or_buffer, + orient=orient, + typ=typ, + dtype=dtype, + convert_axes=convert_axes, + convert_dates=convert_dates, + keep_default_dates=keep_default_dates, + numpy=numpy, + precise_float=precise_float, + date_unit=date_unit, + encoding=encoding, + lines=lines, + chunksize=chunksize, + compression=compression, + ) + + if chunksize: + return json_reader + + result = json_reader.read() + if should_close: + filepath_or_buffer.close() + + return result + + +class JsonReader(abc.Iterator): + """ + JsonReader provides an interface for reading in a JSON file. + + If initialized with ``lines=True`` and ``chunksize``, can be iterated over + ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the + whole document. + """ + + def __init__( + self, + filepath_or_buffer, + orient, + typ, + dtype, + convert_axes, + convert_dates, + keep_default_dates, + numpy, + precise_float, + date_unit, + encoding, + lines, + chunksize, + compression, + ): + + self.path_or_buf = filepath_or_buffer + self.orient = orient + self.typ = typ + self.dtype = dtype + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.keep_default_dates = keep_default_dates + self.numpy = numpy + self.precise_float = precise_float + self.date_unit = date_unit + self.encoding = encoding + self.compression = compression + self.lines = lines + self.chunksize = chunksize + self.nrows_seen = 0 + self.should_close = False + + if self.chunksize is not None: + self.chunksize = _validate_integer("chunksize", self.chunksize, 1) + if not self.lines: + raise ValueError("chunksize can only be passed if lines=True") + + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _preprocess_data(self, data): + """ + At this point, the data either has a `read` attribute (e.g. a file + object or a StringIO) or is a string that is a JSON document. + + If self.chunksize, we prepare the data for the `__next__` method. + Otherwise, we read it into memory for the `read` method. + """ + if hasattr(data, "read") and not self.chunksize: + data = data.read() + if not hasattr(data, "read") and self.chunksize: + data = StringIO(data) + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + The function read_json accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. 
open file object, StringIO) + 3. JSON string + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + """ + data = filepath_or_buffer + + exists = False + if isinstance(data, str): + try: + exists = os.path.exists(filepath_or_buffer) + # gh-5874: if the filepath is too long will raise here + except (TypeError, ValueError): + pass + + if exists or self.compression is not None: + data, _ = get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + ) + self.should_close = True + self.open_stream = data + + return data + + def _combine_lines(self, lines) -> str: + """ + Combines a list of JSON objects into one JSON object. + """ + lines = filter(None, map(lambda x: x.strip(), lines)) + return "[" + ",".join(lines) + "]" + + def read(self): + """ + Read the whole JSON input into a pandas object. + """ + if self.lines and self.chunksize: + obj = concat(self) + elif self.lines: + data = ensure_str(self.data) + obj = self._get_object_parser(self._combine_lines(data.split("\n"))) + else: + obj = self._get_object_parser(self.data) + self.close() + return obj + + def _get_object_parser(self, json): + """ + Parses a json document into a pandas object. + """ + typ = self.typ + dtype = self.dtype + kwargs = { + "orient": self.orient, + "dtype": self.dtype, + "convert_axes": self.convert_axes, + "convert_dates": self.convert_dates, + "keep_default_dates": self.keep_default_dates, + "numpy": self.numpy, + "precise_float": self.precise_float, + "date_unit": self.date_unit, + } + obj = None + if typ == "frame": + obj = FrameParser(json, **kwargs).parse() + + if typ == "series" or obj is None: + if not isinstance(dtype, bool): + kwargs["dtype"] = dtype + obj = SeriesParser(json, **kwargs).parse() + + return obj + + def close(self): + """ + If we opened a stream earlier, in _get_data_from_filepath, we should + close it. + + If an open stream or file was passed, we leave it open. + """ + if self.should_close: + try: + self.open_stream.close() + except (IOError, AttributeError): + pass + + def __next__(self): + lines = list(islice(self.data, self.chunksize)) + if lines: + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + + # Make sure that the returned objects have the right index. 
+ obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) + self.nrows_seen += len(obj) + + return obj + + self.close() + raise StopIteration + + +class Parser: + + _STAMP_UNITS = ("s", "ms", "us", "ns") + _MIN_STAMPS = { + "s": 31536000, + "ms": 31536000000, + "us": 31536000000000, + "ns": 31536000000000000, + } + + def __init__( + self, + json, + orient, + dtype=None, + convert_axes=True, + convert_dates=True, + keep_default_dates=False, + numpy=False, + precise_float=False, + date_unit=None, + ): + self.json = json + + if orient is None: + orient = self._default_orient + self.orient = orient + + self.dtype = dtype + + if orient == "split": + numpy = False + + if date_unit is not None: + date_unit = date_unit.lower() + if date_unit not in self._STAMP_UNITS: + raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}") + self.min_stamp = self._MIN_STAMPS[date_unit] + else: + self.min_stamp = self._MIN_STAMPS["s"] + + self.numpy = numpy + self.precise_float = precise_float + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.date_unit = date_unit + self.keep_default_dates = keep_default_dates + self.obj = None + + def check_keys_split(self, decoded): + """ + Checks that dict has only the appropriate keys for orient='split'. + """ + bad_keys = set(decoded.keys()).difference(set(self._split_keys)) + if bad_keys: + bad_keys = ", ".join(bad_keys) + raise ValueError(f"JSON data had unexpected key(s): {bad_keys}") + + def parse(self): + + # try numpy + numpy = self.numpy + if numpy: + self._parse_numpy() + + else: + self._parse_no_numpy() + + if self.obj is None: + return None + if self.convert_axes: + self._convert_axes() + self._try_convert_types() + return self.obj + + def _convert_axes(self): + """ + Try to convert axes. + """ + for axis in self.obj._AXIS_NUMBERS.keys(): + new_axis, result = self._try_convert_data( + axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True + ) + if result: + setattr(self.obj, axis, new_axis) + + def _try_convert_types(self): + raise AbstractMethodError(self) + + def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): + """ + Try to parse a ndarray like into a column by inferring dtype. 
+ """ + + # don't try to coerce, unless a force conversion + if use_dtypes: + if not self.dtype: + return data, False + elif self.dtype is True: + pass + else: + # dtype to force + dtype = ( + self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype + ) + if dtype is not None: + try: + dtype = np.dtype(dtype) + return data.astype(dtype), True + except (TypeError, ValueError): + return data, False + + if convert_dates: + new_data, result = self._try_convert_to_date(data) + if result: + return new_data, True + + result = False + + if data.dtype == "object": + + # try float + try: + data = data.astype("float64") + result = True + except (TypeError, ValueError): + pass + + if data.dtype.kind == "f": + + if data.dtype != "float64": + + # coerce floats to 64 + try: + data = data.astype("float64") + result = True + except (TypeError, ValueError): + pass + + # don't coerce 0-len data + if len(data) and (data.dtype == "float" or data.dtype == "object"): + + # coerce ints if we can + try: + new_data = data.astype("int64") + if (new_data == data).all(): + data = new_data + result = True + except (TypeError, ValueError): + pass + + # coerce ints to 64 + if data.dtype == "int": + + # coerce floats to 64 + try: + data = data.astype("int64") + result = True + except (TypeError, ValueError): + pass + + return data, result + + def _try_convert_to_date(self, data): + """ + Try to parse a ndarray like into a date column. + + Try to coerce object in epoch/iso formats and integer/float in epoch + formats. Return a boolean if parsing was successful. + """ + + # no conversion on empty + if not len(data): + return data, False + + new_data = data + if new_data.dtype == "object": + try: + new_data = data.astype("int64") + except (TypeError, ValueError, OverflowError): + pass + + # ignore numbers that are out of range + if issubclass(new_data.dtype.type, np.number): + in_range = ( + isna(new_data.values) + | (new_data > self.min_stamp) + | (new_data.values == iNaT) + ) + if not in_range.all(): + return data, False + + date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS + for date_unit in date_units: + try: + new_data = to_datetime(new_data, errors="raise", unit=date_unit) + except (ValueError, OverflowError): + continue + return new_data, True + return data, False + + def _try_convert_dates(self): + raise AbstractMethodError(self) + + +class SeriesParser(Parser): + _default_orient = "index" + _split_keys = ("name", "index", "data") + + def _parse_no_numpy(self): + data = loads(self.json, precise_float=self.precise_float) + + if self.orient == "split": + decoded = {str(k): v for k, v in data.items()} + self.check_keys_split(decoded) + self.obj = create_series_with_explicit_dtype(**decoded) + else: + self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) + + def _parse_numpy(self): + load_kwargs = { + "dtype": None, + "numpy": True, + "precise_float": self.precise_float, + } + if self.orient in ["columns", "index"]: + load_kwargs["labelled"] = True + loads_ = functools.partial(loads, **load_kwargs) + data = loads_(self.json) + + if self.orient == "split": + decoded = {str(k): v for k, v in data.items()} + self.check_keys_split(decoded) + self.obj = create_series_with_explicit_dtype(**decoded) + elif self.orient in ["columns", "index"]: + self.obj = create_series_with_explicit_dtype(*data, dtype_if_empty=object) + else: + self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) + + def _try_convert_types(self): + if self.obj is None: + return + obj, 
result = self._try_convert_data( + "data", self.obj, convert_dates=self.convert_dates + ) + if result: + self.obj = obj + + +class FrameParser(Parser): + _default_orient = "columns" + _split_keys = ("columns", "index", "data") + + def _parse_numpy(self): + + json = self.json + orient = self.orient + + if orient == "columns": + args = loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + if len(args): + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads( + json, dtype=None, numpy=True, precise_float=self.precise_float + ) + decoded = {str(k): v for k, v in decoded.items()} + self.check_keys_split(decoded) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame( + loads(json, dtype=None, numpy=True, precise_float=self.precise_float) + ) + else: + self.obj = DataFrame( + *loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + ) + + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + + if orient == "columns": + self.obj = DataFrame( + loads(json, precise_float=self.precise_float), dtype=None + ) + elif orient == "split": + decoded = { + str(k): v + for k, v in loads(json, precise_float=self.precise_float).items() + } + self.check_keys_split(decoded) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame.from_dict( + loads(json, precise_float=self.precise_float), + dtype=None, + orient="index", + ) + elif orient == "table": + self.obj = parse_table_schema(json, precise_float=self.precise_float) + else: + self.obj = DataFrame( + loads(json, precise_float=self.precise_float), dtype=None + ) + + def _process_converter(self, f, filt=None): + """ + Take a conversion function and possibly recreate the frame. + """ + + if filt is None: + filt = lambda col, c: True + + needs_new_obj = False + new_obj = dict() + for i, (col, c) in enumerate(self.obj.items()): + if filt(col, c): + new_data, result = f(col, c) + if result: + c = new_data + needs_new_obj = True + new_obj[i] = c + + if needs_new_obj: + + # possibly handle dup columns + new_obj = DataFrame(new_obj, index=self.obj.index) + new_obj.columns = self.obj.columns + self.obj = new_obj + + def _try_convert_types(self): + if self.obj is None: + return + if self.convert_dates: + self._try_convert_dates() + + self._process_converter( + lambda col, c: self._try_convert_data(col, c, convert_dates=False) + ) + + def _try_convert_dates(self): + if self.obj is None: + return + + # our columns to parse + convert_dates = self.convert_dates + if convert_dates is True: + convert_dates = [] + convert_dates = set(convert_dates) + + def is_ok(col) -> bool: + """ + Return if this col is ok to try for a date parse. 
+ """ + if not isinstance(col, str): + return False + + col_lower = col.lower() + if ( + col_lower.endswith("_at") + or col_lower.endswith("_time") + or col_lower == "modified" + or col_lower == "date" + or col_lower == "datetime" + or col_lower.startswith("timestamp") + ): + return True + return False + + self._process_converter( + lambda col, c: self._try_convert_to_date(c), + lambda col, c: ( + (self.keep_default_dates and is_ok(col)) or col in convert_dates + ), + ) diff --git a/venv/Lib/site-packages/pandas/io/json/_normalize.py b/venv/Lib/site-packages/pandas/io/json/_normalize.py new file mode 100644 index 0000000..c0596c9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/json/_normalize.py @@ -0,0 +1,349 @@ +# --------------------------------------------------------------------- +# JSON normalization routines + +from collections import defaultdict +import copy +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union + +import numpy as np + +from pandas._libs.writers import convert_json_to_lines +from pandas.util._decorators import deprecate + +import pandas as pd +from pandas import DataFrame + + +def convert_to_line_delimits(s): + """ + Helper function that converts JSON lists to line delimited JSON. + """ + + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == "[" and s[-1] == "]": + return s + s = s[1:-1] + + return convert_json_to_lines(s) + + +def nested_to_record( + ds, + prefix: str = "", + sep: str = ".", + level: int = 0, + max_level: Optional[int] = None, +): + """ + A simplified json_normalize + + Converts a nested dict into a flat dict ("record"), unlike json_normalize, + it does not attempt to extract a subset of the data. + + Parameters + ---------- + ds : dict or list of dicts + prefix: the prefix, optional, default: "" + sep : str, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + level: int, optional, default: 0 + The number of levels in the json string. + + max_level: int, optional, default: None + The max depth to normalize. + + .. 
versionadded:: 0.25.0 + + Returns + ------- + d - dict or list of dicts, matching `ds` + + Examples + -------- + + IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), + nested=dict(e=dict(c=1,d=2),d=2))) + Out[52]: + {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1, + 'nested.d': 2, + 'nested.e.c': 1, + 'nested.e.d': 2} + """ + singleton = False + if isinstance(ds, dict): + ds = [ds] + singleton = True + new_ds = [] + for d in ds: + new_d = copy.deepcopy(d) + for k, v in d.items(): + # each key gets renamed with prefix + if not isinstance(k, str): + k = str(k) + if level == 0: + newkey = k + else: + newkey = prefix + sep + k + + # flatten if type is dict and + # current dict level < maximum level provided and + # only dicts gets recurse-flattened + # only at level>1 do we rename the rest of the keys + if not isinstance(v, dict) or ( + max_level is not None and level >= max_level + ): + if level != 0: # so we skip copying for top level, common case + v = new_d.pop(k) + new_d[newkey] = v + continue + else: + v = new_d.pop(k) + new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) + new_ds.append(new_d) + + if singleton: + return new_ds[0] + return new_ds + + +def _json_normalize( + data: Union[Dict, List[Dict]], + record_path: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, + meta_prefix: Optional[str] = None, + record_prefix: Optional[str] = None, + errors: Optional[str] = "raise", + sep: str = ".", + max_level: Optional[int] = None, +) -> "DataFrame": + """ + Normalize semi-structured JSON data into a flat table. + + Parameters + ---------- + data : dict or list of dicts + Unserialized JSON objects. + record_path : str or list of str, default None + Path in each object to list of records. If not passed, data will be + assumed to be an array of records. + meta : list of paths (str or list of str), default None + Fields to use as metadata for each record in resulting table. + meta_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + meta is ['foo', 'bar']. + record_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + path to records is ['foo', 'bar']. + errors : {'raise', 'ignore'}, default 'raise' + Configures error handling. + + * 'ignore' : will ignore KeyError if keys listed in meta are not + always present. + * 'raise' : will raise KeyError if keys listed in meta are not + always present. + sep : str, default '.' + Nested records will generate names separated by sep. + e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar. + max_level : int, default None + Max number of levels(depth of dict) to normalize. + if None, normalizes all levels. + + .. versionadded:: 0.25.0 + + Returns + ------- + frame : DataFrame + Normalize semi-structured JSON data into a flat table. + + Examples + -------- + + >>> from pandas.io.json import json_normalize + >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, + ... {'name': {'given': 'Mose', 'family': 'Regner'}}, + ... {'id': 2, 'name': 'Faye Raker'}] + >>> json_normalize(data) + id name name.family name.first name.given name.last + 0 1.0 NaN NaN Coleen NaN Volk + 1 NaN NaN Regner NaN Mose NaN + 2 2.0 Faye Raker NaN NaN NaN NaN + + >>> data = [{'id': 1, + ... 'name': "Cole Volk", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'name': "Mose Reg", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'id': 2, 'name': 'Faye Raker', + ... 
'fitness': {'height': 130, 'weight': 60}}] + >>> json_normalize(data, max_level=0) + fitness id name + 0 {'height': 130, 'weight': 60} 1.0 Cole Volk + 1 {'height': 130, 'weight': 60} NaN Mose Reg + 2 {'height': 130, 'weight': 60} 2.0 Faye Raker + + Normalizes nested data up to level 1. + + >>> data = [{'id': 1, + ... 'name': "Cole Volk", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'name': "Mose Reg", + ... 'fitness': {'height': 130, 'weight': 60}}, + ... {'id': 2, 'name': 'Faye Raker', + ... 'fitness': {'height': 130, 'weight': 60}}] + >>> json_normalize(data, max_level=1) + fitness.height fitness.weight id name + 0 130 60 1.0 Cole Volk + 1 130 60 NaN Mose Reg + 2 130 60 2.0 Faye Raker + + >>> data = [{'state': 'Florida', + ... 'shortname': 'FL', + ... 'info': {'governor': 'Rick Scott'}, + ... 'counties': [{'name': 'Dade', 'population': 12345}, + ... {'name': 'Broward', 'population': 40000}, + ... {'name': 'Palm Beach', 'population': 60000}]}, + ... {'state': 'Ohio', + ... 'shortname': 'OH', + ... 'info': {'governor': 'John Kasich'}, + ... 'counties': [{'name': 'Summit', 'population': 1234}, + ... {'name': 'Cuyahoga', 'population': 1337}]}] + >>> result = json_normalize(data, 'counties', ['state', 'shortname', + ... ['info', 'governor']]) + >>> result + name population state shortname info.governor + 0 Dade 12345 Florida FL Rick Scott + 1 Broward 40000 Florida FL Rick Scott + 2 Palm Beach 60000 Florida FL Rick Scott + 3 Summit 1234 Ohio OH John Kasich + 4 Cuyahoga 1337 Ohio OH John Kasich + + >>> data = {'A': [1, 2]} + >>> json_normalize(data, 'A', record_prefix='Prefix.') + Prefix.0 + 0 1 + 1 2 + + Returns normalized data with columns prefixed with the given string. + """ + + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore + if isinstance(spec, list): + for field in spec: + result = result[field] + else: + result = result[spec] + + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." 
+ ) + + return result + + if isinstance(data, list) and not data: + return DataFrame() + + # A bit of a hackjob + if isinstance(data, dict): + data = [data] + + if record_path is None: + if any([isinstance(x, dict) for x in y.values()] for y in data): + # naive normalization, this is idempotent for flat records + # and potentially will inflate the data considerably for + # deeply nested structures: + # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} + # + # TODO: handle record value which are lists, at least error + # reasonably + data = nested_to_record(data, sep=sep, max_level=max_level) + return DataFrame(data) + elif not isinstance(record_path, list): + record_path = [record_path] + + if meta is None: + meta = [] + elif not isinstance(meta, list): + meta = [meta] + + _meta = [m if isinstance(m, list) else [m] for m in meta] + + # Disastrously inefficient for now + records: List = [] + lengths = [] + + meta_vals: DefaultDict = defaultdict(list) + meta_keys = [sep.join(val) for val in _meta] + + def _recursive_extract(data, path, seen_meta, level=0): + if isinstance(data, dict): + data = [data] + if len(path) > 1: + for obj in data: + for val, key in zip(_meta, meta_keys): + if level + 1 == len(val): + seen_meta[key] = _pull_field(obj, val[-1]) + + _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) + else: + for obj in data: + recs = _pull_field(obj, path[0]) + recs = [ + nested_to_record(r, sep=sep, max_level=max_level) + if isinstance(r, dict) + else r + for r in recs + ] + + # For repeating the metadata later + lengths.append(len(recs)) + for val, key in zip(_meta, meta_keys): + if level + 1 > len(val): + meta_val = seen_meta[key] + else: + try: + meta_val = _pull_field(obj, val[level:]) + except KeyError as e: + if errors == "ignore": + meta_val = np.nan + else: + raise KeyError( + "Try running with " + "errors='ignore' as key " + f"{e} is not always present" + ) + meta_vals[key].append(meta_val) + records.extend(recs) + + _recursive_extract(data, record_path, {}, level=0) + + result = DataFrame(records) + + if record_prefix is not None: + result = result.rename(columns=lambda x: f"{record_prefix}{x}") + + # Data types, a problem + for k, v in meta_vals.items(): + if meta_prefix is not None: + k = meta_prefix + k + + if k in result: + raise ValueError( + f"Conflicting metadata name {k}, need distinguishing prefix " + ) + result[k] = np.array(v, dtype=object).repeat(lengths) + return result + + +json_normalize = deprecate( + "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize" +) diff --git a/venv/Lib/site-packages/pandas/io/json/_table_schema.py b/venv/Lib/site-packages/pandas/io/json/_table_schema.py new file mode 100644 index 0000000..5f23b95 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/json/_table_schema.py @@ -0,0 +1,338 @@ +""" +Table Schema builders + +http://specs.frictionlessdata.io/json-table-schema/ +""" +import warnings + +import pandas._libs.json as json + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_numeric_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import DataFrame +import pandas.core.common as com + +loads = json.loads + + +def as_json_table_type(x): + """ + Convert a NumPy / pandas type to its corresponding json_table. 
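A quick illustration of the mapping (a hedged sketch; assumes numpy is
imported as ``np``):

>>> import numpy as np
>>> as_json_table_type(np.dtype('int64'))  # doctest: +SKIP
'integer'
>>> as_json_table_type(np.dtype('datetime64[ns]'))  # doctest: +SKIP
'datetime'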
+ + Parameters + ---------- + x : array or dtype + + Returns + ------- + t : str + the Table Schema data types + + Notes + ----- + This table shows the relationship between NumPy / pandas dtypes, + and Table Schema dtypes. + + ============== ================= + Pandas type Table Schema type + ============== ================= + int64 integer + float64 number + bool boolean + datetime64[ns] datetime + timedelta64[ns] duration + object str + categorical any + =============== ================= + """ + if is_integer_dtype(x): + return "integer" + elif is_bool_dtype(x): + return "boolean" + elif is_numeric_dtype(x): + return "number" + elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x): + return "datetime" + elif is_timedelta64_dtype(x): + return "duration" + elif is_categorical_dtype(x): + return "any" + elif is_string_dtype(x): + return "string" + else: + return "any" + + +def set_default_names(data): + """Sets index names to 'index' for regular, or 'level_x' for Multi""" + if com.all_not_none(*data.index.names): + nms = data.index.names + if len(nms) == 1 and data.index.name == "index": + warnings.warn("Index name of 'index' is not round-trippable") + elif len(nms) > 1 and any(x.startswith("level_") for x in nms): + warnings.warn("Index names beginning with 'level_' are not round-trippable") + return data + + data = data.copy() + if data.index.nlevels > 1: + names = [ + name if name is not None else f"level_{i}" + for i, name in enumerate(data.index.names) + ] + data.index.names = names + else: + data.index.name = data.index.name or "index" + return data + + +def convert_pandas_type_to_json_field(arr, dtype=None): + dtype = dtype or arr.dtype + if arr.name is None: + name = "values" + else: + name = arr.name + field = {"name": name, "type": as_json_table_type(dtype)} + + if is_categorical_dtype(arr): + if hasattr(arr, "categories"): + cats = arr.categories + ordered = arr.ordered + else: + cats = arr.cat.categories + ordered = arr.cat.ordered + field["constraints"] = {"enum": list(cats)} + field["ordered"] = ordered + elif is_period_dtype(arr): + field["freq"] = arr.freqstr + elif is_datetime64tz_dtype(arr): + if hasattr(arr, "dt"): + field["tz"] = arr.dt.tz.zone + else: + field["tz"] = arr.tz.zone + return field + + +def convert_json_field_to_pandas_type(field): + """ + Converts a JSON field descriptor into its corresponding NumPy / pandas type + + Parameters + ---------- + field + A JSON field descriptor + + Returns + ------- + dtype + + Raises + ------ + ValueError + If the type of the provided field is unknown or currently unsupported + + Examples + -------- + >>> convert_json_field_to_pandas_type({'name': 'an_int', + 'type': 'integer'}) + 'int64' + >>> convert_json_field_to_pandas_type({'name': 'a_categorical', + 'type': 'any', + 'constraints': {'enum': [ + 'a', 'b', 'c']}, + 'ordered': True}) + 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)' + >>> convert_json_field_to_pandas_type({'name': 'a_datetime', + 'type': 'datetime'}) + 'datetime64[ns]' + >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz', + 'type': 'datetime', + 'tz': 'US/Central'}) + 'datetime64[ns, US/Central]' + """ + typ = field["type"] + if typ == "string": + return "object" + elif typ == "integer": + return "int64" + elif typ == "number": + return "float64" + elif typ == "boolean": + return "bool" + elif typ == "duration": + return "timedelta64" + elif typ == "datetime": + if field.get("tz"): + return f"datetime64[ns, {field['tz']}]" + else: + return 
"datetime64[ns]" + elif typ == "any": + if "constraints" in field and "ordered" in field: + return CategoricalDtype( + categories=field["constraints"]["enum"], ordered=field["ordered"] + ) + else: + return "object" + + raise ValueError(f"Unsupported or invalid field type: {typ}") + + +def build_table_schema(data, index=True, primary_key=None, version=True): + """ + Create a Table schema from ``data``. + + Parameters + ---------- + data : Series, DataFrame + index : bool, default True + Whether to include ``data.index`` in the schema. + primary_key : bool or None, default True + Column names to designate as the primary key. + The default `None` will set `'primaryKey'` to the index + level or levels if the index is unique. + version : bool, default True + Whether to include a field `pandas_version` with the version + of pandas that generated the schema. + + Returns + ------- + schema : dict + + Notes + ----- + See `_as_json_table_type` for conversion types. + Timedeltas as converted to ISO8601 duration format with + 9 decimal places after the seconds field for nanosecond precision. + + Categoricals are converted to the `any` dtype, and use the `enum` field + constraint to list the allowed values. The `ordered` attribute is included + in an `ordered` field. + + Examples + -------- + >>> df = pd.DataFrame( + ... {'A': [1, 2, 3], + ... 'B': ['a', 'b', 'c'], + ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), + ... }, index=pd.Index(range(3), name='idx')) + >>> build_table_schema(df) + {'fields': [{'name': 'idx', 'type': 'integer'}, + {'name': 'A', 'type': 'integer'}, + {'name': 'B', 'type': 'string'}, + {'name': 'C', 'type': 'datetime'}], + 'pandas_version': '0.20.0', + 'primaryKey': ['idx']} + """ + if index is True: + data = set_default_names(data) + + schema = {} + fields = [] + + if index: + if data.index.nlevels > 1: + for level, name in zip(data.index.levels, data.index.names): + new_field = convert_pandas_type_to_json_field(level) + new_field["name"] = name + fields.append(new_field) + else: + fields.append(convert_pandas_type_to_json_field(data.index)) + + if data.ndim > 1: + for column, s in data.items(): + fields.append(convert_pandas_type_to_json_field(s)) + else: + fields.append(convert_pandas_type_to_json_field(data)) + + schema["fields"] = fields + if index and data.index.is_unique and primary_key is None: + if data.index.nlevels == 1: + schema["primaryKey"] = [data.index.name] + else: + schema["primaryKey"] = data.index.names + elif primary_key is not None: + schema["primaryKey"] = primary_key + + if version: + schema["pandas_version"] = "0.20.0" + return schema + + +def parse_table_schema(json, precise_float): + """ + Builds a DataFrame from a given schema + + Parameters + ---------- + json : + A JSON table schema + precise_float : boolean + Flag controlling precision when decoding string to double values, as + dictated by ``read_json`` + + Returns + ------- + df : DataFrame + + Raises + ------ + NotImplementedError + If the JSON table schema contains either timezone or timedelta data + + Notes + ----- + Because :func:`DataFrame.to_json` uses the string 'index' to denote a + name-less :class:`Index`, this function sets the name of the returned + :class:`DataFrame` to ``None`` when said string is encountered with a + normal :class:`Index`. For a :class:`MultiIndex`, the same limitation + applies to any strings beginning with 'level_'. Therefore, an + :class:`Index` name of 'index' and :class:`MultiIndex` names starting + with 'level_' are not supported. 
+ + See Also + -------- + build_table_schema : Inverse function. + pandas.read_json + """ + table = loads(json, precise_float=precise_float) + col_order = [field["name"] for field in table["schema"]["fields"]] + df = DataFrame(table["data"], columns=col_order)[col_order] + + dtypes = { + field["name"]: convert_json_field_to_pandas_type(field) + for field in table["schema"]["fields"] + } + + # Cannot directly use as_type with timezone data on object; raise for now + if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): + raise NotImplementedError('table="orient" can not yet read timezone data') + + # No ISO constructor for Timedelta as of yet, so need to raise + if "timedelta64" in dtypes.values(): + raise NotImplementedError( + 'table="orient" can not yet read ISO-formatted Timedelta data' + ) + + df = df.astype(dtypes) + + if "primaryKey" in table["schema"]: + df = df.set_index(table["schema"]["primaryKey"]) + if len(df.index.names) == 1: + if df.index.name == "index": + df.index.name = None + else: + df.index.names = [ + None if x.startswith("level_") else x for x in df.index.names + ] + + return df diff --git a/venv/Lib/site-packages/pandas/io/orc.py b/venv/Lib/site-packages/pandas/io/orc.py new file mode 100644 index 0000000..bbefe44 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/orc.py @@ -0,0 +1,57 @@ +""" orc compat """ + +import distutils +from typing import TYPE_CHECKING, List, Optional + +from pandas._typing import FilePathOrBuffer + +from pandas.io.common import get_filepath_or_buffer + +if TYPE_CHECKING: + from pandas import DataFrame + + +def read_orc( + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, +) -> "DataFrame": + """ + Load an ORC object from the file path, returning a DataFrame. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.orc``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + columns : list, default None + If not None, only these columns will be read from the file. + **kwargs + Any additional kwargs are passed to pyarrow. 
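For example (a hedged sketch; ``example.orc`` is a placeholder path, and
pyarrow >= 0.13.0 must be installed):

>>> df = pd.read_orc('example.orc', columns=['a', 'b'])  # doctest: +SKIP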
+ + Returns + ------- + DataFrame + """ + + # we require a newer version of pyarrow than we support for parquet + import pyarrow + + if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": + raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + + import pyarrow.orc + + path, _, _, _ = get_filepath_or_buffer(path) + orc_file = pyarrow.orc.ORCFile(path) + result = orc_file.read(columns=columns, **kwargs).to_pandas() + return result diff --git a/venv/Lib/site-packages/pandas/io/parquet.py b/venv/Lib/site-packages/pandas/io/parquet.py new file mode 100644 index 0000000..3a686a1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/parquet.py @@ -0,0 +1,310 @@ +""" parquet compat """ + +from typing import Any, Dict, Optional +from warnings import catch_warnings + +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError + +from pandas import DataFrame, get_option + +from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url + + +def get_engine(engine: str) -> "BaseImpl": + """ return our implementation """ + + if engine == "auto": + engine = get_option("io.parquet.engine") + + if engine == "auto": + # try engines in this order + try: + return PyArrowImpl() + except ImportError: + pass + + try: + return FastParquetImpl() + except ImportError: + pass + + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'pyarrow', 'fastparquet'.\n" + "pyarrow or fastparquet is required for parquet " + "support" + ) + + if engine == "pyarrow": + return PyArrowImpl() + elif engine == "fastparquet": + return FastParquetImpl() + + raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") + + +class BaseImpl: + @staticmethod + def validate_dataframe(df: DataFrame): + + if not isinstance(df, DataFrame): + raise ValueError("to_parquet only supports IO with DataFrames") + + # must have value column names (strings only) + if df.columns.inferred_type not in {"string", "unicode", "empty"}: + raise ValueError("parquet must have string column names") + + # index level names must be strings + valid_names = all( + isinstance(name, str) for name in df.index.names if name is not None + ) + if not valid_names: + raise ValueError("Index level names must be strings") + + def write(self, df: DataFrame, path, compression, **kwargs): + raise AbstractMethodError(self) + + def read(self, path, columns=None, **kwargs): + raise AbstractMethodError(self) + + +class PyArrowImpl(BaseImpl): + def __init__(self): + import_optional_dependency( + "pyarrow", extra="pyarrow is required for parquet support." 
+ ) + import pyarrow.parquet + + # import utils to register the pyarrow extension types + import pandas.core.arrays._arrow_utils # noqa + + self.api = pyarrow + + def write( + self, + df: DataFrame, + path, + compression="snappy", + coerce_timestamps="ms", + index: Optional[bool] = None, + partition_cols=None, + **kwargs, + ): + self.validate_dataframe(df) + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") + + from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)} + if index is not None: + from_pandas_kwargs["preserve_index"] = index + + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + if partition_cols is not None: + self.api.parquet.write_to_dataset( + table, + path, + compression=compression, + coerce_timestamps=coerce_timestamps, + partition_cols=partition_cols, + **kwargs, + ) + else: + self.api.parquet.write_table( + table, + path, + compression=compression, + coerce_timestamps=coerce_timestamps, + **kwargs, + ) + + def read(self, path, columns=None, **kwargs): + path, _, _, should_close = get_filepath_or_buffer(path) + + kwargs["use_pandas_metadata"] = True + result = self.api.parquet.read_table( + path, columns=columns, **kwargs + ).to_pandas() + if should_close: + path.close() + + return result + + +class FastParquetImpl(BaseImpl): + def __init__(self): + # since pandas is a dependency of fastparquet + # we need to import on first use + fastparquet = import_optional_dependency( + "fastparquet", extra="fastparquet is required for parquet support." + ) + self.api = fastparquet + + def write( + self, + df: DataFrame, + path, + compression="snappy", + index=None, + partition_cols=None, + **kwargs, + ): + self.validate_dataframe(df) + # thriftpy/protocol/compact.py:339: + # DeprecationWarning: tostring() is deprecated. + # Use tobytes() instead. + + if "partition_on" in kwargs and partition_cols is not None: + raise ValueError( + "Cannot use both partition_on and " + "partition_cols. Use partition_cols for " + "partitioning data" + ) + elif "partition_on" in kwargs: + partition_cols = kwargs.pop("partition_on") + + if partition_cols is not None: + kwargs["file_scheme"] = "hive" + + if is_s3_url(path) or is_gcs_url(path): + # if path is s3:// or gs:// we need to open the file in 'wb' mode. + # TODO: Support 'ab' + + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") + # And pass the opened file to the fastparquet internal impl. + kwargs["open_with"] = lambda path, _: path + else: + path, _, _, _ = get_filepath_or_buffer(path) + + with catch_warnings(record=True): + self.api.write( + path, + df, + compression=compression, + write_index=index, + partition_on=partition_cols, + **kwargs, + ) + + def read(self, path, columns=None, **kwargs): + if is_s3_url(path): + from pandas.io.s3 import get_file_and_filesystem + + # When path is s3:// an S3File is returned. + # We need to retain the original path(str) while also + # pass the S3File().open function to fsatparquet impl. + s3, filesystem = get_file_and_filesystem(path) + try: + parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) + finally: + s3.close() + else: + path, _, _, _ = get_filepath_or_buffer(path) + parquet_file = self.api.ParquetFile(path) + + return parquet_file.to_pandas(columns=columns, **kwargs) + + +def to_parquet( + df: DataFrame, + path, + engine: str = "auto", + compression="snappy", + index: Optional[bool] = None, + partition_cols=None, + **kwargs, +): + """ + Write a DataFrame to the parquet format. 
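A minimal usage sketch (``df`` and ``'out.parquet'`` are placeholders, not
fixed names from this module):

>>> to_parquet(df, 'out.parquet', engine='pyarrow', compression='snappy')  # doctest: +SKIP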
+ + Parameters + ---------- + df : DataFrame + path : str + File path or Root Directory path. Will be used as Root Directory path + while writing a partitioned dataset. + + .. versionchanged:: 0.24.0 + + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``True`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + + .. versionadded:: 0.24.0 + + partition_cols : str or list, optional, default None + Column names by which to partition the dataset + Columns are partitioned in the order they are given + + .. versionadded:: 0.24.0 + + kwargs + Additional keyword arguments passed to the engine + """ + if isinstance(partition_cols, str): + partition_cols = [partition_cols] + impl = get_engine(engine) + return impl.write( + df, + path, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs, + ) + + +def read_parquet(path, engine: str = "auto", columns=None, **kwargs): + """ + Load a parquet object from the file path, returning a DataFrame. + + .. versionadded:: 0.21.0 + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables`` + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + columns : list, default=None + If not None, only these columns will be read from the file. + + .. versionadded:: 0.21.1 + **kwargs + Any additional kwargs are passed to the engine. 
+ + Returns + ------- + DataFrame + """ + + impl = get_engine(engine) + return impl.read(path, columns=columns, **kwargs) diff --git a/venv/Lib/site-packages/pandas/io/parsers.py b/venv/Lib/site-packages/pandas/io/parsers.py new file mode 100644 index 0000000..b4eb2fb --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/parsers.py @@ -0,0 +1,3671 @@ +""" +Module contains tools for processing files into DataFrames or other objects +""" + +from collections import abc, defaultdict +import csv +import datetime +from io import BufferedIOBase, StringIO, TextIOWrapper +import re +import sys +from textwrap import fill +from typing import Any, Dict, Set +import warnings + +import numpy as np + +import pandas._libs.lib as lib +import pandas._libs.ops as libops +import pandas._libs.parsers as parsers +from pandas._libs.parsers import STR_NA_VALUES +from pandas._libs.tslibs import parsing +from pandas._typing import FilePathOrBuffer +from pandas.errors import ( + AbstractMethodError, + EmptyDataError, + ParserError, + ParserWarning, +) +from pandas.util._decorators import Appender + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + ensure_object, + ensure_str, + is_bool_dtype, + is_categorical_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_file_like, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms +from pandas.core.arrays import Categorical +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + Index, + MultiIndex, + RangeIndex, + ensure_index_from_sequences, +) +from pandas.core.series import Series +from pandas.core.tools import datetimes as tools + +from pandas.io.common import ( + get_filepath_or_buffer, + get_handle, + infer_compression, + validate_header_arg, +) +from pandas.io.date_converters import generic_parser + +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. +_BOM = "\ufeff" + +_doc_read_csv_and_table = ( + r""" +{summary} + +Also supports optionally iterating or breaking of the file +into chunks. + +Additional help can be found in the online docs for +`IO Tools `_. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handler (e.g. via builtin ``open`` function) or ``StringIO``. +sep : str, default {_default_sep} + Delimiter to use. If sep is None, the C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator by Python's builtin sniffer + tool, ``csv.Sniffer``. In addition, separators longer than 1 character and + different from ``'\s+'`` will be interpreted as regular expressions and + will also force the use of the Python parsing engine. Note that regex + delimiters are prone to ignoring quoted data. 
Regex example: ``'\r\t'``. +delimiter : str, default ``None`` + Alias for sep. +header : int, list of int, default 'infer' + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. +names : array-like, optional + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. +index_col : int, str, sequence of int / str, or False, default ``None`` + Column(s) to use as the row labels of the ``DataFrame``, either given as + string name or column index. If a sequence of int / str is given, a + MultiIndex is used. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g. when you have a malformed file with delimiters at + the end of each line. +usecols : list-like or callable, optional + Return a subset of the columns. If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in `names` or + inferred from the document header row(s). For example, a valid list-like + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. +squeeze : bool, default False + If the parsed data only contains one column then return a Series. +prefix : str, optional + Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... +mangle_dupe_cols : bool, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. +dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. +engine : {{'c', 'python'}}, optional + Parser engine to use. 
The C engine is faster while the python engine is + currently more feature-complete. +converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. +true_values : list, optional + Values to consider as True. +false_values : list, optional + Values to consider as False. +skipinitialspace : bool, default False + Skip spaces after delimiter. +skiprows : list-like, int or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning True if the row should be skipped and False otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. +skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with engine='c'). +nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. +na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted as + NaN: '""" + + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + + """'. +keep_default_na : bool, default True + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file. +verbose : bool, default False + Indicate number of NA values placed in non-numeric columns. +skip_blank_lines : bool, default True + If True, skip over blank lines rather than interpreting as NaN values. +parse_dates : bool or list of int or names or list of lists or dict, \ +default False + The behavior is as follows: + + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + If a column or index cannot be represented as an array of datetimes, + say because of an unparseable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. + + Note: A fast-path exists for iso8601-formatted dates. 
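    For instance, assuming a hypothetical file with an ISO-formatted
    ``created_at`` column:

    >>> pd.read_csv('data.csv', parse_dates=['created_at'])  # doctest: +SKIP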
+infer_datetime_format : bool, default False + If True and `parse_dates` is enabled, pandas will attempt to infer the + format of the datetime strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. +keep_date_col : bool, default False + If True and `parse_dates` specifies combining multiple columns then + keep the original columns. +date_parser : function, optional + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses ``dateutil.parser.parser`` to do the + conversion. Pandas will try to call `date_parser` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by `parse_dates` into a single array + and pass that; and 3) call `date_parser` once for each row using one or + more strings (corresponding to the columns defined by `parse_dates`) as + arguments. +dayfirst : bool, default False + DD/MM format dates, international and European format. +cache_dates : bool, default True + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + + .. versionadded:: 0.25.0 +iterator : bool, default False + Return TextFileReader object for iteration or getting chunks with + ``get_chunk()``. +chunksize : int, optional + Return TextFileReader object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. +compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. +thousands : str, optional + Thousands separator. +decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European data). +lineterminator : str (length 1), optional + Character to break file into lines. Only valid with C parser. +quotechar : str (length 1), optional + The character used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. +quoting : int or csv.QUOTE_* instance, default 0 + Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of + QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). +doublequote : bool, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. +escapechar : str (length 1), optional + One-character string used to escape other characters. +comment : str, optional + Indicates remainder of line should not be parsed. If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` but not by + `skiprows`. 
For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being + treated as the header. +encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python + standard encodings + `_ . +dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. +error_bad_lines : bool, default True + Lines with too many fields (e.g. a csv line with too many commas) will by + default cause an exception to be raised, and no DataFrame will be returned. + If False, then these "bad lines" will dropped from the DataFrame that is + returned. +warn_bad_lines : bool, default True + If error_bad_lines is False, and warn_bad_lines is True, a warning for each + "bad line" will be output. +delim_whitespace : bool, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be + used as the sep. Equivalent to setting ``sep='\\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. +low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set False, or specify the type with the `dtype` parameter. + Note that the entire file is read into a single DataFrame regardless, + use the `chunksize` or `iterator` parameter to return the data in chunks. + (Only valid with C parser). +memory_map : bool, default False + If a filepath is provided for `filepath_or_buffer`, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. +float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are `None` for the ordinary converter, + `high` for the high-precision converter, and `round_trip` for the + round-trip converter. + +Returns +------- +DataFrame or TextParser + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + +See Also +-------- +to_csv : Write DataFrame to a comma-separated values (csv) file. +read_csv : Read a comma-separated values (csv) file into DataFrame. +read_fwf : Read a table of fixed-width formatted lines into DataFrame. + +Examples +-------- +>>> pd.{func_name}('data.csv') # doctest: +SKIP +""" +) + + +def _validate_integer(name, val, min_val=0): + """ + Checks whether the 'name' parameter for parsing is either + an integer OR float that can SAFELY be cast to an integer + without losing accuracy. Raises a ValueError if that is + not the case. + + Parameters + ---------- + name : string + Parameter name (used for error reporting) + val : int or float + The value to check + min_val : int + Minimum allowed value (val < min_val will result in a ValueError) + """ + msg = f"'{name:s}' must be an integer >={min_val:d}" + + if val is not None: + if is_float(val): + if int(val) != val: + raise ValueError(msg) + val = int(val) + elif not (is_integer(val) and val >= min_val): + raise ValueError(msg) + + return val + + +def _validate_names(names): + """ + Raise ValueError if the `names` parameter contains duplicates. 
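A quick sketch of the failure mode (hypothetical input):

>>> _validate_names(['a', 'b', 'b'])  # doctest: +SKIP
Traceback (most recent call last):
    ...
ValueError: Duplicate names are not allowed.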
+ + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + + Raises + ------ + ValueError + If names are not unique. + """ + + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicate names are not allowed.") + + +def _read(filepath_or_buffer: FilePathOrBuffer, kwds): + """Generic reader of line files.""" + encoding = kwds.get("encoding", None) + if encoding is not None: + encoding = re.sub("_", "-", encoding).lower() + kwds["encoding"] = encoding + + compression = kwds.get("compression", "infer") + compression = infer_compression(filepath_or_buffer, compression) + + # TODO: get_filepath_or_buffer could return + # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] + # though mypy handling of conditional imports is difficult. + # See https://github.com/python/mypy/issues/1297 + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, encoding, compression + ) + kwds["compression"] = compression + + if kwds.get("date_parser", None) is not None: + if isinstance(kwds["parse_dates"], bool): + kwds["parse_dates"] = True + + # Extract some of the arguments (pass chunksize on). + iterator = kwds.get("iterator", False) + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) + + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + + # Create the parser. + parser = TextFileReader(fp_or_buf, **kwds) + + if chunksize or iterator: + return parser + + try: + data = parser.read(nrows) + finally: + parser.close() + + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass + + return data + + +_parser_defaults = { + "delimiter": None, + "escapechar": None, + "quotechar": '"', + "quoting": csv.QUOTE_MINIMAL, + "doublequote": True, + "skipinitialspace": False, + "lineterminator": None, + "header": "infer", + "index_col": None, + "names": None, + "prefix": None, + "skiprows": None, + "skipfooter": 0, + "nrows": None, + "na_values": None, + "keep_default_na": True, + "true_values": None, + "false_values": None, + "converters": None, + "dtype": None, + "cache_dates": True, + "thousands": None, + "comment": None, + "decimal": ".", + # 'engine': 'c', + "parse_dates": False, + "keep_date_col": False, + "dayfirst": False, + "date_parser": None, + "usecols": None, + # 'iterator': False, + "chunksize": None, + "verbose": False, + "encoding": None, + "squeeze": False, + "compression": None, + "mangle_dupe_cols": True, + "infer_datetime_format": False, + "skip_blank_lines": True, +} + + +_c_parser_defaults = { + "delim_whitespace": False, + "na_filter": True, + "low_memory": True, + "memory_map": False, + "error_bad_lines": True, + "warn_bad_lines": True, + "float_precision": None, +} + +_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} + +_c_unsupported = {"skipfooter"} +_python_unsupported = {"low_memory", "float_precision"} + +_deprecated_defaults: Dict[str, Any] = {} +_deprecated_args: Set[str] = set() + + +def _make_parser_function(name, default_sep=","): + def parser_f( + filepath_or_buffer: FilePathOrBuffer, + sep=default_sep, + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + 
skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal: str = ".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, + ): + + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + sep_override = delimiter is None and sep == default_sep + kwds = dict(sep_override=sep_override) + else: + kwds = dict() + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + if delim_whitespace and delimiter != default_sep: + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only " + "specify one." 
+ ) + + if engine is not None: + engine_specified = True + else: + engine = "c" + engine_specified = False + + kwds.update( + delimiter=delimiter, + engine=engine, + dialect=dialect, + compression=compression, + engine_specified=engine_specified, + doublequote=doublequote, + escapechar=escapechar, + quotechar=quotechar, + quoting=quoting, + skipinitialspace=skipinitialspace, + lineterminator=lineterminator, + header=header, + index_col=index_col, + names=names, + prefix=prefix, + skiprows=skiprows, + skipfooter=skipfooter, + na_values=na_values, + true_values=true_values, + false_values=false_values, + keep_default_na=keep_default_na, + thousands=thousands, + comment=comment, + decimal=decimal, + parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, + date_parser=date_parser, + cache_dates=cache_dates, + nrows=nrows, + iterator=iterator, + chunksize=chunksize, + converters=converters, + dtype=dtype, + usecols=usecols, + verbose=verbose, + encoding=encoding, + squeeze=squeeze, + memory_map=memory_map, + float_precision=float_precision, + na_filter=na_filter, + delim_whitespace=delim_whitespace, + warn_bad_lines=warn_bad_lines, + error_bad_lines=error_bad_lines, + low_memory=low_memory, + mangle_dupe_cols=mangle_dupe_cols, + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines, + ) + + return _read(filepath_or_buffer, kwds) + + parser_f.__name__ = name + + return parser_f + + +read_csv = _make_parser_function("read_csv", default_sep=",") +read_csv = Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary="Read a comma-separated values (csv) file into DataFrame.", + _default_sep="','", + ) +)(read_csv) + +read_table = _make_parser_function("read_table", default_sep="\t") +read_table = Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + _default_sep=r"'\\t' (tab-stop)", + ) +)(read_table) + + +def read_fwf( + filepath_or_buffer: FilePathOrBuffer, + colspecs="infer", + widths=None, + infer_nrows=100, + **kwds, +): + + r""" + Read a table of fixed-width formatted lines into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the `online docs for IO Tools + `_. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.csv``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + colspecs : list of tuple (int, int) or 'infer'. optional + A list of tuples giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data which are not being skipped via skiprows (default='infer'). + widths : list of int, optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. + infer_nrows : int, default 100 + The number of rows to consider when letting the parser determine the + `colspecs`. + + .. 
versionadded:: 0.24.0 + **kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. + + Returns + ------- + DataFrame or TextParser + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Examples + -------- + >>> pd.read_fwf('data.csv') # doctest: +SKIP + """ + + # Check input arguments. + if colspecs is None and widths is None: + raise ValueError("Must specify either colspecs or widths") + elif colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and 'colspecs'") + + # Compute 'colspecs' from 'widths', if specified. + if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append((col, col + w)) + col += w + + kwds["colspecs"] = colspecs + kwds["infer_nrows"] = infer_nrows + kwds["engine"] = "python-fwf" + return _read(filepath_or_buffer, kwds) + + +class TextFileReader(abc.Iterator): + """ + + Passed dialect overrides any of the related parser options + + """ + + def __init__(self, f, engine=None, **kwds): + + self.f = f + + if engine is not None: + engine_specified = True + else: + engine = "python" + engine_specified = False + + self._engine_specified = kwds.get("engine_specified", engine_specified) + + if kwds.get("dialect") is not None: + dialect = kwds["dialect"] + if dialect in csv.list_dialects(): + dialect = csv.get_dialect(dialect) + + # Any valid dialect should have these attributes. + # If any are missing, we will raise automatically. + for param in ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", + ): + try: + dialect_val = getattr(dialect, param) + except AttributeError: + raise ValueError(f"Invalid dialect {kwds['dialect']} provided") + parser_default = _parser_defaults[param] + provided = kwds.get(param, parser_default) + + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. + conflict_msgs = [] + + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided != parser_default and provided != dialect_val: + msg = ( + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." + ) + + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. 
+ if not (param == "delimiter" and kwds.pop("sep_override", False)): + conflict_msgs.append(msg) + + if conflict_msgs: + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2 + ) + kwds[param] = dialect_val + + if kwds.get("skipfooter"): + if kwds.get("iterator") or kwds.get("chunksize"): + raise ValueError("'skipfooter' not supported for 'iteration'") + if kwds.get("nrows"): + raise ValueError("'skipfooter' not supported with 'nrows'") + + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None + + self.orig_options = kwds + + # miscellanea + self.engine = engine + self._engine = None + self._currow = 0 + + options = self._get_options_with_defaults(engine) + + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) + self.squeeze = options.pop("squeeze", False) + + # might mutate self.engine + self.engine = self._check_file_or_buffer(f, engine) + self.options, self.engine = self._clean_options(options, engine) + + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] + + self._make_engine(self.engine) + + def close(self): + self._engine.close() + + def _get_options_with_defaults(self, engine): + kwds = self.orig_options + + options = {} + + for argname, default in _parser_defaults.items(): + value = kwds.get(argname, default) + + # see gh-12935 + if argname == "mangle_dupe_cols" and not value: + raise ValueError("Setting mangle_dupe_cols=False is not supported yet") + else: + options[argname] = value + + for argname, default in _c_parser_defaults.items(): + if argname in kwds: + value = kwds[argname] + + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: + pass + elif value == _deprecated_defaults.get(argname, default): + pass + else: + raise ValueError( + f"The {repr(argname)} option is not supported with the" + f" {repr(engine)} engine" + ) + else: + value = _deprecated_defaults.get(argname, default) + options[argname] = value + + if engine == "python-fwf": + for argname, default in _fwf_defaults.items(): + options[argname] = kwds.get(argname, default) + + return options + + def _check_file_or_buffer(self, f, engine): + # see gh-16530 + if is_file_like(f): + next_attr = "__next__" + + # The C engine doesn't need the file-like to have the "next" or + # "__next__" attribute. However, the Python engine explicitly calls + # "next(...)" when iterating through such an object, meaning it + # needs to have that attribute ("next" for Python 2.x, "__next__" + # for Python 3.x) + if engine != "c" and not hasattr(f, next_attr): + msg = "The 'python' engine cannot iterate through this file buffer." 
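# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch. It
# relates to the engine fallback implemented in _clean_options (defined just
# below): a multi-character / regex separator is not supported by the C
# engine, so the reader falls back to the Python engine (with a
# ParserWarning) unless engine="python" is requested explicitly. The sample
# data is made up.
import io
import pandas as pd

raw = "a;  b;;c\n1;  2;;3\n"

# A regex separator forces the Python engine; requesting engine="python"
# explicitly avoids the fallback warning.
df = pd.read_csv(io.StringIO(raw), sep=r";+\s*", engine="python")
print(df.columns.tolist())  # ['a', 'b', 'c']
# --------------------------------------------------------------------------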
+ raise ValueError(msg) + + return engine + + def _clean_options(self, options, engine): + result = options.copy() + + engine_specified = self._engine_specified + fallback_reason = None + + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] + + # C engine not supported yet + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support skipfooter" + engine = "python" + + encoding = sys.getfilesystemencoding() or "utf-8" + if sep is None and not delim_whitespace: + if engine == "c": + fallback_reason = ( + "the 'c' engine does not support " + "sep=None with delim_whitespace=False" + ) + engine = "python" + elif sep is not None and len(sep) > 1: + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): + # wait until regex engine integrated + fallback_reason = ( + "the 'c' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are " + "interpreted as regex)" + ) + engine = "python" + elif delim_whitespace: + if "python" in engine: + result["delimiter"] = r"\s+" + elif sep is not None: + encodeable = True + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + f"the separator encoded in {encoding} " + "is > 1 char long, and the 'c' engine " + "does not support such separators" + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + "and the 'c' engine does not support " + "such quotechars" + ) + engine = "python" + + if fallback_reason and engine_specified: + raise ValueError(fallback_reason) + + if engine == "c": + for arg in _c_unsupported: + del result[arg] + + if "python" in engine: + for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults[arg]: + raise ValueError( + "Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {repr(arg)} to be " + "ignored as it is not supported by the 'python' engine." + ) + del result[arg] + + if fallback_reason: + warnings.warn( + ( + "Falling back to the 'python' engine because " + f"{fallback_reason}; you can avoid this warning by specifying " + "engine='python'." + ), + ParserWarning, + stacklevel=5, + ) + + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] + + validate_header_arg(options["header"]) + + depr_warning = "" + + for arg in _deprecated_args: + parser_default = _c_parser_defaults[arg] + depr_default = _deprecated_defaults[arg] + + msg = ( + f"The {repr(arg)} argument has been deprecated and will be " + "removed in a future version." 
+ ) + + if result.get(arg, depr_default) != depr_default: + depr_warning += msg + "\n\n" + else: + result[arg] = parser_default + + if depr_warning != "": + warnings.warn(depr_warning, FutureWarning, stacklevel=2) + + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") + if _is_index_col(index_col): + if not isinstance(index_col, (list, tuple, np.ndarray)): + index_col = [index_col] + result["index_col"] = index_col + + names = list(names) if names is not None else names + + # type conversion-related + if converters is not None: + if not isinstance(converters, dict): + raise TypeError( + "Type converters must be a dict or subclass, " + f"input was a {type(converters).__name__}" + ) + else: + converters = {} + + # Converting values to NA + keep_default_na = options["keep_default_na"] + na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + + # handle skiprows; this is internally handled by the + # c-engine, so only need for python parsers + if engine != "c": + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) + + # put stuff back + result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows + + return result, engine + + def __next__(self): + try: + return self.get_chunk() + except StopIteration: + self.close() + raise + + def _make_engine(self, engine="c"): + if engine == "c": + self._engine = CParserWrapper(self.f, **self.options) + else: + if engine == "python": + klass = PythonParser + elif engine == "python-fwf": + klass = FixedWidthFieldParser + else: + raise ValueError( + f"Unknown engine: {engine} (valid options are " + '"c", "python", or ' + '"python-fwf")' + ) + self._engine = klass(self.f, **self.options) + + def _failover_to_python(self): + raise AbstractMethodError(self) + + def read(self, nrows=None): + nrows = _validate_integer("nrows", nrows) + ret = self._engine.read(nrows) + + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + + df = DataFrame(col_dict, columns=columns, index=index) + + self._currow += new_rows + + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]].copy() + return df + + def _create_index(self, ret): + index, columns, col_dict = ret + return index, columns, col_dict + + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + if self.nrows is not None: + if self._currow >= self.nrows: + raise StopIteration + size = min(size, self.nrows - self._currow) + return self.read(nrows=size) + + +def _is_index_col(col): + return col is not None and col is not False + + +def _is_potential_multi_index(columns): + """ + Check whether or not the `columns` parameter + could be converted into a MultiIndex. 
+ + Parameters + ---------- + columns : array-like + Object which may or may not be convertible into a MultiIndex + + Returns + ------- + boolean : Whether or not columns could become a MultiIndex + """ + return ( + len(columns) + and not isinstance(columns, MultiIndex) + and all(isinstance(c, tuple) for c in columns) + ) + + +def _evaluate_usecols(usecols, names): + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols + + +def _validate_usecols_names(usecols, names): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. Error message will list them. + """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + f"columns expected but not found: {missing}" + ) + + return usecols + + +def _validate_skipfooter_arg(skipfooter): + """ + Validate the 'skipfooter' parameter. + + Checks whether 'skipfooter' is a non-negative integer. + Raises a ValueError if that is not the case. + + Parameters + ---------- + skipfooter : non-negative integer + The number of rows to skip at the end of the file. + + Returns + ------- + validated_skipfooter : non-negative integer + The original input if the validation succeeds. + + Raises + ------ + ValueError : 'skipfooter' was not a non-negative integer. + """ + + if not is_integer(skipfooter): + raise ValueError("skipfooter must be an integer") + + if skipfooter < 0: + raise ValueError("skipfooter cannot be negative") + + return skipfooter + + +def _validate_usecols_arg(usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. 
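# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch. It
# demonstrates the three forms of `usecols` accepted by the validation in
# _validate_usecols_arg: all column labels, all column positions, or a
# callable applied to each header name; mixing labels and positions raises
# ValueError. The sample data is made up.
import io
import pandas as pd

raw = "id,name,score,comment\n1,ann,3.5,ok\n2,bob,4.0,fine\n"

by_name = pd.read_csv(io.StringIO(raw), usecols=["id", "score"])
by_position = pd.read_csv(io.StringIO(raw), usecols=[0, 2])
by_callable = pd.read_csv(io.StringIO(raw), usecols=lambda c: c != "comment")

print(by_name.columns.tolist())      # ['id', 'score']
print(by_position.columns.tolist())  # ['id', 'score']
print(by_callable.columns.tolist())  # ['id', 'name', 'score']
# --------------------------------------------------------------------------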
+ raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string", "unicode"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None + + +def _validate_parse_dates_arg(parse_dates): + """ + Check whether or not the 'parse_dates' parameter + is a non-boolean scalar. Raises a ValueError if + that is the case. + """ + msg = ( + "Only booleans, lists, and " + "dictionaries are accepted " + "for the 'parse_dates' parameter" + ) + + if parse_dates is not None: + if is_scalar(parse_dates): + if not lib.is_bool(parse_dates): + raise TypeError(msg) + + elif not isinstance(parse_dates, (list, dict)): + raise TypeError(msg) + + return parse_dates + + +class ParserBase: + def __init__(self, kwds): + self.names = kwds.get("names") + self.orig_names = None + self.prefix = kwds.pop("prefix", None) + + self.index_col = kwds.get("index_col", None) + self.unnamed_cols = set() + self.index_names = None + self.col_names = None + + self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self.date_parser = kwds.pop("date_parser", None) + self.dayfirst = kwds.pop("dayfirst", False) + self.keep_date_col = kwds.pop("keep_date_col", False) + + self.na_values = kwds.get("na_values") + self.na_fvalues = kwds.get("na_fvalues") + self.na_filter = kwds.get("na_filter", False) + self.keep_default_na = kwds.get("keep_default_na", True) + + self.true_values = kwds.get("true_values") + self.false_values = kwds.get("false_values") + self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) + self.infer_datetime_format = kwds.pop("infer_datetime_format", False) + self.cache_dates = kwds.pop("cache_dates", True) + + self._date_conv = _make_date_converter( + date_parser=self.date_parser, + dayfirst=self.dayfirst, + infer_datetime_format=self.infer_datetime_format, + cache_dates=self.cache_dates, + ) + + # validate header options for mi + self.header = kwds.get("header") + if isinstance(self.header, (list, tuple, np.ndarray)): + if not all(map(is_integer, self.header)): + raise ValueError("header must be integer or list of integers") + if any(i < 0 for i in self.header): + raise ValueError( + "cannot specify multi-index header with negative integers" + ) + if kwds.get("usecols"): + raise ValueError( + "cannot specify usecols when specifying a multi-index header" + ) + if kwds.get("names"): + raise ValueError( + "cannot specify names when specifying a multi-index header" + ) + + # validate index_col that only contains integers + if self.index_col is not None: + is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) + if not ( + is_sequence + and all(map(is_integer, self.index_col)) + or is_integer(self.index_col) + ): + raise ValueError( + "index_col must only contain row numbers " + "when specifying a multi-index header" + ) + + # GH 16338 + elif self.header is not None and not is_integer(self.header): + raise ValueError("header must be integer or list of integers") + + # GH 27779 + elif self.header is not None and self.header < 0: + raise ValueError( + "Passing negative integer to header is invalid. 
" + "For no header, use header=None instead" + ) + + self._name_processed = False + + self._first_chunk = True + + # GH 13932 + # keep references to file handles opened by the parser itself + self.handles = [] + + def close(self): + for f in self.handles: + f.close() + + @property + def _has_complex_date_col(self): + return isinstance(self.parse_dates, dict) or ( + isinstance(self.parse_dates, list) + and len(self.parse_dates) > 0 + and isinstance(self.parse_dates[0], list) + ) + + def _should_parse_dates(self, i): + if isinstance(self.parse_dates, bool): + return self.parse_dates + else: + if self.index_names is not None: + name = self.index_names[i] + else: + name = None + j = self.index_col[i] + + if is_scalar(self.parse_dates): + return (j == self.parse_dates) or ( + name is not None and name == self.parse_dates + ) + else: + return (j in self.parse_dates) or ( + name is not None and name in self.parse_dates + ) + + def _extract_multi_indexer_columns( + self, header, index_names, col_names, passed_names=False + ): + """ extract and return the names, index_names, col_names + header is a list-of-lists returned from the parsers """ + if len(header) < 2: + return header[0], index_names, col_names, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if ic is None: + ic = [] + + if not isinstance(ic, (list, tuple, np.ndarray)): + ic = [ic] + sic = set(ic) + + # clean the index_names + index_names = header.pop(-1) + index_names, names, index_col = _clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) + + # extract the columns + field_count = len(header[0]) + + def extract(r): + return tuple(r[i] for i in range(field_count) if i not in sic) + + columns = list(zip(*(extract(r) for r in header))) + names = ic + columns + + # If we find unnamed columns all in a single + # level, then our header was too long. + for n in range(len(columns[0])): + if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): + raise ParserError( + "Passed header=[{header}] are too many rows for this " + "multi_index of columns".format( + header=",".join(str(x) for x in self.header) + ) + ) + + # Clean the column names (if we have an index_col). + if len(ic): + col_names = [ + r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None + for r in header + ] + else: + col_names = [None] * len(header) + + passed_names = True + + return names, index_names, col_names, passed_names + + def _maybe_dedup_names(self, names): + # see gh-7160 and gh-9424: this helps to provide + # immediate alleviation of the duplicate names + # issue and appears to be satisfactory to users, + # but ultimately, not needing to butcher the names + # would be nice! 
+ if self.mangle_dupe_cols: + names = list(names) # so we can index + counts = defaultdict(int) + is_potential_mi = _is_potential_multi_index(names) + + for i, col in enumerate(names): + cur_count = counts[col] + + while cur_count > 0: + counts[col] = cur_count + 1 + + if is_potential_mi: + col = col[:-1] + (f"{col[-1]}.{cur_count}",) + else: + col = f"{col}.{cur_count}" + cur_count = counts[col] + + names[i] = col + counts[col] = cur_count + 1 + + return names + + def _maybe_make_multi_index_columns(self, columns, col_names=None): + # possibly create a column mi here + if _is_potential_multi_index(columns): + columns = MultiIndex.from_tuples(columns, names=col_names) + return columns + + def _make_index(self, data, alldata, columns, indexnamerow=False): + if not _is_index_col(self.index_col) or not self.index_col: + index = None + + elif not self._has_complex_date_col: + index = self._get_simple_index(alldata, columns) + index = self._agg_index(index) + elif self._has_complex_date_col: + if not self._name_processed: + (self.index_names, _, self.index_col) = _clean_index_names( + list(columns), self.index_col, self.unnamed_cols + ) + self._name_processed = True + index = self._get_complex_date_index(data, columns) + index = self._agg_index(index, try_parse_dates=False) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns + + _implicit_index = False + + def _get_simple_index(self, data, columns): + def ix(col): + if not isinstance(col, str): + return col + raise ValueError(f"Index {col} invalid") + + to_remove = [] + index = [] + for idx in self.index_col: + i = ix(idx) + to_remove.append(i) + index.append(data[i]) + + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + data.pop(i) + if not self._implicit_index: + columns.pop(i) + + return index + + def _get_complex_date_index(self, data, col_names): + def _get_name(icol): + if isinstance(icol, str): + return icol + + if col_names is None: + raise ValueError(f"Must supply column order to use {icol!s} as index") + + for i, c in enumerate(col_names): + if i == icol: + return c + + to_remove = [] + index = [] + for idx in self.index_col: + name = _get_name(idx) + to_remove.append(name) + index.append(data[name]) + + # remove index items from content and columns, don't pop in + # loop + for c in sorted(to_remove, reverse=True): + data.pop(c) + col_names.remove(c) + + return index + + def _agg_index(self, index, try_parse_dates=True): + arrays = [] + + for i, arr in enumerate(index): + + if try_parse_dates and self._should_parse_dates(i): + arr = self._date_conv(arr) + + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() + + if isinstance(self.na_values, dict): + col_name = self.index_names[i] + if col_name is not None: + col_na_values, col_na_fvalues = _get_na_values( + col_name, self.na_values, self.na_fvalues, self.keep_default_na + ) + + arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) + arrays.append(arr) + + names = self.index_names + index = ensure_index_from_sequences(arrays, names) + + return index + + def _convert_to_ndarrays( + self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None + ): + result = {} + for c, values in 
dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = _get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will " + "be used" + ), + ParserWarning, + stacklevel=7, + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, try_num_bool=False + ) + else: + is_str_or_ea_dtype = is_string_dtype( + cast_type + ) or is_extension_array_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, try_num_bool + ) + + # type specified in dtype param or cast_type is an EA + if cast_type and ( + not is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type) + ): + try: + if ( + is_bool_dtype(cast_type) + and not is_categorical_dtype(cast_type) + and na_count > 0 + ): + raise ValueError(f"Bool column has NA values in column {c}") + except (AttributeError, TypeError): + # invalid input to is_bool_dtype + pass + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + if verbose and na_count: + print(f"Filled {na_count} NA values in column {c!s}") + return result + + def _infer_types(self, values, na_values, try_num_bool=True): + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + mask = algorithms.isin(values, list(na_values)) + na_count = mask.sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result = lib.maybe_convert_numeric(values, na_values, False) + except (ValueError, TypeError): + # e.g. 
encountering datetime string gets ValueError + # TypeError can be raised in floatify + result = values + na_count = parsers.sanitize_objects(result, na_values, False) + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values, False) + + if result.dtype == np.object_ and try_num_bool: + result = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + ) + + return result, na_count + + def _cast_types(self, values, cast_type, column): + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + + if is_categorical_dtype(cast_type): + known_cats = ( + isinstance(cast_type, CategoricalDtype) + and cast_type.categories is not None + ) + + if not is_object_dtype(values) and not known_cats: + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + values = astype_nansafe(values, str) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif is_extension_array_dtype(cast_type): + # ensure cast_type is an actual dtype and not a string + cast_type = pandas_dtype(cast_type) + array_type = cast_type.construct_array_type() + try: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods" + ) + + else: + try: + values = astype_nansafe(values, cast_type, copy=True, skipna=True) + except ValueError: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) + return values + + def _do_date_conversions(self, names, data): + # returns data, columns + + if self.parse_dates is not None: + data, names = _process_date_conversion( + data, + self._date_conv, + self.parse_dates, + self.index_col, + self.index_names, + names, + keep_date_col=self.keep_date_col, + ) + + return names, data + + +class CParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + kwds = kwds.copy() + + ParserBase.__init__(self, kwds) + + encoding = kwds.get("encoding") + + if kwds.get("compression") is None and encoding: + if isinstance(src, str): + src = open(src, "rb") + self.handles.append(src) + + # Handle the file object with universal line mode enabled. + # We will handle the newline character ourselves later on. 
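# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch. The
# code right around this point re-opens a string path in binary mode and then
# wraps the buffer in a TextIOWrapper when an explicit encoding is given,
# which is what makes non-UTF-8 files readable through the C engine. The file
# name is hypothetical; the file is created here only so the example runs.
import pandas as pd

path = "sample_latin1.csv"  # hypothetical file name
with open(path, "w", encoding="latin-1") as fh:
    fh.write("name,city\nJosé,Málaga\n")

df = pd.read_csv(path, encoding="latin-1")
print(df)
# --------------------------------------------------------------------------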
+ if isinstance(src, BufferedIOBase): + src = TextIOWrapper(src, encoding=encoding, newline="") + + kwds["encoding"] = "utf-8" + + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + self._reader = parsers.TextReader(src, **kwds) + self.unnamed_cols = self._reader.unnamed_cols + + passed_names = self.names is None + + if self._reader.header is None: + self.names = None + else: + if len(self._reader.header) > 1: + # we have a multi index in the columns + ( + self.names, + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names + ) + else: + self.names = list(self._reader.header[0]) + + if self.names is None: + if self.prefix: + self.names = [ + f"{self.prefix}{i}" for i in range(self._reader.table_width) + ] + else: + self.names = list(range(self._reader.table_width)) + + # gh-9755 + # + # need to set orig_names here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names + self.orig_names = self.names[:] + + if self.usecols: + usecols = _evaluate_usecols(self.usecols, self.orig_names) + + # GH 14671 + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): + _validate_usecols_names(usecols, self.orig_names) + + if len(self.names) > len(usecols): + self.names = [ + n + for i, n in enumerate(self.names) + if (i in usecols or n in usecols) + ] + + if len(self.names) < len(usecols): + _validate_usecols_names(usecols, self.names) + + self._set_noconvert_columns() + + self.orig_names = self.names + + if not self._has_complex_date_col: + if self._reader.leading_cols == 0 and _is_index_col(self.index_col): + + self._name_processed = True + (index_names, self.names, self.index_col) = _clean_index_names( + self.names, self.index_col, self.unnamed_cols + ) + + if self.index_names is None: + self.index_names = index_names + + if self._reader.header is None and not passed_names: + self.index_names = [None] * len(self.index_names) + + self._implicit_index = self._reader.leading_cols > 0 + + def close(self): + for f in self.handles: + f.close() + + # close additional handles opened by C parser (for compression) + try: + self._reader.close() + except ValueError: + pass + + def _set_noconvert_columns(self): + """ + Set the columns that should not undergo dtype conversions. + + Currently, any column that is involved with date parsing will not + undergo such conversions. + """ + names = self.orig_names + if self.usecols_dtype == "integer": + # A set of integers will be converted to a list in + # the correct order every single time. + usecols = list(self.usecols) + usecols.sort() + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): + # The names attribute should have the correct columns + # in the proper order for indexing with parse_dates. + usecols = self.names[:] + else: + # Usecols is empty. 
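# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch.
# Columns named in parse_dates are registered as "no convert" by
# _set_noconvert_columns (continued below) so they reach the date converter
# as raw strings; from the caller's side this is simply the parse_dates
# option, here combined with index_col. The sample data is made up.
import io
import pandas as pd

raw = "date,confirmed,healed\n2020-01-30,7711,124\n2020-01-31,9692,171\n"

df = pd.read_csv(io.StringIO(raw), parse_dates=["date"], index_col="date")
print(df.index.dtype)  # datetime64[ns]
# --------------------------------------------------------------------------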
+ usecols = None + + def _set(x): + if usecols is not None and is_integer(x): + x = usecols[x] + + if not is_integer(x): + x = names.index(x) + + self._reader.set_noconvert(x) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + + def set_error_bad_lines(self, status): + self._reader.set_error_bad_lines(int(status)) + + def read(self, nrows=None): + try: + data = self._reader.read(nrows) + except StopIteration: + if self._first_chunk: + self._first_chunk = False + names = self._maybe_dedup_names(self.orig_names) + index, columns, col_dict = _get_empty_meta( + names, + self.index_col, + self.index_names, + dtype=self.kwds.get("dtype"), + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + if self.usecols is not None: + columns = self._filter_usecols(columns) + + col_dict = dict( + filter(lambda item: item[0] in columns, col_dict.items()) + ) + + return index, columns, col_dict + + else: + raise + + # Done with first read, next time raise StopIteration + self._first_chunk = False + + names = self.names + + if self._reader.leading_cols: + if self._has_complex_date_col: + raise NotImplementedError("file structure not yet supported") + + # implicit index, no index names + arrays = [] + + for i in range(self._reader.leading_cols): + if self.index_col is None: + values = data.pop(i) + else: + values = data.pop(self.index_col[i]) + + values = self._maybe_parse_dates(values, i, try_parse_dates=True) + arrays.append(values) + + index = ensure_index_from_sequences(arrays) + + if self.usecols is not None: + names = self._filter_usecols(names) + + names = self._maybe_dedup_names(names) + + # rename dict keys + data = sorted(data.items()) + data = {k: v for k, (i, v) in zip(names, data)} + + names, data = self._do_date_conversions(names, data) + + else: + # rename dict keys + data = sorted(data.items()) + + # ugh, mutation + names = list(self.orig_names) + names = self._maybe_dedup_names(names) + + if self.usecols is not None: + names = self._filter_usecols(names) + + # columns as list + alldata = [x[1] for x in data] + + data = {k: v for k, (i, v) in zip(names, data)} + + names, data = self._do_date_conversions(names, data) + index, names = self._make_index(data, alldata, names) + + # maybe create a mi on the columns + names = self._maybe_make_multi_index_columns(names, self.col_names) + + return index, names, data + + def _filter_usecols(self, names): + # hackish + usecols = _evaluate_usecols(self.usecols, names) + if usecols is not None and len(names) != len(usecols): + names = [ + name for i, name in enumerate(names) if i in usecols or name in usecols + ] + return names + + def _get_index_names(self): + names = list(self._reader.header[0]) + idx_names = None + + if self._reader.leading_cols == 0 and self.index_col is not None: + (idx_names, names, self.index_col) = _clean_index_names( + names, self.index_col, self.unnamed_cols + ) + + return names, idx_names + + def _maybe_parse_dates(self, values, index, try_parse_dates=True): + if try_parse_dates and self._should_parse_dates(index): + values = self._date_conv(values) + return values + + +def TextParser(*args, 
**kwds): + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files + + Parameters + ---------- + data : file-like object or list + delimiter : separator character to use + dialect : str or csv.Dialect instance, optional + Ignored if delimiter is longer than 1 character + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, optional + Column or columns to use as the (possibly hierarchical) index + has_index_names: bool, default False + True if the cols defined in index_col have an index name and are + not in the header. + na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. + keep_default_na : bool, default True + thousands : str, optional + Thousands separator + comment : str, optional + Comment out remainder of line + parse_dates : bool, default False + keep_date_col : bool, default False + date_parser : function, optional + skiprows : list of integers + Row numbers to skip + skipfooter : int + Number of line at bottom of file to skip + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8') + squeeze : bool, default False + returns Series if only one column. + infer_datetime_format: bool, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. + float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are None for the ordinary converter, + 'high' for the high-precision converter, and 'round_trip' for the + round-trip converter. 
+ """ + kwds["engine"] = "python" + return TextFileReader(*args, **kwds) + + +def count_empty_vals(vals): + return sum(1 for v in vals if v == "" or v is None) + + +class PythonParser(ParserBase): + def __init__(self, f, **kwds): + """ + Workhorse function for processing nested list into DataFrame + """ + ParserBase.__init__(self, kwds) + + self.data = None + self.buf = [] + self.pos = 0 + self.line_pos = 0 + + self.encoding = kwds["encoding"] + self.compression = kwds["compression"] + self.memory_map = kwds["memory_map"] + self.skiprows = kwds["skiprows"] + + if callable(self.skiprows): + self.skipfunc = self.skiprows + else: + self.skipfunc = lambda x: x in self.skiprows + + self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) + self.delimiter = kwds["delimiter"] + + self.quotechar = kwds["quotechar"] + if isinstance(self.quotechar, str): + self.quotechar = str(self.quotechar) + + self.escapechar = kwds["escapechar"] + self.doublequote = kwds["doublequote"] + self.skipinitialspace = kwds["skipinitialspace"] + self.lineterminator = kwds["lineterminator"] + self.quoting = kwds["quoting"] + self.usecols, _ = _validate_usecols_arg(kwds["usecols"]) + self.skip_blank_lines = kwds["skip_blank_lines"] + + self.warn_bad_lines = kwds["warn_bad_lines"] + self.error_bad_lines = kwds["error_bad_lines"] + + self.names_passed = kwds["names"] or None + + self.has_index_names = False + if "has_index_names" in kwds: + self.has_index_names = kwds["has_index_names"] + + self.verbose = kwds["verbose"] + self.converters = kwds["converters"] + + self.dtype = kwds["dtype"] + self.thousands = kwds["thousands"] + self.decimal = kwds["decimal"] + + self.comment = kwds["comment"] + self._comment_lines = [] + + f, handles = get_handle( + f, + "r", + encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map, + ) + self.handles.extend(handles) + + # Set self.data to something that can read lines. + if hasattr(f, "readline"): + self._make_reader(f) + else: + self.data = f + + # Get columns in two steps: infer from data, then + # infer column indices from self.usecols if it is specified. + self._col_indices = None + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + + # Now self.columns has the set of columns that we will process. + # The original set is stored in self.original_columns. + if len(self.columns) > 1: + # we are processing a multi index column + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names + ) + # Update list of original names to include all indices. 
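# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch.
# TextParser (defined above) wraps the Python engine so that already
# tokenized rows -- lists of lists -- get the same type inference and chunked
# iteration as a CSV file would. The rows below are made up.
from pandas.io.parsers import TextParser

rows = [
    ["date", "confirmed", "healed"],
    ["2020-01-30", "7711", "124"],
    ["2020-01-31", "9692", "171"],
]

# header=0 uses the first row as column labels; chunksize makes the reader
# iterable so large inputs can be consumed piece by piece.
reader = TextParser(rows, header=0, chunksize=1)
for chunk in reader:
    print(chunk)
# --------------------------------------------------------------------------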
+ self.num_original_columns = len(self.columns) + else: + self.columns = self.columns[0] + + # get popped off for index + self.orig_names = list(self.columns) + + # needs to be cleaned/refactored + # multiple date column thing turning into a real spaghetti factory + + if not self._has_complex_date_col: + (index_names, self.orig_names, self.columns) = self._get_index_name( + self.columns + ) + self._name_processed = True + if self.index_names is None: + self.index_names = index_names + + if self.parse_dates: + self._no_thousands_columns = self._set_no_thousands_columns() + else: + self._no_thousands_columns = None + + if len(self.decimal) != 1: + raise ValueError("Only length-1 decimal markers supported") + + if self.thousands is None: + self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") + else: + self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") + + def _set_no_thousands_columns(self): + # Create a set of column ids that are not to be stripped of thousands + # operators. + noconvert_columns = set() + + def _set(x): + if is_integer(x): + noconvert_columns.add(x) + else: + noconvert_columns.add(self.columns.index(x)) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + + return noconvert_columns + + def _make_reader(self, f): + sep = self.delimiter + + if sep is None or len(sep) == 1: + if self.lineterminator: + raise ValueError( + "Custom line terminators not supported in python parser (yet)" + ) + + class MyDialect(csv.Dialect): + delimiter = self.delimiter + quotechar = self.quotechar + escapechar = self.escapechar + doublequote = self.doublequote + skipinitialspace = self.skipinitialspace + quoting = self.quoting + lineterminator = "\n" + + dia = MyDialect + + sniff_sep = True + + if sep is not None: + sniff_sep = False + dia.delimiter = sep + # attempt to sniff the delimiter + if sniff_sep: + line = f.readline() + while self.skipfunc(self.pos): + self.pos += 1 + line = f.readline() + + line = self._check_comments([line])[0] + + self.pos += 1 + self.line_pos += 1 + sniffed = csv.Sniffer().sniff(line) + dia.delimiter = sniffed.delimiter + + # Note: self.encoding is irrelevant here + line_rdr = csv.reader(StringIO(line), dialect=dia) + self.buf.extend(list(line_rdr)) + + # Note: self.encoding is irrelevant here + reader = csv.reader(f, dialect=dia, strict=True) + + else: + + def _read(): + line = f.readline() + pat = re.compile(sep) + + yield pat.split(line.strip()) + + for line in f: + yield pat.split(line.strip()) + + reader = _read() + + self.data = reader + + def read(self, rows=None): + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + columns = list(self.orig_names) + if not len(content): # pragma: no cover + # DataFrame with the right metadata, even though it's length 0 + names = self._maybe_dedup_names(self.orig_names) + index, columns, col_dict = _get_empty_meta( + names, self.index_col, self.index_names, self.dtype + ) + columns = self._maybe_make_multi_index_columns(columns, 
self.col_names) + return index, columns, col_dict + + # handle new style for names in index + count_empty_content_vals = count_empty_vals(content[0]) + indexnamerow = None + if self.has_index_names and count_empty_content_vals == len(columns): + indexnamerow = content[0] + content = content[1:] + + alldata = self._rows_to_cols(content) + data = self._exclude_implicit_index(alldata) + + columns = self._maybe_dedup_names(self.columns) + columns, data = self._do_date_conversions(columns, data) + + data = self._convert_data(data) + index, columns = self._make_index(data, alldata, columns, indexnamerow) + + return index, columns, data + + def _exclude_implicit_index(self, alldata): + names = self._maybe_dedup_names(self.orig_names) + + if self._implicit_index: + excl_indices = self.index_col + + data = {} + offset = 0 + for i, col in enumerate(names): + while i + offset in excl_indices: + offset += 1 + data[col] = alldata[i + offset] + else: + data = {k: v for k, v in zip(names, alldata)} + + return data + + # legacy + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + return self.read(rows=size) + + def _convert_data(self, data): + # apply converters + def _clean_mapping(mapping): + "converts col numbers to names" + clean = {} + for col, v in mapping.items(): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) + + # Apply NA values. + clean_na_values = {} + clean_na_fvalues = {} + + if isinstance(self.na_values, dict): + for col in self.na_values: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] + + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue + else: + clean_na_values = self.na_values + clean_na_fvalues = self.na_fvalues + + return self._convert_to_ndarrays( + data, + clean_na_values, + clean_na_fvalues, + self.verbose, + clean_conv, + clean_dtypes, + ) + + def _infer_columns(self): + names = self.names + num_original_columns = 0 + clear_buffer = True + unnamed_cols = set() + + if self.header is not None: + header = self.header + + if isinstance(header, (list, tuple, np.ndarray)): + have_mi_columns = len(header) > 1 + # we have a mi columns, so read an extra line + if have_mi_columns: + header = list(header) + [header[-1] + 1] + else: + have_mi_columns = False + header = [header] + + columns = [] + for level, hr in enumerate(header): + try: + line = self._buffered_line() + + while self.line_pos <= hr: + line = self._next_line() + + except StopIteration: + if self.line_pos < hr: + raise ValueError( + f"Passed header={hr} but only {self.line_pos + 1} lines in " + "file" + ) + + # We have an empty file, so check + # if columns are provided. 
That will + # serve as the 'line' for parsing + if have_mi_columns and hr > 0: + if clear_buffer: + self._clear_buffer() + columns.append([None] * len(columns[-1])) + return columns, num_original_columns, unnamed_cols + + if not self.names: + raise EmptyDataError("No columns to parse from file") + + line = self.names[:] + + this_columns = [] + this_unnamed_cols = [] + + for i, c in enumerate(line): + if c == "": + if have_mi_columns: + col_name = f"Unnamed: {i}_level_{level}" + else: + col_name = f"Unnamed: {i}" + + this_unnamed_cols.append(i) + this_columns.append(col_name) + else: + this_columns.append(c) + + if not have_mi_columns and self.mangle_dupe_cols: + counts = defaultdict(int) + + for i, col in enumerate(this_columns): + cur_count = counts[col] + + while cur_count > 0: + counts[col] = cur_count + 1 + col = f"{col}.{cur_count}" + cur_count = counts[col] + + this_columns[i] = col + counts[col] = cur_count + 1 + elif have_mi_columns: + + # if we have grabbed an extra line, but its not in our + # format so save in the buffer, and create an blank extra + # line for the rest of the parsing code + if hr == header[-1]: + lc = len(this_columns) + ic = len(self.index_col) if self.index_col is not None else 0 + unnamed_count = len(this_unnamed_cols) + + if lc != unnamed_count and lc - ic > unnamed_count: + clear_buffer = False + this_columns = [None] * lc + self.buf = [self.buf[-1]] + + columns.append(this_columns) + unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) + + if len(columns) == 1: + num_original_columns = len(this_columns) + + if clear_buffer: + self._clear_buffer() + + if names is not None: + if (self.usecols is not None and len(names) != len(self.usecols)) or ( + self.usecols is None and len(names) != len(columns[0]) + ): + raise ValueError( + "Number of passed names did not match " + "number of header fields in the file" + ) + if len(columns) > 1: + raise TypeError("Cannot pass names with multi-index columns") + + if self.usecols is not None: + # Set _use_cols. We don't store columns because they are + # overwritten. + self._handle_usecols(columns, names) + else: + self._col_indices = None + num_original_columns = len(names) + columns = [names] + else: + columns = self._handle_usecols(columns, columns[0]) + else: + try: + line = self._buffered_line() + + except StopIteration: + if not names: + raise EmptyDataError("No columns to parse from file") + + line = names[:] + + ncols = len(line) + num_original_columns = ncols + + if not names: + if self.prefix: + columns = [[f"{self.prefix}{i}" for i in range(ncols)]] + else: + columns = [list(range(ncols))] + columns = self._handle_usecols(columns, columns[0]) + else: + if self.usecols is None or len(names) >= num_original_columns: + columns = self._handle_usecols([names], names) + num_original_columns = len(names) + else: + if not callable(self.usecols) and len(names) != len(self.usecols): + raise ValueError( + "Number of passed names did not match number of " + "header fields in the file" + ) + # Ignore output but set used columns. + self._handle_usecols([names], names) + columns = [names] + num_original_columns = ncols + + return columns, num_original_columns, unnamed_cols + + def _handle_usecols(self, columns, usecols_key): + """ + Sets self._col_indices + + usecols_key is used if there are string usecols. 
+ """ + if self.usecols is not None: + if callable(self.usecols): + col_indices = _evaluate_usecols(self.usecols, usecols_key) + elif any(isinstance(u, str) for u in self.usecols): + if len(columns) > 1: + raise ValueError( + "If using multiple headers, usecols must be integers." + ) + col_indices = [] + + for col in self.usecols: + if isinstance(col, str): + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + _validate_usecols_names(self.usecols, usecols_key) + else: + col_indices.append(col) + else: + col_indices = self.usecols + + columns = [ + [n for i, n in enumerate(column) if i in col_indices] + for column in columns + ] + self._col_indices = col_indices + return columns + + def _buffered_line(self): + """ + Return a line from buffer, filling buffer if required. + """ + if len(self.buf) > 0: + return self.buf[0] + else: + return self._next_line() + + def _check_for_bom(self, first_row): + """ + Checks whether the file begins with the BOM character. + If it does, remove it. In addition, if there is quoting + in the field subsequent to the BOM, remove it as well + because it technically takes place at the beginning of + the name, not the middle of it. + """ + # first_row will be a list, so we need to check + # that that list is not empty before proceeding. + if not first_row: + return first_row + + # The first element of this row is the one that could have the + # BOM that we want to remove. Check that the first element is a + # string before proceeding. + if not isinstance(first_row[0], str): + return first_row + + # Check that the string is not empty, as that would + # obviously not have a BOM at the start of it. + if not first_row[0]: + return first_row + + # Since the string is non-empty, check that it does + # in fact begin with a BOM. + first_elt = first_row[0][0] + if first_elt != _BOM: + return first_row + + first_row_bom = first_row[0] + + if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: + start = 2 + quote = first_row_bom[1] + end = first_row_bom[2:].index(quote) + 2 + + # Extract the data between the quotation marks + new_row = first_row_bom[start:end] + + # Extract any remaining data after the second + # quotation mark. + if len(first_row_bom) > end + 1: + new_row += first_row_bom[end + 1 :] + return [new_row] + first_row[1:] + + elif len(first_row_bom) > 1: + return [first_row_bom[1:]] + else: + # First row is just the BOM, so we + # return an empty string. + return [""] + + def _is_line_empty(self, line): + """ + Check if a line is empty or not. + + Parameters + ---------- + line : str, array-like + The line of data to check. + + Returns + ------- + boolean : Whether or not the line is empty. 
+ """ + return not line or all(not x for x in line) + + def _next_line(self): + if isinstance(self.data, list): + while self.skipfunc(self.pos): + self.pos += 1 + + while True: + try: + line = self._check_comments([self.data[self.pos]])[0] + self.pos += 1 + # either uncommented or blank to begin with + if not self.skip_blank_lines and ( + self._is_line_empty(self.data[self.pos - 1]) or line + ): + break + elif self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + if ret: + line = ret[0] + break + except IndexError: + raise StopIteration + else: + while self.skipfunc(self.pos): + self.pos += 1 + next(self.data) + + while True: + orig_line = self._next_iter_line(row_num=self.pos + 1) + self.pos += 1 + + if orig_line is not None: + line = self._check_comments([orig_line])[0] + + if self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + + if ret: + line = ret[0] + break + elif self._is_line_empty(orig_line) or line: + break + + # This was the first line of the file, + # which could contain the BOM at the + # beginning of it. + if self.pos == 1: + line = self._check_for_bom(line) + + self.line_pos += 1 + self.buf.append(line) + return line + + def _alert_malformed(self, msg, row_num): + """ + Alert a user about a malformed row. + + If `self.error_bad_lines` is True, the alert will be `ParserError`. + If `self.warn_bad_lines` is True, the alert will be printed out. + + Parameters + ---------- + msg : The error message to display. + row_num : The row number where the parsing error occurred. + Because this row number is displayed, we 1-index, + even though we 0-index internally. + """ + + if self.error_bad_lines: + raise ParserError(msg) + elif self.warn_bad_lines: + base = f"Skipping line {row_num}: " + sys.stderr.write(base + msg + "\n") + + def _next_iter_line(self, row_num): + """ + Wrapper around iterating through `self.data` (CSV source). + + When a CSV error is raised, we check for specific + error messages that allow us to customize the + error message displayed to the user. + + Parameters + ---------- + row_num : The row number of the line being parsed. + """ + + try: + return next(self.data) + except csv.Error as e: + if self.warn_bad_lines or self.error_bad_lines: + msg = str(e) + + if "NULL byte" in msg or "line contains NUL" in msg: + msg = ( + "NULL byte detected. This byte " + "cannot be processed in Python's " + "native csv library at the moment, " + "so please pass in engine='c' instead" + ) + + if self.skipfooter > 0: + reason = ( + "Error could possibly be due to " + "parsing errors in the skipped footer rows " + "(the skipfooter keyword is only applied " + "after Python's csv library has parsed " + "all rows)." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num) + return None + + def _check_comments(self, lines): + if self.comment is None: + return lines + ret = [] + for l in lines: + rl = [] + for x in l: + if not isinstance(x, str) or self.comment not in x: + rl.append(x) + else: + x = x[: x.find(self.comment)] + if len(x) > 0: + rl.append(x) + break + ret.append(rl) + return ret + + def _remove_empty_lines(self, lines): + """ + Iterate through the lines and remove any that are + either empty or contain only one whitespace value + + Parameters + ---------- + lines : array-like + The array of lines that we are to filter. + + Returns + ------- + filtered_lines : array-like + The same array of lines with the "empty" ones removed. 
+ """ + + ret = [] + for l in lines: + # Remove empty lines and lines with only one whitespace value + if ( + len(l) > 1 + or len(l) == 1 + and (not isinstance(l[0], str) or l[0].strip()) + ): + ret.append(l) + return ret + + def _check_thousands(self, lines): + if self.thousands is None: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.thousands, replace="" + ) + + def _search_replace_num_columns(self, lines, search, replace): + ret = [] + for l in lines: + rl = [] + for i, x in enumerate(l): + if ( + not isinstance(x, str) + or search not in x + or (self._no_thousands_columns and i in self._no_thousands_columns) + or self.nonnum.search(x.strip()) + ): + rl.append(x) + else: + rl.append(x.replace(search, replace)) + ret.append(rl) + return ret + + def _check_decimal(self, lines): + if self.decimal == _parser_defaults["decimal"]: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.decimal, replace="." + ) + + def _clear_buffer(self): + self.buf = [] + + _implicit_index = False + + def _get_index_name(self, columns): + """ + Try several cases to get lines: + + 0) There are headers on row 0 and row 1 and their + total summed lengths equals the length of the next line. + Treat row 0 as columns and row 1 as indices + 1) Look for implicit index: there are more columns + on row 1 than row 0. If this is true, assume that row + 1 lists index columns and row 0 lists normal columns. + 2) Get index from the columns if it was listed. + """ + orig_names = list(columns) + columns = list(columns) + + try: + line = self._next_line() + except StopIteration: + line = None + + try: + next_line = self._next_line() + except StopIteration: + next_line = None + + # implicitly index_col=0 b/c 1 fewer column names + implicit_first_cols = 0 + if line is not None: + # leave it 0, #2442 + # Case 1 + if self.index_col is not False: + implicit_first_cols = len(line) - self.num_original_columns + + # Case 0 + if next_line is not None: + if len(next_line) == len(line) + self.num_original_columns: + # column and index names on diff rows + self.index_col = list(range(len(line))) + self.buf = self.buf[1:] + + for c in reversed(line): + columns.insert(0, c) + + # Update list of original names to include all indices. + orig_names = list(columns) + self.num_original_columns = len(columns) + return line, orig_names, columns + + if implicit_first_cols > 0: + # Case 1 + self._implicit_index = True + if self.index_col is None: + self.index_col = list(range(implicit_first_cols)) + + index_name = None + + else: + # Case 2 + (index_name, columns_, self.index_col) = _clean_index_names( + columns, self.index_col, self.unnamed_cols + ) + + return index_name, orig_names, columns + + def _rows_to_cols(self, content): + col_len = self.num_original_columns + + if self._implicit_index: + col_len += len(self.index_col) + + max_len = max(len(row) for row in content) + + # Check that there are no rows with too many + # elements in their row (rows with too few + # elements are padded with NaN). 
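# --------------------------------------------------------------------------
# Editor's note: standalone illustrative sketch, not part of the patch. The
# check below is what backs error_bad_lines / warn_bad_lines in the Python
# engine: a row with more fields than the header either raises ParserError or
# is skipped with a "Skipping line N: Expected ... fields" message on stderr.
# The malformed sample data is made up.
import io
import pandas as pd

raw = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10\n"  # second data row has an extra field

df = pd.read_csv(
    io.StringIO(raw),
    engine="python",
    error_bad_lines=False,  # drop the offending line instead of raising
    warn_bad_lines=True,    # report the skipped line on stderr
)
print(len(df))  # 2 -- the ragged row was dropped
# --------------------------------------------------------------------------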
+ if max_len > col_len and self.index_col is not False and self.usecols is None: + + footers = self.skipfooter if self.skipfooter else 0 + bad_lines = [] + + iter_content = enumerate(content) + content_len = len(content) + content = [] + + for (i, l) in iter_content: + actual_len = len(l) + + if actual_len > col_len: + if self.error_bad_lines or self.warn_bad_lines: + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, actual_len)) + + if self.error_bad_lines: + break + else: + content.append(l) + + for row_num, actual_len in bad_lines: + msg = ( + f"Expected {col_len} fields in line {row_num + 1}, saw " + f"{actual_len}" + ) + if ( + self.delimiter + and len(self.delimiter) > 1 + and self.quoting != csv.QUOTE_NONE + ): + # see gh-13374 + reason = ( + "Error could possibly be due to quotes being " + "ignored when a multi-char delimiter is used." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num + 1) + + # see gh-13320 + zipped_content = list(lib.to_object_array(content, min_width=col_len).T) + + if self.usecols: + if self._implicit_index: + zipped_content = [ + a + for i, a in enumerate(zipped_content) + if ( + i < len(self.index_col) + or i - len(self.index_col) in self._col_indices + ) + ] + else: + zipped_content = [ + a for i, a in enumerate(zipped_content) if i in self._col_indices + ] + return zipped_content + + def _get_lines(self, rows=None): + lines = self.buf + new_rows = None + + # already fetched some number + if rows is not None: + # we already have the lines in the buffer + if len(self.buf) >= rows: + new_rows, self.buf = self.buf[:rows], self.buf[rows:] + + # need some lines + else: + rows -= len(self.buf) + + if new_rows is None: + if isinstance(self.data, list): + if self.pos > len(self.data): + raise StopIteration + if rows is None: + new_rows = self.data[self.pos :] + new_pos = len(self.data) + else: + new_rows = self.data[self.pos : self.pos + rows] + new_pos = self.pos + rows + + # Check for stop rows. n.b.: self.skiprows is a set. 
+ if self.skiprows: + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] + + lines.extend(new_rows) + self.pos = new_pos + + else: + new_rows = [] + try: + if rows is not None: + for _ in range(rows): + new_rows.append(next(self.data)) + lines.extend(new_rows) + else: + rows = 0 + + while True: + new_row = self._next_iter_line(row_num=self.pos + rows + 1) + rows += 1 + + if new_row is not None: + new_rows.append(new_row) + + except StopIteration: + if self.skiprows: + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] + lines.extend(new_rows) + if len(lines) == 0: + raise + self.pos += len(new_rows) + + self.buf = [] + else: + lines = new_rows + + if self.skipfooter: + lines = lines[: -self.skipfooter] + + lines = self._check_comments(lines) + if self.skip_blank_lines: + lines = self._remove_empty_lines(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) + + +def _make_date_converter( + date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True +): + def converter(*date_cols): + if date_parser is None: + strs = parsing._concat_date_cols(date_cols) + + try: + return tools.to_datetime( + ensure_object(strs), + utc=None, + dayfirst=dayfirst, + errors="ignore", + infer_datetime_format=infer_datetime_format, + cache=cache_dates, + ).to_numpy() + + except ValueError: + return tools.to_datetime( + parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates + ) + else: + try: + result = tools.to_datetime( + date_parser(*date_cols), errors="ignore", cache=cache_dates + ) + if isinstance(result, datetime.datetime): + raise Exception("scalar parser") + return result + except Exception: + try: + return tools.to_datetime( + parsing.try_parse_dates( + parsing._concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst, + ), + errors="ignore", + ) + except Exception: + return generic_parser(date_parser, *date_cols) + + return converter + + +def _process_date_conversion( + data_dict, + converter, + parse_spec, + index_col, + index_names, + columns, + keep_date_col=False, +): + def _isindex(colspec): + return (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ) + + new_cols = [] + new_data = {} + + orig_names = columns + columns = list(columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data_dict, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = orig_names[colspec] + if _isindex(colspec): + continue + data_dict[colspec] = converter(data_dict[colspec]) + else: + new_name, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) + if new_name in data_dict: + raise ValueError(f"New date column already in dict {new_name}") + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data_dict: + raise ValueError(f"Date column {new_name} already in dict") + + _, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) + + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + data_dict.update(new_data) + new_cols.extend(columns) + + if not 
keep_date_col: + for c in list(date_cols): + data_dict.pop(c) + new_cols.remove(c) + + return data_dict, new_cols + + +def _try_convert_dates(parser, colspec, data_dict, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name = "_".join(str(x) for x in colnames) + to_parse = [data_dict[c] for c in colnames if c in data_dict] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + + +def _clean_na_values(na_values, keep_default_na=True): + + if na_values is None: + if keep_default_na: + na_values = STR_NA_VALUES + else: + na_values = set() + na_fvalues = set() + elif isinstance(na_values, dict): + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. + + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. + for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + + +def _clean_index_names(columns, index_col, unnamed_cols): + if not _is_index_col(index_col): + return None, columns, index_col + + columns = list(columns) + + cp_cols = list(columns) + index_names = [] + + # don't mutate + index_col = list(index_col) + + for i, c in enumerate(index_col): + if isinstance(c, str): + index_names.append(c) + for j, name in enumerate(cp_cols): + if name == c: + index_col[i] = j + columns.remove(name) + break + else: + name = cp_cols[c] + columns.remove(name) + index_names.append(name) + + # Only clean index names that were placeholders. + for i, name in enumerate(index_names): + if isinstance(name, str) and name in unnamed_cols: + index_names[i] = None + + return index_names, columns, index_col + + +def _get_empty_meta(columns, index_col, index_names, dtype=None): + columns = list(columns) + + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not isinstance(dtype, dict): + # if dtype == None, default will be np.object. + default_dtype = dtype or np.object + dtype = defaultdict(lambda: default_dtype) + else: + # Save a copy of the dictionary. + _dtype = dtype.copy() + dtype = defaultdict(lambda: np.object) + + # Convert column indexes to column names. + for k, v in _dtype.items(): + col = columns[k] if is_integer(k) else k + dtype[col] = v + + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic empty Index. 
+ if (index_col is None or index_col is False) or index_names is None: + index = Index([]) + else: + data = [Series([], dtype=dtype[name]) for name in index_names] + index = ensure_index_from_sequences(data, names=index_names) + index_col.sort() + + for i, n in enumerate(index_col): + columns.pop(n - i) + + col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} + + return index, columns, col_dict + + +def _floatify_na_values(na_values): + # create float versions of the na_values + result = set() + for v in na_values: + try: + v = float(v) + if not np.isnan(v): + result.add(v) + except (TypeError, ValueError, OverflowError): + pass + return result + + +def _stringify_na_values(na_values): + """ return a stringified and numeric for these values """ + result = [] + for x in na_values: + result.append(str(x)) + result.append(x) + try: + v = float(x) + + # we are like 999 here + if v == int(v): + v = int(v) + result.append(f"{v}.0") + result.append(str(v)) + + result.append(v) + except (TypeError, ValueError, OverflowError): + pass + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass + return set(result) + + +def _get_na_values(col, na_values, na_fvalues, keep_default_na): + """ + Get the NaN values for a given column. + + Parameters + ---------- + col : str + The name of the column. + na_values : array-like, dict + The object listing the NaN values as strings. + na_fvalues : array-like, dict + The object listing the NaN values as floats. + keep_default_na : bool + If `na_values` is a dict, and the column is not mapped in the + dictionary, whether to return the default NaN values or the empty set. + + Returns + ------- + nan_tuple : A length-two tuple composed of + + 1) na_values : the string NaN values for that column. + 2) na_fvalues : the float NaN values for that column. + """ + + if isinstance(na_values, dict): + if col in na_values: + return na_values[col], na_fvalues[col] + else: + if keep_default_na: + return STR_NA_VALUES, set() + + return set(), set() + else: + return na_values, na_fvalues + + +def _get_col_names(colspec, columns): + colset = set(columns) + colnames = [] + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int): + colnames.append(columns[c]) + return colnames + + +class FixedWidthReader(abc.Iterator): + """ + A reader of fixed-width lines. + """ + + def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): + self.f = f + self.buffer = None + self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " + self.comment = comment + if colspecs == "infer": + self.colspecs = self.detect_colspecs( + infer_nrows=infer_nrows, skiprows=skiprows + ) + else: + self.colspecs = colspecs + + if not isinstance(self.colspecs, (tuple, list)): + raise TypeError( + "column specifications must be a list or tuple, " + f"input was a {type(colspecs).__name__}" + ) + + for colspec in self.colspecs: + if not ( + isinstance(colspec, (tuple, list)) + and len(colspec) == 2 + and isinstance(colspec[0], (int, np.integer, type(None))) + and isinstance(colspec[1], (int, np.integer, type(None))) + ): + raise TypeError( + "Each column specification must be " + "2 element tuple or list of integers" + ) + + def get_rows(self, infer_nrows, skiprows=None): + """ + Read rows from self.f, skipping as specified. 
+ + We distinguish buffer_rows (the first <= infer_nrows + lines) from the rows returned to detect_colspecs + because it's simpler to leave the other locations + with skiprows logic alone than to modify them to + deal with the fact we skipped some rows here as + well. + + Parameters + ---------- + infer_nrows : int + Number of rows to read from self.f, not counting + rows that are skipped. + skiprows: set, optional + Indices of rows to skip. + + Returns + ------- + detect_rows : list of str + A list containing the rows to read. + + """ + if skiprows is None: + skiprows = set() + buffer_rows = [] + detect_rows = [] + for i, row in enumerate(self.f): + if i not in skiprows: + detect_rows.append(row) + buffer_rows.append(row) + if len(detect_rows) >= infer_nrows: + break + self.buffer = iter(buffer_rows) + return detect_rows + + def detect_colspecs(self, infer_nrows=100, skiprows=None): + # Regex escape the delimiters + delimiters = "".join(r"\{}".format(x) for x in self.delimiter) + pattern = re.compile("([^{}]+)".format(delimiters)) + rows = self.get_rows(infer_nrows, skiprows) + if not rows: + raise EmptyDataError("No rows from which to infer column width") + max_len = max(map(len, rows)) + mask = np.zeros(max_len + 1, dtype=int) + if self.comment is not None: + rows = [row.partition(self.comment)[0] for row in rows] + for row in rows: + for m in pattern.finditer(row): + mask[m.start() : m.end()] = 1 + shifted = np.roll(mask, 1) + shifted[0] = 0 + edges = np.where((mask ^ shifted) == 1)[0] + edge_pairs = list(zip(edges[::2], edges[1::2])) + return edge_pairs + + def __next__(self): + if self.buffer is not None: + try: + line = next(self.buffer) + except StopIteration: + self.buffer = None + line = next(self.f) + else: + line = next(self.f) + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] + + +class FixedWidthFieldParser(PythonParser): + """ + Specialization that Converts fixed-width fields into DataFrames. + See PythonParser for details. + """ + + def __init__(self, f, **kwds): + # Support iterators, convert to a list. + self.colspecs = kwds.pop("colspecs") + self.infer_nrows = kwds.pop("infer_nrows") + PythonParser.__init__(self, f, **kwds) + + def _make_reader(self, f): + self.data = FixedWidthReader( + f, + self.colspecs, + self.delimiter, + self.comment, + self.skiprows, + self.infer_nrows, + ) diff --git a/venv/Lib/site-packages/pandas/io/pickle.py b/venv/Lib/site-packages/pandas/io/pickle.py new file mode 100644 index 0000000..e51f24b --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/pickle.py @@ -0,0 +1,198 @@ +""" pickle compat """ +import pickle +from typing import Any, Optional +import warnings + +from pandas._typing import FilePathOrBuffer +from pandas.compat import pickle_compat as pc + +from pandas.io.common import get_filepath_or_buffer, get_handle + + +def to_pickle( + obj: Any, + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, +): + """ + Pickle (serialize) object to file. + + Parameters + ---------- + obj : any object + Any python object. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be stored. + + .. versionchanged:: 1.0.0 + Accept URL. URL has to be of S3 or GCS. 
+ + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). + protocol : int + Int which indicates which protocol should be used by the pickler, + default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible + values for this parameter depend on the version of Python. For Python + 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value. + For Python >= 3.4, 4 is a valid value. A negative value for the + protocol parameter is equivalent to setting its value to + HIGHEST_PROTOCOL. + + .. [1] https://docs.python.org/3/library/pickle.html + .. versionadded:: 0.21.0 + + See Also + -------- + read_pickle : Load pickled pandas object (or any object) from file. + DataFrame.to_hdf : Write DataFrame to an HDF5 file. + DataFrame.to_sql : Write DataFrame to a SQL database. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + + Examples + -------- + >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) + >>> original_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> pd.to_pickle(original_df, "./dummy.pkl") + + >>> unpickled_df = pd.read_pickle("./dummy.pkl") + >>> unpickled_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + + >>> import os + >>> os.remove("./dummy.pkl") + """ + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression, mode="wb" + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) + if protocol < 0: + protocol = pickle.HIGHEST_PROTOCOL + try: + f.write(pickle.dumps(obj, protocol=protocol)) + finally: + f.close() + for _f in fh: + _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass + + +def read_pickle( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" +): + """ + Load pickled pandas object (or any object) from file. + + .. warning:: + + Loading pickled data received from untrusted sources can be + unsafe. See `here `__. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be loaded from. + + .. versionchanged:: 1.0.0 + Accept URL. URL is not limited to S3 and GCS. + + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). + + Returns + ------- + unpickled : same type as object stored in file + + See Also + -------- + DataFrame.to_pickle : Pickle (serialize) DataFrame object to file. + Series.to_pickle : Pickle (serialize) Series object to file. + read_hdf : Read HDF5 file into a DataFrame. + read_sql : Read SQL query or database table into a DataFrame. + read_parquet : Load a parquet object, returning a DataFrame. + + Notes + ----- + read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3. 
+ + Examples + -------- + >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) + >>> original_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + >>> pd.to_pickle(original_df, "./dummy.pkl") + + >>> unpickled_df = pd.read_pickle("./dummy.pkl") + >>> unpickled_df + foo bar + 0 0 5 + 1 1 6 + 2 2 7 + 3 3 8 + 4 4 9 + + >>> import os + >>> os.remove("./dummy.pkl") + """ + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) + + # 1) try standard library Pickle + # 2) try pickle_compat (older pandas version) to handle subclass changes + + excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) + + try: + with warnings.catch_warnings(record=True): + # We want to silence any warnings about, e.g. moved modules. + warnings.simplefilter("ignore", Warning) + return pickle.load(f) + except excs_to_catch: + # e.g. + # "No module named 'pandas.core.sparse.series'" + # "Can't get attribute '__nat_unpickle' on %s,key->%s] [items->%s] +""" + +# formats +_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} + +# axes map +_AXES_MAP = {DataFrame: [0]} + +# register our configuration options +dropna_doc = """ +: boolean + drop ALL nan rows when appending to a table +""" +format_doc = """ +: format + default format writing format, if None, then + put will default to 'fixed' and append will default to 'table' +""" + +with config.config_prefix("io.hdf"): + config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) + config.register_option( + "default_format", + None, + format_doc, + validator=config.is_one_of_factory(["fixed", "table", None]), + ) + +# oh the troubles to reduce import time +_table_mod = None +_table_file_open_policy_is_strict = False + + +def _tables(): + global _table_mod + global _table_file_open_policy_is_strict + if _table_mod is None: + import tables + + _table_mod = tables + + # set the file open policy + # return the file open policy; this changes as of pytables 3.1 + # depending on the HDF5 version + try: + _table_file_open_policy_is_strict = ( + tables.file._FILE_OPEN_POLICY == "strict" + ) + except AttributeError: + pass + + return _table_mod + + +# interface to/from ### + + +def to_hdf( + path_or_buf, + key: str, + value: FrameOrSeries, + mode: str = "a", + complevel: Optional[int] = None, + complib: Optional[str] = None, + append: bool = False, + format: Optional[str] = None, + index: bool = True, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + dropna: Optional[bool] = None, + data_columns: Optional[List[str]] = None, + errors: str = "strict", + encoding: str = "UTF-8", +): + """ store this object, close it if we opened it """ + + if append: + f = lambda store: store.append( + key, + value, + format=format, + index=index, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + dropna=dropna, + data_columns=data_columns, + errors=errors, + encoding=encoding, + ) + else: + # NB: dropna is not passed to `put` + f = lambda store: store.put( + key, + value, + format=format, + index=index, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + data_columns=data_columns, + errors=errors, + encoding=encoding, + ) + + path_or_buf = stringify_path(path_or_buf) + if isinstance(path_or_buf, str): + with HDFStore( + path_or_buf, mode=mode, complevel=complevel, complib=complib 
+ ) as store: + f(store) + else: + f(path_or_buf) + + +def read_hdf( + path_or_buf, + key=None, + mode: str = "r", + errors: str = "strict", + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + columns=None, + iterator=False, + chunksize: Optional[int] = None, + **kwargs, +): + """ + Read from the store, close it if we opened it. + + Retrieve pandas object stored in file, optionally based on where + criteria + + Parameters + ---------- + path_or_buf : str, path object, pandas.HDFStore or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.h5``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + + .. versionadded:: 0.21.0 support for __fspath__ protocol. + + key : object, optional + The group identifier in the store. Can be omitted if the HDF file + contains a single pandas object. + mode : {'r', 'r+', 'a'}, default 'r' + Mode to use when opening the file. Ignored if path_or_buf is a + :class:`pandas.HDFStore`. Default is 'r'. + where : list, optional + A list of Term (or convertible) objects. + start : int, optional + Row number to start selection. + stop : int, optional + Row number to stop selection. + columns : list, optional + A list of columns names to return. + iterator : bool, optional + Return an iterator object. + chunksize : int, optional + Number of rows to include in an iteration when using an iterator. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + **kwargs + Additional keyword arguments passed to HDFStore. + + Returns + ------- + item : object + The selected object. Return type depends on the object stored. + + See Also + -------- + DataFrame.to_hdf : Write a HDF file from a DataFrame. + HDFStore : Low-level access to HDF files. + + Examples + -------- + >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) + >>> df.to_hdf('./store.h5', 'data') + >>> reread = pd.read_hdf('./store.h5') + """ + + if mode not in ["r", "r+", "a"]: + raise ValueError( + f"mode {mode} is not allowed while performing a read. " + f"Allowed modes are r, r+ and a." + ) + # grab the scope + if where is not None: + where = _ensure_term(where, scope_level=1) + + if isinstance(path_or_buf, HDFStore): + if not path_or_buf.is_open: + raise IOError("The HDFStore must be open for reading.") + + store = path_or_buf + auto_close = False + else: + path_or_buf = stringify_path(path_or_buf) + if not isinstance(path_or_buf, str): + raise NotImplementedError( + "Support for generic buffers has not been implemented." 
+ ) + try: + exists = os.path.exists(path_or_buf) + + # if filepath is too long + except (TypeError, ValueError): + exists = False + + if not exists: + raise FileNotFoundError(f"File {path_or_buf} does not exist") + + store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs) + # can't auto open/close if we are using an iterator + # so delegate to the iterator + auto_close = True + + try: + if key is None: + groups = store.groups() + if len(groups) == 0: + raise ValueError("No dataset in HDF5 file.") + candidate_only_group = groups[0] + + # For the HDF file to have only one dataset, all other groups + # should then be metadata groups for that candidate group. (This + # assumes that the groups() method enumerates parent groups + # before their children.) + for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError( + "key must be provided when HDF5 file " + "contains multiple datasets." + ) + key = candidate_only_group._v_pathname + return store.select( + key, + where=where, + start=start, + stop=stop, + columns=columns, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) + except (ValueError, TypeError, KeyError): + if not isinstance(path_or_buf, HDFStore): + # if there is an error, close the store if we opened it. + try: + store.close() + except AttributeError: + pass + + raise + + +def _is_metadata_of(group: "Node", parent_group: "Node") -> bool: + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == "meta": + return True + current = current._v_parent + return False + + +class HDFStore: + """ + Dict-like IO interface for storing pandas objects in PyTables. + + Either Fixed or Table format. + + Parameters + ---------- + path : string + File path to HDF5 file + mode : {'a', 'w', 'r', 'r+'}, default 'a' + + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. + complevel : int, 0-9, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. 
+ fletcher32 : bool, default False + If applying compression use the fletcher32 checksum + + Examples + -------- + >>> bar = pd.DataFrame(np.random.randn(10, 4)) + >>> store = pd.HDFStore('test.h5') + >>> store['foo'] = bar # write to HDF5 + >>> bar = store['foo'] # retrieve + >>> store.close() + """ + + _handle: Optional["File"] + _mode: str + _complevel: int + _fletcher32: bool + + def __init__( + self, + path, + mode: str = "a", + complevel: Optional[int] = None, + complib=None, + fletcher32: bool = False, + **kwargs, + ): + + if "format" in kwargs: + raise ValueError("format is not a defined argument for HDFStore") + + tables = import_optional_dependency("tables") + + if complib is not None and complib not in tables.filters.all_complibs: + raise ValueError( + f"complib only supports {tables.filters.all_complibs} compression." + ) + + if complib is None and complevel is not None: + complib = tables.filters.default_complib + + self._path = stringify_path(path) + if mode is None: + mode = "a" + self._mode = mode + self._handle = None + self._complevel = complevel if complevel else 0 + self._complib = complib + self._fletcher32 = fletcher32 + self._filters = None + self.open(mode=mode, **kwargs) + + def __fspath__(self): + return self._path + + @property + def root(self): + """ return the root node """ + self._check_if_open() + return self._handle.root + + @property + def filename(self): + return self._path + + def __getitem__(self, key: str): + return self.get(key) + + def __setitem__(self, key: str, value): + self.put(key, value) + + def __delitem__(self, key: str): + return self.remove(key) + + def __getattr__(self, name: str): + """ allow attribute access to get stores """ + try: + return self.get(name) + except (KeyError, ClosedFileError): + pass + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'" + ) + + def __contains__(self, key: str) -> bool: + """ check for existence of this key + can match the exact pathname or the pathnm w/o the leading '/' + """ + node = self.get_node(key) + if node is not None: + name = node._v_pathname + if name == key or name[1:] == key: + return True + return False + + def __len__(self) -> int: + return len(self.groups()) + + def __repr__(self) -> str: + pstr = pprint_thing(self._path) + return f"{type(self)}\nFile path: {pstr}\n" + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def keys(self) -> List[str]: + """ + Return a list of keys corresponding to objects stored in HDFStore. + + Returns + ------- + list + List of ABSOLUTE path-names (e.g. have the leading '/'). + """ + return [n._v_pathname for n in self.groups()] + + def __iter__(self): + return iter(self.keys()) + + def items(self): + """ + iterate on key->group + """ + for g in self.groups(): + yield g._v_pathname, g + + iteritems = items + + def open(self, mode: str = "a", **kwargs): + """ + Open the file in the specified mode + + Parameters + ---------- + mode : {'a', 'w', 'r', 'r+'}, default 'a' + See HDFStore docstring or tables.open_file for info about modes + """ + tables = _tables() + + if self._mode != mode: + + # if we are changing a write mode to read, ok + if self._mode in ["a", "w"] and mode in ["r", "r+"]: + pass + elif mode in ["w"]: + + # this would truncate, raise here + if self.is_open: + raise PossibleDataLossError( + f"Re-opening the file [{self._path}] with mode [{self._mode}] " + "will delete the current file!" 
+ ) + + self._mode = mode + + # close and reopen the handle + if self.is_open: + self.close() + + if self._complevel and self._complevel > 0: + self._filters = _tables().Filters( + self._complevel, self._complib, fletcher32=self._fletcher32 + ) + + try: + self._handle = tables.open_file(self._path, self._mode, **kwargs) + except IOError as err: # pragma: no cover + if "can not be written" in str(err): + print(f"Opening {self._path} in read-only mode") + self._handle = tables.open_file(self._path, "r", **kwargs) + else: + raise + + except ValueError as err: + + # trap PyTables >= 3.1 FILE_OPEN_POLICY exception + # to provide an updated message + if "FILE_OPEN_POLICY" in str(err): + hdf_version = tables.get_hdf5_version() + err = ValueError( + f"PyTables [{tables.__version__}] no longer supports " + "opening multiple files\n" + "even in read-only mode on this HDF5 version " + f"[{hdf_version}]. You can accept this\n" + "and not open the same file multiple times at once,\n" + "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " + "which allows\n" + "files to be opened multiple times at once\n" + ) + + raise err + + except Exception as err: + + # trying to read from a non-existent file causes an error which + # is not part of IOError, make it one + if self._mode == "r" and "Unable to open/create file" in str(err): + raise IOError(str(err)) + raise + + def close(self): + """ + Close the PyTables file handle + """ + if self._handle is not None: + self._handle.close() + self._handle = None + + @property + def is_open(self) -> bool: + """ + return a boolean indicating whether the file is open + """ + if self._handle is None: + return False + return bool(self._handle.isopen) + + def flush(self, fsync: bool = False): + """ + Force all buffered modifications to be written to disk. + + Parameters + ---------- + fsync : bool (default False) + call ``os.fsync()`` on the file handle to force writing to disk. + + Notes + ----- + Without ``fsync=True``, flushing may not guarantee that the OS writes + to disk. With fsync, the operation will block until the OS claims the + file has been written; however, other caching layers may still + interfere. + """ + if self._handle is not None: + self._handle.flush() + if fsync: + try: + os.fsync(self._handle.fileno()) + except OSError: + pass + + def get(self, key: str): + """ + Retrieve pandas object stored in file. + + Parameters + ---------- + key : str + + Returns + ------- + object + Same type as object stored in file. + """ + group = self.get_node(key) + if group is None: + raise KeyError(f"No object named {key} in the file") + return self._read_group(group) + + def select( + self, + key: str, + where=None, + start=None, + stop=None, + columns=None, + iterator=False, + chunksize=None, + auto_close: bool = False, + ): + """ + Retrieve pandas object stored in file, optionally based on where criteria. + + Parameters + ---------- + key : str + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. + columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. 
+ + Returns + ------- + object + Retrieved object from file. + """ + group = self.get_node(key) + if group is None: + raise KeyError(f"No object named {key} in the file") + + # create the storer and axes + where = _ensure_term(where, scope_level=1) + s = self._create_storer(group) + s.infer_axes() + + # function to call on iteration + def func(_start, _stop, _where): + return s.read(start=_start, stop=_stop, where=_where, columns=columns) + + # create the iterator + it = TableIterator( + self, + s, + func, + where=where, + nrows=s.nrows, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) + + return it.get_result() + + def select_as_coordinates( + self, + key: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + """ + return the selection as an Index + + Parameters + ---------- + key : str + where : list of Term (or convertible) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + """ + where = _ensure_term(where, scope_level=1) + tbl = self.get_storer(key) + if not isinstance(tbl, Table): + raise TypeError("can only read_coordinates with a table") + return tbl.read_coordinates(where=where, start=start, stop=stop) + + def select_column( + self, + key: str, + column: str, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + """ + return a single column from the table. This is generally only useful to + select an indexable + + Parameters + ---------- + key : str + column : str + The column of interest. + start : int or None, default None + stop : int or None, default None + + Raises + ------ + raises KeyError if the column is not found (or key is not a valid + store) + raises ValueError if the column can not be extracted individually (it + is part of a data block) + + """ + tbl = self.get_storer(key) + if not isinstance(tbl, Table): + raise TypeError("can only read_column with a table") + return tbl.read_column(column=column, start=start, stop=stop) + + def select_as_multiple( + self, + keys, + where=None, + selector=None, + columns=None, + start=None, + stop=None, + iterator=False, + chunksize=None, + auto_close: bool = False, + ): + """ + Retrieve pandas objects from multiple tables. + + Parameters + ---------- + keys : a list of the tables + selector : the table to apply the where criteria (defaults to keys[0] + if not supplied) + columns : the columns I want back + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + iterator : boolean, return an iterator, default False + chunksize : nrows to include in iteration, return an iterator + auto_close : bool, default False + Should automatically close the store when finished. 
+ + Raises + ------ + raises KeyError if keys or selector is not found or keys is empty + raises TypeError if keys is not a list or tuple + raises ValueError if the tables are not ALL THE SAME DIMENSIONS + """ + + # default to single select + where = _ensure_term(where, scope_level=1) + if isinstance(keys, (list, tuple)) and len(keys) == 1: + keys = keys[0] + if isinstance(keys, str): + return self.select( + key=keys, + where=where, + columns=columns, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) + + if not isinstance(keys, (list, tuple)): + raise TypeError("keys must be a list/tuple") + + if not len(keys): + raise ValueError("keys must have a non-zero length") + + if selector is None: + selector = keys[0] + + # collect the tables + tbls = [self.get_storer(k) for k in keys] + s = self.get_storer(selector) + + # validate rows + nrows = None + for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): + if t is None: + raise KeyError(f"Invalid table [{k}]") + if not t.is_table: + raise TypeError( + f"object [{t.pathname}] is not a table, and cannot be used in all " + "select as multiple" + ) + + if nrows is None: + nrows = t.nrows + elif t.nrows != nrows: + raise ValueError("all tables must have exactly the same nrows!") + + # The isinstance checks here are redundant with the check above, + # but necessary for mypy; see GH#29757 + _tbls = [x for x in tbls if isinstance(x, Table)] + + # axis is the concentration axes + axis = list({t.non_index_axes[0][0] for t in _tbls})[0] + + def func(_start, _stop, _where): + + # retrieve the objs, _where is always passed as a set of + # coordinates here + objs = [ + t.read(where=_where, columns=columns, start=_start, stop=_stop) + for t in tbls + ] + + # concat and return + return concat(objs, axis=axis, verify_integrity=False)._consolidate() + + # create the iterator + it = TableIterator( + self, + s, + func, + where=where, + nrows=nrows, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) + + return it.get_result(coordinates=True) + + def put( + self, + key: str, + value: FrameOrSeries, + format=None, + index=True, + append=False, + complib=None, + complevel: Optional[int] = None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + data_columns: Optional[List[str]] = None, + encoding=None, + errors: str = "strict", + ): + """ + Store object in HDFStore. + + Parameters + ---------- + key : str + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' + fixed(f) : Fixed format + Fast writing/reading. Not-appendable, nor searchable. + table(t) : Table format + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data. + append : bool, default False + This will force Table format, append the input data to the + existing. + data_columns : list, default None + List of columns to create as data columns, or True to + use all columns. See `here + `__. + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. 
+ """ + if format is None: + format = get_option("io.hdf.default_format") or "fixed" + format = self._validate_format(format) + self._write_to_group( + key, + value, + format=format, + index=index, + append=append, + complib=complib, + complevel=complevel, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + data_columns=data_columns, + encoding=encoding, + errors=errors, + ) + + def remove(self, key: str, where=None, start=None, stop=None): + """ + Remove pandas object partially by specifying the where condition + + Parameters + ---------- + key : string + Node to remove or delete rows from + where : list of Term (or convertible) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + + Returns + ------- + number of rows removed (or None if not a Table) + + Raises + ------ + raises KeyError if key is not a valid store + + """ + where = _ensure_term(where, scope_level=1) + try: + s = self.get_storer(key) + except KeyError: + # the key is not a valid store, re-raising KeyError + raise + except AssertionError: + # surface any assertion errors for e.g. debugging + raise + except Exception: + # In tests we get here with ClosedFileError, TypeError, and + # _table_mod.NoSuchNodeError. TODO: Catch only these? + + if where is not None: + raise ValueError( + "trying to remove a node with a non-None where clause!" + ) + + # we are actually trying to remove a node (with children) + node = self.get_node(key) + if node is not None: + node._f_remove(recursive=True) + return None + + # remove the node + if com.all_none(where, start, stop): + s.group._f_remove(recursive=True) + + # delete from the table + else: + if not s.is_table: + raise ValueError( + "can only remove with where on objects written as tables" + ) + return s.delete(where=where, start=start, stop=stop) + + def append( + self, + key: str, + value: FrameOrSeries, + format=None, + axes=None, + index=True, + append=True, + complib=None, + complevel: Optional[int] = None, + columns=None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + chunksize=None, + expectedrows=None, + dropna: Optional[bool] = None, + data_columns: Optional[List[str]] = None, + encoding=None, + errors: str = "strict", + ): + """ + Append to Table in file. Node must already exist and be Table + format. + + Parameters + ---------- + key : str + value : {Series, DataFrame} + format : 'table' is the default + table(t) : table format + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data. + append : bool, default True + Append the input data to the existing. + data_columns : list of columns, or True, default None + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See `here + `__. + min_itemsize : dict of columns that specify minimum string sizes + nan_rep : string to use as string nan representation + chunksize : size to chunk the writing + expectedrows : expected TOTAL row size of this table + encoding : default None, provide an encoding for strings + dropna : bool, default False + Do not write an ALL nan row to the store settable + by the option 'io.hdf.dropna_table'. 
+ + Notes + ----- + Does *not* check if data being appended overlaps with existing + data in the table, so be careful + """ + if columns is not None: + raise TypeError( + "columns is not a supported keyword in append, try data_columns" + ) + + if dropna is None: + dropna = get_option("io.hdf.dropna_table") + if format is None: + format = get_option("io.hdf.default_format") or "table" + format = self._validate_format(format) + self._write_to_group( + key, + value, + format=format, + axes=axes, + index=index, + append=append, + complib=complib, + complevel=complevel, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + chunksize=chunksize, + expectedrows=expectedrows, + dropna=dropna, + data_columns=data_columns, + encoding=encoding, + errors=errors, + ) + + def append_to_multiple( + self, + d: Dict, + value, + selector, + data_columns=None, + axes=None, + dropna=False, + **kwargs, + ): + """ + Append to multiple tables + + Parameters + ---------- + d : a dict of table_name to table_columns, None is acceptable as the + values of one node (this will get all the remaining columns) + value : a pandas object + selector : a string that designates the indexable table; all of its + columns will be designed as data_columns, unless data_columns is + passed, in which case these are used + data_columns : list of columns to create as data columns, or True to + use all columns + dropna : if evaluates to True, drop rows from all tables if any single + row in each table has all NaN. Default False. + + Notes + ----- + axes parameter is currently not accepted + + """ + if axes is not None: + raise TypeError( + "axes is currently not accepted as a parameter to append_to_multiple; " + "you can create the tables independently instead" + ) + + if not isinstance(d, dict): + raise ValueError( + "append_to_multiple must have a dictionary specified as the " + "way to split the value" + ) + + if selector not in d: + raise ValueError( + "append_to_multiple requires a selector that is in passed dict" + ) + + # figure out the splitting axis (the non_index_axis) + axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] + + # figure out how to split the value + remain_key = None + remain_values: List = [] + for k, v in d.items(): + if v is None: + if remain_key is not None: + raise ValueError( + "append_to_multiple can only have one value in d that " + "is None" + ) + remain_key = k + else: + remain_values.extend(v) + if remain_key is not None: + ordered = value.axes[axis] + ordd = ordered.difference(Index(remain_values)) + ordd = sorted(ordered.get_indexer(ordd)) + d[remain_key] = ordered.take(ordd) + + # data_columns + if data_columns is None: + data_columns = d[selector] + + # ensure rows are synchronized across the tables + if dropna: + idxs = (value[cols].dropna(how="all").index for cols in d.values()) + valid_index = next(idxs) + for index in idxs: + valid_index = valid_index.intersection(index) + value = value.loc[valid_index] + + # append + for k, v in d.items(): + dc = data_columns if k == selector else None + + # compute the val + val = value.reindex(v, axis=axis) + + self.append(k, val, data_columns=dc, **kwargs) + + def create_table_index( + self, + key: str, + columns=None, + optlevel: Optional[int] = None, + kind: Optional[str] = None, + ): + """ + Create a pytables index on the table. + + Parameters + ---------- + key : str + columns : None, bool, or listlike[str] + Indicate which columns to create an index on. + + * False : Do not create any indexes. + * True : Create indexes on all columns. 
+ * None : Create indexes on all columns. + * listlike : Create indexes on the given columns. + + optlevel : int or None, default None + Optimization level, if None, pytables defaults to 6. + kind : str or None, default None + Kind of index, if None, pytables defaults to "medium". + + Raises + ------ + TypeError: raises if the node is not a table + """ + + # version requirements + _tables() + s = self.get_storer(key) + if s is None: + return + + if not isinstance(s, Table): + raise TypeError("cannot create table index on a Fixed format store") + s.create_index(columns=columns, optlevel=optlevel, kind=kind) + + def groups(self): + """ + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. + + Returns + ------- + list + List of objects. + """ + _tables() + self._check_if_open() + return [ + g + for g in self._handle.walk_groups() + if ( + not isinstance(g, _table_mod.link.Link) + and ( + getattr(g._v_attrs, "pandas_type", None) + or getattr(g, "table", None) + or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") + ) + ) + ] + + def walk(self, where="/"): + """ + Walk the pytables group hierarchy for pandas objects. + + This generator will yield the group path, subgroups and pandas object + names for each group. + + Any non-pandas PyTables objects that are not a group will be ignored. + + The `where` group itself is listed first (preorder), then each of its + child groups (following an alphanumerical order) is also traversed, + following the same procedure. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + where : str, default "/" + Group where to start walking. + + Yields + ------ + path : str + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. + """ + _tables() + self._check_if_open() + for g in self._handle.walk_groups(where): + if getattr(g._v_attrs, "pandas_type", None) is not None: + continue + + groups = [] + leaves = [] + for child in g._v_children.values(): + pandas_type = getattr(child._v_attrs, "pandas_type", None) + if pandas_type is None: + if isinstance(child, _table_mod.group.Group): + groups.append(child._v_name) + else: + leaves.append(child._v_name) + + yield (g._v_pathname.rstrip("/"), groups, leaves) + + def get_node(self, key: str) -> Optional["Node"]: + """ return the node with the key or None if it does not exist """ + self._check_if_open() + if not key.startswith("/"): + key = "/" + key + + assert self._handle is not None + assert _table_mod is not None # for mypy + try: + node = self._handle.get_node(self.root, key) + except _table_mod.exceptions.NoSuchNodeError: + return None + + assert isinstance(node, _table_mod.Node), type(node) + return node + + def get_storer(self, key: str) -> Union["GenericFixed", "Table"]: + """ return the storer object for a key, raise if not in the file """ + group = self.get_node(key) + if group is None: + raise KeyError(f"No object named {key} in the file") + + s = self._create_storer(group) + s.infer_axes() + return s + + def copy( + self, + file, + mode="w", + propindexes: bool = True, + keys=None, + complib=None, + complevel: Optional[int] = None, + fletcher32: bool = False, + overwrite=True, + ): + """ + Copy the existing store to a new file, updating in place. + + Parameters + ---------- + propindexes: bool, default True + Restore indexes in copied file. 
+ keys : list of keys to include in the copy (defaults to all) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) + mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + + Returns + ------- + open file handle of the new store + """ + new_store = HDFStore( + file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 + ) + if keys is None: + keys = list(self.keys()) + if not isinstance(keys, (tuple, list)): + keys = [keys] + for k in keys: + s = self.get_storer(k) + if s is not None: + + if k in new_store: + if overwrite: + new_store.remove(k) + + data = self.select(k) + if isinstance(s, Table): + + index: Union[bool, List[str]] = False + if propindexes: + index = [a.name for a in s.axes if a.is_indexed] + new_store.append( + k, + data, + index=index, + data_columns=getattr(s, "data_columns", None), + encoding=s.encoding, + ) + else: + new_store.put(k, data, encoding=s.encoding) + + return new_store + + def info(self) -> str: + """ + Print detailed information on the store. + + .. versionadded:: 0.21.0 + + Returns + ------- + str + """ + path = pprint_thing(self._path) + output = f"{type(self)}\nFile path: {path}\n" + + if self.is_open: + lkeys = sorted(self.keys()) + if len(lkeys): + keys = [] + values = [] + + for k in lkeys: + try: + s = self.get_storer(k) + if s is not None: + keys.append(pprint_thing(s.pathname or k)) + values.append(pprint_thing(s or "invalid_HDFStore node")) + except AssertionError: + # surface any assertion errors for e.g. debugging + raise + except Exception as detail: + keys.append(k) + dstr = pprint_thing(detail) + values.append(f"[invalid_HDFStore node: {dstr}]") + + output += adjoin(12, keys, values) + else: + output += "Empty" + else: + output += "File is CLOSED" + + return output + + # ------------------------------------------------------------------------ + # private methods + + def _check_if_open(self): + if not self.is_open: + raise ClosedFileError(f"{self._path} file is not open!") + + def _validate_format(self, format: str) -> str: + """ validate / deprecate formats """ + + # validate + try: + format = _FORMAT_MAP[format.lower()] + except KeyError: + raise TypeError(f"invalid HDFStore format specified [{format}]") + + return format + + def _create_storer( + self, + group, + format=None, + value: Optional[FrameOrSeries] = None, + encoding: str = "UTF-8", + errors: str = "strict", + ) -> Union["GenericFixed", "Table"]: + """ return a suitable class to operate """ + + cls: Union[Type["GenericFixed"], Type["Table"]] + + if value is not None and not isinstance(value, (Series, DataFrame)): + raise TypeError("value must be None, Series, or DataFrame") + + def error(t): + # return instead of raising so mypy can tell where we are raising + return TypeError( + f"cannot properly create the storer for: [{t}] [group->" + f"{group},value->{type(value)},format->{format}" + ) + + pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) + tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) + + # infer the pt from the passed value + if pt is None: + if value is None: + + _tables() + assert _table_mod is not None # for mypy + if getattr(group, "table", None) or isinstance( + group, _table_mod.table.Table + ): + pt = "frame_table" + tt = "generic_table" + else: + raise TypeError( + "cannot create a storer if the object is not existing " + "nor a value are passed" + ) + else: + _TYPE_MAP = {Series: "series", DataFrame: "frame"} + pt = _TYPE_MAP[type(value)] + + # we are 
actually a table + if format == "table": + pt += "_table" + + # a storer node + if "table" not in pt: + _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} + try: + cls = _STORER_MAP[pt] + except KeyError: + raise error("_STORER_MAP") + return cls(self, group, encoding=encoding, errors=errors) + + # existing node (and must be a table) + if tt is None: + + # if we are a writer, determine the tt + if value is not None: + + if pt == "series_table": + index = getattr(value, "index", None) + if index is not None: + if index.nlevels == 1: + tt = "appendable_series" + elif index.nlevels > 1: + tt = "appendable_multiseries" + elif pt == "frame_table": + index = getattr(value, "index", None) + if index is not None: + if index.nlevels == 1: + tt = "appendable_frame" + elif index.nlevels > 1: + tt = "appendable_multiframe" + + _TABLE_MAP = { + "generic_table": GenericTable, + "appendable_series": AppendableSeriesTable, + "appendable_multiseries": AppendableMultiSeriesTable, + "appendable_frame": AppendableFrameTable, + "appendable_multiframe": AppendableMultiFrameTable, + "worm": WORMTable, + } + try: + cls = _TABLE_MAP[tt] + except KeyError: + raise error("_TABLE_MAP") + + return cls(self, group, encoding=encoding, errors=errors) + + def _write_to_group( + self, + key: str, + value: FrameOrSeries, + format, + axes=None, + index=True, + append=False, + complib=None, + complevel: Optional[int] = None, + fletcher32=None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + chunksize=None, + expectedrows=None, + dropna=False, + nan_rep=None, + data_columns=None, + encoding=None, + errors: str = "strict", + ): + group = self.get_node(key) + + # we make this assertion for mypy; the get_node call will already + # have raised if this is incorrect + assert self._handle is not None + + # remove the node if we are not appending + if group is not None and not append: + self._handle.remove_node(group, recursive=True) + group = None + + # we don't want to store a table node at all if our object is 0-len + # as there are not dtypes + if getattr(value, "empty", None) and (format == "table" or append): + return + + if group is None: + paths = key.split("/") + + # recursively create the groups + path = "/" + for p in paths: + if not len(p): + continue + new_path = path + if not path.endswith("/"): + new_path += "/" + new_path += p + group = self.get_node(new_path) + if group is None: + group = self._handle.create_group(path, p) + path = new_path + + s = self._create_storer(group, format, value, encoding=encoding, errors=errors) + if append: + # raise if we are trying to append to a Fixed format, + # or a table that exists (and we are putting) + if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): + raise ValueError("Can only append to Tables") + if not s.is_exists: + s.set_object_info() + else: + s.set_object_info() + + if not s.is_table and complib: + raise ValueError("Compression not supported on Fixed format stores") + + # write the object + s.write( + obj=value, + axes=axes, + append=append, + complib=complib, + complevel=complevel, + fletcher32=fletcher32, + min_itemsize=min_itemsize, + chunksize=chunksize, + expectedrows=expectedrows, + dropna=dropna, + nan_rep=nan_rep, + data_columns=data_columns, + ) + + if isinstance(s, Table) and index: + s.create_index(columns=index) + + def _read_group(self, group: "Node"): + s = self._create_storer(group) + s.infer_axes() + return s.read() + + +class TableIterator: + """ + Define the iteration interface on a table + + Parameters + 
---------- + store : HDFStore + s : the referred storer + func : the function to execute the query + where : the where of the query + nrows : the rows to iterate on + start : the passed start value (default is None) + stop : the passed stop value (default is None) + iterator : bool, default False + Whether to use the default iterator. + chunksize : the passed chunking value (default is 100000) + auto_close : bool, default False + Whether to automatically close the store at the end of iteration. + """ + + chunksize: Optional[int] + store: HDFStore + s: Union["GenericFixed", "Table"] + + def __init__( + self, + store: HDFStore, + s: Union["GenericFixed", "Table"], + func, + where, + nrows, + start=None, + stop=None, + iterator: bool = False, + chunksize: Optional[int] = None, + auto_close: bool = False, + ): + self.store = store + self.s = s + self.func = func + self.where = where + + # set start/stop if they are not set if we are a table + if self.s.is_table: + if nrows is None: + nrows = 0 + if start is None: + start = 0 + if stop is None: + stop = nrows + stop = min(nrows, stop) + + self.nrows = nrows + self.start = start + self.stop = stop + + self.coordinates = None + if iterator or chunksize is not None: + if chunksize is None: + chunksize = 100000 + self.chunksize = int(chunksize) + else: + self.chunksize = None + + self.auto_close = auto_close + + def __iter__(self): + + # iterate + current = self.start + while current < self.stop: + + stop = min(current + self.chunksize, self.stop) + value = self.func(None, None, self.coordinates[current:stop]) + current = stop + if value is None or not len(value): + continue + + yield value + + self.close() + + def close(self): + if self.auto_close: + self.store.close() + + def get_result(self, coordinates: bool = False): + + # return the actual iterator + if self.chunksize is not None: + if not isinstance(self.s, Table): + raise TypeError("can only use an iterator or chunksize on a table") + + self.coordinates = self.s.read_coordinates(where=self.where) + + return self + + # if specified read via coordinates (necessary for multiple selections + if coordinates: + if not isinstance(self.s, Table): + raise TypeError("can only read_coordinates on a table") + where = self.s.read_coordinates( + where=self.where, start=self.start, stop=self.stop + ) + else: + where = self.where + + # directly return the result + results = self.func(self.start, self.stop, where) + self.close() + return results + + +class IndexCol: + """ an index column description class + + Parameters + ---------- + + axis : axis which I reference + values : the ndarray like converted values + kind : a string description of this type + typ : the pytables type + pos : the position in the pytables + + """ + + is_an_indexable = True + is_data_indexable = True + _info_fields = ["freq", "tz", "index_name"] + + name: str + cname: str + + def __init__( + self, + name: str, + values=None, + kind=None, + typ=None, + cname: Optional[str] = None, + axis=None, + pos=None, + freq=None, + tz=None, + index_name=None, + ordered=None, + table=None, + meta=None, + metadata=None, + ): + + if not isinstance(name, str): + raise ValueError("`name` must be a str.") + + self.values = values + self.kind = kind + self.typ = typ + self.name = name + self.cname = cname or name + self.axis = axis + self.pos = pos + self.freq = freq + self.tz = tz + self.index_name = index_name + self.ordered = ordered + self.table = table + self.meta = meta + self.metadata = metadata + + if pos is not None: + self.set_pos(pos) + + # 
These are ensured as long as the passed arguments match the + # constructor annotations. + assert isinstance(self.name, str) + assert isinstance(self.cname, str) + + @property + def itemsize(self) -> int: + # Assumes self.typ has already been initialized + return self.typ.itemsize + + @property + def kind_attr(self) -> str: + return f"{self.name}_kind" + + def set_pos(self, pos: int): + """ set the position of this column in the Table """ + self.pos = pos + if pos is not None and self.typ is not None: + self.typ._v_pos = pos + + def __repr__(self) -> str: + temp = tuple( + map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) + ) + return ",".join( + ( + f"{key}->{value}" + for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) + ) + ) + + def __eq__(self, other: Any) -> bool: + """ compare 2 col items """ + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "axis", "pos"] + ) + + def __ne__(self, other) -> bool: + return not self.__eq__(other) + + @property + def is_indexed(self) -> bool: + """ return whether I am an indexed column """ + if not hasattr(self.table, "cols"): + # e.g. if infer hasn't been called yet, self.table will be None. + return False + # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute + # 'error: "None" has no attribute "cols"' + return getattr(self.table.cols, self.cname).is_indexed # type: ignore + + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. + """ + assert isinstance(values, np.ndarray), type(values) + + # values is a recarray + if values.dtype.fields is not None: + values = values[self.cname] + + val_kind = _ensure_decoded(self.kind) + values = _maybe_convert(values, val_kind, encoding, errors) + + kwargs = dict() + kwargs["name"] = _ensure_decoded(self.index_name) + + if self.freq is not None: + kwargs["freq"] = _ensure_decoded(self.freq) + + # making an Index instance could throw a number of different errors + try: + new_pd_index = Index(values, **kwargs) + except ValueError: + # if the output freq is different that what we recorded, + # it should be None (see also 'doc example part 2') + if "freq" in kwargs: + kwargs["freq"] = None + new_pd_index = Index(values, **kwargs) + + new_pd_index = _set_tz(new_pd_index, self.tz) + return new_pd_index, new_pd_index + + def take_data(self): + """ return the values""" + return self.values + + @property + def attrs(self): + return self.table._v_attrs + + @property + def description(self): + return self.table.description + + @property + def col(self): + """ return my current col description """ + return getattr(self.description, self.cname, None) + + @property + def cvalues(self): + """ return my cython values """ + return self.values + + def __iter__(self): + return iter(self.values) + + def maybe_set_size(self, min_itemsize=None): + """ maybe set a string col itemsize: + min_itemsize can be an integer or a dict with this columns name + with an integer size """ + if _ensure_decoded(self.kind) == "string": + + if isinstance(min_itemsize, dict): + min_itemsize = min_itemsize.get(self.name) + + if min_itemsize is not None and self.typ.itemsize < min_itemsize: + self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) + + def validate_names(self): + pass + + def validate_and_set(self, handler: "AppendableTable", append: bool): + self.table = handler.table + self.validate_col() + self.validate_attr(append) + 
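+ # for a categorical column the stored categories are validated against + # the incoming ones and (re)written before the kind attribute is set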
self.validate_metadata(handler) + self.write_metadata(handler) + self.set_attr() + + def validate_col(self, itemsize=None): + """ validate this column: return the compared against itemsize """ + + # validate this column for string truncation (or reset to the max size) + if _ensure_decoded(self.kind) == "string": + c = self.col + if c is not None: + if itemsize is None: + itemsize = self.itemsize + if c.itemsize < itemsize: + raise ValueError( + f"Trying to store a string with len [{itemsize}] in " + f"[{self.cname}] column but\nthis column has a limit of " + f"[{c.itemsize}]!\nConsider using min_itemsize to " + "preset the sizes on these columns" + ) + return c.itemsize + + return None + + def validate_attr(self, append: bool): + # check for backwards incompatibility + if append: + existing_kind = getattr(self.attrs, self.kind_attr, None) + if existing_kind is not None and existing_kind != self.kind: + raise TypeError( + f"incompatible kind in col [{existing_kind} - {self.kind}]" + ) + + def update_info(self, info): + """ set/update the info for this indexable with the key/value + if there is a conflict raise/warn as needed """ + + for key in self._info_fields: + + value = getattr(self, key, None) + idx = info.setdefault(self.name, {}) + + existing_value = idx.get(key) + if key in idx and value is not None and existing_value != value: + + # frequency/name just warn + if key in ["freq", "index_name"]: + ws = attribute_conflict_doc % (key, existing_value, value) + warnings.warn(ws, AttributeConflictWarning, stacklevel=6) + + # reset + idx[key] = None + setattr(self, key, None) + + else: + raise ValueError( + f"invalid info for [{self.name}] for [{key}], " + f"existing_value [{existing_value}] conflicts with " + f"new value [{value}]" + ) + else: + if value is not None or existing_value is not None: + idx[key] = value + + def set_info(self, info): + """ set my state from the passed info """ + idx = info.get(self.name) + if idx is not None: + self.__dict__.update(idx) + + def set_attr(self): + """ set the kind for this column """ + setattr(self.attrs, self.kind_attr, self.kind) + + def validate_metadata(self, handler: "AppendableTable"): + """ validate that kind=category does not change the categories """ + if self.meta == "category": + new_metadata = self.metadata + cur_metadata = handler.read_metadata(self.cname) + if ( + new_metadata is not None + and cur_metadata is not None + and not array_equivalent(new_metadata, cur_metadata) + ): + raise ValueError( + "cannot append a categorical with " + "different categories to the existing" + ) + + def write_metadata(self, handler: "AppendableTable"): + """ set the meta data """ + if self.metadata is not None: + handler.write_metadata(self.cname, self.metadata) + + +class GenericIndexCol(IndexCol): + """ an index which is not represented in the data of the table """ + + @property + def is_indexed(self) -> bool: + return False + + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. 
+ + Parameters + ---------- + values : np.ndarray + nan_rep : str + encoding : str + errors : str + """ + assert isinstance(values, np.ndarray), type(values) + + values = Int64Index(np.arange(len(values))) + return values, values + + def set_attr(self): + pass + + +class DataCol(IndexCol): + """ a data holding column, by definition this is not indexable + + Parameters + ---------- + + data : the actual data + cname : the column name in the table to hold the data (typically + values) + meta : a string description of the metadata + metadata : the actual metadata + """ + + is_an_indexable = False + is_data_indexable = False + _info_fields = ["tz", "ordered"] + + def __init__( + self, + name: str, + values=None, + kind=None, + typ=None, + cname=None, + pos=None, + tz=None, + ordered=None, + table=None, + meta=None, + metadata=None, + dtype=None, + data=None, + ): + super().__init__( + name=name, + values=values, + kind=kind, + typ=typ, + pos=pos, + cname=cname, + tz=tz, + ordered=ordered, + table=table, + meta=meta, + metadata=metadata, + ) + self.dtype = dtype + self.data = data + + @property + def dtype_attr(self) -> str: + return f"{self.name}_dtype" + + @property + def meta_attr(self) -> str: + return f"{self.name}_meta" + + def __repr__(self) -> str: + temp = tuple( + map( + pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) + ) + ) + return ",".join( + ( + f"{key}->{value}" + for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) + ) + ) + + def __eq__(self, other: Any) -> bool: + """ compare 2 col items """ + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "dtype", "pos"] + ) + + def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): + assert data is not None + assert self.dtype is None + + data, dtype_name = _get_data_and_dtype_name(data) + + self.data = data + self.dtype = dtype_name + self.kind = _dtype_to_kind(dtype_name) + + def take_data(self): + """ return the data """ + return self.data + + @classmethod + def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col": + """ + Get an appropriately typed and shaped pytables.Col object for values. 
+ """ + + dtype = values.dtype + itemsize = dtype.itemsize + + shape = values.shape + if values.ndim == 1: + # EA, use block shape pretending it is 2D + shape = (1, values.size) + + if is_categorical_dtype(dtype): + codes = values.codes + atom = cls.get_atom_data(shape, kind=codes.dtype.name) + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + atom = cls.get_atom_datetime64(shape) + elif is_timedelta64_dtype(dtype): + atom = cls.get_atom_timedelta64(shape) + elif is_complex_dtype(dtype): + atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) + + elif is_string_dtype(dtype): + atom = cls.get_atom_string(shape, itemsize) + + else: + atom = cls.get_atom_data(shape, kind=dtype.name) + + return atom + + @classmethod + def get_atom_string(cls, shape, itemsize): + return _tables().StringCol(itemsize=itemsize, shape=shape[0]) + + @classmethod + def get_atom_coltype(cls, kind: str) -> Type["Col"]: + """ return the PyTables column class for this column """ + if kind.startswith("uint"): + k4 = kind[4:] + col_name = f"UInt{k4}Col" + elif kind.startswith("period"): + # we store as integer + col_name = "Int64Col" + else: + kcap = kind.capitalize() + col_name = f"{kcap}Col" + + return getattr(_tables(), col_name) + + @classmethod + def get_atom_data(cls, shape, kind: str) -> "Col": + return cls.get_atom_coltype(kind=kind)(shape=shape[0]) + + @classmethod + def get_atom_datetime64(cls, shape): + return _tables().Int64Col(shape=shape[0]) + + @classmethod + def get_atom_timedelta64(cls, shape): + return _tables().Int64Col(shape=shape[0]) + + @property + def shape(self): + return getattr(self.data, "shape", None) + + @property + def cvalues(self): + """ return my cython values """ + return self.data + + def validate_attr(self, append): + """validate that we have the same order as the existing & same dtype""" + if append: + existing_fields = getattr(self.attrs, self.kind_attr, None) + if existing_fields is not None and existing_fields != list(self.values): + raise ValueError("appended items do not match existing items in table!") + + existing_dtype = getattr(self.attrs, self.dtype_attr, None) + if existing_dtype is not None and existing_dtype != self.dtype: + raise ValueError( + "appended items dtype do not match existing " + "items dtype in table!" + ) + + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. 
+ + Parameters + ---------- + values : np.ndarray + nan_rep : + encoding : str + errors : str + + Returns + ------- + index : listlike to become an Index + data : ndarraylike to become a column + """ + assert isinstance(values, np.ndarray), type(values) + + # values is a recarray + if values.dtype.fields is not None: + values = values[self.cname] + + assert self.typ is not None + if self.dtype is None: + # Note: in tests we never have timedelta64 or datetime64, + # so the _get_data_and_dtype_name may be unnecessary + converted, dtype_name = _get_data_and_dtype_name(values) + kind = _dtype_to_kind(dtype_name) + else: + converted = values + dtype_name = self.dtype + kind = self.kind + + assert isinstance(converted, np.ndarray) # for mypy + + # use the meta if needed + meta = _ensure_decoded(self.meta) + metadata = self.metadata + ordered = self.ordered + tz = self.tz + + assert dtype_name is not None + # convert to the correct dtype + dtype = _ensure_decoded(dtype_name) + + # reverse converts + if dtype == "datetime64": + + # recreate with tz if indicated + converted = _set_tz(converted, tz, coerce=True) + + elif dtype == "timedelta64": + converted = np.asarray(converted, dtype="m8[ns]") + elif dtype == "date": + try: + converted = np.asarray( + [date.fromordinal(v) for v in converted], dtype=object + ) + except ValueError: + converted = np.asarray( + [date.fromtimestamp(v) for v in converted], dtype=object + ) + + elif meta == "category": + + # we have a categorical + categories = metadata + codes = converted.ravel() + + # if we have stored a NaN in the categories + # then strip it; in theory we could have BOTH + # -1s in the codes and nulls :< + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on readback + # the categories would be None and `read_hdf()` would fail. + categories = Index([], dtype=np.float64) + else: + mask = isna(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values + + converted = Categorical.from_codes( + codes, categories=categories, ordered=ordered + ) + + else: + + try: + converted = converted.astype(dtype, copy=False) + except TypeError: + converted = converted.astype("O", copy=False) + + # convert nans / decode + if _ensure_decoded(kind) == "string": + converted = _unconvert_string_array( + converted, nan_rep=nan_rep, encoding=encoding, errors=errors + ) + + return self.values, converted + + def set_attr(self): + """ set the data for this column """ + setattr(self.attrs, self.kind_attr, self.values) + setattr(self.attrs, self.meta_attr, self.meta) + assert self.dtype is not None + setattr(self.attrs, self.dtype_attr, self.dtype) + + +class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ + + is_data_indexable = True + + def validate_names(self): + if not Index(self.values).is_object(): + # TODO: should the message here be more specifically non-str? 
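+ # data columns become real pytables columns, so their labels must be + # object (string) dtype to serve as column names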
+ raise ValueError("cannot have non-object label DataIndexableCol") + + @classmethod + def get_atom_string(cls, shape, itemsize): + return _tables().StringCol(itemsize=itemsize) + + @classmethod + def get_atom_data(cls, shape, kind: str) -> "Col": + return cls.get_atom_coltype(kind=kind)() + + @classmethod + def get_atom_datetime64(cls, shape): + return _tables().Int64Col() + + @classmethod + def get_atom_timedelta64(cls, shape): + return _tables().Int64Col() + + +class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ + + pass + + +class Fixed: + """ represent an object in my store + facilitate read/write of various types of objects + this is an abstract base class + + Parameters + ---------- + parent : HDFStore + group : Node + The group node where the table resides. + """ + + pandas_kind: str + format_type: str = "fixed" # GH#30962 needed by dask + obj_type: Type[Union[DataFrame, Series]] + ndim: int + encoding: str + parent: HDFStore + group: "Node" + errors: str + is_table = False + + def __init__( + self, + parent: HDFStore, + group: "Node", + encoding: str = "UTF-8", + errors: str = "strict", + ): + assert isinstance(parent, HDFStore), type(parent) + assert _table_mod is not None # needed for mypy + assert isinstance(group, _table_mod.Node), type(group) + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) + self.errors = errors + + @property + def is_old_version(self) -> bool: + return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 + + @property + def version(self) -> Tuple[int, int, int]: + """ compute and set our version """ + version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) + try: + version = tuple(int(x) for x in version.split(".")) + if len(version) == 2: + version = version + (0,) + except AttributeError: + version = (0, 0, 0) + return version + + @property + def pandas_type(self): + return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) + + def __repr__(self) -> str: + """ return a pretty representation of myself """ + self.infer_axes() + s = self.shape + if s is not None: + if isinstance(s, (list, tuple)): + jshape = ",".join(pprint_thing(x) for x in s) + s = f"[{jshape}]" + return f"{self.pandas_type:12.12} (shape->{s})" + return self.pandas_type + + def set_object_info(self): + """ set my pandas type & version """ + self.attrs.pandas_type = str(self.pandas_kind) + self.attrs.pandas_version = str(_version) + + def copy(self): + new_self = copy.copy(self) + return new_self + + @property + def shape(self): + return self.nrows + + @property + def pathname(self): + return self.group._v_pathname + + @property + def _handle(self): + return self.parent._handle + + @property + def _filters(self): + return self.parent._filters + + @property + def _complevel(self) -> int: + return self.parent._complevel + + @property + def _fletcher32(self) -> bool: + return self.parent._fletcher32 + + @property + def attrs(self): + return self.group._v_attrs + + def set_attrs(self): + """ set our object attributes """ + pass + + def get_attrs(self): + """ get our object attributes """ + pass + + @property + def storable(self): + """ return my storable """ + return self.group + + @property + def is_exists(self) -> bool: + return False + + @property + def nrows(self): + return getattr(self.storable, "nrows", None) + + def validate(self, other): + """ validate against an existing storable """ + if other is None: + return + return True + + def 
validate_version(self, where=None): + """ are we trying to operate on an old version? """ + return True + + def infer_axes(self): + """ infer the axes of my storer + return a boolean indicating if we have a valid storer or not """ + + s = self.storable + if s is None: + return False + self.get_attrs() + return True + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement" + ) + + def write(self, **kwargs): + raise NotImplementedError( + "cannot write on an abstract storer: subclasses should implement" + ) + + def delete( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None + ): + """ + support fully deleting the node in its entirety (only) - where + specification must be None + """ + if com.all_none(where, start, stop): + self._handle.remove_node(self.group, recursive=True) + return None + + raise TypeError("cannot delete on an abstract storer") + + +class GenericFixed(Fixed): + """ a generified fixed version """ + + _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} + _reverse_index_map = {v: k for k, v in _index_type_map.items()} + attributes: List[str] = [] + + # indexer helpders + def _class_to_alias(self, cls) -> str: + return self._index_type_map.get(cls, "") + + def _alias_to_class(self, alias): + if isinstance(alias, type): # pragma: no cover + # compat: for a short period of time master stored types + return alias + return self._reverse_index_map.get(alias, Index) + + def _get_index_factory(self, klass): + if klass == DatetimeIndex: + + def f(values, freq=None, tz=None): + # data are already in UTC, localize and convert if tz present + result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) + if tz is not None: + result = result.tz_localize("UTC").tz_convert(tz) + return result + + return f + elif klass == PeriodIndex: + + def f(values, freq=None, tz=None): + return PeriodIndex._simple_new(values, name=None, freq=freq) + + return f + + return klass + + def validate_read(self, columns, where): + """ + raise if any keywords are passed which are not-None + """ + if columns is not None: + raise TypeError( + "cannot pass a column specification when reading " + "a Fixed format store. this store must be " + "selected in its entirety" + ) + if where is not None: + raise TypeError( + "cannot pass a where specification when reading " + "from a Fixed format store. 
this store must be " + "selected in its entirety" + ) + + @property + def is_exists(self) -> bool: + return True + + def set_attrs(self): + """ set our object attributes """ + self.attrs.encoding = self.encoding + self.attrs.errors = self.errors + + def get_attrs(self): + """ retrieve our attributes """ + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + for n in self.attributes: + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) + + def write(self, obj, **kwargs): + self.set_attrs() + + def read_array( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ): + """ read an array for the specified node (off of group """ + import tables + + node = getattr(self.group, key) + attrs = node._v_attrs + + transposed = getattr(attrs, "transposed", False) + + if isinstance(node, tables.VLArray): + ret = node[0][start:stop] + else: + dtype = getattr(attrs, "value_type", None) + shape = getattr(attrs, "shape", None) + + if shape is not None: + # length 0 axis + ret = np.empty(shape, dtype=dtype) + else: + ret = node[start:stop] + + if dtype == "datetime64": + + # reconstruct a timezone if indicated + tz = getattr(attrs, "tz", None) + ret = _set_tz(ret, tz, coerce=True) + + elif dtype == "timedelta64": + ret = np.asarray(ret, dtype="m8[ns]") + + if transposed: + return ret.T + else: + return ret + + def read_index( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ) -> Index: + variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) + + if variety == "multi": + return self.read_multi_index(key, start=start, stop=stop) + elif variety == "regular": + node = getattr(self.group, key) + index = self.read_index_node(node, start=start, stop=stop) + return index + else: # pragma: no cover + raise TypeError(f"unrecognized index variety: {variety}") + + def write_index(self, key: str, index: Index): + if isinstance(index, MultiIndex): + setattr(self.attrs, f"{key}_variety", "multi") + self.write_multi_index(key, index) + else: + setattr(self.attrs, f"{key}_variety", "regular") + converted = _convert_index("index", index, self.encoding, self.errors) + + self.write_array(key, converted.values) + + node = getattr(self.group, key) + node._v_attrs.kind = converted.kind + node._v_attrs.name = index.name + + if isinstance(index, (DatetimeIndex, PeriodIndex)): + node._v_attrs.index_class = self._class_to_alias(type(index)) + + if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + node._v_attrs.freq = index.freq + + if isinstance(index, DatetimeIndex) and index.tz is not None: + node._v_attrs.tz = _get_tz(index.tz) + + def write_multi_index(self, key: str, index: MultiIndex): + setattr(self.attrs, f"{key}_nlevels", index.nlevels) + + for i, (lev, level_codes, name) in enumerate( + zip(index.levels, index.codes, index.names) + ): + # write the level + if is_extension_array_dtype(lev): + raise NotImplementedError( + "Saving a MultiIndex with an extension dtype is not supported." 
+ ) + level_key = f"{key}_level{i}" + conv_level = _convert_index(level_key, lev, self.encoding, self.errors) + self.write_array(level_key, conv_level.values) + node = getattr(self.group, level_key) + node._v_attrs.kind = conv_level.kind + node._v_attrs.name = name + + # write the name + setattr(node._v_attrs, f"{key}_name{name}", name) + + # write the labels + label_key = f"{key}_label{i}" + self.write_array(label_key, level_codes) + + def read_multi_index( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ) -> MultiIndex: + nlevels = getattr(self.attrs, f"{key}_nlevels") + + levels = [] + codes = [] + names: List[Optional[Hashable]] = [] + for i in range(nlevels): + level_key = f"{key}_level{i}" + node = getattr(self.group, level_key) + lev = self.read_index_node(node, start=start, stop=stop) + levels.append(lev) + names.append(lev.name) + + label_key = f"{key}_label{i}" + level_codes = self.read_array(label_key, start=start, stop=stop) + codes.append(level_codes) + + return MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=True + ) + + def read_index_node( + self, node: "Node", start: Optional[int] = None, stop: Optional[int] = None + ) -> Index: + data = node[start:stop] + # If the index was an empty array write_array_empty() will + # have written a sentinel. Here we relace it with the original. + if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) + kind = _ensure_decoded(node._v_attrs.kind) + name = None + + if "name" in node._v_attrs: + name = _ensure_str(node._v_attrs.name) + name = _ensure_decoded(name) + + index_class = self._alias_to_class( + _ensure_decoded(getattr(node._v_attrs, "index_class", "")) + ) + factory = self._get_index_factory(index_class) + + kwargs = {} + if "freq" in node._v_attrs: + kwargs["freq"] = node._v_attrs["freq"] + + if "tz" in node._v_attrs: + if isinstance(node._v_attrs["tz"], bytes): + # created by python2 + kwargs["tz"] = node._v_attrs["tz"].decode("utf-8") + else: + # created by python3 + kwargs["tz"] = node._v_attrs["tz"] + + if kind == "date": + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=object, + **kwargs, + ) + else: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + + index.name = name + + return index + + def write_array_empty(self, key: str, value: ArrayLike): + """ write a 0-len array """ + + # ugly hack for length 0 axes + arr = np.empty((1,) * value.ndim) + self._handle.create_array(self.group, key, arr) + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) + node._v_attrs.shape = value.shape + + def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None): + # TODO: we only have one test that gets here, the only EA + # that gets passed is DatetimeArray, and we never have + # both self._filters and EA + assert isinstance(value, (np.ndarray, ABCExtensionArray)), type(value) + + if key in self.group: + self._handle.remove_node(self.group, key) + + # Transform needed to interface with pytables row/col notation + empty_array = value.size == 0 + transposed = False + + if is_categorical_dtype(value): + raise NotImplementedError( + "Cannot store a category dtype in " + "a HDF5 dataset that uses format=" + '"fixed". Use format="table".' + ) + if not empty_array: + if hasattr(value, "T"): + # ExtensionArrays (1d) may not have transpose. 
+ value = value.T + transposed = True + + atom = None + if self._filters is not None: + try: + # get the atom for this datatype + atom = _tables().Atom.from_dtype(value.dtype) + except ValueError: + pass + + if atom is not None: + # We only get here if self._filters is non-None and + # the Atom.from_dtype call succeeded + + # create an empty chunked array and fill it from value + if not empty_array: + ca = self._handle.create_carray( + self.group, key, atom, value.shape, filters=self._filters + ) + ca[:] = value + + else: + self.write_array_empty(key, value) + + elif value.dtype.type == np.object_: + + # infer the type, warn if we have a non-string type here (for + # performance) + inferred_type = lib.infer_dtype(value.ravel(), skipna=False) + if empty_array: + pass + elif inferred_type == "string": + pass + else: + ws = performance_doc % (inferred_type, key, items) + warnings.warn(ws, PerformanceWarning, stacklevel=7) + + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) + vlarr.append(value) + + elif empty_array: + self.write_array_empty(key, value) + elif is_datetime64_dtype(value.dtype): + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "datetime64" + elif is_datetime64tz_dtype(value.dtype): + # store as UTC + # with a zone + self._handle.create_array(self.group, key, value.asi8) + + node = getattr(self.group, key) + node._v_attrs.tz = _get_tz(value.tz) + node._v_attrs.value_type = "datetime64" + elif is_timedelta64_dtype(value.dtype): + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "timedelta64" + else: + self._handle.create_array(self.group, key, value) + + getattr(self.group, key)._v_attrs.transposed = transposed + + +class SeriesFixed(GenericFixed): + pandas_kind = "series" + attributes = ["name"] + + name: Optional[Hashable] + + @property + def shape(self): + try: + return (len(self.group.values),) + except (TypeError, AttributeError): + return None + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + self.validate_read(columns, where) + index = self.read_index("index", start=start, stop=stop) + values = self.read_array("values", start=start, stop=stop) + return Series(values, index=index, name=self.name) + + def write(self, obj, **kwargs): + super().write(obj, **kwargs) + self.write_index("index", obj.index) + self.write_array("values", obj.values) + self.attrs.name = obj.name + + +class BlockManagerFixed(GenericFixed): + attributes = ["ndim", "nblocks"] + + nblocks: int + + @property + def shape(self): + try: + ndim = self.ndim + + # items + items = 0 + for i in range(self.nblocks): + node = getattr(self.group, f"block{i}_items") + shape = getattr(node, "shape", None) + if shape is not None: + items += shape[0] + + # data shape + node = self.group.block0_values + shape = getattr(node, "shape", None) + if shape is not None: + shape = list(shape[0 : (ndim - 1)]) + else: + shape = [] + + shape.append(items) + + return shape + except AttributeError: + return None + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + # start, stop applied to rows, so 0th axis only + self.validate_read(columns, where) + select_axis = self.obj_type()._get_block_manager_axis(0) + + axes = [] + for i in range(self.ndim): + + _start, _stop = (start, stop) if i == select_axis else (None, None) + ax = 
self.read_index(f"axis{i}", start=_start, stop=_stop) + axes.append(ax) + + items = axes[0] + dfs = [] + + for i in range(self.nblocks): + + blk_items = self.read_index(f"block{i}_items") + values = self.read_array(f"block{i}_values", start=_start, stop=_stop) + + columns = items[items.get_indexer(blk_items)] + df = DataFrame(values.T, columns=columns, index=axes[1]) + dfs.append(df) + + if len(dfs) > 0: + out = concat(dfs, axis=1) + out = out.reindex(columns=items, copy=False) + return out + + return DataFrame(columns=axes[0], index=axes[1]) + + def write(self, obj, **kwargs): + super().write(obj, **kwargs) + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() + + self.attrs.ndim = data.ndim + for i, ax in enumerate(data.axes): + if i == 0: + if not ax.is_unique: + raise ValueError("Columns index has to be unique for fixed format") + self.write_index(f"axis{i}", ax) + + # Supporting mixed-type DataFrame objects...nontrivial + self.attrs.nblocks = len(data.blocks) + for i, blk in enumerate(data.blocks): + # I have no idea why, but writing values before items fixed #2299 + blk_items = data.items.take(blk.mgr_locs) + self.write_array(f"block{i}_values", blk.values, items=blk_items) + self.write_index(f"block{i}_items", blk_items) + + +class FrameFixed(BlockManagerFixed): + pandas_kind = "frame" + obj_type = DataFrame + + +class Table(Fixed): + """ represent a table: + facilitate read/write of various types of tables + + Attrs in Table Node + ------------------- + These are attributes that are store in the main table node, they are + necessary to recreate these tables when read back in. + + index_axes : a list of tuples of the (original indexing axis and + index column) + non_index_axes: a list of tuples of the (original index axis and + columns on a non-indexing axis) + values_axes : a list of the columns which comprise the data of this + table + data_columns : a list of the columns that we are allowing indexing + (these become single columns in values_axes), or True to force all + columns + nan_rep : the string to use for nan representations for string + objects + levels : the names of levels + metadata : the names of the metadata columns + + """ + + pandas_kind = "wide_table" + format_type: str = "table" # GH#30962 needed by dask + table_type: str + levels = 1 + is_table = True + + index_axes: List[IndexCol] + non_index_axes: List[Tuple[int, Any]] + values_axes: List[DataCol] + data_columns: List + metadata: List + info: Dict + + def __init__( + self, + parent: HDFStore, + group: "Node", + encoding=None, + errors: str = "strict", + index_axes=None, + non_index_axes=None, + values_axes=None, + data_columns=None, + info=None, + nan_rep=None, + ): + super().__init__(parent, group, encoding=encoding, errors=errors) + self.index_axes = index_axes or [] + self.non_index_axes = non_index_axes or [] + self.values_axes = values_axes or [] + self.data_columns = data_columns or [] + self.info = info or dict() + self.nan_rep = nan_rep + + @property + def table_type_short(self) -> str: + return self.table_type.split("_")[0] + + def __repr__(self) -> str: + """ return a pretty representation of myself """ + self.infer_axes() + jdc = ",".join(self.data_columns) if len(self.data_columns) else "" + dc = f",dc->[{jdc}]" + + ver = "" + if self.is_old_version: + jver = ".".join(str(x) for x in self.version) + ver = f"[{jver}]" + + jindex_axes = ",".join(a.name for a in self.index_axes) + return ( + f"{self.pandas_type:12.12}{ver} " + 
f"(typ->{self.table_type_short},nrows->{self.nrows}," + f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" + ) + + def __getitem__(self, c: str): + """ return the axis for c """ + for a in self.axes: + if c == a.name: + return a + return None + + def validate(self, other): + """ validate against an existing table """ + if other is None: + return + + if other.table_type != self.table_type: + raise TypeError( + "incompatible table_type with existing " + f"[{other.table_type} - {self.table_type}]" + ) + + for c in ["index_axes", "non_index_axes", "values_axes"]: + sv = getattr(self, c, None) + ov = getattr(other, c, None) + if sv != ov: + + # show the error for the specific axes + for i, sax in enumerate(sv): + oax = ov[i] + if sax != oax: + raise ValueError( + f"invalid combination of [{c}] on appending data " + f"[{sax}] vs current table [{oax}]" + ) + + # should never get here + raise Exception( + f"invalid combination of [{c}] on appending data [{sv}] vs " + f"current table [{ov}]" + ) + + @property + def is_multi_index(self) -> bool: + """the levels attribute is 1 or a list in the case of a multi-index""" + return isinstance(self.levels, list) + + def validate_multiindex(self, obj): + """validate that we can store the multi-index; reset and return the + new object + """ + levels = [ + l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) + ] + try: + return obj.reset_index(), levels + except ValueError: + raise ValueError( + "duplicate names/columns in the multi-index when storing as a table" + ) + + @property + def nrows_expected(self) -> int: + """ based on our axes, compute the expected nrows """ + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) + + @property + def is_exists(self) -> bool: + """ has this table been created """ + return "table" in self.group + + @property + def storable(self): + return getattr(self.group, "table", None) + + @property + def table(self): + """ return the table group (this is my storable) """ + return self.storable + + @property + def dtype(self): + return self.table.dtype + + @property + def description(self): + return self.table.description + + @property + def axes(self): + return itertools.chain(self.index_axes, self.values_axes) + + @property + def ncols(self) -> int: + """ the number of total columns in the values axes """ + return sum(len(a.values) for a in self.values_axes) + + @property + def is_transposed(self) -> bool: + return False + + @property + def data_orientation(self): + """return a tuple of my permutated axes, non_indexable at the front""" + return tuple( + itertools.chain( + [int(a[0]) for a in self.non_index_axes], + [int(a.axis) for a in self.index_axes], + ) + ) + + def queryables(self) -> Dict[str, Any]: + """ return a dict of the kinds allowable columns for this object """ + + # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here + axis_names = {0: "index", 1: "columns"} + + # compute the values_axes queryables + d1 = [(a.cname, a) for a in self.index_axes] + d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] + d3 = [ + (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) + ] + + return dict(d1 + d2 + d3) # type: ignore + # error: List comprehension has incompatible type + # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]] + + def index_cols(self): + """ return a list of my index cols """ + # Note: each `i.cname` below is assured to be a str. 
+ return [(i.axis, i.cname) for i in self.index_axes] + + def values_cols(self) -> List[str]: + """ return a list of my values cols """ + return [i.cname for i in self.values_axes] + + def _get_metadata_path(self, key: str) -> str: + """ return the metadata pathname for this key """ + group = self.group._v_pathname + return f"{group}/meta/{key}/meta" + + def write_metadata(self, key: str, values: np.ndarray): + """ + Write out a metadata array to the key as a fixed-format Series. + + Parameters + ---------- + key : str + values : ndarray + """ + values = Series(values) + self.parent.put( + self._get_metadata_path(key), + values, + format="table", + encoding=self.encoding, + errors=self.errors, + nan_rep=self.nan_rep, + ) + + def read_metadata(self, key: str): + """ return the meta data array for this key """ + if getattr(getattr(self.group, "meta", None), key, None) is not None: + return self.parent.select(self._get_metadata_path(key)) + return None + + def set_attrs(self): + """ set our table type & indexables """ + self.attrs.table_type = str(self.table_type) + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() + self.attrs.non_index_axes = self.non_index_axes + self.attrs.data_columns = self.data_columns + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.errors = self.errors + self.attrs.levels = self.levels + self.attrs.info = self.info + + def get_attrs(self): + """ retrieve our attributes """ + self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] + self.data_columns = getattr(self.attrs, "data_columns", None) or [] + self.info = getattr(self.attrs, "info", None) or dict() + self.nan_rep = getattr(self.attrs, "nan_rep", None) + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + self.levels = getattr(self.attrs, "levels", None) or [] + self.index_axes = [a for a in self.indexables if a.is_an_indexable] + self.values_axes = [a for a in self.indexables if not a.is_an_indexable] + + def validate_version(self, where=None): + """ are we trying to operate on an old version? """ + if where is not None: + if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + ws = incompatibility_doc % ".".join([str(x) for x in self.version]) + warnings.warn(ws, IncompatibilityWarning) + + def validate_min_itemsize(self, min_itemsize): + """validate the min_itemsize doesn't contain items that are not in the + axes this needs data_columns to be defined + """ + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return + + q = self.queryables() + for k, v in min_itemsize.items(): + + # ok, apply generally + if k == "values": + continue + if k not in q: + raise ValueError( + f"min_itemsize has the key [{k}] which is not an axis or " + "data_column" + ) + + @cache_readonly + def indexables(self): + """ create/cache the indexables if they don't exist """ + _indexables = [] + + desc = self.description + table_attrs = self.table.attrs + + # Note: each of the `name` kwargs below are str, ensured + # by the definition in index_cols. 
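+ # IndexCol objects are rebuilt from attrs.index_cols first, then the + # value columns from attrs.values_cols, positioned after the index columns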
+ # index columns + for i, (axis, name) in enumerate(self.attrs.index_cols): + atom = getattr(desc, name) + md = self.read_metadata(name) + meta = "category" if md is not None else None + + kind_attr = f"{name}_kind" + kind = getattr(table_attrs, kind_attr, None) + + index_col = IndexCol( + name=name, + axis=axis, + pos=i, + kind=kind, + typ=atom, + table=self.table, + meta=meta, + metadata=md, + ) + _indexables.append(index_col) + + # values columns + dc = set(self.data_columns) + base_pos = len(_indexables) + + def f(i, c): + assert isinstance(c, str) + klass = DataCol + if c in dc: + klass = DataIndexableCol + + atom = getattr(desc, c) + adj_name = _maybe_adjust_name(c, self.version) + + # TODO: why kind_attr here? + values = getattr(table_attrs, f"{adj_name}_kind", None) + dtype = getattr(table_attrs, f"{adj_name}_dtype", None) + kind = _dtype_to_kind(dtype) + + md = self.read_metadata(c) + # TODO: figure out why these two versions of `meta` dont always match. + # meta = "category" if md is not None else None + meta = getattr(table_attrs, f"{adj_name}_meta", None) + + obj = klass( + name=adj_name, + cname=c, + values=values, + kind=kind, + pos=base_pos + i, + typ=atom, + table=self.table, + meta=meta, + metadata=md, + dtype=dtype, + ) + return obj + + # Note: the definition of `values_cols` ensures that each + # `c` below is a str. + _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) + + return _indexables + + def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): + """ + Create a pytables index on the specified columns. + + Parameters + ---------- + columns : None, bool, or listlike[str] + Indicate which columns to create an index on. + + * False : Do not create any indexes. + * True : Create indexes on all columns. + * None : Create indexes on all columns. + * listlike : Create indexes on the given columns. + + optlevel : int or None, default None + Optimization level, if None, pytables defaults to 6. + kind : str or None, default None + Kind of index, if None, pytables defaults to "medium". + + Raises + ------ + TypeError if trying to create an index on a complex-type column. + + Notes + ----- + Cannot index Time64Col or ComplexCol. + Pytables must be >= 3.0. + """ + + if not self.infer_axes(): + return + if columns is False: + return + + # index all indexables and data_columns + if columns is None or columns is True: + columns = [a.cname for a in self.axes if a.is_data_indexable] + if not isinstance(columns, (tuple, list)): + columns = [columns] + + kw = dict() + if optlevel is not None: + kw["optlevel"] = optlevel + if kind is not None: + kw["kind"] = kind + + table = self.table + for c in columns: + v = getattr(table.cols, c, None) + if v is not None: + + # remove the index if the kind/optlevel have changed + if v.is_indexed: + index = v.index + cur_optlevel = index.optlevel + cur_kind = index.kind + + if kind is not None and cur_kind != kind: + v.remove_index() + else: + kw["kind"] = cur_kind + + if optlevel is not None and cur_optlevel != optlevel: + v.remove_index() + else: + kw["optlevel"] = cur_optlevel + + # create the index + if not v.is_indexed: + if v.type.startswith("complex"): + raise TypeError( + "Columns containing complex values can be stored but " + "cannot be indexed when using table format. Either use " + "fixed format, set index=False, or do not include " + "the columns containing complex values to " + "data_columns when initializing the table." 
+ ) + v.create_index(**kw) + + def _read_axes( + self, where, start: Optional[int] = None, stop: Optional[int] = None + ) -> List[Tuple[ArrayLike, ArrayLike]]: + """ + Create the axes sniffed from the table. + + Parameters + ---------- + where : ??? + start : int or None, default None + stop : int or None, default None + + Returns + ------- + List[Tuple[index_values, column_values]] + """ + + # create the selection + selection = Selection(self, where=where, start=start, stop=stop) + values = selection.select() + + results = [] + # convert the data + for a in self.axes: + a.set_info(self.info) + res = a.convert( + values, + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + ) + results.append(res) + + return results + + @classmethod + def get_object(cls, obj, transposed: bool): + """ return the data for this obj """ + return obj + + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): + """take the input data_columns and min_itemize and create a data + columns spec + """ + + if not len(non_index_axes): + return [] + + axis, axis_labels = non_index_axes[0] + info = self.info.get(axis, dict()) + if info.get("type") == "MultiIndex" and data_columns: + raise ValueError( + f"cannot use a multi-index on axis [{axis}] with " + f"data_columns {data_columns}" + ) + + # evaluate the passed data_columns, True == use all columns + # take only valide axis labels + if data_columns is True: + data_columns = list(axis_labels) + elif data_columns is None: + data_columns = [] + + # if min_itemsize is a dict, add the keys (exclude 'values') + if isinstance(min_itemsize, dict): + + existing_data_columns = set(data_columns) + data_columns = list(data_columns) # ensure we do not modify + data_columns.extend( + [ + k + for k in min_itemsize.keys() + if k != "values" and k not in existing_data_columns + ] + ) + + # return valid columns in the order of our axis + return [c for c in data_columns if c in axis_labels] + + def _create_axes( + self, + axes, + obj: DataFrame, + validate: bool = True, + nan_rep=None, + data_columns=None, + min_itemsize=None, + ): + """ + Create and return the axes. + + Parameters + ---------- + axes: list or None + The names or numbers of the axes to create. + obj : DataFrame + The object to create axes on. + validate: bool, default True + Whether to validate the obj against an existing object already written. + nan_rep : + A value to use for string column nan_rep. + data_columns : List[str], True, or None, default None + Specify the columns that we want to create to allow indexing on. + + * True : Use all available columns. + * None : Use no columns. + * List[str] : Use the specified columns. + + min_itemsize: Dict[str, int] or None, default None + The min itemsize for a column in bytes. + """ + + if not isinstance(obj, DataFrame): + group = self.group._v_name + raise TypeError( + f"cannot properly create the storer for: [group->{group}," + f"value->{type(obj)}]" + ) + + # set the default axes if needed + if axes is None: + axes = [0] + + # map axes to numbers + axes = [obj._get_axis_number(a) for a in axes] + + # do we have an existing table (if so, use its axes & data_columns) + if self.infer_axes(): + table_exists = True + axes = [a.axis for a in self.index_axes] + data_columns = list(self.data_columns) + nan_rep = self.nan_rep + # TODO: do we always have validate=True here? 
+ else: + table_exists = False + + new_info = self.info + + assert self.ndim == 2 # with next check, we must have len(axes) == 1 + # currently support on ndim-1 axes + if len(axes) != self.ndim - 1: + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable" + ) + + # create according to the new data + new_non_index_axes: List = [] + + # nan_representation + if nan_rep is None: + nan_rep = "nan" + + # We construct the non-index-axis first, since that alters new_info + idx = [x for x in [0, 1] if x not in axes][0] + + a = obj.axes[idx] + # we might be able to change the axes on the appending data if necessary + append_axis = list(a) + if table_exists: + indexer = len(new_non_index_axes) # i.e. 0 + exist_axis = self.non_index_axes[indexer][1] + if not array_equivalent(np.array(append_axis), np.array(exist_axis)): + + # ahah! -> reindex + if array_equivalent( + np.array(sorted(append_axis)), np.array(sorted(exist_axis)) + ): + append_axis = exist_axis + + # the non_index_axes info + info = new_info.setdefault(idx, {}) + info["names"] = list(a.names) + info["type"] = type(a).__name__ + + new_non_index_axes.append((idx, append_axis)) + + # Now we can construct our new index axis + idx = axes[0] + a = obj.axes[idx] + axis_name = obj._AXIS_NAMES[idx] + new_index = _convert_index(axis_name, a, self.encoding, self.errors) + new_index.axis = idx + + # Because we are always 2D, there is only one new_index, so + # we know it will have pos=0 + new_index.set_pos(0) + new_index.update_info(new_info) + new_index.maybe_set_size(min_itemsize) # check for column conflicts + + new_index_axes = [new_index] + j = len(new_index_axes) # i.e. 1 + assert j == 1 + + # reindex by our non_index_axes & compute data_columns + assert len(new_non_index_axes) == 1 + for a in new_non_index_axes: + obj = _reindex_axis(obj, a[0], a[1]) + + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.mgr_locs) for blk in blocks] + + transposed = new_index.axis == 1 + + # figure out data_columns and get out blocks + data_columns = self.validate_data_columns( + data_columns, min_itemsize, new_non_index_axes + ) + + block_obj = self.get_object(obj, transposed)._consolidate() + + blocks, blk_items = self._get_blocks_and_items( + block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns + ) + + # add my values + vaxes = [] + for i, (b, b_items) in enumerate(zip(blocks, blk_items)): + + # shape of the data column are the indexable axes + klass = DataCol + name = None + + # we have a data_column + if data_columns and len(b_items) == 1 and b_items[0] in data_columns: + klass = DataIndexableCol + name = b_items[0] + if not (name is None or isinstance(name, str)): + # TODO: should the message here be more specifically non-str? 
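+ # each requested data_column was split into its own single-column block + # by _get_blocks_and_items, so b_items[0] is that column's label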
+ raise ValueError("cannot have non-object label DataIndexableCol") + + # make sure that we match up the existing columns + # if we have an existing table + existing_col: Optional[DataCol] + + if table_exists and validate: + try: + existing_col = self.values_axes[i] + except (IndexError, KeyError): + raise ValueError( + f"Incompatible appended table [{blocks}]" + f"with existing table [{self.values_axes}]" + ) + else: + existing_col = None + + new_name = name or f"values_block_{i}" + data_converted = _maybe_convert_for_string_atom( + new_name, + b, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + ) + adj_name = _maybe_adjust_name(new_name, self.version) + + typ = klass._get_atom(data_converted) + kind = _dtype_to_kind(data_converted.dtype.name) + tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None + + meta = metadata = ordered = None + if is_categorical_dtype(data_converted): + ordered = data_converted.ordered + meta = "category" + metadata = np.array(data_converted.categories, copy=False).ravel() + + data, dtype_name = _get_data_and_dtype_name(data_converted) + + col = klass( + name=adj_name, + cname=new_name, + values=list(b_items), + typ=typ, + pos=j, + kind=kind, + tz=tz, + ordered=ordered, + meta=meta, + metadata=metadata, + dtype=dtype_name, + data=data, + ) + col.update_info(new_info) + + vaxes.append(col) + + j += 1 + + dcs = [col.name for col in vaxes if col.is_data_indexable] + + new_table = type(self)( + parent=self.parent, + group=self.group, + encoding=self.encoding, + errors=self.errors, + index_axes=new_index_axes, + non_index_axes=new_non_index_axes, + values_axes=vaxes, + data_columns=dcs, + info=new_info, + nan_rep=nan_rep, + ) + if hasattr(self, "levels"): + # TODO: get this into constructor, only for appropriate subclass + new_table.levels = self.levels + + new_table.validate_min_itemsize(min_itemsize) + + if validate and table_exists: + new_table.validate(self) + + return new_table + + @staticmethod + def _get_blocks_and_items( + block_obj, table_exists, new_non_index_axes, values_axes, data_columns + ): + # Helper to clarify non-state-altering parts of _create_axes + + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.mgr_locs) for blk in blocks] + + blocks = block_obj._data.blocks + blk_items = get_blk_items(block_obj._data, blocks) + + if len(data_columns): + axis, axis_labels = new_non_index_axes[0] + new_labels = Index(axis_labels).difference(Index(data_columns)) + mgr = block_obj.reindex(new_labels, axis=axis)._data + + blocks = list(mgr.blocks) + blk_items = get_blk_items(mgr, blocks) + for c in data_columns: + mgr = block_obj.reindex([c], axis=axis)._data + blocks.extend(mgr.blocks) + blk_items.extend(get_blk_items(mgr, mgr.blocks)) + + # reorder the blocks in the same order as the existing table if we can + if table_exists: + by_items = { + tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items) + } + new_blocks = [] + new_blk_items = [] + for ea in values_axes: + items = tuple(ea.values) + try: + b, b_items = by_items.pop(items) + new_blocks.append(b) + new_blk_items.append(b_items) + except (IndexError, KeyError): + jitems = ",".join(pprint_thing(item) for item in items) + raise ValueError( + f"cannot match existing table structure for [{jitems}] " + "on appending data" + ) + blocks = new_blocks + blk_items = new_blk_items + + return blocks, blk_items + + def process_axes(self, obj, selection: "Selection", columns=None): + 
""" process axes filters """ + + # make a copy to avoid side effects + if columns is not None: + columns = list(columns) + + # make sure to include levels if we have them + if columns is not None and self.is_multi_index: + assert isinstance(self.levels, list) # assured by is_multi_index + for n in self.levels: + if n not in columns: + columns.insert(0, n) + + # reorder by any non_index_axes & limit to the select columns + for axis, labels in self.non_index_axes: + obj = _reindex_axis(obj, axis, labels, columns) + + # apply the selection filters (but keep in the same order) + if selection.filter is not None: + for field, op, filt in selection.filter.format(): + + def process_filter(field, filt): + + for axis_name in obj._AXIS_NAMES.values(): + axis_number = obj._get_axis_number(axis_name) + axis_values = obj._get_axis(axis_name) + assert axis_number is not None + + # see if the field is the name of an axis + if field == axis_name: + + # if we have a multi-index, then need to include + # the levels + if self.is_multi_index: + filt = filt.union(Index(self.levels)) + + takers = op(axis_values, filt) + return obj.loc(axis=axis_number)[takers] + + # this might be the name of a file IN an axis + elif field in axis_values: + + # we need to filter on this dimension + values = ensure_index(getattr(obj, field).values) + filt = ensure_index(filt) + + # hack until we support reversed dim flags + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.loc(axis=axis_number)[takers] + + raise ValueError(f"cannot find the field [{field}] for filtering!") + + obj = process_filter(field, filt) + + return obj + + def create_description( + self, + complib, + complevel: Optional[int], + fletcher32: bool, + expectedrows: Optional[int], + ) -> Dict[str, Any]: + """ create the description of the table from the axes & values """ + + # provided expected rows if its passed + if expectedrows is None: + expectedrows = max(self.nrows_expected, 10000) + + d = dict(name="table", expectedrows=expectedrows) + + # description from the axes & values + d["description"] = {a.cname: a.typ for a in self.axes} + + if complib: + if complevel is None: + complevel = self._complevel or 9 + filters = _tables().Filters( + complevel=complevel, + complib=complib, + fletcher32=fletcher32 or self._fletcher32, + ) + d["filters"] = filters + elif self._filters is not None: + d["filters"] = self._filters + + return d + + def read_coordinates( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + ): + """select coordinates (row numbers) from a table; return the + coordinates object + """ + + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): + return False + + # create the selection + selection = Selection(self, where=where, start=start, stop=stop) + coords = selection.select_coords() + if selection.filter is not None: + for field, op, filt in selection.filter.format(): + data = self.read_column( + field, start=coords.min(), stop=coords.max() + 1 + ) + coords = coords[op(data.iloc[coords - coords.min()], filt).values] + + return Index(coords) + + def read_column( + self, + column: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + """return a single column from the table, generally only indexables + are interesting + """ + + # validate the version + self.validate_version() + + # infer the data kind + if not self.infer_axes(): + return False + + if where is not None: + raise 
TypeError("read_column does not currently accept a where clause") + + # find the axes + for a in self.axes: + if column == a.name: + + if not a.is_data_indexable: + raise ValueError( + f"column [{column}] can not be extracted individually; " + "it is not data indexable" + ) + + # column must be an indexable or a data column + c = getattr(self.table.cols, column) + a.set_info(self.info) + col_values = a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + ) + return Series(_set_tz(col_values[1], a.tz), name=column) + + raise KeyError(f"column [{column}] not found in the table") + + +class WORMTable(Table): + """ a write-once read-many table: this format DOES NOT ALLOW appending to a + table. writing is a one-time operation the data are stored in a format + that allows for searching the data on disk + """ + + table_type = "worm" + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + """ read the indices and the indexing array, calculate offset rows and + return """ + raise NotImplementedError("WORMTable needs to implement read") + + def write(self, **kwargs): + """ write in a format that we can search later on (but cannot append + to): write out the indices and the values using _write_array + (e.g. a CArray) create an indexing table so that we can search + """ + raise NotImplementedError("WORMTable needs to implement write") + + +class AppendableTable(Table): + """ support the new appendable table formats """ + + table_type = "appendable" + + def write( + self, + obj, + axes=None, + append=False, + complib=None, + complevel=None, + fletcher32=None, + min_itemsize=None, + chunksize=None, + expectedrows=None, + dropna=False, + nan_rep=None, + data_columns=None, + ): + + if not append and self.is_exists: + self._handle.remove_node(self.group, "table") + + # create the axes + table = self._create_axes( + axes=axes, + obj=obj, + validate=append, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + data_columns=data_columns, + ) + + for a in table.axes: + a.validate_names() + + if not table.is_exists: + + # create the table + options = table.create_description( + complib=complib, + complevel=complevel, + fletcher32=fletcher32, + expectedrows=expectedrows, + ) + + # set the table attributes + table.set_attrs() + + # create the table + table._handle.create_table(table.group, **options) + + # update my info + table.attrs.info = table.info + + # validate the axes and set the kinds + for a in table.axes: + a.validate_and_set(table, append) + + # add the rows + table.write_data(chunksize, dropna=dropna) + + def write_data(self, chunksize: Optional[int], dropna: bool = False): + """ we form the data into a 2-d including indexes,values,mask + write chunk-by-chunk """ + + names = self.dtype.names + nrows = self.nrows_expected + + # if dropna==True, then drop ALL nan rows + masks = [] + if dropna: + + for a in self.values_axes: + + # figure the mask: only do if we can successfully process this + # column, otherwise ignore the mask + mask = isna(a.data).all(axis=0) + if isinstance(mask, np.ndarray): + masks.append(mask.astype("u1", copy=False)) + + # consolidate masks + if len(masks): + mask = masks[0] + for m in masks[1:]: + mask = mask & m + mask = mask.ravel() + else: + mask = None + + # broadcast the indexes if needed + indexes = [a.cvalues for a in self.index_axes] + nindexes = len(indexes) + assert nindexes == 1, nindexes # ensures we dont need to broadcast + + # transpose the values so first 
dimension is last + # reshape the values if needed + values = [a.take_data() for a in self.values_axes] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values] + bvalues = [] + for i, v in enumerate(values): + new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape + bvalues.append(values[i].reshape(new_shape)) + + # write the chunks + if chunksize is None: + chunksize = 100000 + + rows = np.empty(min(chunksize, nrows), dtype=self.dtype) + chunks = int(nrows / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self.write_data_chunk( + rows, + indexes=[a[start_i:end_i] for a in indexes], + mask=mask[start_i:end_i] if mask is not None else None, + values=[v[start_i:end_i] for v in bvalues], + ) + + def write_data_chunk( + self, + rows: np.ndarray, + indexes: List[np.ndarray], + mask: Optional[np.ndarray], + values: List[np.ndarray], + ): + """ + Parameters + ---------- + rows : an empty memory space where we are putting the chunk + indexes : an array of the indexes + mask : an array of the masks + values : an array of the values + """ + + # 0 len + for v in values: + if not np.prod(v.shape): + return + + nrows = indexes[0].shape[0] + if nrows != len(rows): + rows = np.empty(nrows, dtype=self.dtype) + names = self.dtype.names + nindexes = len(indexes) + + # indexes + for i, idx in enumerate(indexes): + rows[names[i]] = idx + + # values + for i, v in enumerate(values): + rows[names[i + nindexes]] = v + + # mask + if mask is not None: + m = ~mask.ravel().astype(bool, copy=False) + if not m.all(): + rows = rows[m] + + if len(rows): + self.table.append(rows) + self.table.flush() + + def delete( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + ): + + # delete all rows (and return the nrows) + if where is None or not len(where): + if start is None and stop is None: + nrows = self.nrows + self._handle.remove_node(self.group, recursive=True) + else: + # pytables<3.0 would remove a single row with stop=None + if stop is None: + stop = self.nrows + nrows = self.table.remove_rows(start=start, stop=stop) + self.table.flush() + return nrows + + # infer the data kind + if not self.infer_axes(): + return None + + # create the selection + table = self.table + selection = Selection(self, where, start=start, stop=stop) + values = selection.select_coords() + + # delete the rows in reverse order + sorted_series = Series(values).sort_values() + ln = len(sorted_series) + + if ln: + + # construct groups of consecutive rows + diff = sorted_series.diff() + groups = list(diff[diff > 1].index) + + # 1 group + if not len(groups): + groups = [0] + + # final element + if groups[-1] != ln: + groups.append(ln) + + # initial element + if groups[0] != 0: + groups.insert(0, 0) + + # we must remove in reverse order! 
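+            # e.g. coordinates [2, 3, 4, 9, 10] yield groups [0, 3, 5]: the block of
+            # rows 9-10 is removed first, then rows 2-4, so the row numbers of the
+            # earlier block stay valid while we delete.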
+ pg = groups.pop() + for g in reversed(groups): + rows = sorted_series.take(range(g, pg)) + table.remove_rows( + start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 + ) + pg = g + + self.table.flush() + + # return the number of rows removed + return ln + + +class AppendableFrameTable(AppendableTable): + """ support the new appendable table formats """ + + pandas_kind = "frame_table" + table_type = "appendable_frame" + ndim = 2 + obj_type: Type[Union[DataFrame, Series]] = DataFrame + + @property + def is_transposed(self) -> bool: + return self.index_axes[0].axis == 1 + + @classmethod + def get_object(cls, obj, transposed: bool): + """ these are written transposed """ + if transposed: + obj = obj.T + return obj + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): + return None + + result = self._read_axes(where=where, start=start, stop=stop) + + info = ( + self.info.get(self.non_index_axes[0][0], dict()) + if len(self.non_index_axes) + else dict() + ) + + inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] + assert len(inds) == 1 + ind = inds[0] + + index = result[ind][0] + + frames = [] + for i, a in enumerate(self.axes): + if a not in self.values_axes: + continue + index_vals, cvalues = result[i] + + # we could have a multi-index constructor here + # ensure_index doesn't recognized our list-of-tuples here + if info.get("type") == "MultiIndex": + cols = MultiIndex.from_tuples(index_vals) + else: + cols = Index(index_vals) + + names = info.get("names") + if names is not None: + cols.set_names(names, inplace=True) + + if self.is_transposed: + values = cvalues + index_ = cols + cols_ = Index(index, name=getattr(index, "name", None)) + else: + values = cvalues.T + index_ = Index(index, name=getattr(index, "name", None)) + cols_ = cols + + # if we have a DataIndexableCol, its shape will only be 1 dim + if values.ndim == 1 and isinstance(values, np.ndarray): + values = values.reshape((1, values.shape[0])) + + if isinstance(values, np.ndarray): + df = DataFrame(values.T, columns=cols_, index=index_) + elif isinstance(values, Index): + df = DataFrame(values, columns=cols_, index=index_) + else: + # Categorical + df = DataFrame([values], columns=cols_, index=index_) + assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + frames.append(df) + + if len(frames) == 1: + df = frames[0] + else: + df = concat(frames, axis=1) + + selection = Selection(self, where=where, start=start, stop=stop) + # apply the selection filters & axis orderings + df = self.process_axes(df, selection=selection, columns=columns) + + return df + + +class AppendableSeriesTable(AppendableFrameTable): + """ support the new appendable table formats """ + + pandas_kind = "series_table" + table_type = "appendable_series" + ndim = 2 + obj_type = Series + + @property + def is_transposed(self) -> bool: + return False + + @classmethod + def get_object(cls, obj, transposed: bool): + return obj + + def write(self, obj, data_columns=None, **kwargs): + """ we are going to write this as a frame table """ + if not isinstance(obj, DataFrame): + name = obj.name or "values" + obj = obj.to_frame(name) + return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ) -> Series: + + is_multi_index = 
self.is_multi_index + if columns is not None and is_multi_index: + assert isinstance(self.levels, list) # needed for mypy + for n in self.levels: + if n not in columns: + columns.insert(0, n) + s = super().read(where=where, columns=columns, start=start, stop=stop) + if is_multi_index: + s.set_index(self.levels, inplace=True) + + s = s.iloc[:, 0] + + # remove the default name + if s.name == "values": + s.name = None + return s + + +class AppendableMultiSeriesTable(AppendableSeriesTable): + """ support the new appendable table formats """ + + pandas_kind = "series_table" + table_type = "appendable_multiseries" + + def write(self, obj, **kwargs): + """ we are going to write this as a frame table """ + name = obj.name or "values" + obj, self.levels = self.validate_multiindex(obj) + cols = list(self.levels) + cols.append(name) + obj.columns = cols + return super().write(obj=obj, **kwargs) + + +class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ + + pandas_kind = "frame_table" + table_type = "generic_table" + ndim = 2 + obj_type = DataFrame + + @property + def pandas_type(self) -> str: + return self.pandas_kind + + @property + def storable(self): + return getattr(self.group, "table", None) or self.group + + def get_attrs(self): + """ retrieve our attributes """ + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] + + self.index_axes = [a for a in self.indexables if a.is_an_indexable] + self.values_axes = [a for a in self.indexables if not a.is_an_indexable] + self.data_columns = [a.name for a in self.values_axes] + + @cache_readonly + def indexables(self): + """ create the indexables from the table description """ + d = self.description + + # TODO: can we get a typ for this? AFAICT it is the only place + # where we aren't passing one + # the index columns is just a simple index + md = self.read_metadata("index") + meta = "category" if md is not None else None + index_col = GenericIndexCol( + name="index", axis=0, table=self.table, meta=meta, metadata=md + ) + + _indexables = [index_col] + + for i, n in enumerate(d._v_names): + assert isinstance(n, str) + + atom = getattr(d, n) + md = self.read_metadata(n) + meta = "category" if md is not None else None + dc = GenericDataIndexableCol( + name=n, + pos=i, + values=[n], + typ=atom, + table=self.table, + meta=meta, + metadata=md, + ) + _indexables.append(dc) + + return _indexables + + def write(self, **kwargs): + raise NotImplementedError("cannot write on an generic table") + + +class AppendableMultiFrameTable(AppendableFrameTable): + """ a frame with a multi-index """ + + table_type = "appendable_multiframe" + obj_type = DataFrame + ndim = 2 + _re_levels = re.compile(r"^level_\d+$") + + @property + def table_type_short(self) -> str: + return "appendable_multi" + + def write(self, obj, data_columns=None, **kwargs): + if data_columns is None: + data_columns = [] + elif data_columns is True: + data_columns = obj.columns.tolist() + obj, self.levels = self.validate_multiindex(obj) + for n in self.levels: + if n not in data_columns: + data_columns.insert(0, n) + return super().write(obj=obj, data_columns=data_columns, **kwargs) + + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + + df = super().read(where=where, columns=columns, start=start, stop=stop) + df = df.set_index(self.levels) + + # remove names for 'level_%d' + df.index = df.index.set_names( + [None if self._re_levels.search(l) else l for l in 
df.index.names] + ) + + return df + + +def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame: + ax = obj._get_axis(axis) + labels = ensure_index(labels) + + # try not to reindex even if other is provided + # if it equals our current index + if other is not None: + other = ensure_index(other) + if (other is None or labels.equals(other)) and labels.equals(ax): + return obj + + labels = ensure_index(labels.unique()) + if other is not None: + labels = ensure_index(other.unique()).intersection(labels, sort=False) + if not labels.equals(ax): + slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim + slicer[axis] = labels + obj = obj.loc[tuple(slicer)] + return obj + + +# tz to/from coercion + + +def _get_tz(tz: tzinfo) -> Union[str, tzinfo]: + """ for a tz-aware type, return an encoded zone """ + zone = timezones.get_timezone(tz) + return zone + + +def _set_tz( + values: Union[np.ndarray, Index], + tz: Optional[Union[str, tzinfo]], + coerce: bool = False, +) -> Union[np.ndarray, DatetimeIndex]: + """ + coerce the values to a DatetimeIndex if tz is set + preserve the input shape if possible + + Parameters + ---------- + values : ndarray or Index + tz : str or tzinfo + coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray + """ + if isinstance(values, DatetimeIndex): + # If values is tzaware, the tz gets dropped in the values.ravel() + # call below (which returns an ndarray). So we are only non-lossy + # if `tz` matches `values.tz`. + assert values.tz is None or values.tz == tz + + if tz is not None: + name = getattr(values, "name", None) + values = values.ravel() + tz = timezones.get_timezone(_ensure_decoded(tz)) + values = DatetimeIndex(values, name=name) + values = values.tz_localize("UTC").tz_convert(tz) + elif coerce: + values = np.asarray(values, dtype="M8[ns]") + + return values + + +def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: + assert isinstance(name, str) + + index_name = index.name + converted, dtype_name = _get_data_and_dtype_name(index) + kind = _dtype_to_kind(dtype_name) + atom = DataIndexableCol._get_atom(converted) + + if isinstance(index, Int64Index): + # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, + # in which case "kind" is "integer", "integer", "datetime64", + # "timedelta64", and "integer", respectively. 
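+        # e.g. a DatetimeIndex is stored through its int64 view (see
+        # _get_data_and_dtype_name below) with kind "datetime64"; the freq and tz
+        # attributes recorded on the IndexCol are used when the table is read back
+        # to reconstruct the original index.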
+ return IndexCol( + name, + values=converted, + kind=kind, + typ=atom, + freq=getattr(index, "freq", None), + tz=getattr(index, "tz", None), + index_name=index_name, + ) + + if isinstance(index, MultiIndex): + raise TypeError("MultiIndex not supported here!") + + inferred_type = lib.infer_dtype(index, skipna=False) + # we won't get inferred_type of "datetime64" or "timedelta64" as these + # would go through the DatetimeIndex/TimedeltaIndex paths above + + values = np.asarray(index) + + if inferred_type == "date": + converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) + return IndexCol( + name, converted, "date", _tables().Time32Col(), index_name=index_name, + ) + elif inferred_type == "string": + + converted = _convert_string_array(values, encoding, errors) + itemsize = converted.dtype.itemsize + return IndexCol( + name, + converted, + "string", + _tables().StringCol(itemsize), + index_name=index_name, + ) + + elif inferred_type in ["integer", "floating"]: + return IndexCol( + name, values=converted, kind=kind, typ=atom, index_name=index_name, + ) + else: + assert isinstance(converted, np.ndarray) and converted.dtype == object + assert kind == "object", kind + atom = _tables().ObjectAtom() + return IndexCol(name, converted, kind, atom, index_name=index_name,) + + +def _unconvert_index( + data, kind: str, encoding: str, errors: str +) -> Union[np.ndarray, Index]: + index: Union[Index, np.ndarray] + + if kind == "datetime64": + index = DatetimeIndex(data) + elif kind == "timedelta64": + index = TimedeltaIndex(data) + elif kind == "date": + try: + index = np.asarray([date.fromordinal(v) for v in data], dtype=object) + except (ValueError): + index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) + elif kind in ("integer", "float"): + index = np.asarray(data) + elif kind in ("string"): + index = _unconvert_string_array( + data, nan_rep=None, encoding=encoding, errors=errors + ) + elif kind == "object": + index = np.asarray(data[0]) + else: # pragma: no cover + raise ValueError(f"unrecognized index type {kind}") + return index + + +def _maybe_convert_for_string_atom( + name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors +): + + if not block.is_object: + return block.values + + dtype_name = block.dtype.name + inferred_type = lib.infer_dtype(block.values, skipna=False) + + if inferred_type == "date": + raise TypeError("[date] is not implemented as a table column") + elif inferred_type == "datetime": + # after GH#8260 + # this only would be hit for a multi-timezone dtype which is an error + raise TypeError( + "too many timezones in this block, create separate data columns" + ) + + elif not (inferred_type == "string" or dtype_name == "object"): + return block.values + + block = block.fillna(nan_rep, downcast=False) + if isinstance(block, list): + # Note: because block is always object dtype, fillna goes + # through a path such that the result is always a 1-element list + block = block[0] + data = block.values + + # see if we have a valid string type + inferred_type = lib.infer_dtype(data.ravel(), skipna=False) + if inferred_type != "string": + + # we cannot serialize this data, so report an exception on a column + # by column basis + for i in range(block.shape[0]): + + col = block.iget(i) + inferred_type = lib.infer_dtype(col.ravel(), skipna=False) + if inferred_type != "string": + iloc = block.mgr_locs.indexer[i] + raise TypeError( + f"Cannot serialize the column [{iloc}] because\n" + f"its data contents are [{inferred_type}] object
dtype" + ) + + # itemsize is the maximum length of a string (along any dimension) + data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) + assert data_converted.shape == block.shape, (data_converted.shape, block.shape) + itemsize = data_converted.itemsize + + # specified min_itemsize? + if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) + itemsize = max(min_itemsize or 0, itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + data_converted = data_converted.astype(f"|S{itemsize}", copy=False) + return data_converted + + +def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: + """ + Take a string-like that is object dtype and coerce to a fixed size string type. + + Parameters + ---------- + data : np.ndarray[object] + encoding : str + errors : str + Handler for encoding errors. + + Returns + ------- + np.ndarray[fixed-length-string] + """ + + # encode if needed + if len(data): + data = ( + Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) + ) + + # create the sized dtype + ensured = ensure_object(data.ravel()) + itemsize = max(1, libwriters.max_len_string_array(ensured)) + + data = np.asarray(data, dtype=f"S{itemsize}") + return data + + +def _unconvert_string_array( + data: np.ndarray, nan_rep, encoding: str, errors: str +) -> np.ndarray: + """ + Inverse of _convert_string_array. + + Parameters + ---------- + data : np.ndarray[fixed-length-string] + nan_rep : the storage repr of NaN + encoding : str + errors : str + Handler for encoding errors. + + Returns + ------- + np.ndarray[object] + Decoded data. + """ + shape = data.shape + data = np.asarray(data.ravel(), dtype=object) + + if len(data): + + itemsize = libwriters.max_len_string_array(ensure_object(data)) + dtype = f"U{itemsize}" + + if isinstance(data[0], bytes): + data = Series(data).str.decode(encoding, errors=errors).values + else: + data = data.astype(dtype, copy=False).astype(object, copy=False) + + if nan_rep is None: + nan_rep = "nan" + + data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) + return data.reshape(shape) + + +def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): + assert isinstance(val_kind, str), type(val_kind) + if _need_convert(val_kind): + conv = _get_converter(val_kind, encoding, errors) + values = conv(values) + return values + + +def _get_converter(kind: str, encoding: str, errors: str): + if kind == "datetime64": + return lambda x: np.asarray(x, dtype="M8[ns]") + elif kind == "string": + return lambda x: _unconvert_string_array( + x, nan_rep=None, encoding=encoding, errors=errors + ) + else: # pragma: no cover + raise ValueError(f"invalid kind {kind}") + + +def _need_convert(kind: str) -> bool: + if kind in ("datetime64", "string"): + return True + return False + + +def _maybe_adjust_name(name: str, version) -> str: + """ + Prior to 0.10.1, we named values blocks like: values_block_0 an the + name values_0, adjust the given name if necessary. 
+ + Parameters + ---------- + name : str + version : Tuple[int, int, int] + + Returns + ------- + str + """ + try: + if version[0] == 0 and version[1] <= 10 and version[2] == 0: + m = re.search(r"values_block_(\d+)", name) + if m: + grp = m.groups()[0] + name = f"values_{grp}" + except IndexError: + pass + return name + + +def _dtype_to_kind(dtype_str: str) -> str: + """ + Find the "kind" string describing the given dtype name. + """ + dtype_str = _ensure_decoded(dtype_str) + + if dtype_str.startswith("string") or dtype_str.startswith("bytes"): + kind = "string" + elif dtype_str.startswith("float"): + kind = "float" + elif dtype_str.startswith("complex"): + kind = "complex" + elif dtype_str.startswith("int") or dtype_str.startswith("uint"): + kind = "integer" + elif dtype_str.startswith("datetime64"): + kind = "datetime64" + elif dtype_str.startswith("timedelta"): + kind = "timedelta64" + elif dtype_str.startswith("bool"): + kind = "bool" + elif dtype_str.startswith("category"): + kind = "category" + elif dtype_str.startswith("period"): + # We store the `freq` attr so we can restore from integers + kind = "integer" + elif dtype_str == "object": + kind = "object" + else: + raise ValueError(f"cannot interpret dtype of [{dtype_str}]") + + return kind + + +def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): + """ + Convert the passed data into a storable form and a dtype string. + """ + if is_categorical_dtype(data.dtype): + data = data.codes + + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = data.dtype.name.split("[")[0] + + if data.dtype.kind in ["m", "M"]: + data = np.asarray(data.view("i8")) + # TODO: we used to reshape for the dt64tz case, but no longer + # doing that doesn't seem to break anything. why? + + elif isinstance(data, PeriodIndex): + data = data.asi8 + + data = np.asarray(data) + return data, dtype_name + + +class Selection: + """ + Carries out a selection operation on a tables.Table object. 
+ + Parameters + ---------- + table : a Table object + where : list of Terms (or convertible to) + start, stop: indices to start and/or stop selection + + """ + + def __init__( + self, + table: Table, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + self.table = table + self.where = where + self.start = start + self.stop = stop + self.condition = None + self.filter = None + self.terms = None + self.coordinates = None + + if is_list_like(where): + + # see if we have a passed coordinate like + try: + inferred = lib.infer_dtype(where, skipna=False) + if inferred == "integer" or inferred == "boolean": + where = np.asarray(where) + if where.dtype == np.bool_: + start, stop = self.start, self.stop + if start is None: + start = 0 + if stop is None: + stop = self.table.nrows + self.coordinates = np.arange(start, stop)[where] + elif issubclass(where.dtype.type, np.integer): + if (self.start is not None and (where < self.start).any()) or ( + self.stop is not None and (where >= self.stop).any() + ): + raise ValueError( + "where must have index locations >= start and < stop" + ) + self.coordinates = where + + except ValueError: + pass + + if self.coordinates is None: + + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms is not None: + self.condition, self.filter = self.terms.evaluate() + + def generate(self, where): + """ where can be a : dict,list,tuple,string """ + if where is None: + return None + + q = self.table.queryables() + try: + return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) + except NameError: + # raise a nice message, suggesting that the user should use + # data_columns + qkeys = ",".join(q.keys()) + raise ValueError( + f"The passed where expression: {where}\n" + " contains an invalid variable reference\n" + " all of the variable references must be a " + "reference to\n" + " an axis (e.g. 'index' or 'columns'), or a " + "data_column\n" + f" The currently defined references are: {qkeys}\n" + ) + + def select(self): + """ + generate the selection + """ + if self.condition is not None: + return self.table.table.read_where( + self.condition.format(), start=self.start, stop=self.stop + ) + elif self.coordinates is not None: + return self.table.table.read_coordinates(self.coordinates) + return self.table.table.read(start=self.start, stop=self.stop) + + def select_coords(self): + """ + generate the selection + """ + start, stop = self.start, self.stop + nrows = self.table.nrows + if start is None: + start = 0 + elif start < 0: + start += nrows + if self.stop is None: + stop = nrows + elif stop < 0: + stop += nrows + + if self.condition is not None: + return self.table.table.get_where_list( + self.condition.format(), start=start, stop=stop, sort=True + ) + elif self.coordinates is not None: + return self.coordinates + + return np.arange(start, stop) diff --git a/venv/Lib/site-packages/pandas/io/s3.py b/venv/Lib/site-packages/pandas/io/s3.py new file mode 100644 index 0000000..976c319 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/s3.py @@ -0,0 +1,49 @@ +""" s3 support for remote file interactivity """ +from typing import IO, Any, Optional, Tuple +from urllib.parse import urlparse as parse_url + +from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency + +s3fs = import_optional_dependency( + "s3fs", extra="The s3fs package is required to handle s3 files." 
+) + + +def _strip_schema(url): + """Returns the url without the s3:// part""" + result = parse_url(url, allow_fragments=False) + return result.netloc + result.path + + +def get_file_and_filesystem( + filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None +) -> Tuple[IO, Any]: + from botocore.exceptions import NoCredentialsError + + if mode is None: + mode = "rb" + + fs = s3fs.S3FileSystem(anon=False) + try: + file = fs.open(_strip_schema(filepath_or_buffer), mode) + except (FileNotFoundError, NoCredentialsError): + # boto3 has troubles when trying to access a public file + # when credentialed... + # An OSError is raised if you have credentials, but they + # aren't valid for that bucket. + # A NoCredentialsError is raised if you don't have creds + # for that bucket. + fs = s3fs.S3FileSystem(anon=True) + file = fs.open(_strip_schema(filepath_or_buffer), mode) + return file, fs + + +def get_filepath_or_buffer( + filepath_or_buffer: FilePathOrBuffer, + encoding: Optional[str] = None, + compression: Optional[str] = None, + mode: Optional[str] = None, +) -> Tuple[IO, Optional[str], Optional[str], bool]: + file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) + return file, None, compression, True diff --git a/venv/Lib/site-packages/pandas/io/sas/__init__.py b/venv/Lib/site-packages/pandas/io/sas/__init__.py new file mode 100644 index 0000000..8f81352 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sas/__init__.py @@ -0,0 +1 @@ +from pandas.io.sas.sasreader import read_sas # noqa diff --git a/venv/Lib/site-packages/pandas/io/sas/_sas.cp37-win_amd64.pyd b/venv/Lib/site-packages/pandas/io/sas/_sas.cp37-win_amd64.pyd new file mode 100644 index 0000000..e2294b7 Binary files /dev/null and b/venv/Lib/site-packages/pandas/io/sas/_sas.cp37-win_amd64.pyd differ diff --git a/venv/Lib/site-packages/pandas/io/sas/sas7bdat.py b/venv/Lib/site-packages/pandas/io/sas/sas7bdat.py new file mode 100644 index 0000000..f917477 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sas/sas7bdat.py @@ -0,0 +1,732 @@ +""" +Read SAS7BDAT files + +Based on code written by Jared Hobbs: + https://bitbucket.org/jaredhobbs/sas7bdat + +See also: + https://github.com/BioStatMatt/sas7bdat + +Partial documentation of the file format: + https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf + +Reference for binary data compression: + http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +""" +from collections import abc +from datetime import datetime +import struct + +import numpy as np + +from pandas.errors import EmptyDataError + +import pandas as pd + +from pandas.io.common import get_filepath_or_buffer +from pandas.io.sas._sas import Parser +import pandas.io.sas.sas_constants as const + + +class _subheader_pointer: + pass + + +class _column: + pass + + +# SAS7BDAT represents a SAS data file in SAS7BDAT format. +class SAS7BDATReader(abc.Iterator): + """ + Read SAS files in SAS7BDAT format. + + Parameters + ---------- + path_or_buf : path name or buffer + Name of SAS file or file-like object pointing to SAS file + contents. + index : column identifier, defaults to None + Column to use as index. + convert_dates : boolean, defaults to True + Attempt to convert dates to Pandas datetime values. Note that + some rarely used SAS date formats may be unsupported. + blank_missing : boolean, defaults to True + Convert empty strings to missing values (SAS uses blanks to + indicate missing character variables). 
+ chunksize : int, defaults to None + Return SAS7BDATReader object for iterations, returns chunks + with given number of lines. + encoding : string, defaults to None + String encoding. + convert_text : bool, defaults to True + If False, text variables are left as raw bytes. + convert_header_text : bool, defaults to True + If False, header text, including column names, are left as raw + bytes. + """ + + def __init__( + self, + path_or_buf, + index=None, + convert_dates=True, + blank_missing=True, + chunksize=None, + encoding=None, + convert_text=True, + convert_header_text=True, + ): + + self.index = index + self.convert_dates = convert_dates + self.blank_missing = blank_missing + self.chunksize = chunksize + self.encoding = encoding + self.convert_text = convert_text + self.convert_header_text = convert_header_text + + self.default_encoding = "latin-1" + self.compression = "" + self.column_names_strings = [] + self.column_names = [] + self.column_formats = [] + self.columns = [] + + self._current_page_data_subheader_pointers = [] + self._cached_page = None + self._column_data_lengths = [] + self._column_data_offsets = [] + self._column_types = [] + + self._current_row_in_file_index = 0 + self._current_row_on_page_index = 0 + self._current_row_in_file_index = 0 + + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) + if isinstance(self._path_or_buf, str): + self._path_or_buf = open(self._path_or_buf, "rb") + self.handle = self._path_or_buf + + self._get_properties() + self._parse_metadata() + + def column_data_lengths(self): + """Return a numpy int64 array of the column data lengths""" + return np.asarray(self._column_data_lengths, dtype=np.int64) + + def column_data_offsets(self): + """Return a numpy int64 array of the column offsets""" + return np.asarray(self._column_data_offsets, dtype=np.int64) + + def column_types(self): + """Returns a numpy character array of the column types: + s (string) or d (double)""" + return np.asarray(self._column_types, dtype=np.dtype("S1")) + + def close(self): + try: + self.handle.close() + except AttributeError: + pass + + def _get_properties(self): + + # Check magic number + self._path_or_buf.seek(0) + self._cached_page = self._path_or_buf.read(288) + if self._cached_page[0 : len(const.magic)] != const.magic: + self.close() + raise ValueError("magic number mismatch (not a SAS file?)") + + # Get alignment information + align1, align2 = 0, 0 + buf = self._read_bytes(const.align_1_offset, const.align_1_length) + if buf == const.u64_byte_checker_value: + align2 = const.align_2_value + self.U64 = True + self._int_length = 8 + self._page_bit_offset = const.page_bit_offset_x64 + self._subheader_pointer_length = const.subheader_pointer_length_x64 + else: + self.U64 = False + self._page_bit_offset = const.page_bit_offset_x86 + self._subheader_pointer_length = const.subheader_pointer_length_x86 + self._int_length = 4 + buf = self._read_bytes(const.align_2_offset, const.align_2_length) + if buf == const.align_1_checker_value: + align1 = const.align_2_value + total_align = align1 + align2 + + # Get endianness information + buf = self._read_bytes(const.endianness_offset, const.endianness_length) + if buf == b"\x01": + self.byte_order = "<" + else: + self.byte_order = ">" + + # Get encoding information + buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] + if buf in const.encoding_names: + self.file_encoding = const.encoding_names[buf] + else: + self.file_encoding = f"unknown (code={buf})" + + # Get platform information + buf = 
self._read_bytes(const.platform_offset, const.platform_length) + if buf == b"1": + self.platform = "unix" + elif buf == b"2": + self.platform = "windows" + else: + self.platform = "unknown" + + buf = self._read_bytes(const.dataset_offset, const.dataset_length) + self.name = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.name = self.name.decode(self.encoding or self.default_encoding) + + buf = self._read_bytes(const.file_type_offset, const.file_type_length) + self.file_type = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.file_type = self.file_type.decode( + self.encoding or self.default_encoding + ) + + # Timestamp is epoch 01/01/1960 + epoch = datetime(1960, 1, 1) + x = self._read_float( + const.date_created_offset + align1, const.date_created_length + ) + self.date_created = epoch + pd.to_timedelta(x, unit="s") + x = self._read_float( + const.date_modified_offset + align1, const.date_modified_length + ) + self.date_modified = epoch + pd.to_timedelta(x, unit="s") + + self.header_length = self._read_int( + const.header_size_offset + align1, const.header_size_length + ) + + # Read the rest of the header into cached_page. + buf = self._path_or_buf.read(self.header_length - 288) + self._cached_page += buf + if len(self._cached_page) != self.header_length: + self.close() + raise ValueError("The SAS7BDAT file appears to be truncated.") + + self._page_length = self._read_int( + const.page_size_offset + align1, const.page_size_length + ) + self._page_count = self._read_int( + const.page_count_offset + align1, const.page_count_length + ) + + buf = self._read_bytes( + const.sas_release_offset + total_align, const.sas_release_length + ) + self.sas_release = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.sas_release = self.sas_release.decode( + self.encoding or self.default_encoding + ) + + buf = self._read_bytes( + const.sas_server_type_offset + total_align, const.sas_server_type_length + ) + self.server_type = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.server_type = self.server_type.decode( + self.encoding or self.default_encoding + ) + + buf = self._read_bytes( + const.os_version_number_offset + total_align, const.os_version_number_length + ) + self.os_version = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.os_version = self.os_version.decode( + self.encoding or self.default_encoding + ) + + buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length) + buf = buf.rstrip(b"\x00 ") + if len(buf) > 0: + self.os_name = buf.decode(self.encoding or self.default_encoding) + else: + buf = self._read_bytes( + const.os_maker_offset + total_align, const.os_maker_length + ) + self.os_name = buf.rstrip(b"\x00 ") + if self.convert_header_text: + self.os_name = self.os_name.decode( + self.encoding or self.default_encoding + ) + + def __next__(self): + da = self.read(nrows=self.chunksize or 1) + if da is None: + raise StopIteration + return da + + # Read a single float of the given width (4 or 8). + def _read_float(self, offset, width): + if width not in (4, 8): + self.close() + raise ValueError("invalid float width") + buf = self._read_bytes(offset, width) + fd = "f" if width == 4 else "d" + return struct.unpack(self.byte_order + fd, buf)[0] + + # Read a single signed integer of the given width (1, 2, 4 or 8). 
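+    # The width is mapped to a struct format code (1 -> "b", 2 -> "h",
+    # 4 -> "l", 8 -> "q") and prefixed with self.byte_order, so an 8-byte
+    # little-endian value, for example, is read as struct.unpack("<q", buf)[0].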
+ def _read_int(self, offset, width): + if width not in (1, 2, 4, 8): + self.close() + raise ValueError("invalid int width") + buf = self._read_bytes(offset, width) + it = {1: "b", 2: "h", 4: "l", 8: "q"}[width] + iv = struct.unpack(self.byte_order + it, buf)[0] + return iv + + def _read_bytes(self, offset, length): + if self._cached_page is None: + self._path_or_buf.seek(offset) + buf = self._path_or_buf.read(length) + if len(buf) < length: + self.close() + msg = f"Unable to read {length:d} bytes from file position {offset:d}." + raise ValueError(msg) + return buf + else: + if offset + length > len(self._cached_page): + self.close() + raise ValueError("The cached page is too small.") + return self._cached_page[offset : offset + length] + + def _parse_metadata(self): + done = False + while not done: + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + break + if len(self._cached_page) != self._page_length: + self.close() + raise ValueError("Failed to read a meta data page from the SAS file.") + done = self._process_page_meta() + + def _process_page_meta(self): + self._read_page_header() + pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types + if self._current_page_type in pt: + self._process_page_metadata() + is_data_page = self._current_page_type & const.page_data_type + is_mix_page = self._current_page_type in const.page_mix_types + return ( + is_data_page + or is_mix_page + or self._current_page_data_subheader_pointers != [] + ) + + def _read_page_header(self): + bit_offset = self._page_bit_offset + tx = const.page_type_offset + bit_offset + self._current_page_type = self._read_int(tx, const.page_type_length) + tx = const.block_count_offset + bit_offset + self._current_page_block_count = self._read_int(tx, const.block_count_length) + tx = const.subheader_count_offset + bit_offset + self._current_page_subheaders_count = self._read_int( + tx, const.subheader_count_length + ) + + def _process_page_metadata(self): + bit_offset = self._page_bit_offset + + for i in range(self._current_page_subheaders_count): + pointer = self._process_subheader_pointers( + const.subheader_pointers_offset + bit_offset, i + ) + if pointer.length == 0: + continue + if pointer.compression == const.truncated_subheader_id: + continue + subheader_signature = self._read_subheader_signature(pointer.offset) + subheader_index = self._get_subheader_index( + subheader_signature, pointer.compression, pointer.ptype + ) + self._process_subheader(subheader_index, pointer) + + def _get_subheader_index(self, signature, compression, ptype): + index = const.subheader_signature_to_index.get(signature) + if index is None: + f1 = (compression == const.compressed_subheader_id) or (compression == 0) + f2 = ptype == const.compressed_subheader_type + if (self.compression != "") and f1 and f2: + index = const.SASIndex.data_subheader_index + else: + self.close() + raise ValueError("Unknown subheader signature") + return index + + def _process_subheader_pointers(self, offset, subheader_pointer_index): + + subheader_pointer_length = self._subheader_pointer_length + total_offset = offset + subheader_pointer_length * subheader_pointer_index + + subheader_offset = self._read_int(total_offset, self._int_length) + total_offset += self._int_length + + subheader_length = self._read_int(total_offset, self._int_length) + total_offset += self._int_length + + subheader_compression = self._read_int(total_offset, 1) + total_offset += 1 + + subheader_type = self._read_int(total_offset, 1) + 
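+        # A pointer record therefore carries an offset, a length, a compression
+        # flag and a type byte; each record occupies self._subheader_pointer_length
+        # bytes (12 in 32-bit files, 24 in 64-bit files), and any trailing bytes of
+        # the record are not read here.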
+ x = _subheader_pointer() + x.offset = subheader_offset + x.length = subheader_length + x.compression = subheader_compression + x.ptype = subheader_type + + return x + + def _read_subheader_signature(self, offset): + subheader_signature = self._read_bytes(offset, self._int_length) + return subheader_signature + + def _process_subheader(self, subheader_index, pointer): + offset = pointer.offset + length = pointer.length + + if subheader_index == const.SASIndex.row_size_index: + processor = self._process_rowsize_subheader + elif subheader_index == const.SASIndex.column_size_index: + processor = self._process_columnsize_subheader + elif subheader_index == const.SASIndex.column_text_index: + processor = self._process_columntext_subheader + elif subheader_index == const.SASIndex.column_name_index: + processor = self._process_columnname_subheader + elif subheader_index == const.SASIndex.column_attributes_index: + processor = self._process_columnattributes_subheader + elif subheader_index == const.SASIndex.format_and_label_index: + processor = self._process_format_subheader + elif subheader_index == const.SASIndex.column_list_index: + processor = self._process_columnlist_subheader + elif subheader_index == const.SASIndex.subheader_counts_index: + processor = self._process_subheader_counts + elif subheader_index == const.SASIndex.data_subheader_index: + self._current_page_data_subheader_pointers.append(pointer) + return + else: + raise ValueError("unknown subheader index") + + processor(offset, length) + + def _process_rowsize_subheader(self, offset, length): + + int_len = self._int_length + lcs_offset = offset + lcp_offset = offset + if self.U64: + lcs_offset += 682 + lcp_offset += 706 + else: + lcs_offset += 354 + lcp_offset += 378 + + self.row_length = self._read_int( + offset + const.row_length_offset_multiplier * int_len, int_len + ) + self.row_count = self._read_int( + offset + const.row_count_offset_multiplier * int_len, int_len + ) + self.col_count_p1 = self._read_int( + offset + const.col_count_p1_multiplier * int_len, int_len + ) + self.col_count_p2 = self._read_int( + offset + const.col_count_p2_multiplier * int_len, int_len + ) + mx = const.row_count_on_mix_page_offset_multiplier * int_len + self._mix_page_row_count = self._read_int(offset + mx, int_len) + self._lcs = self._read_int(lcs_offset, 2) + self._lcp = self._read_int(lcp_offset, 2) + + def _process_columnsize_subheader(self, offset, length): + int_len = self._int_length + offset += int_len + self.column_count = self._read_int(offset, int_len) + if self.col_count_p1 + self.col_count_p2 != self.column_count: + print( + f"Warning: column count mismatch ({self.col_count_p1} + " + f"{self.col_count_p2} != " + f"{self.column_count})\n" + ) + + # Unknown purpose + def _process_subheader_counts(self, offset, length): + pass + + def _process_columntext_subheader(self, offset, length): + + offset += self._int_length + text_block_size = self._read_int(offset, const.text_block_size_length) + + buf = self._read_bytes(offset, text_block_size) + cname_raw = buf[0:text_block_size].rstrip(b"\x00 ") + cname = cname_raw + if self.convert_header_text: + cname = cname.decode(self.encoding or self.default_encoding) + self.column_names_strings.append(cname) + + if len(self.column_names_strings) == 1: + compression_literal = "" + for cl in const.compression_literals: + if cl in cname_raw: + compression_literal = cl + self.compression = compression_literal + offset -= self._int_length + + offset1 = offset + 16 + if self.U64: + offset1 += 4 + + buf = 
self._read_bytes(offset1, self._lcp) + compression_literal = buf.rstrip(b"\x00") + if compression_literal == "": + self._lcs = 0 + offset1 = offset + 32 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0 : self._lcp] + elif compression_literal == const.rle_compression: + offset1 = offset + 40 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0 : self._lcp] + elif self._lcs > 0: + self._lcp = 0 + offset1 = offset + 16 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcs) + self.creator_proc = buf[0 : self._lcp] + if self.convert_header_text: + if hasattr(self, "creator_proc"): + self.creator_proc = self.creator_proc.decode( + self.encoding or self.default_encoding + ) + + def _process_columnname_subheader(self, offset, length): + int_len = self._int_length + offset += int_len + column_name_pointers_count = (length - 2 * int_len - 12) // 8 + for i in range(column_name_pointers_count): + text_subheader = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_text_subheader_offset + ) + col_name_offset = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_offset_offset + ) + col_name_length = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_length_offset + ) + + idx = self._read_int( + text_subheader, const.column_name_text_subheader_length + ) + col_offset = self._read_int( + col_name_offset, const.column_name_offset_length + ) + col_len = self._read_int(col_name_length, const.column_name_length_length) + + name_str = self.column_names_strings[idx] + self.column_names.append(name_str[col_offset : col_offset + col_len]) + + def _process_columnattributes_subheader(self, offset, length): + int_len = self._int_length + column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8) + for i in range(column_attributes_vectors_count): + col_data_offset = ( + offset + int_len + const.column_data_offset_offset + i * (int_len + 8) + ) + col_data_len = ( + offset + + 2 * int_len + + const.column_data_length_offset + + i * (int_len + 8) + ) + col_types = ( + offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) + ) + + x = self._read_int(col_data_offset, int_len) + self._column_data_offsets.append(x) + + x = self._read_int(col_data_len, const.column_data_length_length) + self._column_data_lengths.append(x) + + x = self._read_int(col_types, const.column_type_length) + self._column_types.append(b"d" if x == 1 else b"s") + + def _process_columnlist_subheader(self, offset, length): + # unknown purpose + pass + + def _process_format_subheader(self, offset, length): + int_len = self._int_length + text_subheader_format = ( + offset + const.column_format_text_subheader_index_offset + 3 * int_len + ) + col_format_offset = offset + const.column_format_offset_offset + 3 * int_len + col_format_len = offset + const.column_format_length_offset + 3 * int_len + text_subheader_label = ( + offset + const.column_label_text_subheader_index_offset + 3 * int_len + ) + col_label_offset = offset + const.column_label_offset_offset + 3 * int_len + col_label_len = offset + const.column_label_length_offset + 3 * int_len + + x = self._read_int( + text_subheader_format, const.column_format_text_subheader_index_length + ) + format_idx = min(x, len(self.column_names_strings) - 1) + + format_start = self._read_int( + col_format_offset, const.column_format_offset_length + ) + format_len = 
self._read_int(col_format_len, const.column_format_length_length) + + label_idx = self._read_int( + text_subheader_label, const.column_label_text_subheader_index_length + ) + label_idx = min(label_idx, len(self.column_names_strings) - 1) + + label_start = self._read_int(col_label_offset, const.column_label_offset_length) + label_len = self._read_int(col_label_len, const.column_label_length_length) + + label_names = self.column_names_strings[label_idx] + column_label = label_names[label_start : label_start + label_len] + format_names = self.column_names_strings[format_idx] + column_format = format_names[format_start : format_start + format_len] + current_column_number = len(self.columns) + + col = _column() + col.col_id = current_column_number + col.name = self.column_names[current_column_number] + col.label = column_label + col.format = column_format + col.ctype = self._column_types[current_column_number] + col.length = self._column_data_lengths[current_column_number] + + self.column_formats.append(column_format) + self.columns.append(col) + + def read(self, nrows=None): + + if (nrows is None) and (self.chunksize is not None): + nrows = self.chunksize + elif nrows is None: + nrows = self.row_count + + if len(self._column_types) == 0: + self.close() + raise EmptyDataError("No columns to parse from file") + + if self._current_row_in_file_index >= self.row_count: + return None + + m = self.row_count - self._current_row_in_file_index + if nrows > m: + nrows = m + + nd = self._column_types.count(b"d") + ns = self._column_types.count(b"s") + + self._string_chunk = np.empty((ns, nrows), dtype=np.object) + self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) + + self._current_row_in_chunk_index = 0 + p = Parser(self) + p.read(nrows) + + rslt = self._chunk_to_dataframe() + if self.index is not None: + rslt = rslt.set_index(self.index) + + return rslt + + def _read_next_page(self): + self._current_page_data_subheader_pointers = [] + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + return True + elif len(self._cached_page) != self._page_length: + self.close() + msg = ( + "failed to read complete page from file (read " + f"{len(self._cached_page):d} of " + f"{self._page_length:d} bytes)" + ) + raise ValueError(msg) + + self._read_page_header() + page_type = self._current_page_type + if page_type == const.page_meta_type: + self._process_page_metadata() + + is_data_page = page_type & const.page_data_type + pt = [const.page_meta_type] + const.page_mix_types + if not is_data_page and self._current_page_type not in pt: + return self._read_next_page() + + return False + + def _chunk_to_dataframe(self): + + n = self._current_row_in_chunk_index + m = self._current_row_in_file_index + ix = range(m - n, m) + rslt = pd.DataFrame(index=ix) + + js, jb = 0, 0 + for j in range(self.column_count): + + name = self.column_names[j] + + if self._column_types[j] == b"d": + rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") + rslt[name] = np.asarray(rslt[name], dtype=np.float64) + if self.convert_dates: + unit = None + if self.column_formats[j] in const.sas_date_formats: + unit = "d" + elif self.column_formats[j] in const.sas_datetime_formats: + unit = "s" + if unit: + rslt[name] = pd.to_datetime( + rslt[name], unit=unit, origin="1960-01-01" + ) + jb += 1 + elif self._column_types[j] == b"s": + rslt[name] = self._string_chunk[js, :] + if self.convert_text and (self.encoding is not None): + rslt[name] = rslt[name].str.decode( + self.encoding or 
self.default_encoding + ) + if self.blank_missing: + ii = rslt[name].str.len() == 0 + rslt.loc[ii, name] = np.nan + js += 1 + else: + self.close() + raise ValueError(f"unknown column type {self._column_types[j]}") + + return rslt diff --git a/venv/Lib/site-packages/pandas/io/sas/sas_constants.py b/venv/Lib/site-packages/pandas/io/sas/sas_constants.py new file mode 100644 index 0000000..23b23a1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sas/sas_constants.py @@ -0,0 +1,253 @@ +magic = ( + b"\x00\x00\x00\x00\x00\x00\x00\x00" + + b"\x00\x00\x00\x00\xc2\xea\x81\x60" + + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11" +) + +align_1_checker_value = b"3" +align_1_offset = 32 +align_1_length = 1 +align_1_value = 4 +u64_byte_checker_value = b"3" +align_2_offset = 35 +align_2_length = 1 +align_2_value = 4 +endianness_offset = 37 +endianness_length = 1 +platform_offset = 39 +platform_length = 1 +encoding_offset = 70 +encoding_length = 1 +dataset_offset = 92 +dataset_length = 64 +file_type_offset = 156 +file_type_length = 8 +date_created_offset = 164 +date_created_length = 8 +date_modified_offset = 172 +date_modified_length = 8 +header_size_offset = 196 +header_size_length = 4 +page_size_offset = 200 +page_size_length = 4 +page_count_offset = 204 +page_count_length = 4 +sas_release_offset = 216 +sas_release_length = 8 +sas_server_type_offset = 224 +sas_server_type_length = 16 +os_version_number_offset = 240 +os_version_number_length = 16 +os_maker_offset = 256 +os_maker_length = 16 +os_name_offset = 272 +os_name_length = 16 +page_bit_offset_x86 = 16 +page_bit_offset_x64 = 32 +subheader_pointer_length_x86 = 12 +subheader_pointer_length_x64 = 24 +page_type_offset = 0 +page_type_length = 2 +block_count_offset = 2 +block_count_length = 2 +subheader_count_offset = 4 +subheader_count_length = 2 +page_meta_type = 0 +page_data_type = 256 +page_amd_type = 1024 +page_metc_type = 16384 +page_comp_type = -28672 +page_mix_types = [512, 640] +subheader_pointers_offset = 8 +truncated_subheader_id = 1 +compressed_subheader_id = 4 +compressed_subheader_type = 1 +text_block_size_length = 2 +row_length_offset_multiplier = 5 +row_count_offset_multiplier = 6 +col_count_p1_multiplier = 9 +col_count_p2_multiplier = 10 +row_count_on_mix_page_offset_multiplier = 15 +column_name_pointer_length = 8 +column_name_text_subheader_offset = 0 +column_name_text_subheader_length = 2 +column_name_offset_offset = 2 +column_name_offset_length = 2 +column_name_length_offset = 4 +column_name_length_length = 2 +column_data_offset_offset = 8 +column_data_length_offset = 8 +column_data_length_length = 4 +column_type_offset = 14 +column_type_length = 1 +column_format_text_subheader_index_offset = 22 +column_format_text_subheader_index_length = 2 +column_format_offset_offset = 24 +column_format_offset_length = 2 +column_format_length_offset = 26 +column_format_length_length = 2 +column_label_text_subheader_index_offset = 28 +column_label_text_subheader_index_length = 2 +column_label_offset_offset = 30 +column_label_offset_length = 2 +column_label_length_offset = 32 +column_label_length_length = 2 +rle_compression = b"SASYZCRL" +rdc_compression = b"SASYZCR2" + +compression_literals = [rle_compression, rdc_compression] + +# Incomplete list of encodings, using SAS nomenclature: +# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm +encoding_names = { + 29: "latin1", + 20: "utf-8", + 33: "cyrillic", + 60: "wlatin2", + 61: "wcyrillic", + 62: "wlatin1", + 90: 
"ebcdic870", +} + + +class SASIndex: + row_size_index = 0 + column_size_index = 1 + subheader_counts_index = 2 + column_text_index = 3 + column_name_index = 4 + column_attributes_index = 5 + format_and_label_index = 6 + column_list_index = 7 + data_subheader_index = 8 + + +subheader_signature_to_index = { + b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index, + b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index, + b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index, + b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index, + b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index, + b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index, + b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index, + b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index, + b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index, + b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, + b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, + b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index, + b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index, + b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index, + b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, + b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, + b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index, + b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, + b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, + b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index, + b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index, + b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index, +} + + +# List of frequently used SAS date and datetime formats +# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm +# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java +sas_date_formats = ( + "DATE", + "DAY", + "DDMMYY", + "DOWNAME", + "JULDAY", + "JULIAN", + "MMDDYY", + "MMYY", + "MMYYC", + "MMYYD", + "MMYYP", + "MMYYS", + "MMYYN", + "MONNAME", + "MONTH", + "MONYY", + "QTR", + "QTRR", + "NENGO", + "WEEKDATE", + "WEEKDATX", + "WEEKDAY", + "WEEKV", + "WORDDATE", + "WORDDATX", + "YEAR", + "YYMM", + "YYMMC", + "YYMMD", + "YYMMP", + "YYMMS", + "YYMMN", + "YYMON", + "YYMMDD", + "YYQ", + "YYQC", + "YYQD", + "YYQP", + "YYQS", + "YYQN", + "YYQR", + "YYQRC", + "YYQRD", + "YYQRP", + "YYQRS", + "YYQRN", + "YYMMDDP", + "YYMMDDC", + "E8601DA", + "YYMMDDN", + "MMDDYYC", + "MMDDYYS", + "MMDDYYD", + "YYMMDDS", + "B8601DA", + "DDMMYYN", + "YYMMDDD", + "DDMMYYB", + "DDMMYYP", + "MMDDYYP", + "YYMMDDB", + "MMDDYYN", + "DDMMYYC", + "DDMMYYD", + "DDMMYYS", + "MINGUO", +) + +sas_datetime_formats = ( + "DATETIME", + "DTWKDATX", + "B8601DN", + "B8601DT", + "B8601DX", + "B8601DZ", + "B8601LX", + "E8601DN", + "E8601DT", + "E8601DX", + "E8601DZ", + "E8601LX", + "DATEAMPM", + "DTDATE", + "DTMONYY", + "DTMONYY", + "DTWKDATX", + "DTYEAR", + "TOD", + "MDYAMPM", +) diff --git 
a/venv/Lib/site-packages/pandas/io/sas/sas_xport.py b/venv/Lib/site-packages/pandas/io/sas/sas_xport.py new file mode 100644 index 0000000..3cf7fd8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sas/sas_xport.py @@ -0,0 +1,507 @@ +""" +Read a SAS XPort format file into a Pandas DataFrame. + +Based on code from Jack Cushman (github.com/jcushman/xport). + +The file format is defined here: + +https://support.sas.com/techsup/technote/ts140.pdf +""" +from collections import abc +from datetime import datetime +from io import BytesIO +import struct +import warnings + +import numpy as np + +from pandas.util._decorators import Appender + +import pandas as pd + +from pandas.io.common import get_filepath_or_buffer + +_correct_line1 = ( + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_header1 = ( + "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" +) +_correct_header2 = ( + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_obs_header = ( + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_fieldkeys = [ + "ntype", + "nhfun", + "field_length", + "nvar0", + "name", + "label", + "nform", + "nfl", + "num_decimals", + "nfj", + "nfill", + "niform", + "nifl", + "nifd", + "npos", + "_", +] + + +_base_params_doc = """\ +Parameters +---------- +filepath_or_buffer : string or file-like object + Path to SAS file or object implementing binary read method.""" + +_params2_doc = """\ +index : identifier of index column + Identifier of column that should be used as index of the DataFrame. +encoding : string + Encoding for text data. +chunksize : int + Read file `chunksize` lines at a time, returns iterator.""" + +_format_params_doc = """\ +format : string + File format, only `xport` is currently supported.""" + +_iterator_doc = """\ +iterator : boolean, default False + Return XportReader object for reading file incrementally.""" + + +_read_sas_doc = """Read a SAS file into a DataFrame. + +%(_base_params_doc)s +%(_format_params_doc)s +%(_params2_doc)s +%(_iterator_doc)s + +Returns +------- +DataFrame or XportReader + +Examples +-------- +Read a SAS Xport file: + +>>> df = pd.read_sas('filename.XPT') + +Read a Xport file in 10,000 line chunks: + +>>> itr = pd.read_sas('filename.XPT', chunksize=10000) +>>> for chunk in itr: +>>> do_something(chunk) + +""" % { + "_base_params_doc": _base_params_doc, + "_format_params_doc": _format_params_doc, + "_params2_doc": _params2_doc, + "_iterator_doc": _iterator_doc, +} + + +_xport_reader_doc = """\ +Class for reading SAS Xport files. + +%(_base_params_doc)s +%(_params2_doc)s + +Attributes +---------- +member_info : list + Contains information about the file +fields : list + Contains information about the variables in the file +""" % { + "_base_params_doc": _base_params_doc, + "_params2_doc": _params2_doc, +} + + +_read_method_doc = """\ +Read observations from SAS Xport file, returning as data frame. + +Parameters +---------- +nrows : int + Number of rows to read from data file; if None, read whole + file. + +Returns +------- +A DataFrame. +""" + + +def _parse_date(datestr: str) -> datetime: + """ Given a date in xport format, return Python date. """ + try: + # e.g. 
"16FEB11:10:07:55" + return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") + except ValueError: + return pd.NaT + + +def _split_line(s: str, parts): + """ + Parameters + ---------- + s: str + Fixed-length string to split + parts: list of (name, length) pairs + Used to break up string, name '_' will be filtered from output. + + Returns + ------- + Dict of name:contents of string at given location. + """ + out = {} + start = 0 + for name, length in parts: + out[name] = s[start : start + length].strip() + start += length + del out["_"] + return out + + +def _handle_truncated_float_vec(vec, nbytes): + # This feature is not well documented, but some SAS XPORT files + # have 2-7 byte "truncated" floats. To read these truncated + # floats, pad them with zeros on the right to make 8 byte floats. + # + # References: + # https://github.com/jcushman/xport/pull/3 + # The R "foreign" library + + if nbytes != 8: + vec1 = np.zeros(len(vec), np.dtype("S8")) + dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes)) + vec2 = vec1.view(dtype=dtype) + vec2["f0"] = vec + return vec2 + + return vec + + +def _parse_float_vec(vec): + """ + Parse a vector of float values representing IBM 8 byte floats into + native 8 byte floats. + """ + + dtype = np.dtype(">u4,>u4") + vec1 = vec.view(dtype=dtype) + xport1 = vec1["f0"] + xport2 = vec1["f1"] + + # Start by setting first half of ieee number to first half of IBM + # number sans exponent + ieee1 = xport1 & 0x00FFFFFF + + # The fraction bit to the left of the binary point in the ieee + # format was set and the number was shifted 0, 1, 2, or 3 + # places. This will tell us how to adjust the ibm exponent to be a + # power of 2 ieee exponent and how to shift the fraction bits to + # restore the correct magnitude. + shift = np.zeros(len(vec), dtype=np.uint8) + shift[np.where(xport1 & 0x00200000)] = 1 + shift[np.where(xport1 & 0x00400000)] = 2 + shift[np.where(xport1 & 0x00800000)] = 3 + + # shift the ieee number down the correct number of places then + # set the second half of the ieee number to be the second half + # of the ibm number shifted appropriately, ored with the bits + # from the first half that would have been shifted in if we + # could shift a double. All we are worried about are the low + # order 3 bits of the first half since we're only shifting by + # 1, 2, or 3. + ieee1 >>= shift + ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift))) + + # clear the 1 bit to the left of the binary point + ieee1 &= 0xFFEFFFFF + + # set the exponent of the ieee number to be the actual exponent + # plus the shift count + 1023. Or this into the first half of the + # ieee number. The ibm exponent is excess 64 but is adjusted by 65 + # since during conversion to ibm format the exponent is + # incremented by 1 and the fraction bits left 4 positions to the + # right of the radix point. 
(had to add >> 24 because C treats & + # 0x7f as 0x7f000000 and Python doesn't) + ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | ( + xport1 & 0x80000000 + ) + + ieee = np.empty((len(ieee1),), dtype=">u4,>u4") + ieee["f0"] = ieee1 + ieee["f1"] = ieee2 + ieee = ieee.view(dtype=">f8") + ieee = ieee.astype("f8") + + return ieee + + +class XportReader(abc.Iterator): + __doc__ = _xport_reader_doc + + def __init__( + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + ): + + self._encoding = encoding + self._lines_read = 0 + self._index = index + self._chunksize = chunksize + + if isinstance(filepath_or_buffer, str): + ( + filepath_or_buffer, + encoding, + compression, + should_close, + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) + + if isinstance(filepath_or_buffer, (str, bytes)): + self.filepath_or_buffer = open(filepath_or_buffer, "rb") + else: + # Copy to BytesIO, and ensure no encoding + contents = filepath_or_buffer.read() + try: + contents = contents.encode(self._encoding) + except UnicodeEncodeError: + pass + self.filepath_or_buffer = BytesIO(contents) + + self._read_header() + + def close(self): + self.filepath_or_buffer.close() + + def _get_row(self): + return self.filepath_or_buffer.read(80).decode() + + def _read_header(self): + self.filepath_or_buffer.seek(0) + + # read file header + line1 = self._get_row() + if line1 != _correct_line1: + self.close() + raise ValueError("Header record is not an XPORT file.") + + line2 = self._get_row() + fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]] + file_info = _split_line(line2, fif) + if file_info["prefix"] != "SAS SAS SASLIB": + self.close() + raise ValueError("Header record has invalid prefix.") + file_info["created"] = _parse_date(file_info["created"]) + self.file_info = file_info + + line3 = self._get_row() + file_info["modified"] = _parse_date(line3[:16]) + + # read member header + header1 = self._get_row() + header2 = self._get_row() + headflag1 = header1.startswith(_correct_header1) + headflag2 = header2 == _correct_header2 + if not (headflag1 and headflag2): + self.close() + raise ValueError("Member header not found") + # usually 140, could be 135 + fieldnamelength = int(header1[-5:-2]) + + # member info + mem = [ + ["prefix", 8], + ["set_name", 8], + ["sasdata", 8], + ["version", 8], + ["OS", 8], + ["_", 24], + ["created", 16], + ] + member_info = _split_line(self._get_row(), mem) + mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]] + member_info.update(_split_line(self._get_row(), mem)) + member_info["modified"] = _parse_date(member_info["modified"]) + member_info["created"] = _parse_date(member_info["created"]) + self.member_info = member_info + + # read field names + types = {1: "numeric", 2: "char"} + fieldcount = int(self._get_row()[54:58]) + datalength = fieldnamelength * fieldcount + # round up to nearest 80 + if datalength % 80: + datalength += 80 - datalength % 80 + fielddata = self.filepath_or_buffer.read(datalength) + fields = [] + obs_length = 0 + while len(fielddata) >= fieldnamelength: + # pull data for one field + field, fielddata = ( + fielddata[:fieldnamelength], + fielddata[fieldnamelength:], + ) + + # rest at end gets ignored, so if field is short, pad out + # to match struct pattern below + field = field.ljust(140) + + fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field) + field = dict(zip(_fieldkeys, fieldstruct)) + del field["_"] + field["ntype"] = types[field["ntype"]] + fl = 
field["field_length"] + if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): + self.close() + msg = f"Floating field width {fl} is not between 2 and 8." + raise TypeError(msg) + + for k, v in field.items(): + try: + field[k] = v.strip() + except AttributeError: + pass + + obs_length += field["field_length"] + fields += [field] + + header = self._get_row() + if not header == _correct_obs_header: + self.close() + raise ValueError("Observation header not found.") + + self.fields = fields + self.record_length = obs_length + self.record_start = self.filepath_or_buffer.tell() + + self.nobs = self._record_count() + self.columns = [x["name"].decode() for x in self.fields] + + # Setup the dtype. + dtypel = [ + ("s" + str(i), "S" + str(field["field_length"])) + for i, field in enumerate(self.fields) + ] + dtype = np.dtype(dtypel) + self._dtype = dtype + + def __next__(self): + return self.read(nrows=self._chunksize or 1) + + def _record_count(self) -> int: + """ + Get number of records in file. + + This is maybe suboptimal because we have to seek to the end of + the file. + + Side effect: returns file position to record_start. + """ + + self.filepath_or_buffer.seek(0, 2) + total_records_length = self.filepath_or_buffer.tell() - self.record_start + + if total_records_length % 80 != 0: + warnings.warn("xport file may be corrupted") + + if self.record_length > 80: + self.filepath_or_buffer.seek(self.record_start) + return total_records_length // self.record_length + + self.filepath_or_buffer.seek(-80, 2) + last_card = self.filepath_or_buffer.read(80) + last_card = np.frombuffer(last_card, dtype=np.uint64) + + # 8 byte blank + ix = np.flatnonzero(last_card == 2314885530818453536) + + if len(ix) == 0: + tail_pad = 0 + else: + tail_pad = 8 * len(ix) + + self.filepath_or_buffer.seek(self.record_start) + + return (total_records_length - tail_pad) // self.record_length + + def get_chunk(self, size=None): + """ + Reads lines from Xport file and returns as dataframe + + Parameters + ---------- + size : int, defaults to None + Number of lines to read. If None, reads whole file. 
+ + Returns + ------- + DataFrame + """ + if size is None: + size = self._chunksize + return self.read(nrows=size) + + def _missing_double(self, vec): + v = vec.view(dtype="u1,u1,u2,u4") + miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0) + miss1 = ( + ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A)) + | (v["f0"] == 0x5F) + | (v["f0"] == 0x2E) + ) + miss &= miss1 + return miss + + @Appender(_read_method_doc) + def read(self, nrows=None): + + if nrows is None: + nrows = self.nobs + + read_lines = min(nrows, self.nobs - self._lines_read) + read_len = read_lines * self.record_length + if read_len <= 0: + self.close() + raise StopIteration + raw = self.filepath_or_buffer.read(read_len) + data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) + + df = pd.DataFrame(index=range(read_lines)) + for j, x in enumerate(self.columns): + vec = data["s" + str(j)] + ntype = self.fields[j]["ntype"] + if ntype == "numeric": + vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) + miss = self._missing_double(vec) + v = _parse_float_vec(vec) + v[miss] = np.nan + elif self.fields[j]["ntype"] == "char": + v = [y.rstrip() for y in vec] + + if self._encoding is not None: + v = [y.decode(self._encoding) for y in v] + + df[x] = v + + if self._index is None: + df.index = range(self._lines_read, self._lines_read + read_lines) + else: + df = df.set_index(self._index) + + self._lines_read += read_lines + + return df diff --git a/venv/Lib/site-packages/pandas/io/sas/sasreader.py b/venv/Lib/site-packages/pandas/io/sas/sasreader.py new file mode 100644 index 0000000..56ebb58 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sas/sasreader.py @@ -0,0 +1,86 @@ +""" +Read SAS sas7bdat or xport files. +""" +from pandas.io.common import stringify_path + + +def read_sas( + filepath_or_buffer, + format=None, + index=None, + encoding=None, + chunksize=None, + iterator=False, +): + """ + Read SAS files stored as either XPORT or SAS7BDAT format files. + + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.sas``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + format : str {'xport', 'sas7bdat'} or None + If None, file format is inferred from file extension. If 'xport' or + 'sas7bdat', uses the corresponding format. + index : identifier of index column, defaults to None + Identifier of column that should be used as index of the DataFrame. + encoding : str, default is None + Encoding for text data. If None, text data are stored as raw bytes. + chunksize : int + Read file `chunksize` lines at a time, returns iterator. + iterator : bool, defaults to False + If True, returns an iterator for reading the file incrementally. 
+ + Returns + ------- + DataFrame if iterator=False and chunksize=None, else SAS7BDATReader + or XportReader + """ + if format is None: + buffer_error_msg = ( + "If this is a buffer object rather " + "than a string name, you must specify " + "a format string" + ) + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + raise ValueError(buffer_error_msg) + fname = filepath_or_buffer.lower() + if fname.endswith(".xpt"): + format = "xport" + elif fname.endswith(".sas7bdat"): + format = "sas7bdat" + else: + raise ValueError("unable to infer format of SAS file") + + if format.lower() == "xport": + from pandas.io.sas.sas_xport import XportReader + + reader = XportReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) + elif format.lower() == "sas7bdat": + from pandas.io.sas.sas7bdat import SAS7BDATReader + + reader = SAS7BDATReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) + else: + raise ValueError("unknown SAS format") + + if iterator or chunksize: + return reader + + data = reader.read() + reader.close() + return data diff --git a/venv/Lib/site-packages/pandas/io/spss.py b/venv/Lib/site-packages/pandas/io/spss.py new file mode 100644 index 0000000..cdbe14e --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/spss.py @@ -0,0 +1,45 @@ +from pathlib import Path +from typing import Optional, Sequence, Union + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.inference import is_list_like + +from pandas.core.api import DataFrame + + +def read_spss( + path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True, +) -> DataFrame: + """ + Load an SPSS file from the file path, returning a DataFrame. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + path : string or Path + File path. + usecols : list-like, optional + Return a subset of the columns. If None, return all columns. + convert_categoricals : bool, default is True + Convert categorical columns into pd.Categorical. + + Returns + ------- + DataFrame + """ + pyreadstat = import_optional_dependency("pyreadstat") + + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + else: + usecols = list(usecols) # pyreadstat requires a list + + df, _ = pyreadstat.read_sav( + path, usecols=usecols, apply_value_formats=convert_categoricals + ) + return df diff --git a/venv/Lib/site-packages/pandas/io/sql.py b/venv/Lib/site-packages/pandas/io/sql.py new file mode 100644 index 0000000..f452799 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/sql.py @@ -0,0 +1,1782 @@ +""" +Collection of query wrappers / abstractions to both facilitate data +retrieval and to reduce dependency on DB-specific API. 
+""" + +from contextlib import contextmanager +from datetime import date, datetime, time +from functools import partial +import re +import warnings + +import numpy as np + +import pandas._libs.lib as lib + +from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna + +from pandas.core.api import DataFrame, Series +from pandas.core.base import PandasObject +from pandas.core.tools.datetimes import to_datetime + + +class SQLAlchemyRequired(ImportError): + pass + + +class DatabaseError(IOError): + pass + + +# ----------------------------------------------------------------------------- +# -- Helper functions + +_SQLALCHEMY_INSTALLED = None + + +def _is_sqlalchemy_connectable(con): + global _SQLALCHEMY_INSTALLED + if _SQLALCHEMY_INSTALLED is None: + try: + import sqlalchemy + + _SQLALCHEMY_INSTALLED = True + except ImportError: + _SQLALCHEMY_INSTALLED = False + + if _SQLALCHEMY_INSTALLED: + import sqlalchemy # noqa: F811 + + return isinstance(con, sqlalchemy.engine.Connectable) + else: + return False + + +def _convert_params(sql, params): + """Convert SQL and params args to DBAPI2.0 compliant format.""" + args = [sql] + if params is not None: + if hasattr(params, "keys"): # test if params is a mapping + args += [params] + else: + args += [list(params)] + return args + + +def _process_parse_dates_argument(parse_dates): + """Process parse_dates argument for read_sql functions""" + # handle non-list entries for parse_dates gracefully + if parse_dates is True or parse_dates is None or parse_dates is False: + parse_dates = [] + + elif not hasattr(parse_dates, "__iter__"): + parse_dates = [parse_dates] + return parse_dates + + +def _handle_date_column(col, utc=None, format=None): + if isinstance(format, dict): + return to_datetime(col, errors="ignore", **format) + else: + # Allow passing of formatting string for integers + # GH17855 + if format is None and ( + issubclass(col.dtype.type, np.floating) + or issubclass(col.dtype.type, np.integer) + ): + format = "s" + if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]: + return to_datetime(col, errors="coerce", unit=format, utc=utc) + elif is_datetime64tz_dtype(col): + # coerce to UTC timezone + # GH11216 + return to_datetime(col, utc=True) + else: + return to_datetime(col, errors="coerce", format=format, utc=utc) + + +def _parse_date_columns(data_frame, parse_dates): + """ + Force non-datetime columns to be read as such. + Supports both string formatted and integer timestamp columns. 
+ """ + parse_dates = _process_parse_dates_argument(parse_dates) + + # we want to coerce datetime64_tz dtypes for now to UTC + # we could in theory do a 'nice' conversion from a FixedOffset tz + # GH11216 + for col_name, df_col in data_frame.items(): + if is_datetime64tz_dtype(df_col) or col_name in parse_dates: + try: + fmt = parse_dates[col_name] + except TypeError: + fmt = None + data_frame[col_name] = _handle_date_column(df_col, format=fmt) + + return data_frame + + +def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None): + """Wrap result set of query in a DataFrame.""" + + frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) + + frame = _parse_date_columns(frame, parse_dates) + + if index_col is not None: + frame.set_index(index_col, inplace=True) + + return frame + + +def execute(sql, con, cur=None, params=None): + """ + Execute the given SQL query using the provided connection object. + + Parameters + ---------- + sql : string + SQL query to be executed. + con : SQLAlchemy connectable(engine/connection) or sqlite3 connection + Using SQLAlchemy makes it possible to use any DB supported by the + library. + If a DBAPI2 object, only sqlite3 is supported. + cur : deprecated, cursor is obtained from connection, default: None + params : list or tuple, optional, default: None + List of parameters to pass to execute method. + + Returns + ------- + Results Iterable + """ + if cur is None: + pandas_sql = pandasSQL_builder(con) + else: + pandas_sql = pandasSQL_builder(cur, is_cursor=True) + args = _convert_params(sql, params) + return pandas_sql.execute(*args) + + +# ----------------------------------------------------------------------------- +# -- Read and write to DataFrames + + +def read_sql_table( + table_name, + con, + schema=None, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + chunksize=None, +): + """ + Read SQL database table into a DataFrame. + + Given a table name and a SQLAlchemy connectable, returns a DataFrame. + This function does not support DBAPI connections. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + con : SQLAlchemy connectable or str + A database URI could be provided as as str. + SQLite DBAPI connection mode not supported. + schema : str, default None + Name of SQL schema in database to query (if database flavor + supports this). Uses default schema if None (default). + index_col : str or list of str, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Can result in loss of Precision. + parse_dates : list or dict, default None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default None + List of column names to select from SQL table. + chunksize : int, default None + If specified, returns an iterator where `chunksize` is the number of + rows to include in each chunk. 
+ + Returns + ------- + DataFrame + A SQL table is returned as two-dimensional data structure with labeled + axes. + + See Also + -------- + read_sql_query : Read SQL query into a DataFrame. + read_sql : Read SQL query or database table into a DataFrame. + + Notes + ----- + Any datetime values with time zone information will be converted to UTC. + + Examples + -------- + >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP + """ + + con = _engine_builder(con) + if not _is_sqlalchemy_connectable(con): + raise NotImplementedError( + "read_sql_table only supported for SQLAlchemy connectable." + ) + import sqlalchemy + from sqlalchemy.schema import MetaData + + meta = MetaData(con, schema=schema) + try: + meta.reflect(only=[table_name], views=True) + except sqlalchemy.exc.InvalidRequestError: + raise ValueError(f"Table {table_name} not found") + + pandas_sql = SQLDatabase(con, meta=meta) + table = pandas_sql.read_table( + table_name, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) + + if table is not None: + return table + else: + raise ValueError(f"Table {table_name} not found", con) + + +def read_sql_query( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, +): + """ + Read SQL query into a DataFrame. + + Returns a DataFrame corresponding to the result set of the query + string. Optionally provide an `index_col` parameter to use one of the + columns as the index, otherwise default integer index will be used. + + Parameters + ---------- + sql : str SQL query or SQLAlchemy Selectable (select or text object) + SQL query to be executed. + con : SQLAlchemy connectable(engine/connection), database str URI, + or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point. Useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number of + rows to include in each chunk. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + Notes + ----- + Any datetime values with time zone information parsed via the `parse_dates` + parameter will be converted to UTC. 
+ """ + pandas_sql = pandasSQL_builder(con) + return pandas_sql.read_query( + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) + + +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, +): + """ + Read SQL query or database table into a DataFrame. + + This function is a convenience wrapper around ``read_sql_table`` and + ``read_sql_query`` (for backward compatibility). It will delegate + to the specific function depending on the provided input. A SQL query + will be routed to ``read_sql_query``, while a database table name will + be routed to ``read_sql_table``. Note that the delegated function might + have more specific notes about their functionality not listed here. + + Parameters + ---------- + sql : str or SQLAlchemy Selectable (select or text object) + SQL query to be executed or a table name. + con : SQLAlchemy connectable (engine/connection) or database str URI + or DBAPI2 connection (fallback mode)' + + Using SQLAlchemy makes it possible to use any DB supported by that + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ + index_col : str or list of strings, optional, default: None + Column(s) to set as index(MultiIndex). + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table (only used when reading + a table). + chunksize : int, default None + If specified, return an iterator where `chunksize` is the + number of rows to include in each chunk. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql_query : Read SQL query into a DataFrame. 
+ """ + pandas_sql = pandasSQL_builder(con) + + if isinstance(pandas_sql, SQLiteDatabase): + return pandas_sql.read_query( + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) + + try: + _is_table_name = pandas_sql.has_table(sql) + except Exception: + # using generic exception to catch errors from sql drivers (GH24988) + _is_table_name = False + + if _is_table_name: + pandas_sql.meta.reflect(only=[sql]) + return pandas_sql.read_table( + sql, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) + else: + return pandas_sql.read_query( + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) + + +def to_sql( + frame, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, +): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame, Series + name : str + Name of SQL table. + con : SQLAlchemy connectable(engine/connection) or database string URI + or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + schema : str, optional + Name of SQL schema in database to write to (if database flavor + supports this). If None, use default schema (default). + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : str or sequence, optional + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 fallback mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional + Controls the SQL insertion clause used: + + - None : Uses standard SQL ``INSERT`` clause (one per row). + - 'multi': Pass multiple values in a single ``INSERT`` clause. + - callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + """ + if if_exists not in ("fail", "replace", "append"): + raise ValueError(f"'{if_exists}' is not valid for if_exists") + + pandas_sql = pandasSQL_builder(con, schema=schema) + + if isinstance(frame, Series): + frame = frame.to_frame() + elif not isinstance(frame, DataFrame): + raise NotImplementedError( + "'frame' argument should be either a Series or a DataFrame" + ) + + pandas_sql.to_sql( + frame, + name, + if_exists=if_exists, + index=index, + index_label=index_label, + schema=schema, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + +def has_table(table_name, con, schema=None): + """ + Check if DataBase has named table. 
+ + Parameters + ---------- + table_name: string + Name of SQL table. + con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + schema : string, default None + Name of SQL schema in database to write to (if database flavor supports + this). If None, use default schema (default). + + Returns + ------- + boolean + """ + pandas_sql = pandasSQL_builder(con, schema=schema) + return pandas_sql.has_table(table_name) + + +table_exists = has_table + + +def _engine_builder(con): + """ + Returns a SQLAlchemy engine from a URI (if con is a string) + else it just return con without modifying it. + """ + global _SQLALCHEMY_INSTALLED + if isinstance(con, str): + try: + import sqlalchemy + except ImportError: + _SQLALCHEMY_INSTALLED = False + else: + con = sqlalchemy.create_engine(con) + return con + + return con + + +def pandasSQL_builder(con, schema=None, meta=None, is_cursor=False): + """ + Convenience function to return the correct PandasSQL subclass based on the + provided parameters. + """ + # When support for DBAPI connections is removed, + # is_cursor should not be necessary. + con = _engine_builder(con) + if _is_sqlalchemy_connectable(con): + return SQLDatabase(con, schema=schema, meta=meta) + elif isinstance(con, str): + raise ImportError("Using URI string without sqlalchemy installed.") + else: + return SQLiteDatabase(con, is_cursor=is_cursor) + + +class SQLTable(PandasObject): + """ + For mapping Pandas tables to SQL tables. + Uses fact that table is reflected by SQLAlchemy to + do better type conversions. + Also holds various flags needed to avoid having to + pass them between functions all the time. + """ + + # TODO: support for multiIndex + + def __init__( + self, + name, + pandas_sql_engine, + frame=None, + index=True, + if_exists="fail", + prefix="pandas", + index_label=None, + schema=None, + keys=None, + dtype=None, + ): + self.name = name + self.pd_sql = pandas_sql_engine + self.prefix = prefix + self.frame = frame + self.index = self._index_name(index, index_label) + self.schema = schema + self.if_exists = if_exists + self.keys = keys + self.dtype = dtype + + if frame is not None: + # We want to initialize based on a dataframe + self.table = self._create_table_setup() + else: + # no data provided, read-only mode + self.table = self.pd_sql.get_table(self.name, self.schema) + + if self.table is None: + raise ValueError(f"Could not init table '{name}'") + + def exists(self): + return self.pd_sql.has_table(self.name, self.schema) + + def sql_schema(self): + from sqlalchemy.schema import CreateTable + + return str(CreateTable(self.table).compile(self.pd_sql.connectable)) + + def _execute_create(self): + # Inserting table into database, add to MetaData object + self.table = self.table.tometadata(self.pd_sql.meta) + self.table.create() + + def create(self): + if self.exists(): + if self.if_exists == "fail": + raise ValueError(f"Table '{self.name}' already exists.") + elif self.if_exists == "replace": + self.pd_sql.drop_table(self.name, self.schema) + self._execute_create() + elif self.if_exists == "append": + pass + else: + raise ValueError(f"'{self.if_exists}' is not valid for if_exists") + else: + self._execute_create() + + def _execute_insert(self, conn, keys, data_iter): + """Execute SQL statement inserting data + + Parameters + ---------- + conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection + keys : list of str + Column names 
+ data_iter : generator of list + Each item contains a list of values to be inserted + """ + data = [dict(zip(keys, row)) for row in data_iter] + conn.execute(self.table.insert(), data) + + def _execute_insert_multi(self, conn, keys, data_iter): + """Alternative to _execute_insert for DBs support multivalue INSERT. + + Note: multi-value insert is usually faster for analytics DBs + and tables containing a few columns + but performance degrades quickly with increase of columns. + """ + data = [dict(zip(keys, row)) for row in data_iter] + conn.execute(self.table.insert(data)) + + def insert_data(self): + if self.index is not None: + temp = self.frame.copy() + temp.index.names = self.index + try: + temp.reset_index(inplace=True) + except ValueError as err: + raise ValueError(f"duplicate name in index/columns: {err}") + else: + temp = self.frame + + column_names = list(map(str, temp.columns)) + ncols = len(column_names) + data_list = [None] * ncols + blocks = temp._data.blocks + + for b in blocks: + if b.is_datetime: + # return datetime.datetime objects + if b.is_datetimetz: + # GH 9086: Ensure we return datetimes with timezone info + # Need to return 2-D data; DatetimeIndex is 1D + d = b.values.to_pydatetime() + d = np.atleast_2d(d) + else: + # convert to microsecond resolution for datetime.datetime + d = b.values.astype("M8[us]").astype(object) + else: + d = np.array(b.get_values(), dtype=object) + + # replace NaN with None + if b._can_hold_na: + mask = isna(d) + d[mask] = None + + for col_loc, col in zip(b.mgr_locs, d): + data_list[col_loc] = col + + return column_names, data_list + + def insert(self, chunksize=None, method=None): + + # set insert method + if method is None: + exec_insert = self._execute_insert + elif method == "multi": + exec_insert = self._execute_insert_multi + elif callable(method): + exec_insert = partial(method, self) + else: + raise ValueError(f"Invalid parameter `method`: {method}") + + keys, data_list = self.insert_data() + + nrows = len(self.frame) + + if nrows == 0: + return + + if chunksize is None: + chunksize = nrows + elif chunksize == 0: + raise ValueError("chunksize argument should be non-zero") + + chunks = int(nrows / chunksize) + 1 + + with self.pd_sql.run_transaction() as conn: + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) + exec_insert(conn, keys, chunk_iter) + + def _query_iterator( + self, result, chunksize, columns, coerce_float=True, parse_dates=None + ): + """Return generator through chunked result set.""" + + while True: + data = result.fetchmany(chunksize) + if not data: + break + else: + self.frame = DataFrame.from_records( + data, columns=columns, coerce_float=coerce_float + ) + + self._harmonize_columns(parse_dates=parse_dates) + + if self.index is not None: + self.frame.set_index(self.index, inplace=True) + + yield self.frame + + def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None): + + if columns is not None and len(columns) > 0: + from sqlalchemy import select + + cols = [self.table.c[n] for n in columns] + if self.index is not None: + for idx in self.index[::-1]: + cols.insert(0, self.table.c[idx]) + sql_select = select(cols) + else: + sql_select = self.table.select() + + result = self.pd_sql.execute(sql_select) + column_names = result.keys() + + if chunksize is not None: + return self._query_iterator( + result, + chunksize, + column_names, + coerce_float=coerce_float, 
+ parse_dates=parse_dates, + ) + else: + data = result.fetchall() + self.frame = DataFrame.from_records( + data, columns=column_names, coerce_float=coerce_float + ) + + self._harmonize_columns(parse_dates=parse_dates) + + if self.index is not None: + self.frame.set_index(self.index, inplace=True) + + return self.frame + + def _index_name(self, index, index_label): + # for writing: index=True to include index in sql table + if index is True: + nlevels = self.frame.index.nlevels + # if index_label is specified, set this as index name(s) + if index_label is not None: + if not isinstance(index_label, list): + index_label = [index_label] + if len(index_label) != nlevels: + raise ValueError( + "Length of 'index_label' should match number of " + f"levels, which is {nlevels}" + ) + else: + return index_label + # return the used column labels for the index columns + if ( + nlevels == 1 + and "index" not in self.frame.columns + and self.frame.index.name is None + ): + return ["index"] + else: + return [ + l if l is not None else f"level_{i}" + for i, l in enumerate(self.frame.index.names) + ] + + # for reading: index=(list of) string to specify column to set as index + elif isinstance(index, str): + return [index] + elif isinstance(index, list): + return index + else: + return None + + def _get_column_names_and_types(self, dtype_mapper): + column_names_and_types = [] + if self.index is not None: + for i, idx_label in enumerate(self.index): + idx_type = dtype_mapper(self.frame.index._get_level_values(i)) + column_names_and_types.append((str(idx_label), idx_type, True)) + + column_names_and_types += [ + (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False) + for i in range(len(self.frame.columns)) + ] + + return column_names_and_types + + def _create_table_setup(self): + from sqlalchemy import Table, Column, PrimaryKeyConstraint + + column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) + + columns = [ + Column(name, typ, index=is_index) + for name, typ, is_index in column_names_and_types + ] + + if self.keys is not None: + if not is_list_like(self.keys): + keys = [self.keys] + else: + keys = self.keys + pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk") + columns.append(pkc) + + schema = self.schema or self.pd_sql.meta.schema + + # At this point, attach to new metadata, only attach to self.meta + # once table is created. + from sqlalchemy.schema import MetaData + + meta = MetaData(self.pd_sql, schema=schema) + + return Table(self.name, meta, *columns, schema=schema) + + def _harmonize_columns(self, parse_dates=None): + """ + Make the DataFrame's column types align with the SQL table + column types. + Need to work around limited NA value support. Floats are always + fine, ints must always be floats if there are Null values. + Booleans are hard because converting bool column with None replaces + all Nones with false. Therefore only convert bool if there are no + NA values. + Datetimes should already be converted to np.datetime64 if supported, + but here we also force conversion if required. 
+ """ + parse_dates = _process_parse_dates_argument(parse_dates) + + for sql_col in self.table.columns: + col_name = sql_col.name + try: + df_col = self.frame[col_name] + + # Handle date parsing upfront; don't try to convert columns + # twice + if col_name in parse_dates: + try: + fmt = parse_dates[col_name] + except TypeError: + fmt = None + self.frame[col_name] = _handle_date_column(df_col, format=fmt) + continue + + # the type the dataframe column should have + col_type = self._get_dtype(sql_col.type) + + if ( + col_type is datetime + or col_type is date + or col_type is DatetimeTZDtype + ): + # Convert tz-aware Datetime SQL columns to UTC + utc = col_type is DatetimeTZDtype + self.frame[col_name] = _handle_date_column(df_col, utc=utc) + elif col_type is float: + # floats support NA, can always convert! + self.frame[col_name] = df_col.astype(col_type, copy=False) + + elif len(df_col) == df_col.count(): + # No NA values, can convert ints and bools + if col_type is np.dtype("int64") or col_type is bool: + self.frame[col_name] = df_col.astype(col_type, copy=False) + except KeyError: + pass # this column not in results + + def _sqlalchemy_type(self, col): + + dtype = self.dtype or {} + if col.name in dtype: + return self.dtype[col.name] + + # Infer type of column, while ignoring missing values. + # Needed for inserting typed data containing NULLs, GH 8778. + col_type = lib.infer_dtype(col, skipna=True) + + from sqlalchemy.types import ( + BigInteger, + Integer, + Float, + Text, + Boolean, + DateTime, + Date, + Time, + TIMESTAMP, + ) + + if col_type == "datetime64" or col_type == "datetime": + # GH 9086: TIMESTAMP is the suggested type if the column contains + # timezone information + try: + if col.dt.tz is not None: + return TIMESTAMP(timezone=True) + except AttributeError: + # The column is actually a DatetimeIndex + if col.tz is not None: + return TIMESTAMP(timezone=True) + return DateTime + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) + return BigInteger + elif col_type == "floating": + if col.dtype == "float32": + return Float(precision=23) + else: + return Float(precision=53) + elif col_type == "integer": + if col.dtype == "int32": + return Integer + else: + return BigInteger + elif col_type == "boolean": + return Boolean + elif col_type == "date": + return Date + elif col_type == "time": + return Time + elif col_type == "complex": + raise ValueError("Complex datatypes not supported") + + return Text + + def _get_dtype(self, sqltype): + from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + + if isinstance(sqltype, Float): + return float + elif isinstance(sqltype, Integer): + # TODO: Refine integer size. + return np.dtype("int64") + elif isinstance(sqltype, TIMESTAMP): + # we have a timezone capable type + if not sqltype.timezone: + return datetime + return DatetimeTZDtype + elif isinstance(sqltype, DateTime): + # Caution: np.datetime64 is also a subclass of np.number. + return datetime + elif isinstance(sqltype, Date): + return date + elif isinstance(sqltype, Boolean): + return bool + return object + + +class PandasSQL(PandasObject): + """ + Subclasses Should define read_sql and to_sql. 
+ """ + + def read_sql(self, *args, **kwargs): + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) + + def to_sql(self, *args, **kwargs): + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) + + +class SQLDatabase(PandasSQL): + """ + This class enables conversion between DataFrame and SQL databases + using SQLAlchemy to handle DataBase abstraction. + + Parameters + ---------- + engine : SQLAlchemy connectable + Connectable to connect with the database. Using SQLAlchemy makes it + possible to use any DB supported by that library. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If None, use default schema (default). + meta : SQLAlchemy MetaData object, default None + If provided, this MetaData object is used instead of a newly + created. This allows to specify database flavor specific + arguments in the MetaData object. + + """ + + def __init__(self, engine, schema=None, meta=None): + self.connectable = engine + if not meta: + from sqlalchemy.schema import MetaData + + meta = MetaData(self.connectable, schema=schema) + + self.meta = meta + + @contextmanager + def run_transaction(self): + with self.connectable.begin() as tx: + if hasattr(tx, "execute"): + yield tx + else: + yield self.connectable + + def execute(self, *args, **kwargs): + """Simple passthrough to SQLAlchemy connectable""" + return self.connectable.execute(*args, **kwargs) + + def read_table( + self, + table_name, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + schema=None, + chunksize=None, + ): + """Read SQL database table into a DataFrame. + + Parameters + ---------- + table_name : string + Name of SQL table in database. + index_col : string, optional, default: None + Column to set as index. + coerce_float : boolean, default True + Attempts to convert values of non-string, non-numeric objects + (like decimal.Decimal) to floating point. This can result in + loss of precision. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table. + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQL database object. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number + of rows to include in each chunk. 
+ + Returns + ------- + DataFrame + + See Also + -------- + pandas.read_sql_table + SQLDatabase.read_query + + """ + table = SQLTable(table_name, self, index=index_col, schema=schema) + return table.read( + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) + + @staticmethod + def _query_iterator( + result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): + """Return generator through chunked result set""" + + while True: + data = result.fetchmany(chunksize) + if not data: + break + else: + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + parse_dates=None, + params=None, + chunksize=None, + ): + """Read SQL query into a DataFrame. + + Parameters + ---------- + sql : string + SQL query to be executed. + index_col : string, optional, default: None + Column name to use as index for the returned DataFrame object. + coerce_float : boolean, default True + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially useful with databases + without native Datetime support, such as SQLite. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number + of rows to include in each chunk. + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + """ + args = _convert_params(sql, params) + + result = self.execute(*args) + columns = result.keys() + + if chunksize is not None: + return self._query_iterator( + result, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + else: + data = result.fetchall() + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + return frame + + read_sql = read_query + + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Column label for index column(s). 
If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. + dtype : single type or dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. + method : {None', 'multi', callable}, default None + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + """ + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + + if dtype is not None: + from sqlalchemy.types import to_instance, TypeEngine + + for col, my_type in dtype.items(): + if not isinstance(to_instance(my_type), TypeEngine): + raise ValueError(f"The type of {col} is not a SQLAlchemy type") + + table = SQLTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + schema=schema, + dtype=dtype, + ) + table.create() + table.insert(chunksize, method=method) + if not name.isdigit() and not name.islower(): + # check for potentially case sensitivity issues (GH7815) + # Only check when name is not a number and name is not lower case + engine = self.connectable.engine + with self.connectable.connect() as conn: + table_names = engine.table_names( + schema=schema or self.meta.schema, connection=conn + ) + if name not in table_names: + msg = ( + f"The provided table name '{name}' is not found exactly as " + "such in the database after writing the table, possibly " + "due to case sensitivity issues. Consider using lower " + "case table names." 
+ ) + warnings.warn(msg, UserWarning) + + @property + def tables(self): + return self.meta.tables + + def has_table(self, name, schema=None): + return self.connectable.run_callable( + self.connectable.dialect.has_table, name, schema or self.meta.schema + ) + + def get_table(self, table_name, schema=None): + schema = schema or self.meta.schema + if schema: + tbl = self.meta.tables.get(".".join([schema, table_name])) + else: + tbl = self.meta.tables.get(table_name) + + # Avoid casting double-precision floats into decimals + from sqlalchemy import Numeric + + for column in tbl.columns: + if isinstance(column.type, Numeric): + column.type.asdecimal = False + + return tbl + + def drop_table(self, table_name, schema=None): + schema = schema or self.meta.schema + if self.has_table(table_name, schema): + self.meta.reflect(only=[table_name], schema=schema) + self.get_table(table_name, schema).drop() + self.meta.clear() + + def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): + table = SQLTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) + return str(table.sql_schema()) + + +# ---- SQL without SQLAlchemy --- +# sqlite-specific sql strings and handler class +# dictionary used for readability purposes +_SQL_TYPES = { + "string": "TEXT", + "floating": "REAL", + "integer": "INTEGER", + "datetime": "TIMESTAMP", + "date": "DATE", + "time": "TIME", + "boolean": "INTEGER", +} + + +def _get_unicode_name(name): + try: + uname = str(name).encode("utf-8", "strict").decode("utf-8") + except UnicodeError: + raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") + return uname + + +def _get_valid_sqlite_name(name): + # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ + # -for-sqlite-table-column-names-in-python + # Ensure the string can be encoded as UTF-8. + # Ensure the string does not include any NUL characters. + # Replace all " with "". + # Wrap the entire thing in double quotes. + + uname = _get_unicode_name(name) + if not len(uname): + raise ValueError("Empty table or column name specified") + + nul_index = uname.find("\x00") + if nul_index >= 0: + raise ValueError("SQLite identifier cannot contain NULs") + return '"' + uname.replace('"', '""') + '"' + + +_SAFE_NAMES_WARNING = ( + "The spaces in these column names will not be changed. " + "In pandas versions < 0.14, spaces were converted to " + "underscores." +) + + +class SQLiteTable(SQLTable): + """ + Patch the SQLTable for fallback support. + Instead of a table variable just use the Create Table statement. + """ + + def __init__(self, *args, **kwargs): + # GH 8341 + # register an adapter callable for datetime.time object + import sqlite3 + + # this will transform time(12,34,56,789) into '12:34:56.000789' + # (this is what sqlalchemy does) + sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + super().__init__(*args, **kwargs) + + def sql_schema(self): + return str(";\n".join(self.table)) + + def _execute_create(self): + with self.pd_sql.run_transaction() as conn: + for stmt in self.table: + conn.execute(stmt) + + def insert_statement(self): + names = list(map(str, self.frame.columns)) + wld = "?" 
# wildcard char + escape = _get_valid_sqlite_name + + if self.index is not None: + for idx in self.index[::-1]: + names.insert(0, idx) + + bracketed_names = [escape(column) for column in names] + col_names = ",".join(bracketed_names) + wildcards = ",".join([wld] * len(names)) + insert_statement = ( + f"INSERT INTO {escape(self.name)} ({col_names}) VALUES ({wildcards})" + ) + return insert_statement + + def _execute_insert(self, conn, keys, data_iter): + data_list = list(data_iter) + conn.executemany(self.insert_statement(), data_list) + + def _create_table_setup(self): + """ + Return a list of SQL statements that creates a table reflecting the + structure of a DataFrame. The first entry will be a CREATE TABLE + statement while the rest will be CREATE INDEX statements. + """ + column_names_and_types = self._get_column_names_and_types(self._sql_type_name) + + pat = re.compile(r"\s+") + column_names = [col_name for col_name, _, _ in column_names_and_types] + if any(map(pat.search, column_names)): + warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) + + escape = _get_valid_sqlite_name + + create_tbl_stmts = [ + escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types + ] + + if self.keys is not None and len(self.keys): + if not is_list_like(self.keys): + keys = [self.keys] + else: + keys = self.keys + cnames_br = ", ".join(escape(c) for c in keys) + create_tbl_stmts.append( + f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" + ) + + create_stmts = [ + "CREATE TABLE " + + escape(self.name) + + " (\n" + + ",\n ".join(create_tbl_stmts) + + "\n)" + ] + + ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] + if len(ix_cols): + cnames = "_".join(ix_cols) + cnames_br = ",".join(escape(c) for c in ix_cols) + create_stmts.append( + "CREATE INDEX " + + escape("ix_" + self.name + "_" + cnames) + + "ON " + + escape(self.name) + + " (" + + cnames_br + + ")" + ) + + return create_stmts + + def _sql_type_name(self, col): + dtype = self.dtype or {} + if col.name in dtype: + return dtype[col.name] + + # Infer type of column, while ignoring missing values. + # Needed for inserting typed data containing NULLs, GH 8778. + col_type = lib.infer_dtype(col, skipna=True) + + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) + col_type = "integer" + + elif col_type == "datetime64": + col_type = "datetime" + + elif col_type == "empty": + col_type = "string" + + elif col_type == "complex": + raise ValueError("Complex datatypes not supported") + + if col_type not in _SQL_TYPES: + col_type = "string" + + return _SQL_TYPES[col_type] + + +class SQLiteDatabase(PandasSQL): + """ + Version of SQLDatabase to support SQLite connections (fallback without + SQLAlchemy). This should only be used internally. 
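`SQLiteTable` and `SQLiteDatabase` back the fallback path taken when `to_sql` receives a plain DBAPI `sqlite3` connection instead of a SQLAlchemy connectable. A small sketch of that path (file and table names made up); note that here `dtype` values are SQL type strings rather than SQLAlchemy types:

```python
import sqlite3
import pandas as pd

con = sqlite3.connect("fallback.db")   # plain DBAPI connection, no SQLAlchemy

df = pd.DataFrame({"city": ["a", "b"], "count": [1, 2]})

# Dispatches internally to SQLiteDatabase.to_sql / SQLiteTable.
df.to_sql("counts", con, if_exists="replace", index=False,
          dtype={"city": "TEXT", "count": "INTEGER"})

print(pd.read_sql_query("SELECT * FROM counts", con))
con.close()
```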
+ + Parameters + ---------- + con : sqlite connection object + + """ + + def __init__(self, con, is_cursor=False): + self.is_cursor = is_cursor + self.con = con + + @contextmanager + def run_transaction(self): + cur = self.con.cursor() + try: + yield cur + self.con.commit() + except Exception: + self.con.rollback() + raise + finally: + cur.close() + + def execute(self, *args, **kwargs): + if self.is_cursor: + cur = self.con + else: + cur = self.con.cursor() + try: + cur.execute(*args, **kwargs) + return cur + except Exception as exc: + try: + self.con.rollback() + except Exception as inner_exc: # pragma: no cover + ex = DatabaseError( + f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback" + ) + raise ex from inner_exc + + ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}") + raise ex from exc + + @staticmethod + def _query_iterator( + cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): + """Return generator through chunked result set""" + + while True: + data = cursor.fetchmany(chunksize) + if type(data) == tuple: + data = list(data) + if not data: + cursor.close() + break + else: + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, + ): + + args = _convert_params(sql, params) + cursor = self.execute(*args) + columns = [col_desc[0] for col_desc in cursor.description] + + if chunksize is not None: + return self._query_iterator( + cursor, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + else: + data = self._fetchall_as_list(cursor) + cursor.close() + + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) + return frame + + def _fetchall_as_list(self, cur): + result = cur.fetchall() + if not isinstance(result, list): + result = list(result) + return result + + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame: DataFrame + name: string + Name of SQL table. + if_exists: {'fail', 'replace', 'append'}, default 'fail' + fail: If table exists, do nothing. + replace: If table exists, drop it, recreate it, and insert data. + append: If table exists, insert data. Create if it does not exist. + index : boolean, default True + Write DataFrame index as a column + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + schema : string, default None + Ignored parameter included for compatibility with SQLAlchemy + version of ``to_sql``. + chunksize : int, default None + If not None, then rows will be written in batches of this + size at a time. If None, all rows will be written at once. + dtype : single type or dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a string. If all columns are of the same type, one single value + can be used. 
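The `method` argument (described for the SQLAlchemy `to_sql` above and again just below for this class) also accepts a callable with signature `(pd_table, conn, keys, data_iter)`. A sketch assuming the plain `sqlite3` path shown above, where `conn` is a DBAPI cursor supporting `executemany`; the helper name is made up:

```python
def insert_with_executemany(pd_table, conn, keys, data_iter):
    # keys are the column names being written; data_iter yields row tuples.
    placeholders = ", ".join(["?"] * len(keys))   # sqlite paramstyle
    columns = ", ".join(keys)
    sql = f"INSERT INTO {pd_table.name} ({columns}) VALUES ({placeholders})"
    conn.executemany(sql, list(data_iter))

# Usage (con being a sqlite3 connection as in the previous sketch):
# df.to_sql("counts", con, if_exists="append", index=False,
#           method=insert_with_executemany)
```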
+ method : {None, 'multi', callable}, default None + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + """ + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + + if dtype is not None: + for col, my_type in dtype.items(): + if not isinstance(my_type, str): + raise ValueError(f"{col} ({my_type}) not a string") + + table = SQLiteTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + dtype=dtype, + ) + table.create() + table.insert(chunksize, method) + + def has_table(self, name, schema=None): + # TODO(wesm): unused? + # escape = _get_valid_sqlite_name + # esc_name = escape(name) + + wld = "?" + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" + + return len(self.execute(query, [name]).fetchall()) > 0 + + def get_table(self, table_name, schema=None): + return None # not supported in fallback mode + + def drop_table(self, name, schema=None): + drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" + self.execute(drop_sql) + + def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): + table = SQLiteTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) + return str(table.sql_schema()) + + +def get_schema(frame, name, keys=None, con=None, dtype=None): + """ + Get the SQL db table schema for the given frame. + + Parameters + ---------- + frame : DataFrame + name : string + name of SQL table + keys : string or sequence, default: None + columns to use a primary key + con: an open SQL database connection object or a SQLAlchemy connectable + Using SQLAlchemy makes it possible to use any DB supported by that + library, default: None + If a DBAPI2 object, only sqlite3 is supported. + dtype : dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a SQLAlchemy type, or a string for sqlite3 fallback connection. + + """ + + pandas_sql = pandasSQL_builder(con=con) + return pandas_sql._create_sql_schema(frame, name, keys=keys, dtype=dtype) diff --git a/venv/Lib/site-packages/pandas/io/stata.py b/venv/Lib/site-packages/pandas/io/stata.py new file mode 100644 index 0000000..a724665 --- /dev/null +++ b/venv/Lib/site-packages/pandas/io/stata.py @@ -0,0 +1,3333 @@ +""" +Module contains tools for processing Stata files into DataFrames + +The StataReader below was originally written by Joe Presbrey as part of PyDTA. +It has been extended and improved by Skipper Seabold from the Statsmodels +project who also developed the StataWriter and was finally added to pandas in +a once again improved version. 
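Before the Stata reader below, one last note on the SQL module that ends above: the module-level `get_schema` helper can be called without a live connection, in which case the sqlite fallback dialect generates the DDL. A short sketch with made-up column names:

```python
import pandas as pd
from pandas.io.sql import get_schema

df = pd.DataFrame({"city": ["a"], "count": [1]})

# With con=None the sqlite3 fallback is used to render the CREATE TABLE.
print(get_schema(df, "counts", keys="city"))
```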
+ +You can find more information on http://presbrey.mit.edu/PyDTA and +http://www.statsmodels.org/devel/ +""" +from collections import abc +import datetime +from io import BytesIO +import os +import struct +import sys +from typing import Any, Dict, Hashable, Optional, Sequence +import warnings + +from dateutil.relativedelta import relativedelta +import numpy as np + +from pandas._libs.lib import infer_dtype +from pandas._libs.writers import max_len_string_array +from pandas._typing import FilePathOrBuffer +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ( + ensure_object, + is_categorical_dtype, + is_datetime64_dtype, +) + +from pandas import ( + Categorical, + DatetimeIndex, + NaT, + Timestamp, + concat, + isna, + to_datetime, + to_timedelta, +) +from pandas.core.frame import DataFrame +from pandas.core.series import Series + +from pandas.io.common import get_filepath_or_buffer, stringify_path + +_version_error = ( + "Version of given Stata file is {version}. pandas supports importing " + "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," + "and 119 (Stata 15/16, over 32,767 variables)." +) + +_statafile_processing_params1 = """\ +convert_dates : bool, default True + Convert date variables to DataFrame time values. +convert_categoricals : bool, default True + Read value labels and convert columns to Categorical/Factor variables.""" + +_statafile_processing_params2 = """\ +index_col : str, optional + Column to set as index. +convert_missing : bool, default False + Flag indicating whether to convert missing values to their Stata + representations. If False, missing values are replaced with nan. + If True, columns containing missing values are returned with + object data types and missing values are represented by + StataMissingValue objects. +preserve_dtypes : bool, default True + Preserve Stata datatypes. If False, numeric data are upcast to pandas + default types for foreign data (float64 or int64). +columns : list or None + Columns to retain. Columns will be returned in the given order. None + returns all columns. +order_categoricals : bool, default True + Flag indicating whether converted categorical data are ordered.""" + +_chunksize_params = """\ +chunksize : int, default None + Return StataReader object for iterations, returns chunks with + given number of lines.""" + +_iterator_params = """\ +iterator : bool, default False + Return StataReader object.""" + +_read_stata_doc = f""" +Read Stata file into DataFrame. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: ``file://localhost/path/to/table.dta``. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} + +Returns +------- +DataFrame or StataReader + +See Also +-------- +io.stata.StataReader : Low-level reader for Stata data files. +DataFrame.to_stata: Export Stata data files. 
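The processing options listed above are easiest to see in a call. A sketch assuming a hypothetical `survey.dta` file with `id` and `income` columns, keeping Stata missing codes as `StataMissingValue` objects instead of NaN:

```python
import pandas as pd

df = pd.read_stata(
    "survey.dta",                 # hypothetical file
    columns=["id", "income"],     # retain only these columns
    index_col="id",
    convert_missing=True,         # keep Stata missing-value codes
    preserve_dtypes=False,        # upcast to int64 / float64
    order_categoricals=False,
)
```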
+ +Examples +-------- +Read a Stata dta file: + +>>> df = pd.read_stata('filename.dta') + +Read a Stata dta file in 10,000 line chunks: + +>>> itr = pd.read_stata('filename.dta', chunksize=10000) +>>> for chunk in itr: +... do_something(chunk) +""" + +_read_method_doc = f"""\ +Reads observations from Stata file, converting them into a dataframe + +Parameters +---------- +nrows : int + Number of lines to read from data file, if None read whole file. +{_statafile_processing_params1} +{_statafile_processing_params2} + +Returns +------- +DataFrame +""" + +_stata_reader_doc = f"""\ +Class for reading Stata dta files. + +Parameters +---------- +path_or_buf : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or object + implementing a binary read() functions. + + .. versionadded:: 0.23.0 support for pathlib, py.path. +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +""" + + +@Appender(_read_stata_doc) +def read_stata( + filepath_or_buffer, + convert_dates=True, + convert_categoricals=True, + index_col=None, + convert_missing=False, + preserve_dtypes=True, + columns=None, + order_categoricals=True, + chunksize=None, + iterator=False, +): + + reader = StataReader( + filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index_col=index_col, + convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, + ) + + if iterator or chunksize: + data = reader + else: + try: + data = reader.read() + finally: + reader.close() + return data + + +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] + + +stata_epoch = datetime.datetime(1960, 1, 1) + + +def _stata_elapsed_date_to_datetime_vec(dates, fmt): + """ + Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : Series + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + Returns + + Returns + ------- + converted : Series + The converted dates + + Examples + -------- + >>> dates = pd.Series([52]) + >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") + 0 1961-01-01 + dtype: datetime64[ns] + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. + monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 yearly + date - ty + years since 0000 + + If you don't have pandas with datetime support, then you can't do + milliseconds accurately. 
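The week, month, quarter, and half-year formats described in the notes above are plain offsets from the January 1960 epoch. A small worked example of the `%tm` arithmetic (the value 720 is illustrative):

```python
import pandas as pd

tm_value = 720                    # months elapsed since 1960m1
year = 1960 + tm_value // 12      # -> 2020
month = tm_value % 12 + 1         # -> 1
print(pd.Timestamp(year=year, month=month, day=1))   # 2020-01-01 00:00:00
```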
+ """ + MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year + MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days + MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days + MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 + MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 + + def convert_year_month_safe(year, month): + """ + Convert year and month to datetimes, using pandas vectorized versions + when the date range falls within the range supported by pandas. + Otherwise it falls back to a slower but more robust method + using datetime. + """ + if year.max() < MAX_YEAR and year.min() > MIN_YEAR: + return to_datetime(100 * year + month, format="%Y%m") + else: + index = getattr(year, "index", None) + return Series( + [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index + ) + + def convert_year_days_safe(year, days): + """ + Converts year (e.g. 1999) and days since the start of the year to a + datetime or datetime64 Series + """ + if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: + return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") + else: + index = getattr(year, "index", None) + value = [ + datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) + for y, d in zip(year, days) + ] + return Series(value, index=index) + + def convert_delta_safe(base, deltas, unit): + """ + Convert base dates and deltas to datetimes, using pandas vectorized + versions if the deltas satisfy restrictions required to be expressed + as dates in pandas. + """ + index = getattr(deltas, "index", None) + if unit == "d": + if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: + values = [base + relativedelta(days=int(d)) for d in deltas] + return Series(values, index=index) + elif unit == "ms": + if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: + values = [ + base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas + ] + return Series(values, index=index) + else: + raise ValueError("format not understood") + base = to_datetime(base) + deltas = to_timedelta(deltas, unit=unit) + return base + deltas + + # TODO: If/when pandas supports more than datetime64[ns], this should be + # improved to use correct range, e.g. datetime[Y] for yearly + bad_locs = np.isnan(dates) + has_bad_values = False + if bad_locs.any(): + has_bad_values = True + data_col = Series(dates) + data_col[bad_locs] = 1.0 # Replace with NaT + dates = dates.astype(np.int64) + + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base + base = stata_epoch + ms = dates + conv_dates = convert_delta_safe(base, ms, "ms") + elif fmt.startswith(("%tC", "tC")): + + warnings.warn("Encountered %tC format. Leaving in Stata Internal Format.") + conv_dates = Series(dates, dtype=np.object) + if has_bad_values: + conv_dates[bad_locs] = NaT + return conv_dates + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): + base = stata_epoch + days = dates + conv_dates = convert_delta_safe(base, days, "d") + # does not count leap days - 7 days is a week. 
+ # 52nd week may have more than 7 days + elif fmt.startswith(("%tw", "tw")): + year = stata_epoch.year + dates // 52 + days = (dates % 52) * 7 + conv_dates = convert_year_days_safe(year, days) + elif fmt.startswith(("%tm", "tm")): # Delta months relative to base + year = stata_epoch.year + dates // 12 + month = (dates % 12) + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base + year = stata_epoch.year + dates // 4 + month = (dates % 4) * 3 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt.startswith(("%th", "th")): # Delta half-years relative to base + year = stata_epoch.year + dates // 2 + month = (dates % 2) * 6 + 1 + conv_dates = convert_year_month_safe(year, month) + elif fmt.startswith(("%ty", "ty")): # Years -- not delta + year = dates + month = np.ones_like(dates) + conv_dates = convert_year_month_safe(year, month) + else: + raise ValueError(f"Date fmt {fmt} not understood") + + if has_bad_values: # Restore NaT for bad values + conv_dates[bad_locs] = NaT + + return conv_dates + + +def _datetime_to_stata_elapsed_vec(dates, fmt): + """ + Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + dates : Series + Series or array containing datetime.datetime or datetime64[ns] to + convert to the Stata Internal Format given by fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + """ + index = dates.index + NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000 + US_PER_DAY = NS_PER_DAY / 1000 + + def parse_dates_safe(dates, delta=False, year=False, days=False): + d = {} + if is_datetime64_dtype(dates.values): + if delta: + delta = dates - stata_epoch + d["delta"] = delta.values.astype(np.int64) // 1000 # microseconds + if days or year: + dates = DatetimeIndex(dates) + d["year"], d["month"] = dates.year, dates.month + if days: + days = dates.astype(np.int64) - to_datetime( + d["year"], format="%Y" + ).astype(np.int64) + d["days"] = days // NS_PER_DAY + + elif infer_dtype(dates, skipna=False) == "datetime": + if delta: + delta = dates.values - stata_epoch + f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds + v = np.vectorize(f) + d["delta"] = v(delta) + if year: + year_month = dates.apply(lambda x: 100 * x.year + x.month) + d["year"] = year_month.values // 100 + d["month"] = year_month.values - d["year"] * 100 + if days: + f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days + v = np.vectorize(f) + d["days"] = v(dates) + else: + raise ValueError( + "Columns containing dates must contain either " + "datetime64, datetime.datetime or null values." 
+ ) + + return DataFrame(d, index=index) + + bad_loc = isna(dates) + index = dates.index + if bad_loc.any(): + dates = Series(dates) + if is_datetime64_dtype(dates): + dates[bad_loc] = to_datetime(stata_epoch) + else: + dates[bad_loc] = stata_epoch + + if fmt in ["%tc", "tc"]: + d = parse_dates_safe(dates, delta=True) + conv_dates = d.delta / 1000 + elif fmt in ["%tC", "tC"]: + warnings.warn("Stata Internal Format tC not supported.") + conv_dates = dates + elif fmt in ["%td", "td"]: + d = parse_dates_safe(dates, delta=True) + conv_dates = d.delta // US_PER_DAY + elif fmt in ["%tw", "tw"]: + d = parse_dates_safe(dates, year=True, days=True) + conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 + elif fmt in ["%tm", "tm"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1 + elif fmt in ["%tq", "tq"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 + elif fmt in ["%th", "th"]: + d = parse_dates_safe(dates, year=True) + conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int) + elif fmt in ["%ty", "ty"]: + d = parse_dates_safe(dates, year=True) + conv_dates = d.year + else: + raise ValueError(f"Format {fmt} is not a known Stata date format") + + conv_dates = Series(conv_dates, dtype=np.float64) + missing_value = struct.unpack("= 2 ** 53: + ws = precision_loss_doc.format("uint64", "float64") + + data[col] = data[col].astype(dtype) + + # Check values and upcast if necessary + if dtype == np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): + ws = precision_loss_doc.format("int64", "float64") + elif dtype in (np.float32, np.float64): + value = data[col].max() + if np.isinf(value): + raise ValueError( + f"Column {col} has a maximum value of infinity which is outside " + "the range supported by Stata." + ) + if dtype == np.float32 and value > float32_max: + data[col] = data[col].astype(np.float64) + elif dtype == np.float64: + if value > float64_max: + raise ValueError( + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" + ) + + if ws: + warnings.warn(ws, PossiblePrecisionLoss) + + return data + + +class StataValueLabel: + """ + Parse a categorical column and prepare formatted output + + Parameters + ---------- + catarray : Categorical + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. 
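`StataValueLabel` is what turns a pandas `Categorical` column into a Stata value label when writing. A round-trip sketch, with a hypothetical output path and illustrative category names:

```python
import pandas as pd

df = pd.DataFrame(
    {"status": pd.Categorical(["confirmed", "suspected", "confirmed"])}
)

# Writing encodes the categories as a value label for the variable.
df.to_stata("status.dta", write_index=False)

# Reading with the default convert_categoricals=True restores the labels.
back = pd.read_stata("status.dta")
print(back["status"].cat.categories)
```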
+ """ + + def __init__(self, catarray, encoding="latin-1"): + + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") + self.labname = catarray.name + self._encoding = encoding + categories = catarray.cat.categories + self.value_labels = list(zip(np.arange(len(categories)), categories)) + self.value_labels.sort(key=lambda x: x[0]) + self.text_len = np.int32(0) + self.off = [] + self.val = [] + self.txt = [] + self.n = 0 + + # Compute lengths and setup lists of offsets and labels + for vl in self.value_labels: + category = vl[1] + if not isinstance(category, str): + category = str(category) + warnings.warn( + value_label_mismatch_doc.format(catarray.name), + ValueLabelTypeMismatch, + ) + category = category.encode(encoding) + self.off.append(self.text_len) + self.text_len += len(category) + 1 # +1 for the padding + self.val.append(vl[0]) + self.txt.append(category) + self.n += 1 + + if self.text_len > 32000: + raise ValueError( + "Stata value labels for a single variable must " + "have a combined length less than 32,000 " + "characters." + ) + + # Ensure int32 + self.off = np.array(self.off, dtype=np.int32) + self.val = np.array(self.val, dtype=np.int32) + + # Total length + self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len + + def _encode(self, s): + """ + Python 3 compatibility shim + """ + return s.encode(self._encoding) + + def generate_value_label(self, byteorder): + """ + Generate the binary representation of the value labals. + + Parameters + ---------- + byteorder : str + Byte order of the output + + Returns + ------- + value_label : bytes + Bytes containing the formatted value label + """ + encoding = self._encoding + bio = BytesIO() + null_byte = b"\x00" + + # len + bio.write(struct.pack(byteorder + "i", self.len)) + + # labname + labname = self.labname[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) + bio.write(labname) + + # padding - 3 bytes + for i in range(3): + bio.write(struct.pack("c", null_byte)) + + # value_label_table + # n - int32 + bio.write(struct.pack(byteorder + "i", self.n)) + + # textlen - int32 + bio.write(struct.pack(byteorder + "i", self.text_len)) + + # off - int32 array (n elements) + for offset in self.off: + bio.write(struct.pack(byteorder + "i", offset)) + + # val - int32 array (n elements) + for value in self.val: + bio.write(struct.pack(byteorder + "i", value)) + + # txt - Text labels, null terminated + for text in self.txt: + bio.write(text + null_byte) + + bio.seek(0) + return bio.read() + + +class StataMissingValue: + """ + An observation's missing value. + + Parameters + ---------- + value : int8, int16, int32, float32 or float64 + The Stata missing value code + + Attributes + ---------- + string : string + String representation of the Stata missing value + value : int8, int16, int32, float32 or float64 + The original encoded missing value + + Notes + ----- + More information: + + Integer missing values make the code '.', '.a', ..., '.z' to the ranges + 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ... + 2147483647 (for int32). Missing values for floating point data types are + more complex but the pattern is simple to discern from the following table. + + np.float32 missing values (float in Stata) + 0000007f . + 0008007f .a + 0010007f .b + ... + 00c0007f .x + 00c8007f .y + 00d0007f .z + + np.float64 missing values (double in Stata) + 000000000000e07f . 
+ 000000000001e07f .a + 000000000002e07f .b + ... + 000000000018e07f .x + 000000000019e07f .y + 00000000001ae07f .z + """ + + # Construct a dictionary of missing values + MISSING_VALUES = {} + bases = (101, 32741, 2147483621) + for b in bases: + # Conversion to long to avoid hash issues on 32 bit platforms #8968 + MISSING_VALUES[b] = "." + for i in range(1, 27): + MISSING_VALUES[i + b] = "." + chr(96 + i) + + float32_base = b"\x00\x00\x00\x7f" + increment = struct.unpack(" 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack(" 0: + MISSING_VALUES[value] += chr(96 + i) + int_value = struct.unpack("q", struct.pack(" str: + return self.string + + def __repr__(self) -> str: + return f"{type(self)}({self})" + + def __eq__(self, other: Any) -> bool: + return ( + isinstance(other, type(self)) + and self.string == other.string + and self.value == other.value + ) + + @classmethod + def get_base_missing_value(cls, dtype): + if dtype == np.int8: + value = cls.BASE_MISSING_VALUES["int8"] + elif dtype == np.int16: + value = cls.BASE_MISSING_VALUES["int16"] + elif dtype == np.int32: + value = cls.BASE_MISSING_VALUES["int32"] + elif dtype == np.float32: + value = cls.BASE_MISSING_VALUES["float32"] + elif dtype == np.float64: + value = cls.BASE_MISSING_VALUES["float64"] + else: + raise ValueError("Unsupported dtype") + return value + + +class StataParser: + def __init__(self): + + # type code. + # -------------------- + # str1 1 = 0x01 + # str2 2 = 0x02 + # ... + # str244 244 = 0xf4 + # byte 251 = 0xfb (sic) + # int 252 = 0xfc + # long 253 = 0xfd + # float 254 = 0xfe + # double 255 = 0xff + # -------------------- + # NOTE: the byte type seems to be reserved for categorical variables + # with a label, but the underlying variable is -127 to 100 + # we're going to drop the label and cast to int + self.DTYPE_MAP = dict( + list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)])) + + [ + (251, np.int8), + (252, np.int16), + (253, np.int32), + (254, np.float32), + (255, np.float64), + ] + ) + self.DTYPE_MAP_XML = dict( + [ + (32768, np.uint8), # Keys to GSO + (65526, np.float64), + (65527, np.float32), + (65528, np.int32), + (65529, np.int16), + (65530, np.int8), + ] + ) + self.TYPE_MAP = list(range(251)) + list("bhlfd") + self.TYPE_MAP_XML = dict( + [ + # Not really a Q, unclear how to handle byteswap + (32768, "Q"), + (65526, "d"), + (65527, "f"), + (65528, "l"), + (65529, "h"), + (65530, "b"), + ] + ) + # NOTE: technically, some of these are wrong. there are more numbers + # that can be represented. it's the 27 ABOVE and BELOW the max listed + # numeric data type in [U] 12.2.2 of the 11.2 manual + float32_min = b"\xff\xff\xff\xfe" + float32_max = b"\xff\xff\xff\x7e" + float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff" + float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f" + self.VALID_RANGE = { + "b": (-127, 100), + "h": (-32767, 32740), + "l": (-2147483647, 2147483620), + "f": ( + np.float32(struct.unpack(" 0 + + # calculate size of a data record + self.col_sizes = [self._calcsize(typ) for typ in self.typlist] + + def _read_new_header(self, first_char): + # The first part of the header is common to 117 - 119. + self.path_or_buf.read(27) # stata_dta>
+ self.format_version = int(self.path_or_buf.read(3)) + if self.format_version not in [117, 118, 119]: + raise ValueError(_version_error.format(version=self.format_version)) + self._set_encoding() + self.path_or_buf.read(21) # + self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" + self.path_or_buf.read(15) # + nvar_type = "H" if self.format_version <= 118 else "I" + nvar_size = 2 if self.format_version <= 118 else 4 + self.nvar = struct.unpack( + self.byteorder + nvar_type, self.path_or_buf.read(nvar_size) + )[0] + self.path_or_buf.read(7) # + + self.nobs = self._get_nobs() + self.path_or_buf.read(11) # + self.time_stamp = self._get_time_stamp() + self.path_or_buf.read(26) #
+ self.path_or_buf.read(8) # 0x0000000000000000 + self.path_or_buf.read(8) # position of + + self._seek_vartypes = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16 + ) + self._seek_varnames = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 + ) + self._seek_sortlist = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 + ) + self._seek_formats = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9 + ) + self._seek_value_label_names = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19 + ) + + # Requires version-specific treatment + self._seek_variable_labels = self._get_seek_variable_labels() + + self.path_or_buf.read(8) # + self.data_location = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6 + ) + self.seek_strls = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7 + ) + self.seek_value_labels = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14 + ) + + self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes) + + self.path_or_buf.seek(self._seek_varnames) + self.varlist = self._get_varlist() + + self.path_or_buf.seek(self._seek_sortlist) + self.srtlist = struct.unpack( + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), + )[:-1] + + self.path_or_buf.seek(self._seek_formats) + self.fmtlist = self._get_fmtlist() + + self.path_or_buf.seek(self._seek_value_label_names) + self.lbllist = self._get_lbllist() + + self.path_or_buf.seek(self._seek_variable_labels) + self._variable_labels = self._get_variable_labels() + + # Get data type information, works for versions 117-119. + def _get_dtypes(self, seek_vartypes): + + self.path_or_buf.seek(seek_vartypes) + raw_typlist = [ + struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + for i in range(self.nvar) + ] + + def f(typ): + if typ <= 2045: + return typ + try: + return self.TYPE_MAP_XML[typ] + except KeyError: + raise ValueError(f"cannot convert stata types [{typ}]") + + typlist = [f(x) for x in raw_typlist] + + def f(typ): + if typ <= 2045: + return str(typ) + try: + return self.DTYPE_MAP_XML[typ] + except KeyError: + raise ValueError(f"cannot convert stata dtype [{typ}]") + + dtyplist = [f(x) for x in raw_typlist] + + return typlist, dtyplist + + def _get_varlist(self): + if self.format_version == 117: + b = 33 + elif self.format_version >= 118: + b = 129 + + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] + + # Returns the format list + def _get_fmtlist(self): + if self.format_version >= 118: + b = 57 + elif self.format_version > 113: + b = 49 + elif self.format_version > 104: + b = 12 + else: + b = 7 + + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] + + # Returns the label list + def _get_lbllist(self): + if self.format_version >= 118: + b = 129 + elif self.format_version > 108: + b = 33 + else: + b = 9 + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] + + def _get_variable_labels(self): + if self.format_version >= 118: + vlblist = [ + self._decode(self.path_or_buf.read(321)) for i in range(self.nvar) + ] + elif self.format_version > 105: + vlblist = [ + self._decode(self.path_or_buf.read(81)) for i in range(self.nvar) + ] + else: + vlblist = [ + self._decode(self.path_or_buf.read(32)) for i in range(self.nvar) + ] + return vlblist + + def _get_nobs(self): + if self.format_version >= 118: + return 
struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] + else: + return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + + def _get_data_label(self): + if self.format_version >= 118: + strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + return self._decode(self.path_or_buf.read(strlen)) + elif self.format_version == 117: + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] + return self._decode(self.path_or_buf.read(strlen)) + elif self.format_version > 105: + return self._decode(self.path_or_buf.read(81)) + else: + return self._decode(self.path_or_buf.read(32)) + + def _get_time_stamp(self): + if self.format_version >= 118: + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] + return self.path_or_buf.read(strlen).decode("utf-8") + elif self.format_version == 117: + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] + return self._decode(self.path_or_buf.read(strlen)) + elif self.format_version > 104: + return self._decode(self.path_or_buf.read(18)) + else: + raise ValueError() + + def _get_seek_variable_labels(self): + if self.format_version == 117: + self.path_or_buf.read(8) # , throw away + # Stata 117 data files do not follow the described format. This is + # a work around that uses the previous label, 33 bytes for each + # variable, 20 for the closing tag and 17 for the opening tag + return self._seek_value_label_names + (33 * self.nvar) + 20 + 17 + elif self.format_version >= 118: + return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17 + else: + raise ValueError() + + def _read_old_header(self, first_char): + self.format_version = struct.unpack("b", first_char)[0] + if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: + raise ValueError(_version_error.format(version=self.format_version)) + self._set_encoding() + self.byteorder = ( + struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" + ) + self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0] + self.path_or_buf.read(1) # unused + + self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + self.nobs = self._get_nobs() + + self._data_label = self._get_data_label() + + self.time_stamp = self._get_time_stamp() + + # descriptors + if self.format_version > 108: + typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)] + else: + buf = self.path_or_buf.read(self.nvar) + typlistb = np.frombuffer(buf, dtype=np.uint8) + typlist = [] + for tp in typlistb: + if tp in self.OLD_TYPE_MAPPING: + typlist.append(self.OLD_TYPE_MAPPING[tp]) + else: + typlist.append(tp - 127) # bytes + + try: + self.typlist = [self.TYPE_MAP[typ] for typ in typlist] + except ValueError: + invalid_types = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata types [{invalid_types}]") + try: + self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] + except ValueError: + invalid_dtypes = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") + + if self.format_version > 108: + self.varlist = [ + self._decode(self.path_or_buf.read(33)) for i in range(self.nvar) + ] + else: + self.varlist = [ + self._decode(self.path_or_buf.read(9)) for i in range(self.nvar) + ] + self.srtlist = struct.unpack( + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), + )[:-1] + + self.fmtlist = self._get_fmtlist() + + self.lbllist = self._get_lbllist() + + self._variable_labels = self._get_variable_labels() + + # ignore 
expansion fields (Format 105 and later) + # When reading, read five bytes; the last four bytes now tell you + # the size of the next read, which you discard. You then continue + # like this until you read 5 bytes of zeros. + + if self.format_version > 104: + while True: + data_type = struct.unpack( + self.byteorder + "b", self.path_or_buf.read(1) + )[0] + if self.format_version > 108: + data_len = struct.unpack( + self.byteorder + "i", self.path_or_buf.read(4) + )[0] + else: + data_len = struct.unpack( + self.byteorder + "h", self.path_or_buf.read(2) + )[0] + if data_type == 0: + break + self.path_or_buf.read(data_len) + + # necessary data to continue parsing + self.data_location = self.path_or_buf.tell() + + def _setup_dtype(self): + """Map between numpy and state dtypes""" + if self._dtype is not None: + return self._dtype + + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(("s" + str(i), "S" + str(typ))) + dtype = np.dtype(dtype) + self._dtype = dtype + + return self._dtype + + def _calcsize(self, fmt): + return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt) + + def _decode(self, s): + # have bytes not strings, so must decode + s = s.partition(b"\0")[0] + try: + return s.decode(self._encoding) + except UnicodeDecodeError: + # GH 25960, fallback to handle incorrect format produced when 117 + # files are converted to 118 files in Stata + encoding = self._encoding + msg = f""" +One or more strings in the dta file could not be decoded using {encoding}, and +so the fallback encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + warnings.warn(msg, UnicodeWarning) + return s.decode("latin-1") + + def _read_value_labels(self): + if self._value_labels_read: + # Don't read twice + return + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. 
+ self._value_labels_read = True + self.value_label_dict = dict() + return + + if self.format_version >= 117: + self.path_or_buf.seek(self.seek_value_labels) + else: + offset = self.nobs * self._dtype.itemsize + self.path_or_buf.seek(self.data_location + offset) + + self._value_labels_read = True + self.value_label_dict = dict() + + while True: + if self.format_version >= 117: + if self.path_or_buf.read(5) == b" + break # end of value label table + + slength = self.path_or_buf.read(4) + if not slength: + break # end of value label table (format < 117) + if self.format_version <= 117: + labname = self._decode(self.path_or_buf.read(33)) + else: + labname = self._decode(self.path_or_buf.read(129)) + self.path_or_buf.read(3) # padding + + n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + off = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) + val = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) + ii = np.argsort(off) + off = off[ii] + val = val[ii] + txt = self.path_or_buf.read(txtlen) + self.value_label_dict[labname] = dict() + for i in range(n): + end = off[i + 1] if i < n - 1 else txtlen + self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end]) + if self.format_version >= 117: + self.path_or_buf.read(6) # + self._value_labels_read = True + + def _read_strls(self): + self.path_or_buf.seek(self.seek_strls) + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO = {"0": ""} + while True: + if self.path_or_buf.read(3) != b"GSO": + break + + if self.format_version == 117: + v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] + else: + buf = self.path_or_buf.read(12) + # Only tested on little endian file on little endian machine. + v_size = 2 if self.format_version == 118 else 3 + if self.byteorder == "<": + buf = buf[0:v_size] + buf[4 : 12 - v_size] + else: + # This path may not be correct, impossible to test + buf = buf[0:v_size] + buf[4 + v_size :] + v_o = struct.unpack("Q", buf)[0] + typ = struct.unpack("B", self.path_or_buf.read(1))[0] + length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + va = self.path_or_buf.read(length) + if typ == 130: + va = va[0:-1].decode(self._encoding) + # Wrap v_o in a string to allow uint64 values as keys on 32bit OS + self.GSO[str(v_o)] = va + + def __next__(self): + return self.read(nrows=self._chunksize or 1) + + def get_chunk(self, size=None): + """ + Reads lines from Stata file and returns as dataframe + + Parameters + ---------- + size : int, defaults to None + Number of lines to read. If None, reads whole file. + + Returns + ------- + DataFrame + """ + if size is None: + size = self._chunksize + return self.read(nrows=size) + + @Appender(_read_method_doc) + def read( + self, + nrows=None, + convert_dates=None, + convert_categoricals=None, + index_col=None, + convert_missing=None, + preserve_dtypes=None, + columns=None, + order_categoricals=None, + ): + # Handle empty file or chunk. If reading incrementally raise + # StopIteration. If reading the whole thing return an empty + # data frame. 
+ if (self.nobs == 0) and (nrows is None): + self._can_read_value_labels = True + self._data_read = True + self.close() + return DataFrame(columns=self.varlist) + + # Handle options + if convert_dates is None: + convert_dates = self._convert_dates + if convert_categoricals is None: + convert_categoricals = self._convert_categoricals + if convert_missing is None: + convert_missing = self._convert_missing + if preserve_dtypes is None: + preserve_dtypes = self._preserve_dtypes + if columns is None: + columns = self._columns + if order_categoricals is None: + order_categoricals = self._order_categoricals + if index_col is None: + index_col = self._index_col + + if nrows is None: + nrows = self.nobs + + if (self.format_version >= 117) and (not self._value_labels_read): + self._can_read_value_labels = True + self._read_strls() + + # Read data + dtype = self._dtype + max_read_len = (self.nobs - self._lines_read) * dtype.itemsize + read_len = nrows * dtype.itemsize + read_len = min(read_len, max_read_len) + if read_len <= 0: + # Iterator has finished, should never be here unless + # we are reading the file incrementally + if convert_categoricals: + self._read_value_labels() + self.close() + raise StopIteration + offset = self._lines_read * dtype.itemsize + self.path_or_buf.seek(self.data_location + offset) + read_lines = min(nrows, self.nobs - self._lines_read) + data = np.frombuffer( + self.path_or_buf.read(read_len), dtype=dtype, count=read_lines + ) + + self._lines_read += read_lines + if self._lines_read == self.nobs: + self._can_read_value_labels = True + self._data_read = True + # if necessary, swap the byte order to native here + if self.byteorder != self._native_byteorder: + data = data.byteswap().newbyteorder() + + if convert_categoricals: + self._read_value_labels() + + if len(data) == 0: + data = DataFrame(columns=self.varlist) + else: + data = DataFrame.from_records(data) + data.columns = self.varlist + + # If index is not specified, use actual row number rather than + # restarting at 0 for each chunk. 
+ if index_col is None: + ix = np.arange(self._lines_read - read_lines, self._lines_read) + data = data.set_index(ix) + + if columns is not None: + try: + data = self._do_select_columns(data, columns) + except ValueError: + self.close() + raise + + # Decode strings + for col, typ in zip(data, self.typlist): + if type(typ) is int: + data[col] = data[col].apply(self._decode, convert_dtype=True) + + data = self._insert_strls(data) + + cols_ = np.where(self.dtyplist)[0] + + # Convert columns (if needed) to match input type + ix = data.index + requires_type_conversion = False + data_formatted = [] + for i in cols_: + if self.dtyplist[i] is not None: + col = data.columns[i] + dtype = data[col].dtype + if dtype != np.dtype(object) and dtype != self.dtyplist[i]: + requires_type_conversion = True + data_formatted.append( + (col, Series(data[col], ix, self.dtyplist[i])) + ) + else: + data_formatted.append((col, data[col])) + if requires_type_conversion: + data = DataFrame.from_dict(dict(data_formatted)) + del data_formatted + + data = self._do_convert_missing(data, convert_missing) + + if convert_dates: + + def any_startswith(x: str) -> bool: + return any(x.startswith(fmt) for fmt in _date_formats) + + cols = np.where([any_startswith(x) for x in self.fmtlist])[0] + for i in cols: + col = data.columns[i] + try: + data[col] = _stata_elapsed_date_to_datetime_vec( + data[col], self.fmtlist[i] + ) + except ValueError: + self.close() + raise + + if convert_categoricals and self.format_version > 108: + data = self._do_convert_categoricals( + data, self.value_label_dict, self.lbllist, order_categoricals + ) + + if not preserve_dtypes: + retyped_data = [] + convert = False + for col in data: + dtype = data[col].dtype + if dtype in (np.float16, np.float32): + dtype = np.float64 + convert = True + elif dtype in (np.int8, np.int16, np.int32): + dtype = np.int64 + convert = True + retyped_data.append((col, data[col].astype(dtype))) + if convert: + data = DataFrame.from_dict(dict(retyped_data)) + + if index_col is not None: + data = data.set_index(data.pop(index_col)) + + return data + + def _do_convert_missing(self, data, convert_missing): + # Check for missing values, and replace if found + replacements = {} + for i, colname in enumerate(data): + fmt = self.typlist[i] + if fmt not in self.VALID_RANGE: + continue + + nmin, nmax = self.VALID_RANGE[fmt] + series = data[colname] + missing = np.logical_or(series < nmin, series > nmax) + + if not missing.any(): + continue + + if convert_missing: # Replacement follows Stata notation + + missing_loc = np.argwhere(missing._ndarray_values) + umissing, umissing_loc = np.unique(series[missing], return_inverse=True) + replacement = Series(series, dtype=np.object) + for j, um in enumerate(umissing): + missing_value = StataMissingValue(um) + + loc = missing_loc[umissing_loc == j] + replacement.iloc[loc] = missing_value + else: # All replacements are identical + dtype = series.dtype + if dtype not in (np.float32, np.float64): + dtype = np.float64 + replacement = Series(series, dtype=dtype) + replacement[missing] = np.nan + replacements[colname] = replacement + if replacements: + columns = data.columns + replacements = DataFrame(replacements) + data = concat([data.drop(replacements.columns, 1), replacements], 1) + data = data[columns] + return data + + def _insert_strls(self, data): + if not hasattr(self, "GSO") or len(self.GSO) == 0: + return data + for i, typ in enumerate(self.typlist): + if typ != "Q": + continue + # Wrap v_o in a string to allow uint64 values as keys on 
32bit OS + data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] + return data + + def _do_select_columns(self, data, columns): + + if not self._column_selector_set: + column_set = set(columns) + if len(column_set) != len(columns): + raise ValueError("columns contains duplicate entries") + unmatched = column_set.difference(data.columns) + if unmatched: + raise ValueError( + "The following columns were not found in the " + "Stata data set: " + ", ".join(list(unmatched)) + ) + # Copy information for retained columns for later processing + dtyplist = [] + typlist = [] + fmtlist = [] + lbllist = [] + for col in columns: + i = data.columns.get_loc(col) + dtyplist.append(self.dtyplist[i]) + typlist.append(self.typlist[i]) + fmtlist.append(self.fmtlist[i]) + lbllist.append(self.lbllist[i]) + + self.dtyplist = dtyplist + self.typlist = typlist + self.fmtlist = fmtlist + self.lbllist = lbllist + self._column_selector_set = True + + return data[columns] + + def _do_convert_categoricals( + self, data, value_label_dict, lbllist, order_categoricals + ): + """ + Converts categorical columns to Categorical type. + """ + value_labels = list(value_label_dict.keys()) + cat_converted_data = [] + for col, label in zip(data, lbllist): + if label in value_labels: + # Explicit call with ordered=True + cat_data = Categorical(data[col], ordered=order_categoricals) + categories = [] + for category in cat_data.categories: + if category in value_label_dict[label]: + categories.append(value_label_dict[label][category]) + else: + categories.append(category) # Partially labeled + try: + cat_data.categories = categories + except ValueError: + vc = Series(categories).value_counts() + repeats = list(vc.index[vc > 1]) + repeats = "-" * 80 + "\n" + "\n".join(repeats) + # GH 25772 + msg = f""" +Value labels for column {col} are not unique. These cannot be converted to +pandas categoricals. + +Either read the file with `convert_categoricals` set to False or use the +low level interface in `StataReader` to separately read the values and the +value_labels. + +The repeated labels are: +{repeats} +""" + raise ValueError(msg) + # TODO: is the next line needed above in the data(...) method? + cat_data = Series(cat_data, index=data.index) + cat_converted_data.append((col, cat_data)) + else: + cat_converted_data.append((col, data[col])) + data = DataFrame.from_dict(dict(cat_converted_data)) + return data + + @property + def data_label(self): + """ + Return data label of Stata file. + """ + return self._data_label + + def variable_labels(self): + """ + Return variable labels as a dict, associating each variable name + with corresponding label. + + Returns + ------- + dict + """ + return dict(zip(self.varlist, self._variable_labels)) + + def value_labels(self): + """ + Return a dict, associating each variable name a dict, associating + each value its corresponding label. + + Returns + ------- + dict + """ + if not self._value_labels_read: + self._read_value_labels() + + return self.value_label_dict + + +def _open_file_binary_write(fname): + """ + Open a binary file or no-op if file-like. 
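The `value_labels` and `variable_labels` accessors shown above can be used without loading the whole dataset by asking `read_stata` for the reader object itself. A sketch against the same hypothetical `survey.dta`:

```python
import pandas as pd

reader = pd.read_stata("survey.dta", iterator=True)   # returns a StataReader
print(reader.variable_labels())      # {variable name: label}
chunk = reader.read(nrows=100)       # first 100 observations
print(reader.value_labels())         # {label name: {value: text}}
reader.close()
```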
+ + Parameters + ---------- + fname : string path, path object or buffer + + Returns + ------- + file : file-like object + File object supporting write + own : bool + True if the file was created, otherwise False + """ + if hasattr(fname, "write"): + # if 'b' not in fname.mode: + return fname, False + return open(fname, "wb"), True + + +def _set_endianness(endianness): + if endianness.lower() in ["<", "little"]: + return "<" + elif endianness.lower() in [">", "big"]: + return ">" + else: # pragma : no cover + raise ValueError(f"Endianness {endianness} not understood") + + +def _pad_bytes(name, length): + """ + Take a char string and pads it with null bytes until it's length chars. + """ + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) + return name + "\x00" * (length - len(name)) + + +def _convert_datetime_to_stata_type(fmt): + """ + Convert from one of the stata date formats to a type in TYPE_MAP. + """ + if fmt in [ + "tc", + "%tc", + "td", + "%td", + "tw", + "%tw", + "tm", + "%tm", + "tq", + "%tq", + "th", + "%th", + "ty", + "%ty", + ]: + return np.float64 # Stata expects doubles for SIFs + else: + raise NotImplementedError(f"Format {fmt} not implemented") + + +def _maybe_convert_to_int_keys(convert_dates, varlist): + new_dict = {} + for key in convert_dates: + if not convert_dates[key].startswith("%"): # make sure proper fmts + convert_dates[key] = "%" + convert_dates[key] + if key in varlist: + new_dict.update({varlist.index(key): convert_dates[key]}) + else: + if not isinstance(key, int): + raise ValueError("convert_dates key must be a column or an integer") + new_dict.update({key: convert_dates[key]}) + return new_dict + + +def _dtype_to_stata_type(dtype, column): + """ + Convert dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. + 1 - 244 are strings of this length + Pandas Stata + 251 - for int8 byte + 252 - for int16 int + 253 - for int32 long + 254 - for float32 float + 255 - for double double + + If there are dates to convert, then dtype will already have the correct + type inserted. + """ + # TODO: expand to handle datetime to integer conversion + if dtype.type == np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we + # do? + itemsize = max_len_string_array(ensure_object(column.values)) + return max(itemsize, 1) + elif dtype == np.float64: + return 255 + elif dtype == np.float32: + return 254 + elif dtype == np.int32: + return 253 + elif dtype == np.int16: + return 252 + elif dtype == np.int8: + return 251 + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): + """ + Map numpy dtype to stata's default format for this type. Not terribly + important since users can change this in Stata. Semantics are + + object -> "%DDs" where DD is the length of the string. If not a string, + raise ValueError + float64 -> "%10.0g" + float32 -> "%9.0g" + int64 -> "%9.0g" + int32 -> "%12.0g" + int16 -> "%8.0g" + int8 -> "%8.0g" + strl -> "%9s" + """ + # TODO: Refactor to combine type with format + # TODO: expand this to handle a default datetime format? 
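+    # For example: an object column whose longest string is 15 characters maps
+    # to "%15s", a float64 column maps to "%10.0g", and a column forced to
+    # strL (dta 117+) maps to "%9s".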
+ if dta_version < 117: + max_str_len = 244 + else: + max_str_len = 2045 + if force_strl: + return "%9s" + if dtype.type == np.object_: + itemsize = max_len_string_array(ensure_object(column.values)) + if itemsize > max_str_len: + if dta_version >= 117: + return "%9s" + else: + raise ValueError(excessive_string_length_error.format(column.name)) + return "%" + str(max(itemsize, 1)) + "s" + elif dtype == np.float64: + return "%10.0g" + elif dtype == np.float32: + return "%9.0g" + elif dtype == np.int32: + return "%12.0g" + elif dtype == np.int8 or dtype == np.int16: + return "%8.0g" + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +class StataWriter(StataParser): + """ + A class for writing Stata binary dta files + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + + .. versionadded:: 0.23.0 support for pathlib, py.path. + + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + + Returns + ------- + writer : StataWriter instance + The StataWriter instance has a write_file method, which will + write the file to the given `fname`. 
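+
+    An in-memory buffer can also be passed; it is left open for the caller
+    (a minimal sketch):
+
+    >>> import io
+    >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
+    >>> StataWriter(io.BytesIO(), data).write_file()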
+ + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) + >>> writer = StataWriter('./data_file.dta', data) + >>> writer.write_file() + + Or with dates + >>> from datetime import datetime + >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) + >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'}) + >>> writer.write_file() + """ + + _max_string_length = 244 + _encoding = "latin-1" + + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + ): + super().__init__() + self._convert_dates = {} if convert_dates is None else convert_dates + self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label + self._variable_labels = variable_labels + self._own_file = True + # attach nobs, nvars, data, varlist, typlist + self._prepare_pandas(data) + + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + self._fname = stringify_path(fname) + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} + self._converted_names = {} + + def _write(self, to_write): + """ + Helper to call encode before writing to file for Python 3 compat. + """ + self._file.write(to_write.encode(self._encoding or self._default_encoding)) + + def _prepare_categoricals(self, data): + """Check for categorical columns, retain categorical information for + Stata file and convert categorical data to int""" + + is_cat = [is_categorical_dtype(data[col]) for col in data] + self._is_col_cat = is_cat + self._value_labels = [] + if not any(is_cat): + return data + + get_base_missing_value = StataMissingValue.get_base_missing_value + data_formatted = [] + for col, col_is_cat in zip(data, is_cat): + if col_is_cat: + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) + dtype = data[col].cat.codes.dtype + if dtype == np.int64: + raise ValueError( + "It is not possible to export " + "int64-based categorical data to Stata." 
+ ) + values = data[col].cat.codes.values.copy() + + # Upcast if needed so that correct missing values can be set + if values.max() >= get_base_missing_value(dtype): + if dtype == np.int8: + dtype = np.int16 + elif dtype == np.int16: + dtype = np.int32 + else: + dtype = np.float64 + values = np.array(values, dtype=dtype) + + # Replace missing values with Stata missing value for type + values[values == -1] = get_base_missing_value(dtype) + data_formatted.append((col, values)) + else: + data_formatted.append((col, data[col])) + return DataFrame.from_dict(dict(data_formatted)) + + def _replace_nans(self, data): + # return data + """Checks floating point data columns for nans, and replaces these with + the generic Stata for missing value (.)""" + for c in data: + dtype = data[c].dtype + if dtype in (np.float32, np.float64): + if dtype == np.float32: + replacement = self.MISSING_VALUES["f"] + else: + replacement = self.MISSING_VALUES["d"] + data[c] = data[c].fillna(replacement) + + return data + + def _update_strl_names(self): + """No-op, forward compatibility""" + pass + + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + + def _check_column_names(self, data): + """ + Checks column names to ensure that they are valid Stata column names. + This includes checks for: + * Non-string names + * Stata keywords + * Variables that start with numbers + * Variables with names that are too long + + When an illegal variable name is detected, it is converted, and if + dates are exported, the variable name is propagated to the date + conversion dictionary + """ + converted_names = {} + columns = list(data.columns) + original_columns = columns[:] + + duplicate_var_id = 0 + for j, name in enumerate(columns): + orig_name = name + if not isinstance(name, str): + name = str(name) + + name = self._validate_variable_name(name) + + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = "_" + name + + # Variable name may not start with a number + if name[0] >= "0" and name[0] <= "9": + name = "_" + name + + name = name[: min(len(name), 32)] + + if not name == orig_name: + # check for duplicates + while columns.count(name) > 0: + # prepend ascending number to avoid duplicates + name = "_" + str(duplicate_var_id) + name + name = name[: min(len(name), 32)] + duplicate_var_id += 1 + converted_names[orig_name] = name + + columns[j] = name + + data.columns = columns + + # Check date conversion, and fix key if needed + if self._convert_dates: + for c, o in zip(columns, original_columns): + if c != o: + self._convert_dates[c] = self._convert_dates[o] + del self._convert_dates[o] + + if converted_names: + conversion_warning = [] + for orig_name, name in converted_names.items(): + # need to possibly encode the orig name if its unicode + try: + orig_name = orig_name.encode("utf-8") + except (UnicodeDecodeError, AttributeError): + pass + msg = f"{orig_name} -> {name}" + conversion_warning.append(msg) + + ws = invalid_name_doc.format("\n ".join(conversion_warning)) + warnings.warn(ws, InvalidColumnName) + + self._converted_names = 
converted_names + self._update_strl_names() + + return data + + def _set_formats_and_types(self, dtypes): + self.typlist = [] + self.fmtlist = [] + for col, dtype in dtypes.items(): + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) + + def _prepare_pandas(self, data): + # NOTE: we might need a different API / class for pandas objects so + # we can set different semantics - handle this with a PR to pandas.io + + data = data.copy() + + if self._write_index: + data = data.reset_index() + + # Ensure column names are strings + data = self._check_column_names(data) + + # Check columns for compatibility with stata, upcast if necessary + # Raise if outside the supported range + data = _cast_to_stata_types(data) + + # Replace NaNs with Stata missing values + data = self._replace_nans(data) + + # Convert categoricals to int data, and strip labels + data = self._prepare_categoricals(data) + + self.nobs, self.nvar = data.shape + self.data = data + self.varlist = data.columns.tolist() + + dtypes = data.dtypes + + # Ensure all date columns are converted + for col in data: + if col in self._convert_dates: + continue + if is_datetime64_dtype(data[col]): + self._convert_dates[col] = "tc" + + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) + dtypes[key] = np.dtype(new_type) + + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) + + # set the given format for the datetime cols + if self._convert_dates is not None: + for key in self._convert_dates: + self.fmtlist[key] = self._convert_dates[key] + + def _encode_strings(self): + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type == np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. 
Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded.values)) + <= self._max_string_length + ): + self.data[col] = encoded + + def write_file(self): + self._file, self._own_file = _open_file_binary_write(self._fname) + try: + self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) + self._write_map() + self._write_variable_types() + self._write_varnames() + self._write_sortlist() + self._write_formats() + self._write_value_label_names() + self._write_variable_labels() + self._write_expansion_fields() + self._write_characteristics() + self._prepare_data() + self._write_data() + self._write_strls() + self._write_value_labels() + self._write_file_close_tag() + self._write_map() + except Exception as exc: + self._close() + if self._own_file: + try: + os.unlink(self._fname) + except OSError: + warnings.warn( + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", + ResourceWarning, + ) + raise exc + else: + self._close() + + def _close(self): + """ + Close the file if it was created by the writer. + + If a buffer or file-like object was passed in, for example a GzipFile, + then leave this file open for the caller to close. In either case, + attempt to flush the file contents to ensure they are written to disk + (if supported) + """ + # Some file-like objects might not support flush + try: + self._file.flush() + except AttributeError: + pass + if self._own_file: + self._file.close() + + def _write_map(self): + """No-op, future compatibility""" + pass + + def _write_file_close_tag(self): + """No-op, future compatibility""" + pass + + def _write_characteristics(self): + """No-op, future compatibility""" + pass + + def _write_strls(self): + """No-op, future compatibility""" + pass + + def _write_expansion_fields(self): + """Write 5 zeros for expansion fields""" + self._write(_pad_bytes("", 5)) + + def _write_value_labels(self): + for vl in self._value_labels: + self._file.write(vl.generate_value_label(self._byteorder)) + + def _write_header(self, data_label=None, time_stamp=None): + byteorder = self._byteorder + # ds_format - just use 114 + self._file.write(struct.pack("b", 114)) + # byteorder + self._write(byteorder == ">" and "\x01" or "\x02") + # filetype + self._write("\x01") + # unused + self._write("\x00") + # number of vars, 2 bytes + self._file.write(struct.pack(byteorder + "h", self.nvar)[:2]) + # number of obs, 4 bytes + self._file.write(struct.pack(byteorder + "i", self.nobs)[:4]) + # data label 81 bytes, char, null terminated + if data_label is None: + self._file.write(self._null_terminate(_pad_bytes("", 80))) + else: + self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.datetime.now() + elif not isinstance(time_stamp, datetime.datetime): + raise ValueError("time_stamp should be datetime type") + # GH #13856 + # Avoid locale-specific month conversion + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] + month_lookup = {i + 1: month for i, month in enumerate(months)} + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) + 
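+        # ts is now e.g. "31 Jan 2020 17:18": Stata's fixed "dd Mon yyyy hh:mm"
+        # header layout, with an English month name regardless of locale.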
self._file.write(self._null_terminate(ts)) + + def _write_variable_types(self): + for typ in self.typlist: + self._file.write(struct.pack("B", typ)) + + def _write_varnames(self): + # varlist names are checked by _check_column_names + # varlist, requires null terminated + for name in self.varlist: + name = self._null_terminate(name, True) + name = _pad_bytes(name[:32], 33) + self._write(name) + + def _write_sortlist(self): + # srtlist, 2*(nvar+1), int array, encoded by byteorder + srtlist = _pad_bytes("", 2 * (self.nvar + 1)) + self._write(srtlist) + + def _write_formats(self): + # fmtlist, 49*nvar, char array + for fmt in self.fmtlist: + self._write(_pad_bytes(fmt, 49)) + + def _write_value_label_names(self): + # lbllist, 33*nvar, char array + for i in range(self.nvar): + # Use variable name when categorical + if self._is_col_cat[i]: + name = self.varlist[i] + name = self._null_terminate(name, True) + name = _pad_bytes(name[:32], 33) + self._write(name) + else: # Default is empty label + self._write(_pad_bytes("", 33)) + + def _write_variable_labels(self): + # Missing labels are 80 blank characters plus null termination + blank = _pad_bytes("", 81) + + if self._variable_labels is None: + for i in range(self.nvar): + self._write(blank) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError("Variable labels must be 80 characters or fewer") + is_latin1 = all(ord(c) < 256 for c in label) + if not is_latin1: + raise ValueError( + "Variable labels must contain only characters that " + "can be encoded in Latin-1" + ) + self._write(_pad_bytes(label, 81)) + else: + self._write(blank) + + def _convert_strls(self, data): + """No-op, future compatibility""" + return data + + def _prepare_data(self): + data = self.data + typlist = self.typlist + convert_dates = self._convert_dates + # 1. Convert dates + if self._convert_dates is not None: + for i, col in enumerate(data): + if i in convert_dates: + data[col] = _datetime_to_stata_elapsed_vec( + data[col], self.fmtlist[i] + ) + # 2. Convert strls + data = self._convert_strls(data) + + # 3. Convert bad string data to '' and pad to correct length + dtypes = {} + native_byteorder = self._byteorder == _set_endianness(sys.byteorder) + for i, col in enumerate(data): + typ = typlist[i] + if typ <= self._max_string_length: + data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + stype = f"S{typ}" + dtypes[col] = stype + data[col] = data[col].astype(stype) + else: + dtype = data[col].dtype + if not native_byteorder: + dtype = dtype.newbyteorder(self._byteorder) + dtypes[col] = dtype + + self.data = data.to_records(index=False, column_dtypes=dtypes) + + def _write_data(self): + data = self.data + self._file.write(data.tobytes()) + + def _null_terminate(self, s, as_string=False): + null_byte = "\x00" + s += null_byte + + if not as_string: + s = s.encode(self._encoding) + + return s + + +def _dtype_to_stata_type_117(dtype, column, force_strl): + """ + Converts dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. + 1 - 2045 are strings of this length + Pandas Stata + 32768 - for object strL + 65526 - for int8 byte + 65527 - for int16 int + 65528 - for int32 long + 65529 - for float32 float + 65530 - for double double + + If there are dates to convert, then dtype will already have the correct + type inserted. 
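+
+    For example, a string column whose longest value is 10 characters is
+    stored as type 10, while text longer than 2045 characters (or any column
+    forced to strL) is stored as 32768.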
+ """ + # TODO: expand to handle datetime to integer conversion + if force_strl: + return 32768 + if dtype.type == np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we + # do? + itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max(itemsize, 1) + if itemsize <= 2045: + return itemsize + return 32768 + elif dtype == np.float64: + return 65526 + elif dtype == np.float32: + return 65527 + elif dtype == np.int32: + return 65528 + elif dtype == np.int16: + return 65529 + elif dtype == np.int8: + return 65530 + else: # pragma : no cover + raise NotImplementedError(f"Data type {dtype} not supported.") + + +def _pad_bytes_new(name, length): + """ + Takes a bytes instance and pads it with null bytes until it's length chars. + """ + if isinstance(name, str): + name = bytes(name, "utf-8") + return name + b"\x00" * (length - len(name)) + + +class StataStrLWriter: + """ + Converter for Stata StrLs + + Stata StrLs map 8 byte values to strings which are stored using a + dictionary-like format where strings are keyed to two values. + + Parameters + ---------- + df : DataFrame + DataFrame to convert + columns : list + List of columns names to convert to StrL + version : int, optional + dta version. Currently supports 117, 118 and 119 + byteorder : str, optional + Can be ">", "<", "little", or "big". default is `sys.byteorder` + + Notes + ----- + Supports creation of the StrL block of a dta file for dta versions + 117, 118 and 119. These differ in how the GSO is stored. 118 and + 119 store the GSO lookup value as a uint32 and a uint64, while 117 + uses two uint32s. 118 and 119 also encode all strings as unicode + which is required by the format. 117 uses 'latin-1' a fixed width + encoding that extends the 7-bit ascii table with an additional 128 + characters. + """ + + def __init__(self, df, columns, version=117, byteorder=None): + if version not in (117, 118, 119): + raise ValueError("Only dta versions 117, 118 and 119 supported") + self._dta_ver = version + + self.df = df + self.columns = columns + self._gso_table = {"": (0, 0)} + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + + gso_v_type = "I" # uint32 + gso_o_type = "Q" # uint64 + self._encoding = "utf-8" + if version == 117: + o_size = 4 + gso_o_type = "I" # 117 used uint32 + self._encoding = "latin-1" + elif version == 118: + o_size = 6 + else: # version == 119 + o_size = 5 + self._o_offet = 2 ** (8 * (8 - o_size)) + self._gso_o_type = gso_o_type + self._gso_v_type = gso_v_type + + def _convert_key(self, key): + v, o = key + return v + self._o_offet * o + + def generate_table(self): + """ + Generates the GSO lookup table for the DataFrame + + Returns + ------- + gso_table : dict + Ordered dictionary using the string found as keys + and their lookup position (v,o) as values + gso_df : DataFrame + DataFrame where strl columns have been converted to + (v,o) values + + Notes + ----- + Modifies the DataFrame in-place. + + The DataFrame returned encodes the (v,o) values as uint64s. The + encoding depends on the dta version, and can be expressed as + + enc = v + o * 2 ** (o_size * 8) + + so that v is stored in the lower bits and o is in the upper + bits. 
o_size is + + * 117: 4 + * 118: 6 + * 119: 5 + """ + + gso_table = self._gso_table + gso_df = self.df + columns = list(gso_df.columns) + selected = gso_df[self.columns] + col_index = [(col, columns.index(col)) for col in self.columns] + keys = np.empty(selected.shape, dtype=np.uint64) + for o, (idx, row) in enumerate(selected.iterrows()): + for j, (col, v) in enumerate(col_index): + val = row[col] + # Allow columns with mixed str and None (GH 23633) + val = "" if val is None else val + key = gso_table.get(val, None) + if key is None: + # Stata prefers human numbers + key = (v + 1, o + 1) + gso_table[val] = key + keys[o, j] = self._convert_key(key) + for i, col in enumerate(self.columns): + gso_df[col] = keys[:, i] + + return gso_table, gso_df + + def generate_blob(self, gso_table): + """ + Generates the binary blob of GSOs that is written to the dta file. + + Parameters + ---------- + gso_table : dict + Ordered dictionary (str, vo) + + Returns + ------- + gso : bytes + Binary content of dta file to be placed between strl tags + + Notes + ----- + Output format depends on dta version. 117 uses two uint32s to + express v and o while 118+ uses a uint32 for v and a uint64 for o. + """ + # Format information + # Length includes null term + # 117 + # GSOvvvvooootllllxxxxxxxxxxxxxxx...x + # 3 u4 u4 u1 u4 string + null term + # + # 118, 119 + # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x + # 3 u4 u8 u1 u4 string + null term + + bio = BytesIO() + gso = bytes("GSO", "ascii") + gso_type = struct.pack(self._byteorder + "B", 130) + null = struct.pack(self._byteorder + "B", 0) + v_type = self._byteorder + self._gso_v_type + o_type = self._byteorder + self._gso_o_type + len_type = self._byteorder + "I" + for strl, vo in gso_table.items(): + if vo == (0, 0): + continue + v, o = vo + + # GSO + bio.write(gso) + + # vvvv + bio.write(struct.pack(v_type, v)) + + # oooo / oooooooo + bio.write(struct.pack(o_type, o)) + + # t + bio.write(gso_type) + + # llll + utf8_string = bytes(strl, "utf-8") + bio.write(struct.pack(len_type, len(utf8_string) + 1)) + + # xxx...xxx + bio.write(utf8_string) + bio.write(null) + + bio.seek(0) + return bio.read() + + +class StataWriter117(StataWriter): + """ + A class for writing Stata binary dta files in Stata 13 format (117) + + .. versionadded:: 0.23.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. 
+ convert_strl : list + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + + Returns + ------- + writer : StataWriter117 instance + The StataWriter117 instance has a write_file method, which will + write the file to the given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + >>> from pandas.io.stata import StataWriter117 + >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c']) + >>> writer = StataWriter117('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['A relatively long string'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriter117('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _max_string_length = 2045 + _dta_version = 117 + + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + convert_strl=None, + ): + # Shallow copy since convert_strl might be modified later + self._convert_strl = [] if convert_strl is None else convert_strl[:] + + super().__init__( + fname, + data, + convert_dates, + write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + ) + self._map = None + self._strl_blob = None + + @staticmethod + def _tag(val, tag): + """Surround val with """ + if isinstance(val, str): + val = bytes(val, "utf-8") + return bytes("<" + tag + ">", "utf-8") + val + bytes("", "utf-8") + + def _update_map(self, tag): + """Update map location for tag with file position""" + self._map[tag] = self._file.tell() + + def _write_header(self, data_label=None, time_stamp=None): + """Write the file header""" + byteorder = self._byteorder + self._file.write(bytes("", "utf-8")) + bio = BytesIO() + # ds_format - 117 + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) + # byteorder + bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + # number of vars, 2 bytes in 117 and 118, 4 byte in 119 + nvar_type = "H" if self._dta_version <= 118 else "I" + bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) + # data label 81 bytes, char, null terminated + label = data_label[:80] if data_label is not None else "" + label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(label)) + label = label_len + label + bio.write(self._tag(label, "label")) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.datetime.now() + elif not isinstance(time_stamp, datetime.datetime): + raise ValueError("time_stamp should be 
datetime type") + # Avoid locale-specific month conversion + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] + month_lookup = {i + 1: month for i, month in enumerate(months)} + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) + # '\x11' added due to inspection of Stata file + ts = b"\x11" + bytes(ts, "utf-8") + bio.write(self._tag(ts, "timestamp")) + bio.seek(0) + self._file.write(self._tag(bio.read(), "header")) + + def _write_map(self): + """Called twice during file write. The first populates the values in + the map with 0s. The second call writes the final map locations when + all blocks have been written.""" + if self._map is None: + self._map = dict( + ( + ("stata_data", 0), + ("map", self._file.tell()), + ("variable_types", 0), + ("varnames", 0), + ("sortlist", 0), + ("formats", 0), + ("value_label_names", 0), + ("variable_labels", 0), + ("characteristics", 0), + ("data", 0), + ("strls", 0), + ("value_labels", 0), + ("stata_data_close", 0), + ("end-of-file", 0), + ) + ) + # Move to start of map + self._file.seek(self._map["map"]) + bio = BytesIO() + for val in self._map.values(): + bio.write(struct.pack(self._byteorder + "Q", val)) + bio.seek(0) + self._file.write(self._tag(bio.read(), "map")) + + def _write_variable_types(self): + self._update_map("variable_types") + bio = BytesIO() + for typ in self.typlist: + bio.write(struct.pack(self._byteorder + "H", typ)) + bio.seek(0) + self._file.write(self._tag(bio.read(), "variable_types")) + + def _write_varnames(self): + self._update_map("varnames") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 + for name in self.varlist: + name = self._null_terminate(name, True) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) + bio.write(name) + bio.seek(0) + self._file.write(self._tag(bio.read(), "varnames")) + + def _write_sortlist(self): + self._update_map("sortlist") + sort_size = 2 if self._dta_version < 119 else 4 + self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) + + def _write_formats(self): + self._update_map("formats") + bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 + for fmt in self.fmtlist: + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) + bio.seek(0) + self._file.write(self._tag(bio.read(), "formats")) + + def _write_value_label_names(self): + self._update_map("value_label_names") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 + for i in range(self.nvar): + # Use variable name when categorical + name = "" # default name + if self._is_col_cat[i]: + name = self.varlist[i] + name = self._null_terminate(name, True) + name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) + bio.write(name) + bio.seek(0) + self._file.write(self._tag(bio.read(), "value_label_names")) + + def _write_variable_labels(self): + # Missing labels are 80 blank characters plus null termination + self._update_map("variable_labels") + bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) + + if self._variable_labels is None: + for _ in range(self.nvar): + bio.write(blank) + bio.seek(0) + self._file.write(self._tag(bio.read(), "variable_labels")) + 
return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError("Variable labels must be 80 characters or fewer") + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError: + raise ValueError( + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" + ) + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) + else: + bio.write(blank) + bio.seek(0) + self._file.write(self._tag(bio.read(), "variable_labels")) + + def _write_characteristics(self): + self._update_map("characteristics") + self._file.write(self._tag(b"", "characteristics")) + + def _write_data(self): + self._update_map("data") + data = self.data + self._file.write(b"") + self._file.write(data.tobytes()) + self._file.write(b"") + + def _write_strls(self): + self._update_map("strls") + strls = b"" + if self._strl_blob is not None: + strls = self._strl_blob + self._file.write(self._tag(strls, "strls")) + + def _write_expansion_fields(self): + """No-op in dta 117+""" + pass + + def _write_value_labels(self): + self._update_map("value_labels") + bio = BytesIO() + for vl in self._value_labels: + lab = vl.generate_value_label(self._byteorder) + lab = self._tag(lab, "lbl") + bio.write(lab) + bio.seek(0) + self._file.write(self._tag(bio.read(), "value_labels")) + + def _write_file_close_tag(self): + self._update_map("stata_data_close") + self._file.write(bytes("", "utf-8")) + self._update_map("end-of-file") + + def _update_strl_names(self): + """Update column names for conversion to strl if they might have been + changed to comply with Stata naming rules""" + # Update convert_strl if names changed + for orig, new in self._converted_names.items(): + if orig in self._convert_strl: + idx = self._convert_strl.index(orig) + self._convert_strl[idx] = new + + def _convert_strls(self, data): + """Convert columns to StrLs if either very large or in the + convert_strl variable""" + convert_cols = [ + col + for i, col in enumerate(data) + if self.typlist[i] == 32768 or col in self._convert_strl + ] + + if convert_cols: + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) + tab, new_data = ssw.generate_table() + data = new_data + self._strl_blob = ssw.generate_blob(tab) + return data + + def _set_formats_and_types(self, dtypes): + self.typlist = [] + self.fmtlist = [] + for col, dtype in dtypes.items(): + force_strl = col in self._convert_strl + fmt = _dtype_to_default_stata_fmt( + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, + ) + self.fmtlist.append(fmt) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriterUTF8(StataWriter117): + """ + Stata binary dta file writing in Stata 15 (118) and 16 (119) formats + + DTA 118 and 119 format files support unicode string data (both fixed + and strL) format. Unicode is also supported in value labels, variable + labels and the dataset label. Format 119 is automatically used if the + file contains more than 32,767 variables. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. 
+ data : DataFrame + Input to save + convert_dates : dict, default None + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool, default True + Write the index to Stata dataset. + byteorder : str, default None + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime, default None + A datetime to use as file creation date. Default is the current time + data_label : str, default None + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict, default None + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list, default None + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + version : int, default None + The dta version to use. By default, uses the size of data to determine + the version. 118 is used if data.shape[1] <= 32767, and 119 is used + for storing larger DataFrames. + + Returns + ------- + StataWriterUTF8 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriterUTF8 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriterUTF8('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, + ... 
convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding = "utf-8" + + def __init__( + self, + fname: FilePathOrBuffer, + data: DataFrame, + convert_dates: Optional[Dict[Hashable, str]] = None, + write_index: bool = True, + byteorder: Optional[str] = None, + time_stamp: Optional[datetime.datetime] = None, + data_label: Optional[str] = None, + variable_labels: Optional[Dict[Hashable, str]] = None, + convert_strl: Optional[Sequence[Hashable]] = None, + version: Optional[int] = None, + ): + if version is None: + version = 118 if data.shape[1] <= 32767 else 119 + elif version not in (118, 119): + raise ValueError("version must be either 118 or 119.") + elif version == 118 and data.shape[1] > 32767: + raise ValueError( + "You must use version 119 for data sets containing more than" + "32,767 variables" + ) + + super().__init__( + fname, + data, + convert_dates=convert_dates, + write_index=write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + convert_strl=convert_strl, + ) + # Override version set in StataWriter117 init + self._dta_version = version + + def _validate_variable_name(self, name: str) -> str: + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 118+ support most unicode characters. The only limitation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. + """ + # High code points appear to be acceptable + for c in name: + if ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) or 128 <= ord(c) < 256: + name = name.replace(c, "_") + + return name diff --git a/venv/Lib/site-packages/pandas/plotting/__init__.py b/venv/Lib/site-packages/pandas/plotting/__init__.py new file mode 100644 index 0000000..55c861e --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/__init__.py @@ -0,0 +1,98 @@ +""" +Plotting public API. + +Authors of third-party plotting backends should implement a module with a +public ``plot(data, kind, **kwargs)``. The parameter `data` will contain +the data structure and can be a `Series` or a `DataFrame`. For example, +for ``df.plot()`` the parameter `data` will contain the DataFrame `df`. +In some cases, the data structure is transformed before being sent to +the backend (see PlotAccessor.__call__ in pandas/plotting/_core.py for +the exact transformations). + +The parameter `kind` will be one of: + +- line +- bar +- barh +- box +- hist +- kde +- area +- pie +- scatter +- hexbin + +See the pandas API reference for documentation on each kind of plot. + +Any other keyword argument is currently assumed to be backend specific, +but some parameters may be unified and added to the signature in the +future (e.g. `title` which should be useful for any backend). + +Currently, all the Matplotlib functions in pandas are accessed through +the selected backend. For example, `pandas.plotting.boxplot` (equivalent +to `DataFrame.boxplot`) is also accessed in the selected backend. This +is expected to change, and the exact API is under discussion. 
But with +the current version, backends are expected to implement the next functions: + +- plot (describe above, used for `Series.plot` and `DataFrame.plot`) +- hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) +- boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) +- boxplot_frame and boxplot_frame_groupby +- register and deregister (register converters for the tick formats) +- Plots not called as `Series` and `DataFrame` methods: + - table + - andrews_curves + - autocorrelation_plot + - bootstrap_plot + - lag_plot + - parallel_coordinates + - radviz + - scatter_matrix + +Use the code in pandas/plotting/_matplotib.py and +https://github.com/pyviz/hvplot as a reference on how to write a backend. + +For the discussion about the API see +https://github.com/pandas-dev/pandas/issues/26747. +""" +from pandas.plotting._core import ( + PlotAccessor, + boxplot, + boxplot_frame, + boxplot_frame_groupby, + hist_frame, + hist_series, +) +from pandas.plotting._misc import ( + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + deregister as deregister_matplotlib_converters, + lag_plot, + parallel_coordinates, + plot_params, + radviz, + register as register_matplotlib_converters, + scatter_matrix, + table, +) + +__all__ = [ + "PlotAccessor", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "hist_frame", + "hist_series", + "scatter_matrix", + "radviz", + "andrews_curves", + "bootstrap_plot", + "parallel_coordinates", + "lag_plot", + "autocorrelation_plot", + "table", + "plot_params", + "register_matplotlib_converters", + "deregister_matplotlib_converters", +] diff --git a/venv/Lib/site-packages/pandas/plotting/_core.py b/venv/Lib/site-packages/pandas/plotting/_core.py new file mode 100644 index 0000000..c239f11 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_core.py @@ -0,0 +1,1678 @@ +import importlib + +from pandas._config import get_option + +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.common import is_integer, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +from pandas.core.base import PandasObject + + +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + backend=None, + **kwargs, +): + """ + Draw histogram of the input series using matplotlib. + + Parameters + ---------- + by : object, optional + If passed, then used to form histograms for separate groups. + ax : matplotlib axis object + If not passed, uses gca(). + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. + figsize : tuple, default None + Figure size in inches by default. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. 
Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + + **kwargs + To be passed to the actual plotting function. + + Returns + ------- + matplotlib.AxesSubplot + A histogram plot. + + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.hist_series( + self, + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + **kwargs, + ) + + +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + backend=None, + **kwargs, +): + """ + Make a histogram of the DataFrame's. + + A `histogram`_ is a representation of the distribution of data. + This function calls :meth:`matplotlib.pyplot.hist`, on each series in + the DataFrame, resulting in one histogram per column. + + .. _histogram: https://en.wikipedia.org/wiki/Histogram + + Parameters + ---------- + data : DataFrame + The pandas object holding the data. + column : str or sequence + If passed, will be used to limit data to a subset of columns. + by : object, optional + If passed, then used to form histograms for separate groups. + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. For example, a value of 90 displays the + x labels rotated 90 degrees clockwise. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. For example, a value of 90 displays the + y labels rotated 90 degrees clockwise. + ax : Matplotlib axes object, default None + The axes to plot the histogram on. + sharex : bool, default True if ax is None else False + In case subplots=True, share x axis and set some x axis labels to + invisible; defaults to True if ax is None otherwise False if an ax + is passed in. + Note that passing in both an ax and sharex=True will alter all x axis + labels for all subplots in a figure. + sharey : bool, default False + In case subplots=True, share y axis and set some y axis labels to + invisible. + figsize : tuple + The size in inches of the figure to create. Uses the value in + `matplotlib.rcParams` by default. + layout : tuple, optional + Tuple of (rows, columns) for the layout of the histograms. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + + **kwargs + All other plotting keyword arguments to be passed to + :meth:`matplotlib.pyplot.hist`. + + Returns + ------- + matplotlib.AxesSubplot or numpy.ndarray of them + + See Also + -------- + matplotlib.pyplot.hist : Plot a histogram using matplotlib. + + Examples + -------- + + .. 
plot:: + :context: close-figs + + This example draws a histogram based on the length and width of + some animals, displayed in three bins + + >>> df = pd.DataFrame({ + ... 'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] + ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> hist = df.hist(bins=3) + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.hist_frame( + data, + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + bins=bins, + **kwargs, + ) + + +_boxplot_doc = """ +Make a box plot from DataFrame columns. + +Make a box-and-whisker plot from DataFrame columns, optionally grouped +by some other columns. A box plot is a method for graphically depicting +groups of numerical data through their quartiles. +The box extends from the Q1 to Q3 quartile values of the data, +with a line at the median (Q2). The whiskers extend from the edges +of box to show the range of the data. The position of the whiskers +is set by default to `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box. +Outlier points are those past the end of the whiskers. + +For further details see +Wikipedia's entry for `boxplot `_. + +Parameters +---------- +column : str or list of str, optional + Column name or list of names, or vector. + Can be any valid input to :meth:`pandas.DataFrame.groupby`. +by : str or array-like, optional + Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. + One box-plot will be done per value of columns in `by`. +ax : object of class matplotlib.axes.Axes, optional + The matplotlib axes to be used by boxplot. +fontsize : float or str + Tick label font size in points or as a string (e.g., `large`). +rot : int or float, default 0 + The rotation angle of labels (in degrees) + with respect to the screen coordinate system. +grid : bool, default True + Setting this to True will show the grid. +figsize : A tuple (width, height) in inches + The size of the figure to create in matplotlib. +layout : tuple (rows, columns), optional + For example, (3, 5) will display the subplots + using 3 columns and 5 rows, starting from the top-left. +return_type : {'axes', 'dict', 'both'} or None, default 'axes' + The kind of object to return. The default is ``axes``. + + * 'axes' returns the matplotlib axes the boxplot is drawn on. + * 'dict' returns a dictionary whose values are the matplotlib + Lines of the boxplot. + * 'both' returns a namedtuple with the axes and dict. + * when grouping with ``by``, a Series mapping columns to + ``return_type`` is returned. + + If ``return_type`` is `None`, a NumPy array + of axes with the same shape as ``layout`` is returned. +%(backend)s\ + +**kwargs + All other plotting keyword arguments to be passed to + :func:`matplotlib.pyplot.boxplot`. + +Returns +------- +result + See Notes. + +See Also +-------- +Series.plot.hist: Make a histogram. +matplotlib.pyplot.boxplot : Matplotlib equivalent plot. + +Notes +----- +The return type depends on the `return_type` parameter: + +* 'axes' : object of class matplotlib.axes.Axes +* 'dict' : dict of matplotlib.lines.Line2D objects +* 'both' : a namedtuple with structure (ax, lines) + +For data grouped with ``by``, return a Series of the above or a numpy +array: + +* :class:`~pandas.Series` +* :class:`~numpy.array` (for ``return_type = None``) + +Use ``return_type='dict'`` when you want to tweak the appearance +of the lines after plotting. 
In this case a dict containing the Lines +making up the boxes, caps, fliers, medians, and whiskers is returned. + +Examples +-------- + +Boxplots can be created for every column in the dataframe +by ``df.boxplot()`` or indicating the columns to be used: + +.. plot:: + :context: close-figs + + >>> np.random.seed(1234) + >>> df = pd.DataFrame(np.random.randn(10, 4), + ... columns=['Col1', 'Col2', 'Col3', 'Col4']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) + +Boxplots of variables distributions grouped by the values of a third +variable can be created using the option ``by``. For instance: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 2), + ... columns=['Col1', 'Col2']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> boxplot = df.boxplot(by='X') + +A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot +in order to group the data by combination of the variables in the x-axis: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 3), + ... columns=['Col1', 'Col2', 'Col3']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', + ... 'B', 'A', 'B', 'A', 'B']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + +The layout of boxplot can be adjusted giving a tuple to ``layout``: + +.. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... layout=(2, 1)) + +Additional formatting can be done to the boxplot, like suppressing the grid +(``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) +or changing the fontsize (i.e. ``fontsize=15``): + +.. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) + +The parameter ``return_type`` can be used to select the type of element +returned by `boxplot`. When ``return_type='axes'`` is selected, +the matplotlib axes on which the boxplot is drawn are returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes') + >>> type(boxplot) + + +When grouping with ``by``, a Series mapping columns to ``return_type`` +is returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type='axes') + >>> type(boxplot) + + +If ``return_type`` is `None`, a NumPy array of axes with the same shape +as ``layout`` is returned: + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type=None) + >>> type(boxplot) + +""" + +_backend_doc = """\ +backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. 
versionadded:: 1.0.0 +""" + + +@Substitution(backend="") +@Appender(_boxplot_doc) +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwargs, +): + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.boxplot( + data, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwargs, + ) + + +@Substitution(backend=_backend_doc) +@Appender(_boxplot_doc) +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + backend=None, + **kwargs, +): + plot_backend = _get_plot_backend(backend) + return plot_backend.boxplot_frame( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwargs, + ) + + +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + backend=None, + **kwargs, +): + """ + Make box plots from DataFrameGroupBy data. + + Parameters + ---------- + grouped : Grouped DataFrame + subplots : bool + * ``False`` - no subplots will be used + * ``True`` - create a subplot for each group. + + column : column name or list of names, or vector + Can be any valid input to groupby. + fontsize : int or str + rot : label rotation angle + grid : Setting this to True will show the grid + ax : Matplotlib axis object, default None + figsize : A tuple (width, height) in inches + layout : tuple (optional) + The layout of the plot: (rows, columns). + sharex : bool, default False + Whether x-axes will be shared among subplots. + + .. versionadded:: 0.23.1 + sharey : bool, default True + Whether y-axes will be shared among subplots. + + .. versionadded:: 0.23.1 + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + + **kwargs + All other plotting keyword arguments to be passed to + matplotlib's boxplot function. + + Returns + ------- + dict of key/value = group key/DataFrame.boxplot return value + or DataFrame.boxplot return value in case subplots=figures=False + + Examples + -------- + >>> import itertools + >>> tuples = [t for t in itertools.product(range(1000), range(4))] + >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> data = np.random.randn(len(index),4) + >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) + >>> + >>> grouped = df.groupby(level='lvl1') + >>> boxplot_frame_groupby(grouped) + >>> + >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) + >>> boxplot_frame_groupby(grouped, subplots=False) + """ + plot_backend = _get_plot_backend(backend) + return plot_backend.boxplot_frame_groupby( + grouped, + subplots=subplots, + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + sharex=sharex, + sharey=sharey, + **kwargs, + ) + + +class PlotAccessor(PandasObject): + """ + Make plots of Series or DataFrame. + + Uses the backend specified by the + option ``plotting.backend``. By default, matplotlib is used. 
+ + Parameters + ---------- + data : Series or DataFrame + The object for which the method is called. + x : label or position, default None + Only used if data is a DataFrame. + y : label, position or list of label, positions, default None + Allows plotting of one column versus another. Only used if data is a + DataFrame. + kind : str + The kind of plot to produce: + + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + - 'scatter' : scatter plot + - 'hexbin' : hexbin plot. + + figsize : a tuple (width, height) in inches + use_index : bool, default True + Use index as ticks for x axis. + title : str or list + Title to use for the plot. If a string is passed, print the string + at the top of the figure. If a list is passed and `subplots` is + True, print each item in the list above the corresponding subplot. + grid : bool, default None (matlab style default) + Axis grid lines. + legend : bool or {'reverse'} + Place legend on axis subplots. + style : list or dict + The matplotlib line style per column. + logx : bool or 'sym', default False + Use log scaling or symlog scaling on x axis. + .. versionchanged:: 0.25.0 + + logy : bool or 'sym' default False + Use log scaling or symlog scaling on y axis. + .. versionchanged:: 0.25.0 + + loglog : bool or 'sym', default False + Use log scaling or symlog scaling on both x and y axes. + .. versionchanged:: 0.25.0 + + xticks : sequence + Values to use for the xticks. + yticks : sequence + Values to use for the yticks. + xlim : 2-tuple/list + ylim : 2-tuple/list + rot : int, default None + Rotation for ticks (xticks for vertical, yticks for horizontal + plots). + fontsize : int, default None + Font size for xticks and yticks. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + colorbar : bool, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' + plots). + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center). + table : bool, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data + will be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a + table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars ` for + detail. + xerr : DataFrame, Series, array-like, dict and str + Equivalent to yerr. + mark_right : bool, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend. + include_bool : bool, default is False + If True, boolean values can be plotted. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + If the backend is not the default matplotlib one, the return value + will be the object returned by the backend. 
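A minimal usage sketch (the frame and the column names below are illustrative,
not taken from this module): calling the accessor directly is equivalent to
passing ``kind`` to the corresponding named method.

>>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
>>> ax = df.plot(kind='bar', x='lab', y='val')
>>> ax = df.plot.bar(x='lab', y='val')  # same figure via the named method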
+ + Notes + ----- + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center) + """ + + _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") + _series_kinds = ("pie",) + _dataframe_kinds = ("scatter", "hexbin") + _kind_aliases = {"density": "kde"} + _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds + + def __init__(self, data): + self._parent = data + + @staticmethod + def _get_call_args(backend_name, data, args, kwargs): + """ + This function makes calls to this accessor `__call__` method compatible + with the previous `SeriesPlotMethods.__call__` and + `DataFramePlotMethods.__call__`. Those had slightly different + signatures, since `DataFramePlotMethods` accepted `x` and `y` + parameters. + """ + if isinstance(data, ABCSeries): + arg_def = [ + ("kind", "line"), + ("ax", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", False), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("label", None), + ("secondary_y", False), + ] + elif isinstance(data, ABCDataFrame): + arg_def = [ + ("x", None), + ("y", None), + ("kind", "line"), + ("ax", None), + ("subplots", False), + ("sharex", None), + ("sharey", False), + ("layout", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", True), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("secondary_y", False), + ("sort_columns", False), + ] + else: + raise TypeError( + f"Called plot accessor for type {type(data).__name__}, " + "expected Series or DataFrame" + ) + + if args and isinstance(data, ABCSeries): + positional_args = str(args)[1:-1] + keyword_args = ", ".join( + f"{name}={repr(value)}" for (name, default), value in zip(arg_def, args) + ) + msg = ( + "`Series.plot()` should not be called with positional " + "arguments, only keyword arguments. The order of " + "positional arguments will change in the future. " + f"Use `Series.plot({keyword_args})` instead of " + f"`Series.plot({positional_args})`." 
+ ) + raise TypeError(msg) + + pos_args = {name: value for value, (name, _) in zip(args, arg_def)} + if backend_name == "pandas.plotting._matplotlib": + kwargs = dict(arg_def, **pos_args, **kwargs) + else: + kwargs = dict(pos_args, **kwargs) + + x = kwargs.pop("x", None) + y = kwargs.pop("y", None) + kind = kwargs.pop("kind", "line") + return x, y, kind, kwargs + + def __call__(self, *args, **kwargs): + plot_backend = _get_plot_backend(kwargs.pop("backend", None)) + + x, y, kind, kwargs = self._get_call_args( + plot_backend.__name__, self._parent, args, kwargs + ) + + kind = self._kind_aliases.get(kind, kind) + + # when using another backend, get out of the way + if plot_backend.__name__ != "pandas.plotting._matplotlib": + return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) + + if kind not in self._all_kinds: + raise ValueError(f"{kind} is not a valid plot kind") + + # The original data structured can be transformed before passed to the + # backend. For example, for DataFrame is common to set the index as the + # `x` parameter, and return a Series with the parameter `y` as values. + data = self._parent.copy() + + if isinstance(data, ABCSeries): + kwargs["reuse_plot"] = True + + if kind in self._dataframe_kinds: + if isinstance(data, ABCDataFrame): + return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) + else: + raise ValueError(f"plot kind {kind} can only be used for data frames") + elif kind in self._series_kinds: + if isinstance(data, ABCDataFrame): + if y is None and kwargs.get("subplots") is False: + raise ValueError( + f"{kind} requires either y column or 'subplots=True'" + ) + elif y is not None: + if is_integer(y) and not data.columns.holds_integer(): + y = data.columns[y] + # converted to series actually. copy to not modify + data = data[y].copy() + data.index.name = y + elif isinstance(data, ABCDataFrame): + data_cols = data.columns + if x is not None: + if is_integer(x) and not data.columns.holds_integer(): + x = data_cols[x] + elif not isinstance(data[x], ABCSeries): + raise ValueError("x must be a label or position") + data = data.set_index(x) + if y is not None: + # check if we have y as int or list of ints + int_ylist = is_list_like(y) and all(is_integer(c) for c in y) + int_y_arg = is_integer(y) or int_ylist + if int_y_arg and not data.columns.holds_integer(): + y = data_cols[y] + + label_kw = kwargs["label"] if "label" in kwargs else False + for kw in ["xerr", "yerr"]: + if kw in kwargs and ( + isinstance(kwargs[kw], str) or is_integer(kwargs[kw]) + ): + try: + kwargs[kw] = data[kwargs[kw]] + except (IndexError, KeyError, TypeError): + pass + + # don't overwrite + data = data[y].copy() + + if isinstance(data, ABCSeries): + label_name = label_kw or y + data.name = label_name + else: + match = is_list_like(label_kw) and len(label_kw) == len(y) + if label_kw and not match: + raise ValueError( + "label should be list-like and same length as y" + ) + label_name = label_kw or data.columns + data.columns = label_name + + return plot_backend.plot(data, kind=kind, **kwargs) + + __call__.__doc__ = __doc__ + + def line(self, x=None, y=None, **kwargs): + """ + Plot Series or DataFrame as lines. + + This function is useful to plot lines using DataFrame's values + as coordinates. + + Parameters + ---------- + x : int or str, optional + Columns to use for the horizontal axis. + Either the location or the label of the columns to be used. + By default, it will use the DataFrame indices. + y : int, str, or list of them, optional + The values to be plotted. 
+ Either the location or the label of the columns to be used. + By default, it will use the remaining DataFrame numeric columns. + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` or :class:`numpy.ndarray` + Return an ndarray when ``subplots=True``. + + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 3, 2]) + >>> s.plot.line() + + .. plot:: + :context: close-figs + + The following example shows the populations for some animals + over the years. + + >>> df = pd.DataFrame({ + ... 'pig': [20, 18, 489, 675, 1776], + ... 'horse': [4, 25, 281, 600, 1900] + ... }, index=[1990, 1997, 2003, 2009, 2014]) + >>> lines = df.plot.line() + + .. plot:: + :context: close-figs + + An example with subplots, so an array of axes is returned. + + >>> axes = df.plot.line(subplots=True) + >>> type(axes) + + + .. plot:: + :context: close-figs + + The following example shows the relationship between both + populations. + + >>> lines = df.plot.line(x='pig', y='horse') + """ + return self(kind="line", x=x, y=y, **kwargs) + + def bar(self, x=None, y=None, **kwargs): + """ + Vertical bar plot. + + A bar plot is a plot that presents categorical data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. + + Parameters + ---------- + x : label or position, optional + Allows plotting of one column versus another. If not specified, + the index of the DataFrame is used. + y : label or position, optional + Allows plotting of one column versus another. If not specified, + all numerical columns are used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + An ndarray is returned with one :class:`matplotlib.axes.Axes` + per column when ``subplots=True``. + + See Also + -------- + DataFrame.plot.barh : Horizontal bar plot. + DataFrame.plot : Make plots of a DataFrame. + matplotlib.pyplot.bar : Make a bar plot with matplotlib. + + Examples + -------- + Basic plot. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> ax = df.plot.bar(x='lab', y='val', rot=0) + + Plot a whole dataframe to a bar plot. Each column is assigned a + distinct color, and each row is nested in a group along the + horizontal axis. + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.bar(rot=0) + + Instead of nesting, the figure can be split by column with + ``subplots=True``. In this case, a :class:`numpy.ndarray` of + :class:`matplotlib.axes.Axes` are returned. + + .. plot:: + :context: close-figs + + >>> axes = df.plot.bar(rot=0, subplots=True) + >>> axes[1].legend(loc=2) # doctest: +SKIP + + Plot a single column. + + .. plot:: + :context: close-figs + + >>> ax = df.plot.bar(y='speed', rot=0) + + Plot only selected categories for the DataFrame. + + .. 
plot:: + :context: close-figs + + >>> ax = df.plot.bar(x='lifespan', rot=0) + """ + return self(kind="bar", x=x, y=y, **kwargs) + + def barh(self, x=None, y=None, **kwargs): + """ + Make a horizontal bar plot. + + A horizontal bar plot is a plot that presents quantitative data with + rectangular bars with lengths proportional to the values that they + represent. A bar plot shows comparisons among discrete categories. One + axis of the plot shows the specific categories being compared, and the + other axis represents a measured value. + + Parameters + ---------- + x : label or position, default DataFrame.index + Column to be used for categories. + y : label or position, default All numeric columns in dataframe + Columns to be plotted from the DataFrame. + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + See Also + -------- + DataFrame.plot.bar: Vertical bar plot. + DataFrame.plot : Make plots of DataFrame using matplotlib. + matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib. + + Examples + -------- + Basic example + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) + >>> ax = df.plot.barh(x='lab', y='val') + + Plot a whole DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh() + + Plot a column of the DataFrame to a horizontal bar plot + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(y='speed') + + Plot DataFrame versus the desired column + + .. plot:: + :context: close-figs + + >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88] + >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28] + >>> index = ['snail', 'pig', 'elephant', + ... 'rabbit', 'giraffe', 'coyote', 'horse'] + >>> df = pd.DataFrame({'speed': speed, + ... 'lifespan': lifespan}, index=index) + >>> ax = df.plot.barh(x='lifespan') + """ + return self(kind="barh", x=x, y=y, **kwargs) + + def box(self, by=None, **kwargs): + r""" + Make a box plot of the DataFrame columns. + + A box plot is a method for graphically depicting groups of numerical + data through their quartiles. + The box extends from the Q1 to Q3 quartile values of the data, + with a line at the median (Q2). The whiskers extend from the edges + of box to show the range of the data. The position of the whiskers + is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the + box. Outlier points are those past the end of the whiskers. + + For further details see Wikipedia's + entry for `boxplot `__. + + A consideration when using this chart is that the box and the whiskers + can overlap, which is very common when plotting small sets of data. + + Parameters + ---------- + by : str or sequence + Column in the DataFrame to group by. + **kwargs + Additional keywords are documented in + :meth:`DataFrame.plot`. 
+ + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + See Also + -------- + DataFrame.boxplot: Another method to draw a box plot. + Series.plot.box: Draw a box plot from a Series object. + matplotlib.pyplot.boxplot: Draw a box plot in matplotlib. + + Examples + -------- + Draw a box plot from a DataFrame with four columns of randomly + generated data. + + .. plot:: + :context: close-figs + + >>> data = np.random.randn(25, 4) + >>> df = pd.DataFrame(data, columns=list('ABCD')) + >>> ax = df.plot.box() + """ + return self(kind="box", by=by, **kwargs) + + def hist(self, by=None, bins=10, **kwargs): + """ + Draw one histogram of the DataFrame's columns. + + A histogram is a representation of the distribution of data. + This function groups the values of all given Series in the DataFrame + into bins and draws all bins in one :class:`matplotlib.axes.Axes`. + This is useful when the DataFrame's Series are in a similar scale. + + Parameters + ---------- + by : str or sequence, optional + Column in the DataFrame to group by. + bins : int, default 10 + Number of histogram bins to be used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + class:`matplotlib.AxesSubplot` + Return a histogram plot. + + See Also + -------- + DataFrame.hist : Draw histograms per DataFrame's Series. + Series.hist : Draw a histogram with Series' data. + + Examples + -------- + When we draw a dice 6000 times, we expect to get each value around 1000 + times. But when we draw two dices and sum the result, the distribution + is going to be quite different. A histogram illustrates those + distributions. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame( + ... np.random.randint(1, 7, 6000), + ... columns = ['one']) + >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) + """ + return self(kind="hist", by=by, bins=bins, **kwargs) + + def kde(self, bw_method=None, ind=None, **kwargs): + """ + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwidth determination. + + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation + + Parameters + ---------- + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or int, optional + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + KDE is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwargs + Additional keyword arguments are documented in + :meth:`pandas.%(this-datatype)s.plot`. + + Returns + ------- + matplotlib.axes.Axes or numpy.ndarray of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. 
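A brief sketch of the string options for ``bw_method`` (the Series values are
illustrative): the named rules ``'scott'`` and ``'silverman'`` are forwarded
unchanged to :class:`scipy.stats.gaussian_kde`.

>>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5])
>>> ax = s.plot.kde(bw_method='silverman')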
+ + Examples + -------- + Given a Series of points randomly sampled from an unknown + distribution, estimate its PDF using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) + >>> ax = s.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to over-fitting, while using a large bandwidth value may result + in under-fitting: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) + + For DataFrame, it works in the same way: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], + ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], + ... }) + >>> ax = df.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to over-fitting, while using a large bandwidth value may result + in under-fitting: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) + """ + return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) + + density = kde + + def area(self, x=None, y=None, **kwargs): + """ + Draw a stacked area plot. + + An area plot displays quantitative data visually. + This function wraps the matplotlib area function. + + Parameters + ---------- + x : label or position, optional + Coordinates for the X axis. By default uses the index. + y : label or position, optional + Column to plot. By default uses all columns. + stacked : bool, default True + Area plots are stacked by default. Set to False to create a + unstacked plot. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or numpy.ndarray + Area plot, or array of area plots if subplots is True. + + See Also + -------- + DataFrame.plot : Make plots of DataFrame using matplotlib / pylab. + + Examples + -------- + Draw an area plot based on basic business metrics: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'sales': [3, 2, 3, 9, 10, 6], + ... 'signups': [5, 5, 6, 12, 14, 13], + ... 'visits': [20, 42, 28, 62, 81, 50], + ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', + ... freq='M')) + >>> ax = df.plot.area() + + Area plots are stacked by default. To produce an unstacked plot, + pass ``stacked=False``: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.area(stacked=False) + + Draw an area plot for a single column: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.area(y='sales') + + Draw with a different `x`: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'sales': [3, 2, 3], + ... 'visits': [20, 42, 28], + ... 'day': [1, 2, 3], + ... }) + >>> ax = df.plot.area(x='day') + """ + return self(kind="area", x=x, y=y, **kwargs) + + def pie(self, **kwargs): + """ + Generate a pie plot. + + A pie plot is a proportional representation of the numerical data in a + column. 
This function wraps :meth:`matplotlib.pyplot.pie` for the + specified column. If no column reference is passed and + ``subplots=True`` a pie plot is drawn for each numerical column + independently. + + Parameters + ---------- + y : int or label, optional + Label or position of the column to plot. + If not provided, ``subplots=True`` argument must be passed. + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.axes.Axes or np.ndarray of them + A NumPy array is returned when `subplots` is True. + + See Also + -------- + Series.plot.pie : Generate a pie plot for a Series. + DataFrame.plot : Make plots of a DataFrame. + + Examples + -------- + In the example below we have a DataFrame with the information about + planet's mass and radius. We pass the the 'mass' column to the + pie function to get a pie plot. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97], + ... 'radius': [2439.7, 6051.8, 6378.1]}, + ... index=['Mercury', 'Venus', 'Earth']) + >>> plot = df.plot.pie(y='mass', figsize=(5, 5)) + + .. plot:: + :context: close-figs + + >>> plot = df.plot.pie(subplots=True, figsize=(6, 3)) + """ + if ( + isinstance(self._parent, ABCDataFrame) + and kwargs.get("y", None) is None + and not kwargs.get("subplots", False) + ): + raise ValueError("pie requires either y column or 'subplots=True'") + return self(kind="pie", **kwargs) + + def scatter(self, x, y, s=None, c=None, **kwargs): + """ + Create a scatter plot with varying marker point size and color. + + The coordinates of each point are defined by two dataframe columns and + filled circles are used to represent each point. This kind of plot is + useful to see complex correlations between two variables. Points could + be for instance natural 2D coordinates like longitude and latitude in + a map or, in general, any pair of metrics that can be plotted against + each other. + + Parameters + ---------- + x : int or str + The column name or column position to be used as horizontal + coordinates for each point. + y : int or str + The column name or column position to be used as vertical + coordinates for each point. + s : scalar or array_like, optional + The size of each point. Possible values are: + + - A single scalar so all points have the same size. + + - A sequence of scalars, which will be used for each point's size + recursively. For instance, when passing [2,14] all points size + will be either 2 or 14, alternatively. + + c : str, int or array_like, optional + The color of each point. Possible values are: + + - A single color string referred to by name, RGB or RGBA code, + for instance 'red' or '#a98d19'. + + - A sequence of color strings referred to by name, RGB or RGBA + code, which will be used for each point's color recursively. For + instance ['green','yellow'] all points will be filled in green or + yellow, alternatively. + + - A column name or position whose values will be used to color the + marker points according to a colormap. + + **kwargs + Keyword arguments to pass on to :meth:`DataFrame.plot`. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + + See Also + -------- + matplotlib.pyplot.scatter : Scatter plot using multiple input data + formats. + + Examples + -------- + Let's see how to draw a scatter plot using coordinates from the values + in a DataFrame's columns. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1], + ... 
[6.4, 3.2, 1], [5.9, 3.0, 2]], + ... columns=['length', 'width', 'species']) + >>> ax1 = df.plot.scatter(x='length', + ... y='width', + ... c='DarkBlue') + + And now with the color determined by a column as well. + + .. plot:: + :context: close-figs + + >>> ax2 = df.plot.scatter(x='length', + ... y='width', + ... c='species', + ... colormap='viridis') + """ + return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) + + def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs): + """ + Generate a hexagonal binning plot. + + Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None` + (the default), this is a histogram of the number of occurrences + of the observations at ``(x[i], y[i])``. + + If `C` is specified, specifies values at given coordinates + ``(x[i], y[i])``. These values are accumulated for each hexagonal + bin and then reduced according to `reduce_C_function`, + having as default the NumPy's mean function (:meth:`numpy.mean`). + (If `C` is specified, it must also be a 1-D sequence + of the same length as `x` and `y`, or a column label.) + + Parameters + ---------- + x : int or str + The column label or position for x points. + y : int or str + The column label or position for y points. + C : int or str, optional + The column label or position for the value of `(x, y)` point. + reduce_C_function : callable, default `np.mean` + Function of one argument that reduces all the values in a bin to + a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`). + gridsize : int or tuple of (int, int), default 100 + The number of hexagons in the x-direction. + The corresponding number of hexagons in the y-direction is + chosen in a way that the hexagons are approximately regular. + Alternatively, gridsize can be a tuple with two elements + specifying the number of hexagons in the x-direction and the + y-direction. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + matplotlib.AxesSubplot + The matplotlib ``Axes`` on which the hexbin is plotted. + + See Also + -------- + DataFrame.plot : Make plots of a DataFrame. + matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib, + the matplotlib function that is used under the hood. + + Examples + -------- + The following examples are generated with random data from + a normal distribution. + + .. plot:: + :context: close-figs + + >>> n = 10000 + >>> df = pd.DataFrame({'x': np.random.randn(n), + ... 'y': np.random.randn(n)}) + >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20) + + The next example uses `C` and `np.sum` as `reduce_C_function`. + Note that `'observations'` values ranges from 1 to 5 but the result + plot shows values up to more than 25. This is because of the + `reduce_C_function`. + + .. plot:: + :context: close-figs + + >>> n = 500 + >>> df = pd.DataFrame({ + ... 'coord_x': np.random.uniform(-3, 3, size=n), + ... 'coord_y': np.random.uniform(30, 50, size=n), + ... 'observations': np.random.randint(1,5, size=n) + ... }) + >>> ax = df.plot.hexbin(x='coord_x', + ... y='coord_y', + ... C='observations', + ... reduce_C_function=np.sum, + ... gridsize=10, + ... 
cmap="viridis") + """ + if reduce_C_function is not None: + kwargs["reduce_C_function"] = reduce_C_function + if gridsize is not None: + kwargs["gridsize"] = gridsize + + return self(kind="hexbin", x=x, y=y, C=C, **kwargs) + + +_backends = {} + + +def _find_backend(backend: str): + """ + Find a pandas plotting backend> + + Parameters + ---------- + backend : str + The identifier for the backend. Either an entrypoint item registered + with pkg_resources, or a module name. + + Notes + ----- + Modifies _backends with imported backends as a side effect. + + Returns + ------- + types.ModuleType + The imported backend. + """ + import pkg_resources # Delay import for performance. + + for entry_point in pkg_resources.iter_entry_points("pandas_plotting_backends"): + if entry_point.name == "matplotlib": + # matplotlib is an optional dependency. When + # missing, this would raise. + continue + _backends[entry_point.name] = entry_point.load() + + try: + return _backends[backend] + except KeyError: + # Fall back to unregisted, module name approach. + try: + module = importlib.import_module(backend) + except ImportError: + # We re-raise later on. + pass + else: + if hasattr(module, "plot"): + # Validate that the interface is implemented when the option + # is set, rather than at plot time. + _backends[backend] = module + return module + + raise ValueError( + f"Could not find plotting backend '{backend}'. Ensure that you've installed " + f"the package providing the '{backend}' entrypoint, or that the package has a " + "top-level `.plot` method." + ) + + +def _get_plot_backend(backend=None): + """ + Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`). + + The plotting system of pandas has been using matplotlib, but the idea here + is that it can also work with other third-party backends. In the future, + this function will return the backend from a pandas option, and all the + rest of the code in this file will use the backend specified there for the + plotting. + + The backend is imported lazily, as matplotlib is a soft dependency, and + pandas can be used without it being installed. + """ + backend = backend or get_option("plotting.backend") + + if backend == "matplotlib": + # Because matplotlib is an optional dependency and first-party backend, + # we need to attempt an import here to raise an ImportError if needed. + try: + import pandas.plotting._matplotlib as module + except ImportError: + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' 
+ ) from None + + _backends["matplotlib"] = module + + if backend in _backends: + return _backends[backend] + + module = _find_backend(backend) + _backends[backend] = module + return module diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/__init__.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/__init__.py new file mode 100644 index 0000000..27b1d55 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/__init__.py @@ -0,0 +1,83 @@ +from typing import TYPE_CHECKING, Dict, Type + +from pandas.plotting._matplotlib.boxplot import ( + BoxPlot, + boxplot, + boxplot_frame, + boxplot_frame_groupby, +) +from pandas.plotting._matplotlib.converter import deregister, register +from pandas.plotting._matplotlib.core import ( + AreaPlot, + BarhPlot, + BarPlot, + HexBinPlot, + LinePlot, + PiePlot, + ScatterPlot, +) +from pandas.plotting._matplotlib.hist import HistPlot, KdePlot, hist_frame, hist_series +from pandas.plotting._matplotlib.misc import ( + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + lag_plot, + parallel_coordinates, + radviz, + scatter_matrix, +) +from pandas.plotting._matplotlib.tools import table + +if TYPE_CHECKING: + from pandas.plotting._matplotlib.core import MPLPlot # noqa: F401 + +PLOT_CLASSES: Dict[str, Type["MPLPlot"]] = { + "line": LinePlot, + "bar": BarPlot, + "barh": BarhPlot, + "box": BoxPlot, + "hist": HistPlot, + "kde": KdePlot, + "area": AreaPlot, + "pie": PiePlot, + "scatter": ScatterPlot, + "hexbin": HexBinPlot, +} + + +def plot(data, kind, **kwargs): + # Importing pyplot at the top of the file (before the converters are + # registered) causes problems in matplotlib 2 (converters seem to not + # work) + import matplotlib.pyplot as plt + + if kwargs.pop("reuse_plot", False): + ax = kwargs.get("ax") + if ax is None and len(plt.get_fignums()) > 0: + with plt.rc_context(): + ax = plt.gca() + kwargs["ax"] = getattr(ax, "left_ax", ax) + plot_obj = PLOT_CLASSES[kind](data, **kwargs) + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +__all__ = [ + "plot", + "hist_series", + "hist_frame", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "table", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", + "lag_plot", + "parallel_coordinates", + "radviz", + "scatter_matrix", + "register", + "deregister", +] diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/boxplot.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/boxplot.py new file mode 100644 index 0000000..deeeb00 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/boxplot.py @@ -0,0 +1,436 @@ +from collections import namedtuple +import warnings + +from matplotlib.artist import setp +import numpy as np + +from pandas.core.dtypes.common import is_dict_like +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import remove_na_arraylike + +import pandas as pd + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.core import LinePlot, MPLPlot +from pandas.plotting._matplotlib.style import _get_standard_colors +from pandas.plotting._matplotlib.tools import _flatten, _subplots + + +class BoxPlot(LinePlot): + _kind = "box" + _layout_type = "horizontal" + + _valid_return_types = (None, "axes", "dict", "both") + # namedtuple to hold results + BP = namedtuple("Boxplot", ["ax", "lines"]) + + def __init__(self, data, return_type="axes", **kwargs): + # Do not call LinePlot.__init__ which may fill nan + if return_type not in self._valid_return_types: 
+ raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + + self.return_type = return_type + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if self.subplots: + # Disable label ax sharing. Otherwise, all subplots shows last + # column label + if self.orientation == "vertical": + self.sharex = False + else: + self.sharey = False + + @classmethod + def _plot(cls, ax, y, column_num=None, return_type="axes", **kwds): + if y.ndim == 2: + y = [remove_na_arraylike(v) for v in y] + # Boxplot fails with empty arrays, so need to add a NaN + # if any cols are empty + # GH 8181 + y = [v if v.size > 0 else np.array([np.nan]) for v in y] + else: + y = remove_na_arraylike(y) + bp = ax.boxplot(y, **kwds) + + if return_type == "dict": + return bp, bp + elif return_type == "both": + return cls.BP(ax=ax, lines=bp), bp + else: + return ax, bp + + def _validate_color_args(self): + if "color" in self.kwds: + if self.colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'" + ) + self.color = self.kwds.pop("color") + + if isinstance(self.color, dict): + valid_keys = ["boxes", "whiskers", "medians", "caps"] + for key, values in self.color.items(): + if key not in valid_keys: + raise ValueError( + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" + ) + else: + self.color = None + + # get standard colors for default + colors = _get_standard_colors(num_colors=3, colormap=self.colormap, color=None) + # use 2 colors by default, for box/whisker and median + # flier colors isn't needed here + # because it can be specified by ``sym`` kw + self._boxes_c = colors[0] + self._whiskers_c = colors[0] + self._medians_c = colors[2] + self._caps_c = "k" # mpl default + + def _get_colors(self, num_colors=None, color_kwds="color"): + pass + + def maybe_color_bp(self, bp): + if isinstance(self.color, dict): + boxes = self.color.get("boxes", self._boxes_c) + whiskers = self.color.get("whiskers", self._whiskers_c) + medians = self.color.get("medians", self._medians_c) + caps = self.color.get("caps", self._caps_c) + else: + # Other types are forwarded to matplotlib + # If None, use default colors + boxes = self.color or self._boxes_c + whiskers = self.color or self._whiskers_c + medians = self.color or self._medians_c + caps = self.color or self._caps_c + + setp(bp["boxes"], color=boxes, alpha=1) + setp(bp["whiskers"], color=whiskers, alpha=1) + setp(bp["medians"], color=medians, alpha=1) + setp(bp["caps"], color=caps, alpha=1) + + def _make_plot(self): + if self.subplots: + self._return_obj = pd.Series(dtype=object) + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + kwds = self.kwds.copy() + + ret, bp = self._plot( + ax, y, column_num=i, return_type=self.return_type, **kwds + ) + self.maybe_color_bp(bp) + self._return_obj[label] = ret + + label = [pprint_thing(label)] + self._set_ticklabels(ax, label) + else: + y = self.data.values.T + ax = self._get_ax(0) + kwds = self.kwds.copy() + + ret, bp = self._plot( + ax, y, column_num=0, return_type=self.return_type, **kwds + ) + self.maybe_color_bp(bp) + self._return_obj = ret + + labels = [l for l, _ in self._iter_data()] + labels = [pprint_thing(l) for l in labels] + if not self.use_index: + labels = [pprint_thing(key) for key in range(len(labels))] + self._set_ticklabels(ax, labels) + + def _set_ticklabels(self, ax, labels): + if self.orientation == "vertical": + ax.set_xticklabels(labels) + else: + ax.set_yticklabels(labels) + + 
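    # Rough usage sketch (editorial note, not part of this module; the frame and
    # the colors are illustrative): a dict passed as ``color`` to
    # DataFrame.plot.box is routed through _validate_color_args and
    # maybe_color_bp above, so each artist group can be styled independently.
    #
    #   df = pd.DataFrame({"a": range(10), "b": range(10)})
    #   df.plot.box(color={"boxes": "darkblue", "whiskers": "grey",
    #                      "medians": "red", "caps": "black"})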
def _make_legend(self): + pass + + def _post_plot_logic(self, ax, data): + pass + + @property + def orientation(self): + if self.kwds.get("vert", True): + return "vertical" + else: + return "horizontal" + + @property + def result(self): + if self.return_type is None: + return super().result + else: + return self._return_obj + + +def _grouped_plot_by_column( + plotf, + data, + columns=None, + by=None, + numeric_only=True, + grid=False, + figsize=None, + ax=None, + layout=None, + return_type=None, + **kwargs, +): + grouped = data.groupby(by) + if columns is None: + if not isinstance(by, (list, tuple)): + by = [by] + columns = data._get_numeric_data().columns.difference(by) + naxes = len(columns) + fig, axes = _subplots( + naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout + ) + + _axes = _flatten(axes) + + ax_values = [] + + for i, col in enumerate(columns): + ax = _axes[i] + gp_col = grouped[col] + keys, values = zip(*gp_col) + re_plotf = plotf(keys, values, ax, **kwargs) + ax.set_title(col) + ax.set_xlabel(pprint_thing(by)) + ax_values.append(re_plotf) + ax.grid(grid) + + result = pd.Series(ax_values, index=columns) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type is None: + result = axes + + byline = by[0] if len(by) == 1 else by + fig.suptitle(f"Boxplot grouped by {byline}") + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + + return result + + +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds, +): + + import matplotlib.pyplot as plt + + # validate return_type: + if return_type not in BoxPlot._valid_return_types: + raise ValueError("return_type must be {'axes', 'dict', 'both'}") + + if isinstance(data, ABCSeries): + data = data.to_frame("x") + column = "x" + + def _get_colors(): + # num_colors=3 is required as method maybe_color_bp takes the colors + # in positions 0 and 2. + # if colors not provided, use same defaults as DataFrame.plot.box + result = _get_standard_colors(num_colors=3) + result = np.take(result, [0, 0, 2]) + result = np.append(result, "k") + + colors = kwds.pop("color", None) + if colors: + if is_dict_like(colors): + # replace colors in result array with user-specified colors + # taken from the colors dict parameter + # "boxes" value placed in position 0, "whiskers" in 1, etc. + valid_keys = ["boxes", "whiskers", "medians", "caps"] + key_to_index = dict(zip(valid_keys, range(4))) + for key, value in colors.items(): + if key in valid_keys: + result[key_to_index[key]] = value + else: + raise ValueError( + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" + ) + else: + result.fill(colors) + + return result + + def maybe_color_bp(bp): + setp(bp["boxes"], color=colors[0], alpha=1) + setp(bp["whiskers"], color=colors[1], alpha=1) + setp(bp["medians"], color=colors[2], alpha=1) + setp(bp["caps"], color=colors[3], alpha=1) + + def plot_group(keys, values, ax): + keys = [pprint_thing(x) for x in keys] + values = [np.asarray(remove_na_arraylike(v)) for v in values] + bp = ax.boxplot(values, **kwds) + if fontsize is not None: + ax.tick_params(axis="both", labelsize=fontsize) + if kwds.get("vert", 1): + ax.set_xticklabels(keys, rotation=rot) + else: + ax.set_yticklabels(keys, rotation=rot) + maybe_color_bp(bp) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type == "dict": + return bp + elif return_type == "both": + return BoxPlot.BP(ax=ax, lines=bp) + else: + return ax + + colors = _get_colors() + if column is None: + columns = None + else: + if isinstance(column, (list, tuple)): + columns = column + else: + columns = [column] + + if by is not None: + # Prefer array return type for 2-D plots to match the subplot layout + # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580 + result = _grouped_plot_by_column( + plot_group, + data, + columns=columns, + by=by, + grid=grid, + figsize=figsize, + ax=ax, + layout=layout, + return_type=return_type, + ) + else: + if return_type is None: + return_type = "axes" + if layout is not None: + raise ValueError("The 'layout' keyword is not supported when 'by' is None") + + if ax is None: + rc = {"figure.figsize": figsize} if figsize is not None else {} + with plt.rc_context(rc): + ax = plt.gca() + data = data._get_numeric_data() + if columns is None: + columns = data.columns + else: + data = data[columns] + + result = plot_group(columns, data.values.T, ax) + ax.grid(grid) + + return result + + +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds, +): + import matplotlib.pyplot as plt + + ax = boxplot( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + grid=grid, + rot=rot, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds, + ) + plt.draw_if_interactive() + return ax + + +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + **kwds, +): + if subplots is True: + naxes = len(grouped) + fig, axes = _subplots( + naxes=naxes, + squeeze=False, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) + axes = _flatten(axes) + + ret = pd.Series(dtype=object) + + for (key, group), ax in zip(grouped, axes): + d = group.boxplot( + ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds + ) + ax.set_title(pprint_thing(key)) + ret.loc[key] = d + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + else: + keys, frames = zip(*grouped) + if grouped.axis == 0: + df = pd.concat(frames, keys=keys, axis=1) + else: + if len(frames) > 1: + df = frames[0].join(frames[1::]) + else: + df = frames[0] + ret = df.boxplot( + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + **kwds, + ) + return ret diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/compat.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/compat.py new file mode 100644 index 
0000000..e785506 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/compat.py @@ -0,0 +1,22 @@ +# being a bit too dynamic +from distutils.version import LooseVersion +import operator + + +def _mpl_version(version, op): + def inner(): + try: + import matplotlib as mpl + except ImportError: + return False + return ( + op(LooseVersion(mpl.__version__), LooseVersion(version)) + and str(mpl.__version__)[0] != "0" + ) + + return inner + + +_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) +_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) +_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/converter.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/converter.py new file mode 100644 index 0000000..5b37ebb --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/converter.py @@ -0,0 +1,1132 @@ +import contextlib +import datetime as pydt +from datetime import datetime, timedelta +import functools + +from dateutil.relativedelta import relativedelta +import matplotlib.dates as dates +from matplotlib.ticker import AutoLocator, Formatter, Locator +from matplotlib.transforms import nonsingular +import matplotlib.units as units +import numpy as np + +from pandas._libs import lib, tslibs +from pandas._libs.tslibs import resolution +from pandas._libs.tslibs.frequencies import FreqGroup, get_freq + +from pandas.core.dtypes.common import ( + is_datetime64_ns_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_nested_list_like, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas import Index, get_option +import pandas.core.common as com +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import Period, PeriodIndex, period_range +import pandas.core.tools.datetimes as tools + +# constants +HOURS_PER_DAY = 24.0 +MIN_PER_HOUR = 60.0 +SEC_PER_MIN = 60.0 + +SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR +SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY + +MUSEC_PER_DAY = 1e6 * SEC_PER_DAY + +_mpl_units = {} # Cache for units overwritten by us + + +def get_pairs(): + pairs = [ + (tslibs.Timestamp, DatetimeConverter), + (Period, PeriodConverter), + (pydt.datetime, DatetimeConverter), + (pydt.date, DatetimeConverter), + (pydt.time, TimeConverter), + (np.datetime64, DatetimeConverter), + ] + return pairs + + +def register_pandas_matplotlib_converters(func): + """ + Decorator applying pandas_converters. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with pandas_converters(): + return func(*args, **kwargs) + + return wrapper + + +@contextlib.contextmanager +def pandas_converters(): + """ + Context manager registering pandas' converters for a plot. + + See Also + -------- + register_pandas_matplotlib_converters : Decorator that applies this. 
+ """ + value = get_option("plotting.matplotlib.register_converters") + + if value: + # register for True or "auto" + register() + try: + yield + finally: + if value == "auto": + # only deregister for "auto" + deregister() + + +def register(): + pairs = get_pairs() + for type_, cls in pairs: + # Cache previous converter if present + if type_ in units.registry and not isinstance(units.registry[type_], cls): + previous = units.registry[type_] + _mpl_units[type_] = previous + # Replace with pandas converter + units.registry[type_] = cls() + + +def deregister(): + # Renamed in pandas.plotting.__init__ + for type_, cls in get_pairs(): + # We use type to catch our classes directly, no inheritance + if type(units.registry.get(type_)) is cls: + units.registry.pop(type_) + + # restore the old keys + for unit, formatter in _mpl_units.items(): + if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}: + # make it idempotent by excluding ours. + units.registry[unit] = formatter + + +def _to_ordinalf(tm): + tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + float(tm.microsecond / 1e6) + return tot_sec + + +def time2num(d): + if isinstance(d, str): + parsed = tools.to_datetime(d) + if not isinstance(parsed, datetime): + raise ValueError(f"Could not parse time {d}") + return _to_ordinalf(parsed.time()) + if isinstance(d, pydt.time): + return _to_ordinalf(d) + return d + + +class TimeConverter(units.ConversionInterface): + @staticmethod + def convert(value, unit, axis): + valid_types = (str, pydt.time) + if isinstance(value, valid_types) or is_integer(value) or is_float(value): + return time2num(value) + if isinstance(value, Index): + return value.map(time2num) + if isinstance(value, (list, tuple, np.ndarray, Index)): + return [time2num(x) for x in value] + return value + + @staticmethod + def axisinfo(unit, axis): + if unit != "time": + return None + + majloc = AutoLocator() + majfmt = TimeFormatter(majloc) + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") + + @staticmethod + def default_units(x, axis): + return "time" + + +# time formatter +class TimeFormatter(Formatter): + def __init__(self, locs): + self.locs = locs + + def __call__(self, x, pos=0): + """ + Return the time of day as a formatted string. + + Parameters + ---------- + x : float + The time of day specified as seconds since 00:00 (midnight), + with up to microsecond precision. + pos + Unused + + Returns + ------- + str + A string in HH:MM:SS.mmmuuu format. Microseconds, + milliseconds and seconds are only displayed if non-zero. 
+ """ + fmt = "%H:%M:%S.%f" + s = int(x) + msus = int(round((x - s) * 1e6)) + ms = msus // 1000 + us = msus % 1000 + m, s = divmod(s, 60) + h, m = divmod(m, 60) + _, h = divmod(h, 24) + if us != 0: + return pydt.time(h, m, s, msus).strftime(fmt) + elif ms != 0: + return pydt.time(h, m, s, msus).strftime(fmt)[:-3] + elif s != 0: + return pydt.time(h, m, s).strftime("%H:%M:%S") + + return pydt.time(h, m).strftime("%H:%M") + + +# Period Conversion + + +class PeriodConverter(dates.DateConverter): + @staticmethod + def convert(values, units, axis): + if is_nested_list_like(values): + values = [PeriodConverter._convert_1d(v, units, axis) for v in values] + else: + values = PeriodConverter._convert_1d(values, units, axis) + return values + + @staticmethod + def _convert_1d(values, units, axis): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) + if isinstance(values, valid_types) or is_integer(values) or is_float(values): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq)._ndarray_values + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq)._ndarray_values + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] + return values + + +def get_datevalue(date, freq): + if isinstance(date, Period): + return date.asfreq(freq).ordinal + elif isinstance(date, (str, datetime, pydt.date, pydt.time, np.datetime64)): + return Period(date, freq).ordinal + elif ( + is_integer(date) + or is_float(date) + or (isinstance(date, (np.ndarray, Index)) and (date.size == 1)) + ): + return date + elif date is None: + return None + raise ValueError(f"Unrecognizable date '{date}'") + + +def _dt_to_float_ordinal(dt): + """ + Convert :mod:`datetime` to the Gregorian date as UTC float days, + preserving hours, minutes, seconds and microseconds. Return value + is a :func:`float`. + """ + if isinstance(dt, (np.ndarray, Index, ABCSeries)) and is_datetime64_ns_dtype(dt): + base = dates.epoch2num(dt.asi8 / 1.0e9) + else: + base = dates.date2num(dt) + return base + + +# Datetime Conversion +class DatetimeConverter(dates.DateConverter): + @staticmethod + def convert(values, unit, axis): + # values might be a 1-d array, or a list-like of arrays. 
+ if is_nested_list_like(values): + values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] + else: + values = DatetimeConverter._convert_1d(values, unit, axis) + return values + + @staticmethod + def _convert_1d(values, unit, axis): + def try_parse(values): + try: + return _dt_to_float_ordinal(tools.to_datetime(values)) + except Exception: + return values + + if isinstance(values, (datetime, pydt.date)): + return _dt_to_float_ordinal(values) + elif isinstance(values, np.datetime64): + return _dt_to_float_ordinal(tslibs.Timestamp(values)) + elif isinstance(values, pydt.time): + return dates.date2num(values) + elif is_integer(values) or is_float(values): + return values + elif isinstance(values, str): + return try_parse(values) + elif isinstance(values, (list, tuple, np.ndarray, Index, ABCSeries)): + if isinstance(values, ABCSeries): + # https://github.com/matplotlib/matplotlib/issues/11391 + # Series was skipped. Convert to DatetimeIndex to get asi8 + values = Index(values) + if isinstance(values, Index): + values = values.values + if not isinstance(values, np.ndarray): + values = com.asarray_tuplesafe(values) + + if is_integer_dtype(values) or is_float_dtype(values): + return values + + try: + values = tools.to_datetime(values) + if isinstance(values, Index): + values = _dt_to_float_ordinal(values) + else: + values = [_dt_to_float_ordinal(x) for x in values] + except Exception: + values = _dt_to_float_ordinal(values) + + return values + + @staticmethod + def axisinfo(unit, axis): + """ + Return the :class:`~matplotlib.units.AxisInfo` for *unit*. + + *unit* is a tzinfo instance or None. + The *axis* argument is required but not used. + """ + tz = unit + + majloc = PandasAutoDateLocator(tz=tz) + majfmt = PandasAutoDateFormatter(majloc, tz=tz) + datemin = pydt.date(2000, 1, 1) + datemax = pydt.date(2010, 1, 1) + + return units.AxisInfo( + majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax) + ) + + +class PandasAutoDateFormatter(dates.AutoDateFormatter): + def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): + dates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt) + + +class PandasAutoDateLocator(dates.AutoDateLocator): + def get_locator(self, dmin, dmax): + """Pick the best locator based on a distance.""" + delta = relativedelta(dmax, dmin) + + num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days + num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds + tot_sec = num_days * 86400.0 + num_sec + + if abs(tot_sec) < self.minticks: + self._freq = -1 + locator = MilliSecondLocator(self.tz) + locator.set_axis(self.axis) + + locator.set_view_interval(*self.axis.get_view_interval()) + locator.set_data_interval(*self.axis.get_data_interval()) + return locator + + return dates.AutoDateLocator.get_locator(self, dmin, dmax) + + def _get_unit(self): + return MilliSecondLocator.get_unit_generic(self._freq) + + +class MilliSecondLocator(dates.DateLocator): + + UNIT = 1.0 / (24 * 3600 * 1000) + + def __init__(self, tz): + dates.DateLocator.__init__(self, tz) + self._interval = 1.0 + + def _get_unit(self): + return self.get_unit_generic(-1) + + @staticmethod + def get_unit_generic(freq): + unit = dates.RRuleLocator.get_unit_generic(freq) + if unit < 0: + return MilliSecondLocator.UNIT + return unit + + def __call__(self): + # if no data have been set, this will tank with a ValueError + try: + dmin, dmax = self.viewlim_to_dt() + except ValueError: + return [] + + # We need to cap at the endpoints of valid datetime + + # 
FIXME: dont leave commented-out + # TODO(wesm) unused? + # if dmin > dmax: + # dmax, dmin = dmin, dmax + # delta = relativedelta(dmax, dmin) + # try: + # start = dmin - delta + # except ValueError: + # start = _from_ordinal(1.0) + + # try: + # stop = dmax + delta + # except ValueError: + # # The magic number! + # stop = _from_ordinal(3652059.9999999) + + nmax, nmin = dates.date2num((dmax, dmin)) + + num = (nmax - nmin) * 86400 * 1000 + max_millis_ticks = 6 + for interval in [1, 10, 50, 100, 200, 500]: + if num <= interval * (max_millis_ticks - 1): + self._interval = interval + break + else: + # We went through the whole loop without breaking, default to 1 + self._interval = 1000.0 + + estimate = (nmax - nmin) / (self._get_unit() * self._get_interval()) + + if estimate > self.MAXTICKS * 2: + raise RuntimeError( + "MillisecondLocator estimated to generate " + f"{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + f"* 2 ({self.MAXTICKS * 2:d}) " + ) + + interval = self._get_interval() + freq = f"{interval}L" + tz = self.tz.tzname(None) + st = _from_ordinal(dates.date2num(dmin)) # strip tz + ed = _from_ordinal(dates.date2num(dmax)) + all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) + + try: + if len(all_dates) > 0: + locs = self.raise_if_exceeds(dates.date2num(all_dates)) + return locs + except Exception: # pragma: no cover + pass + + lims = dates.date2num([dmin, dmax]) + return lims + + def _get_interval(self): + return self._interval + + def autoscale(self): + """ + Set the view limits to include the data range. + """ + dmin, dmax = self.datalim_to_dt() + if dmin > dmax: + dmax, dmin = dmin, dmax + + # We need to cap at the endpoints of valid datetime + + # FIXME: dont leave commented-out + # TODO(wesm): unused? + + # delta = relativedelta(dmax, dmin) + # try: + # start = dmin - delta + # except ValueError: + # start = _from_ordinal(1.0) + + # try: + # stop = dmax + delta + # except ValueError: + # # The magic number! + # stop = _from_ordinal(3652059.9999999) + + dmin, dmax = self.datalim_to_dt() + + vmin = dates.date2num(dmin) + vmax = dates.date2num(dmax) + + return self.nonsingular(vmin, vmax) + + +def _from_ordinal(x, tz=None): + ix = int(x) + dt = datetime.fromordinal(ix) + remainder = float(x) - ix + hour, remainder = divmod(24 * remainder, 1) + minute, remainder = divmod(60 * remainder, 1) + second, remainder = divmod(60 * remainder, 1) + microsecond = int(1e6 * remainder) + if microsecond < 10: + microsecond = 0 # compensate for rounding errors + dt = datetime( + dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond + ) + if tz is not None: + dt = dt.astimezone(tz) + + if microsecond > 999990: # compensate for rounding errors + dt += timedelta(microseconds=1e6 - microsecond) + + return dt + + +# Fixed frequency dynamic tick locators and formatters + +# ------------------------------------------------------------------------- +# --- Locators --- +# ------------------------------------------------------------------------- + + +def _get_default_annual_spacing(nyears): + """ + Returns a default spacing between consecutive ticks for annual data. 
+ """ + if nyears < 11: + (min_spacing, maj_spacing) = (1, 1) + elif nyears < 20: + (min_spacing, maj_spacing) = (1, 2) + elif nyears < 50: + (min_spacing, maj_spacing) = (1, 5) + elif nyears < 100: + (min_spacing, maj_spacing) = (5, 10) + elif nyears < 200: + (min_spacing, maj_spacing) = (5, 25) + elif nyears < 600: + (min_spacing, maj_spacing) = (10, 50) + else: + factor = nyears // 1000 + 1 + (min_spacing, maj_spacing) = (factor * 20, factor * 100) + return (min_spacing, maj_spacing) + + +def period_break(dates, period): + """ + Returns the indices where the given period changes. + + Parameters + ---------- + dates : PeriodIndex + Array of intervals to monitor. + period : string + Name of the period to monitor. + """ + current = getattr(dates, period) + previous = getattr(dates - 1 * dates.freq, period) + return np.nonzero(current - previous)[0] + + +def has_level_label(label_flags, vmin): + """ + Returns true if the ``label_flags`` indicate there is at least one label + for this level. + + if the minimum view limit is not an exact integer, then the first tick + label won't be shown, so we must adjust for that. + """ + if label_flags.size == 0 or ( + label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0 + ): + return False + else: + return True + + +def _daily_finder(vmin, vmax, freq): + periodsperday = -1 + + if freq >= FreqGroup.FR_HR: + if freq == FreqGroup.FR_NS: + periodsperday = 24 * 60 * 60 * 1000000000 + elif freq == FreqGroup.FR_US: + periodsperday = 24 * 60 * 60 * 1000000 + elif freq == FreqGroup.FR_MS: + periodsperday = 24 * 60 * 60 * 1000 + elif freq == FreqGroup.FR_SEC: + periodsperday = 24 * 60 * 60 + elif freq == FreqGroup.FR_MIN: + periodsperday = 24 * 60 + elif freq == FreqGroup.FR_HR: + periodsperday = 24 + else: # pragma: no cover + raise ValueError(f"unexpected frequency: {freq}") + periodsperyear = 365 * periodsperday + periodspermonth = 28 * periodsperday + + elif freq == FreqGroup.FR_BUS: + periodsperyear = 261 + periodspermonth = 19 + elif freq == FreqGroup.FR_DAY: + periodsperyear = 365 + periodspermonth = 28 + elif resolution.get_freq_group(freq) == FreqGroup.FR_WK: + periodsperyear = 52 + periodspermonth = 3 + else: # pragma: no cover + raise ValueError("unexpected frequency") + + # save this for later usage + vmin_orig = vmin + + (vmin, vmax) = ( + Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq), + ) + span = vmax.ordinal - vmin.ordinal + 1 + dates_ = period_range(start=vmin, end=vmax, freq=freq) + # Initialize the output + info = np.zeros( + span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] + ) + info["val"][:] = dates_._ndarray_values + info["fmt"][:] = "" + info["maj"][[0, -1]] = True + # .. and set some shortcuts + info_maj = info["maj"] + info_min = info["min"] + info_fmt = info["fmt"] + + def first_label(label_flags): + if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0): + return label_flags[1] + else: + return label_flags[0] + + # Case 1. 
Less than a month + if span <= periodspermonth: + day_start = period_break(dates_, "day") + month_start = period_break(dates_, "month") + + def _hour_finder(label_interval, force_year_start): + _hour = dates_.hour + _prev_hour = (dates_ - 1 * dates_.freq).hour + hour_start = (_hour - _prev_hour) != 0 + info_maj[day_start] = True + info_min[hour_start & (_hour % label_interval == 0)] = True + year_start = period_break(dates_, "year") + info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" + if force_year_start and not has_level_label(year_start, vmin_orig): + info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" + + def _minute_finder(label_interval): + hour_start = period_break(dates_, "hour") + _minute = dates_.minute + _prev_minute = (dates_ - 1 * dates_.freq).minute + minute_start = (_minute - _prev_minute) != 0 + info_maj[hour_start] = True + info_min[minute_start & (_minute % label_interval == 0)] = True + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" + + def _second_finder(label_interval): + minute_start = period_break(dates_, "minute") + _second = dates_.second + _prev_second = (dates_ - 1 * dates_.freq).second + second_start = (_second - _prev_second) != 0 + info["maj"][minute_start] = True + info["min"][second_start & (_second % label_interval == 0)] = True + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + info_fmt[day_start] = "%H:%M:%S\n%d-%b" + info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" + + if span < periodsperday / 12000.0: + _second_finder(1) + elif span < periodsperday / 6000.0: + _second_finder(2) + elif span < periodsperday / 2400.0: + _second_finder(5) + elif span < periodsperday / 1200.0: + _second_finder(10) + elif span < periodsperday / 800.0: + _second_finder(15) + elif span < periodsperday / 400.0: + _second_finder(30) + elif span < periodsperday / 150.0: + _minute_finder(1) + elif span < periodsperday / 70.0: + _minute_finder(2) + elif span < periodsperday / 24.0: + _minute_finder(5) + elif span < periodsperday / 12.0: + _minute_finder(15) + elif span < periodsperday / 6.0: + _minute_finder(30) + elif span < periodsperday / 2.5: + _hour_finder(1, False) + elif span < periodsperday / 1.5: + _hour_finder(2, False) + elif span < periodsperday * 1.25: + _hour_finder(3, False) + elif span < periodsperday * 2.5: + _hour_finder(6, True) + elif span < periodsperday * 4: + _hour_finder(12, True) + else: + info_maj[month_start] = True + info_min[day_start] = True + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[day_start] = "%d" + info_fmt[month_start] = "%d\n%b" + info_fmt[year_start] = "%d\n%b\n%Y" + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(day_start)] = "%d\n%b\n%Y" + else: + info_fmt[first_label(month_start)] = "%d\n%b\n%Y" + + # Case 2. 
Less than three months + elif span <= periodsperyear // 4: + month_start = period_break(dates_, "month") + info_maj[month_start] = True + if freq < FreqGroup.FR_HR: + info["min"] = True + else: + day_start = period_break(dates_, "day") + info["min"][day_start] = True + week_start = period_break(dates_, "week") + year_start = period_break(dates_, "year") + info_fmt[week_start] = "%d" + info_fmt[month_start] = "\n\n%b" + info_fmt[year_start] = "\n\n%b\n%Y" + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(week_start)] = "\n\n%b\n%Y" + else: + info_fmt[first_label(month_start)] = "\n\n%b\n%Y" + # Case 3. Less than 14 months ............... + elif span <= 1.15 * periodsperyear: + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") + week_start = period_break(dates_, "week") + info_maj[month_start] = True + info_min[week_start] = True + info_min[year_start] = False + info_min[month_start] = False + info_fmt[month_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + if not has_level_label(year_start, vmin_orig): + info_fmt[first_label(month_start)] = "%b\n%Y" + # Case 4. Less than 2.5 years ............... + elif span <= 2.5 * periodsperyear: + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") + month_start = period_break(dates_, "month") + info_maj[quarter_start] = True + info_min[month_start] = True + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + # Case 4. Less than 4 years ................. + elif span <= 4 * periodsperyear: + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") + info_maj[year_start] = True + info_min[month_start] = True + info_min[year_start] = False + + month_break = dates_[month_start].month + jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" + # Case 5. Less than 11 years ................ + elif span <= 11 * periodsperyear: + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") + info_maj[year_start] = True + info_min[quarter_start] = True + info_min[year_start] = False + info_fmt[year_start] = "%Y" + # Case 6. More than 12 years ................ 
+ else: + year_start = period_break(dates_, "year") + year_break = dates_[year_start].year + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(year_break % maj_anndef == 0)] + info_maj[major_idx] = True + minor_idx = year_start[(year_break % min_anndef == 0)] + info_min[minor_idx] = True + info_fmt[major_idx] = "%Y" + + return info + + +def _monthly_finder(vmin, vmax, freq): + periodsperyear = 12 + + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + + # Initialize the output + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + dates_ = info["val"] + info["fmt"] = "" + year_start = (dates_ % 12 == 0).nonzero()[0] + info_maj = info["maj"] + info_fmt = info["fmt"] + + if span <= 1.15 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + info_fmt[:] = "%b" + info_fmt[year_start] = "%b\n%Y" + + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = "%b\n%Y" + + elif span <= 2.5 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + # TODO: Check the following : is it really info['fmt'] ? + info["fmt"][quarter_start] = True + info["min"] = True + + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + + elif span <= 4 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" + + elif span <= 11 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + info["min"][quarter_start] = True + + info_fmt[year_start] = "%Y" + + else: + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + years = dates_[year_start] // 12 + 1 + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info["min"][year_start[(years % min_anndef == 0)]] = True + + info_fmt[major_idx] = "%Y" + + return info + + +def _quarterly_finder(vmin, vmax, freq): + periodsperyear = 4 + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + info_maj = info["maj"] + info_fmt = info["fmt"] + year_start = (dates_ % 4 == 0).nonzero()[0] + + if span <= 3.5 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + info_fmt[:] = "Q%q" + info_fmt[year_start] = "Q%q\n%F" + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = "Q%q\n%F" + + elif span <= 11 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + info_fmt[year_start] = "%F" + + else: + years = dates_[year_start] // 4 + 1 + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info["min"][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = "%F" + + return info + + +def _annual_finder(vmin, vmax, freq): + (vmin, vmax) = (int(vmin), int(vmax + 1)) + span = vmax - vmin + 1 + + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = 
np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + + (min_anndef, maj_anndef) = _get_default_annual_spacing(span) + major_idx = dates_ % maj_anndef == 0 + info["maj"][major_idx] = True + info["min"][(dates_ % min_anndef == 0)] = True + info["fmt"][major_idx] = "%Y" + + return info + + +def get_finder(freq): + if isinstance(freq, str): + freq = get_freq(freq) + fgroup = resolution.get_freq_group(freq) + + if fgroup == FreqGroup.FR_ANN: + return _annual_finder + elif fgroup == FreqGroup.FR_QTR: + return _quarterly_finder + elif freq == FreqGroup.FR_MTH: + return _monthly_finder + elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: + return _daily_finder + else: # pragma: no cover + raise NotImplementedError(f"Unsupported frequency: {freq}") + + +class TimeSeries_DateLocator(Locator): + """ + Locates the ticks along an axis controlled by a :class:`Series`. + + Parameters + ---------- + freq : {var} + Valid frequency specifier. + minor_locator : {False, True}, optional + Whether the locator is for minor ticks (True) or not. + dynamic_mode : {True, False}, optional + Whether the locator should work in dynamic mode. + base : {int}, optional + quarter : {int}, optional + month : {int}, optional + day : {int}, optional + """ + + def __init__( + self, + freq, + minor_locator=False, + dynamic_mode=True, + base=1, + quarter=1, + month=1, + day=1, + plot_obj=None, + ): + if isinstance(freq, str): + freq = get_freq(freq) + self.freq = freq + self.base = base + (self.quarter, self.month, self.day) = (quarter, month, day) + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _get_default_locs(self, vmin, vmax): + "Returns the default locations of ticks." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + + locator = self.plot_obj.date_axis_info + + if self.isminor: + return np.compress(locator["min"], locator["val"]) + return np.compress(locator["maj"], locator["val"]) + + def __call__(self): + "Return the locations of the ticks." + # axis calls Locator.set_axis inside set_m_formatter + + vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + vmin, vmax = vi + if vmax < vmin: + vmin, vmax = vmax, vmin + if self.isdynamic: + locs = self._get_default_locs(vmin, vmax) + else: # pragma: no cover + base = self.base + (d, m) = divmod(vmin, base) + vmin = (d + 1) * base + locs = list(range(vmin, vmax + 1, base)) + return locs + + def autoscale(self): + """ + Sets the view limits to the nearest multiples of base that contain the + data. + """ + # requires matplotlib >= 0.98.0 + (vmin, vmax) = self.axis.get_data_interval() + + locs = self._get_default_locs(vmin, vmax) + (vmin, vmax) = locs[[0, -1]] + if vmin == vmax: + vmin -= 1 + vmax += 1 + return nonsingular(vmin, vmax) + + +# ------------------------------------------------------------------------- +# --- Formatter --- +# ------------------------------------------------------------------------- + + +class TimeSeries_DateFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`PeriodIndex`. + + Parameters + ---------- + freq : {int, string} + Valid frequency specifier. + minor_locator : {False, True} + Whether the current formatter should apply to minor ticks (True) or + major ticks (False). 
+ dynamic_mode : {True, False} + Whether the formatter works in dynamic mode or not. + """ + + def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): + if isinstance(freq, str): + freq = get_freq(freq) + self.format = None + self.freq = freq + self.locs = [] + self.formatdict = None + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _set_default_format(self, vmin, vmax): + "Returns the default ticks spacing." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + info = self.plot_obj.date_axis_info + + if self.isminor: + format = np.compress(info["min"] & np.logical_not(info["maj"]), info) + else: + format = np.compress(info["maj"], info) + self.formatdict = {x: f for (x, _, _, f) in format} + return self.formatdict + + def set_locs(self, locs): + "Sets the locations of the ticks" + # don't actually use the locs. This is just needed to work with + # matplotlib. Force to use vmin, vmax + + self.locs = locs + + (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + if vmax < vmin: + (vmin, vmax) = (vmax, vmin) + self._set_default_format(vmin, vmax) + + def __call__(self, x, pos=0): + + if self.formatdict is None: + return "" + else: + fmt = self.formatdict.pop(x, "") + if isinstance(fmt, np.bytes_): + fmt = fmt.decode("utf-8") + return Period(ordinal=int(x), freq=self.freq).strftime(fmt) + + +class TimeSeries_TimedeltaFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. + """ + + @staticmethod + def format_timedelta_ticks(x, pos, n_decimals): + """ + Convert seconds to 'D days HH:MM:SS.F' + """ + s, ns = divmod(x, 1e9) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + d, h = divmod(h, 24) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}" + if n_decimals > 0: + s += f".{decimals:0{n_decimals}d}" + if d != 0: + s = f"{int(d):d} days {s}" + return s + + def __call__(self, x, pos=0): + (vmin, vmax) = tuple(self.axis.get_view_interval()) + n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin)))) + if n_decimals > 9: + n_decimals = 9 + return self.format_timedelta_ticks(x, pos, n_decimals) diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/core.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/core.py new file mode 100644 index 0000000..2d68bb4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/core.py @@ -0,0 +1,1517 @@ +import re +from typing import Optional +import warnings + +import numpy as np + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_number, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna, notna + +import pandas.core.common as com + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 +from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.style import _get_standard_colors +from pandas.plotting._matplotlib.tools import ( + _flatten, + 
_get_all_lines, + _get_xlim, + _handle_shared_axes, + _subplots, + format_date_labels, + table, +) + + +class MPLPlot: + """ + Base class for assembling a pandas plot using matplotlib + + Parameters + ---------- + data : + + """ + + @property + def _kind(self): + """Specify kind str. Must be overridden in child class""" + raise NotImplementedError + + _layout_type = "vertical" + _default_rot = 0 + orientation: Optional[str] = None + _pop_attributes = [ + "label", + "style", + "logy", + "logx", + "loglog", + "mark_right", + "stacked", + ] + _attr_defaults = { + "logy": False, + "logx": False, + "loglog": False, + "mark_right": True, + "stacked": False, + } + + def __init__( + self, + data, + kind=None, + by=None, + subplots=False, + sharex=None, + sharey=False, + use_index=True, + figsize=None, + grid=None, + legend=True, + rot=None, + ax=None, + fig=None, + title=None, + xlim=None, + ylim=None, + xticks=None, + yticks=None, + sort_columns=False, + fontsize=None, + secondary_y=False, + colormap=None, + table=False, + layout=None, + include_bool=False, + **kwds, + ): + + import matplotlib.pyplot as plt + + self.data = data + self.by = by + + self.kind = kind + + self.sort_columns = sort_columns + + self.subplots = subplots + + if sharex is None: + if ax is None: + self.sharex = True + else: + # if we get an axis, the users should do the visibility + # setting... + self.sharex = False + else: + self.sharex = sharex + + self.sharey = sharey + self.figsize = figsize + self.layout = layout + + self.xticks = xticks + self.yticks = yticks + self.xlim = xlim + self.ylim = ylim + self.title = title + self.use_index = use_index + + self.fontsize = fontsize + + if rot is not None: + self.rot = rot + # need to know for format_date_labels since it's rotated to 30 by + # default + self._rot_set = True + else: + self._rot_set = False + self.rot = self._default_rot + + if grid is None: + grid = False if secondary_y else plt.rcParams["axes.grid"] + + self.grid = grid + self.legend = legend + self.legend_handles = [] + self.legend_labels = [] + + for attr in self._pop_attributes: + value = kwds.pop(attr, self._attr_defaults.get(attr, None)) + setattr(self, attr, value) + + self.ax = ax + self.fig = fig + self.axes = None + + # parse errorbar input if given + xerr = kwds.pop("xerr", None) + yerr = kwds.pop("yerr", None) + self.errors = { + kw: self._parse_errorbars(kw, err) + for kw, err in zip(["xerr", "yerr"], [xerr, yerr]) + } + + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndexClass)): + secondary_y = [secondary_y] + self.secondary_y = secondary_y + + # ugly TypeError if user passes matplotlib's `cmap` name. + # Probably better to accept either. 
+ if "cmap" in kwds and colormap: + raise TypeError("Only specify one of `cmap` and `colormap`.") + elif "cmap" in kwds: + self.colormap = kwds.pop("cmap") + else: + self.colormap = colormap + + self.table = table + self.include_bool = include_bool + + self.kwds = kwds + + self._validate_color_args() + + def _validate_color_args(self): + import matplotlib.colors + + if ( + "color" in self.kwds + and self.nseries == 1 + and not is_list_like(self.kwds["color"]) + ): + # support series.plot(color='green') + self.kwds["color"] = [self.kwds["color"]] + + if ( + "color" in self.kwds + and isinstance(self.kwds["color"], tuple) + and self.nseries == 1 + and len(self.kwds["color"]) in (3, 4) + ): + # support RGB and RGBA tuples in series plot + self.kwds["color"] = [self.kwds["color"]] + + if ( + "color" in self.kwds or "colors" in self.kwds + ) and self.colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. Using 'color'" + ) + + if "color" in self.kwds and self.style is not None: + if is_list_like(self.style): + styles = self.style + else: + styles = [self.style] + # need only a single match + for s in styles: + for char in s: + if char in matplotlib.colors.BASE_COLORS: + raise ValueError( + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. Please use one or the other or " + "pass 'style' without a color symbol" + ) + + def _iter_data(self, data=None, keep_index=False, fillna=None): + if data is None: + data = self.data + if fillna is not None: + data = data.fillna(fillna) + + for col, values in data.items(): + if keep_index is True: + yield col, values + else: + yield col, values.values + + @property + def nseries(self): + if self.data.ndim == 1: + return 1 + else: + return self.data.shape[1] + + def draw(self): + self.plt.draw_if_interactive() + + def generate(self): + self._args_adjust() + self._compute_plot_data() + self._setup_subplots() + self._make_plot() + self._add_table() + self._make_legend() + self._adorn_subplots() + + for ax in self.axes: + self._post_plot_logic_common(ax, self.data) + self._post_plot_logic(ax, self.data) + + def _args_adjust(self): + pass + + def _has_plotted_object(self, ax): + """check whether ax has data""" + return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 + + def _maybe_right_yaxis(self, ax, axes_num): + if not self.on_right(axes_num): + # secondary axes may be passed via ax kw + return self._get_ax_layer(ax) + + if hasattr(ax, "right_ax"): + # if it has right_ax proparty, ``ax`` must be left axes + return ax.right_ax + elif hasattr(ax, "left_ax"): + # if it has left_ax proparty, ``ax`` must be right axes + return ax + else: + # otherwise, create twin axes + orig_ax, new_ax = ax, ax.twinx() + # TODO: use Matplotlib public API when available + new_ax._get_lines = orig_ax._get_lines + new_ax._get_patches_for_fill = orig_ax._get_patches_for_fill + orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax + + if not self._has_plotted_object(orig_ax): # no data on left y + orig_ax.get_yaxis().set_visible(False) + + if self.logy is True or self.loglog is True: + new_ax.set_yscale("log") + elif self.logy == "sym" or self.loglog == "sym": + new_ax.set_yscale("symlog") + return new_ax + + def _setup_subplots(self): + if self.subplots: + fig, axes = _subplots( + naxes=self.nseries, + sharex=self.sharex, + sharey=self.sharey, + figsize=self.figsize, + ax=self.ax, + layout=self.layout, + layout_type=self._layout_type, + ) + else: + if self.ax is None: + fig = 
self.plt.figure(figsize=self.figsize) + axes = fig.add_subplot(111) + else: + fig = self.ax.get_figure() + if self.figsize is not None: + fig.set_size_inches(self.figsize) + axes = self.ax + + axes = _flatten(axes) + + valid_log = {False, True, "sym", None} + input_log = {self.logx, self.logy, self.loglog} + if input_log - valid_log: + invalid_log = next(iter((input_log - valid_log))) + raise ValueError( + f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." + ) + + if self.logx is True or self.loglog is True: + [a.set_xscale("log") for a in axes] + elif self.logx == "sym" or self.loglog == "sym": + [a.set_xscale("symlog") for a in axes] + + if self.logy is True or self.loglog is True: + [a.set_yscale("log") for a in axes] + elif self.logy == "sym" or self.loglog == "sym": + [a.set_yscale("symlog") for a in axes] + + self.fig = fig + self.axes = axes + + @property + def result(self): + """ + Return result axes + """ + if self.subplots: + if self.layout is not None and not is_list_like(self.ax): + return self.axes.reshape(*self.layout) + else: + return self.axes + else: + sec_true = isinstance(self.secondary_y, bool) and self.secondary_y + all_sec = ( + is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries + ) + if sec_true or all_sec: + # if all data is plotted on secondary, return right axes + return self._get_ax_layer(self.axes[0], primary=False) + else: + return self.axes[0] + + def _compute_plot_data(self): + data = self.data + + if isinstance(data, ABCSeries): + label = self.label + if label is None and data.name is None: + label = "None" + data = data.to_frame(name=label) + + # GH16953, _convert is needed as fallback, for ``Series`` + # with ``dtype == object`` + data = data._convert(datetime=True, timedelta=True) + include_type = [np.number, "datetime", "datetimetz", "timedelta"] + + # GH23719, allow plotting boolean + if self.include_bool is True: + include_type.append(np.bool_) + + # GH22799, exclude datatime-like type for boxplot + exclude_type = None + if self._kind == "box": + # TODO: change after solving issue 27881 + include_type = [np.number] + exclude_type = ["timedelta"] + + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category"]) + + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) + + try: + is_empty = numeric_data.columns.empty + except AttributeError: + is_empty = not len(numeric_data) + + # no non-numeric frames or series allowed + if is_empty: + raise TypeError("no numeric data to plot") + + # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to + # np.ndarray before plot. 
+ numeric_data = numeric_data.copy() + for col in numeric_data: + numeric_data[col] = np.asarray(numeric_data[col]) + + self.data = numeric_data + + def _make_plot(self): + raise AbstractMethodError(self) + + def _add_table(self): + if self.table is False: + return + elif self.table is True: + data = self.data.transpose() + else: + data = self.table + ax = self._get_ax(0) + table(ax, data) + + def _post_plot_logic_common(self, ax, data): + """Common post process for each axes""" + + if self.orientation == "vertical" or self.orientation is None: + self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) + self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) + + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + + elif self.orientation == "horizontal": + self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) + self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) + + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + else: # pragma no cover + raise ValueError + + def _post_plot_logic(self, ax, data): + """Post process for each axes. Overridden in child classes""" + pass + + def _adorn_subplots(self): + """Common post process unrelated to data""" + if len(self.axes) > 0: + all_axes = self._get_subplots() + nrows, ncols = self._get_axes_layout() + _handle_shared_axes( + axarr=all_axes, + nplots=len(all_axes), + naxes=nrows * ncols, + nrows=nrows, + ncols=ncols, + sharex=self.sharex, + sharey=self.sharey, + ) + + for ax in self.axes: + if self.yticks is not None: + ax.set_yticks(self.yticks) + + if self.xticks is not None: + ax.set_xticks(self.xticks) + + if self.ylim is not None: + ax.set_ylim(self.ylim) + + if self.xlim is not None: + ax.set_xlim(self.xlim) + + ax.grid(self.grid) + + if self.title: + if self.subplots: + if is_list_like(self.title): + if len(self.title) != self.nseries: + raise ValueError( + "The length of `title` must equal the number " + "of columns if using `title` of type `list` " + "and `subplots=True`.\n" + f"length of title = {len(self.title)}\n" + f"number of columns = {self.nseries}" + ) + + for (ax, title) in zip(self.axes, self.title): + ax.set_title(title) + else: + self.fig.suptitle(self.title) + else: + if is_list_like(self.title): + msg = ( + "Using `title` of type `list` is not supported " + "unless `subplots=True` is passed" + ) + raise ValueError(msg) + self.axes[0].set_title(self.title) + + def _apply_axis_properties(self, axis, rot=None, fontsize=None): + """ Tick creation within matplotlib is reasonably expensive and is + internally deferred until accessed as Ticks are created/destroyed + multiple times per draw. It's therefore beneficial for us to avoid + accessing unless we will act on the Tick. 
+ """ + + if rot is not None or fontsize is not None: + # rot=0 is a valid setting, hence the explicit None check + labels = axis.get_majorticklabels() + axis.get_minorticklabels() + for label in labels: + if rot is not None: + label.set_rotation(rot) + if fontsize is not None: + label.set_fontsize(fontsize) + + @property + def legend_title(self): + if not isinstance(self.data.columns, ABCMultiIndex): + name = self.data.columns.name + if name is not None: + name = pprint_thing(name) + return name + else: + stringified = map(pprint_thing, self.data.columns.names) + return ",".join(stringified) + + def _add_legend_handle(self, handle, label, index=None): + if label is not None: + if self.mark_right and index is not None: + if self.on_right(index): + label = label + " (right)" + self.legend_handles.append(handle) + self.legend_labels.append(label) + + def _make_legend(self): + ax, leg, handle = self._get_ax_legend_handle(self.axes[0]) + + handles = [] + labels = [] + title = "" + + if not self.subplots: + if leg is not None: + title = leg.get_title().get_text() + # Replace leg.LegendHandles because it misses marker info + handles.extend(handle) + labels = [x.get_text() for x in leg.get_texts()] + + if self.legend: + if self.legend == "reverse": + self.legend_handles = reversed(self.legend_handles) + self.legend_labels = reversed(self.legend_labels) + + handles += self.legend_handles + labels += self.legend_labels + + if self.legend_title is not None: + title = self.legend_title + + if len(handles) > 0: + ax.legend(handles, labels, loc="best", title=title) + + elif self.subplots and self.legend: + for ax in self.axes: + if ax.get_visible(): + ax.legend(loc="best") + + def _get_ax_legend_handle(self, ax): + """ + Take in axes and return ax, legend and handle under different scenarios + """ + leg = ax.get_legend() + + # Get handle from axes + handle, _ = ax.get_legend_handles_labels() + other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None) + other_leg = None + if other_ax is not None: + other_leg = other_ax.get_legend() + if leg is None and other_leg is not None: + leg = other_leg + ax = other_ax + return ax, leg, handle + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + + return plt + + _need_to_set_index = False + + def _get_xticks(self, convert_period=False): + index = self.data.index + is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") + + if self.use_index: + if convert_period and isinstance(index, ABCPeriodIndex): + self.data = self.data.reindex(index=index.sort_values()) + x = self.data.index.to_timestamp()._mpl_repr() + elif index.is_numeric(): + """ + Matplotlib supports numeric values or datetime objects as + xaxis values. Taking LBYL approach here, by the time + matplotlib raises exception when using non numeric/datetime + values for xaxis, several actions are already taken by plt. 
+ """ + x = index._mpl_repr() + elif is_datetype: + self.data = self.data[notna(self.data.index)] + self.data = self.data.sort_index() + x = self.data.index._mpl_repr() + else: + self._need_to_set_index = True + x = list(range(len(index))) + else: + x = list(range(len(index))) + + return x + + @classmethod + @register_pandas_matplotlib_converters + def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): + mask = isna(y) + if mask.any(): + y = np.ma.array(y) + y = np.ma.masked_where(mask, y) + + if isinstance(x, ABCIndexClass): + x = x._mpl_repr() + + if is_errorbar: + if "xerr" in kwds: + kwds["xerr"] = np.array(kwds.get("xerr")) + if "yerr" in kwds: + kwds["yerr"] = np.array(kwds.get("yerr")) + return ax.errorbar(x, y, **kwds) + else: + # prevent style kwarg from going to errorbar, where it is + # unsupported + if style is not None: + args = (x, y, style) + else: + args = (x, y) + return ax.plot(*args, **kwds) + + def _get_index_name(self): + if isinstance(self.data.index, ABCMultiIndex): + name = self.data.index.names + if com.any_not_none(*name): + name = ",".join(pprint_thing(x) for x in name) + else: + name = None + else: + name = self.data.index.name + if name is not None: + name = pprint_thing(name) + + return name + + @classmethod + def _get_ax_layer(cls, ax, primary=True): + """get left (primary) or right (secondary) axes""" + if primary: + return getattr(ax, "left_ax", ax) + else: + return getattr(ax, "right_ax", ax) + + def _get_ax(self, i): + # get the twinx ax if appropriate + if self.subplots: + ax = self.axes[i] + ax = self._maybe_right_yaxis(ax, i) + self.axes[i] = ax + else: + ax = self.axes[0] + ax = self._maybe_right_yaxis(ax, i) + + ax.get_yaxis().set_visible(True) + return ax + + @classmethod + def get_default_ax(cls, ax): + import matplotlib.pyplot as plt + + if ax is None and len(plt.get_fignums()) > 0: + with plt.rc_context(): + ax = plt.gca() + ax = cls._get_ax_layer(ax) + + def on_right(self, i): + if isinstance(self.secondary_y, bool): + return self.secondary_y + + if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndexClass)): + return self.data.columns[i] in self.secondary_y + + def _apply_style_colors(self, colors, kwds, col_num, label): + """ + Manage style and color based on column number and its label. + Returns tuple of appropriate style and kwds which "color" may be added. 
+ """ + style = None + if self.style is not None: + if isinstance(self.style, list): + try: + style = self.style[col_num] + except IndexError: + pass + elif isinstance(self.style, dict): + style = self.style.get(label, style) + else: + style = self.style + + has_color = "color" in kwds or self.colormap is not None + nocolor_style = style is None or re.match("[a-z]+", style) is None + if (has_color or self.subplots) and nocolor_style: + kwds["color"] = colors[col_num % len(colors)] + return style, kwds + + def _get_colors(self, num_colors=None, color_kwds="color"): + if num_colors is None: + num_colors = self.nseries + + return _get_standard_colors( + num_colors=num_colors, + colormap=self.colormap, + color=self.kwds.get(color_kwds), + ) + + def _parse_errorbars(self, label, err): + """ + Look for error keyword arguments and return the actual errorbar data + or return the error DataFrame/dict + + Error bars can be specified in several ways: + Series: the user provides a pandas.Series object of the same + length as the data + ndarray: provides a np.ndarray of the same length as the data + DataFrame/dict: error values are paired with keys matching the + key in the plotted DataFrame + str: the name of the column within the plotted DataFrame + """ + + if err is None: + return None + + def match_labels(data, e): + e = e.reindex(data.index) + return e + + # key-matched DataFrame + if isinstance(err, ABCDataFrame): + + err = match_labels(self.data, err) + # key-matched dict + elif isinstance(err, dict): + pass + + # Series of error values + elif isinstance(err, ABCSeries): + # broadcast error series across data + err = match_labels(self.data, err) + err = np.atleast_2d(err) + err = np.tile(err, (self.nseries, 1)) + + # errors are a column in the dataframe + elif isinstance(err, str): + evalues = self.data[err].values + self.data = self.data[self.data.columns.drop(err)] + err = np.atleast_2d(evalues) + err = np.tile(err, (self.nseries, 1)) + + elif is_list_like(err): + if is_iterator(err): + err = np.atleast_2d(list(err)) + else: + # raw error values + err = np.atleast_2d(err) + + err_shape = err.shape + + # asymmetrical error bars + if err.ndim == 3: + if ( + (err_shape[0] != self.nseries) + or (err_shape[1] != 2) + or (err_shape[2] != len(self.data)) + ): + raise ValueError( + "Asymmetrical error bars should be provided " + f"with the shape ({self.nseries}, 2, {len(self.data)})" + ) + + # broadcast errors to each data series + if len(err) == 1: + err = np.tile(err, (self.nseries, 1)) + + elif is_number(err): + err = np.tile([err], (self.nseries, len(self.data))) + + else: + msg = f"No valid {label} detected" + raise ValueError(msg) + + return err + + def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): + errors = {} + + for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): + if flag: + err = self.errors[kw] + # user provided label-matched dataframe of errors + if isinstance(err, (ABCDataFrame, dict)): + if label is not None and label in err.keys(): + err = err[label] + else: + err = None + elif index is not None and err is not None: + err = err[index] + + if err is not None: + errors[kw] = err + return errors + + def _get_subplots(self): + from matplotlib.axes import Subplot + + return [ + ax for ax in self.axes[0].get_figure().get_axes() if isinstance(ax, Subplot) + ] + + def _get_axes_layout(self): + axes = self._get_subplots() + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + 
x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + +class PlanePlot(MPLPlot): + """ + Abstract class for plotting on plane, currently scatter and hexbin. + """ + + _layout_type = "single" + + def __init__(self, data, x, y, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + if x is None or y is None: + raise ValueError(self._kind + " requires an x and y column") + if is_integer(x) and not self.data.columns.holds_integer(): + x = self.data.columns[x] + if is_integer(y) and not self.data.columns.holds_integer(): + y = self.data.columns[y] + + # Scatter plot allows to plot objects data + if self._kind == "hexbin": + if len(self.data[x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") + + self.x = x + self.y = y + + @property + def nseries(self): + return 1 + + def _post_plot_logic(self, ax, data): + x, y = self.x, self.y + ax.set_ylabel(pprint_thing(y)) + ax.set_xlabel(pprint_thing(x)) + + def _plot_colorbar(self, ax, **kwds): + # Addresses issues #10611 and #10678: + # When plotting scatterplots and hexbinplots in IPython + # inline backend the colorbar axis height tends not to + # exactly match the parent axis height. + # The difference is due to small fractional differences + # in floating points with similar representation. + # To deal with this, this method forces the colorbar + # height to take the height of the parent axes. + # For a more detailed description of the issue + # see the following link: + # https://github.com/ipython/ipython/issues/11215 + img = ax.collections[0] + cbar = self.fig.colorbar(img, ax=ax, **kwds) + + if _mpl_ge_3_0_0(): + # The workaround below is no longer necessary. + return + + points = ax.get_position().get_points() + cbar_points = cbar.ax.get_position().get_points() + + cbar.ax.set_position( + [ + cbar_points[0, 0], + points[0, 1], + cbar_points[1, 0] - cbar_points[0, 0], + points[1, 1] - points[0, 1], + ] + ) + # To see the discrepancy in axis heights uncomment + # the following two lines: + # print(points[1, 1] - points[0, 1]) + # print(cbar_points[1, 1] - cbar_points[0, 1]) + + +class ScatterPlot(PlanePlot): + _kind = "scatter" + + def __init__(self, data, x, y, s=None, c=None, **kwargs): + if s is None: + # hide the matplotlib default for size, in case we want to change + # the handling of this argument later + s = 20 + super().__init__(data, x, y, s=s, **kwargs) + if is_integer(c) and not self.data.columns.holds_integer(): + c = self.data.columns[c] + self.c = c + + def _make_plot(self): + x, y, c, data = self.x, self.y, self.c, self.data + ax = self.axes[0] + + c_is_column = is_hashable(c) and c in self.data.columns + + # plot a colorbar only if a colormap is provided or necessary + cb = self.kwds.pop("colorbar", self.colormap or c_is_column) + + # pandas uses colormap, matplotlib uses cmap. 
+ cmap = self.colormap or "Greys" + cmap = self.plt.cm.get_cmap(cmap) + color = self.kwds.pop("color", None) + if c is not None and color is not None: + raise TypeError("Specify exactly one of `c` and `color`") + elif c is None and color is None: + c_values = self.plt.rcParams["patch.facecolor"] + elif color is not None: + c_values = color + elif c_is_column: + c_values = self.data[c].values + else: + c_values = c + + if self.legend and hasattr(self, "label"): + label = self.label + else: + label = None + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + **self.kwds, + ) + if cb: + cbar_label = c if c_is_column else "" + self._plot_colorbar(ax, label=cbar_label) + + if label is not None: + self._add_legend_handle(scatter, label) + else: + self.legend = False + + errors_x = self._get_errorbars(label=x, index=0, yerr=False) + errors_y = self._get_errorbars(label=y, index=0, xerr=False) + if len(errors_x) > 0 or len(errors_y) > 0: + err_kwds = dict(errors_x, **errors_y) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) + + +class HexBinPlot(PlanePlot): + _kind = "hexbin" + + def __init__(self, data, x, y, C=None, **kwargs): + super().__init__(data, x, y, **kwargs) + if is_integer(C) and not self.data.columns.holds_integer(): + C = self.data.columns[C] + self.C = C + + def _make_plot(self): + x, y, data, C = self.x, self.y, self.data, self.C + ax = self.axes[0] + # pandas uses colormap, matplotlib uses cmap. + cmap = self.colormap or "BuGn" + cmap = self.plt.cm.get_cmap(cmap) + cb = self.kwds.pop("colorbar", True) + + if C is None: + c_values = None + else: + c_values = data[C].values + + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds) + if cb: + self._plot_colorbar(ax) + + def _make_legend(self): + pass + + +class LinePlot(MPLPlot): + _kind = "line" + _default_rot = 0 + orientation = "vertical" + + def __init__(self, data, **kwargs): + from pandas.plotting import plot_params + + MPLPlot.__init__(self, data, **kwargs) + if self.stacked: + self.data = self.data.fillna(value=0) + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) + + def _is_ts_plot(self): + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + def _use_dynamic_x(self): + from pandas.plotting._matplotlib.timeseries import _use_dynamic_x + + return _use_dynamic_x(self._get_ax(0), self.data) + + def _make_plot(self): + if self._is_ts_plot(): + from pandas.plotting._matplotlib.timeseries import _maybe_convert_index + + data = _maybe_convert_index(self._get_ax(0), self.data) + + x = data.index # dummy, not used + plotf = self._ts_plot + it = self._iter_data(data=data, keep_index=True) + else: + x = self._get_xticks(convert_period=True) + plotf = self._plot + it = self._iter_data() + + stacking_id = self._get_stacking_id() + is_errorbar = com.any_not_none(*self.errors.values()) + + colors = self._get_colors() + for i, (label, y) in enumerate(it): + ax = self._get_ax(i) + kwds = self.kwds.copy() + style, kwds = self._apply_style_colors(colors, kwds, i, label) + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = pprint_thing(label) # .encode('utf-8') + kwds["label"] = label + + newlines = plotf( + ax, + x, + y, + style=style, + column_num=i, + stacking_id=stacking_id, + is_errorbar=is_errorbar, + **kwds, + ) + 
self._add_legend_handle(newlines[0], label, index=i) + + if self._is_ts_plot(): + + # reset of xlim should be used for ts data + # TODO: GH28021, should find a way to change view limit on xaxis + lines = _get_all_lines(ax) + left, right = _get_xlim(lines) + ax.set_xlim(left, right) + + @classmethod + def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): + # column_num is used to get the target column from protf in line and + # area plots + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(y)) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) + lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) + cls._update_stacker(ax, stacking_id, y) + return lines + + @classmethod + def _ts_plot(cls, ax, x, data, style=None, **kwds): + from pandas.plotting._matplotlib.timeseries import ( + _maybe_resample, + _decorate_axes, + format_dateaxis, + ) + + # accept x to be consistent with normal plot func, + # x is not passed to tsplot as it uses data.index as x coordinate + # column_num must be in kwds for stacking purpose + freq, data = _maybe_resample(data, ax, kwds) + + # Set ax with freq info + _decorate_axes(ax, freq, kwds) + # digging deeper + if hasattr(ax, "left_ax"): + _decorate_axes(ax.left_ax, freq, kwds) + if hasattr(ax, "right_ax"): + _decorate_axes(ax.right_ax, freq, kwds) + ax._plot_data.append((data, cls._kind, kwds)) + + lines = cls._plot(ax, data.index, data.values, style=style, **kwds) + # set date formatter, locators and rescale limits + format_dateaxis(ax, ax.freq, data.index) + return lines + + def _get_stacking_id(self): + if self.stacked: + return id(self.data) + else: + return None + + @classmethod + def _initialize_stacker(cls, ax, stacking_id, n): + if stacking_id is None: + return + if not hasattr(ax, "_stacker_pos_prior"): + ax._stacker_pos_prior = {} + if not hasattr(ax, "_stacker_neg_prior"): + ax._stacker_neg_prior = {} + ax._stacker_pos_prior[stacking_id] = np.zeros(n) + ax._stacker_neg_prior[stacking_id] = np.zeros(n) + + @classmethod + def _get_stacked_values(cls, ax, stacking_id, values, label): + if stacking_id is None: + return values + if not hasattr(ax, "_stacker_pos_prior"): + # stacker may not be initialized for subplots + cls._initialize_stacker(ax, stacking_id, len(values)) + + if (values >= 0).all(): + return ax._stacker_pos_prior[stacking_id] + values + elif (values <= 0).all(): + return ax._stacker_neg_prior[stacking_id] + values + + raise ValueError( + "When stacked is True, each column must be either " + "all positive or negative." + f"{label} contains both positive and negative values" + ) + + @classmethod + def _update_stacker(cls, ax, stacking_id, values): + if stacking_id is None: + return + if (values >= 0).all(): + ax._stacker_pos_prior[stacking_id] += values + elif (values <= 0).all(): + ax._stacker_neg_prior[stacking_id] += values + + def _post_plot_logic(self, ax, data): + from matplotlib.ticker import FixedLocator + + def get_label(i): + try: + return pprint_thing(data.index[i]) + except Exception: + return "" + + if self._need_to_set_index: + xticks = ax.get_xticks() + xticklabels = [get_label(x) for x in xticks] + ax.set_xticklabels(xticklabels) + ax.xaxis.set_major_locator(FixedLocator(xticks)) + + condition = ( + not self._use_dynamic_x() + and data.index.is_all_dates + and not self.subplots + or (self.subplots and self.sharex) + ) + + index_name = self._get_index_name() + + if condition: + # irregular TS rotated 30 deg. 
by default + # probably a better place to check / set this. + if not self._rot_set: + self.rot = 30 + format_date_labels(ax, rot=self.rot) + + if index_name is not None and self.use_index: + ax.set_xlabel(index_name) + + +class AreaPlot(LinePlot): + _kind = "area" + + def __init__(self, data, **kwargs): + kwargs.setdefault("stacked", True) + data = data.fillna(value=0) + LinePlot.__init__(self, data, **kwargs) + + if not self.stacked: + # use smaller alpha to distinguish overlap + self.kwds.setdefault("alpha", 0.5) + + if self.logy or self.loglog: + raise ValueError("Log-y scales are not supported in area plot") + + @classmethod + def _plot( + cls, + ax, + x, + y, + style=None, + column_num=None, + stacking_id=None, + is_errorbar=False, + **kwds, + ): + + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(y)) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) + + # need to remove label, because subplots uses mpl legend as it is + line_kwds = kwds.copy() + line_kwds.pop("label") + lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds) + + # get data from the line to get coordinates for fill_between + xdata, y_values = lines[0].get_data(orig=False) + + # unable to use ``_get_stacked_values`` here to get starting point + if stacking_id is None: + start = np.zeros(len(y)) + elif (y >= 0).all(): + start = ax._stacker_pos_prior[stacking_id] + elif (y <= 0).all(): + start = ax._stacker_neg_prior[stacking_id] + else: + start = np.zeros(len(y)) + + if "color" not in kwds: + kwds["color"] = lines[0].get_color() + + rect = ax.fill_between(xdata, start, y_values, **kwds) + cls._update_stacker(ax, stacking_id, y) + + # LinePlot expects list of artists + res = [rect] + return res + + def _post_plot_logic(self, ax, data): + LinePlot._post_plot_logic(self, ax, data) + + if self.ylim is None: + if (data >= 0).all().all(): + ax.set_ylim(0, None) + elif (data <= 0).all().all(): + ax.set_ylim(None, 0) + + +class BarPlot(MPLPlot): + _kind = "bar" + _default_rot = 90 + orientation = "vertical" + + def __init__(self, data, **kwargs): + # we have to treat a series differently than a + # 1-column DataFrame w.r.t. 
color handling + self._is_series = isinstance(data, ABCSeries) + self.bar_width = kwargs.pop("width", 0.5) + pos = kwargs.pop("position", 0.5) + kwargs.setdefault("align", "center") + self.tick_pos = np.arange(len(data)) + + self.bottom = kwargs.pop("bottom", 0) + self.left = kwargs.pop("left", 0) + + self.log = kwargs.pop("log", False) + MPLPlot.__init__(self, data, **kwargs) + + if self.stacked or self.subplots: + self.tickoffset = self.bar_width * pos + if kwargs["align"] == "edge": + self.lim_offset = self.bar_width / 2 + else: + self.lim_offset = 0 + else: + if kwargs["align"] == "edge": + w = self.bar_width / self.nseries + self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5 + self.lim_offset = w * 0.5 + else: + self.tickoffset = self.bar_width * pos + self.lim_offset = 0 + + self.ax_pos = self.tick_pos - self.tickoffset + + def _args_adjust(self): + if is_list_like(self.bottom): + self.bottom = np.array(self.bottom) + if is_list_like(self.left): + self.left = np.array(self.left) + + @classmethod + def _plot(cls, ax, x, y, w, start=0, log=False, **kwds): + return ax.bar(x, y, w, bottom=start, log=log, **kwds) + + @property + def _start_base(self): + return self.bottom + + def _make_plot(self): + import matplotlib as mpl + + colors = self._get_colors() + ncolors = len(colors) + + pos_prior = neg_prior = np.zeros(len(self.data)) + K = self.nseries + + for i, (label, y) in enumerate(self._iter_data(fillna=0)): + ax = self._get_ax(i) + kwds = self.kwds.copy() + if self._is_series: + kwds["color"] = colors + else: + kwds["color"] = colors[i % ncolors] + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = pprint_thing(label) + + if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None): + kwds["ecolor"] = mpl.rcParams["xtick.color"] + + start = 0 + if self.log and (y >= 1).all(): + start = 1 + start = start + self._start_base + + if self.subplots: + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + ax.set_title(label) + elif self.stacked: + mask = y > 0 + start = np.where(mask, pos_prior, neg_prior) + self._start_base + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + pos_prior = pos_prior + np.where(mask, y, 0) + neg_prior = neg_prior + np.where(mask, 0, y) + else: + w = self.bar_width / K + rect = self._plot( + ax, + self.ax_pos + (i + 0.5) * w, + y, + w, + start=start, + label=label, + log=self.log, + **kwds, + ) + self._add_legend_handle(rect, label, index=i) + + def _post_plot_logic(self, ax, data): + if self.use_index: + str_index = [pprint_thing(key) for key in data.index] + else: + str_index = [pprint_thing(key) for key in range(data.shape[0])] + name = self._get_index_name() + + s_edge = self.ax_pos[0] - 0.25 + self.lim_offset + e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset + + self._decorate_ticks(ax, name, str_index, s_edge, e_edge) + + def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): + ax.set_xlim((start_edge, end_edge)) + + if self.xticks is not None: + ax.set_xticks(np.array(self.xticks)) + else: + ax.set_xticks(self.tick_pos) + ax.set_xticklabels(ticklabels) + + if name is not None and self.use_index: + ax.set_xlabel(name) + + +class BarhPlot(BarPlot): + _kind = "barh" + _default_rot = 0 + orientation = "horizontal" + + @property + def _start_base(self): + return 
self.left + + @classmethod + def _plot(cls, ax, x, y, w, start=0, log=False, **kwds): + return ax.barh(x, y, w, left=start, log=log, **kwds) + + def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): + # horizontal bars + ax.set_ylim((start_edge, end_edge)) + ax.set_yticks(self.tick_pos) + ax.set_yticklabels(ticklabels) + if name is not None and self.use_index: + ax.set_ylabel(name) + + +class PiePlot(MPLPlot): + _kind = "pie" + _layout_type = "horizontal" + + def __init__(self, data, kind=None, **kwargs): + data = data.fillna(value=0) + if (data < 0).any().any(): + raise ValueError(f"{kind} doesn't allow negative values") + MPLPlot.__init__(self, data, kind=kind, **kwargs) + + def _args_adjust(self): + self.grid = False + self.logy = False + self.logx = False + self.loglog = False + + def _validate_color_args(self): + pass + + def _make_plot(self): + colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") + self.kwds.setdefault("colors", colors) + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + if label is not None: + label = pprint_thing(label) + ax.set_ylabel(label) + + kwds = self.kwds.copy() + + def blank_labeler(label, value): + if value == 0: + return "" + else: + return label + + idx = [pprint_thing(v) for v in self.data.index] + labels = kwds.pop("labels", idx) + # labels is used for each wedge's labels + # Blank out labels for values of 0 so they don't overlap + # with nonzero wedges + if labels is not None: + blabels = [blank_labeler(l, value) for l, value in zip(labels, y)] + else: + blabels = None + results = ax.pie(y, labels=blabels, **kwds) + + if kwds.get("autopct", None) is not None: + patches, texts, autotexts = results + else: + patches, texts = results + autotexts = [] + + if self.fontsize is not None: + for t in texts + autotexts: + t.set_fontsize(self.fontsize) + + # leglabels is used for legend labels + leglabels = labels if labels is not None else idx + for p, l in zip(patches, leglabels): + self._add_legend_handle(p, l) diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/hist.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/hist.py new file mode 100644 index 0000000..f8b2c7a --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/hist.py @@ -0,0 +1,413 @@ +import numpy as np + +from pandas.core.dtypes.common import is_integer, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass +from pandas.core.dtypes.missing import isna, remove_na_arraylike + +import pandas.core.common as com + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.core import LinePlot, MPLPlot +from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots + + +class HistPlot(LinePlot): + _kind = "hist" + + def __init__(self, data, bins=10, bottom=0, **kwargs): + self.bins = bins # use mpl default + self.bottom = bottom + # Do not call LinePlot.__init__ which may fill nan + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if is_integer(self.bins): + # create common bin edge + values = self.data._convert(datetime=True)._get_numeric_data() + values = np.ravel(values) + values = values[~isna(values)] + + _, self.bins = np.histogram( + values, + bins=self.bins, + range=self.kwds.get("range", None), + weights=self.kwds.get("weights", None), + ) + + if is_list_like(self.bottom): + self.bottom = np.array(self.bottom) + + @classmethod + def _plot( + cls, + ax, + y, + style=None, + bins=None, + bottom=0, + 
column_num=0, + stacking_id=None, + **kwds, + ): + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(bins) - 1) + y = y[~isna(y)] + + base = np.zeros(len(bins) - 1) + bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) + # ignore style + n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds) + cls._update_stacker(ax, stacking_id, n) + return patches + + def _make_plot(self): + colors = self._get_colors() + stacking_id = self._get_stacking_id() + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + + kwds = self.kwds.copy() + + label = pprint_thing(label) + kwds["label"] = label + + style, kwds = self._apply_style_colors(colors, kwds, i, label) + if style is not None: + kwds["style"] = style + + kwds = self._make_plot_keywords(kwds, y) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) + self._add_legend_handle(artists[0], label, index=i) + + def _make_plot_keywords(self, kwds, y): + """merge BoxPlot/KdePlot properties to passed kwds""" + # y is required for KdePlot + kwds["bottom"] = self.bottom + kwds["bins"] = self.bins + return kwds + + def _post_plot_logic(self, ax, data): + if self.orientation == "horizontal": + ax.set_xlabel("Frequency") + else: + ax.set_ylabel("Frequency") + + @property + def orientation(self): + if self.kwds.get("orientation", None) == "horizontal": + return "horizontal" + else: + return "vertical" + + +class KdePlot(HistPlot): + _kind = "kde" + orientation = "vertical" + + def __init__(self, data, bw_method=None, ind=None, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + self.bw_method = bw_method + self.ind = ind + + def _args_adjust(self): + pass + + def _get_ind(self, y): + if self.ind is None: + # np.nanmax() and np.nanmin() ignores the missing values + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + 1000, + ) + elif is_integer(self.ind): + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + self.ind, + ) + else: + ind = self.ind + return ind + + @classmethod + def _plot( + cls, + ax, + y, + style=None, + bw_method=None, + ind=None, + column_num=None, + stacking_id=None, + **kwds, + ): + from scipy.stats import gaussian_kde + + y = remove_na_arraylike(y) + gkde = gaussian_kde(y, bw_method=bw_method) + + y = gkde.evaluate(ind) + lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) + return lines + + def _make_plot_keywords(self, kwds, y): + kwds["bw_method"] = self.bw_method + kwds["ind"] = self._get_ind(y) + return kwds + + def _post_plot_logic(self, ax, data): + ax.set_ylabel("Density") + + +def _grouped_plot( + plotf, + data, + column=None, + by=None, + numeric_only=True, + figsize=None, + sharex=True, + sharey=True, + layout=None, + rot=0, + ax=None, + **kwargs, +): + + if figsize == "default": + # allowed to specify mpl default with 'default' + raise ValueError( + "figsize='default' is no longer supported. 
" + "Specify figure size by tuple instead" + ) + + grouped = data.groupby(by) + if column is not None: + grouped = grouped[column] + + naxes = len(grouped) + fig, axes = _subplots( + naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout + ) + + _axes = _flatten(axes) + + for i, (key, group) in enumerate(grouped): + ax = _axes[i] + if numeric_only and isinstance(group, ABCDataFrame): + group = group._get_numeric_data() + plotf(group, ax, **kwargs) + ax.set_title(pprint_thing(key)) + + return fig, axes + + +def _grouped_hist( + data, + column=None, + by=None, + ax=None, + bins=50, + figsize=None, + layout=None, + sharex=False, + sharey=False, + rot=90, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + **kwargs, +): + """ + Grouped histogram + + Parameters + ---------- + data : Series/DataFrame + column : object, optional + by : object, optional + ax : axes, optional + bins : int, default 50 + figsize : tuple, optional + layout : optional + sharex : bool, default False + sharey : bool, default False + rot : int, default 90 + grid : bool, default True + kwargs : dict, keyword arguments passed to matplotlib.Axes.hist + + Returns + ------- + collection of Matplotlib Axes + """ + + def plot_group(group, ax): + ax.hist(group.dropna().values, bins=bins, **kwargs) + + if xrot is None: + xrot = rot + + fig, axes = _grouped_plot( + plot_group, + data, + column=column, + by=by, + sharex=sharex, + sharey=sharey, + ax=ax, + figsize=figsize, + layout=layout, + rot=rot, + ) + + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + + fig.subplots_adjust( + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) + return axes + + +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds, +): + import matplotlib.pyplot as plt + + if by is None: + if kwds.get("layout", None) is not None: + raise ValueError("The 'layout' keyword is not supported when 'by' is None") + # hack until the plotting interface is a bit more unified + fig = kwds.pop( + "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize) + ) + if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()): + fig.set_size_inches(*figsize, forward=True) + if ax is None: + ax = fig.gca() + elif ax.get_figure() != fig: + raise AssertionError("passed axis not bound to passed figure") + values = self.dropna().values + + ax.hist(values, bins=bins, **kwds) + ax.grid(grid) + axes = np.array([ax]) + + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + + else: + if "figure" in kwds: + raise ValueError( + "Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance " + "will be created" + ) + axes = _grouped_hist( + self, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds, + ) + + if hasattr(axes, "ndim"): + if axes.ndim == 1 and len(axes) == 1: + return axes[0] + return axes + + +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds, +): + if by is not None: + axes = _grouped_hist( + data, + column=column, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + sharex=sharex, + 
sharey=sharey, + layout=layout, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds, + ) + return axes + + if column is not None: + if not isinstance(column, (list, np.ndarray, ABCIndexClass)): + column = [column] + data = data[column] + data = data._get_numeric_data() + naxes = len(data.columns) + + if naxes == 0: + raise ValueError("hist method requires numerical columns, nothing to plot.") + + fig, axes = _subplots( + naxes=naxes, + ax=ax, + squeeze=False, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) + _axes = _flatten(axes) + + for i, col in enumerate(com.try_sort(data.columns)): + ax = _axes[i] + ax.hist(data[col].dropna().values, bins=bins, **kwds) + ax.set_title(col) + ax.grid(grid) + + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + fig.subplots_adjust(wspace=0.3, hspace=0.3) + + return axes diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/misc.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/misc.py new file mode 100644 index 0000000..0720f54 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/misc.py @@ -0,0 +1,431 @@ +import random + +import matplotlib.lines as mlines +import matplotlib.patches as patches +import numpy as np + +from pandas.core.dtypes.missing import notna + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.style import _get_standard_colors +from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots + + +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + hist_kwds=None, + range_padding=0.05, + **kwds, +): + df = frame._get_numeric_data() + n = df.columns.size + naxes = n * n + fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) + + # no gaps between subplots + fig.subplots_adjust(wspace=0, hspace=0) + + mask = notna(df) + + marker = _get_marker_compat(marker) + + hist_kwds = hist_kwds or {} + density_kwds = density_kwds or {} + + # GH 14855 + kwds.setdefault("edgecolors", "none") + + boundaries_list = [] + for a in df.columns: + values = df[a].values[mask[a].values] + rmin_, rmax_ = np.min(values), np.max(values) + rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0 + boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) + + for i, a in enumerate(df.columns): + for j, b in enumerate(df.columns): + ax = axes[i, j] + + if i == j: + values = df[a].values[mask[a].values] + + # Deal with the diagonal by drawing a histogram there. 
+ if diagonal == "hist": + ax.hist(values, **hist_kwds) + + elif diagonal in ("kde", "density"): + from scipy.stats import gaussian_kde + + y = values + gkde = gaussian_kde(y) + ind = np.linspace(y.min(), y.max(), 1000) + ax.plot(ind, gkde.evaluate(ind), **density_kwds) + + ax.set_xlim(boundaries_list[i]) + + else: + common = (mask[a] & mask[b]).values + + ax.scatter( + df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds + ) + + ax.set_xlim(boundaries_list[j]) + ax.set_ylim(boundaries_list[i]) + + ax.set_xlabel(b) + ax.set_ylabel(a) + + if j != 0: + ax.yaxis.set_visible(False) + if i != n - 1: + ax.xaxis.set_visible(False) + + if len(df.columns) > 1: + lim1 = boundaries_list[0] + locs = axes[0][1].yaxis.get_majorticklocs() + locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])] + adj = (locs - lim1[0]) / (lim1[1] - lim1[0]) + + lim0 = axes[0][0].get_ylim() + adj = adj * (lim0[1] - lim0[0]) + lim0[0] + axes[0][0].yaxis.set_ticks(adj) + + if np.all(locs == locs.astype(int)): + # if all ticks are int + locs = locs.astype(int) + axes[0][0].yaxis.set_ticklabels(locs) + + _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + return axes + + +def _get_marker_compat(marker): + if marker not in mlines.lineMarkers: + return "o" + return marker + + +def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): + import matplotlib.pyplot as plt + + def normalize(series): + a = min(series) + b = max(series) + return (series - a) / (b - a) + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + df = frame.drop(class_column, axis=1).apply(normalize) + + if ax is None: + ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) + + to_plot = {} + colors = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + + for kls in classes: + to_plot[kls] = [[], []] + + m = len(frame.columns) - 1 + s = np.array( + [ + (np.cos(t), np.sin(t)) + for t in [2.0 * np.pi * (i / float(m)) for i in range(m)] + ] + ) + + for i in range(n): + row = df.iloc[i].values + row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) + y = (s * row_).sum(axis=0) / row.sum() + kls = class_col.iat[i] + to_plot[kls][0].append(y[0]) + to_plot[kls][1].append(y[1]) + + for i, kls in enumerate(classes): + ax.scatter( + to_plot[kls][0], + to_plot[kls][1], + color=colors[i], + label=pprint_thing(kls), + **kwds, + ) + ax.legend() + + ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + + for xy, name in zip(s, df.columns): + + ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + + if xy[0] < 0.0 and xy[1] < 0.0: + ax.text( + xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" + ) + elif xy[0] < 0.0 and xy[1] >= 0.0: + ax.text( + xy[0] - 0.025, + xy[1] + 0.025, + name, + ha="right", + va="bottom", + size="small", + ) + elif xy[0] >= 0.0 and xy[1] < 0.0: + ax.text( + xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" + ) + elif xy[0] >= 0.0 and xy[1] >= 0.0: + ax.text( + xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small" + ) + + ax.axis("equal") + return ax + + +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds +): + import matplotlib.pyplot as plt + + def function(amplitudes): + def f(t): + x1 = amplitudes[0] + result = x1 / np.sqrt(2.0) + + # Take the rest of the coefficients and resize them + # appropriately. 
Take a copy of amplitudes as otherwise numpy + # deletes the element from amplitudes itself. + coeffs = np.delete(np.copy(amplitudes), 0) + coeffs.resize(int((coeffs.size + 1) / 2), 2) + + # Generate the harmonics and arguments for the sin and cos + # functions. + harmonics = np.arange(0, coeffs.shape[0]) + 1 + trig_args = np.outer(harmonics, t) + + result += np.sum( + coeffs[:, 0, np.newaxis] * np.sin(trig_args) + + coeffs[:, 1, np.newaxis] * np.cos(trig_args), + axis=0, + ) + return result + + return f + + n = len(frame) + class_col = frame[class_column] + classes = frame[class_column].drop_duplicates() + df = frame.drop(class_column, axis=1) + t = np.linspace(-np.pi, np.pi, samples) + used_legends = set() + + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + colors = dict(zip(classes, color_values)) + if ax is None: + ax = plt.gca(xlim=(-np.pi, np.pi)) + for i in range(n): + row = df.iloc[i].values + f = function(row) + y = f(t) + kls = class_col.iat[i] + label = pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(t, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(t, y, color=colors[kls], **kwds) + + ax.legend(loc="upper right") + ax.grid() + return ax + + +def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): + + import matplotlib.pyplot as plt + + # random.sample(ndarray, int) fails on python 3.3, sigh + data = list(series.values) + samplings = [random.sample(data, size) for _ in range(samples)] + + means = np.array([np.mean(sampling) for sampling in samplings]) + medians = np.array([np.median(sampling) for sampling in samplings]) + midranges = np.array( + [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings] + ) + if fig is None: + fig = plt.figure() + x = list(range(samples)) + axes = [] + ax1 = fig.add_subplot(2, 3, 1) + ax1.set_xlabel("Sample") + axes.append(ax1) + ax1.plot(x, means, **kwds) + ax2 = fig.add_subplot(2, 3, 2) + ax2.set_xlabel("Sample") + axes.append(ax2) + ax2.plot(x, medians, **kwds) + ax3 = fig.add_subplot(2, 3, 3) + ax3.set_xlabel("Sample") + axes.append(ax3) + ax3.plot(x, midranges, **kwds) + ax4 = fig.add_subplot(2, 3, 4) + ax4.set_xlabel("Mean") + axes.append(ax4) + ax4.hist(means, **kwds) + ax5 = fig.add_subplot(2, 3, 5) + ax5.set_xlabel("Median") + axes.append(ax5) + ax5.hist(medians, **kwds) + ax6 = fig.add_subplot(2, 3, 6) + ax6.set_xlabel("Midrange") + axes.append(ax6) + ax6.hist(midranges, **kwds) + for axis in axes: + plt.setp(axis.get_xticklabels(), fontsize=8) + plt.setp(axis.get_yticklabels(), fontsize=8) + return fig + + +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwds, +): + import matplotlib.pyplot as plt + + if axvlines_kwds is None: + axvlines_kwds = {"linewidth": 1, "color": "black"} + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + + if cols is None: + df = frame.drop(class_column, axis=1) + else: + df = frame[cols] + + used_legends = set() + + ncols = len(df.columns) + + # determine values to use for xticks + if use_columns is True: + if not np.all(np.isreal(list(df.columns))): + raise ValueError("Columns must be numeric to be used as xticks") + x = df.columns + elif xticks is not None: + if not np.all(np.isreal(xticks)): + raise ValueError("xticks specified must be numeric") + elif 
len(xticks) != ncols: + raise ValueError("Length of xticks must match number of columns") + x = xticks + else: + x = list(range(ncols)) + + if ax is None: + ax = plt.gca() + + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + + if sort_labels: + classes = sorted(classes) + color_values = sorted(color_values) + colors = dict(zip(classes, color_values)) + + for i in range(n): + y = df.iloc[i].values + kls = class_col.iat[i] + label = pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(x, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(x, y, color=colors[kls], **kwds) + + if axvlines: + for i in x: + ax.axvline(i, **axvlines_kwds) + + ax.set_xticks(x) + ax.set_xticklabels(df.columns) + ax.set_xlim(x[0], x[-1]) + ax.legend(loc="upper right") + ax.grid() + return ax + + +def lag_plot(series, lag=1, ax=None, **kwds): + # workaround because `c='b'` is hardcoded in matplotlibs scatter method + import matplotlib.pyplot as plt + + kwds.setdefault("c", plt.rcParams["patch.facecolor"]) + + data = series.values + y1 = data[:-lag] + y2 = data[lag:] + if ax is None: + ax = plt.gca() + ax.set_xlabel("y(t)") + ax.set_ylabel(f"y(t + {lag})") + ax.scatter(y1, y2, **kwds) + return ax + + +def autocorrelation_plot(series, ax=None, **kwds): + import matplotlib.pyplot as plt + + n = len(series) + data = np.asarray(series) + if ax is None: + ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0)) + mean = np.mean(data) + c0 = np.sum((data - mean) ** 2) / float(n) + + def r(h): + return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + + x = np.arange(n) + 1 + y = [r(loc) for loc in x] + z95 = 1.959963984540054 + z99 = 2.5758293035489004 + ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey") + ax.axhline(y=z95 / np.sqrt(n), color="grey") + ax.axhline(y=0.0, color="black") + ax.axhline(y=-z95 / np.sqrt(n), color="grey") + ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey") + ax.set_xlabel("Lag") + ax.set_ylabel("Autocorrelation") + ax.plot(x, y, **kwds) + if "label" in kwds: + ax.legend() + ax.grid() + return ax diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/style.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/style.py new file mode 100644 index 0000000..fd69265 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/style.py @@ -0,0 +1,92 @@ +# being a bit too dynamic +import warnings + +import matplotlib.cm as cm +import matplotlib.colors +import numpy as np + +from pandas.core.dtypes.common import is_list_like + +import pandas.core.common as com + + +def _get_standard_colors( + num_colors=None, colormap=None, color_type="default", color=None +): + import matplotlib.pyplot as plt + + if color is None and colormap is not None: + if isinstance(colormap, str): + cmap = colormap + colormap = cm.get_cmap(colormap) + if colormap is None: + raise ValueError(f"Colormap {cmap} is not recognized") + colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] + elif color is not None: + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. 
Using 'color'" + ) + colors = list(color) if is_list_like(color) else color + else: + if color_type == "default": + # need to call list() on the result to copy so we don't + # modify the global rcParams below + try: + colors = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] + except KeyError: + colors = list(plt.rcParams.get("axes.color_cycle", list("bgrcmyk"))) + if isinstance(colors, str): + colors = list(colors) + + colors = colors[0:num_colors] + elif color_type == "random": + + def random_color(column): + """ Returns a random color represented as a list of length 3""" + # GH17525 use common._random_state to avoid resetting the seed + rs = com.random_state(column) + return rs.rand(3).tolist() + + colors = [random_color(num) for num in range(num_colors)] + else: + raise ValueError("color_type must be either 'default' or 'random'") + + if isinstance(colors, str): + conv = matplotlib.colors.ColorConverter() + + def _maybe_valid_colors(colors): + try: + [conv.to_rgba(c) for c in colors] + return True + except ValueError: + return False + + # check whether the string can be convertible to single color + maybe_single_color = _maybe_valid_colors([colors]) + # check whether each character can be convertible to colors + maybe_color_cycle = _maybe_valid_colors(list(colors)) + if maybe_single_color and maybe_color_cycle and len(colors) > 1: + hex_color = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] + colors = [hex_color[int(colors[1])]] + elif maybe_single_color: + colors = [colors] + else: + # ``colors`` is regarded as color cycle. + # mpl will raise error any of them is invalid + pass + + # Append more colors by cycling if there is not enough color. + # Extra colors will be ignored by matplotlib if there are more colors + # than needed and nothing needs to be done here. 
+ if len(colors) < num_colors: + try: + multiple = num_colors // len(colors) - 1 + except ZeroDivisionError: + raise ValueError("Invalid color argument: ''") + mod = num_colors % len(colors) + + colors += multiple * colors + colors += colors[:mod] + + return colors diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/timeseries.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/timeseries.py new file mode 100644 index 0000000..dd04811 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/timeseries.py @@ -0,0 +1,311 @@ +# TODO: Use the fact that axis can have units to simplify the process + +import functools + +import numpy as np + +from pandas._libs.tslibs.frequencies import ( + FreqGroup, + get_base_alias, + get_freq, + is_subperiod, + is_superperiod, +) +from pandas._libs.tslibs.period import Period + +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, + ABCPeriodIndex, + ABCTimedeltaIndex, +) + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.converter import ( + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + TimeSeries_TimedeltaFormatter, +) +import pandas.tseries.frequencies as frequencies +from pandas.tseries.offsets import DateOffset + +# --------------------------------------------------------------------- +# Plotting functions and monkey patches + + +def _maybe_resample(series, ax, kwargs): + # resample against axes freq if necessary + freq, ax_freq = _get_freq(ax, series) + + if freq is None: # pragma: no cover + raise ValueError("Cannot use dynamic axis without frequency info") + + # Convert DatetimeIndex to PeriodIndex + if isinstance(series.index, ABCDatetimeIndex): + series = series.to_period(freq=freq) + + if ax_freq is not None and freq != ax_freq: + if is_superperiod(freq, ax_freq): # upsample input + series = series.copy() + series.index = series.index.asfreq(ax_freq, how="s") + freq = ax_freq + elif _is_sup(freq, ax_freq): # one is weekly + how = kwargs.pop("how", "last") + series = getattr(series.resample("D"), how)().dropna() + series = getattr(series.resample(ax_freq), how)().dropna() + freq = ax_freq + elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): + _upsample_others(ax, freq, kwargs) + else: # pragma: no cover + raise ValueError("Incompatible frequency conversion") + return freq, series + + +def _is_sub(f1, f2): + return (f1.startswith("W") and is_subperiod("D", f2)) or ( + f2.startswith("W") and is_subperiod(f1, "D") + ) + + +def _is_sup(f1, f2): + return (f1.startswith("W") and is_superperiod("D", f2)) or ( + f2.startswith("W") and is_superperiod(f1, "D") + ) + + +def _upsample_others(ax, freq, kwargs): + legend = ax.get_legend() + lines, labels = _replot_ax(ax, freq, kwargs) + _replot_ax(ax, freq, kwargs) + + other_ax = None + if hasattr(ax, "left_ax"): + other_ax = ax.left_ax + if hasattr(ax, "right_ax"): + other_ax = ax.right_ax + + if other_ax is not None: + rlines, rlabels = _replot_ax(other_ax, freq, kwargs) + lines.extend(rlines) + labels.extend(rlabels) + + if legend is not None and kwargs.get("legend", True) and len(lines) > 0: + title = legend.get_title().get_text() + if title == "None": + title = None + ax.legend(lines, labels, loc="best", title=title) + + +def _replot_ax(ax, freq, kwargs): + data = getattr(ax, "_plot_data", None) + + # clear current axes and data + ax._plot_data = [] + ax.clear() + + _decorate_axes(ax, freq, kwargs) + + lines = [] + labels = [] + if data is not None: + for series, plotf, kwds in data: + series = series.copy() + idx = 
series.index.asfreq(freq, how="S") + series.index = idx + ax._plot_data.append((series, plotf, kwds)) + + # for tsplot + if isinstance(plotf, str): + from pandas.plotting._matplotlib import PLOT_CLASSES + + plotf = PLOT_CLASSES[plotf]._plot + + lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) + labels.append(pprint_thing(series.name)) + + return lines, labels + + +def _decorate_axes(ax, freq, kwargs): + """Initialize axes for time-series plotting""" + if not hasattr(ax, "_plot_data"): + ax._plot_data = [] + + ax.freq = freq + xaxis = ax.get_xaxis() + xaxis.freq = freq + if not hasattr(ax, "legendlabels"): + ax.legendlabels = [kwargs.get("label", None)] + else: + ax.legendlabels.append(kwargs.get("label", None)) + ax.view_interval = None + ax.date_axis_info = None + + +def _get_ax_freq(ax): + """ + Get the freq attribute of the ax object if set. + Also checks shared axes (eg when using secondary yaxis, sharex=True + or twinx) + """ + ax_freq = getattr(ax, "freq", None) + if ax_freq is None: + # check for left/right ax in case of secondary yaxis + if hasattr(ax, "left_ax"): + ax_freq = getattr(ax.left_ax, "freq", None) + elif hasattr(ax, "right_ax"): + ax_freq = getattr(ax.right_ax, "freq", None) + if ax_freq is None: + # check if a shared ax (sharex/twinx) has already freq set + shared_axes = ax.get_shared_x_axes().get_siblings(ax) + if len(shared_axes) > 1: + for shared_ax in shared_axes: + ax_freq = getattr(shared_ax, "freq", None) + if ax_freq is not None: + break + return ax_freq + + +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, "freq", None) + if freq is None: + freq = getattr(series.index, "inferred_freq", None) + + ax_freq = _get_ax_freq(ax) + + # use axes freq if no data freq + if freq is None: + freq = ax_freq + + # get the period frequency + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + + freq = frequencies.get_period_alias(freq) + return freq, ax_freq + + +def _use_dynamic_x(ax, data): + freq = _get_index_freq(data) + ax_freq = _get_ax_freq(ax) + + if freq is None: # convert irregular if axes has freq info + freq = ax_freq + else: # do not use tsplot if irregular was plotted first + if (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + if freq is None: + return False + + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + freq = frequencies.get_period_alias(freq) + + if freq is None: + return False + + # hack this for 0.10.1, creating more technical debt...sigh + if isinstance(data.index, ABCDatetimeIndex): + base = get_freq(freq) + x = data.index + if base <= FreqGroup.FR_DAY: + return x[:1].is_normalized + return Period(x[0], freq).to_timestamp(tz=x.tz) == x[0] + return True + + +def _get_index_freq(data): + freq = getattr(data.index, "freq", None) + if freq is None: + freq = getattr(data.index, "inferred_freq", None) + if freq == "B": + weekdays = np.unique(data.index.dayofweek) + if (5 in weekdays) or (6 in weekdays): + freq = None + return freq + + +def _maybe_convert_index(ax, data): + # tsplot converts automatically, but don't want to convert index + # over and over for DataFrames + if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): + freq = getattr(data.index, "freq", None) + + if freq is None: + freq = getattr(data.index, "inferred_freq", None) + if isinstance(freq, DateOffset): + freq = freq.rule_code + + if freq is None: + freq = _get_ax_freq(ax) + + if freq is None: + raise 
ValueError("Could not get frequency alias for plotting") + + freq = get_base_alias(freq) + freq = frequencies.get_period_alias(freq) + + if isinstance(data.index, ABCDatetimeIndex): + data = data.to_period(freq=freq) + elif isinstance(data.index, ABCPeriodIndex): + data.index = data.index.asfreq(freq=freq) + return data + + +# Patch methods for subplot. Only format_dateaxis is currently used. +# Do we need the rest for convenience? + + +def _format_coord(freq, t, y): + time_period = Period(ordinal=int(t), freq=freq) + return f"t = {time_period} y = {y:8f}" + + +def format_dateaxis(subplot, freq, index): + """ + Pretty-formats the date axis (x-axis). + + Major and minor ticks are automatically set for the frequency of the + current underlying series. As the dynamic mode is activated by + default, changing the limits of the x axis will intelligently change + the positions of the ticks. + """ + from matplotlib import pylab + + # handle index specific formatting + # Note: DatetimeIndex does not use this + # interface. DatetimeIndex uses matplotlib.date directly + if isinstance(index, ABCPeriodIndex): + + majlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) + subplot.xaxis.set_major_locator(majlocator) + subplot.xaxis.set_minor_locator(minlocator) + + majformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) + subplot.xaxis.set_major_formatter(majformatter) + subplot.xaxis.set_minor_formatter(minformatter) + + # x and y coord info + subplot.format_coord = functools.partial(_format_coord, freq) + + elif isinstance(index, ABCTimedeltaIndex): + subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter()) + else: + raise TypeError("index type not supported") + + pylab.draw_if_interactive() diff --git a/venv/Lib/site-packages/pandas/plotting/_matplotlib/tools.py b/venv/Lib/site-packages/pandas/plotting/_matplotlib/tools.py new file mode 100644 index 0000000..dd4034a --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_matplotlib/tools.py @@ -0,0 +1,370 @@ +# being a bit too dynamic +from math import ceil +import warnings + +import matplotlib.table +import matplotlib.ticker as ticker +import numpy as np + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +def format_date_labels(ax, rot): + # mini version of autofmt_xdate + for label in ax.get_xticklabels(): + label.set_ha("right") + label.set_rotation(rot) + fig = ax.get_figure() + fig.subplots_adjust(bottom=0.2) + + +def table(ax, data, rowLabels=None, colLabels=None, **kwargs): + if isinstance(data, ABCSeries): + data = data.to_frame() + elif isinstance(data, ABCDataFrame): + pass + else: + raise ValueError("Input data must be DataFrame or Series") + + if rowLabels is None: + rowLabels = data.index + + if colLabels is None: + colLabels = data.columns + + cellText = data.values + + table = matplotlib.table.table( + ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs + ) + return table + + +def _get_layout(nplots, layout=None, layout_type="box"): + if layout is not None: + if not isinstance(layout, (tuple, list)) or len(layout) != 2: + raise ValueError("Layout must be a tuple of (rows, columns)") + + nrows, ncols = 
layout + + # Python 2 compat + ceil_ = lambda x: int(ceil(x)) + if nrows == -1 and ncols > 0: + layout = nrows, ncols = (ceil_(float(nplots) / ncols), ncols) + elif ncols == -1 and nrows > 0: + layout = nrows, ncols = (nrows, ceil_(float(nplots) / nrows)) + elif ncols <= 0 and nrows <= 0: + msg = "At least one dimension of layout must be positive" + raise ValueError(msg) + + if nrows * ncols < nplots: + raise ValueError( + f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" + ) + + return layout + + if layout_type == "single": + return (1, 1) + elif layout_type == "horizontal": + return (1, nplots) + elif layout_type == "vertical": + return (nplots, 1) + + layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)} + try: + return layouts[nplots] + except KeyError: + k = 1 + while k ** 2 < nplots: + k += 1 + + if (k - 1) * k >= nplots: + return k, (k - 1) + else: + return k, k + + +# copied from matplotlib/pyplot.py and modified for pandas.plotting + + +def _subplots( + naxes=None, + sharex=False, + sharey=False, + squeeze=True, + subplot_kw=None, + ax=None, + layout=None, + layout_type="box", + **fig_kw, +): + """Create a figure with a set of subplots already made. + + This utility wrapper makes it convenient to create common layouts of + subplots, including the enclosing figure object, in a single call. + + Keyword arguments: + + naxes : int + Number of required axes. Exceeded axes are set invisible. Default is + nrows * ncols. + + sharex : bool + If True, the X axis will be shared amongst all subplots. + + sharey : bool + If True, the Y axis will be shared amongst all subplots. + + squeeze : bool + + If True, extra dimensions are squeezed out from the returned axis object: + - if only one subplot is constructed (nrows=ncols=1), the resulting + single Axis object is returned as a scalar. + - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object + array of Axis objects are returned as numpy 1-d arrays. + - for NxM subplots with N>1 and M>1 are returned as a 2d array. + + If False, no squeezing is done: the returned axis object is always + a 2-d array containing Axis instances, even if it ends up being 1x1. + + subplot_kw : dict + Dict with keywords passed to the add_subplot() call used to create each + subplots. + + ax : Matplotlib axis object, optional + + layout : tuple + Number of rows and columns of the subplot grid. + If not specified, calculated from naxes and layout_type + + layout_type : {'box', 'horizontal', 'vertical'}, default 'box' + Specify how to layout the subplot grid. + + fig_kw : Other keyword arguments to be passed to the figure() call. + Note that all keywords not recognized above will be + automatically included here. + + Returns: + + fig, ax : tuple + - fig is the Matplotlib Figure object + - ax can be either a single axis object or an array of axis objects if + more than one subplot was created. The dimensions of the resulting array + can be controlled with the squeeze keyword, see above. 
+ + **Examples:** + + x = np.linspace(0, 2*np.pi, 400) + y = np.sin(x**2) + + # Just a figure and one subplot + f, ax = plt.subplots() + ax.plot(x, y) + ax.set_title('Simple plot') + + # Two subplots, unpack the output array immediately + f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) + ax1.plot(x, y) + ax1.set_title('Sharing Y axis') + ax2.scatter(x, y) + + # Four polar axes + plt.subplots(2, 2, subplot_kw=dict(polar=True)) + """ + import matplotlib.pyplot as plt + + if subplot_kw is None: + subplot_kw = {} + + if ax is None: + fig = plt.figure(**fig_kw) + else: + if is_list_like(ax): + ax = _flatten(ax) + if layout is not None: + warnings.warn( + "When passing multiple axes, layout keyword is ignored", UserWarning + ) + if sharex or sharey: + warnings.warn( + "When passing multiple axes, sharex and sharey " + "are ignored. These settings must be specified " + "when creating axes", + UserWarning, + stacklevel=4, + ) + if len(ax) == naxes: + fig = ax[0].get_figure() + return fig, ax + else: + raise ValueError( + f"The number of passed axes must be {naxes}, the " + "same as the output plot" + ) + + fig = ax.get_figure() + # if ax is passed and a number of subplots is 1, return ax as it is + if naxes == 1: + if squeeze: + return fig, ax + else: + return fig, _flatten(ax) + else: + warnings.warn( + "To output multiple subplots, the figure containing " + "the passed axes is being cleared", + UserWarning, + stacklevel=4, + ) + fig.clear() + + nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) + nplots = nrows * ncols + + # Create empty object array to hold all axes. It's easiest to make it 1-d + # so we can just append subplots upon creation, and then + axarr = np.empty(nplots, dtype=object) + + # Create first subplot separately, so we can share it if requested + ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) + + if sharex: + subplot_kw["sharex"] = ax0 + if sharey: + subplot_kw["sharey"] = ax0 + axarr[0] = ax0 + + # Note off-by-one counting because add_subplot uses the MATLAB 1-based + # convention. + for i in range(1, nplots): + kwds = subplot_kw.copy() + # Set sharex and sharey to None for blank/dummy axes, these can + # interfere with proper axis limits on the visible axes if + # they share axes e.g. issue #7528 + if i >= naxes: + kwds["sharex"] = None + kwds["sharey"] = None + ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) + axarr[i] = ax + + if naxes != nplots: + for ax in axarr[naxes:]: + ax.set_visible(False) + + _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) + + if squeeze: + # Reshape the array to have the final desired dimension (nrow,ncol), + # though discarding unneeded dimensions that equal 1. If we only have + # one subplot, just return it instead of a 1-element array. 
+ if nplots == 1: + axes = axarr[0] + else: + axes = axarr.reshape(nrows, ncols).squeeze() + else: + # returned axis array will be always 2-d, even if nrows=ncols=1 + axes = axarr.reshape(nrows, ncols) + + return fig, axes + + +def _remove_labels_from_axis(axis): + for t in axis.get_majorticklabels(): + t.set_visible(False) + + # set_visible will not be effective if + # minor axis has NullLocator and NullFormattor (default) + if isinstance(axis.get_minor_locator(), ticker.NullLocator): + axis.set_minor_locator(ticker.AutoLocator()) + if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): + axis.set_minor_formatter(ticker.FormatStrFormatter("")) + for t in axis.get_minorticklabels(): + t.set_visible(False) + + axis.get_label().set_visible(False) + + +def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): + if nplots > 1: + + if nrows > 1: + try: + # first find out the ax layout, + # so that we can correctly handle 'gaps" + layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool) + for ax in axarr: + layout[ax.rowNum, ax.colNum] = ax.get_visible() + + for ax in axarr: + # only the last row of subplots should get x labels -> all + # other off layout handles the case that the subplot is + # the last in the column, because below is no subplot/gap. + if not layout[ax.rowNum + 1, ax.colNum]: + continue + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + _remove_labels_from_axis(ax.xaxis) + + except IndexError: + # if gridspec is used, ax.rowNum and ax.colNum may different + # from layout shape. in this case, use last_row logic + for ax in axarr: + if ax.is_last_row(): + continue + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + _remove_labels_from_axis(ax.xaxis) + + if ncols > 1: + for ax in axarr: + # only the first column should get y labels -> set all other to + # off as we only have labels in the first column and we always + # have a subplot there, we can skip the layout test + if ax.is_first_col(): + continue + if sharey or len(ax.get_shared_y_axes().get_siblings(ax)) > 1: + _remove_labels_from_axis(ax.yaxis) + + +def _flatten(axes): + if not is_list_like(axes): + return np.array([axes]) + elif isinstance(axes, (np.ndarray, ABCIndexClass)): + return axes.ravel() + return np.array(axes) + + +def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): + import matplotlib.pyplot as plt + + for ax in _flatten(axes): + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + return axes + + +def _get_all_lines(ax): + lines = ax.get_lines() + + if hasattr(ax, "right_ax"): + lines += ax.right_ax.get_lines() + + if hasattr(ax, "left_ax"): + lines += ax.left_ax.get_lines() + + return lines + + +def _get_xlim(lines): + left, right = np.inf, -np.inf + for l in lines: + x = l.get_xdata(orig=False) + left = min(np.nanmin(x), left) + right = max(np.nanmax(x), right) + return left, right diff --git a/venv/Lib/site-packages/pandas/plotting/_misc.py b/venv/Lib/site-packages/pandas/plotting/_misc.py new file mode 100644 index 0000000..ccd42d3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/plotting/_misc.py @@ -0,0 +1,487 @@ +from contextlib import contextmanager + +from pandas.plotting._core import _get_plot_backend + + +def table(ax, data, rowLabels=None, colLabels=None, 
**kwargs): + """ + Helper function to convert DataFrame and Series to matplotlib.table. + + Parameters + ---------- + ax : Matplotlib axes object + data : DataFrame or Series + Data for table contents. + **kwargs + Keyword arguments to be passed to matplotlib.table.table. + If `rowLabels` or `colLabels` is not specified, data index or column + name will be used. + + Returns + ------- + matplotlib table object + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.table( + ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs + ) + + +def register(): + """ + Register Pandas Formatters and Converters with matplotlib. + + This function modifies the global ``matplotlib.units.registry`` + dictionary. Pandas adds custom converters for + + * pd.Timestamp + * pd.Period + * np.datetime64 + * datetime.datetime + * datetime.date + * datetime.time + + See Also + -------- + deregister_matplotlib_converters + """ + plot_backend = _get_plot_backend("matplotlib") + plot_backend.register() + + +def deregister(): + """ + Remove pandas' formatters and converters. + + Removes the custom converters added by :func:`register`. This + attempts to set the state of the registry back to the state before + pandas registered its own units. Converters for pandas' own types like + Timestamp and Period are removed completely. Converters for types + pandas overwrites, like ``datetime.datetime``, are restored to their + original value. + + See Also + -------- + register_matplotlib_converters + """ + plot_backend = _get_plot_backend("matplotlib") + plot_backend.deregister() + + +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + hist_kwds=None, + range_padding=0.05, + **kwargs, +): + """ + Draw a matrix of scatter plots. + + Parameters + ---------- + frame : DataFrame + alpha : float, optional + Amount of transparency applied. + figsize : (float,float), optional + A tuple (width, height) in inches. + ax : Matplotlib axis object, optional + grid : bool, optional + Setting this to True will show the grid. + diagonal : {'hist', 'kde'} + Pick between 'kde' and 'hist' for either Kernel Density Estimation or + Histogram plot in the diagonal. + marker : str, optional + Matplotlib marker type, default '.'. + density_kwds : keywords + Keyword arguments to be passed to kernel density estimate plot. + hist_kwds : keywords + Keyword arguments to be passed to hist function. + range_padding : float, default 0.05 + Relative extension of axis range in x and y with respect to + (x_max - x_min) or (y_max - y_min). + **kwargs + Keyword arguments to be passed to scatter function. + + Returns + ------- + numpy.ndarray + A matrix of scatter plots. + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> scatter_matrix(df, alpha=0.2) + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.scatter_matrix( + frame=frame, + alpha=alpha, + figsize=figsize, + ax=ax, + grid=grid, + diagonal=diagonal, + marker=marker, + density_kwds=density_kwds, + hist_kwds=hist_kwds, + range_padding=range_padding, + **kwargs, + ) + + +def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): + """ + Plot a multidimensional dataset in 2D. + + Each Series in the DataFrame is represented as a evenly distributed + slice on a circle. Each data point is rendered in the circle according to + the value on each Series. 
Highly correlated `Series` in the `DataFrame` + are placed closer on the unit circle. + + RadViz allow to project a N-dimensional data set into a 2D space where the + influence of each dimension can be interpreted as a balance between the + influence of all dimensions. + + More info available at the `original article + `_ + describing RadViz. + + Parameters + ---------- + frame : `DataFrame` + Pandas object holding the data. + class_column : str + Column name containing the name of the data point category. + ax : :class:`matplotlib.axes.Axes`, optional + A plot instance to which to add the information. + color : list[str] or tuple[str], optional + Assign a color to each category. Example: ['blue', 'green']. + colormap : str or :class:`matplotlib.colors.Colormap`, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + **kwds + Options to pass to matplotlib scatter plotting method. + + Returns + ------- + class:`matplotlib.axes.Axes` + + See Also + -------- + plotting.andrews_curves : Plot clustering visualization. + + Examples + -------- + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, + ... 6.7, 4.6], + ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, + ... 3.3, 3.6], + ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, + ... 5.7, 1.0], + ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, + ... 2.1, 0.2], + ... 'Category': ['virginica', 'virginica', 'setosa', + ... 'virginica', 'virginica', 'versicolor', + ... 'versicolor', 'setosa', 'virginica', + ... 'setosa'] + ... }) + >>> rad_viz = pd.plotting.radviz(df, 'Category') # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.radviz( + frame=frame, + class_column=class_column, + ax=ax, + color=color, + colormap=colormap, + **kwds, + ) + + +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwargs +): + """ + Generate a matplotlib plot of Andrews curves, for visualising clusters of + multivariate data. + + Andrews curves have the functional form: + + f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) + + x_4 sin(2t) + x_5 cos(2t) + ... + + Where x coefficients correspond to the values of each dimension and t is + linearly spaced between -pi and +pi. Each row of frame then corresponds to + a single curve. + + Parameters + ---------- + frame : DataFrame + Data to be plotted, preferably normalized to (0.0, 1.0). + class_column : Name of the column containing class names + ax : matplotlib axes object, default None + samples : Number of points to plot in each curve + color : list or tuple, optional + Colors to use for the different classes. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that name + from matplotlib. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + class:`matplotlip.axis.Axes` + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.andrews_curves( + frame=frame, + class_column=class_column, + ax=ax, + samples=samples, + color=color, + colormap=colormap, + **kwargs, + ) + + +def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): + """ + Bootstrap plot on mean, median and mid-range statistics. + + The bootstrap plot is used to estimate the uncertainty of a statistic + by relaying on random sampling with replacement [1]_. 
This function will + generate bootstrapping plots for mean, median and mid-range statistics + for the given number of samples of the given size. + + .. [1] "Bootstrapping (statistics)" in \ + https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 + + Parameters + ---------- + series : pandas.Series + Pandas Series from where to get the samplings for the bootstrapping. + fig : matplotlib.figure.Figure, default None + If given, it will use the `fig` reference for plotting instead of + creating a new one with default parameters. + size : int, default 50 + Number of data points to consider during each sampling. It must be + greater or equal than the length of the `series`. + samples : int, default 500 + Number of times the bootstrap procedure is performed. + **kwds + Options to pass to matplotlib plotting method. + + Returns + ------- + matplotlib.figure.Figure + Matplotlib figure. + + See Also + -------- + DataFrame.plot : Basic plotting for DataFrame objects. + Series.plot : Basic plotting for Series objects. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> s = pd.Series(np.random.uniform(size=100)) + >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.bootstrap_plot( + series=series, fig=fig, size=size, samples=samples, **kwds + ) + + +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwargs, +): + """ + Parallel coordinates plotting. + + Parameters + ---------- + frame : DataFrame + class_column : str + Column name containing class names. + cols : list, optional + A list of column names to use. + ax : matplotlib.axis, optional + Matplotlib axis object. + color : list or tuple, optional + Colors to use for the different classes. + use_columns : bool, optional + If true, columns will be used as xticks. + xticks : list or tuple, optional + A list of values to use for xticks. + colormap : str or matplotlib colormap, default None + Colormap to use for line colors. + axvlines : bool, optional + If true, vertical lines will be added at each xtick. + axvlines_kwds : keywords, optional + Options to be passed to axvline method for vertical lines. + sort_labels : bool, default False + Sort class_column labels, useful when assigning colors. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + class:`matplotlib.axis.Axes` + + Examples + -------- + >>> from matplotlib import pyplot as plt + >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master' + '/pandas/tests/data/csv/iris.csv') + >>> pd.plotting.parallel_coordinates( + df, 'Name', + color=('#556270', '#4ECDC4', '#C7F464')) + >>> plt.show() + """ + plot_backend = _get_plot_backend("matplotlib") + return plot_backend.parallel_coordinates( + frame=frame, + class_column=class_column, + cols=cols, + ax=ax, + color=color, + use_columns=use_columns, + xticks=xticks, + colormap=colormap, + axvlines=axvlines, + axvlines_kwds=axvlines_kwds, + sort_labels=sort_labels, + **kwargs, + ) + + +def lag_plot(series, lag=1, ax=None, **kwds): + """ + Lag plot for time series. + + Parameters + ---------- + series : Time series + lag : lag of the scatter plot, default 1 + ax : Matplotlib axis object, optional + **kwds + Matplotlib scatter method keyword arguments. 
+
+    Returns
+    -------
+    :class:`matplotlib.axes.Axes`
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds)
+
+
+def autocorrelation_plot(series, ax=None, **kwargs):
+    """
+    Autocorrelation plot for time series.
+
+    Parameters
+    ----------
+    series : Series
+        The time series to visualize.
+    ax : matplotlib.axes.Axes, optional
+        Matplotlib axes object.
+    **kwargs
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    :class:`matplotlib.axes.Axes`
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs)
+
+
+class _Options(dict):
+    """
+    Stores pandas plotting options.
+
+    Allows for parameter aliasing so you can just use parameter names that are
+    the same as the plot function parameters, but it is stored in a canonical
+    format that makes it easy to break down into groups later.
+    """
+
+    # alias so the names are same as plotting method parameter names
+    _ALIASES = {"x_compat": "xaxis.compat"}
+    _DEFAULT_KEYS = ["xaxis.compat"]
+
+    def __init__(self, deprecated=False):
+        self._deprecated = deprecated
+        super().__setitem__("xaxis.compat", False)
+
+    def __getitem__(self, key):
+        key = self._get_canonical_key(key)
+        if key not in self:
+            raise ValueError(f"{key} is not a valid pandas plotting option")
+        return super().__getitem__(key)
+
+    def __setitem__(self, key, value):
+        key = self._get_canonical_key(key)
+        return super().__setitem__(key, value)
+
+    def __delitem__(self, key):
+        key = self._get_canonical_key(key)
+        if key in self._DEFAULT_KEYS:
+            raise ValueError(f"Cannot remove default parameter {key}")
+        return super().__delitem__(key)
+
+    def __contains__(self, key) -> bool:
+        key = self._get_canonical_key(key)
+        return super().__contains__(key)
+
+    def reset(self):
+        """
+        Reset the option store to its initial state.
+
+        Returns
+        -------
+        None
+        """
+        self.__init__()
+
+    def _get_canonical_key(self, key):
+        return self._ALIASES.get(key, key)
+
+    @contextmanager
+    def use(self, key, value):
+        """
+        Temporarily set a parameter value using the with statement.
+        Aliasing allowed.
+        """
+        old_value = self[key]
+        try:
+            self[key] = value
+            yield self
+        finally:
+            self[key] = old_value
+
+
+plot_params = _Options()
diff --git a/venv/Lib/site-packages/pandas/testing.py b/venv/Lib/site-packages/pandas/testing.py
new file mode 100644
index 0000000..0445fa5
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/testing.py
@@ -0,0 +1,17 @@
+"""
+Public testing utility functions.
+""" + +from pandas._testing import ( + assert_extension_array_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) + +__all__ = [ + "assert_extension_array_equal", + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", +] diff --git a/venv/Lib/site-packages/pandas/tests/__init__.py b/venv/Lib/site-packages/pandas/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/api/__init__.py b/venv/Lib/site-packages/pandas/tests/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/api/test_api.py b/venv/Lib/site-packages/pandas/tests/api/test_api.py new file mode 100644 index 0000000..406d5f0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/api/test_api.py @@ -0,0 +1,331 @@ +import subprocess +import sys +from typing import List + +import pytest + +import pandas as pd +from pandas import api, compat +import pandas._testing as tm + + +class Base: + def check(self, namespace, expected, ignored=None): + # see which names are in the namespace, minus optional + # ignored ones + # compare vs the expected + + result = sorted(f for f in dir(namespace) if not f.startswith("__")) + if ignored is not None: + result = sorted(set(result) - set(ignored)) + + expected = sorted(expected) + tm.assert_almost_equal(result, expected) + + +class TestPDApi(Base): + # these are optionally imported based on testing + # & need to be ignored + ignored = ["tests", "locale", "conftest"] + + # top-level sub-packages + lib = [ + "api", + "arrays", + "compat", + "core", + "errors", + "pandas", + "plotting", + "test", + "testing", + "tseries", + "util", + "options", + "io", + ] + + # these are already deprecated; awaiting removal + deprecated_modules: List[str] = ["np", "datetime"] + + # misc + misc = ["IndexSlice", "NaT", "NA"] + + # top-level classes + classes = [ + "Categorical", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "ExcelFile", + "ExcelWriter", + "Float64Index", + "Grouper", + "HDFStore", + "Index", + "Int64Index", + "MultiIndex", + "Period", + "PeriodIndex", + "RangeIndex", + "UInt64Index", + "Series", + "SparseDtype", + "StringDtype", + "Timedelta", + "TimedeltaIndex", + "Timestamp", + "Interval", + "IntervalIndex", + "CategoricalDtype", + "PeriodDtype", + "IntervalDtype", + "DatetimeTZDtype", + "BooleanDtype", + "Int8Dtype", + "Int16Dtype", + "Int32Dtype", + "Int64Dtype", + "UInt8Dtype", + "UInt16Dtype", + "UInt32Dtype", + "UInt64Dtype", + "NamedAgg", + ] + + # these are already deprecated; awaiting removal + deprecated_classes: List[str] = [] + + # these should be deprecated in the future + deprecated_classes_in_future: List[str] = ["SparseArray"] + + if not compat.PY37: + classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) + # deprecated_modules.extend(["np", "datetime"]) + # deprecated_classes_in_future.extend(["SparseArray"]) + + # external modules exposed in pandas namespace + modules: List[str] = [] + + # top-level functions + funcs = [ + "array", + "bdate_range", + "concat", + "crosstab", + "cut", + "date_range", + "interval_range", + "eval", + "factorize", + "get_dummies", + "infer_freq", + "isna", + "isnull", + "lreshape", + "melt", + "notna", + "notnull", + "offsets", + "merge", + "merge_ordered", + "merge_asof", + "period_range", + "pivot", + "pivot_table", + "qcut", + "show_versions", + "timedelta_range", + "unique", + "value_counts", + "wide_to_long", + ] + + # top-level option funcs + funcs_option = [ + 
"reset_option", + "describe_option", + "get_option", + "option_context", + "set_option", + "set_eng_float_format", + ] + + # top-level read_* funcs + funcs_read = [ + "read_clipboard", + "read_csv", + "read_excel", + "read_fwf", + "read_gbq", + "read_hdf", + "read_html", + "read_json", + "read_pickle", + "read_sas", + "read_sql", + "read_sql_query", + "read_sql_table", + "read_stata", + "read_table", + "read_feather", + "read_parquet", + "read_orc", + "read_spss", + ] + + # top-level json funcs + funcs_json = ["json_normalize"] + + # top-level to_* funcs + funcs_to = ["to_datetime", "to_numeric", "to_pickle", "to_timedelta"] + + # top-level to deprecate in the future + deprecated_funcs_in_future: List[str] = [] + + # these are already deprecated; awaiting removal + deprecated_funcs: List[str] = [] + + # private modules in pandas namespace + private_modules = [ + "_config", + "_hashtable", + "_lib", + "_libs", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_np_version_under1p18", + "_testing", + "_tslib", + "_typing", + "_version", + ] + + def test_api(self): + + checkthese = ( + self.lib + + self.misc + + self.modules + + self.classes + + self.funcs + + self.funcs_option + + self.funcs_read + + self.funcs_json + + self.funcs_to + + self.private_modules + ) + if not compat.PY37: + checkthese.extend( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.deprecated_funcs_in_future + + self.deprecated_funcs + ) + self.check(pd, checkthese, self.ignored) + + def test_depr(self): + deprecated_list = ( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.deprecated_funcs + + self.deprecated_funcs_in_future + ) + for depr in deprecated_list: + with tm.assert_produces_warning(FutureWarning): + deprecated = getattr(pd, depr) + if not compat.PY37: + if depr == "datetime": + deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) + elif depr == "SparseArray": + deprecated([]) + else: + deprecated.__getattr__(dir(deprecated)[-1]) + + +def test_datetime(): + from datetime import datetime + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert datetime(2015, 1, 2, 0, 0) == pd.datetime(2015, 1, 2, 0, 0) + + assert isinstance(pd.datetime(2015, 1, 2, 0, 0), pd.datetime) + + +def test_sparsearray(): + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert isinstance(pd.array([1, 2, 3], dtype="Sparse"), pd.SparseArray) + + +def test_np(): + import numpy as np + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() + + +class TestApi(Base): + allowed = ["types", "extensions", "indexers"] + + def test_api(self): + self.check(api, self.allowed) + + +class TestTesting(Base): + funcs = [ + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", + "assert_extension_array_equal", + ] + + def test_testing(self): + from pandas import testing + + self.check(testing, self.funcs) + + def test_util_testing_deprecated(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + + with tm.assert_produces_warning(FutureWarning) as m: + import pandas.util.testing # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def 
test_util_testing_deprecated_direct(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + with tm.assert_produces_warning(FutureWarning) as m: + from pandas.util.testing import assert_series_equal # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def test_util_in_top_level(self): + # in a subprocess to avoid import caching issues + out = subprocess.check_output( + [ + sys.executable, + "-c", + "import pandas; pandas.util.testing.assert_series_equal", + ], + stderr=subprocess.STDOUT, + ).decode() + assert "pandas.util.testing is deprecated" in out + + with pytest.raises(AttributeError, match="foo"): + pd.util.foo diff --git a/venv/Lib/site-packages/pandas/tests/api/test_types.py b/venv/Lib/site-packages/pandas/tests/api/test_types.py new file mode 100644 index 0000000..31423c0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/api/test_types.py @@ -0,0 +1,64 @@ +import pandas._testing as tm +from pandas.api import types + +from .test_api import Base + + +class TestTypes(Base): + + allowed = [ + "is_bool", + "is_bool_dtype", + "is_categorical", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + "is_datetime64_ns_dtype", + "is_datetime64tz_dtype", + "is_dtype_equal", + "is_float", + "is_float_dtype", + "is_int64_dtype", + "is_integer", + "is_integer_dtype", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_scalar", + "is_sparse", + "is_string_dtype", + "is_signed_integer_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "is_period_dtype", + "is_interval", + "is_interval_dtype", + "is_re", + "is_re_compilable", + "is_dict_like", + "is_iterator", + "is_file_like", + "is_list_like", + "is_hashable", + "is_array_like", + "is_named_tuple", + "pandas_dtype", + "union_categoricals", + "infer_dtype", + "is_extension_array_dtype", + ] + deprecated = ["is_extension_type"] + dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] + + def test_types(self): + + self.check(types, self.allowed + self.dtypes + self.deprecated) + + def test_deprecated_from_api_types(self): + + for t in self.deprecated: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + getattr(types, t)(1) diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/__init__.py b/venv/Lib/site-packages/pandas/tests/arithmetic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/common.py b/venv/Lib/site-packages/pandas/tests/arithmetic/common.py new file mode 100644 index 0000000..83d19b8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/common.py @@ -0,0 +1,89 @@ +""" +Assertion helpers for arithmetic tests. +""" +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +def assert_invalid_addsub_type(left, right, msg=None): + """ + Helper to assert that left and right can be neither added nor subtracted. 
+ + Parameters + --------- + left : object + right : object + msg : str or None, default None + """ + with pytest.raises(TypeError, match=msg): + left + right + with pytest.raises(TypeError, match=msg): + right + left + with pytest.raises(TypeError, match=msg): + left - right + with pytest.raises(TypeError, match=msg): + right - left + + +def get_upcast_box(box, vector): + """ + Given two box-types, find the one that takes priority + """ + if box is DataFrame or isinstance(vector, DataFrame): + return DataFrame + if box is Series or isinstance(vector, Series): + return Series + if box is Index or isinstance(vector, Index): + return Index + return box + + +def assert_invalid_comparison(left, right, box): + """ + Assert that comparison operations with mismatched types behave correctly. + + Parameters + ---------- + left : np.ndarray, ExtensionArray, Index, or Series + right : object + box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} + """ + # Not for tznaive-tzaware comparison + + # Note: not quite the same as how we do this for tm.box_expected + xbox = box if box is not Index else np.array + + result = left == right + expected = xbox(np.zeros(result.shape, dtype=np.bool_)) + + tm.assert_equal(result, expected) + + result = right == left + tm.assert_equal(result, expected) + + result = left != right + tm.assert_equal(result, ~expected) + + result = right != left + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + right < left + with pytest.raises(TypeError, match=msg): + right <= left + with pytest.raises(TypeError, match=msg): + right > left + with pytest.raises(TypeError, match=msg): + right >= left diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/conftest.py b/venv/Lib/site-packages/pandas/tests/arithmetic/conftest.py new file mode 100644 index 0000000..577093c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/conftest.py @@ -0,0 +1,248 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +# ------------------------------------------------------------------ +# Helper Functions + + +def id_func(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x[0].__name__ + "-" + str(x[1]) + else: + return x.__name__ + + +# ------------------------------------------------------------------ + + +@pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) +def one(request): + """ + Several variants of integer value 1. The zero-dim integer array + behaves like an integer. + + This fixture can be used to check that datetimelike indexes handle + addition and subtraction of integers and zero-dimensional arrays + of integers. 
+ + Examples + -------- + >>> dti = pd.date_range('2016-01-01', periods=2, freq='H') + >>> dti + DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], + dtype='datetime64[ns]', freq='H') + >>> dti + one + DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], + dtype='datetime64[ns]', freq='H') + """ + return request.param + + +zeros = [ + box_cls([0] * 5, dtype=dtype) + for box_cls in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend( + [box_cls([-0.0] * 5, dtype=np.float64) for box_cls in [pd.Index, np.array]] +) +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) +zeros.extend([np.array(-0.0, dtype=np.float64)]) +zeros.extend([0, 0.0, -0.0]) + + +@pytest.fixture(params=zeros) +def zero(request): + """ + Several types of scalar zeros and length 5 vectors of zeros. + + This fixture can be used to check that numeric-dtype indexes handle + division by any zero numeric-dtype. + + Uses vector of length 5 for broadcasting with `numeric_idx` fixture, + which creates numeric-dtype vectors also of length 5. + + Examples + -------- + >>> arr = pd.RangeIndex(5) + >>> arr / zeros + Float64Index([nan, inf, inf, inf, inf], dtype='float64') + """ + return request.param + + +# ------------------------------------------------------------------ +# Vector Fixtures + + +@pytest.fixture( + params=[ + pd.Float64Index(np.arange(5, dtype="float64")), + pd.Int64Index(np.arange(5, dtype="int64")), + pd.UInt64Index(np.arange(5, dtype="uint64")), + pd.RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) +def numeric_idx(request): + """ + Several types of numeric-dtypes Index objects + """ + return request.param + + +# ------------------------------------------------------------------ +# Scalar Fixtures + + +@pytest.fixture( + params=[ + pd.Timedelta("5m4s").to_pytimedelta(), + pd.Timedelta("5m4s"), + pd.Timedelta("5m4s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, +) +def scalar_td(request): + """ + Several variants of Timedelta scalars representing 5 minutes and 4 seconds + """ + return request.param + + +@pytest.fixture( + params=[ + pd.offsets.Day(3), + pd.offsets.Hour(72), + pd.Timedelta(days=3).to_pytimedelta(), + pd.Timedelta("72:00:00"), + np.timedelta64(3, "D"), + np.timedelta64(72, "h"), + ], + ids=lambda x: type(x).__name__, +) +def three_days(request): + """ + Several timedelta-like and DateOffset objects that each represent + a 3-day timedelta + """ + return request.param + + +@pytest.fixture( + params=[ + pd.offsets.Hour(2), + pd.offsets.Minute(120), + pd.Timedelta(hours=2).to_pytimedelta(), + pd.Timedelta(seconds=2 * 3600), + np.timedelta64(2, "h"), + np.timedelta64(120, "m"), + ], + ids=lambda x: type(x).__name__, +) +def two_hours(request): + """ + Several timedelta-like and DateOffset objects that each represent + a 2-hour timedelta + """ + return request.param + + +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] + + +@pytest.fixture( + params=[ + pd.Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + pd.Timedelta(seconds=30), + ] + + _common_mismatch +) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Hourly frequencies. 
+ """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(4, "h"), + pd.Timedelta(hours=23).to_pytimedelta(), + pd.Timedelta("23:00:00"), + ] + + _common_mismatch +) +def not_daily(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Daily frequencies. + """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + pd.Timedelta(days=365).to_pytimedelta(), + pd.Timedelta(days=365), + ] + + _common_mismatch +) +def mismatched_freq(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + +# ------------------------------------------------------------------ + + +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) +def box(request): + """ + Several array-like containers that should have effectively identical + behavior with respect to arithmetic operations. + """ + return request.param + + +@pytest.fixture( + params=[ + pd.Index, + pd.Series, + pytest.param(pd.DataFrame, marks=pytest.mark.xfail), + tm.to_array, + ], + ids=id_func, +) +def box_df_fail(request): + """ + Fixture equivalent to `box` fixture but xfailing the DataFrame case. + """ + return request.param + + +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) +def box_with_array(request): + """ + Fixture to test behavior for Index, Series, DataFrame, and pandas Array + classes + """ + return request.param + + +# alias so we can use the same fixture for multiple parameters in a test +box_with_array2 = box_with_array diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_datetime64.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_datetime64.py new file mode 100644 index 0000000..d3f9ac4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_datetime64.py @@ -0,0 +1,2397 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for datetime64 and datetime64tz dtypes +from datetime import datetime, timedelta +from itertools import product, starmap +import operator +import warnings + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs.conversion import localize_pydatetime +from pandas._libs.tslibs.offsets import shift_months +from pandas.compat.numpy import np_datetime64_compat +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.core.ops import roperator +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) + +# ------------------------------------------------------------------ +# Comparisons + + +class TestDatetime64ArrayLikeComparisons: + # Comparison tests for datetime64 vectors fully parametrized over + # DataFrame/Series/DatetimeIndex/DatetimeArray. Ideally all comparison + # tests will eventually end up here. 
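+
+    # Illustrative sketch (not taken from the pandas test suite itself): the
+    # ``box_with_array`` fixture re-wraps the same datetime64 data in each
+    # container type, so a single test body covers all of them, e.g.::
+    #
+    #     dti = pd.date_range("2016-01-01", periods=3)
+    #     for box in (pd.Index, pd.Series, pd.DataFrame, tm.to_array):
+    #         obj = tm.box_expected(dti, box)  # wrap dti as the given box
+    #         # ... run the same comparison assertions against ``obj``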
+ + def test_compare_zerodim(self, tz_naive_fixture, box_with_array): + # Test comparison with zero-dimensional array is unboxed + tz = tz_naive_fixture + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + dti = date_range("20130101", periods=3, tz=tz) + + other = np.array(dti.to_numpy()[0]) + + dtarr = tm.box_expected(dti, box) + result = dtarr <= other + expected = np.array([True, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + "foo", + -1, + 99, + 4.0, + object(), + timedelta(days=2), + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. This also matches the behavior + # of stdlib datetime.datetime + datetime(2001, 1, 1).date(), + # GH#19301 None and NaN are *not* cast to NaT for comparisons + None, + np.nan, + ], + ) + def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): + # GH#22074, GH#15966 + tz = tz_naive_fixture + + rng = date_range("1/1/2000", periods=10, tz=tz) + dtarr = tm.box_expected(rng, box_with_array) + assert_invalid_comparison(dtarr, other, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.timedelta_range("1ns", periods=10).array, + np.array(pd.timedelta_range("1ns", periods=10)), + list(pd.timedelta_range("1ns", periods=10)), + pd.timedelta_range("1 Day", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_dt64arr_cmp_arraylike_invalid(self, other, tz_naive_fixture): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data + assert_invalid_comparison(dta, other, tm.to_array) + + def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture): + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="h", periods=5, tz=tz)._data + + other = np.array([0, 1, 2, dta[3], pd.Timedelta(days=1)]) + result = dta == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dta != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + dta < other + with pytest.raises(TypeError, match=msg): + dta > other + with pytest.raises(TypeError, match=msg): + dta <= other + with pytest.raises(TypeError, match=msg): + dta >= other + + def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): + # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly + tz = tz_naive_fixture + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray + + ts = pd.Timestamp.now(tz) + ser = pd.Series([ts, pd.NaT]) + + # FIXME: Can't transpose because that loses the tz dtype on + # the NaT column + obj = tm.box_expected(ser, box, transpose=False) + + expected = pd.Series([True, False], dtype=np.bool_) + expected = tm.box_expected(expected, xbox, transpose=False) + + result = obj == ts + tm.assert_equal(result, expected) + + +class TestDatetime64SeriesComparison: + # TODO: moved from tests.series.test_operators; needs cleanup + + @pytest.mark.parametrize( + "pair", + 
[ + ( + [pd.Timestamp("2011-01-01"), NaT, pd.Timestamp("2011-01-03")], + [NaT, NaT, pd.Timestamp("2011-01-03")], + ), + ( + [pd.Timedelta("1 days"), NaT, pd.Timedelta("3 days")], + [NaT, NaT, pd.Timedelta("3 days")], + ), + ( + [pd.Period("2011-01", freq="M"), NaT, pd.Period("2011-03", freq="M")], + [NaT, NaT, pd.Period("2011-03", freq="M")], + ), + ], + ) + @pytest.mark.parametrize("reverse", [True, False]) + @pytest.mark.parametrize("dtype", [None, object]) + def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): + box = index_or_series + l, r = pair + if reverse: + # add lhs / rhs switched data + l, r = r, l + + left = Series(l, dtype=dtype) + right = box(r, dtype=dtype) + # Series, Index + + expected = Series([False, False, True]) + tm.assert_series_equal(left == right, expected) + + expected = Series([True, True, False]) + tm.assert_series_equal(left != right, expected) + + expected = Series([False, False, False]) + tm.assert_series_equal(left < right, expected) + + expected = Series([False, False, False]) + tm.assert_series_equal(left > right, expected) + + expected = Series([False, False, True]) + tm.assert_series_equal(left >= right, expected) + + expected = Series([False, False, True]) + tm.assert_series_equal(left <= right, expected) + + def test_comparison_invalid(self, tz_naive_fixture, box_with_array): + # GH#4968 + # invalid date/int comparisons + tz = tz_naive_fixture + ser = Series(range(5)) + ser2 = Series(pd.date_range("20010101", periods=5, tz=tz)) + + ser = tm.box_expected(ser, box_with_array) + ser2 = tm.box_expected(ser2, box_with_array) + + assert_invalid_comparison(ser, ser2, box_with_array) + + @pytest.mark.parametrize( + "data", + [ + [Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")], + [Timedelta("1 days"), NaT, Timedelta("3 days")], + [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")], + ], + ) + @pytest.mark.parametrize("dtype", [None, object]) + def test_nat_comparisons_scalar(self, dtype, data, box_with_array): + if box_with_array is tm.to_array and dtype is object: + # dont bother testing ndarray comparison methods as this fails + # on older numpys (since they check object identity) + return + + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + left = Series(data, dtype=dtype) + left = tm.box_expected(left, box_with_array) + + expected = [False, False, False] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(left == NaT, expected) + tm.assert_equal(NaT == left, expected) + + expected = [True, True, True] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(left != NaT, expected) + tm.assert_equal(NaT != left, expected) + + expected = [False, False, False] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(left < NaT, expected) + tm.assert_equal(NaT > left, expected) + tm.assert_equal(left <= NaT, expected) + tm.assert_equal(NaT >= left, expected) + + tm.assert_equal(left > NaT, expected) + tm.assert_equal(NaT < left, expected) + tm.assert_equal(left >= NaT, expected) + tm.assert_equal(NaT <= left, expected) + + @pytest.mark.parametrize("val", [datetime(2000, 1, 4), datetime(2000, 1, 5)]) + def test_series_comparison_scalars(self, val): + series = Series(date_range("1/1/2000", periods=10)) + + result = series > val + expected = Series([x > val for x in series]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")] + ) + def test_timestamp_compare_series(self, left, right): + 
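+        # ``left``/``right`` are paired operator names from the parametrization
+        # above (e.g. "lt"/"gt"): comparing with the Timestamp on the right is
+        # checked against the mirrored comparison with the Series on the left.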
# see gh-4982 + # Make sure we can compare Timestamps on the right AND left hand side. + ser = pd.Series(pd.date_range("20010101", periods=10), name="dates") + s_nat = ser.copy(deep=True) + + ser[0] = pd.Timestamp("nat") + ser[3] = pd.Timestamp("nat") + + left_f = getattr(operator, left) + right_f = getattr(operator, right) + + # No NaT + expected = left_f(ser, pd.Timestamp("20010109")) + result = right_f(pd.Timestamp("20010109"), ser) + tm.assert_series_equal(result, expected) + + # NaT + expected = left_f(ser, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), ser) + tm.assert_series_equal(result, expected) + + # Compare to Timestamp with series containing NaT + expected = left_f(s_nat, pd.Timestamp("20010109")) + result = right_f(pd.Timestamp("20010109"), s_nat) + tm.assert_series_equal(result, expected) + + # Compare to NaT with series containing NaT + expected = left_f(s_nat, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), s_nat) + tm.assert_series_equal(result, expected) + + def test_dt64arr_timestamp_equality(self, box_with_array): + # GH#11034 + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + ser = pd.Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) + ser = tm.box_expected(ser, box_with_array) + + result = ser != ser + expected = tm.box_expected([False, True], xbox) + tm.assert_equal(result, expected) + + result = ser != ser[0] + expected = tm.box_expected([False, True], xbox) + tm.assert_equal(result, expected) + + result = ser != ser[1] + expected = tm.box_expected([True, True], xbox) + tm.assert_equal(result, expected) + + result = ser == ser + expected = tm.box_expected([True, False], xbox) + tm.assert_equal(result, expected) + + result = ser == ser[0] + expected = tm.box_expected([True, False], xbox) + tm.assert_equal(result, expected) + + result = ser == ser[1] + expected = tm.box_expected([False, False], xbox) + tm.assert_equal(result, expected) + + +class TestDatetimeIndexComparisons: + + # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.lt, operator.ge, operator.le], + ) + def test_comparators(self, op): + index = tm.makeDateIndex(100) + element = index[len(index) // 2] + element = Timestamp(element).to_datetime64() + + arr = np.array(index) + arr_result = op(arr, element) + index_result = op(index, element) + + assert isinstance(index_result, np.ndarray) + tm.assert_numpy_array_equal(arr_result, index_result) + + @pytest.mark.parametrize( + "other", + [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) + def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=2, tz=tz) + if tz is not None: + if isinstance(other, np.datetime64): + # no tzaware version available + return + other = localize_pydatetime(other, dti.tzinfo) + + result = dti == other + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dti > other + expected = np.array([False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = dti >= other + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = dti < other + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dti <= other + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, 
object]) + def test_dti_cmp_nat(self, dtype, box_with_array): + if box_with_array is tm.to_array and dtype is object: + # dont bother testing ndarray comparison methods as this fails + # on older numpys (since they check object identity) + return + + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + left = pd.DatetimeIndex( + [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] + ) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp("2011-01-03")]) + + left = tm.box_expected(left, box_with_array) + right = tm.box_expected(right, box_with_array) + + lhs, rhs = left, right + if dtype is object: + lhs, rhs = left.astype(object), right.astype(object) + + result = rhs == lhs + expected = np.array([False, False, True]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + result = lhs != rhs + expected = np.array([True, True, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + expected = np.array([False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(lhs == pd.NaT, expected) + tm.assert_equal(pd.NaT == rhs, expected) + + expected = np.array([True, True, True]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(lhs != pd.NaT, expected) + tm.assert_equal(pd.NaT != lhs, expected) + + expected = np.array([False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(lhs < pd.NaT, expected) + tm.assert_equal(pd.NaT > lhs, expected) + + def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): + fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) + fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) + + didx1 = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) + didx2 = pd.DatetimeIndex( + ["2014-02-01", "2014-03-01", pd.NaT, pd.NaT, "2014-06-01", "2014-07-01"] + ) + darr = np.array( + [ + np_datetime64_compat("2014-02-01 00:00Z"), + np_datetime64_compat("2014-03-01 00:00Z"), + np_datetime64_compat("nat"), + np.datetime64("nat"), + np_datetime64_compat("2014-06-01 00:00Z"), + np_datetime64_compat("2014-07-01 00:00Z"), + ] + ) + + cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] + + # Check pd.NaT is handles as the same as np.nan + with tm.assert_produces_warning(None): + for idx1, idx2 in cases: + + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + with tm.assert_produces_warning(None): + for idx1, val in [(fidx1, np.nan), (didx1, pd.NaT)]: + result = idx1 < val + expected = np.array([False, False, False, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + result = idx1 > val + tm.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + tm.assert_numpy_array_equal(result, 
expected) + result = idx1 >= val + tm.assert_numpy_array_equal(result, expected) + + result = idx1 == val + tm.assert_numpy_array_equal(result, expected) + + result = idx1 != val + expected = np.array([True, True, True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + + # Check pd.NaT is handles as the same as np.nan + with tm.assert_produces_warning(None): + for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]: + result = idx1 < val + expected = np.array([True, False, False, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + result = idx1 > val + expected = np.array([False, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + expected = np.array([True, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + result = idx1 >= val + expected = np.array([False, False, True, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 == val + expected = np.array([False, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 != val + expected = np.array([True, True, False, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + def test_comparison_tzawareness_compat(self, op, box_df_fail): + # GH#18162 + box = box_df_fail + + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") + + dr = tm.box_expected(dr, box) + dz = tm.box_expected(dz, box) + + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + op(dr, dz) + + # FIXME: DataFrame case fails to raise for == and !=, wrong + # message for inequalities + with pytest.raises(TypeError, match=msg): + op(dr, list(dz)) + with pytest.raises(TypeError, match=msg): + op(dr, np.array(list(dz), dtype=object)) + with pytest.raises(TypeError, match=msg): + op(dz, dr) + + # FIXME: DataFrame case fails to raise for == and !=, wrong + # message for inequalities + with pytest.raises(TypeError, match=msg): + op(dz, list(dr)) + with pytest.raises(TypeError, match=msg): + op(dz, np.array(list(dr), dtype=object)) + + # The aware==aware and naive==naive comparisons should *not* raise + assert np.all(dr == dr) + assert np.all(dr == list(dr)) + assert np.all(list(dr) == dr) + assert np.all(np.array(list(dr), dtype=object) == dr) + assert np.all(dr == np.array(list(dr), dtype=object)) + + assert np.all(dz == dz) + assert np.all(dz == list(dz)) + assert np.all(list(dz) == dz) + assert np.all(np.array(list(dz), dtype=object) == dz) + assert np.all(dz == np.array(list(dz), dtype=object)) + + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): + # GH#18162 + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") + + dr = tm.box_expected(dr, box_with_array) + dz = tm.box_expected(dz, box_with_array) + + # Check comparisons against scalar Timestamps + ts = pd.Timestamp("2000-03-14 01:59") + ts_tz = pd.Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam") + + assert np.all(dr > ts) + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + op(dr, ts_tz) + + assert np.all(dz > ts_tz) + with pytest.raises(TypeError, match=msg): + op(dz, ts) + + # GH#12601: Check comparison against 
Timestamps and DatetimeIndex + with pytest.raises(TypeError, match=msg): + op(ts, dz) + + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + @pytest.mark.parametrize( + "other", + [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) + # Bug in NumPy? https://github.com/numpy/numpy/issues/13841 + # Raising in __eq__ will fallback to NumPy, which warns, fails, + # then re-raises the original exception. So we just need to ignore. + @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") + @pytest.mark.filterwarnings("ignore:Converting timezone-aware:FutureWarning") + def test_scalar_comparison_tzawareness( + self, op, other, tz_aware_fixture, box_with_array + ): + tz = tz_aware_fixture + dti = pd.date_range("2016-01-01", periods=2, tz=tz) + + dtarr = tm.box_expected(dti, box_with_array) + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + op(dtarr, other) + with pytest.raises(TypeError, match=msg): + op(other, dtarr) + + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + def test_nat_comparison_tzawareness(self, op): + # GH#19276 + # tzaware DatetimeIndex should not raise when compared to NaT + dti = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) + expected = np.array([op == operator.ne] * len(dti)) + result = op(dti, pd.NaT) + tm.assert_numpy_array_equal(result, expected) + + result = op(dti.tz_localize("US/Pacific"), pd.NaT) + tm.assert_numpy_array_equal(result, expected) + + def test_dti_cmp_str(self, tz_naive_fixture): + # GH#22074 + # regardless of tz, we expect these comparisons are valid + tz = tz_naive_fixture + rng = date_range("1/1/2000", periods=10, tz=tz) + other = "1/1/2000" + + result = rng == other + expected = np.array([True] + [False] * 9) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + expected = np.array([False] + [True] * 9) + tm.assert_numpy_array_equal(result, expected) + + result = rng < other + expected = np.array([False] * 10) + tm.assert_numpy_array_equal(result, expected) + + result = rng <= other + expected = np.array([True] + [False] * 9) + tm.assert_numpy_array_equal(result, expected) + + result = rng > other + expected = np.array([False] + [True] * 9) + tm.assert_numpy_array_equal(result, expected) + + result = rng >= other + expected = np.array([True] * 10) + tm.assert_numpy_array_equal(result, expected) + + def test_dti_cmp_list(self): + rng = date_range("1/1/2000", periods=10) + + result = rng == list(rng) + expected = rng == rng + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + pd.timedelta_range("1D", periods=10), + pd.timedelta_range("1D", periods=10).to_series(), + pd.timedelta_range("1D", periods=10).asi8.view("m8[ns]"), + ], + ids=lambda x: type(x).__name__, + ) + def test_dti_cmp_tdi_tzawareness(self, other): + # GH#22074 + # reversion test that we _don't_ call _assert_tzawareness_compat + # when comparing against TimedeltaIndex + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") + + result = dti == other + expected = np.array([False] * 10) + tm.assert_numpy_array_equal(result, expected) + + result = dti != other + expected = np.array([True] * 10) + tm.assert_numpy_array_equal(result, expected) + msg = "Invalid comparison between" + with pytest.raises(TypeError, match=msg): + dti < other + with 
pytest.raises(TypeError, match=msg): + dti <= other + with pytest.raises(TypeError, match=msg): + dti > other + with pytest.raises(TypeError, match=msg): + dti >= other + + def test_dti_cmp_object_dtype(self): + # GH#22074 + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") + + other = dti.astype("O") + + result = dti == other + expected = np.array([True] * 10) + tm.assert_numpy_array_equal(result, expected) + + other = dti.tz_localize(None) + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + # tzawareness failure + dti != other + + other = np.array(list(dti[:5]) + [Timedelta(days=1)] * 5) + result = dti == other + expected = np.array([True] * 5 + [False] * 5) + tm.assert_numpy_array_equal(result, expected) + msg = "Cannot compare type" + with pytest.raises(TypeError, match=msg): + dti >= other + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestDatetime64Arithmetic: + # This class is intended for "finished" tests that are fully parametrized + # over DataFrame/Series/Index/DatetimeArray + + # ------------------------------------------------------------- + # Addition/Subtraction of timedelta-like + + def test_dt64arr_add_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): + # GH#22005, GH#22163 check DataFrame doesn't raise TypeError + tz = tz_naive_fixture + + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = rng + two_hours + tm.assert_equal(result, expected) + + def test_dt64arr_iadd_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): + tz = tz_naive_fixture + + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + rng += two_hours + tm.assert_equal(rng, expected) + + def test_dt64arr_sub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): + tz = tz_naive_fixture + + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = rng - two_hours + tm.assert_equal(result, expected) + + def test_dt64arr_isub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): + tz = tz_naive_fixture + + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + rng -= two_hours + tm.assert_equal(rng, expected) + + # TODO: redundant with test_dt64arr_add_timedeltalike_scalar + def test_dt64arr_add_td64_scalar(self, box_with_array): + # scalar timedeltas/np.timedelta64 objects + # operate with np.timedelta64 correctly + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + + expected = Series( + [Timestamp("20130101 9:01:01"), Timestamp("20130101 9:02:01")] + ) + + dtarr = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = dtarr + np.timedelta64(1, "s") + tm.assert_equal(result, expected) + result = 
np.timedelta64(1, "s") + dtarr + tm.assert_equal(result, expected) + + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) + expected = tm.box_expected(expected, box_with_array) + + result = dtarr + np.timedelta64(5, "ms") + tm.assert_equal(result, expected) + result = np.timedelta64(5, "ms") + dtarr + tm.assert_equal(result, expected) + + def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): + # GH#23320 special handling for timedelta64("NaT") + tz = tz_naive_fixture + + dti = pd.date_range("1994-04-01", periods=9, tz=tz, freq="QS") + other = np.timedelta64("NaT") + expected = pd.DatetimeIndex(["NaT"] * 9, tz=tz) + + # FIXME: fails with transpose=True due to tz-aware DataFrame + # transpose bug + obj = tm.box_expected(dti, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + msg = "cannot subtract" + with pytest.raises(TypeError, match=msg): + other - obj + + def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): + + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) + tdarr = tdi.values + + expected = pd.date_range("2015-12-31", periods=3, tz=tz) + + dtarr = tm.box_expected(dti, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = dtarr + tdarr + tm.assert_equal(result, expected) + result = tdarr + dtarr + tm.assert_equal(result, expected) + + expected = pd.date_range("2016-01-02", periods=3, tz=tz) + expected = tm.box_expected(expected, box_with_array) + + result = dtarr - tdarr + tm.assert_equal(result, expected) + msg = "cannot subtract|(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + tdarr - dtarr + + # ----------------------------------------------------------------- + # Subtraction of datetime-like scalars + + @pytest.mark.parametrize( + "ts", + [ + pd.Timestamp("2013-01-01"), + pd.Timestamp("2013-01-01").to_pydatetime(), + pd.Timestamp("2013-01-01").to_datetime64(), + ], + ) + def test_dt64arr_sub_dtscalar(self, box_with_array, ts): + # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype + idx = pd.date_range("2013-01-01", periods=3) + idx = tm.box_expected(idx, box_with_array) + + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) + expected = tm.box_expected(expected, box_with_array) + + result = idx - ts + tm.assert_equal(result, expected) + + def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): + # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano + # for DataFrame operation + dt64 = np.datetime64("2013-01-01") + assert dt64.dtype == "datetime64[D]" + + dti = pd.date_range("20130101", periods=3) + dtarr = tm.box_expected(dti, box_with_array) + + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) + expected = tm.box_expected(expected, box_with_array) + + result = dtarr - dt64 + tm.assert_equal(result, expected) + + result = dt64 - dtarr + tm.assert_equal(result, -expected) + + def test_dt64arr_sub_timestamp(self, box_with_array): + ser = pd.date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern") + ts = ser[0] + + ser = tm.box_expected(ser, box_with_array) + + delta_series = pd.Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")]) + expected = 
tm.box_expected(delta_series, box_with_array) + + tm.assert_equal(ser - ts, expected) + tm.assert_equal(ts - ser, -expected) + + def test_dt64arr_sub_NaT(self, box_with_array): + # GH#18808 + dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp("19900315")]) + ser = tm.box_expected(dti, box_with_array) + + result = ser - pd.NaT + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + dti_tz = dti.tz_localize("Asia/Tokyo") + ser_tz = tm.box_expected(dti_tz, box_with_array) + + result = ser_tz - pd.NaT + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + # ------------------------------------------------------------- + # Subtraction of datetime-like array-like + + def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): + dti = pd.date_range("2016-01-01", periods=3, tz=None) + dt64vals = dti.values + + dtarr = tm.box_expected(dti, box_with_array) + + expected = dtarr - dtarr + result = dtarr - dt64vals + tm.assert_equal(result, expected) + result = dt64vals - dtarr + tm.assert_equal(result, expected) + + def test_dt64arr_aware_sub_dt64ndarray_raises( + self, tz_aware_fixture, box_with_array + ): + + tz = tz_aware_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + dt64vals = dti.values + + dtarr = tm.box_expected(dti, box_with_array) + msg = "subtraction must have the same timezones or" + with pytest.raises(TypeError, match=msg): + dtarr - dt64vals + with pytest.raises(TypeError, match=msg): + dt64vals - dtarr + + # ------------------------------------------------------------- + # Addition of datetime-like others (invalid) + + def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): + + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + dt64vals = dti.values + + dtarr = tm.box_expected(dti, box_with_array) + msg = "cannot add" + with pytest.raises(TypeError, match=msg): + dtarr + dt64vals + with pytest.raises(TypeError, match=msg): + dt64vals + dtarr + + def test_dt64arr_add_timestamp_raises(self, box_with_array): + # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 + idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) + idx = tm.box_expected(idx, box_with_array) + msg = "cannot add" + with pytest.raises(TypeError, match=msg): + idx + Timestamp("2011-01-01") + with pytest.raises(TypeError, match=msg): + Timestamp("2011-01-01") + idx + + # ------------------------------------------------------------- + # Other Invalid Addition/Subtraction + + @pytest.mark.parametrize( + "other", + [ + 3.14, + np.array([2.0, 3.0]), + # GH#13078 datetime +/- Period is invalid + pd.Period("2011-01-01", freq="D"), + ], + ) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) + dtarr = tm.box_expected(dti, box_with_array) + msg = "|".join( + [ + "unsupported operand type", + "cannot (add|subtract)", + "cannot use operands with types", + "ufunc '?(add|subtract)'? 
cannot use operands with types", + ] + ) + assert_invalid_addsub_type(dtarr, other, msg) + + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_parr( + self, dti_freq, pi_freq, box_with_array, box_with_array2 + ): + # GH#20049 subtracting PeriodIndex should raise TypeError + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) + pi = dti.to_period(pi_freq) + + dtarr = tm.box_expected(dti, box_with_array) + parr = tm.box_expected(pi, box_with_array2) + msg = "|".join( + [ + "cannot (add|subtract)", + "unsupported operand", + "descriptor.*requires", + "ufunc.*cannot use operands", + ] + ) + assert_invalid_addsub_type(dtarr, parr, msg) + + +class TestDatetime64DateOffsetArithmetic: + + # ------------------------------------------------------------- + # Tick DateOffsets + + # TODO: parametrize over timezone? + def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): + # GH#4532 + # operate with pd.offsets + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:01:05"), Timestamp("20130101 9:02:05")] + ) + + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = ser + pd.offsets.Second(5) + tm.assert_equal(result, expected) + + result2 = pd.offsets.Second(5) + ser + tm.assert_equal(result2, expected) + + def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): + # GH#4532 + # operate with pd.offsets + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:00:55"), Timestamp("20130101 9:01:55")] + ) + + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = ser - pd.offsets.Second(5) + tm.assert_equal(result, expected) + + result2 = -pd.offsets.Second(5) + ser + tm.assert_equal(result2, expected) + msg = "(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + pd.offsets.Second(5) - ser + + @pytest.mark.parametrize( + "cls_name", ["Day", "Hour", "Minute", "Second", "Milli", "Micro", "Nano"] + ) + def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, box_with_array): + # GH#4532 + # smoke tests for valid DateOffsets + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + ser = tm.box_expected(ser, box_with_array) + + offset_cls = getattr(pd.offsets, cls_name) + ser + offset_cls(5) + offset_cls(5) + ser + ser - offset_cls(5) + + def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): + # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype + tz = tz_aware_fixture + if tz == "US/Pacific": + dates = date_range("2012-11-01", periods=3, tz=tz) + offset = dates + pd.offsets.Hour(5) + assert dates[0] + pd.offsets.Hour(5) == offset[0] + + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + expected = DatetimeIndex( + ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], + freq="H", + tz=tz, + ) + + dates = tm.box_expected(dates, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + # TODO: parametrize over the scalar being added? radd? sub? 
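+        # The three additions below use equivalent 5-hour offsets -- a pandas
+        # DateOffset, a NumPy timedelta64, and a stdlib timedelta -- and each
+        # must produce the same shifted, tz-aware ``expected`` result.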
+ offset = dates + pd.offsets.Hour(5) + tm.assert_equal(offset, expected) + offset = dates + np.timedelta64(5, "h") + tm.assert_equal(offset, expected) + offset = dates + timedelta(hours=5) + tm.assert_equal(offset, expected) + + # ------------------------------------------------------------- + # RelativeDelta DateOffsets + + def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): + # GH#10699 + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) + vec = tm.box_expected(vec, box_with_array) + vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec + + # DateOffset relativedelta fastpath + relative_kwargs = [ + ("years", 2), + ("months", 5), + ("days", 3), + ("hours", 5), + ("minutes", 10), + ("seconds", 2), + ("microseconds", 5), + ] + for i, kwd in enumerate(relative_kwargs): + off = pd.DateOffset(**dict([kwd])) + + expected = DatetimeIndex([x + off for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec + off) + + expected = DatetimeIndex([x - off for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec - off) + + off = pd.DateOffset(**dict(relative_kwargs[: i + 1])) + + expected = DatetimeIndex([x + off for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec + off) + + expected = DatetimeIndex([x - off for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec - off) + msg = "(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + off - vec + + # ------------------------------------------------------------- + # Non-Tick, Non-RelativeDelta DateOffsets + + # TODO: redundant with test_dt64arr_add_sub_DateOffset? 
that includes + # tz-aware cases which this does not + @pytest.mark.parametrize( + "cls_and_kwargs", + [ + "YearBegin", + ("YearBegin", {"month": 5}), + "YearEnd", + ("YearEnd", {"month": 5}), + "MonthBegin", + "MonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + "Week", + ("Week", {"weekday": 3}), + "Week", + ("Week", {"weekday": 6}), + "BusinessDay", + "BDay", + "QuarterEnd", + "QuarterBegin", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "BMonthBegin", + "BMonthEnd", + "BusinessHour", + "BYearBegin", + "BYearEnd", + "BQuarterBegin", + ("LastWeekOfMonth", {"weekday": 2}), + ( + "FY5253Quarter", + { + "qtr_with_extra_week": 1, + "startingMonth": 1, + "weekday": 2, + "variation": "nearest", + }, + ), + ("FY5253", {"weekday": 0, "startingMonth": 2, "variation": "nearest"}), + ("WeekOfMonth", {"weekday": 2, "week": 2}), + "Easter", + ("DateOffset", {"day": 4}), + ("DateOffset", {"month": 5}), + ], + ) + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [0, 5]) + def test_dt64arr_add_sub_DateOffsets( + self, box_with_array, n, normalize, cls_and_kwargs + ): + # GH#10699 + # assert vectorized operation matches pointwise operations + + if isinstance(cls_and_kwargs, tuple): + # If cls_name param is a tuple, then 2nd entry is kwargs for + # the offset constructor + cls_name, kwargs = cls_and_kwargs + else: + cls_name = cls_and_kwargs + kwargs = {} + + if n == 0 and cls_name in [ + "WeekOfMonth", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + ]: + # passing n = 0 is invalid for these offset classes + return + + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) + vec = tm.box_expected(vec, box_with_array) + vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec + + offset_cls = getattr(pd.offsets, cls_name) + + with warnings.catch_warnings(record=True): + # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being + # applied to Series or DatetimeIndex + # we aren't testing that here, so ignore. 
+ warnings.simplefilter("ignore", PerformanceWarning) + + offset = offset_cls(n, normalize=normalize, **kwargs) + + expected = DatetimeIndex([x + offset for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec + offset) + + expected = DatetimeIndex([x - offset for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, vec - offset) + + expected = DatetimeIndex([offset + x for x in vec_items]) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(expected, offset + vec) + msg = "(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + offset - vec + + def test_dt64arr_add_sub_DateOffset(self, box_with_array): + # GH#10699 + s = date_range("2000-01-01", "2000-01-31", name="a") + s = tm.box_expected(s, box_with_array) + result = s + pd.DateOffset(years=1) + result2 = pd.DateOffset(years=1) + s + exp = date_range("2001-01-01", "2001-01-31", name="a") + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + result = s - pd.DateOffset(years=1) + exp = date_range("1999-01-01", "1999-01-31", name="a") + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + s = tm.box_expected(s, box_with_array) + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = DatetimeIndex( + [ + Timestamp("2000-01-16 00:15:00", tz="US/Central"), + Timestamp("2000-02-16", tz="US/Central"), + ], + name="a", + ) + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + s = tm.box_expected(s, box_with_array) + result = s + pd.offsets.MonthEnd() + result2 = pd.offsets.MonthEnd() + s + exp = DatetimeIndex( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + # TODO: __sub__, __rsub__ + def test_dt64arr_add_mixed_offset_array(self, box_with_array): + # GH#10699 + # array of offsets + s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) + s = tm.box_expected(s, box_with_array) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) + other = tm.box_expected(other, box_with_array) + result = s + other + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2000-2-29")]) + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + + # same offset + other = pd.Index( + [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)] + ) + other = tm.box_expected(other, box_with_array) + result = s + other + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2001-2-1")]) + exp = tm.box_expected(exp, box_with_array) + tm.assert_equal(result, exp) + + # TODO: overlap with test_dt64arr_add_mixed_offset_array? 
+ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): + # GH#18849 + + tz = tz_naive_fixture + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + + other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + res = dtarr + other + expected = DatetimeIndex( + [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(res, expected) + + with tm.assert_produces_warning(warn): + res2 = other + dtarr + tm.assert_equal(res2, expected) + + with tm.assert_produces_warning(warn): + res = dtarr - other + expected = DatetimeIndex( + [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(res, expected) + + @pytest.mark.parametrize( + "op, offset, exp, exp_freq", + [ + ( + "__add__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2014-04-11"), + Timestamp("2015-04-11"), + Timestamp("2016-04-11"), + Timestamp("2017-04-11"), + ], + None, + ), + ( + "__add__", + pd.DateOffset(months=3), + [ + Timestamp("2014-04-01"), + Timestamp("2015-04-01"), + Timestamp("2016-04-01"), + Timestamp("2017-04-01"), + ], + "AS-APR", + ), + ( + "__sub__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2013-09-21"), + Timestamp("2014-09-21"), + Timestamp("2015-09-21"), + Timestamp("2016-09-21"), + ], + None, + ), + ( + "__sub__", + pd.DateOffset(months=3), + [ + Timestamp("2013-10-01"), + Timestamp("2014-10-01"), + Timestamp("2015-10-01"), + Timestamp("2016-10-01"), + ], + "AS-OCT", + ), + ], + ) + def test_dti_add_sub_nonzero_mth_offset( + self, op, offset, exp, exp_freq, tz_aware_fixture, box_with_array + ): + # GH 26258 + tz = tz_aware_fixture + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) + date = tm.box_expected(date, box_with_array, False) + mth = getattr(date, op) + result = mth(offset) + + expected = pd.DatetimeIndex(exp, tz=tz, freq=exp_freq) + expected = tm.box_expected(expected, box_with_array, False) + tm.assert_equal(result, expected) + + +class TestDatetime64OverflowHandling: + # TODO: box + de-duplicate + + def test_dt64_overflow_masking(self, box_with_array): + # GH#25317 + left = Series([Timestamp("1969-12-31")]) + right = Series([NaT]) + + left = tm.box_expected(left, box_with_array) + right = tm.box_expected(right, box_with_array) + + expected = TimedeltaIndex([NaT]) + expected = tm.box_expected(expected, box_with_array) + + result = left - right + tm.assert_equal(result, expected) + + def test_dt64_series_arith_overflow(self): + # GH#12534, fixed by GH#19024 + dt = pd.Timestamp("1700-01-31") + td = pd.Timedelta("20000 Days") + dti = pd.date_range("1949-09-30", freq="100Y", periods=4) + ser = pd.Series(dti) + msg = "Overflow in int64 addition" + with pytest.raises(OverflowError, match=msg): + ser - dt + with pytest.raises(OverflowError, match=msg): + dt - ser + with pytest.raises(OverflowError, match=msg): + ser + td + with pytest.raises(OverflowError, match=msg): + td + ser + + ser.iloc[-1] = pd.NaT + expected = pd.Series( + ["2004-10-03", "2104-10-04", "2204-10-04", "NaT"], dtype="datetime64[ns]" + ) + res = ser + td + tm.assert_series_equal(res, expected) + res = td + ser + tm.assert_series_equal(res, expected) + + ser.iloc[1:] = pd.NaT + expected = pd.Series( + ["91279 Days", "NaT", 
"NaT", "NaT"], dtype="timedelta64[ns]" + ) + res = ser - dt + tm.assert_series_equal(res, expected) + res = dt - ser + tm.assert_series_equal(res, -expected) + + def test_datetimeindex_sub_timestamp_overflow(self): + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] + msg = "Overflow in int64 addition" + for variant in ts_neg_variants: + with pytest.raises(OverflowError, match=msg): + dtimax - variant + + expected = pd.Timestamp.max.value - tspos.value + for variant in ts_pos_variants: + res = dtimax - variant + assert res[1].value == expected + + expected = pd.Timestamp.min.value - tsneg.value + for variant in ts_neg_variants: + res = dtimin - variant + assert res[1].value == expected + + for variant in ts_pos_variants: + with pytest.raises(OverflowError, match=msg): + dtimin - variant + + def test_datetimeindex_sub_datetimeindex_overflow(self): + # GH#22492, GH#22508 + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) + + ts_neg = pd.to_datetime(["1950-01-01", "1950-01-01"]) + ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]) + + # General tests + expected = pd.Timestamp.max.value - ts_pos[1].value + result = dtimax - ts_pos + assert result[1].value == expected + + expected = pd.Timestamp.min.value - ts_neg[1].value + result = dtimin - ts_neg + assert result[1].value == expected + msg = "Overflow in int64 addition" + with pytest.raises(OverflowError, match=msg): + dtimax - ts_neg + + with pytest.raises(OverflowError, match=msg): + dtimin - ts_pos + + # Edge cases + tmin = pd.to_datetime([pd.Timestamp.min]) + t1 = tmin + pd.Timedelta.max + pd.Timedelta("1us") + with pytest.raises(OverflowError, match=msg): + t1 - tmin + + tmax = pd.to_datetime([pd.Timestamp.max]) + t2 = tmax + pd.Timedelta.min - pd.Timedelta("1us") + with pytest.raises(OverflowError, match=msg): + tmax - t2 + + +class TestTimestampSeriesArithmetic: + def test_empty_series_add_sub(self): + # GH#13844 + a = Series(dtype="M8[ns]") + b = Series(dtype="m8[ns]") + tm.assert_series_equal(a, a + b) + tm.assert_series_equal(a, a - b) + tm.assert_series_equal(a, b + a) + msg = "cannot subtract" + with pytest.raises(TypeError, match=msg): + b - a + + def test_operators_datetimelike(self): + + # ## timedelta64 ### + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + # ## datetime64 ### + dt1 = Series( + [ + pd.Timestamp("20111230"), + pd.Timestamp("20120101"), + pd.Timestamp("20120103"), + ] + ) + dt1.iloc[2] = np.nan + dt2 = Series( + [ + pd.Timestamp("20111231"), + pd.Timestamp("20120102"), + pd.Timestamp("20120104"), + ] + ) + dt1 - dt2 + dt2 - dt1 + + # datetime64 with timetimedelta + dt1 + td1 + td1 + dt1 + dt1 - td1 + + # timetimedelta with datetime64 + td1 + dt1 + dt1 + td1 + + def test_dt64ser_sub_datetime_dtype(self): + ts = Timestamp(datetime(1993, 1, 7, 13, 30, 00)) + dt = datetime(1993, 6, 22, 13, 30) + ser = Series([ts]) + result = pd.to_timedelta(np.abs(ser - dt)) + assert result.dtype == "timedelta64[ns]" + + # ------------------------------------------------------------- + # TODO: This next block 
of tests came from tests.series.test_operators, + # needs to be de-duplicated and parametrized over `box` classes + + def test_operators_datetimelike_invalid(self, all_arithmetic_operators): + # these are all TypeEror ops + op_str = all_arithmetic_operators + + def check(get_ser, test_ser): + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + op = getattr(get_ser, op_str, None) + # Previously, _validate_for_numeric_binop in core/indexes/base.py + # did this for us. + with pytest.raises( + TypeError, match="operate|[cC]annot|unsupported operand" + ): + op(test_ser) + + # ## timedelta64 ### + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + # ## datetime64 ### + dt1 = Series( + [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + ) + dt1.iloc[2] = np.nan + dt2 = Series( + [Timestamp("20111231"), Timestamp("20120102"), Timestamp("20120104")] + ) + if op_str not in ["__sub__", "__rsub__"]: + check(dt1, dt2) + + # ## datetime64 with timetimedelta ### + # TODO(jreback) __rsub__ should raise? + if op_str not in ["__add__", "__radd__", "__sub__"]: + check(dt1, td1) + + # 8260, 10763 + # datetime64 with tz + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") + dt2 = dt1.copy() + dt2.iloc[2] = np.nan + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) + td2 = td1.copy() + td2.iloc[1] = np.nan + + if op_str not in ["__add__", "__radd__", "__sub__", "__rsub__"]: + check(dt2, td2) + + def test_sub_single_tz(self): + # GH#12290 + s1 = Series([pd.Timestamp("2016-02-10", tz="America/Sao_Paulo")]) + s2 = Series([pd.Timestamp("2016-02-08", tz="America/Sao_Paulo")]) + result = s1 - s2 + expected = Series([Timedelta("2days")]) + tm.assert_series_equal(result, expected) + result = s2 - s1 + expected = Series([Timedelta("-2days")]) + tm.assert_series_equal(result, expected) + + def test_dt64tz_series_sub_dtitz(self): + # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series + # (with same tz) raises, fixed by #19024 + dti = pd.date_range("1999-09-30", periods=10, tz="US/Pacific") + ser = pd.Series(dti) + expected = pd.Series(pd.TimedeltaIndex(["0days"] * 10)) + + res = dti - ser + tm.assert_series_equal(res, expected) + res = ser - dti + tm.assert_series_equal(res, expected) + + def test_sub_datetime_compat(self): + # see GH#14088 + s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT]) + dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + exp = Series([Timedelta("1 days"), pd.NaT]) + tm.assert_series_equal(s - dt, exp) + tm.assert_series_equal(s - Timestamp(dt), exp) + + def test_dt64_series_add_mixed_tick_DateOffset(self): + # GH#4532 + # operate with pd.offsets + s = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + + result = s + pd.offsets.Milli(5) + result2 = pd.offsets.Milli(5) + s + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5) + expected = Series( + [Timestamp("20130101 9:06:00.005"), Timestamp("20130101 9:07:00.005")] + ) + tm.assert_series_equal(result, expected) + + def test_datetime64_ops_nat(self): + # GH#11349 + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], 
dtype="datetime64[ns]") + + # subtraction + tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) + msg = "Unary negative expects" + with pytest.raises(TypeError, match=msg): + -single_nat_dtype_datetime + datetime_series + + tm.assert_series_equal( + -NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) + with pytest.raises(TypeError, match=msg): + -single_nat_dtype_datetime + nat_series_dtype_timestamp + + # addition + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) + + # ------------------------------------------------------------- + # Invalid Operations + # TODO: this block also needs to be de-duplicated and parametrized + + @pytest.mark.parametrize( + "dt64_series", + [ + Series([Timestamp("19900315"), Timestamp("19900315")]), + Series([pd.NaT, Timestamp("19900315")]), + Series([pd.NaT, pd.NaT], dtype="datetime64[ns]"), + ], + ) + @pytest.mark.parametrize("one", [1, 1.0, np.array(1)]) + def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): + # multiplication + msg = "cannot perform .* with this index type" + with pytest.raises(TypeError, match=msg): + dt64_series * one + with pytest.raises(TypeError, match=msg): + one * dt64_series + + # division + with pytest.raises(TypeError, match=msg): + dt64_series / one + with pytest.raises(TypeError, match=msg): + one / dt64_series + + # TODO: parametrize over box + @pytest.mark.parametrize("op", ["__add__", "__radd__", "__sub__", "__rsub__"]) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_dt64_series_add_intlike(self, tz, op): + # GH#19123 + dti = pd.DatetimeIndex(["2016-01-02", "2016-02-03", "NaT"], tz=tz) + ser = Series(dti) + + other = Series([20, 30, 40], dtype="uint8") + + method = getattr(ser, op) + msg = "|".join( + [ + "Addition/subtraction of integers and integer-arrays", + "cannot subtract .* from ndarray", + ] + ) + with pytest.raises(TypeError, match=msg): + method(1) + with pytest.raises(TypeError, match=msg): + method(other) + with pytest.raises(TypeError, match=msg): + method(np.array(other)) + with pytest.raises(TypeError, match=msg): + method(pd.Index(other)) + + # ------------------------------------------------------------- + # Timezone-Centric Tests + + def test_operators_datetimelike_with_timezones(self): + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") + dt2 = dt1.copy() + dt2.iloc[2] = np.nan + + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) + td2 = td1.copy() + td2.iloc[1] = np.nan + + result = dt1 + td1[0] + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = dt2 + td2[0] + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + # odd numpy behavior with scalar timedeltas + result = td1[0] + dt1 + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = td2[0] + dt2 + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = dt1 - td1[0] + exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + msg = 
"(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + td1[0] - dt1 + + result = dt2 - td2[0] + exp = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + with pytest.raises(TypeError, match=msg): + td2[0] - dt2 + + result = dt1 + td1 + exp = (dt1.dt.tz_localize(None) + td1).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = dt2 + td2 + exp = (dt2.dt.tz_localize(None) + td2).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = dt1 - td1 + exp = (dt1.dt.tz_localize(None) - td1).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + + result = dt2 - td2 + exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz) + tm.assert_series_equal(result, exp) + msg = "cannot (add|subtract)" + with pytest.raises(TypeError, match=msg): + td1 - dt1 + with pytest.raises(TypeError, match=msg): + td2 - dt2 + + +class TestDatetimeIndexArithmetic: + + # ------------------------------------------------------------- + # Binary operations DatetimeIndex and int + + def test_dti_addsub_int(self, tz_naive_fixture, one): + # Variants of `one` for #19012 + tz = tz_naive_fixture + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) + msg = "Addition/subtraction of integers" + + with pytest.raises(TypeError, match=msg): + rng + one + with pytest.raises(TypeError, match=msg): + rng += one + with pytest.raises(TypeError, match=msg): + rng - one + with pytest.raises(TypeError, match=msg): + rng -= one + + # ------------------------------------------------------------- + # __add__/__sub__ with integer arrays + + @pytest.mark.parametrize("freq", ["H", "D"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + def test_dti_add_intarray_tick(self, int_holder, freq): + # GH#19959 + dti = pd.date_range("2016-01-01", periods=2, freq=freq) + other = int_holder([4, -1]) + + msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + assert_invalid_addsub_type(dti, other, msg) + + @pytest.mark.parametrize("freq", ["W", "M", "MS", "Q"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + def test_dti_add_intarray_non_tick(self, int_holder, freq): + # GH#19959 + dti = pd.date_range("2016-01-01", periods=2, freq=freq) + other = int_holder([4, -1]) + + msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + assert_invalid_addsub_type(dti, other, msg) + + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + def test_dti_add_intarray_no_freq(self, int_holder): + # GH#19959 + dti = pd.DatetimeIndex(["2016-01-01", "NaT", "2017-04-05 06:07:08"]) + other = int_holder([9, 4, -1]) + msg = "|".join( + ["cannot subtract DatetimeArray from", "Addition/subtraction of integers"] + ) + assert_invalid_addsub_type(dti, other, msg) + + # ------------------------------------------------------------- + # Binary operations DatetimeIndex and TimedeltaIndex/array + + def test_dti_add_tdi(self, tz_naive_fixture): + # GH#17558 + tz = tz_naive_fixture + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) + + # add with TimdeltaIndex + result = dti + tdi + tm.assert_index_equal(result, expected) + + result = tdi + dti + tm.assert_index_equal(result, expected) + + # add with timedelta64 array + result = dti + tdi.values + tm.assert_index_equal(result, expected) + + result = tdi.values + dti + tm.assert_index_equal(result, expected) + + def 
test_dti_iadd_tdi(self, tz_naive_fixture): + # GH#17558 + tz = tz_naive_fixture + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) + + # iadd with TimdeltaIndex + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result += tdi + tm.assert_index_equal(result, expected) + + result = pd.timedelta_range("0 days", periods=10) + result += dti + tm.assert_index_equal(result, expected) + + # iadd with timedelta64 array + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result += tdi.values + tm.assert_index_equal(result, expected) + + result = pd.timedelta_range("0 days", periods=10) + result += dti + tm.assert_index_equal(result, expected) + + def test_dti_sub_tdi(self, tz_naive_fixture): + # GH#17558 + tz = tz_naive_fixture + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") + + # sub with TimedeltaIndex + result = dti - tdi + tm.assert_index_equal(result, expected) + + msg = "cannot subtract .*TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi - dti + + # sub with timedelta64 array + result = dti - tdi.values + tm.assert_index_equal(result, expected) + + msg = "cannot subtract DatetimeArray from" + with pytest.raises(TypeError, match=msg): + tdi.values - dti + + def test_dti_isub_tdi(self, tz_naive_fixture): + # GH#17558 + tz = tz_naive_fixture + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") + + # isub with TimedeltaIndex + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result -= tdi + tm.assert_index_equal(result, expected) + + msg = "cannot subtract .* from a TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi -= dti + + # isub with timedelta64 array + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result -= tdi.values + tm.assert_index_equal(result, expected) + + msg = "|".join( + [ + "cannot perform __neg__ with this index type:", + "ufunc subtract cannot use operands with types", + "cannot subtract DatetimeArray from", + ] + ) + with pytest.raises(TypeError, match=msg): + tdi.values -= dti + + # ------------------------------------------------------------- + # Binary Operations DatetimeIndex and datetime-like + # TODO: A couple other tests belong in this section. Move them in + # A PR where there isn't already a giant diff. 
+ + @pytest.mark.parametrize( + "addend", + [ + datetime(2011, 1, 1), + DatetimeIndex(["2011-01-01", "2011-01-02"]), + DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize("US/Eastern"), + np.datetime64("2011-01-01"), + Timestamp("2011-01-01"), + ], + ids=lambda x: type(x).__name__, + ) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_add_datetimelike_and_dtarr(self, box_with_array, addend, tz): + # GH#9631 + dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) + dtarr = tm.box_expected(dti, box_with_array) + msg = "cannot add DatetimeArray and" + + with pytest.raises(TypeError, match=msg): + dtarr + addend + with pytest.raises(TypeError, match=msg): + addend + dtarr + + # ------------------------------------------------------------- + + def test_dta_add_sub_index(self, tz_naive_fixture): + # Check that DatetimeArray defers to Index classes + dti = date_range("20130101", periods=3, tz=tz_naive_fixture) + dta = dti.array + result = dta - dti + expected = dti - dti + tm.assert_index_equal(result, expected) + + tdi = result + result = dta + tdi + expected = dti + tdi + tm.assert_index_equal(result, expected) + + result = dta - tdi + expected = dti - tdi + tm.assert_index_equal(result, expected) + + def test_sub_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now changed to + # return subtraction -> TimeDeltaIndex (GH ...) + + dti = date_range("20130101", periods=3) + dti_tz = date_range("20130101", periods=3).tz_localize("US/Eastern") + dti_tz2 = date_range("20130101", periods=3).tz_localize("UTC") + expected = TimedeltaIndex([0, 0, 0]) + + result = dti - dti + tm.assert_index_equal(result, expected) + + result = dti_tz - dti_tz + tm.assert_index_equal(result, expected) + msg = "DatetimeArray subtraction must have the same timezones or" + with pytest.raises(TypeError, match=msg): + dti_tz - dti + + with pytest.raises(TypeError, match=msg): + dti - dti_tz + + with pytest.raises(TypeError, match=msg): + dti_tz - dti_tz2 + + # isub + dti -= dti + tm.assert_index_equal(dti, expected) + + # different length raises ValueError + dti1 = date_range("20130101", periods=3) + dti2 = date_range("20130101", periods=4) + msg = "cannot add indices of unequal length" + with pytest.raises(ValueError, match=msg): + dti1 - dti2 + + # NaN propagation + dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]) + dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]) + expected = TimedeltaIndex(["1 days", np.nan, np.nan]) + result = dti2 - dti1 + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------------- + # TODO: Most of this block is moved from series or frame tests, needs + # cleanup, box-parametrization, and de-duplication + + @pytest.mark.parametrize("op", [operator.add, operator.sub]) + def test_timedelta64_equal_timedelta_supported_ops(self, op): + ser = Series( + [ + Timestamp("20130301"), + Timestamp("20130228 23:00:00"), + Timestamp("20130228 22:00:00"), + Timestamp("20130228 21:00:00"), + ] + ) + + intervals = ["D", "h", "m", "s", "us"] + + def timedelta64(*args): + # see casting notes in NumPy gh-12927 + return np.sum(list(starmap(np.timedelta64, zip(args, intervals)))) + + for d, h, m, s, us in product(*([range(2)] * 5)): + nptd = timedelta64(d, h, m, s, us) + pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, microseconds=us) + lhs = op(ser, nptd) + rhs = op(ser, pytd) + + tm.assert_series_equal(lhs, rhs) + + def test_ops_nat_mixed_datetime64_timedelta64(self): + # GH#11349 + 
timedelta_series = Series([NaT, Timedelta("1s")]) + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") + + # subtraction + tm.assert_series_equal( + datetime_series - single_nat_dtype_datetime, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + datetime_series - single_nat_dtype_timedelta, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + datetime_series, nat_series_dtype_timestamp + ) + + # without a Series wrapping the NaT, it is ambiguous + # whether it is a datetime64 or timedelta64 + # defaults to interpreting it as timedelta64 + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_datetime, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + msg = "cannot subtract a datelike" + with pytest.raises(TypeError, match=msg): + timedelta_series - single_nat_dtype_datetime + + # addition + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_datetime, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_datetime + nat_series_dtype_timedelta, + nat_series_dtype_timestamp, + ) + + def test_ufunc_coercions(self): + idx = date_range("2011-01-01", periods=3, freq="2D", name="x") + + delta = np.timedelta64(1, "D") + exp = date_range("2011-01-02", periods=3, freq="2D", name="x") + for result in [idx + delta, np.add(idx, delta)]: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + exp = date_range("2010-12-31", periods=3, freq="2D", name="x") + for result in [idx - delta, np.subtract(idx, delta)]: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + delta = np.array( + [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] + ) + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" + ) + for result in [idx + delta, np.add(idx, delta)]: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, exp) + assert result.freq == "3D" + + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" + ) + for result in [idx - delta, np.subtract(idx, delta)]: + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, exp) + assert result.freq == "D" + + @pytest.mark.parametrize( + "names", [("foo", None, None), ("baz", "bar", None), ("bar", "bar", "bar")] + ) + @pytest.mark.parametrize("tz", [None, "America/Chicago"]) + def test_dti_add_series(self, tz, names): + # GH#13905 + index = 
DatetimeIndex( + ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0] + ) + ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1]) + expected = Series(index + Timedelta(seconds=5), index=index, name=names[2]) + + # passing name arg isn't enough when names[2] is None + expected.name = names[2] + assert expected.dtype == index.dtype + result = ser + index + tm.assert_series_equal(result, expected) + result2 = index + ser + tm.assert_series_equal(result2, expected) + + expected = index + Timedelta(seconds=5) + result3 = ser.values + index + tm.assert_index_equal(result3, expected) + result4 = index + ser.values + tm.assert_index_equal(result4, expected) + + @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) + def test_dti_addsub_offset_arraylike( + self, tz_naive_fixture, names, op, index_or_series + ): + # GH#18849, GH#19744 + box = pd.Index + other_box = index_or_series + + tz = tz_naive_fixture + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = other_box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) + + xbox = get_upcast_box(box, other) + + with tm.assert_produces_warning(PerformanceWarning): + res = op(dti, other) + + expected = DatetimeIndex( + [op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer" + ) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) + + @pytest.mark.parametrize("other_box", [pd.Index, np.array]) + def test_dti_addsub_object_arraylike( + self, tz_naive_fixture, box_with_array, other_box + ): + tz = tz_naive_fixture + + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + other = other_box([pd.offsets.MonthEnd(), pd.Timedelta(days=4)]) + xbox = get_upcast_box(box_with_array, other) + + expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + result = dtarr + other + tm.assert_equal(result, expected) + + expected = pd.DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + with tm.assert_produces_warning(warn): + result = dtarr - other + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("years", [-1, 0, 1]) +@pytest.mark.parametrize("months", [-2, 0, 2]) +def test_shift_months(years, months): + dti = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + ] + ) + actual = DatetimeIndex(shift_months(dti.asi8, years * 12 + months)) + + raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] + expected = DatetimeIndex(raw) + tm.assert_index_equal(actual, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_interval.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_interval.py new file mode 100644 index 0000000..f9e1a51 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_interval.py @@ -0,0 +1,273 @@ +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_list_like + +import pandas as pd +from pandas import ( + Categorical, + Index, + Interval, + IntervalIndex, + Period, + Series, + Timedelta, + Timestamp, 
+ date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])), + (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])), + ( + timedelta_range("0 days", periods=3).insert(4, pd.NaT), + timedelta_range("1 day", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3).insert(4, pd.NaT), + date_range("20170102", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT), + date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +@pytest.fixture +def array(left_right_dtypes): + """ + Fixture to generate an IntervalArray of various dtypes containing NA if possible + """ + left, right = left_right_dtypes + return IntervalArray.from_arrays(left, right) + + +def create_categorical_intervals(left, right, closed="right"): + return Categorical(IntervalIndex.from_arrays(left, right, closed)) + + +def create_series_intervals(left, right, closed="right"): + return Series(IntervalArray.from_arrays(left, right, closed)) + + +def create_series_categorical_intervals(left, right, closed="right"): + return Series(Categorical(IntervalIndex.from_arrays(left, right, closed))) + + +class TestComparison: + @pytest.fixture(params=[operator.eq, operator.ne]) + def op(self, request): + return request.param + + @pytest.fixture( + params=[ + IntervalArray.from_arrays, + IntervalIndex.from_arrays, + create_categorical_intervals, + create_series_intervals, + create_series_categorical_intervals, + ], + ids=[ + "IntervalArray", + "IntervalIndex", + "Categorical[Interval]", + "Series[Interval]", + "Series[Categorical[Interval]]", + ], + ) + def interval_constructor(self, request): + """ + Fixture for all pandas native interval constructors. + To be used as the LHS of IntervalArray comparisons. 
+ """ + return request.param + + def elementwise_comparison(self, op, array, other): + """ + Helper that performs elementwise comparisions between `array` and `other` + """ + other = other if is_list_like(other) else [other] * len(array) + return np.array([op(x, y) for x, y in zip(array, other)]) + + def test_compare_scalar_interval(self, op, array): + # matches first interval + other = array[0] + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # matches on a single endpoint but not both + other = Interval(array.left[0], array.right[1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = Interval(0, 1, closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_na(self, op, array, nulls_fixture): + result = op(array, nulls_fixture) + expected = self.elementwise_comparison(op, array, nulls_fixture) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + 0, + 1.0, + True, + "foo", + Timestamp("2017-01-01"), + Timestamp("2017-01-01", tz="US/Eastern"), + Timedelta("0 days"), + Period("2017-01-01", "D"), + ], + ) + def test_compare_scalar_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval( + self, op, array, interval_constructor, + ): + # same endpoints + other = interval_constructor(array.left, array.right) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # different endpoints + other = interval_constructor(array.left[::-1], array.right[::-1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # all nan endpoints + other = interval_constructor([np.nan] * 4, [np.nan] * 4) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval_mixed_closed( + self, op, interval_constructor, closed, other_closed + ): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = interval_constructor(range(2), range(1, 3), closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + ( + Interval(0, 1), + Interval(Timedelta("1 day"), Timedelta("2 days")), + Interval(4, 5, "both"), + Interval(10, 20, "neither"), + ), + (0, 1.5, Timestamp("20170103"), np.nan), + ( + Timestamp("20170102", tz="US/Eastern"), + Timedelta("2 days"), + "baz", + pd.NaT, + ), + ], + ) + def test_compare_list_like_object(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_nan(self, op, array, nulls_fixture): + other = [nulls_fixture] * 4 + result = op(array, other) + expected = 
self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + np.arange(4, dtype="int64"), + np.arange(4, dtype="float64"), + date_range("2017-01-01", periods=4), + date_range("2017-01-01", periods=4, tz="US/Eastern"), + timedelta_range("0 days", periods=4), + period_range("2017-01-01", periods=4, freq="D"), + Categorical(list("abab")), + Categorical(date_range("2017-01-01", periods=4)), + pd.array(list("abcd")), + pd.array(["foo", 3.14, None, object()]), + ], + ids=lambda x: str(x.dtype), + ) + def test_compare_list_like_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("length", [1, 3, 5]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, list]) + def test_compare_length_mismatch_errors(self, op, other_constructor, length): + array = IntervalArray.from_arrays(range(4), range(1, 5)) + other = other_constructor([Interval(0, 1)] * length) + with pytest.raises(ValueError, match="Lengths must match to compare"): + op(array, other) + + @pytest.mark.parametrize( + "constructor, expected_type, assert_func", + [ + (IntervalIndex, np.array, tm.assert_numpy_array_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_index_series_compat(self, op, constructor, expected_type, assert_func): + # IntervalIndex/Series that rely on IntervalArray for comparisons + breaks = range(4) + index = constructor(IntervalIndex.from_breaks(breaks)) + + # scalar comparisons + other = index[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = breaks[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + # list-like comparisons + other = IntervalArray.from_breaks(breaks) + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = [index[0], breaks[0], "foo"] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_numeric.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_numeric.py new file mode 100644 index 0000000..f55e2b9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_numeric.py @@ -0,0 +1,1269 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for numeric dtypes +from collections import abc +from decimal import Decimal +from itertools import combinations +import operator +from typing import Any, List + +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Series, Timedelta, TimedeltaIndex +import pandas._testing as tm +from pandas.core import ops + + +def adjust_negative_zero(zero, expected): + """ + Helper to adjust the expected result if we are dividing by -0.0 + as opposed to 0.0 + """ + if np.signbit(np.array(zero)).any(): + # All entries in the `zero` fixture should be either + # all-negative or no-negative. 
+ assert np.signbit(np.array(zero)).all() + + expected *= -1 + + return expected + + +# TODO: remove this kludge once mypy stops giving false positives here +# List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex] +# See GH#29725 +ser_or_index: List[Any] = [pd.Series, pd.Index] +lefts: List[Any] = [pd.RangeIndex(10, 40, 10)] +lefts.extend( + [ + cls([10, 20, 30], dtype=dtype) + for dtype in ["i1", "i2", "i4", "i8", "u1", "u2", "u4", "u8", "f2", "f4", "f8"] + for cls in ser_or_index + ] +) + +# ------------------------------------------------------------------ +# Comparisons + + +class TestNumericComparisons: + def test_operator_series_comparison_zerorank(self): + # GH#13006 + result = np.float64(0) > pd.Series([1, 2, 3]) + expected = 0.0 > pd.Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + result = pd.Series([1, 2, 3]) < np.float64(0) + expected = pd.Series([1, 2, 3]) < 0.0 + tm.assert_series_equal(result, expected) + result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2]) + expected = 0.0 > pd.Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + + def test_df_numeric_cmp_dt64_raises(self): + # GH#8932, GH#22163 + ts = pd.Timestamp.now() + df = pd.DataFrame({"x": range(5)}) + + msg = "Invalid comparison between dtype=int64 and Timestamp" + + with pytest.raises(TypeError, match=msg): + df > ts + with pytest.raises(TypeError, match=msg): + df < ts + with pytest.raises(TypeError, match=msg): + ts < df + with pytest.raises(TypeError, match=msg): + ts > df + + assert not (df == ts).any().any() + assert (df != ts).all().all() + + def test_compare_invalid(self): + # GH#8058 + # ops testing + a = pd.Series(np.random.randn(5), name=0) + b = pd.Series(np.random.randn(5)) + b.name = pd.Timestamp("2000-01-01") + tm.assert_series_equal(a / b, 1 / (b / a)) + + +# ------------------------------------------------------------------ +# Numeric dtypes Arithmetic with Datetime/Timedelta Scalar + + +class TestNumericArraylikeArithmeticWithDatetimeLike: + + # TODO: also check name retentention + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + ) + def test_mul_td64arr(self, left, box_cls): + # GH#22390 + right = np.array([1, 2, 3], dtype="m8[s]") + right = box_cls(right) + + expected = pd.TimedeltaIndex(["10s", "40s", "90s"]) + if isinstance(left, pd.Series) or box_cls is pd.Series: + expected = pd.Series(expected) + + result = left * right + tm.assert_equal(result, expected) + + result = right * left + tm.assert_equal(result, expected) + + # TODO: also check name retentention + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + ) + def test_div_td64arr(self, left, box_cls): + # GH#22390 + right = np.array([10, 40, 90], dtype="m8[s]") + right = box_cls(right) + + expected = pd.TimedeltaIndex(["1s", "2s", "3s"]) + if isinstance(left, pd.Series) or box_cls is pd.Series: + expected = pd.Series(expected) + + result = right / left + tm.assert_equal(result, expected) + + result = right // left + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + left / right + + with pytest.raises(TypeError): + left // right + + # TODO: de-duplicate with test_numeric_arr_mul_tdscalar + def test_ops_series(self): + # regression test for G#H8813 + td = Timedelta("1 day") + other = pd.Series([1, 2]) + expected = 
pd.Series(pd.to_timedelta(["1 day", "2 days"])) + tm.assert_series_equal(expected, td * other) + tm.assert_series_equal(expected, other * td) + + # TODO: also test non-nanosecond timedelta64 and Tick objects; + # see test_numeric_arr_rdiv_tdscalar for note on these failing + @pytest.mark.parametrize( + "scalar_td", + [ + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta(), + ], + ids=lambda x: type(x).__name__, + ) + def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): + # GH#19333 + index = numeric_idx + + expected = pd.timedelta_range("0 days", "4 days") + + index = tm.box_expected(index, box) + expected = tm.box_expected(expected, box) + + result = index * scalar_td + tm.assert_equal(result, expected) + + commute = scalar_td * index + tm.assert_equal(commute, expected) + + def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): + index = numeric_idx[1:3] + + expected = TimedeltaIndex(["3 Days", "36 Hours"]) + + index = tm.box_expected(index, box) + expected = tm.box_expected(expected, box) + + result = three_days / index + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + index / three_days + + @pytest.mark.parametrize( + "other", + [ + pd.Timedelta(hours=31), + pd.Timedelta(hours=31).to_pytimedelta(), + pd.Timedelta(hours=31).to_timedelta64(), + pd.Timedelta(hours=31).to_timedelta64().astype("m8[h]"), + np.timedelta64("NaT"), + np.timedelta64("NaT", "D"), + pd.offsets.Minute(3), + pd.offsets.Second(0), + ], + ) + def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): + left = tm.box_expected(numeric_idx, box) + with pytest.raises(TypeError): + left + other + with pytest.raises(TypeError): + other + left + with pytest.raises(TypeError): + left - other + with pytest.raises(TypeError): + other - left + + @pytest.mark.parametrize( + "other", + [ + pd.Timestamp.now().to_pydatetime(), + pd.Timestamp.now(tz="UTC").to_pydatetime(), + pd.Timestamp.now().to_datetime64(), + pd.NaT, + ], + ) + @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") + def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box): + # GH#28080 numeric+datetime64 should raise; Timestamp raises + # NullFrequencyError instead of TypeError so is excluded. + left = tm.box_expected(numeric_idx, box) + + with pytest.raises(TypeError): + left + other + with pytest.raises(TypeError): + other + left + with pytest.raises(TypeError): + left - other + with pytest.raises(TypeError): + other - left + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestDivisionByZero: + def test_div_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. + expected2 = adjust_negative_zero(zero, expected) + + result = idx / zero + tm.assert_index_equal(result, expected2) + ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(expected)) + + def test_floordiv_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + # We only adjust for Index, because Series does not yet apply + # the adjustment correctly. 
+ expected2 = adjust_negative_zero(zero, expected) + + result = idx // zero + tm.assert_index_equal(result, expected2) + ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(expected)) + + def test_mod_zero(self, zero, numeric_idx): + idx = numeric_idx + + expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + result = idx % zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8") + tm.assert_series_equal(ser_compat, Series(result)) + + def test_divmod_zero(self, zero, numeric_idx): + idx = numeric_idx + + exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) + exleft = adjust_negative_zero(zero, exleft) + + result = divmod(idx, zero) + tm.assert_index_equal(result[0], exleft) + tm.assert_index_equal(result[1], exright) + + @pytest.mark.parametrize("op", [operator.truediv, operator.floordiv]) + def test_div_negative_zero(self, zero, numeric_idx, op): + # Check that -1 / -0.0 returns np.inf, not -np.inf + if isinstance(numeric_idx, pd.UInt64Index): + return + idx = numeric_idx - 3 + + expected = pd.Index( + [-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64 + ) + expected = adjust_negative_zero(zero, expected) + + result = op(idx, zero) + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------------ + + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) + def test_ser_div_ser(self, dtype1, any_real_dtype): + # no longer do integer div for any ops, but deal with the 0's + dtype2 = any_real_dtype + + first = Series([3, 4, 5, 8], name="first").astype(dtype1) + second = Series([0, 0, 0, 3], name="second").astype(dtype2) + + with np.errstate(all="ignore"): + expected = Series( + first.values.astype(np.float64) / second.values, + dtype="float64", + name=None, + ) + expected.iloc[0:3] = np.inf + + result = first / second + tm.assert_series_equal(result, expected) + assert not result.equals(second / first) + + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) + def test_ser_divmod_zero(self, dtype1, any_real_dtype): + # GH#26987 + dtype2 = any_real_dtype + left = pd.Series([1, 1]).astype(dtype1) + right = pd.Series([0, 2]).astype(dtype2) + + # GH#27321 pandas convention is to set 1 // 0 to np.inf, as opposed + # to numpy which sets to np.nan; patch `expected[0]` below + expected = left // right, left % right + expected = list(expected) + expected[0] = expected[0].astype(np.float64) + expected[0][0] = np.inf + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + def test_ser_divmod_inf(self): + left = pd.Series([np.inf, 1.0]) + right = pd.Series([np.inf, 2.0]) + + expected = left // right, left % right + result = divmod(left, right) + + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + # rdivmod case + result = divmod(left.values, right) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + def test_rdiv_zero_compat(self): + # GH#8674 + zero_array = np.array([0] * 5) + data = np.random.randn(5) + expected = Series([0.0] * 5) + 
+ result = zero_array / Series(data) + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / data + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / Series(data) + tm.assert_series_equal(result, expected) + + def test_div_zero_inf_signs(self): + # GH#9144, inf signing + ser = Series([-1, 0, 1], name="first") + expected = Series([-np.inf, np.nan, np.inf], name="first") + + result = ser / 0 + tm.assert_series_equal(result, expected) + + def test_rdiv_zero(self): + # GH#9144 + ser = Series([-1, 0, 1], name="first") + expected = Series([0.0, np.nan, 0.0], name="first") + + result = 0 / ser + tm.assert_series_equal(result, expected) + + def test_floordiv_div(self): + # GH#9144 + ser = Series([-1, 0, 1], name="first") + + result = ser // 0 + expected = Series([-np.inf, np.nan, np.inf], name="first") + tm.assert_series_equal(result, expected) + + def test_df_div_zero_df(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = df / df + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({"first": first, "second": second}) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_array(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({"first": first, "second": second}) + + with np.errstate(all="ignore"): + arr = df.values.astype("float") / df.values + result = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_int(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + result = df / 0 + expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) + expected.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all="ignore"): + arr = df.values.astype("float64") / 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_div_zero_series_does_not_commute(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser / df + res2 = df / ser + assert not res.fillna(0).equals(res2.fillna(0)) + + # ------------------------------------------------------------------ + # Mod By Zero + + def test_df_mod_zero_df(self): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype="float64") + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + result = df % df + tm.assert_frame_equal(result, expected) + + def test_df_mod_zero_array(self): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype="float64") + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + + # numpy has a slightly different (wrong) treatment + with 
np.errstate(all="ignore"): + arr = df.values % df.values + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns, dtype="float64") + result2.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_int(self): + # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + + result = df % 0 + expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all="ignore"): + arr = df.values.astype("float64") % 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_series_does_not_commute(self): + # GH#3590, modulo as ints + # not commutative with series + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser % df + res2 = df % ser + assert not res.fillna(0).equals(res2.fillna(0)) + + +class TestMultiplicationDivision: + # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ + # for non-timestamp/timedelta/period dtypes + + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail( + reason="Index.__div__ always raises", raises=TypeError + ), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) + def test_divide_decimal(self, box): + # resolves issue GH#9787 + ser = Series([Decimal(10)]) + expected = Series([Decimal(5)]) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = ser / Decimal(2) + + tm.assert_equal(result, expected) + + result = ser // Decimal(2) + tm.assert_equal(result, expected) + + def test_div_equiv_binop(self): + # Test Series.div as well as Series.__div__ + # float/integer issue + # GH#7785 + first = Series([1, 0], name="first") + second = Series([-0.01, -0.02], name="second") + expected = Series([-0.01, -np.inf]) + + result = second.div(first) + tm.assert_series_equal(result, expected, check_names=False) + + result = second / first + tm.assert_series_equal(result, expected) + + def test_div_int(self, numeric_idx): + idx = numeric_idx + result = idx / 1 + expected = idx.astype("float64") + tm.assert_index_equal(result, expected) + + result = idx / 2 + expected = Index(idx.values / 2) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) + def test_mul_int_identity(self, op, numeric_idx, box_with_array): + idx = numeric_idx + idx = tm.box_expected(idx, box_with_array) + + result = op(idx, 1) + tm.assert_equal(result, idx) + + def test_mul_int_array(self, numeric_idx): + idx = numeric_idx + didx = idx * idx + + result = idx * np.array(5, dtype="int64") + tm.assert_index_equal(result, idx * 5) + + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" + result = idx * np.arange(5, dtype=arr_dtype) + tm.assert_index_equal(result, didx) + + def test_mul_int_series(self, numeric_idx): + idx = numeric_idx + didx = idx * idx + + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" + result = idx * Series(np.arange(5, dtype=arr_dtype)) + tm.assert_series_equal(result, Series(didx)) + + def test_mul_float_series(self, numeric_idx): + idx = numeric_idx + rng5 = np.arange(5, dtype="float64") + + result = idx * Series(rng5 + 0.1) + expected = Series(rng5 * (rng5 + 0.1)) + tm.assert_series_equal(result, expected) + + def test_mul_index(self, numeric_idx): + # in general not true for RangeIndex + idx = 
numeric_idx + if not isinstance(idx, pd.RangeIndex): + result = idx * idx + tm.assert_index_equal(result, idx ** 2) + + def test_mul_datelike_raises(self, numeric_idx): + idx = numeric_idx + with pytest.raises(TypeError): + idx * pd.date_range("20130101", periods=5) + + def test_mul_size_mismatch_raises(self, numeric_idx): + idx = numeric_idx + with pytest.raises(ValueError): + idx * idx[0:3] + with pytest.raises(ValueError): + idx * np.array([1, 2]) + + @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) + def test_pow_float(self, op, numeric_idx, box_with_array): + # test power calculations both ways, GH#14973 + box = box_with_array + idx = numeric_idx + expected = pd.Float64Index(op(idx.values, 2.0)) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, box) + + result = op(idx, 2.0) + tm.assert_equal(result, expected) + + def test_modulo(self, numeric_idx, box_with_array): + # GH#9244 + box = box_with_array + idx = numeric_idx + expected = Index(idx.values % 2) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, box) + + result = idx % 2 + tm.assert_equal(result, expected) + + def test_divmod_scalar(self, numeric_idx): + idx = numeric_idx + + result = divmod(idx, 2) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, 2) + + expected = Index(div), Index(mod) + for r, e in zip(result, expected): + tm.assert_index_equal(r, e) + + def test_divmod_ndarray(self, numeric_idx): + idx = numeric_idx + other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 + + result = divmod(idx, other) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, other) + + expected = Index(div), Index(mod) + for r, e in zip(result, expected): + tm.assert_index_equal(r, e) + + def test_divmod_series(self, numeric_idx): + idx = numeric_idx + other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 + + result = divmod(idx, Series(other)) + with np.errstate(all="ignore"): + div, mod = divmod(idx.values, other) + + expected = Series(div), Series(mod) + for r, e in zip(result, expected): + tm.assert_series_equal(r, e) + + @pytest.mark.parametrize("other", [np.nan, 7, -23, 2.718, -3.14, np.inf]) + def test_ops_np_scalar(self, other): + vals = np.random.randn(5, 3) + f = lambda x: pd.DataFrame( + x, index=list("ABCDE"), columns=["jim", "joe", "jolie"] + ) + + df = f(vals) + + tm.assert_frame_equal(df / np.array(other), f(vals / other)) + tm.assert_frame_equal(np.array(other) * df, f(vals * other)) + tm.assert_frame_equal(df + np.array(other), f(vals + other)) + tm.assert_frame_equal(np.array(other) - df, f(other - vals)) + + # TODO: This came from series.test.test_operators, needs cleanup + def test_operators_frame(self): + # rpow does not work with DataFrame + ts = tm.makeTimeSeries() + ts.name = "ts" + + df = pd.DataFrame({"A": ts}) + + tm.assert_series_equal(ts + ts, ts + df["A"], check_names=False) + tm.assert_series_equal(ts ** ts, ts ** df["A"], check_names=False) + tm.assert_series_equal(ts < ts, ts < df["A"], check_names=False) + tm.assert_series_equal(ts / ts, ts / df["A"], check_names=False) + + # TODO: this came from tests.series.test_analytics, needs cleanup and + # de-duplication with test_modulo above + def test_modulo2(self): + with np.errstate(all="ignore"): + + # GH#3590, modulo as ints + p = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values, dtype="float64") + expected.iloc[0:3] = np.nan + 
tm.assert_series_equal(result, expected) + + result = p["first"] % 0 + expected = Series(np.nan, index=p.index, name="first") + tm.assert_series_equal(result, expected) + + p = p.astype("float64") + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values) + tm.assert_series_equal(result, expected) + + p = p.astype("float64") + result = p["first"] % p["second"] + result2 = p["second"] % p["first"] + assert not result.equals(result2) + + def test_modulo_zero_int(self): + # GH#9144 + with np.errstate(all="ignore"): + s = Series([0, 1]) + + result = s % 0 + expected = Series([np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + result = 0 % s + expected = Series([np.nan, 0.0]) + tm.assert_series_equal(result, expected) + + +class TestAdditionSubtraction: + # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__ + # for non-timestamp/timedelta/period dtypes + + # TODO: This came from series.test.test_operators, needs cleanup + def test_arith_ops_df_compat(self): + # GH#1134 + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") + + exp = pd.Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x") + tm.assert_series_equal(s1 + s2, exp) + tm.assert_series_equal(s2 + s1, exp) + + exp = pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")) + tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) + tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) + + # different length + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") + + exp = pd.Series([3, 4, 5, np.nan], index=list("ABCD"), name="x") + tm.assert_series_equal(s3 + s4, exp) + tm.assert_series_equal(s4 + s3, exp) + + exp = pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")) + tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) + tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) + + # TODO: This came from series.test.test_operators, needs cleanup + def test_series_frame_radd_bug(self): + # GH#353 + vals = pd.Series(tm.rands_array(5, 10)) + result = "foo_" + vals + expected = vals.map(lambda x: "foo_" + x) + tm.assert_series_equal(result, expected) + + frame = pd.DataFrame({"vals": vals}) + result = "foo_" + frame + expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)}) + tm.assert_frame_equal(result, expected) + + ts = tm.makeTimeSeries() + ts.name = "ts" + + # really raise this time + now = pd.Timestamp.now().to_pydatetime() + with pytest.raises(TypeError): + now + ts + + with pytest.raises(TypeError): + ts + now + + # TODO: This came from series.test.test_operators, needs cleanup + def test_datetime64_with_index(self): + # arithmetic integer ops with an index + ser = pd.Series(np.random.randn(5)) + expected = ser - ser.index.to_series() + result = ser - ser.index + tm.assert_series_equal(result, expected) + + # GH#4629 + # arithmetic datetime64 ops with an index + ser = pd.Series( + pd.date_range("20130101", periods=5), + index=pd.date_range("20130101", periods=5), + ) + expected = ser - ser.index.to_series() + result = ser - ser.index + tm.assert_series_equal(result, expected) + + with pytest.raises(TypeError): + # GH#18850 + result = ser - ser.index.to_period() + + df = pd.DataFrame( + np.random.randn(5, 2), index=pd.date_range("20130101", periods=5) + ) + df["date"] = pd.Timestamp("20130102") + df["expected"] = df["date"] - df.index.to_series() + df["result"] = df["date"] - df.index + 
tm.assert_series_equal(df["result"], df["expected"], check_names=False) + + # TODO: taken from tests.frame.test_operators, needs cleanup + def test_frame_operators(self, float_frame): + frame = float_frame + frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"]) + + garbage = np.random.random(4) + colSeries = pd.Series(garbage, index=np.array(frame.columns)) + + idSum = frame + frame + seriesSum = frame + colSeries + + for col, series in idSum.items(): + for idx, val in series.items(): + origVal = frame[col][idx] * 2 + if not np.isnan(val): + assert val == origVal + else: + assert np.isnan(origVal) + + for col, series in seriesSum.items(): + for idx, val in series.items(): + origVal = frame[col][idx] + colSeries[col] + if not np.isnan(val): + assert val == origVal + else: + assert np.isnan(origVal) + + added = frame2 + frame2 + expected = frame2 * 2 + tm.assert_frame_equal(added, expected) + + df = pd.DataFrame({"a": ["a", None, "b"]}) + tm.assert_frame_equal(df + df, pd.DataFrame({"a": ["aa", np.nan, "bb"]})) + + # Test for issue #10181 + for dtype in ("float", "int64"): + frames = [ + pd.DataFrame(dtype=dtype), + pd.DataFrame(columns=["A"], dtype=dtype), + pd.DataFrame(index=[0], dtype=dtype), + ] + for df in frames: + assert (df + df).equals(df) + tm.assert_frame_equal(df + df, df) + + # TODO: taken from tests.series.test_operators; needs cleanup + def test_series_operators(self): + def _check_op(series, other, op, pos_only=False, check_dtype=True): + left = np.abs(series) if pos_only else series + right = np.abs(other) if pos_only else other + + cython_or_numpy = op(left, right) + python = left.combine(right, op) + tm.assert_series_equal(cython_or_numpy, python, check_dtype=check_dtype) + + def check(series, other): + simple_ops = ["add", "sub", "mul", "truediv", "floordiv", "mod"] + + for opname in simple_ops: + _check_op(series, other, getattr(operator, opname)) + + _check_op(series, other, operator.pow, pos_only=True) + + _check_op(series, other, ops.radd) + _check_op(series, other, ops.rsub) + _check_op(series, other, ops.rtruediv) + _check_op(series, other, ops.rfloordiv) + _check_op(series, other, ops.rmul) + _check_op(series, other, ops.rpow, pos_only=True) + _check_op(series, other, ops.rmod) + + tser = tm.makeTimeSeries().rename("ts") + check(tser, tser * 2) + check(tser, tser[::2]) + check(tser, 5) + + def check_comparators(series, other, check_dtype=True): + _check_op(series, other, operator.gt, check_dtype=check_dtype) + _check_op(series, other, operator.ge, check_dtype=check_dtype) + _check_op(series, other, operator.eq, check_dtype=check_dtype) + _check_op(series, other, operator.lt, check_dtype=check_dtype) + _check_op(series, other, operator.le, check_dtype=check_dtype) + + check_comparators(tser, 5) + check_comparators(tser, tser + 1, check_dtype=False) + + # TODO: taken from tests.series.test_operators; needs cleanup + def test_divmod(self): + def check(series, other): + results = divmod(series, other) + if isinstance(other, abc.Iterable) and len(series) != len(other): + # if the lengths don't match, this is the test where we use + # `tser[::2]`. Pad every other value in `other_np` with nan. 
+ other_np = [] + for n in other: + other_np.append(n) + other_np.append(np.nan) + else: + other_np = other + other_np = np.asarray(other_np) + with np.errstate(all="ignore"): + expecteds = divmod(series.values, np.asarray(other_np)) + + for result, expected in zip(results, expecteds): + # check the values, name, and index separately + tm.assert_almost_equal(np.asarray(result), expected) + + assert result.name == series.name + tm.assert_index_equal(result.index, series.index) + + tser = tm.makeTimeSeries().rename("ts") + check(tser, tser * 2) + check(tser, tser[::2]) + check(tser, 5) + + def test_series_divmod_zero(self): + # Check that divmod uses pandas convention for division by zero, + # which does not match numpy. + # pandas convention has + # 1/0 == np.inf + # -1/0 == -np.inf + # 1/-0.0 == -np.inf + # -1/-0.0 == np.inf + tser = tm.makeTimeSeries().rename("ts") + other = tser * 0 + + result = divmod(tser, other) + exp1 = pd.Series([np.inf] * len(tser), index=tser.index, name="ts") + exp2 = pd.Series([np.nan] * len(tser), index=tser.index, name="ts") + tm.assert_series_equal(result[0], exp1) + tm.assert_series_equal(result[1], exp2) + + +class TestUFuncCompat: + @pytest.mark.parametrize( + "holder", + [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.RangeIndex, pd.Series], + ) + def test_ufunc_compat(self, holder): + box = pd.Series if holder is pd.Series else pd.Index + + if holder is pd.RangeIndex: + idx = pd.RangeIndex(0, 5) + else: + idx = holder(np.arange(5, dtype="int64")) + result = np.sin(idx) + expected = box(np.sin(np.arange(5, dtype="int64"))) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) + def test_ufunc_coercions(self, holder): + idx = holder([1, 2, 3, 4, 5], name="x") + box = pd.Series if holder is pd.Series else pd.Index + + result = np.sqrt(idx) + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = np.divide(idx, 2.0) + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + # _evaluate_numeric_binop + result = idx + 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([3.0, 4.0, 5.0, 6.0, 7.0], name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx - 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([-1.0, 0.0, 1.0, 2.0, 3.0], name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx * 1.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0], name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + result = idx / 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") + exp = tm.box_expected(exp, box) + tm.assert_equal(result, exp) + + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) + def test_ufunc_multiple_return_values(self, holder): + obj = holder([1, 2, 3], name="x") + box = pd.Series if holder is pd.Series else pd.Index + + result = np.modf(obj) + assert isinstance(result, tuple) + exp1 = pd.Float64Index([0.0, 0.0, 0.0], name="x") + exp2 = 
pd.Float64Index([1.0, 2.0, 3.0], name="x") + tm.assert_equal(result[0], tm.box_expected(exp1, box)) + tm.assert_equal(result[1], tm.box_expected(exp2, box)) + + def test_ufunc_at(self): + s = pd.Series([0, 1, 2], index=[1, 2, 3], name="x") + np.add.at(s, [0, 2], 10) + expected = pd.Series([10, 1, 12], index=[1, 2, 3], name="x") + tm.assert_series_equal(s, expected) + + +class TestObjectDtypeEquivalence: + # Tests that arithmetic operations match operations executed elementwise + + @pytest.mark.parametrize("dtype", [None, object]) + def test_numarr_with_dtype_add_nan(self, dtype, box_with_array): + box = box_with_array + ser = pd.Series([1, 2, 3], dtype=dtype) + expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = np.nan + ser + tm.assert_equal(result, expected) + + result = ser + np.nan + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_numarr_with_dtype_add_int(self, dtype, box_with_array): + box = box_with_array + ser = pd.Series([1, 2, 3], dtype=dtype) + expected = pd.Series([2, 3, 4], dtype=dtype) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = 1 + ser + tm.assert_equal(result, expected) + + result = ser + 1 + tm.assert_equal(result, expected) + + # TODO: moved from tests.series.test_operators; needs cleanup + @pytest.mark.parametrize( + "op", + [operator.add, operator.sub, operator.mul, operator.truediv, operator.floordiv], + ) + def test_operators_reverse_object(self, op): + # GH#56 + arr = pd.Series(np.random.randn(10), index=np.arange(10), dtype=object) + + result = op(1.0, arr) + expected = op(1.0, arr.astype(float)) + tm.assert_series_equal(result.astype(float), expected) + + +class TestNumericArithmeticUnsorted: + # Tests in this class have been moved from type-specific test modules + # but not yet sorted, parametrized, and de-duplicated + + def check_binop(self, ops, scalars, idxs): + for op in ops: + for a, b in combinations(idxs, 2): + result = op(a, b) + expected = op(pd.Int64Index(a), pd.Int64Index(b)) + tm.assert_index_equal(result, expected) + for idx in idxs: + for scalar in scalars: + result = op(idx, scalar) + expected = op(pd.Int64Index(idx), scalar) + tm.assert_index_equal(result, expected) + + def test_binops(self): + ops = [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + ] + scalars = [-1, 1, 2] + idxs = [ + pd.RangeIndex(0, 10, 1), + pd.RangeIndex(0, 20, 2), + pd.RangeIndex(-10, 10, 2), + pd.RangeIndex(5, -5, -1), + ] + self.check_binop(ops, scalars, idxs) + + def test_binops_pow(self): + # numpy does not allow powers of negative integers so test separately + # https://github.com/numpy/numpy/pull/8127 + ops = [pow] + scalars = [1, 2] + idxs = [pd.RangeIndex(0, 10, 1), pd.RangeIndex(0, 20, 2)] + self.check_binop(ops, scalars, idxs) + + # TODO: mod, divmod? 
+ @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + operator.pow, + ], + ) + def test_arithmetic_with_frame_or_series(self, op): + # check that we return NotImplemented when operating with Series + # or DataFrame + index = pd.RangeIndex(5) + other = pd.Series(np.random.randn(5)) + + expected = op(pd.Series(index), other) + result = op(index, other) + tm.assert_series_equal(result, expected) + + other = pd.DataFrame(np.random.randn(2, 5)) + expected = op(pd.DataFrame([index, index]), other) + result = op(index, other) + tm.assert_frame_equal(result, expected) + + def test_numeric_compat2(self): + # validate that we are handling the RangeIndex overrides to numeric ops + # and returning RangeIndex where possible + + idx = pd.RangeIndex(0, 10, 2) + + result = idx * 2 + expected = pd.RangeIndex(0, 20, 4) + tm.assert_index_equal(result, expected, exact=True) + + result = idx + 2 + expected = pd.RangeIndex(2, 12, 2) + tm.assert_index_equal(result, expected, exact=True) + + result = idx - 2 + expected = pd.RangeIndex(-2, 8, 2) + tm.assert_index_equal(result, expected, exact=True) + + result = idx / 2 + expected = pd.RangeIndex(0, 5, 1).astype("float64") + tm.assert_index_equal(result, expected, exact=True) + + result = idx / 4 + expected = pd.RangeIndex(0, 10, 2) / 4 + tm.assert_index_equal(result, expected, exact=True) + + result = idx // 1 + expected = idx + tm.assert_index_equal(result, expected, exact=True) + + # __mul__ + result = idx * idx + expected = Index(idx.values * idx.values) + tm.assert_index_equal(result, expected, exact=True) + + # __pow__ + idx = pd.RangeIndex(0, 1000, 2) + result = idx ** 2 + expected = idx._int64index ** 2 + tm.assert_index_equal(Index(result.values), expected, exact=True) + + # __floordiv__ + cases_exact = [ + (pd.RangeIndex(0, 1000, 2), 2, pd.RangeIndex(0, 500, 1)), + (pd.RangeIndex(-99, -201, -3), -3, pd.RangeIndex(33, 67, 1)), + (pd.RangeIndex(0, 1000, 1), 2, pd.RangeIndex(0, 1000, 1)._int64index // 2), + ( + pd.RangeIndex(0, 100, 1), + 2.0, + pd.RangeIndex(0, 100, 1)._int64index // 2.0, + ), + (pd.RangeIndex(0), 50, pd.RangeIndex(0)), + (pd.RangeIndex(2, 4, 2), 3, pd.RangeIndex(0, 1, 1)), + (pd.RangeIndex(-5, -10, -6), 4, pd.RangeIndex(-2, -1, 1)), + (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0)), + ] + for idx, div, expected in cases_exact: + tm.assert_index_equal(idx // div, expected, exact=True) + + @pytest.mark.parametrize("dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("delta", [1, 0, -1]) + def test_addsub_arithmetic(self, dtype, delta): + # GH#8142 + delta = dtype(delta) + index = pd.Index([10, 11, 12], dtype=dtype) + result = index + delta + expected = pd.Index(index.values + delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + # this subtraction used to fail + result = index - delta + expected = pd.Index(index.values - delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + tm.assert_index_equal(index + index, 2 * index) + tm.assert_index_equal(index - index, 0 * index) + assert not (index - index).empty + + +def test_fill_value_inf_masking(): + # GH #27464 make sure we mask 0/1 with Inf and not NaN + df = pd.DataFrame({"A": [0, 1, 2], "B": [1.1, None, 1.1]}) + + other = pd.DataFrame({"A": [1.1, 1.2, 1.3]}, index=[0, 2, 3]) + + result = df.rfloordiv(other, fill_value=1) + + expected = pd.DataFrame( + {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + +def 
test_dataframe_div_silenced(): + # GH#26793 + pdf1 = pd.DataFrame( + { + "A": np.arange(10), + "B": [np.nan, 1, 2, 3, 4] * 2, + "C": [np.nan] * 10, + "D": np.arange(10), + }, + index=list("abcdefghij"), + columns=list("ABCD"), + ) + pdf2 = pd.DataFrame( + np.random.randn(10, 4), index=list("abcdefghjk"), columns=list("ABCX") + ) + with tm.assert_produces_warning(None): + pdf1.div(pdf2, fill_value=0) diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_object.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_object.py new file mode 100644 index 0000000..d0f204a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_object.py @@ -0,0 +1,365 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for object dtype +import datetime +from decimal import Decimal +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timestamp +import pandas._testing as tm +from pandas.core import ops + +# ------------------------------------------------------------------ +# Comparisons + + +class TestObjectComparisons: + def test_comparison_object_numeric_nas(self): + ser = Series(np.random.randn(10), dtype=object) + shifted = ser.shift(2) + + ops = ["lt", "le", "gt", "ge", "eq", "ne"] + for op in ops: + func = getattr(operator, op) + + result = func(ser, shifted) + expected = func(ser.astype(float), shifted.astype(float)) + tm.assert_series_equal(result, expected) + + def test_object_comparisons(self): + ser = Series(["a", "b", np.nan, "c", "a"]) + + result = ser == "a" + expected = Series([True, False, False, False, True]) + tm.assert_series_equal(result, expected) + + result = ser < "a" + expected = Series([False, False, False, False, False]) + tm.assert_series_equal(result, expected) + + result = ser != "a" + expected = -(ser == "a") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_more_na_comparisons(self, dtype): + left = Series(["a", np.nan, "c"], dtype=dtype) + right = Series(["a", np.nan, "d"], dtype=dtype) + + result = left == right + expected = Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = left != right + expected = Series([False, True, True]) + tm.assert_series_equal(result, expected) + + result = left == np.nan + expected = Series([False, False, False]) + tm.assert_series_equal(result, expected) + + result = left != np.nan + expected = Series([True, True, True]) + tm.assert_series_equal(result, expected) + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestArithmetic: + + # TODO: parametrize + def test_pow_ops_object(self): + # GH#22922 + # pow is weird with masking & 1, so testing here + a = Series([1, np.nan, 1, np.nan], dtype=object) + b = Series([1, np.nan, np.nan, 1], dtype=object) + result = a ** b + expected = Series(a.values ** b.values, dtype=object) + tm.assert_series_equal(result, expected) + + result = b ** a + expected = Series(b.values ** a.values, dtype=object) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + @pytest.mark.parametrize("other", ["category", "Int64"]) + def test_add_extension_scalar(self, other, box_with_array, op): + # GH#22378 + # Check that scalars satisfying is_extension_array_dtype(obj) + # do not incorrectly try to dispatch to an ExtensionArray operation + + arr = pd.Series(["a", "b", "c"]) + expected = pd.Series([op(x, 
other) for x in arr]) + + arr = tm.box_expected(arr, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = op(arr, other) + tm.assert_equal(result, expected) + + def test_objarr_add_str(self, box): + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["xa", np.nan, "xa"]) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = ser + "a" + tm.assert_equal(result, expected) + + def test_objarr_radd_str(self, box): + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["ax", np.nan, "ax"]) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = "a" + ser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT], + ["x", "y", 1], + ], + ) + @pytest.mark.parametrize("dtype", [None, object]) + def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): + ser = Series(data, dtype=dtype) + + ser = tm.box_expected(ser, box_with_array) + with pytest.raises(TypeError): + "foo_" + ser + + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) + def test_objarr_add_invalid(self, op, box_with_array): + # invalid ops + box = box_with_array + + obj_ser = tm.makeObjectSeries() + obj_ser.name = "objects" + + obj_ser = tm.box_expected(obj_ser, box) + with pytest.raises(Exception): + op(obj_ser, 1) + with pytest.raises(Exception): + op(obj_ser, np.array(1, dtype=np.int64)) + + # TODO: Moved from tests.series.test_operators; needs cleanup + def test_operators_na_handling(self): + ser = Series(["foo", "bar", "baz", np.nan]) + result = "prefix_" + ser + expected = pd.Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) + tm.assert_series_equal(result, expected) + + result = ser + "_suffix" + expected = pd.Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) + tm.assert_series_equal(result, expected) + + # TODO: parametrize over box + @pytest.mark.parametrize("dtype", [None, object]) + def test_series_with_dtype_radd_timedelta(self, dtype): + # note this test is _not_ aimed at timedelta64-dtyped Series + ser = pd.Series( + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + dtype=dtype, + ) + expected = pd.Series( + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + ) + + result = pd.Timedelta("3 days") + ser + tm.assert_series_equal(result, expected) + + result = ser + pd.Timedelta("3 days") + tm.assert_series_equal(result, expected) + + # TODO: cleanup & parametrize over box + def test_mixed_timezone_series_ops_object(self): + # GH#13043 + ser = pd.Series( + [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-01", tz="Asia/Tokyo"), + ], + name="xxx", + ) + assert ser.dtype == object + + exp = pd.Series( + [ + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="Asia/Tokyo"), + ], + name="xxx", + ) + tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp) + tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) + + # object series & object series + ser2 = pd.Series( + [ + pd.Timestamp("2015-01-03", tz="US/Eastern"), + pd.Timestamp("2015-01-05", tz="Asia/Tokyo"), + ], + name="xxx", + ) + assert ser2.dtype == object + exp = pd.Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") + tm.assert_series_equal(ser2 - ser, exp) + tm.assert_series_equal(ser - ser2, -exp) + + ser = pd.Series( + [pd.Timedelta("01:00:00"), 
pd.Timedelta("02:00:00")], + name="xxx", + dtype=object, + ) + assert ser.dtype == object + + exp = pd.Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx" + ) + tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) + tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) + + # TODO: cleanup & parametrize over box + def test_iadd_preserves_name(self): + # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name + ser = pd.Series([1, 2, 3]) + ser.index.name = "foo" + + ser.index += 1 + assert ser.index.name == "foo" + + ser.index -= 1 + assert ser.index.name == "foo" + + def test_add_string(self): + # from bug report + index = pd.Index(["a", "b", "c"]) + index2 = index + "foo" + + assert "a" not in index2 + assert "afoo" in index2 + + def test_iadd_string(self): + index = pd.Index(["a", "b", "c"]) + # doesn't fail test unless there is a check before `+=` + assert "a" in index + + index += "_x" + assert "a_x" in index + + def test_add(self): + index = tm.makeStringIndex(100) + expected = pd.Index(index.values * 2) + tm.assert_index_equal(index + index, expected) + tm.assert_index_equal(index + index.tolist(), expected) + tm.assert_index_equal(index.tolist() + index, expected) + + # test add and radd + index = pd.Index(list("abc")) + expected = pd.Index(["a1", "b1", "c1"]) + tm.assert_index_equal(index + "1", expected) + expected = pd.Index(["1a", "1b", "1c"]) + tm.assert_index_equal("1" + index, expected) + + def test_sub_fail(self): + index = tm.makeStringIndex(100) + with pytest.raises(TypeError): + index - "a" + with pytest.raises(TypeError): + index - index + with pytest.raises(TypeError): + index - index.tolist() + with pytest.raises(TypeError): + index.tolist() - index + + def test_sub_object(self): + # GH#19369 + index = pd.Index([Decimal(1), Decimal(2)]) + expected = pd.Index([Decimal(0), Decimal(1)]) + + result = index - Decimal(1) + tm.assert_index_equal(result, expected) + + result = index - pd.Index([Decimal(1), Decimal(1)]) + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + index - "foo" + + with pytest.raises(TypeError): + index - np.array([2, "foo"]) + + def test_rsub_object(self): + # GH#19369 + index = pd.Index([Decimal(1), Decimal(2)]) + expected = pd.Index([Decimal(1), Decimal(0)]) + + result = Decimal(2) - index + tm.assert_index_equal(result, expected) + + result = np.array([Decimal(2), Decimal(2)]) - index + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + "foo" - index + + with pytest.raises(TypeError): + np.array([True, pd.Timestamp.now()]) - index + + +class MyIndex(pd.Index): + # Simple index subclass that tracks ops calls. 
+ + _calls: int + + @classmethod + def _simple_new(cls, values, name=None, dtype=None): + result = object.__new__(cls) + result._data = values + result._index_data = values + result._name = name + result._calls = 0 + + return result._reset_identity() + + def __add__(self, other): + self._calls += 1 + return self._simple_new(self._index_data) + + def __radd__(self, other): + return self.__add__(other) + + +@pytest.mark.parametrize( + "other", + [ + [datetime.timedelta(1), datetime.timedelta(2)], + [datetime.datetime(2000, 1, 1), datetime.datetime(2000, 1, 2)], + [pd.Period("2000"), pd.Period("2001")], + ["a", "b"], + ], + ids=["timedelta", "datetime", "period", "object"], +) +def test_index_ops_defer_to_unknown_subclasses(other): + # https://github.com/pandas-dev/pandas/issues/31109 + values = np.array( + [datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)], dtype=object + ) + a = MyIndex._simple_new(values) + other = pd.Index(other) + result = other + a + assert isinstance(result, MyIndex) + assert a._calls == 1 diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_period.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_period.py new file mode 100644 index 0000000..abb6672 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_period.py @@ -0,0 +1,1450 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +# Specifically for Period dtype +import operator + +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import Period, PeriodIndex, Series, period_range +import pandas._testing as tm +from pandas.core import ops +from pandas.core.arrays import TimedeltaArray + +from pandas.tseries.frequencies import to_offset + +from .common import assert_invalid_comparison + +# ------------------------------------------------------------------ +# Comparisons + + +class TestPeriodArrayLikeComparisons: + # Comparison tests for PeriodDtype vectors fully parametrized over + # DataFrame/Series/PeriodIndex/PeriodArray. Ideally all comparison + # tests will eventually end up here. 
+ + def test_compare_zerodim(self, box_with_array): + # GH#26689 make sure we unbox zero-dimensional arrays + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + pi = pd.period_range("2000", periods=4) + other = np.array(pi.to_numpy()[0]) + + pi = tm.box_expected(pi, box_with_array) + result = pi <= other + expected = np.array([True, False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "scalar", ["foo", pd.Timestamp.now(), pd.Timedelta(days=4)] + ) + def test_compare_invalid_scalar(self, box_with_array, scalar): + # comparison with scalar that cannot be interpreted as a Period + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, scalar, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("1D", periods=4).array, + np.arange(4), + np.arange(4).astype(np.float64), + list(range(4)), + ], + ) + def test_compare_invalid_listlike(self, box_with_array, other): + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, other, box_with_array) + + @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) + def test_compare_object_dtype(self, box_with_array, other_box): + pi = pd.period_range("2000", periods=5) + parr = tm.box_expected(pi, box_with_array) + + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + other = other_box(pi) + + expected = np.array([True, True, True, True, True]) + expected = tm.box_expected(expected, xbox) + + result = parr == other + tm.assert_equal(result, expected) + result = parr <= other + tm.assert_equal(result, expected) + result = parr >= other + tm.assert_equal(result, expected) + + result = parr != other + tm.assert_equal(result, ~expected) + result = parr < other + tm.assert_equal(result, ~expected) + result = parr > other + tm.assert_equal(result, ~expected) + + other = other_box(pi[::-1]) + + expected = np.array([False, False, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr == other + tm.assert_equal(result, expected) + + expected = np.array([True, True, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr <= other + tm.assert_equal(result, expected) + + expected = np.array([False, False, True, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr >= other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr != other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr < other + tm.assert_equal(result, expected) + + expected = np.array([False, False, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr > other + tm.assert_equal(result, expected) + + +class TestPeriodIndexComparisons: + # TODO: parameterize over boxes + + @pytest.mark.parametrize("other", ["2017", pd.Period("2017", freq="D")]) + def test_eq(self, other): + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + expected = np.array([True, True, False]) + result = idx == other + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + 2017, + [2017, 2017, 2017], + np.array([2017, 2017, 2017]), + 
np.array([2017, 2017, 2017], dtype=object), + pd.Index([2017, 2017, 2017]), + ], + ) + def test_eq_integer_disallowed(self, other): + # match Period semantics by not treating integers as Periods + + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + expected = np.array([False, False, False]) + result = idx == other + + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises(TypeError): + idx < other + with pytest.raises(TypeError): + idx > other + with pytest.raises(TypeError): + idx <= other + with pytest.raises(TypeError): + idx >= other + + def test_pi_cmp_period(self): + idx = period_range("2007-01", periods=20, freq="M") + + result = idx < idx[10] + exp = idx.values < idx.values[10] + tm.assert_numpy_array_equal(result, exp) + + # TODO: moved from test_datetime64; de-duplicate with version below + def test_parr_cmp_period_scalar2(self, box_with_array): + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + pi = pd.period_range("2000-01-01", periods=10, freq="D") + + val = Period("2000-01-04", freq="D") + expected = [x > val for x in pi] + + ser = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, xbox) + result = ser > val + tm.assert_equal(result, expected) + + val = pi[5] + result = ser > val + expected = [x > val for x in pi] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_period_scalar(self, freq, box_with_array): + # GH#13200 + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + per = Period("2011-02", freq=freq) + + exp = np.array([False, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base == per, exp) + tm.assert_equal(per == base, exp) + + exp = np.array([True, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base != per, exp) + tm.assert_equal(per != base, exp) + + exp = np.array([False, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base > per, exp) + tm.assert_equal(per < base, exp) + + exp = np.array([True, False, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base < per, exp) + tm.assert_equal(per > base, exp) + + exp = np.array([False, True, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base >= per, exp) + tm.assert_equal(per <= base, exp) + + exp = np.array([True, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base <= per, exp) + tm.assert_equal(per >= base, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_pi(self, freq, box_with_array): + # GH#13200 + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + + # TODO: could also box idx? 
+ idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq) + + exp = np.array([False, False, True, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base == idx, exp) + + exp = np.array([True, True, False, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base != idx, exp) + + exp = np.array([False, True, False, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base > idx, exp) + + exp = np.array([True, False, False, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base < idx, exp) + + exp = np.array([False, True, True, False]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base >= idx, exp) + + exp = np.array([True, False, True, True]) + exp = tm.box_expected(exp, xbox) + tm.assert_equal(base <= idx, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): + # GH#13200 + # different base freq + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) + base = tm.box_expected(base, box_with_array) + + msg = "Input has different freq=A-DEC from " + with pytest.raises(IncompatibleFrequency, match=msg): + base <= Period("2011", freq="A") + + with pytest.raises(IncompatibleFrequency, match=msg): + Period("2011", freq="A") >= base + + # TODO: Could parametrize over boxes for idx? + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=A-DEC\)" + idx_msg = rev_msg if box_with_array is tm.to_array else msg + with pytest.raises(IncompatibleFrequency, match=idx_msg): + base <= idx + + # Different frequency + msg = "Input has different freq=4M from " + with pytest.raises(IncompatibleFrequency, match=msg): + base <= Period("2011", freq="4M") + + with pytest.raises(IncompatibleFrequency, match=msg): + Period("2011", freq="4M") >= base + + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=4M\)" + idx_msg = rev_msg if box_with_array is tm.to_array else msg + with pytest.raises(IncompatibleFrequency, match=idx_msg): + base <= idx + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_pi_cmp_nat(self, freq): + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) + + result = idx1 > Period("2011-02", freq=freq) + exp = np.array([False, False, False, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period("2011-02", freq=freq) < idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == Period("NaT", freq=freq) + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + result = Period("NaT", freq=freq) == idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != Period("NaT", freq=freq) + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period("NaT", freq=freq) != idx1 + tm.assert_numpy_array_equal(result, exp) + + idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq) + result = idx1 < idx2 + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx2 + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx2 + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx1 + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(result, exp) + + result = 
idx1 != idx1 + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_pi_cmp_nat_mismatched_freq_raises(self, freq): + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) + + diff = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq="4M") + msg = "Input has different freq=4M from Period(Array|Index)" + with pytest.raises(IncompatibleFrequency, match=msg): + idx1 > diff + + with pytest.raises(IncompatibleFrequency, match=msg): + idx1 == diff + + # TODO: De-duplicate with test_pi_cmp_nat + @pytest.mark.parametrize("dtype", [object, None]) + def test_comp_nat(self, dtype): + left = pd.PeriodIndex( + [pd.Period("2011-01-01"), pd.NaT, pd.Period("2011-01-03")] + ) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period("2011-01-03")]) + + if dtype is not None: + left = left.astype(dtype) + right = right.astype(dtype) + + result = left == right + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = left != right + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(left == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == right, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(left != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != left, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(left < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > left, expected) + + +class TestPeriodSeriesComparisons: + def test_cmp_series_period_series_mixed_freq(self): + # GH#13200 + base = Series( + [ + Period("2011", freq="A"), + Period("2011-02", freq="M"), + Period("2013", freq="A"), + Period("2011-04", freq="M"), + ] + ) + + ser = Series( + [ + Period("2012", freq="A"), + Period("2011-01", freq="M"), + Period("2013", freq="A"), + Period("2011-05", freq="M"), + ] + ) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == ser, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != ser, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > ser, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < ser, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= ser, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= ser, exp) + + +class TestPeriodIndexSeriesComparisonConsistency: + """ Test PeriodIndex and Period Series Ops consistency """ + + # TODO: needs parametrization+de-duplication + + def _check(self, values, func, expected): + # Test PeriodIndex and Period Series Ops consistency + + idx = pd.PeriodIndex(values) + result = func(idx) + + # check that we don't pass an unwanted type to tm.assert_equal + assert isinstance(expected, (pd.Index, np.ndarray)) + tm.assert_equal(result, expected) + + s = pd.Series(values) + result = func(s) + + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_comp_period(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + + f = lambda x: x == pd.Period("2011-03", freq="M") + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period("2011-03", freq="M") == x + self._check(idx, f, exp) + + f = 
lambda x: x != pd.Period("2011-03", freq="M") + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period("2011-03", freq="M") != x + self._check(idx, f, exp) + + f = lambda x: pd.Period("2011-03", freq="M") >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.Period("2011-03", freq="M") + exp = np.array([False, False, False, True], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.Period("2011-03", freq="M") >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) + + f = lambda x: x == pd.Period("2011-03", freq="M") + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period("2011-03", freq="M") == x + self._check(idx, f, exp) + + f = lambda x: x == pd.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period("2011-03", freq="M") + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period("2011-03", freq="M") != x + self._check(idx, f, exp) + + f = lambda x: x != pd.NaT + exp = np.array([True, True, True, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.NaT != x + self._check(idx, f, exp) + + f = lambda x: pd.Period("2011-03", freq="M") >= x + exp = np.array([True, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x < pd.Period("2011-03", freq="M") + exp = np.array([True, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + +# ------------------------------------------------------------------ +# Arithmetic + + +class TestPeriodFrameArithmetic: + def test_ops_frame_period(self): + # GH#13043 + df = pd.DataFrame( + { + "A": [pd.Period("2015-01", freq="M"), pd.Period("2015-02", freq="M")], + "B": [pd.Period("2014-01", freq="M"), pd.Period("2014-02", freq="M")], + } + ) + assert df["A"].dtype == "Period[M]" + assert df["B"].dtype == "Period[M]" + + p = pd.Period("2015-03", freq="M") + off = p.freq + # dtype will be object because of original dtype + exp = pd.DataFrame( + { + "A": np.array([2 * off, 1 * off], dtype=object), + "B": np.array([14 * off, 13 * off], dtype=object), + } + ) + tm.assert_frame_equal(p - df, exp) + tm.assert_frame_equal(df - p, -1 * exp) + + df2 = pd.DataFrame( + { + "A": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + "B": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + } + ) + assert df2["A"].dtype == "Period[M]" + assert df2["B"].dtype == "Period[M]" + + exp = pd.DataFrame( + { + "A": np.array([4 * off, 4 * off], dtype=object), + "B": np.array([16 * off, 16 * off], dtype=object), + } + ) + tm.assert_frame_equal(df2 - df, exp) + tm.assert_frame_equal(df - df2, -1 * exp) + + +class TestPeriodIndexArithmetic: + # --------------------------------------------------------------- + # __add__/__sub__ with PeriodIndex + # PeriodIndex + other is defined for integers and timedelta-like others + # PeriodIndex - other is defined for 
integers, timedelta-like others, + # and PeriodIndex (with matching freq) + + def test_parr_add_iadd_parr_raises(self, box_with_array): + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) + # TODO: parametrize over boxes for other? + + rng = tm.box_expected(rng, box_with_array) + # An earlier implementation of PeriodIndex addition performed + # a set operation (union). This has since been changed to + # raise a TypeError. See GH#14164 and GH#13077 for historical + # reference. + with pytest.raises(TypeError): + rng + other + + with pytest.raises(TypeError): + rng += other + + def test_pi_sub_isub_pi(self): + # GH#20049 + # For historical reference see GH#14164, GH#13077. + # PeriodIndex subtraction originally performed set difference, + # then changed to raise TypeError before being implemented in GH#20049 + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) + + off = rng.freq + expected = pd.Index([-5 * off] * 5) + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_pi_sub_pi_with_nat(self): + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = rng[1:].insert(0, pd.NaT) + assert other[1:].equals(rng[1:]) + + result = rng - other + off = rng.freq + expected = pd.Index([pd.NaT, 0 * off, 0 * off, 0 * off, 0 * off]) + tm.assert_index_equal(result, expected) + + def test_parr_sub_pi_mismatched_freq(self, box_with_array): + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="H", periods=5) + # TODO: parametrize over boxes for other? + + rng = tm.box_expected(rng, box_with_array) + with pytest.raises(IncompatibleFrequency): + rng - other + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH 23878 + p1_d = "19910905" + p2_d = "19920406" + p1 = pd.PeriodIndex([p1_d], freq=tick_classes(n)) + p2 = pd.PeriodIndex([p2_d], freq=tick_classes(n)) + + expected = pd.PeriodIndex([p2_d], freq=p2.freq.base) - pd.PeriodIndex( + [p1_d], freq=p1.freq.base + ) + + tm.assert_index_equal((p2 - p1), expected) + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): + # GH 23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + freq = offset(n, normalize=False, **kwds) + p1 = pd.PeriodIndex([p1_d], freq=freq) + p2 = pd.PeriodIndex([p2_d], freq=freq) + + result = p2 - p1 + expected = pd.PeriodIndex([p2_d], freq=freq.base) - pd.PeriodIndex( + [p1_d], freq=freq.base + ) + + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------- + # Invalid Operations + + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) + def test_parr_add_sub_float_raises(self, op, other, box_with_array): + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + pi = dti.to_period("D") + pi = tm.box_expected(pi, box_with_array) + with pytest.raises(TypeError): + op(pi, other) + + @pytest.mark.parametrize( + "other", + [ + # datetime scalars + pd.Timestamp.now(), + 
pd.Timestamp.now().to_pydatetime(), + pd.Timestamp.now().to_datetime64(), + # datetime-like arrays + pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), + pd.date_range("2016-01-01", periods=3, freq="S")._data, + pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, + # Miscellaneous invalid types + ], + ) + def test_parr_add_sub_invalid(self, other, box_with_array): + # GH#23215 + rng = pd.period_range("1/1/2000", freq="D", periods=3) + rng = tm.box_expected(rng, box_with_array) + + with pytest.raises(TypeError): + rng + other + with pytest.raises(TypeError): + other + rng + with pytest.raises(TypeError): + rng - other + with pytest.raises(TypeError): + other - rng + + # ----------------------------------------------------------------- + # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] + + def test_pi_add_sub_td64_array_non_tick_raises(self): + rng = pd.period_range("1/1/2000", freq="Q", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) + tdarr = tdi.values + + with pytest.raises(IncompatibleFrequency): + rng + tdarr + with pytest.raises(IncompatibleFrequency): + tdarr + rng + + with pytest.raises(IncompatibleFrequency): + rng - tdarr + with pytest.raises(TypeError): + tdarr - rng + + def test_pi_add_sub_td64_array_tick(self): + # PeriodIndex + Timedelta-like is allowed only with + # tick-like frequencies + rng = pd.period_range("1/1/2000", freq="90D", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) + tdarr = tdi.values + + expected = pd.period_range("12/31/1999", freq="90D", periods=3) + result = rng + tdi + tm.assert_index_equal(result, expected) + result = rng + tdarr + tm.assert_index_equal(result, expected) + result = tdi + rng + tm.assert_index_equal(result, expected) + result = tdarr + rng + tm.assert_index_equal(result, expected) + + expected = pd.period_range("1/2/2000", freq="90D", periods=3) + + result = rng - tdi + tm.assert_index_equal(result, expected) + result = rng - tdarr + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + tdarr - rng + + with pytest.raises(TypeError): + tdi - rng + + # ----------------------------------------------------------------- + # operations with array/Index of DateOffset objects + + @pytest.mark.parametrize("box", [np.array, pd.Index]) + def test_pi_add_offset_array(self, box): + # GH#18849 + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + offs = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + expected = pd.PeriodIndex([pd.Period("2015Q2"), pd.Period("2015Q4")]) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi + offs + tm.assert_index_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = offs + pi + tm.assert_index_equal(res2, expected) + + unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + # addition/subtraction ops with incompatible offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. 
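# NOTE: illustrative sketch, not part of the vendored pandas test file.
# A minimal, runnable example of the behavior the surrounding test pins down:
# adding an array of anchored DateOffsets to a quarterly PeriodIndex works
# elementwise (with a PerformanceWarning), while tick offsets such as Hour or
# Minute are incompatible with the Q frequency and raise. Values are copied
# from the test; behavior is assumed for the pandas version vendored here.
import numpy as np
import pandas as pd

pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")])
offs = np.array([pd.offsets.QuarterEnd(n=1, startingMonth=12),
                 pd.offsets.QuarterEnd(n=-2, startingMonth=12)])
res = pi + offs  # PerformanceWarning; PeriodIndex(['2015Q2', '2015Q4'])
# pi + np.array([pd.offsets.Hour(1), pd.offsets.Minute(-2)])
#   -> warns, then raises IncompatibleFrequency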
+ with pytest.raises(IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + pi + unanchored + with pytest.raises(IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + unanchored + pi + + @pytest.mark.parametrize("box", [np.array, pd.Index]) + def test_pi_sub_offset_array(self, box): + # GH#18824 + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + other = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + + expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi - other + tm.assert_index_equal(res, expected) + + anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + + # addition/subtraction ops with anchored offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. + with pytest.raises(IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + pi - anchored + with pytest.raises(IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + anchored - pi + + def test_pi_add_iadd_int(self, one): + # Variants of `one` for #19012 + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) + result = rng + one + expected = pd.period_range("2000-01-01 10:00", freq="H", periods=10) + tm.assert_index_equal(result, expected) + rng += one + tm.assert_index_equal(rng, expected) + + def test_pi_sub_isub_int(self, one): + """ + PeriodIndex.__sub__ and __isub__ with several representations of + the integer 1, e.g. int, np.int64, np.uint8, ... + """ + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) + result = rng - one + expected = pd.period_range("2000-01-01 08:00", freq="H", periods=10) + tm.assert_index_equal(result, expected) + rng -= one + tm.assert_index_equal(rng, expected) + + @pytest.mark.parametrize("five", [5, np.array(5, dtype=np.int64)]) + def test_pi_sub_intlike(self, five): + rng = period_range("2007-01", periods=50) + + result = rng - five + exp = rng + (-five) + tm.assert_index_equal(result, exp) + + def test_pi_sub_isub_offset(self): + # offset + # DateOffset + rng = pd.period_range("2014", "2024", freq="A") + result = rng - pd.offsets.YearEnd(5) + expected = pd.period_range("2009", "2019", freq="A") + tm.assert_index_equal(result, expected) + rng -= pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + rng = pd.period_range("2014-01", "2016-12", freq="M") + result = rng - pd.offsets.MonthEnd(5) + expected = pd.period_range("2013-08", "2016-07", freq="M") + tm.assert_index_equal(result, expected) + + rng -= pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + @pytest.mark.parametrize("transpose", [True, False]) + def test_pi_add_offset_n_gt1(self, box_with_array, transpose): + # GH#23215 + # add offset to PeriodIndex with freq.n > 1 + + per = pd.Period("2016-01", freq="2M") + pi = pd.PeriodIndex([per]) + + expected = pd.PeriodIndex(["2016-03"], freq="2M") + + pi = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) + + result = pi + per.freq + tm.assert_equal(result, expected) + + result = per.freq + pi + tm.assert_equal(result, expected) + + def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): + # GH#23215 + # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0 + pi = pd.PeriodIndex(["2016-01"], freq="2M") + expected = pd.PeriodIndex(["2016-04"], freq="2M") + + 
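# NOTE: illustrative sketch, not part of the vendored pandas test file.
# The freq.n > 1 behavior tested here: a PeriodIndex with a 2-month frequency
# steps by its own freq, and an added offset does not need to be a multiple
# of freq.n. Values are copied from the neighboring tests and assume the
# vendored pandas version.
import pandas as pd

pi = pd.PeriodIndex(["2016-01"], freq="2M")
pi + pi.freq                  # PeriodIndex(['2016-03'], freq='2M')
pi + pd.offsets.MonthEnd(3)   # PeriodIndex(['2016-04'], freq='2M')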
# FIXME: with transposing these tests fail + pi = tm.box_expected(pi, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + result = pi + to_offset("3M") + tm.assert_equal(result, expected) + + result = to_offset("3M") + pi + tm.assert_equal(result, expected) + + # --------------------------------------------------------------- + # __add__/__sub__ with integer arrays + + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_pi_add_intarray(self, int_holder, op): + # GH#19959 + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) + other = int_holder([4, -1]) + + result = op(pi, other) + expected = pd.PeriodIndex([pd.Period("2016Q1"), pd.Period("NaT")]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + def test_pi_sub_intarray(self, int_holder): + # GH#19959 + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) + other = int_holder([4, -1]) + + result = pi - other + expected = pd.PeriodIndex([pd.Period("2014Q1"), pd.Period("NaT")]) + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + other - pi + + # --------------------------------------------------------------- + # Timedelta-like (timedelta, timedelta64, Timedelta, Tick) + # TODO: Some of these are misnomers because of non-Tick DateOffsets + + def test_pi_add_timedeltalike_minute_gt1(self, three_days): + # GH#23031 adding a time-delta-like offset to a PeriodArray that has + # minute frequency with n != 1. A more general case is tested below + # in test_pi_add_timedeltalike_tick_gt1, but here we write out the + # expected result more explicitly. + other = three_days + rng = pd.period_range("2014-05-01", periods=3, freq="2D") + + expected = pd.PeriodIndex(["2014-05-04", "2014-05-06", "2014-05-08"], freq="2D") + + result = rng + other + tm.assert_index_equal(result, expected) + + result = other + rng + tm.assert_index_equal(result, expected) + + # subtraction + expected = pd.PeriodIndex(["2014-04-28", "2014-04-30", "2014-05-02"], freq="2D") + result = rng - other + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + other - rng + + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5T", "5h", "5d"]) + def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr): + # GH#23031 adding a time-delta-like offset to a PeriodArray that has + # tick-like frequency with n != 1 + other = three_days + rng = pd.period_range("2014-05-01", periods=6, freq=freqstr) + + expected = pd.period_range(rng[0] + other, periods=6, freq=freqstr) + + result = rng + other + tm.assert_index_equal(result, expected) + + result = other + rng + tm.assert_index_equal(result, expected) + + # subtraction + expected = pd.period_range(rng[0] - other, periods=6, freq=freqstr) + result = rng - other + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + other - rng + + def test_pi_add_iadd_timedeltalike_daily(self, three_days): + # Tick + other = three_days + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = pd.period_range("2014-05-04", "2014-05-18", freq="D") + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_pi_sub_isub_timedeltalike_daily(self, three_days): + # Tick-like 3 Days + other = three_days + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = 
pd.period_range("2014-04-28", "2014-05-12", freq="D") + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily): + other = not_daily + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=D\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="H") + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): + other = not_hourly + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=H\\)" + + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + + def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_add_iadd_timedeltalike_annual(self): + # offset + # DateOffset + rng = pd.period_range("2014", "2024", freq="A") + result = rng + pd.offsets.YearEnd(5) + expected = pd.period_range("2019", "2029", freq="A") + tm.assert_index_equal(result, expected) + rng += pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): + other = mismatched_freq + rng = pd.period_range("2014", "2024", freq="A") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=A-DEC\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + def test_pi_add_iadd_timedeltalike_M(self): + rng = pd.period_range("2014-01", "2016-12", freq="M") + expected = pd.period_range("2014-06", "2017-05", freq="M") + + result = rng + pd.offsets.MonthEnd(5) + tm.assert_index_equal(result, expected) + + rng += pd.offsets.MonthEnd(5) + tm.assert_index_equal(rng, expected) + + def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): + other = mismatched_freq + rng = pd.period_range("2014-01", "2016-12", freq="M") + msg = "Input has different freq(=.+)? 
from Period.*?\\(freq=M\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + rng + other + with pytest.raises(IncompatibleFrequency, match=msg): + rng += other + with pytest.raises(IncompatibleFrequency, match=msg): + rng - other + with pytest.raises(IncompatibleFrequency, match=msg): + rng -= other + + @pytest.mark.parametrize("transpose", [True, False]) + def test_parr_add_sub_td64_nat(self, box_with_array, transpose): + # GH#23320 special handling for timedelta64("NaT") + pi = pd.period_range("1994-04-01", periods=9, freq="19D") + other = np.timedelta64("NaT") + expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") + + obj = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + with pytest.raises(TypeError): + other - obj + + @pytest.mark.parametrize( + "other", + [ + np.array(["NaT"] * 9, dtype="m8[ns]"), + TimedeltaArray._from_sequence(["NaT"] * 9), + ], + ) + def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): + pi = pd.period_range("1994-04-01", periods=9, freq="19D") + expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") + + obj = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = obj + other + tm.assert_equal(result, expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + with pytest.raises(TypeError): + other - obj + + # --------------------------------------------------------------- + # Unsorted + + def test_parr_add_sub_index(self): + # Check that PeriodArray defers to Index on arithmetic ops + pi = pd.period_range("2000-12-31", periods=3) + parr = pi.array + + result = parr - pi + expected = pi - pi + tm.assert_index_equal(result, expected) + + def test_parr_add_sub_object_array(self): + pi = pd.period_range("2000-12-31", periods=3, freq="D") + parr = pi.array + + other = np.array([pd.Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = pd.PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + ).array + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = pd.PeriodIndex(["2000-12-30"] * 3, freq="D").array + tm.assert_equal(result, expected) + + +class TestPeriodSeriesArithmetic: + def test_ops_series_timedelta(self): + # GH#13043 + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" + + expected = pd.Series( + [pd.Period("2015-01-02", freq="D"), pd.Period("2015-01-03", freq="D")], + name="xxx", + ) + + result = ser + pd.Timedelta("1 days") + tm.assert_series_equal(result, expected) + + result = pd.Timedelta("1 days") + ser + tm.assert_series_equal(result, expected) + + result = ser + pd.tseries.offsets.Day() + tm.assert_series_equal(result, expected) + + result = pd.tseries.offsets.Day() + ser + tm.assert_series_equal(result, expected) + + def test_ops_series_period(self): + # GH#13043 + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" + + per = pd.Period("2015-01-10", freq="D") + off = per.freq + # dtype will be object 
because of original dtype + expected = pd.Series([9 * off, 8 * off], name="xxx", dtype=object) + tm.assert_series_equal(per - ser, expected) + tm.assert_series_equal(ser - per, -1 * expected) + + s2 = pd.Series( + [pd.Period("2015-01-05", freq="D"), pd.Period("2015-01-04", freq="D")], + name="xxx", + ) + assert s2.dtype == "Period[D]" + + expected = pd.Series([4 * off, 2 * off], name="xxx", dtype=object) + tm.assert_series_equal(s2 - ser, expected) + tm.assert_series_equal(ser - s2, -1 * expected) + + +class TestPeriodIndexSeriesMethods: + """ Test PeriodIndex and Period Series Ops consistency """ + + def _check(self, values, func, expected): + idx = pd.PeriodIndex(values) + result = func(idx) + tm.assert_equal(result, expected) + + ser = pd.Series(values) + result = func(ser) + + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) + + def test_pi_ops(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + + expected = PeriodIndex( + ["2011-03", "2011-04", "2011-05", "2011-06"], freq="M", name="idx" + ) + + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + + result = idx - Period("2011-01", freq="M") + off = idx.freq + exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = Period("2011-01", freq="M") - idx + exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name="idx") + tm.assert_index_equal(result, exp) + + @pytest.mark.parametrize("ng", ["str", 1.5]) + @pytest.mark.parametrize( + "func", + [ + lambda obj, ng: obj + ng, + lambda obj, ng: ng + obj, + lambda obj, ng: obj - ng, + lambda obj, ng: ng - obj, + lambda obj, ng: np.add(obj, ng), + lambda obj, ng: np.add(ng, obj), + lambda obj, ng: np.subtract(obj, ng), + lambda obj, ng: np.subtract(ng, obj), + ], + ) + def test_parr_ops_errors(self, ng, func, box_with_array): + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + obj = tm.box_expected(idx, box_with_array) + msg = ( + r"unsupported operand type\(s\)|can only concatenate|" + r"must be str|object to str implicitly" + ) + + with pytest.raises(TypeError, match=msg): + func(obj, ng) + + def test_pi_ops_nat(self): + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + expected = PeriodIndex( + ["2011-03", "2011-04", "NaT", "2011-06"], freq="M", name="idx" + ) + + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) + + self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="2M", name="idx" + ) + expected = PeriodIndex( + ["2011-07", "2011-08", "NaT", "2011-10"], freq="2M", name="idx" + ) + + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) + + def test_pi_ops_array_int(self): + + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + f = lambda x: x + np.array([1, 2, 3, 4]) + exp = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: np.add(x, np.array([4, -1, 
1, 2])) + exp = PeriodIndex( + ["2011-05", "2011-01", "NaT", "2011-06"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: x - np.array([1, 2, 3, 4]) + exp = PeriodIndex( + ["2010-12", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) + exp = PeriodIndex( + ["2010-10", "2010-12", "NaT", "2011-06"], freq="M", name="idx" + ) + self._check(idx, f, exp) + + def test_pi_ops_offset(self): + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) + f = lambda x: x + pd.offsets.Day() + exp = PeriodIndex( + ["2011-01-02", "2011-02-02", "2011-03-02", "2011-04-02"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + f = lambda x: x + pd.offsets.Day(2) + exp = PeriodIndex( + ["2011-01-03", "2011-02-03", "2011-03-03", "2011-04-03"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + f = lambda x: x - pd.offsets.Day(2) + exp = PeriodIndex( + ["2010-12-30", "2011-01-30", "2011-02-27", "2011-03-30"], + freq="D", + name="idx", + ) + self._check(idx, f, exp) + + def test_pi_offset_errors(self): + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) + ser = pd.Series(idx) + + # Series op is applied per Period instance, thus error is raised + # from Period + for obj in [idx, ser]: + msg = r"Input has different freq=2H from Period.*?\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + obj + pd.offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + pd.offsets.Hour(2) + obj + + msg = r"Input has different freq=-2H from Period.*?\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + obj - pd.offsets.Hour(2) + + def test_pi_sub_period(self): + # GH#13071 + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) + + result = idx - pd.Period("2012-01", freq="M") + off = idx.freq + exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = np.subtract(idx, pd.Period("2012-01", freq="M")) + tm.assert_index_equal(result, exp) + + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = np.subtract(pd.Period("2012-01", freq="M"), idx) + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) + + def test_pi_sub_pdnat(self): + # GH#13071 + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + exp = pd.TimedeltaIndex([pd.NaT] * 4, name="idx") + tm.assert_index_equal(pd.NaT - idx, exp) + tm.assert_index_equal(idx - pd.NaT, exp) + + def test_pi_sub_period_nat(self): + # GH#13071 + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) + + result = idx - pd.Period("2012-01", freq="M") + off = idx.freq + exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name="idx") + tm.assert_index_equal(result, exp) + + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name="idx") + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), 
exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) diff --git a/venv/Lib/site-packages/pandas/tests/arithmetic/test_timedelta64.py b/venv/Lib/site-packages/pandas/tests/arithmetic/test_timedelta64.py new file mode 100644 index 0000000..158da37 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arithmetic/test_timedelta64.py @@ -0,0 +1,2176 @@ +# Arithmetic tests for DataFrame/Series/Index/Array classes that should +# behave identically. +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas.errors import OutOfBoundsDatetime, PerformanceWarning + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + NaT, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + timedelta_range, +) +import pandas._testing as tm +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) + +# ------------------------------------------------------------------ +# Timedelta64[ns] dtype Comparisons + + +class TestTimedelta64ArrayLikeComparisons: + # Comparison tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all comparison + # tests will eventually end up here. + + def test_compare_timedelta64_zerodim(self, box_with_array): + # GH#26689 should unbox when comparing with zerodim array + box = box_with_array + xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + + tdi = pd.timedelta_range("2H", periods=4) + other = np.array(tdi.to_numpy()[0]) + + tdi = tm.box_expected(tdi, box) + res = tdi <= other + expected = np.array([True, False, False, False]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) + + with pytest.raises(TypeError): + # zero-dim of wrong dtype should still raise + tdi >= np.array(4) + + @pytest.mark.parametrize( + "td_scalar", + [timedelta(days=1), Timedelta(days=1), Timedelta(days=1).to_timedelta64()], + ) + def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): + # regression test for GH#5963 + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray + ser = pd.Series([timedelta(days=1), timedelta(days=2)]) + ser = tm.box_expected(ser, box) + actual = ser > td_scalar + expected = pd.Series([False, True]) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(actual, expected) + + @pytest.mark.parametrize("invalid", [345600000000000, "a"]) + def test_td64_comparisons_invalid(self, box_with_array, invalid): + # GH#13624 for str + box = box_with_array + rng = timedelta_range("1 days", periods=10) + obj = tm.box_expected(rng, box) + + assert_invalid_comparison(obj, invalid, box) + + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.date_range("1970-01-01", periods=10, tz="UTC").array, + np.array(pd.date_range("1970-01-01", periods=10)), + list(pd.date_range("1970-01-01", periods=10)), + pd.date_range("1970-01-01", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_td64arr_cmp_arraylike_invalid(self, other): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + + rng = timedelta_range("1 days", periods=10)._data + assert_invalid_comparison(rng, other, tm.to_array) + + def 
test_td64arr_cmp_mixed_invalid(self): + rng = timedelta_range("1 days", periods=5)._data + + other = np.array([0, 1, 2, rng[3], pd.Timestamp.now()]) + result = rng == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + rng < other + with pytest.raises(TypeError, match=msg): + rng > other + with pytest.raises(TypeError, match=msg): + rng <= other + with pytest.raises(TypeError, match=msg): + rng >= other + + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box + + @pytest.mark.parametrize("dtype", [None, object]) + def test_comp_nat(self, dtype): + left = pd.TimedeltaIndex( + [pd.Timedelta("1 days"), pd.NaT, pd.Timedelta("3 days")] + ) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta("3 days")]) + + lhs, rhs = left, right + if dtype is object: + lhs, rhs = left.astype(object), right.astype(object) + + result = rhs == lhs + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = rhs != lhs + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) + + def test_comparisons_nat(self): + tdidx1 = pd.TimedeltaIndex( + [ + "1 day", + pd.NaT, + "1 day 00:00:01", + pd.NaT, + "1 day 00:00:01", + "5 day 00:00:03", + ] + ) + tdidx2 = pd.TimedeltaIndex( + ["2 day", "2 day", pd.NaT, pd.NaT, "1 day 00:00:02", "5 days 00:00:03"] + ) + tdarr = np.array( + [ + np.timedelta64(2, "D"), + np.timedelta64(2, "D"), + np.timedelta64("nat"), + np.timedelta64("nat"), + np.timedelta64(1, "D") + np.timedelta64(2, "s"), + np.timedelta64(5, "D") + np.timedelta64(3, "s"), + ] + ) + + cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] + + # Check pd.NaT is handles as the same as np.nan + for idx1, idx2 in cases: + + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + # TODO: better name + def test_comparisons_coverage(self): + rng = timedelta_range("1 days", periods=10) + + result = rng < rng[3] + expected = np.array([True, True, True] + [False] * 7) + tm.assert_numpy_array_equal(result, expected) + + result = rng == list(rng) + 
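# NOTE: illustrative sketch, not part of the vendored pandas test file.
# The NaT comparison semantics asserted by test_comp_nat above: NaT never
# compares equal, != is always True, and ordered comparisons are always
# False. Illustrative only, assuming the vendored pandas version.
import pandas as pd

tdi = pd.TimedeltaIndex([pd.Timedelta("1 days"), pd.NaT, pd.Timedelta("3 days")])
tdi == pd.NaT   # array([False, False, False])
tdi != pd.NaT   # array([ True,  True,  True])
tdi < pd.NaT    # array([False, False, False])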
exp = rng == rng + tm.assert_numpy_array_equal(result, exp) + + +# ------------------------------------------------------------------ +# Timedelta64[ns] dtype Arithmetic Operations + + +class TestTimedelta64ArithmeticUnsorted: + # Tests moved from type-specific test files but not + # yet sorted/parametrized/de-duplicated + + def test_ufunc_coercions(self): + # normal ops are also tested in tseries/test_timedeltas.py + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") + + for result in [idx * 2, np.multiply(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "4H" + + for result in [idx / 2, np.divide(idx, 2)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "H" + + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") + for result in [-idx, np.negative(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex( + ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ) + tm.assert_index_equal(result, exp) + assert result.freq == "-2H" + + idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") + for result in [abs(idx), np.absolute(idx)]: + assert isinstance(result, TimedeltaIndex) + exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") + tm.assert_index_equal(result, exp) + assert result.freq is None + + def test_subtraction_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") + + msg = "cannot subtract a datelike from a TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi - dt + with pytest.raises(TypeError, match=msg): + tdi - dti + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + td - dt + + msg = "(bad|unsupported) operand type for unary" + with pytest.raises(TypeError, match=msg): + td - dti + + result = dt - dti + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"], name="bar") + tm.assert_index_equal(result, expected) + + result = dti - dt + expected = TimedeltaIndex(["0 days", "1 days", "2 days"], name="bar") + tm.assert_index_equal(result, expected) + + result = tdi - td + expected = TimedeltaIndex(["0 days", pd.NaT, "1 days"], name="foo") + tm.assert_index_equal(result, expected, check_names=False) + + result = td - tdi + expected = TimedeltaIndex(["0 days", pd.NaT, "-1 days"], name="foo") + tm.assert_index_equal(result, expected, check_names=False) + + result = dti - td + expected = DatetimeIndex(["20121231", "20130101", "20130102"], name="bar") + tm.assert_index_equal(result, expected, check_names=False) + + result = dt - tdi + expected = DatetimeIndex(["20121231", pd.NaT, "20121230"], name="foo") + tm.assert_index_equal(result, expected) + + def test_subtraction_ops_with_tz(self): + + # check that dt/dti subtraction ops with tz are validated + dti = pd.date_range("20130101", periods=3) + ts = Timestamp("20130101") + dt = ts.to_pydatetime() + dti_tz = pd.date_range("20130101", periods=3).tz_localize("US/Eastern") + ts_tz = Timestamp("20130101").tz_localize("US/Eastern") + ts_tz2 = Timestamp("20130101").tz_localize("CET") + dt_tz = ts_tz.to_pydatetime() + td = 
Timedelta("1 days") + + def _check(result, expected): + assert result == expected + assert isinstance(result, Timedelta) + + # scalars + result = ts - ts + expected = Timedelta("0 days") + _check(result, expected) + + result = dt_tz - ts_tz + expected = Timedelta("0 days") + _check(result, expected) + + result = ts_tz - dt_tz + expected = Timedelta("0 days") + _check(result, expected) + + # tz mismatches + msg = "Timestamp subtraction must have the same timezones or no timezones" + with pytest.raises(TypeError, match=msg): + dt_tz - ts + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt_tz - dt + msg = "Timestamp subtraction must have the same timezones or no timezones" + with pytest.raises(TypeError, match=msg): + dt_tz - ts_tz2 + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt - dt_tz + msg = "Timestamp subtraction must have the same timezones or no timezones" + with pytest.raises(TypeError, match=msg): + ts - dt_tz + with pytest.raises(TypeError, match=msg): + ts_tz2 - ts + with pytest.raises(TypeError, match=msg): + ts_tz2 - dt + with pytest.raises(TypeError, match=msg): + ts_tz - ts_tz2 + + # with dti + with pytest.raises(TypeError, match=msg): + dti - ts_tz + with pytest.raises(TypeError, match=msg): + dti_tz - ts + with pytest.raises(TypeError, match=msg): + dti_tz - ts_tz2 + + result = dti_tz - dt_tz + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) + tm.assert_index_equal(result, expected) + + result = dt_tz - dti_tz + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) + tm.assert_index_equal(result, expected) + + result = dti_tz - ts_tz + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) + tm.assert_index_equal(result, expected) + + result = ts_tz - dti_tz + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) + tm.assert_index_equal(result, expected) + + result = td - td + expected = Timedelta("0 days") + _check(result, expected) + + result = dti_tz - td + expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern") + tm.assert_index_equal(result, expected) + + def test_dti_tdi_numeric_ops(self): + # These are normally union/diff set-like ops + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + + # TODO(wesm): unused? 
+ # td = Timedelta('1 days') + # dt = Timestamp('20130101') + + result = tdi - tdi + expected = TimedeltaIndex(["0 days", pd.NaT, "0 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = tdi + tdi + expected = TimedeltaIndex(["2 days", pd.NaT, "4 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = dti - tdi # name will be reset + expected = DatetimeIndex(["20121231", pd.NaT, "20130101"]) + tm.assert_index_equal(result, expected) + + def test_addition_ops(self): + # with datetimes/timedelta and tdi/dti + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") + + result = tdi + dt + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") + tm.assert_index_equal(result, expected) + + result = dt + tdi + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") + tm.assert_index_equal(result, expected) + + result = td + tdi + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") + tm.assert_index_equal(result, expected) + + result = tdi + td + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") + tm.assert_index_equal(result, expected) + + # unequal length + msg = "cannot add indices of unequal length" + with pytest.raises(ValueError, match=msg): + tdi + dti[0:1] + with pytest.raises(ValueError, match=msg): + tdi[0:1] + dti + + # random indexes + with pytest.raises(TypeError): + tdi + pd.Int64Index([1, 2, 3]) + + # this is a union! + # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) + + result = tdi + dti # name will be reset + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) + tm.assert_index_equal(result, expected) + + result = dti + tdi # name will be reset + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) + tm.assert_index_equal(result, expected) + + result = dt + td + expected = Timestamp("20130102") + assert result == expected + + result = td + dt + expected = Timestamp("20130102") + assert result == expected + + # TODO: Needs more informative name, probably split up into + # more targeted tests + @pytest.mark.parametrize("freq", ["D", "B"]) + def test_timedelta(self, freq): + index = pd.date_range("1/1/2000", periods=50, freq=freq) + + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + tm.assert_index_equal(index, back) + + if freq == "D": + expected = pd.tseries.offsets.Day(1) + assert index.freq == expected + assert shifted.freq == expected + assert back.freq == expected + else: # freq == 'B' + assert index.freq == pd.tseries.offsets.BusinessDay(1) + assert shifted.freq is None + assert back.freq == pd.tseries.offsets.BusinessDay(1) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + # GH#4134, buggy with timedeltas + rng = pd.date_range("2013", "2014") + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) + + def test_tda_add_sub_index(self): + # Check that TimedeltaArray defers to Index on arithmetic ops + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) + tda = tdi.array + + dti = pd.date_range("1999-12-31", periods=3, freq="D") + + result = tda + dti + expected = tdi + dti + tm.assert_index_equal(result, expected) + + 
result = tda + tdi + expected = tdi + tdi + tm.assert_index_equal(result, expected) + + result = tda - tdi + expected = tdi - tdi + tm.assert_index_equal(result, expected) + + # ------------------------------------------------------------- + # Binary operations TimedeltaIndex and timedelta-like + + def test_tdi_iadd_timedeltalike(self, two_hours): + # only test adding/sub offsets as + is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") + rng += two_hours + tm.assert_index_equal(rng, expected) + + def test_tdi_isub_timedeltalike(self, two_hours): + # only test adding/sub offsets as - is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") + rng -= two_hours + tm.assert_index_equal(rng, expected) + + # ------------------------------------------------------------- + + def test_tdi_ops_attributes(self): + rng = timedelta_range("2 days", periods=5, freq="2D", name="x") + + result = rng + 1 * rng.freq + exp = timedelta_range("4 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng - 2 * rng.freq + exp = timedelta_range("-2 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng * 2 + exp = timedelta_range("4 days", periods=5, freq="4D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "4D" + + result = rng / 2 + exp = timedelta_range("1 days", periods=5, freq="D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "D" + + result = -rng + exp = timedelta_range("-2 days", periods=5, freq="-2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "-2D" + + rng = pd.timedelta_range("-2 days", periods=5, freq="D", name="x") + + result = abs(rng) + exp = TimedeltaIndex( + ["2 days", "1 days", "0 days", "1 days", "2 days"], name="x" + ) + tm.assert_index_equal(result, exp) + assert result.freq is None + + +class TestAddSubNaTMasking: + # TODO: parametrize over boxes + + def test_tdi_add_timestamp_nat_masking(self): + # GH#17991 checking for overflow-masking with NaT + tdinat = pd.to_timedelta(["24658 days 11:15:00", "NaT"]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] + + for variant in ts_neg_variants + ts_pos_variants: + res = tdinat + variant + assert res[1] is pd.NaT + + def test_tdi_add_overflow(self): + # See GH#14068 + # preliminary test scalar analogue of vectorized tests below + with pytest.raises(OutOfBoundsDatetime): + pd.to_timedelta(106580, "D") + Timestamp("2000") + with pytest.raises(OutOfBoundsDatetime): + Timestamp("2000") + pd.to_timedelta(106580, "D") + + _NaT = int(pd.NaT) + 1 + msg = "Overflow in int64 addition" + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta([106580], "D") + Timestamp("2000") + with pytest.raises(OverflowError, match=msg): + Timestamp("2000") + pd.to_timedelta([106580], "D") + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta([_NaT]) - Timedelta("1 days") + with pytest.raises(OverflowError, match=msg): + pd.to_timedelta(["5 days", _NaT]) - 
Timedelta("1 days") + with pytest.raises(OverflowError, match=msg): + ( + pd.to_timedelta([_NaT, "5 days", "1 hours"]) + - pd.to_timedelta(["7 seconds", _NaT, "4 hours"]) + ) + + # These should not overflow! + exp = TimedeltaIndex([pd.NaT]) + result = pd.to_timedelta([pd.NaT]) - Timedelta("1 days") + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(["4 days", pd.NaT]) + result = pd.to_timedelta(["5 days", pd.NaT]) - Timedelta("1 days") + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, "5 hours"]) + result = pd.to_timedelta([pd.NaT, "5 days", "1 hours"]) + pd.to_timedelta( + ["7 seconds", pd.NaT, "4 hours"] + ) + tm.assert_index_equal(result, exp) + + +class TestTimedeltaArraylikeAddSubOps: + # Tests for timedelta64[ns] __add__, __sub__, __radd__, __rsub__ + + # TODO: moved from tests.indexes.timedeltas.test_arithmetic; needs + # parametrization+de-duplication + def test_timedelta_ops_with_missing_values(self): + # setup + s1 = pd.to_timedelta(Series(["00:00:01"])) + s2 = pd.to_timedelta(Series(["00:00:02"])) + + msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + with pytest.raises(TypeError, match=msg): + # Passing datetime64-dtype data to TimedeltaIndex is no longer + # supported GH#29794 + pd.to_timedelta(Series([pd.NaT])) + + sn = pd.to_timedelta(Series([pd.NaT], dtype="m8[ns]")) + + df1 = pd.DataFrame(["00:00:01"]).apply(pd.to_timedelta) + df2 = pd.DataFrame(["00:00:02"]).apply(pd.to_timedelta) + with pytest.raises(TypeError, match=msg): + # Passing datetime64-dtype data to TimedeltaIndex is no longer + # supported GH#29794 + pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + + dfn = pd.DataFrame([pd.NaT.value]).apply(pd.to_timedelta) + + scalar1 = pd.to_timedelta("00:00:01") + scalar2 = pd.to_timedelta("00:00:02") + timedelta_NaT = pd.to_timedelta("NaT") + + actual = scalar1 + scalar1 + assert actual == scalar2 + actual = scalar2 - scalar1 + assert actual == scalar1 + + actual = s1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - s1 + tm.assert_series_equal(actual, s1) + + actual = s1 + scalar1 + tm.assert_series_equal(actual, s2) + actual = scalar1 + s1 + tm.assert_series_equal(actual, s2) + actual = s2 - scalar1 + tm.assert_series_equal(actual, s1) + actual = -scalar1 + s2 + tm.assert_series_equal(actual, s1) + + actual = s1 + timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + actual = s1 - timedelta_NaT + tm.assert_series_equal(actual, sn) + actual = -timedelta_NaT + s1 + tm.assert_series_equal(actual, sn) + + with pytest.raises(TypeError): + s1 + np.nan + with pytest.raises(TypeError): + np.nan + s1 + with pytest.raises(TypeError): + s1 - np.nan + with pytest.raises(TypeError): + -np.nan + s1 + + actual = s1 + pd.NaT + tm.assert_series_equal(actual, sn) + actual = s2 - pd.NaT + tm.assert_series_equal(actual, sn) + + actual = s1 + df1 + tm.assert_frame_equal(actual, df2) + actual = s2 - df1 + tm.assert_frame_equal(actual, df1) + actual = df1 + s1 + tm.assert_frame_equal(actual, df2) + actual = df2 - s1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + df1 + tm.assert_frame_equal(actual, df2) + actual = df2 - df1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + scalar1 + tm.assert_frame_equal(actual, df2) + actual = df2 - scalar1 + tm.assert_frame_equal(actual, df1) + + actual = df1 + timedelta_NaT + tm.assert_frame_equal(actual, dfn) + actual = df1 - timedelta_NaT + tm.assert_frame_equal(actual, dfn) + + with 
pytest.raises(TypeError): + df1 + np.nan + with pytest.raises(TypeError): + df1 - np.nan + + actual = df1 + pd.NaT # NaT is datetime, not timedelta + tm.assert_frame_equal(actual, dfn) + actual = df1 - pd.NaT + tm.assert_frame_equal(actual, dfn) + + # TODO: moved from tests.series.test_operators, needs splitting, cleanup, + # de-duplication, box-parametrization... + def test_operators_timedelta64(self): + # series ops + v1 = pd.date_range("2012-1-1", periods=3, freq="D") + v2 = pd.date_range("2012-1-2", periods=3, freq="D") + rs = Series(v2) - Series(v1) + xp = Series(1e9 * 3600 * 24, rs.index).astype("int64").astype("timedelta64[ns]") + tm.assert_series_equal(rs, xp) + assert rs.dtype == "timedelta64[ns]" + + df = DataFrame(dict(A=v1)) + td = Series([timedelta(days=i) for i in range(3)]) + assert td.dtype == "timedelta64[ns]" + + # series on the rhs + result = df["A"] - df["A"].shift() + assert result.dtype == "timedelta64[ns]" + + result = df["A"] + td + assert result.dtype == "M8[ns]" + + # scalar Timestamp on rhs + maxa = df["A"].max() + assert isinstance(maxa, Timestamp) + + resultb = df["A"] - df["A"].max() + assert resultb.dtype == "timedelta64[ns]" + + # timestamp on lhs + result = resultb + df["A"] + values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + expected = Series(values, name="A") + tm.assert_series_equal(result, expected) + + # datetimes on rhs + result = df["A"] - datetime(2001, 1, 1) + expected = Series([timedelta(days=4017 + i) for i in range(3)], name="A") + tm.assert_series_equal(result, expected) + assert result.dtype == "m8[ns]" + + d = datetime(2001, 1, 1, 3, 4) + resulta = df["A"] - d + assert resulta.dtype == "m8[ns]" + + # roundtrip + resultb = resulta + d + tm.assert_series_equal(df["A"], resultb) + + # timedeltas on rhs + td = timedelta(days=1) + resulta = df["A"] + td + resultb = resulta - td + tm.assert_series_equal(resultb, df["A"]) + assert resultb.dtype == "M8[ns]" + + # roundtrip + td = timedelta(minutes=5, seconds=3) + resulta = df["A"] + td + resultb = resulta - td + tm.assert_series_equal(df["A"], resultb) + assert resultb.dtype == "M8[ns]" + + # inplace + value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1)) + rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1)) + assert rs[2] == value + + def test_timedelta64_ops_nat(self): + # GH 11349 + timedelta_series = Series([NaT, Timedelta("1s")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") + + # subtraction + tm.assert_series_equal(timedelta_series - NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(-NaT + timedelta_series, nat_series_dtype_timedelta) + + tm.assert_series_equal( + timedelta_series - single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + # addition + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal(timedelta_series + NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(NaT + timedelta_series, nat_series_dtype_timedelta) + + 
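# NOTE: illustrative sketch, not part of the vendored pandas test file.
# The NaT-propagation rules test_timedelta64_ops_nat asserts: adding NaT, or
# multiplying by NaN, turns every element into NaT while the timedelta64[ns]
# dtype is preserved; scalar division still works elementwise. Illustrative
# only, assuming the vendored pandas version.
import numpy as np
import pandas as pd

ser = pd.Series([pd.NaT, pd.Timedelta("1s")])
ser + pd.NaT    # Series([NaT, NaT], dtype='timedelta64[ns]')
ser * np.nan    # Series([NaT, NaT], dtype='timedelta64[ns]')
ser / 2         # Series([NaT, Timedelta('0.5s')])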
tm.assert_series_equal( + timedelta_series + single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + # multiplication + tm.assert_series_equal( + nat_series_dtype_timedelta * 1.0, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + 1.0 * nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal(timedelta_series * 1, timedelta_series) + tm.assert_series_equal(1 * timedelta_series, timedelta_series) + + tm.assert_series_equal(timedelta_series * 1.5, Series([NaT, Timedelta("1.5s")])) + tm.assert_series_equal(1.5 * timedelta_series, Series([NaT, Timedelta("1.5s")])) + + tm.assert_series_equal(timedelta_series * np.nan, nat_series_dtype_timedelta) + tm.assert_series_equal(np.nan * timedelta_series, nat_series_dtype_timedelta) + + # division + tm.assert_series_equal(timedelta_series / 2, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta) + + # ------------------------------------------------------------- + # Binary operations td64 arraylike and datetime-like + + def test_td64arr_sub_timestamp_raises(self, box_with_array): + idx = TimedeltaIndex(["1 day", "2 day"]) + idx = tm.box_expected(idx, box_with_array) + + msg = ( + "cannot subtract a datelike from|" + "Could not operate|" + "cannot perform operation" + ) + with pytest.raises(TypeError, match=msg): + idx - Timestamp("2011-01-01") + + def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): + # GH#23215 + + # TODO: parametrize over scalar datetime types? 
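# NOTE: illustrative sketch, not part of the vendored pandas test file.
# The timedelta64 + Timestamp behavior covered just below
# (test_td64arr_add_timestamp) and by test_td64arr_sub_timestamp_raises above:
# addition yields a DatetimeIndex (tz-aware if the Timestamp is), and the
# reversed subtraction raises. Values are copied from the tests.
import pandas as pd

idx = pd.TimedeltaIndex(["1 day", "2 day"])
ts = pd.Timestamp("2011-01-01", tz="US/Eastern")
idx + ts    # DatetimeIndex(['2011-01-02', '2011-01-03'], tz='US/Eastern')
ts + idx    # same result; addition commutes
# idx - ts  -> TypeError (cannot subtract a datelike from a TimedeltaArray)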
+ tz = tz_naive_fixture + other = Timestamp("2011-01-01", tz=tz) + + idx = TimedeltaIndex(["1 day", "2 day"]) + expected = DatetimeIndex(["2011-01-02", "2011-01-03"], tz=tz) + + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = idx + other + tm.assert_equal(result, expected) + + result = other + idx + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "ts", + [ + Timestamp("2012-01-01"), + Timestamp("2012-01-01").to_pydatetime(), + Timestamp("2012-01-01").to_datetime64(), + ], + ) + def test_td64arr_add_sub_datetimelike_scalar(self, ts, box_with_array): + # GH#11925, GH#29558 + tdi = timedelta_range("1 day", periods=3) + expected = pd.date_range("2012-01-02", periods=3) + + tdarr = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + tm.assert_equal(ts + tdarr, expected) + tm.assert_equal(tdarr + ts, expected) + + expected2 = pd.date_range("2011-12-31", periods=3, freq="-1D") + expected2 = tm.box_expected(expected2, box_with_array) + + tm.assert_equal(ts - tdarr, expected2) + tm.assert_equal(ts + (-tdarr), expected2) + + with pytest.raises(TypeError): + tdarr - ts + + def test_tdi_sub_dt64_array(self, box_with_array): + dti = pd.date_range("2016-01-01", periods=3) + tdi = dti - dti.shift(1) + dtarr = dti.values + expected = pd.DatetimeIndex(dtarr) - tdi + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + with pytest.raises(TypeError): + tdi - dtarr + + # TimedeltaIndex.__rsub__ + result = dtarr - tdi + tm.assert_equal(result, expected) + + def test_tdi_add_dt64_array(self, box_with_array): + dti = pd.date_range("2016-01-01", periods=3) + tdi = dti - dti.shift(1) + dtarr = dti.values + expected = pd.DatetimeIndex(dtarr) + tdi + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdi + dtarr + tm.assert_equal(result, expected) + result = dtarr + tdi + tm.assert_equal(result, expected) + + def test_td64arr_add_datetime64_nat(self, box_with_array): + # GH#23215 + other = np.datetime64("NaT") + + tdi = timedelta_range("1 day", periods=3) + expected = pd.DatetimeIndex(["NaT", "NaT", "NaT"]) + + tdser = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + tm.assert_equal(tdser + other, expected) + tm.assert_equal(other + tdser, expected) + + # ------------------------------------------------------------------ + # Invalid __add__/__sub__ operations + + # TODO: moved from frame tests; needs parametrization/de-duplication + def test_td64_df_add_int_frame(self): + # GH#22696 Check that we don't dispatch to numpy implementation, + # which treats int64 as m8[ns] + tdi = pd.timedelta_range("1", periods=3) + df = tdi.to_frame() + other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df` + assert_invalid_addsub_type(df, other) + + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("tdi_freq", [None, "H"]) + def test_td64arr_sub_periodlike(self, box_with_array, tdi_freq, pi_freq): + # GH#20049 subtracting PeriodIndex should raise TypeError + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi + pi = dti.to_period(pi_freq) + + # TODO: parametrize over box for pi? 
+ tdi = tm.box_expected(tdi, box_with_array) + with pytest.raises(TypeError): + tdi - pi + + # FIXME: don't leave commented-out + # FIXME: this raises with period scalar but not with PeriodIndex? + # with pytest.raises(TypeError): + # pi - tdi + + # GH#13078 subtraction of Period scalar not supported + with pytest.raises(TypeError): + tdi - pi[0] + with pytest.raises(TypeError): + pi[0] - tdi + + @pytest.mark.parametrize( + "other", + [ + # GH#12624 for str case + "a", + # GH#19123 + 1, + 1.5, + np.array(2), + ], + ) + def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other): + # vector-like others are tested in test_td64arr_add_sub_numeric_arr_invalid + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdarr = tm.box_expected(tdser, box_with_array) + + assert_invalid_addsub_type(tdarr, other) + + @pytest.mark.parametrize( + "vec", + [ + np.array([1, 2, 3]), + pd.Index([1, 2, 3]), + Series([1, 2, 3]), + DataFrame([[1, 2, 3]]), + ], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_addsub_numeric_arr_invalid( + self, box_with_array, vec, any_real_dtype + ): + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + tdarr = tm.box_expected(tdser, box_with_array) + + vector = vec.astype(any_real_dtype) + assert_invalid_addsub_type(tdarr, vector) + + def test_td64arr_add_sub_int(self, box_with_array, one): + # Variants of `one` for #19012, deprecated GH#22535 + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + tdarr = tm.box_expected(rng, box_with_array) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, one, msg) + + # TOOD: get inplace ops into assert_invalid_addsub_type + with pytest.raises(TypeError, match=msg): + tdarr += one + with pytest.raises(TypeError, match=msg): + tdarr -= one + + def test_td64arr_add_sub_integer_array(self, box_with_array): + # GH#19959, deprecated GH#22535 + rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) + tdarr = tm.box_expected(rng, box_with_array) + other = tm.box_expected([4, 3, 2], box_with_array) + + msg = "Addition/subtraction of integers and integer-arrays" + assert_invalid_addsub_type(tdarr, other, msg) + + def test_td64arr_addsub_integer_array_no_freq(self, box_with_array): + # GH#19959 + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) + tdarr = tm.box_expected(tdi, box_with_array) + other = tm.box_expected([14, -1, 16], box_with_array) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, other, msg) + + # ------------------------------------------------------------------ + # Operations with timedelta-like others + + # TODO: this was taken from tests.series.test_ops; de-duplicate + def test_operators_timedelta64_with_timedelta(self, scalar_td): + # smoke tests + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + td1 + scalar_td + scalar_td + td1 + td1 - scalar_td + scalar_td - td1 + td1 / scalar_td + scalar_td / td1 + + # TODO: this was taken from tests.series.test_ops; de-duplicate + def test_timedelta64_operations_with_timedeltas(self): + # td operate with td + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td2 = timedelta(minutes=5, seconds=4) + result = td1 - td2 + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" + tm.assert_series_equal(result, expected) + + result2 = td2 - td1 + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) + tm.assert_series_equal(result2, 
expected) + + # roundtrip + tm.assert_series_equal(result + td2, td1) + + # Now again, using pd.to_timedelta, which should build + # a Series or a scalar, depending on input. + td1 = Series(pd.to_timedelta(["00:05:03"] * 3)) + td2 = pd.to_timedelta("00:05:04") + result = td1 - td2 + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" + tm.assert_series_equal(result, expected) + + result2 = td2 - td1 + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) + tm.assert_series_equal(result2, expected) + + # roundtrip + tm.assert_series_equal(result + td2, td1) + + def test_td64arr_add_td64_array(self, box_with_array): + box = box_with_array + dti = pd.date_range("2016-01-01", periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = 2 * tdi + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + result = tdi + tdarr + tm.assert_equal(result, expected) + result = tdarr + tdi + tm.assert_equal(result, expected) + + def test_td64arr_sub_td64_array(self, box_with_array): + box = box_with_array + dti = pd.date_range("2016-01-01", periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = 0 * tdi + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + result = tdi - tdarr + tm.assert_equal(result, expected) + result = tdarr - tdi + tm.assert_equal(result, expected) + + # TODO: parametrize over [add, sub, radd, rsub]? + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) + def test_td64arr_add_sub_tdi(self, box, names): + # GH#17250 make sure result dtype is correct + # GH#19043 make sure names are propagated correctly + if box is pd.DataFrame and names[1] == "Venkman": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) + + tdi = TimedeltaIndex(["0 days", "1 day"], name=names[0]) + ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) + expected = Series( + [Timedelta(hours=3), Timedelta(days=1, hours=4)], name=names[2] + ) + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + result = tdi + ser + tm.assert_equal(result, expected) + if box is not pd.DataFrame: + assert result.dtype == "timedelta64[ns]" + else: + assert result.dtypes[0] == "timedelta64[ns]" + + result = ser + tdi + tm.assert_equal(result, expected) + if box is not pd.DataFrame: + assert result.dtype == "timedelta64[ns]" + else: + assert result.dtypes[0] == "timedelta64[ns]" + + expected = Series( + [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=names[2] + ) + expected = tm.box_expected(expected, box) + + result = tdi - ser + tm.assert_equal(result, expected) + if box is not pd.DataFrame: + assert result.dtype == "timedelta64[ns]" + else: + assert result.dtypes[0] == "timedelta64[ns]" + + result = ser - tdi + tm.assert_equal(result, -expected) + if box is not pd.DataFrame: + assert result.dtype == "timedelta64[ns]" + else: + assert result.dtypes[0] == "timedelta64[ns]" + + def test_td64arr_add_sub_td64_nat(self, box_with_array): + # GH#23320 special handling for timedelta64("NaT") + box = box_with_array + tdi = pd.TimedeltaIndex([NaT, Timedelta("1s")]) + other = np.timedelta64("NaT") + expected = pd.TimedeltaIndex(["NaT"] * 2) + + obj = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + result = obj + other + tm.assert_equal(result, 
expected) + result = other + obj + tm.assert_equal(result, expected) + result = obj - other + tm.assert_equal(result, expected) + result = other - obj + tm.assert_equal(result, expected) + + def test_td64arr_sub_NaT(self, box_with_array): + # GH#18808 + box = box_with_array + ser = Series([NaT, Timedelta("1s")]) + expected = Series([NaT, NaT], dtype="timedelta64[ns]") + + ser = tm.box_expected(ser, box) + expected = tm.box_expected(expected, box) + + res = ser - pd.NaT + tm.assert_equal(res, expected) + + def test_td64arr_add_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as + is now numeric + box = box_with_array + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, box) + + result = rng + two_hours + tm.assert_equal(result, expected) + + def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): + # only test adding/sub offsets as - is now numeric + box = box_with_array + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") + + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, box) + + result = rng - two_hours + tm.assert_equal(result, expected) + + # ------------------------------------------------------------------ + # __add__/__sub__ with DateOffsets and arrays of DateOffsets + + # TODO: this was taken from tests.series.test_operators; de-duplicate + def test_timedelta64_operations_with_DateOffset(self): + # GH#10699 + td = Series([timedelta(minutes=5, seconds=3)] * 3) + result = td + pd.offsets.Minute(1) + expected = Series([timedelta(minutes=6, seconds=3)] * 3) + tm.assert_series_equal(result, expected) + + result = td - pd.offsets.Minute(1) + expected = Series([timedelta(minutes=4, seconds=3)] * 3) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = td + Series( + [pd.offsets.Minute(1), pd.offsets.Second(3), pd.offsets.Hour(2)] + ) + expected = Series( + [ + timedelta(minutes=6, seconds=3), + timedelta(minutes=5, seconds=6), + timedelta(hours=2, minutes=5, seconds=3), + ] + ) + tm.assert_series_equal(result, expected) + + result = td + pd.offsets.Minute(1) + pd.offsets.Second(12) + expected = Series([timedelta(minutes=6, seconds=15)] * 3) + tm.assert_series_equal(result, expected) + + # valid DateOffsets + for do in ["Hour", "Minute", "Second", "Day", "Micro", "Milli", "Nano"]: + op = getattr(pd.offsets, do) + td + op(5) + op(5) + td + td - op(5) + op(5) - td + + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) + def test_td64arr_add_offset_index(self, names, box): + # GH#18849, GH#19744 + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=names[2] + ) + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with 
tm.assert_produces_warning(warn): + res = tdi + other + tm.assert_equal(res, expected) + + with tm.assert_produces_warning(warn): + res2 = other + tdi + tm.assert_equal(res2, expected) + + # TODO: combine with test_td64arr_add_offset_index by parametrizing + # over second box? + def test_td64arr_add_offset_array(self, box_with_array): + # GH#18849 + box = box_with_array + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) + other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer" + ) + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, box) + + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + res = tdi + other + tm.assert_equal(res, expected) + + with tm.assert_produces_warning(warn): + res2 = other + tdi + tm.assert_equal(res2, expected) + + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) + def test_td64arr_sub_offset_index(self, names, box_with_array): + # GH#18824, GH#19744 + box = box_with_array + xbox = box if box is not tm.to_array else pd.Index + exname = names[2] if box is not tm.to_array else names[1] + + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) + + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=exname + ) + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, xbox) + + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = PerformanceWarning if box is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + res = tdi - other + tm.assert_equal(res, expected) + + def test_td64arr_sub_offset_array(self, box_with_array): + # GH#18824 + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) + other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer" + ) + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + # The DataFrame operation is transposed and so operates as separate + # scalar operations, which do not issue a PerformanceWarning + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + res = tdi - other + tm.assert_equal(res, expected) + + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) + def test_td64arr_with_offset_series(self, names, box_df_fail): + # GH#18849 + box = box_df_fail + box2 = Series if box in [pd.Index, tm.to_array] else box + exname = names[2] if box is not tm.to_array else names[1] + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) + + expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], name=exname) + tdi = tm.box_expected(tdi, box) + expected_add = 
tm.box_expected(expected_add, box2) + + with tm.assert_produces_warning(PerformanceWarning): + res = tdi + other + tm.assert_equal(res, expected_add) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + tdi + tm.assert_equal(res2, expected_add) + + # TODO: separate/parametrize add/sub test? + expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname) + expected_sub = tm.box_expected(expected_sub, box2) + + with tm.assert_produces_warning(PerformanceWarning): + res3 = tdi - other + tm.assert_equal(res3, expected_sub) + + @pytest.mark.parametrize("obox", [np.array, pd.Index, pd.Series]) + def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): + # GH#18824 + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) + tdi = tm.box_expected(tdi, box_with_array) + + anchored = obox([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + + # addition/subtraction ops with anchored offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. + with pytest.raises(TypeError): + with tm.assert_produces_warning(PerformanceWarning): + tdi + anchored + with pytest.raises(TypeError): + with tm.assert_produces_warning(PerformanceWarning): + anchored + tdi + with pytest.raises(TypeError): + with tm.assert_produces_warning(PerformanceWarning): + tdi - anchored + with pytest.raises(TypeError): + with tm.assert_produces_warning(PerformanceWarning): + anchored - tdi + + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + tdi = pd.timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box_with_array) + + other = np.array( + [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] + ) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = tdarr + other + + expected = pd.Index( + [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + with tm.assert_produces_warning(warn): + tdarr - other + + with tm.assert_produces_warning(warn): + result = other - tdarr + + expected = pd.Index( + [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + +class TestTimedeltaArraylikeMulDivOps: + # Tests for timedelta64[ns] + # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ + + # TODO: Moved from tests.series.test_operators; needs cleanup + @pytest.mark.parametrize("m", [1, 3, 10]) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) + def test_timedelta64_conversions(self, m, unit): + startdate = Series(pd.date_range("2013-01-01", "2013-01-03")) + enddate = Series(pd.date_range("2013-03-01", "2013-03-03")) + + ser = enddate - startdate + ser[2] = np.nan + + # op + expected = Series([x / np.timedelta64(m, unit) for x in ser]) + result = ser / np.timedelta64(m, unit) + tm.assert_series_equal(result, expected) + + # reverse op + expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in ser]) + result = np.timedelta64(m, unit) / ser + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------------ + # Multiplication + # organized with scalar others first, then array-like + + def test_td64arr_mul_int(self, 
box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + + result = idx * 1 + tm.assert_equal(result, idx) + + result = 1 * idx + tm.assert_equal(result, idx) + + def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): + rng = timedelta_range("1 days", "10 days", name="foo") + rng = tm.box_expected(rng, box_with_array) + with pytest.raises(TypeError): + rng * two_hours + + def test_tdi_mul_int_array_zerodim(self, box_with_array): + rng5 = np.arange(5, dtype="int64") + idx = TimedeltaIndex(rng5) + expected = TimedeltaIndex(rng5 * 5) + + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = idx * np.array(5, dtype="int64") + tm.assert_equal(result, expected) + + def test_tdi_mul_int_array(self, box_with_array): + rng5 = np.arange(5, dtype="int64") + idx = TimedeltaIndex(rng5) + expected = TimedeltaIndex(rng5 ** 2) + + idx = tm.box_expected(idx, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = idx * rng5 + tm.assert_equal(result, expected) + + def test_tdi_mul_int_series(self, box_with_array): + box = box_with_array + xbox = pd.Series if box in [pd.Index, tm.to_array] else box + + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2) + + idx = tm.box_expected(idx, box) + expected = tm.box_expected(expected, xbox) + + result = idx * pd.Series(np.arange(5, dtype="int64")) + tm.assert_equal(result, expected) + + def test_tdi_mul_float_series(self, box_with_array): + box = box_with_array + xbox = pd.Series if box in [pd.Index, tm.to_array] else box + + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box) + + rng5f = np.arange(5, dtype="float64") + expected = TimedeltaIndex(rng5f * (rng5f + 1.0)) + expected = tm.box_expected(expected, xbox) + + result = idx * Series(rng5f + 1.0) + tm.assert_equal(result, expected) + + # TODO: Put Series/DataFrame in others? 
+ @pytest.mark.parametrize( + "other", + [ + np.arange(1, 11), + pd.Int64Index(range(1, 11)), + pd.UInt64Index(range(1, 11)), + pd.Float64Index(range(1, 11)), + pd.RangeIndex(1, 11), + ], + ids=lambda x: type(x).__name__, + ) + def test_tdi_rmul_arraylike(self, other, box_with_array): + box = box_with_array + xbox = get_upcast_box(box, other) + + tdi = TimedeltaIndex(["1 Day"] * 10) + expected = timedelta_range("1 days", "10 days") + expected._data.freq = None + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, xbox) + + result = other * tdi + tm.assert_equal(result, expected) + commute = tdi * other + tm.assert_equal(commute, expected) + + # ------------------------------------------------------------------ + # __div__, __rdiv__ + + def test_td64arr_div_nat_invalid(self, box_with_array): + # don't allow division by NaT (maybe could in the future) + rng = timedelta_range("1 days", "10 days", name="foo") + rng = tm.box_expected(rng, box_with_array) + + with pytest.raises(TypeError, match="unsupported operand type"): + rng / pd.NaT + with pytest.raises(TypeError, match="Cannot divide NaTType by"): + pd.NaT / rng + + def test_td64arr_div_td64nat(self, box_with_array): + # GH#23829 + rng = timedelta_range("1 days", "10 days") + rng = tm.box_expected(rng, box_with_array) + + other = np.timedelta64("NaT") + + expected = np.array([np.nan] * 10) + expected = tm.box_expected(expected, box_with_array) + + result = rng / other + tm.assert_equal(result, expected) + + result = other / rng + tm.assert_equal(result, expected) + + def test_td64arr_div_int(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + + result = idx / 1 + tm.assert_equal(result, idx) + + with pytest.raises(TypeError, match="Cannot divide"): + # GH#23829 + 1 / idx + + def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): + # GH#20088, GH#22163 ensure DataFrame returns correct dtype + rng = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Float64Index((np.arange(10) + 1) * 12, name="foo") + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = rng / two_hours + tm.assert_equal(result, expected) + + result = two_hours / rng + expected = 1 / expected + tm.assert_equal(result, expected) + + def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + expected = pd.Float64Index([12, np.nan, 24], name="foo") + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = rng / two_hours + tm.assert_equal(result, expected) + + result = two_hours / rng + expected = 1 / expected + tm.assert_equal(result, expected) + + def test_td64arr_div_td64_ndarray(self, box_with_array): + # GH#22631 + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) + expected = pd.Float64Index([12, np.nan, 24]) + + rng = tm.box_expected(rng, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + other = np.array([2, 4, 2], dtype="m8[h]") + result = rng / other + tm.assert_equal(result, expected) + + result = rng / tm.box_expected(other, box_with_array) + tm.assert_equal(result, expected) + + result = rng / other.astype(object) + tm.assert_equal(result, expected) + + result = rng / list(other) + tm.assert_equal(result, expected) + + # reversed op + expected = 1 / expected + result = other / rng + tm.assert_equal(result, expected) + + 
result = tm.box_expected(other, box_with_array) / rng + tm.assert_equal(result, expected) + + result = other.astype(object) / rng + tm.assert_equal(result, expected) + + result = list(other) / rng + tm.assert_equal(result, expected) + + def test_tdarr_div_length_mismatch(self, box_with_array): + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) + mismatched = [1, 2, 3, 4] + + rng = tm.box_expected(rng, box_with_array) + for obj in [mismatched, mismatched[:2]]: + # one shorter, one longer + for other in [obj, np.array(obj), pd.Index(obj)]: + with pytest.raises(ValueError): + rng / other + with pytest.raises(ValueError): + other / rng + + # ------------------------------------------------------------------ + # __floordiv__, __rfloordiv__ + + def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + expected = Series([0, 0, np.nan]) + + td1 = tm.box_expected(td1, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + result = td1 // scalar_td + tm.assert_equal(result, expected) + + def test_td64arr_rfloordiv_tdscalar(self, box_with_array, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + expected = Series([1, 1, np.nan]) + + td1 = tm.box_expected(td1, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + result = scalar_td // td1 + tm.assert_equal(result, expected) + + def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + expected = Series([1, 1, np.nan]) + + td1 = tm.box_expected(td1, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + # We can test __rfloordiv__ using this syntax, + # see `test_timedelta_rfloordiv` + result = td1.__rfloordiv__(scalar_td) + tm.assert_equal(result, expected) + + def test_td64arr_floordiv_int(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + result = idx // 1 + tm.assert_equal(result, idx) + + pattern = "floor_divide cannot use operands|Cannot divide int by Timedelta*" + with pytest.raises(TypeError, match=pattern): + 1 // idx + + def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): + tdi = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Int64Index((np.arange(10) + 1) * 12, name="foo") + + tdi = tm.box_expected(tdi, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdi // two_hours + tm.assert_equal(result, expected) + + # TODO: Is this redundant with test_td64arr_floordiv_tdlike_scalar? 
+ @pytest.mark.parametrize( + "scalar_td", + [ + timedelta(minutes=10, seconds=7), + Timedelta("10m7s"), + Timedelta("10m7s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): + # GH#19125 + tdi = TimedeltaIndex(["00:05:03", "00:05:03", pd.NaT], freq=None) + expected = pd.Index([2.0, 2.0, np.nan]) + + tdi = tm.box_expected(tdi, box_with_array, transpose=False) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + res = tdi.__rfloordiv__(scalar_td) + tm.assert_equal(res, expected) + + expected = pd.Index([0.0, 0.0, np.nan]) + expected = tm.box_expected(expected, box_with_array, transpose=False) + + res = tdi // (scalar_td) + tm.assert_equal(res, expected) + + # ------------------------------------------------------------------ + # mod, divmod + # TODO: operations with timedelta-like arrays, numeric arrays, + # reversed ops + + def test_td64arr_mod_tdscalar(self, box_with_array, three_days): + tdi = timedelta_range("1 Day", "9 days") + tdarr = tm.box_expected(tdi, box_with_array) + + expected = TimedeltaIndex(["1 Day", "2 Days", "0 Days"] * 3) + expected = tm.box_expected(expected, box_with_array) + + result = tdarr % three_days + tm.assert_equal(result, expected) + + if box_with_array is pd.DataFrame: + pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") + + result = divmod(tdarr, three_days) + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], tdarr // three_days) + + def test_td64arr_mod_int(self, box_with_array): + tdi = timedelta_range("1 ns", "10 ns", periods=10) + tdarr = tm.box_expected(tdi, box_with_array) + + expected = TimedeltaIndex(["1 ns", "0 ns"] * 5) + expected = tm.box_expected(expected, box_with_array) + + result = tdarr % 2 + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + 2 % tdarr + + if box_with_array is pd.DataFrame: + pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") + + result = divmod(tdarr, 2) + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], tdarr // 2) + + def test_td64arr_rmod_tdscalar(self, box_with_array, three_days): + tdi = timedelta_range("1 Day", "9 days") + tdarr = tm.box_expected(tdi, box_with_array) + + expected = ["0 Days", "1 Day", "0 Days"] + ["3 Days"] * 6 + expected = TimedeltaIndex(expected) + expected = tm.box_expected(expected, box_with_array) + + result = three_days % tdarr + tm.assert_equal(result, expected) + + if box_with_array is pd.DataFrame: + pytest.xfail("DataFrame does not have __divmod__ or __rdivmod__") + + result = divmod(three_days, tdarr) + tm.assert_equal(result[1], expected) + tm.assert_equal(result[0], three_days // tdarr) + + # ------------------------------------------------------------------ + # Operations with invalid others + + def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + td1 = tm.box_expected(td1, box_with_array) + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = "operate|unsupported|cannot|not supported" + with pytest.raises(TypeError, match=pattern): + td1 * scalar_td + with pytest.raises(TypeError, match=pattern): + scalar_td * td1 + + def test_td64arr_mul_too_short_raises(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + with pytest.raises(TypeError): + idx * idx[:3] + 
with pytest.raises(ValueError): + idx * np.array([1, 2]) + + def test_td64arr_mul_td64arr_raises(self, box_with_array): + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + idx = tm.box_expected(idx, box_with_array) + with pytest.raises(TypeError): + idx * idx + + # ------------------------------------------------------------------ + # Operations with numeric others + + def test_td64arr_mul_numeric_scalar(self, box_with_array, one): + # GH#4521 + # divide/multiply by integers + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdser * (-one) + tm.assert_equal(result, expected) + result = (-one) * tdser + tm.assert_equal(result, expected) + + expected = Series(["118 Days", "118 Days", "NaT"], dtype="timedelta64[ns]") + expected = tm.box_expected(expected, box_with_array) + + result = tdser * (2 * one) + tm.assert_equal(result, expected) + result = (2 * one) * tdser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)]) + def test_td64arr_div_numeric_scalar(self, box_with_array, two): + # GH#4521 + # divide/multiply by integers + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = tdser / two + tm.assert_equal(result, expected) + + with pytest.raises(TypeError, match="Cannot divide"): + two / tdser + + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_rmul_numeric_array(self, box_with_array, vector, any_real_dtype): + # GH#4521 + # divide/multiply by integers + xbox = get_upcast_box(box_with_array, vector) + + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + vector = vector.astype(any_real_dtype) + + expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, xbox) + + result = tdser * vector + tm.assert_equal(result, expected) + + result = vector * tdser + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) + def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype): + # GH#4521 + # divide/multiply by integers + xbox = get_upcast_box(box_with_array, vector) + + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + vector = vector.astype(any_real_dtype) + + expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") + + tdser = tm.box_expected(tdser, box_with_array) + expected = tm.box_expected(expected, xbox) + + result = tdser / vector + tm.assert_equal(result, expected) + + pattern = ( + "true_divide cannot use operands|" + "cannot perform __div__|" + "cannot perform __truediv__|" + "unsupported operand|" + "Cannot divide" + ) + with pytest.raises(TypeError, match=pattern): + vector / tdser + + if not isinstance(vector, pd.Index): + # Index.__rdiv__ won't try to operate elementwise, just raises + result = tdser / vector.astype(object) + if box_with_array is pd.DataFrame: + expected = [tdser.iloc[0, n] 
/ vector[n] for n in range(len(vector))] + else: + expected = [tdser[n] / vector[n] for n in range(len(tdser))] + expected = tm.box_expected(expected, xbox) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError, match=pattern): + vector.astype(object) / tdser + + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) + def test_td64arr_mul_int_series(self, box_df_fail, names): + # GH#19042 test for correct name attachment + box = box_df_fail # broadcasts along wrong axis, but doesn't raise + exname = names[2] if box is not tm.to_array else names[1] + + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) + # TODO: Should we be parametrizing over types for `ser` too? + ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) + + expected = Series( + ["0days", "1day", "4days", "9days", "16days"], + dtype="timedelta64[ns]", + name=exname, + ) + + tdi = tm.box_expected(tdi, box) + box = Series if (box is pd.Index or box is tm.to_array) else box + expected = tm.box_expected(expected, box) + + result = ser * tdi + tm.assert_equal(result, expected) + + # The direct operation tdi * ser still needs to be fixed. + result = ser.__rmul__(tdi) + tm.assert_equal(result, expected) + + # TODO: Should we be parametrizing over types for `ser` too? + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) + def test_float_series_rdiv_td64arr(self, box_with_array, names): + # GH#19042 test for correct name attachment + # TODO: the direct operation TimedeltaIndex / Series still + # needs to be fixed. + box = box_with_array + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) + ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) + + xname = names[2] if box is not tm.to_array else names[1] + expected = Series( + [tdi[n] / ser[n] for n in range(len(ser))], + dtype="timedelta64[ns]", + name=xname, + ) + + xbox = box + if box in [pd.Index, tm.to_array] and type(ser) is Series: + xbox = Series + + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, xbox) + + result = ser.__rdiv__(tdi) + if box is pd.DataFrame: + # TODO: Should we skip this case sooner or test something else? + assert result is NotImplemented + else: + tm.assert_equal(result, expected) + + +class TestTimedelta64ArrayLikeArithmetic: + # Arithmetic tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all arithmetic + # tests will eventually end up here. 
+ + def test_td64arr_pow_invalid(self, scalar_td, box_with_array): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + td1 = tm.box_expected(td1, box_with_array) + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = "operate|unsupported|cannot|not supported" + with pytest.raises(TypeError, match=pattern): + scalar_td ** td1 + + with pytest.raises(TypeError, match=pattern): + td1 ** scalar_td diff --git a/venv/Lib/site-packages/pandas/tests/arrays/__init__.py b/venv/Lib/site-packages/pandas/tests/arrays/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/__init__.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/common.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/common.py new file mode 100644 index 0000000..4ef9390 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/common.py @@ -0,0 +1,8 @@ +from pandas import Categorical + + +class TestCategorical: + def setup_method(self, method): + self.factor = Categorical( + ["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True + ) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/conftest.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/conftest.py new file mode 100644 index 0000000..640f5df --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_algos.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_algos.py new file mode 100644 index 0000000..835aa87 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_algos.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]]) +def test_factorize(categories, ordered): + cat = pd.Categorical( + ["b", "b", "a", "c", None], categories=categories, ordered=ordered + ) + codes, uniques = pd.factorize(cat) + expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp) + expected_uniques = pd.Categorical( + ["b", "a", "c"], categories=categories, ordered=ordered + ) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort(): + cat = pd.Categorical(["b", "b", None, "a"]) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([1, 1, -1, 0], dtype=np.intp) + expected_uniques = pd.Categorical(["a", "b"]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort_ordered(): + cat = pd.Categorical( + ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True + ) + + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([0, 0, -1, 1], dtype=np.intp) + expected_uniques = pd.Categorical( + ["b", "a"], categories=["c", "b", "a"], ordered=True + ) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, 
expected_uniques) + + +def test_isin_cats(): + # GH2003 + cat = pd.Categorical(["a", "b", np.nan]) + + result = cat.isin(["a", np.nan]) + expected = np.array([True, False, True], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + result = cat.isin(["a", "c"]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + +@pytest.mark.parametrize( + "to_replace, value, result, expected_error_msg", + [ + ("b", "c", ["a", "c"], "Categorical.categories are different"), + ("c", "d", ["a", "b"], None), + ("b", None, ["a", None], "Categorical.categories length are different"), + ], +) +def test_replace(to_replace, value, result, expected_error_msg): + # GH 26988 + cat = pd.Categorical(["a", "b"]) + expected = pd.Categorical(result) + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be unchanged + with pytest.raises(AssertionError, match=expected_error_msg): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + +@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])]) +def test_isin_empty(empty): + s = pd.Categorical(["a", "b"]) + expected = np.array([False, False], dtype=bool) + + result = s.isin(empty) + tm.assert_numpy_array_equal(expected, result) + + +def test_diff(): + s = pd.Series([1, 2, 3], dtype="category") + with tm.assert_produces_warning(FutureWarning): + result = s.diff() + expected = pd.Series([np.nan, 1, 1]) + tm.assert_series_equal(result, expected) + + expected = expected.to_frame(name="A") + df = s.to_frame(name="A") + with tm.assert_produces_warning(FutureWarning): + result = df.diff() + + tm.assert_frame_equal(result, expected) + + +class TestTake: + # https://github.com/pandas-dev/pandas/issues/20664 + + def test_take_default_allow_fill(self): + cat = pd.Categorical(["a", "b"]) + with tm.assert_produces_warning(None): + result = cat.take([0, -1]) + + assert result.equals(cat) + + def test_take_positive_no_warning(self): + cat = pd.Categorical(["a", "b"]) + with tm.assert_produces_warning(None): + cat.take([0, 0]) + + def test_take_bounds(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = pd.Categorical(["a", "b", "a"]) + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "index 4 is out of bounds for( axis 0 with)? 
size 3" + with pytest.raises(IndexError, match=msg): + cat.take([4, 5], allow_fill=allow_fill) + + def test_take_empty(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = pd.Categorical([], categories=["a", "b"]) + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "cannot do a non-empty take from an empty axes" + with pytest.raises(IndexError, match=msg): + cat.take([0], allow_fill=allow_fill) + + def test_positional_take(self, ordered_fixture): + cat = pd.Categorical( + ["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered_fixture + ) + result = cat.take([0, 1, 2], allow_fill=False) + expected = pd.Categorical( + ["a", "a", "b"], categories=cat.categories, ordered=ordered_fixture + ) + tm.assert_categorical_equal(result, expected) + + def test_positional_take_unobserved(self, ordered_fixture): + cat = pd.Categorical( + ["a", "b"], categories=["a", "b", "c"], ordered=ordered_fixture + ) + result = cat.take([1, 0], allow_fill=False) + expected = pd.Categorical( + ["b", "a"], categories=cat.categories, ordered=ordered_fixture + ) + tm.assert_categorical_equal(result, expected) + + def test_take_allow_fill(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(["a", "a", "b"]) + result = cat.take([0, -1, -1], allow_fill=True) + expected = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_with_negative_one(self): + # -1 was a category + cat = pd.Categorical([-1, 0, 1]) + result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) + expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(["a", "b", "c"]) + result = cat.take([0, 1, -1], fill_value="a", allow_fill=True) + expected = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value_new_raises(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(["a", "b", "c"]) + xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." 
+ with pytest.raises(TypeError, match=xpr): + cat.take([0, 1, -1], fill_value="d", allow_fill=True) + + def test_take_nd_deprecated(self): + cat = pd.Categorical(["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning): + cat.take_nd([0, 1]) + + ci = pd.Index(cat) + with tm.assert_produces_warning(FutureWarning): + ci.take_nd([0, 1]) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_analytics.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_analytics.py new file mode 100644 index 0000000..90fcf12 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_analytics.py @@ -0,0 +1,340 @@ +import sys + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas import Categorical, Index, NaT, Series, date_range +import pandas._testing as tm +from pandas.api.types import is_scalar + + +class TestCategoricalAnalytics: + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_not_ordered_raises(self, aggregation): + # unordered cats have no min/max + cat = Categorical(["a", "b", "c", "d"], ordered=False) + msg = "Categorical is not ordered for operation {}" + agg_func = getattr(cat, aggregation) + + with pytest.raises(TypeError, match=msg.format(aggregation)): + agg_func() + + def test_min_max_ordered(self): + cat = Categorical(["a", "b", "c", "d"], ordered=True) + _min = cat.min() + _max = cat.max() + assert _min == "a" + assert _max == "d" + + cat = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + _min = cat.min() + _max = cat.max() + assert _min == "d" + assert _max == "a" + + @pytest.mark.parametrize( + "categories,expected", + [ + (list("ABC"), np.NaN), + ([1, 2, 3], np.NaN), + pytest.param( + Series(date_range("2020-01-01", periods=3), dtype="category"), + NaT, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/29962" + ), + ), + ], + ) + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_ordered_empty(self, categories, expected, aggregation): + # GH 30227 + cat = Categorical([], categories=list("ABC"), ordered=True) + + agg_func = getattr(cat, aggregation) + result = agg_func() + assert result is expected + + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_with_nan(self, skipna): + # GH 25303 + cat = Categorical( + [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True + ) + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) + + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == "c" + assert _max == "b" + + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) + + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == 2 + assert _max == 1 + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_deprecate_numeric_only_min_max(self, method): + # GH 25303 + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + with tm.assert_produces_warning(expected_warning=FutureWarning): + getattr(cat, method)(numeric_only=True) + + @pytest.mark.parametrize( + "values,categories,exp_mode", + [ + ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), + ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), + ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), + ([np.nan, np.nan, 
np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ], + ) + def test_mode(self, values, categories, exp_mode): + s = Categorical(values, categories=categories, ordered=True) + res = s.mode() + exp = Categorical(exp_mode, categories=categories, ordered=True) + tm.assert_categorical_equal(res, exp) + + def test_searchsorted(self, ordered_fixture): + # https://github.com/pandas-dev/pandas/issues/8420 + # https://github.com/pandas-dev/pandas/issues/14522 + + cat = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=ordered_fixture, + ) + ser = Series(cat) + + # Searching for single item argument, side='left' (default) + res_cat = cat.searchsorted("apple") + assert res_cat == 2 + assert is_scalar(res_cat) + + res_ser = ser.searchsorted("apple") + assert res_ser == 2 + assert is_scalar(res_ser) + + # Searching for single item array, side='left' (default) + res_cat = cat.searchsorted(["bread"]) + res_ser = ser.searchsorted(["bread"]) + exp = np.array([3], dtype=np.intp) + tm.assert_numpy_array_equal(res_cat, exp) + tm.assert_numpy_array_equal(res_ser, exp) + + # Searching for several items array, side='right' + res_cat = cat.searchsorted(["apple", "bread"], side="right") + res_ser = ser.searchsorted(["apple", "bread"], side="right") + exp = np.array([3, 5], dtype=np.intp) + tm.assert_numpy_array_equal(res_cat, exp) + tm.assert_numpy_array_equal(res_ser, exp) + + # Searching for a single value that is not from the Categorical + with pytest.raises(KeyError, match="cucumber"): + cat.searchsorted("cucumber") + with pytest.raises(KeyError, match="cucumber"): + ser.searchsorted("cucumber") + + # Searching for multiple values one of each is not from the Categorical + with pytest.raises(KeyError, match="cucumber"): + cat.searchsorted(["bread", "cucumber"]) + with pytest.raises(KeyError, match="cucumber"): + ser.searchsorted(["bread", "cucumber"]) + + def test_unique(self): + # categories are reordered based on value when ordered=False + cat = Categorical(["a", "b"]) + exp = Index(["a", "b"]) + res = cat.unique() + tm.assert_index_equal(res.categories, exp) + tm.assert_categorical_equal(res, cat) + + cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) + res = cat.unique() + tm.assert_index_equal(res.categories, exp) + tm.assert_categorical_equal(res, Categorical(exp)) + + cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) + exp = Index(["c", "a", "b"]) + res = cat.unique() + tm.assert_index_equal(res.categories, exp) + exp_cat = Categorical(exp, categories=["c", "a", "b"]) + tm.assert_categorical_equal(res, exp_cat) + + # nan must be removed + cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) + res = cat.unique() + exp = Index(["b", "a"]) + tm.assert_index_equal(res.categories, exp) + exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) + tm.assert_categorical_equal(res, exp_cat) + + def test_unique_ordered(self): + # keep categories order when ordered=True + cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) + res = cat.unique() + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) + tm.assert_categorical_equal(res, exp_cat) + + cat = Categorical( + ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True + ) + res = cat.unique() + exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) + tm.assert_categorical_equal(res, exp_cat) + + cat = 
Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) + res = cat.unique() + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) + tm.assert_categorical_equal(res, exp_cat) + + cat = Categorical( + ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True + ) + res = cat.unique() + exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) + tm.assert_categorical_equal(res, exp_cat) + + def test_unique_index_series(self): + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) + # Categorical.unique sorts categories by appearance order + # if ordered=False + exp = Categorical([3, 1, 2], categories=[3, 1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(Series(c).unique(), exp) + + c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) + exp = Categorical([1, 2], categories=[1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(Series(c).unique(), exp) + + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) + # Categorical.unique keeps categories order if ordered=True + exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(Series(c).unique(), exp) + + def test_shift(self): + # GH 9416 + cat = Categorical(["a", "b", "c", "d", "a"]) + + # shift forward + sp1 = cat.shift(1) + xp1 = Categorical([np.nan, "a", "b", "c", "d"]) + tm.assert_categorical_equal(sp1, xp1) + tm.assert_categorical_equal(cat[:-1], sp1[1:]) + + # shift back + sn2 = cat.shift(-2) + xp2 = Categorical( + ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"] + ) + tm.assert_categorical_equal(sn2, xp2) + tm.assert_categorical_equal(cat[2:], sn2[:-2]) + + # shift by zero + tm.assert_categorical_equal(cat, cat.shift(0)) + + def test_nbytes(self): + cat = Categorical([1, 2, 3]) + exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories + assert cat.nbytes == exp + + def test_memory_usage(self): + cat = Categorical([1, 2, 3]) + + # .categories is an index, so we include the hashtable + assert 0 < cat.nbytes <= cat.memory_usage() + assert 0 < cat.nbytes <= cat.memory_usage(deep=True) + + cat = Categorical(["foo", "foo", "bar"]) + assert cat.memory_usage(deep=True) > cat.nbytes + + if not PYPY: + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) + assert abs(diff) < 100 + + def test_map(self): + c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) + result = c.map(lambda x: x.lower()) + exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_categorical_equal(result, exp) + + c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False) + result = c.map(lambda x: x.lower()) + exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) + tm.assert_categorical_equal(result, exp) + + result = c.map(lambda x: 1) + # GH 12766: Return an index not an array + tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) + + @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) + def test_validate_inplace_raises(self, value): + cat = Categorical(["A", "B", "B", "C", "A"]) + msg = ( + 'For argument "inplace" expected type bool, ' + f"received type 
{type(value).__name__}" + ) + with pytest.raises(ValueError, match=msg): + cat.set_ordered(value=True, inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.as_ordered(inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.as_unordered(inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.rename_categories(["X", "Y", "Z"], inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.add_categories(new_categories=["D", "E", "F"], inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.remove_categories(removals=["D", "E", "F"], inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.remove_unused_categories(inplace=value) + + with pytest.raises(ValueError, match=msg): + cat.sort_values(inplace=value) + + def test_isna(self): + exp = np.array([False, False, True]) + c = Categorical(["a", "b", np.nan]) + res = c.isna() + + tm.assert_numpy_array_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_api.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_api.py new file mode 100644 index 0000000..f49f70f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_api.py @@ -0,0 +1,511 @@ +import re + +import numpy as np +import pytest + +from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +import pandas._testing as tm +from pandas.core.arrays.categorical import _recode_for_categories +from pandas.tests.arrays.categorical.common import TestCategorical + + +class TestCategoricalAPI: + def test_ordered_api(self): + # GH 9347 + cat1 = Categorical(list("acb"), ordered=False) + tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"])) + assert not cat1.ordered + + cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False) + tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"])) + assert not cat2.ordered + + cat3 = Categorical(list("acb"), ordered=True) + tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"])) + assert cat3.ordered + + cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True) + tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"])) + assert cat4.ordered + + def test_set_ordered(self): + + cat = Categorical(["a", "b", "c", "a"], ordered=True) + cat2 = cat.as_unordered() + assert not cat2.ordered + cat2 = cat.as_ordered() + assert cat2.ordered + cat2.as_unordered(inplace=True) + assert not cat2.ordered + cat2.as_ordered(inplace=True) + assert cat2.ordered + + assert cat2.set_ordered(True).ordered + assert not cat2.set_ordered(False).ordered + cat2.set_ordered(True, inplace=True) + assert cat2.ordered + cat2.set_ordered(False, inplace=True) + assert not cat2.ordered + + # removed in 0.19.0 + msg = "can't set attribute" + with pytest.raises(AttributeError, match=msg): + cat.ordered = True + with pytest.raises(AttributeError, match=msg): + cat.ordered = False + + def test_rename_categories(self): + cat = Categorical(["a", "b", "c", "a"]) + + # inplace=False: the old one must not be changed + res = cat.rename_categories([1, 2, 3]) + tm.assert_numpy_array_equal( + res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) + tm.assert_index_equal(res.categories, Index([1, 2, 3])) + + exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) + 
tm.assert_numpy_array_equal(cat.__array__(), exp_cat) + + exp_cat = Index(["a", "b", "c"]) + tm.assert_index_equal(cat.categories, exp_cat) + + # GH18862 (let rename_categories take callables) + result = cat.rename_categories(lambda x: x.upper()) + expected = Categorical(["A", "B", "C", "A"]) + tm.assert_categorical_equal(result, expected) + + # and now inplace + res = cat.rename_categories([1, 2, 3], inplace=True) + assert res is None + tm.assert_numpy_array_equal( + cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) + tm.assert_index_equal(cat.categories, Index([1, 2, 3])) + + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_rename_categories_wrong_length_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items as the " + "old categories!" + ) + with pytest.raises(ValueError, match=msg): + cat.rename_categories(new_categories) + + def test_rename_categories_series(self): + # https://github.com/pandas-dev/pandas/issues/17981 + c = Categorical(["a", "b"]) + result = c.rename_categories(Series([0, 1], index=["a", "b"])) + expected = Categorical([0, 1]) + tm.assert_categorical_equal(result, expected) + + def test_rename_categories_dict(self): + # GH 17336 + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}) + expected = Index([4, 3, 2, 1]) + tm.assert_index_equal(res.categories, expected) + + # Test for inplace + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) + assert res is None + tm.assert_index_equal(cat.categories, expected) + + # Test for dicts of smaller length + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "c": 3}) + + expected = Index([1, "b", 3, "d"]) + tm.assert_index_equal(res.categories, expected) + + # Test for dicts with bigger length + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6}) + expected = Index([1, 2, 3, 4]) + tm.assert_index_equal(res.categories, expected) + + # Test for dicts with no items from old categories + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"f": 1, "g": 3}) + + expected = Index(["a", "b", "c", "d"]) + tm.assert_index_equal(res.categories, expected) + + def test_reorder_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical( + ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True + ) + + # first inplace == False + res = cat.reorder_categories(["c", "b", "a"]) + # cat must be the same as before + tm.assert_categorical_equal(cat, old) + # only res is changed + tm.assert_categorical_equal(res, new) + + # inplace == True + res = cat.reorder_categories(["c", "b", "a"], inplace=True) + assert res is None + tm.assert_categorical_equal(cat, new) + + @pytest.mark.parametrize( + "new_categories", + [ + ["a"], # not all "old" included in "new" + ["a", "b", "d"], # still not all "old" in "new" + ["a", "b", "c", "d"], # all "old" included in "new", but too long + ], + ) + def test_reorder_categories_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + msg = "items in new_categories are not the same as in old categories" + with pytest.raises(ValueError, match=msg): + cat.reorder_categories(new_categories) + + def test_add_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical( + ["a", 
"b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True + ) + + # first inplace == False + res = cat.add_categories("d") + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + res = cat.add_categories(["d"]) + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + # inplace == True + res = cat.add_categories("d", inplace=True) + tm.assert_categorical_equal(cat, new) + assert res is None + + # GH 9927 + cat = Categorical(list("abc"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) + # test with Series, np.array, index, list + res = cat.add_categories(Series(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(np.array(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(Index(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(["d", "e"]) + tm.assert_categorical_equal(res, expected) + + def test_add_categories_existing_raises(self): + # new is in old categories + cat = Categorical(["a", "b", "c", "d"], ordered=True) + msg = re.escape("new categories must not include old categories: {'d'}") + with pytest.raises(ValueError, match=msg): + cat.add_categories(["d"]) + + def test_set_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + exp_categories = Index(["c", "b", "a"]) + exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) + + res = cat.set_categories(["c", "b", "a"], inplace=True) + tm.assert_index_equal(cat.categories, exp_categories) + tm.assert_numpy_array_equal(cat.__array__(), exp_values) + assert res is None + + res = cat.set_categories(["a", "b", "c"]) + # cat must be the same as before + tm.assert_index_equal(cat.categories, exp_categories) + tm.assert_numpy_array_equal(cat.__array__(), exp_values) + # only res is changed + exp_categories_back = Index(["a", "b", "c"]) + tm.assert_index_equal(res.categories, exp_categories_back) + tm.assert_numpy_array_equal(res.__array__(), exp_values) + + # not all "old" included in "new" -> all not included ones are now + # np.nan + cat = Categorical(["a", "b", "c", "a"], ordered=True) + res = cat.set_categories(["a"]) + tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8)) + + # still not all "old" in "new" + res = cat.set_categories(["a", "b", "d"]) + tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8)) + tm.assert_index_equal(res.categories, Index(["a", "b", "d"])) + + # all "old" included in "new" + cat = cat.set_categories(["a", "b", "c", "d"]) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_index_equal(cat.categories, exp_categories) + + # internals... + c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8)) + tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) + + exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) + tm.assert_numpy_array_equal(c.to_dense(), exp) + + # all "pointers" to '4' must be changed from 3 to 0,... 
+ c = c.set_categories([4, 3, 2, 1]) + + # positions are changed + tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8)) + + # categories are now in new order + tm.assert_index_equal(c.categories, Index([4, 3, 2, 1])) + + # output is the same + exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) + tm.assert_numpy_array_equal(c.to_dense(), exp) + assert c.min() == 4 + assert c.max() == 1 + + # set_categories should set the ordering if specified + c2 = c.set_categories([4, 3, 2, 1], ordered=False) + assert not c2.ordered + + tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + + # set_categories should pass thru the ordering + c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) + assert not c2.ordered + + tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_categories_many(self, values, categories, new_categories, ordered): + c = Categorical(values, categories) + expected = Categorical(values, new_categories, ordered) + result = c.set_categories(new_categories, ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_set_categories_rename_less(self): + # GH 24675 + cat = Categorical(["A", "B"]) + result = cat.set_categories(["A"], rename=True) + expected = Categorical(["A", np.nan]) + tm.assert_categorical_equal(result, expected) + + def test_set_categories_private(self): + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"]) + expected = Categorical(["a", "c", "d"], categories=list("acde")) + tm.assert_categorical_equal(cat, expected) + + # fastpath + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"], fastpath=True) + expected = Categorical(["a", "c", "d"], categories=list("acde")) + tm.assert_categorical_equal(cat, expected) + + def test_remove_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True) + + # first inplace == False + res = cat.remove_categories("c") + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + res = cat.remove_categories(["c"]) + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + # inplace == True + res = cat.remove_categories("c", inplace=True) + tm.assert_categorical_equal(cat, new) + assert res is None + + @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]]) + def test_remove_categories_raises(self, removals): + cat = Categorical(["a", "b", "a"]) + message = re.escape("removals must all be in old categories: {'c'}") 
+ + with pytest.raises(ValueError, match=message): + cat.remove_categories(removals) + + def test_remove_unused_categories(self): + c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) + exp_categories_all = Index(["a", "b", "c", "d", "e"]) + exp_categories_dropped = Index(["a", "b", "c", "d"]) + + tm.assert_index_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories() + tm.assert_index_equal(res.categories, exp_categories_dropped) + tm.assert_index_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories(inplace=True) + tm.assert_index_equal(c.categories, exp_categories_dropped) + assert res is None + + # with NaN values (GH11599) + c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) + res = c.remove_unused_categories() + tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + tm.assert_numpy_array_equal(res.codes, exp_codes) + tm.assert_index_equal(c.categories, exp_categories_all) + + val = ["F", np.nan, "D", "B", "D", "F", np.nan] + cat = Categorical(values=val, categories=list("ABCDEFG")) + out = cat.remove_unused_categories() + tm.assert_index_equal(out.categories, Index(["B", "D", "F"])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + tm.assert_numpy_array_equal(out.codes, exp_codes) + assert out.tolist() == val + + alpha = list("abcdefghijklmnopqrstuvwxyz") + val = np.random.choice(alpha[::2], 10000).astype("object") + val[np.random.choice(len(val), 100)] = np.nan + + cat = Categorical(values=val, categories=alpha) + out = cat.remove_unused_categories() + assert out.tolist() == val.tolist() + + +class TestCategoricalAPIWithFactor(TestCategorical): + def test_describe(self): + # string type + desc = self.factor.describe() + assert self.factor.ordered + exp_index = CategoricalIndex( + ["a", "b", "c"], name="categories", ordered=self.factor.ordered + ) + expected = DataFrame( + {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index + ) + tm.assert_frame_equal(desc, expected) + + # check unused categories + cat = self.factor.copy() + cat.set_categories(["a", "b", "c", "d"], inplace=True) + desc = cat.describe() + + exp_index = CategoricalIndex( + list("abcd"), ordered=self.factor.ordered, name="categories" + ) + expected = DataFrame( + {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]}, + index=exp_index, + ) + tm.assert_frame_equal(desc, expected) + + # check an integer one + cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) + desc = cat.describe() + exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") + expected = DataFrame( + {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]}, + index=exp_index, + ) + tm.assert_frame_equal(desc, expected) + + # https://github.com/pandas-dev/pandas/issues/3678 + # describe should work with NaN + cat = Categorical([np.nan, 1, 2, 2]) + desc = cat.describe() + expected = DataFrame( + {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]}, + index=CategoricalIndex( + [1, 2, np.nan], categories=[1, 2], name="categories" + ), + ) + tm.assert_frame_equal(desc, expected) + + def test_set_categories_inplace(self): + cat = self.factor.copy() + cat.set_categories(["a", "b", "c", "d"], inplace=True) + tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"])) + + +class TestPrivateCategoricalAPI: + def test_codes_immutable(self): + + # Codes should be read only + c = Categorical(["a", "b", 
"c", "a", np.nan]) + exp = np.array([0, 1, 2, 0, -1], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + + # Assignments to codes should raise + with pytest.raises(ValueError, match="cannot set Categorical codes directly"): + c.codes = np.array([0, 1, 2, 0, 1], dtype="int8") + + # changes in the codes array should raise + codes = c.codes + + with pytest.raises(ValueError, match="assignment destination is read-only"): + codes[4] = 1 + + # But even after getting the codes, the original array should still be + # writeable! + c[4] = "a" + exp = np.array([0, 1, 2, 0, 0], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + c._codes[4] = 2 + exp = np.array([0, 1, 2, 0, 2], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + + @pytest.mark.parametrize( + "codes, old, new, expected", + [ + ([0, 1], ["a", "b"], ["a", "b"], [0, 1]), + ([0, 1], ["b", "a"], ["b", "a"], [0, 1]), + ([0, 1], ["a", "b"], ["b", "a"], [1, 0]), + ([0, 1], ["b", "a"], ["a", "b"], [1, 0]), + ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]), + ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]), + ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]), + ([-1, -1], [], ["a", "b"], [-1, -1]), + ([1, 0], ["b", "a"], ["a", "b"], [0, 1]), + ], + ) + def test_recode_to_categories(self, codes, old, new, expected): + codes = np.asanyarray(codes, dtype=np.int8) + expected = np.asanyarray(expected, dtype=np.int8) + old = Index(old) + new = Index(new) + result = _recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) + + def test_recode_to_categories_large(self): + N = 1000 + codes = np.arange(N) + old = Index(codes) + expected = np.arange(N - 1, -1, -1, dtype=np.int16) + new = Index(expected) + result = _recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_constructors.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_constructors.py new file mode 100644 index 0000000..cfba3da --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_constructors.py @@ -0,0 +1,610 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p16 + +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestCategoricalConstructors: + def test_validate_ordered(self): + # see gh-14058 + exp_msg = "'ordered' must either be 'True' or 'False'" + exp_err = TypeError + + # This should be a boolean. 
+ ordered = np.array([0, 1, 2]) + + with pytest.raises(exp_err, match=exp_msg): + Categorical([1, 2, 3], ordered=ordered) + + with pytest.raises(exp_err, match=exp_msg): + Categorical.from_codes( + [0, 0, 1], categories=["a", "b", "c"], ordered=ordered + ) + + def test_constructor_empty(self): + # GH 17248 + c = Categorical([]) + expected = Index([]) + tm.assert_index_equal(c.categories, expected) + + c = Categorical([], categories=[1, 2, 3]) + expected = pd.Int64Index([1, 2, 3]) + tm.assert_index_equal(c.categories, expected) + + def test_constructor_empty_boolean(self): + # see gh-22702 + cat = pd.Categorical([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_constructor_tuples(self): + values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object) + result = Categorical(values) + expected = Index([(1,), (1, 2)], tupleize_cols=False) + tm.assert_index_equal(result.categories, expected) + assert result.ordered is False + + def test_constructor_tuples_datetimes(self): + # numpy will auto reshape when all of the tuples are the + # same len, so add an extra one with 2 items and slice it off + values = np.array( + [ + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + ("a", "b"), + ], + dtype=object, + )[:-1] + result = Categorical(values) + expected = Index( + [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)], + tupleize_cols=False, + ) + tm.assert_index_equal(result.categories, expected) + + def test_constructor_unsortable(self): + + # it works! + arr = np.array([1, 2, 3, datetime.now()], dtype="O") + factor = Categorical(arr, ordered=False) + assert not factor.ordered + + # this however will raise as cannot be sorted + msg = ( + "'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument." 
+ ) + with pytest.raises(TypeError, match=msg): + Categorical(arr, ordered=True) + + def test_constructor_interval(self): + result = Categorical( + [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True + ) + ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)]) + exp = Categorical(ii, ordered=True) + tm.assert_categorical_equal(result, exp) + tm.assert_index_equal(result.categories, ii) + + def test_constructor(self): + + exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_) + c1 = Categorical(exp_arr) + tm.assert_numpy_array_equal(c1.__array__(), exp_arr) + c2 = Categorical(exp_arr, categories=["a", "b", "c"]) + tm.assert_numpy_array_equal(c2.__array__(), exp_arr) + c2 = Categorical(exp_arr, categories=["c", "b", "a"]) + tm.assert_numpy_array_equal(c2.__array__(), exp_arr) + + # categories must be unique + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): + Categorical([1, 2], [1, 2, 2]) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ["a", "b", "b"]) + + # The default should be unordered + c1 = Categorical(["a", "b", "c", "a"]) + assert not c1.ordered + + # Categorical as input + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(c1, categories=["a", "b", "c"]) + tm.assert_numpy_array_equal(c1.__array__(), c2.__array__()) + tm.assert_index_equal(c2.categories, Index(["a", "b", "c"])) + + # Series of dtype category + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(c1)) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(Series(c1)) + tm.assert_categorical_equal(c1, c2) + + # Series + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(Series(["a", "b", "c", "a"])) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) + tm.assert_categorical_equal(c1, c2) + + # This should result in integer categories, not float! + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + assert is_integer_dtype(cat.categories) + + # https://github.com/pandas-dev/pandas/issues/3678 + cat = Categorical([np.nan, 1, 2, 3]) + assert is_integer_dtype(cat.categories) + + # this should result in floats + cat = Categorical([np.nan, 1, 2.0, 3]) + assert is_float_dtype(cat.categories) + + cat = Categorical([np.nan, 1.0, 2.0, 3.0]) + assert is_float_dtype(cat.categories) + + # This doesn't work -> this would probably need some kind of "remember + # the original type" feature to try to cast the array interface result + # to... 
+ + # vals = np.asarray(cat[cat.notna()]) + # assert is_integer_dtype(vals) + + # corner cases + cat = Categorical([1]) + assert len(cat.categories) == 1 + assert cat.categories[0] == 1 + assert len(cat.codes) == 1 + assert cat.codes[0] == 0 + + cat = Categorical(["a"]) + assert len(cat.categories) == 1 + assert cat.categories[0] == "a" + assert len(cat.codes) == 1 + assert cat.codes[0] == 0 + + # Scalars should be converted to lists + cat = Categorical(1) + assert len(cat.categories) == 1 + assert cat.categories[0] == 1 + assert len(cat.codes) == 1 + assert cat.codes[0] == 0 + + # two arrays + # - when the first is an integer dtype and the second is not + # - when the resulting codes are all -1/NaN + with tm.assert_produces_warning(None): + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa + + with tm.assert_produces_warning(None): + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa + + # the next one are from the old docs + with tm.assert_produces_warning(None): + c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa + cat = Categorical([1, 2], categories=[1, 2, 3]) + + # this is a legitimate constructor + with tm.assert_produces_warning(None): + c = Categorical( # noqa + np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True + ) + + def test_constructor_with_existing_categories(self): + # GH25318: constructing with pd.Series used to bogusly skip recoding + # categories + c0 = Categorical(["a", "b", "c", "a"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + + c2 = Categorical(c0, categories=c1.categories) + tm.assert_categorical_equal(c1, c2) + + c3 = Categorical(Series(c0), categories=c1.categories) + tm.assert_categorical_equal(c1, c3) + + def test_constructor_not_sequence(self): + # https://github.com/pandas-dev/pandas/issues/16022 + msg = r"^Parameter 'categories' must be list-like, was" + with pytest.raises(TypeError, match=msg): + Categorical(["a", "b"], categories="a") + + def test_constructor_with_null(self): + + # Cannot have NaN in categories + msg = "Categorial categories cannot be null" + with pytest.raises(ValueError, match=msg): + Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) + + with pytest.raises(ValueError, match=msg): + Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) + + with pytest.raises(ValueError, match=msg): + Categorical( + DatetimeIndex(["nat", "20160101"]), + categories=[NaT, Timestamp("20160101")], + ) + + def test_constructor_with_index(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal(ci.values, Categorical(ci)) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal( + ci.values, Categorical(ci.astype(object), categories=ci.categories) + ) + + def test_constructor_with_generator(self): + # This was raising an Error in isna(single_val).any() because isna + # returned a scalar for a generator + + exp = Categorical([0, 1, 2]) + cat = Categorical((x for x in [0, 1, 2])) + tm.assert_categorical_equal(cat, exp) + cat = Categorical(range(3)) + tm.assert_categorical_equal(cat, exp) + + MultiIndex.from_product([range(5), ["a", "b", "c"]]) + + # check that categories accept generators and sequences + cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) + tm.assert_categorical_equal(cat, exp) + cat = Categorical([0, 1, 2], categories=range(3)) + tm.assert_categorical_equal(cat, exp) + + @pytest.mark.parametrize( + "dtl", + [ + 
date_range("1995-01-01 00:00:00", periods=5, freq="s"), + date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"), + timedelta_range("1 day", periods=5, freq="s"), + ], + ) + def test_constructor_with_datetimelike(self, dtl): + # see gh-12077 + # constructor with a datetimelike and NaT + + s = Series(dtl) + c = Categorical(s) + + expected = type(dtl)(s) + expected._data.freq = None + + tm.assert_index_equal(c.categories, expected) + tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8")) + + # with NaT + s2 = s.copy() + s2.iloc[-1] = NaT + c = Categorical(s2) + + expected = type(dtl)(s2.dropna()) + expected._data.freq = None + + tm.assert_index_equal(c.categories, expected) + + exp = np.array([0, 1, 2, 3, -1], dtype=np.int8) + tm.assert_numpy_array_equal(c.codes, exp) + + result = repr(c) + assert "NaT" in result + + def test_constructor_from_index_series_datetimetz(self): + idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + def test_constructor_from_index_series_timedelta(self): + idx = timedelta_range("1 days", freq="D", periods=3) + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + def test_constructor_from_index_series_period(self): + idx = period_range("2015-01-01", freq="D", periods=3) + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + def test_constructor_invariant(self): + # GH 14190 + vals = [ + np.array([1.0, 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype="int64"), + ["a", "b", "c", np.nan], + [pd.Period("2014-01"), pd.Period("2014-02"), NaT], + [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT], + [ + Timestamp("2014-01-01", tz="US/Eastern"), + Timestamp("2014-01-02", tz="US/Eastern"), + NaT, + ], + ] + for val in vals: + c = Categorical(val) + c2 = Categorical(c) + tm.assert_categorical_equal(c, c2) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_constructor_with_dtype(self, ordered): + categories = ["b", "a", "c"] + dtype = CategoricalDtype(categories, ordered=ordered) + result = Categorical(["a", "b", "a", "c"], dtype=dtype) + expected = Categorical( + ["a", "b", "a", "c"], categories=categories, ordered=ordered + ) + tm.assert_categorical_equal(result, expected) + assert result.ordered is ordered + + def test_constructor_dtype_and_others_raises(self): + dtype = CategoricalDtype(["a", "b"], ordered=True) + msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
+ with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], categories=["a", "b"], dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ordered=True, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ordered=False, dtype=dtype) + + @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) + @pytest.mark.parametrize("ordered", [True, False]) + def test_constructor_str_category(self, categories, ordered): + result = Categorical( + ["a", "b"], categories=categories, ordered=ordered, dtype="category" + ) + expected = Categorical(["a", "b"], categories=categories, ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_constructor_str_unknown(self): + with pytest.raises(ValueError, match="Unknown dtype"): + Categorical([1, 2], dtype="foo") + + def test_constructor_from_categorical_with_dtype(self): + dtype = CategoricalDtype(["a", "b", "c"], ordered=True) + values = Categorical(["a", "b", "d"]) + result = Categorical(values, dtype=dtype) + # We use dtype.categories, not values.categories + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_constructor_from_categorical_with_unknown_dtype(self): + dtype = CategoricalDtype(None, ordered=True) + values = Categorical(["a", "b", "d"]) + result = Categorical(values, dtype=dtype) + # We use values.categories, not dtype.categories + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "d"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_constructor_from_categorical_string(self): + values = Categorical(["a", "b", "d"]) + # use categories, ordered + result = Categorical( + values, categories=["a", "b", "c"], ordered=True, dtype="category" + ) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + # No string + result = Categorical(values, categories=["a", "b", "c"], ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_constructor_with_categorical_categories(self): + # GH17884 + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) + + result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + def test_from_codes(self): + + # too few categories + dtype = CategoricalDtype(categories=[1, 2]) + msg = "codes need to be between " + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], dtype=dtype) + + # no int codes + msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], dtype=dtype) + + # no unique categories + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) + + # NaN categories included + with pytest.raises(ValueError, match="Categorial categories cannot be null"): + Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) + + # too negative + dtype = CategoricalDtype(categories=["a", "b", "c"]) + 
msg = r"codes need to be between -1 and len\(categories\)-1" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], dtype=dtype) + + exp = Categorical(["a", "b", "c"], ordered=False) + res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) + tm.assert_categorical_equal(exp, res) + + res = Categorical.from_codes([0, 1, 2], dtype=dtype) + tm.assert_categorical_equal(exp, res) + + def test_from_codes_with_categorical_categories(self): + # GH17884 + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) + + result = Categorical.from_codes([0, 1], categories=Categorical(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + result = Categorical.from_codes( + [0, 1], categories=CategoricalIndex(["a", "b", "c"]) + ) + tm.assert_categorical_equal(result, expected) + + # non-unique Categorical still raises + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1], Categorical(["a", "b", "a"])) + + def test_from_codes_with_nan_code(self): + # GH21767 + codes = [1, 2, np.nan] + dtype = CategoricalDtype(categories=["a", "b", "c"]) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, categories=dtype.categories) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) + + def test_from_codes_with_float(self): + # GH21767 + codes = [1.0, 2.0, 0] # integer, but in float dtype + dtype = CategoricalDtype(categories=["a", "b", "c"]) + + # empty codes should not raise for floats + Categorical.from_codes([], dtype.categories) + + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype.categories) + + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) + + codes = [1.1, 2.0, 0] # non-integer + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype.categories) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) + + def test_from_codes_with_dtype_raises(self): + msg = "Cannot specify" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes( + [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"]) + ) + + with pytest.raises(ValueError, match=msg): + Categorical.from_codes( + [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"]) + ) + + def test_from_codes_neither(self): + msg = "Both were None" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1]) + + @pytest.mark.parametrize("dtype", [None, "category"]) + def test_from_inferred_categories(self, dtype): + cats = ["a", "b"] + codes = np.array([0, 0, 1, 1], dtype="i8") + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes(codes, cats) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, "category"]) + def test_from_inferred_categories_sorts(self, dtype): + cats = ["b", "a"] + codes = np.array([0, 1, 1, 1], dtype="i8") + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def 
test_from_inferred_categories_dtype(self): + cats = ["a", "b", "d"] + codes = np.array([0, 1, 0, 2], dtype="i8") + dtype = CategoricalDtype(["c", "b", "a"], ordered=True) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical( + ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_coerces(self): + cats = ["1", "2", "bad"] + codes = np.array([0, 0, 1, 2], dtype="i8") + dtype = CategoricalDtype([1, 2]) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical([1, 1, 2, np.nan]) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("ordered", [None, True, False]) + def test_construction_with_ordered(self, ordered): + # GH 9347, 9190 + cat = Categorical([0, 1, 2], ordered=ordered) + assert cat.ordered == bool(ordered) + + @pytest.mark.xfail(reason="Imaginary values not supported in Categorical") + def test_constructor_imaginary(self): + values = [1, 2, 3 + 1j] + c1 = Categorical(values) + tm.assert_index_equal(c1.categories, Index(values)) + tm.assert_numpy_array_equal(np.array(c1), np.array(values)) + + @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16") + def test_constructor_string_and_tuples(self): + # GH 21416 + c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) + expected_index = pd.Index([("a", "b"), ("b", "a"), "c"]) + assert c.categories.equals(expected_index) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_dtypes.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_dtypes.py new file mode 100644 index 0000000..19746d7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_dtypes.py @@ -0,0 +1,173 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp +import pandas._testing as tm + + +class TestCategoricalDtypes: + def test_is_equal_dtype(self): + + # test dtype comparisons between cats + + c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) + c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) + c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) + assert c1.is_dtype_equal(c1) + assert c2.is_dtype_equal(c2) + assert c3.is_dtype_equal(c3) + assert c1.is_dtype_equal(c2) + assert not c1.is_dtype_equal(c3) + assert not c1.is_dtype_equal(Index(list("aabca"))) + assert not c1.is_dtype_equal(c1.astype(object)) + assert c1.is_dtype_equal(CategoricalIndex(c1)) + assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) + assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) + + # GH 16659 + s1 = Series(c1) + s2 = Series(c2) + s3 = Series(c3) + assert c1.is_dtype_equal(s1) + assert c2.is_dtype_equal(s2) + assert c3.is_dtype_equal(s3) + assert c1.is_dtype_equal(s2) + assert not c1.is_dtype_equal(s3) + assert not c1.is_dtype_equal(s1.astype(object)) + + def test_set_dtype_same(self): + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) + tm.assert_categorical_equal(result, c) + + def test_set_dtype_new_categories(self): + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(list("abcd"))) + tm.assert_numpy_array_equal(result.codes, c.codes) + tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) + + 
@pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_dtype_many(self, values, categories, new_categories, ordered): + c = Categorical(values, categories) + expected = Categorical(values, new_categories, ordered) + result = c._set_dtype(expected.dtype) + tm.assert_categorical_equal(result, expected) + + def test_set_dtype_no_overlap(self): + c = Categorical(["a", "b", "c"], ["d", "e"]) + result = c._set_dtype(CategoricalDtype(["a", "b"])) + expected = Categorical([None, None, None], categories=["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def test_codes_dtypes(self): + + # GH 8453 + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + + result = Categorical(["foo{i:05d}".format(i=i) for i in range(400)]) + assert result.codes.dtype == "int16" + + result = Categorical(["foo{i:05d}".format(i=i) for i in range(40000)]) + assert result.codes.dtype == "int32" + + # adding cats + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + result = result.add_categories(["foo{i:05d}".format(i=i) for i in range(400)]) + assert result.codes.dtype == "int16" + + # removing cats + result = result.remove_categories( + ["foo{i:05d}".format(i=i) for i in range(300)] + ) + assert result.codes.dtype == "int8" + + @pytest.mark.parametrize("ordered", [True, False]) + def test_astype(self, ordered): + # string + cat = Categorical(list("abbaaccc"), ordered=ordered) + result = cat.astype(object) + expected = np.array(cat) + tm.assert_numpy_array_equal(result, expected) + + msg = "could not convert string to float" + with pytest.raises(ValueError, match=msg): + cat.astype(float) + + # numeric + cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) + result = cat.astype(object) + expected = np.array(cat, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(int) + expected = np.array(cat, dtype=np.int) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(float) + expected = np.array(cat, dtype=np.float) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("cat_ordered", [True, False]) + def test_astype_category(self, dtype_ordered, cat_ordered): + # GH 10696/18593 + data = list("abcaacbab") + cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = cat.astype(dtype) + expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) + tm.assert_categorical_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(list("adc"), dtype_ordered) + 
result = cat.astype(dtype) + expected = Categorical(data, dtype=dtype) + tm.assert_categorical_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = cat.astype("category") + expected = cat + tm.assert_categorical_equal(result, expected) + + def test_iter_python_types(self): + # GH-19909 + cat = Categorical([1, 2]) + assert isinstance(list(cat)[0], int) + assert isinstance(cat.tolist()[0], int) + + def test_iter_python_types_datetime(self): + cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")]) + assert isinstance(list(cat)[0], Timestamp) + assert isinstance(cat.tolist()[0], Timestamp) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_indexing.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_indexing.py new file mode 100644 index 0000000..85d5a6a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_indexing.py @@ -0,0 +1,277 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +import pandas._testing as tm +import pandas.core.common as com +from pandas.tests.arrays.categorical.common import TestCategorical + + +class TestCategoricalIndexingWithFactor(TestCategorical): + def test_getitem(self): + assert self.factor[0] == "a" + assert self.factor[-1] == "c" + + subf = self.factor[[0, 1, 2]] + tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8)) + + subf = self.factor[np.asarray(self.factor) == "c"] + tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) + + def test_setitem(self): + + # int/positional + c = self.factor.copy() + c[0] = "b" + assert c[0] == "b" + c[-1] = "a" + assert c[-1] == "a" + + # boolean + c = self.factor.copy() + indexer = np.zeros(len(c), dtype="bool") + indexer[0] = True + indexer[-1] = True + c[indexer] = "c" + expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + + tm.assert_categorical_equal(c, expected) + + @pytest.mark.parametrize( + "other", + [pd.Categorical(["b", "a"]), pd.Categorical(["b", "a"], categories=["b", "a"])], + ) + def test_setitem_same_but_unordered(self, other): + # GH-24142 + target = pd.Categorical(["a", "b"], categories=["a", "b"]) + mask = np.array([True, False]) + target[mask] = other[mask] + expected = pd.Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_categorical_equal(target, expected) + + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"], categories=["b", "a", "c"]), + pd.Categorical(["b", "a"], categories=["a", "b", "c"]), + pd.Categorical(["a", "a"], categories=["a"]), + pd.Categorical(["b", "b"], categories=["b"]), + ], + ) + def test_setitem_different_unordered_raises(self, other): + # GH-24142 + target = pd.Categorical(["a", "b"], categories=["a", "b"]) + mask = np.array([True, False]) + msg = "Cannot set a Categorical with another, without identical categories" + with pytest.raises(ValueError, match=msg): + target[mask] = other[mask] + + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"]), + pd.Categorical(["b", "a"], categories=["b", "a"], ordered=True), + pd.Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True), + ], + ) + def test_setitem_same_ordered_rasies(self, other): + # Gh-24142 + target = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True) + mask = np.array([True, False]) + msg = "Cannot set a Categorical with another, without identical 
categories" + with pytest.raises(ValueError, match=msg): + target[mask] = other[mask] + + +class TestCategoricalIndexing: + def test_getitem_listlike(self): + + # GH 9469 + # properly coerce the input indexers + np.random.seed(1) + c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8)) + result = c.codes[np.array([100000]).astype(np.int64)] + expected = c[np.array([100000]).astype(np.int64)].codes + tm.assert_numpy_array_equal(result, expected) + + def test_periodindex(self): + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) + + cat1 = Categorical(idx1) + str(cat1) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + tm.assert_numpy_array_equal(cat1._codes, exp_arr) + tm.assert_index_equal(cat1.categories, exp_idx) + + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) + cat2 = Categorical(idx2, ordered=True) + str(cat2) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) + exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + tm.assert_numpy_array_equal(cat2._codes, exp_arr) + tm.assert_index_equal(cat2.categories, exp_idx2) + + idx3 = PeriodIndex( + [ + "2013-12", + "2013-11", + "2013-10", + "2013-09", + "2013-08", + "2013-07", + "2013-05", + ], + freq="M", + ) + cat3 = Categorical(idx3, ordered=True) + exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) + exp_idx = PeriodIndex( + [ + "2013-05", + "2013-07", + "2013-08", + "2013-09", + "2013-10", + "2013-11", + "2013-12", + ], + freq="M", + ) + tm.assert_numpy_array_equal(cat3._codes, exp_arr) + tm.assert_index_equal(cat3.categories, exp_idx) + + def test_categories_assigments(self): + s = Categorical(["a", "b", "c", "a"]) + exp = np.array([1, 2, 3, 1], dtype=np.int64) + s.categories = [1, 2, 3] + tm.assert_numpy_array_equal(s.__array__(), exp) + tm.assert_index_equal(s.categories, Index([1, 2, 3])) + + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_categories_assigments_wrong_length_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items " + "as the old categories!" 
+ ) + with pytest.raises(ValueError, match=msg): + cat.categories = new_categories + + # Combinations of sorted/unique: + @pytest.mark.parametrize( + "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]] + ) + # Combinations of missing/unique + @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) + @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) + def test_get_indexer_non_unique(self, idx_values, key_values, key_class): + # GH 21448 + key = key_class(key_values, categories=range(1, 5)) + # Test for flat index and CategoricalIndex with same/different cats: + for dtype in None, "category", key.dtype: + idx = Index(idx_values, dtype=dtype) + expected, exp_miss = idx.get_indexer_non_unique(key_values) + result, res_miss = idx.get_indexer_non_unique(key) + + tm.assert_numpy_array_equal(expected, result) + tm.assert_numpy_array_equal(exp_miss, res_miss) + + def test_where_unobserved_nan(self): + ser = pd.Series(pd.Categorical(["a", "b"])) + result = ser.where([True, False]) + expected = pd.Series(pd.Categorical(["a", None], categories=["a", "b"])) + tm.assert_series_equal(result, expected) + + # all NA + ser = pd.Series(pd.Categorical(["a", "b"])) + result = ser.where([False, False]) + expected = pd.Series(pd.Categorical([None, None], categories=["a", "b"])) + tm.assert_series_equal(result, expected) + + def test_where_unobserved_categories(self): + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + result = ser.where([True, True, False], other="b") + expected = pd.Series( + Categorical(["a", "b", "b"], categories=ser.cat.categories) + ) + tm.assert_series_equal(result, expected) + + def test_where_other_categorical(self): + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"]) + result = ser.where([True, False, True], other) + expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) + tm.assert_series_equal(result, expected) + + def test_where_new_category_raises(self): + ser = pd.Series(Categorical(["a", "b", "c"])) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + ser.where([True, False, True], "d") + + def test_where_ordered_differs_rasies(self): + ser = pd.Series( + Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True) + ) + other = Categorical( + ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True + ) + with pytest.raises(ValueError, match="without identical categories"): + ser.where([True, False, True], other) + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean(index): + s = Series(range(3)) + idx = Categorical([True, False, True]) + if index: + idx = CategoricalIndex(idx) + + assert com.is_bool_indexer(idx) + result = s[idx] + expected = s[idx.astype("object")] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean_raises(index): + s = Series(range(3)) + idx = Categorical([True, False, None]) + if index: + idx = CategoricalIndex(idx) + + with pytest.raises(ValueError, match="NA / NaN"): + s[idx] + + +@pytest.fixture +def non_coercible_categorical(monkeypatch): + """ + Monkeypatch Categorical.__array__ to ensure no implicit conversion. + + Raises + ------ + ValueError + When Categorical.__array__ is called. 
+ """ + # TODO(Categorical): identify other places where this may be + # useful and move to a conftest.py + def array(self, dtype=None): + raise ValueError("I cannot be converted.") + + with monkeypatch.context() as m: + m.setattr(Categorical, "__array__", array) + yield + + +def test_series_at(non_coercible_categorical): + arr = Categorical(["a", "b", "c"]) + ser = Series(arr) + result = ser.at[0] + assert result == "a" diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_missing.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_missing.py new file mode 100644 index 0000000..8889f45 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_missing.py @@ -0,0 +1,84 @@ +import collections + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import Categorical, Index, Series, isna +import pandas._testing as tm + + +class TestCategoricalMissing: + def test_na_flags_int_categories(self): + # #1457 + + categories = list(range(10)) + labels = np.random.randint(0, 10, 20) + labels[::5] = -1 + + cat = Categorical(labels, categories, fastpath=True) + repr(cat) + + tm.assert_numpy_array_equal(isna(cat), labels == -1) + + def test_nan_handling(self): + + # Nans are represented as -1 in codes + c = Categorical(["a", "b", np.nan, "a"]) + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) + c[1] = np.nan + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) + + # Adding nan to categories should make assigned nan point to the + # category! + c = Categorical(["a", "b", np.nan, "a"]) + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) + + def test_set_dtype_nans(self): + c = Categorical(["a", "b", np.nan]) + result = c._set_dtype(CategoricalDtype(["a", "c"])) + tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8")) + + def test_set_item_nan(self): + cat = Categorical([1, 2, 3]) + cat[1] = np.nan + + exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(cat, exp) + + @pytest.mark.parametrize( + "fillna_kwargs, msg", + [ + ( + dict(value=1, method="ffill"), + "Cannot specify both 'value' and 'method'.", + ), + (dict(), "Must specify a fill 'value' or 'method'."), + (dict(method="bad"), "Invalid fill method. 
Expecting .* bad"), + (dict(value=Series([1, 2, 3, 4, "a"])), "fill value must be in categories"), + ], + ) + def test_fillna_raises(self, fillna_kwargs, msg): + # https://github.com/pandas-dev/pandas/issues/19682 + # https://github.com/pandas-dev/pandas/issues/13628 + cat = Categorical([1, 2, 3, None, None]) + + with pytest.raises(ValueError, match=msg): + cat.fillna(**fillna_kwargs) + + @pytest.mark.parametrize("named", [True, False]) + def test_fillna_iterable_category(self, named): + # https://github.com/pandas-dev/pandas/issues/21097 + if named: + Point = collections.namedtuple("Point", "x y") + else: + Point = lambda *args: args # tuple + cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object)) + result = cat.fillna(Point(0, 0)) + expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) + + tm.assert_categorical_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_operators.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_operators.py new file mode 100644 index 0000000..8643e7f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_operators.py @@ -0,0 +1,442 @@ +import operator +import warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Series, date_range +import pandas._testing as tm +from pandas.tests.arrays.categorical.common import TestCategorical + + +class TestCategoricalOpsWithFactor(TestCategorical): + def test_categories_none_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + tm.assert_categorical_equal(factor, self.factor) + + def test_comparisons(self): + result = self.factor[self.factor == "a"] + expected = self.factor[np.asarray(self.factor) == "a"] + tm.assert_categorical_equal(result, expected) + + result = self.factor[self.factor != "a"] + expected = self.factor[np.asarray(self.factor) != "a"] + tm.assert_categorical_equal(result, expected) + + result = self.factor[self.factor < "c"] + expected = self.factor[np.asarray(self.factor) < "c"] + tm.assert_categorical_equal(result, expected) + + result = self.factor[self.factor > "a"] + expected = self.factor[np.asarray(self.factor) > "a"] + tm.assert_categorical_equal(result, expected) + + result = self.factor[self.factor >= "b"] + expected = self.factor[np.asarray(self.factor) >= "b"] + tm.assert_categorical_equal(result, expected) + + result = self.factor[self.factor <= "b"] + expected = self.factor[np.asarray(self.factor) <= "b"] + tm.assert_categorical_equal(result, expected) + + n = len(self.factor) + + other = self.factor[np.random.permutation(n)] + result = self.factor == other + expected = np.asarray(self.factor) == np.asarray(other) + tm.assert_numpy_array_equal(result, expected) + + result = self.factor == "d" + expected = np.zeros(len(self.factor), dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + # comparisons with categoricals + cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True) + cat_rev_base = Categorical( + ["b", "b", "b"], categories=["c", "b", "a"], ordered=True + ) + cat = Categorical(["a", "b", "c"], ordered=True) + cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + tm.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = 
np.array([False, False, True])
+        tm.assert_numpy_array_equal(res_rev, exp_rev)
+
+        res = cat > cat_base
+        exp = np.array([False, False, True])
+        tm.assert_numpy_array_equal(res, exp)
+
+        # Only categories with same categories can be compared
+        msg = "Categoricals can only be compared if 'categories' are the same"
+        with pytest.raises(TypeError, match=msg):
+            cat > cat_rev
+
+        cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])
+
+        msg = (
+            "Categoricals can only be compared if 'categories' are the same. "
+            "Categories are different lengths"
+        )
+        with pytest.raises(TypeError, match=msg):
+            cat_rev > cat_rev_base2
+
+        # Only categories with same ordering information can be compared
+        cat_unorderd = cat.set_ordered(False)
+        assert not (cat > cat).any()
+
+        msg = "Categoricals can only be compared if 'ordered' is the same"
+        with pytest.raises(TypeError, match=msg):
+            cat > cat_unorderd
+
+        # comparison (in both directions) with Series will raise
+        s = Series(["b", "b", "b"])
+        msg = (
+            "Cannot compare a Categorical for op __gt__ with type"
+            r" <class 'numpy\.ndarray'>"
+        )
+        with pytest.raises(TypeError, match=msg):
+            cat > s
+        with pytest.raises(TypeError, match=msg):
+            cat_rev > s
+        with pytest.raises(TypeError, match=msg):
+            s < cat
+        with pytest.raises(TypeError, match=msg):
+            s < cat_rev
+
+        # comparison with numpy.array will raise in both direction, but only on
+        # newer numpy versions
+        a = np.array(["b", "b", "b"])
+        with pytest.raises(TypeError, match=msg):
+            cat > a
+        with pytest.raises(TypeError, match=msg):
+            cat_rev > a
+
+        # Make sure that unequal comparison take the categories order in
+        # account
+        cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
+        exp = np.array([True, False, False])
+        res = cat_rev > "b"
+        tm.assert_numpy_array_equal(res, exp)
+
+        # check that zero-dim array gets unboxed
+        res = cat_rev > np.array("b")
+        tm.assert_numpy_array_equal(res, exp)
+
+
+class TestCategoricalOps:
+    def test_compare_frame(self):
+        # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
+        data = ["a", "b", 2, "a"]
+        cat = Categorical(data)
+
+        df = DataFrame(cat)
+
+        result = cat == df.T
+        expected = DataFrame([[True, True, True, True]])
+        tm.assert_frame_equal(result, expected)
+
+        result = cat[::-1] != df.T
+        expected = DataFrame([[False, True, True, False]])
+        tm.assert_frame_equal(result, expected)
+
+    def test_compare_frame_raises(self, all_compare_operators):
+        # alignment raises unless we transpose
+        op = getattr(operator, all_compare_operators)
+        cat = Categorical(["a", "b", 2, "a"])
+        df = DataFrame(cat)
+        msg = "Unable to coerce to Series, length must be 1: given 4"
+        with pytest.raises(ValueError, match=msg):
+            op(cat, df)
+
+    def test_datetime_categorical_comparison(self):
+        dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
+        tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True]))
+        tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True]))
+
+    def test_reflected_comparison_with_scalars(self):
+        # GH8658
+        cat = Categorical([1, 2, 3], ordered=True)
+        tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True]))
+        tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True]))
+
+    def test_comparison_with_unknown_scalars(self):
+        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # and following comparisons with scalars not in categories should raise
+        # for unequal comps, but not for equal/not equal
+        cat = Categorical([1, 
2, 3], ordered=True) + + msg = ( + "Cannot compare a Categorical for op __{}__ with a scalar, " + "which is not a category" + ) + with pytest.raises(TypeError, match=msg.format("lt")): + cat < 4 + with pytest.raises(TypeError, match=msg.format("gt")): + cat > 4 + with pytest.raises(TypeError, match=msg.format("gt")): + 4 < cat + with pytest.raises(TypeError, match=msg.format("lt")): + 4 > cat + + tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) + tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne + ): + # https://github.com/pandas-dev/pandas/issues/26504 + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing + # values should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + scalar = 2 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + + def test_comparison_of_ordered_categorical_with_nan_to_listlike( + self, compare_operators_no_eq_ne + ): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical + # with listlike should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + tm.assert_numpy_array_equal(actual, expected) + + @pytest.mark.parametrize( + "data,reverse,base", + [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])], + ) + def test_comparisons(self, data, reverse, base): + cat_rev = Series(Categorical(data, categories=reverse, ordered=True)) + cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True)) + cat = Series(Categorical(data, ordered=True)) + cat_base = Series( + Categorical(base, categories=cat.cat.categories, ordered=True) + ) + s = Series(base) + a = np.array(base) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + scalar = base[1] + res = cat > scalar + exp = Series([False, False, True]) + exp2 = cat.values > scalar + tm.assert_series_equal(res, exp) + tm.assert_numpy_array_equal(res.values, exp2) + res_rev = cat_rev > scalar + exp_rev = Series([True, False, False]) + exp_rev2 = cat_rev.values > scalar + tm.assert_series_equal(res_rev, exp_rev) + tm.assert_numpy_array_equal(res_rev.values, exp_rev2) + + # Only categories with same categories can be compared + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): + cat > cat_rev + + # categorical cannot be compared to Series or numpy array, and also + # not the other way around + msg = ( + "Cannot 
compare a Categorical for op __gt__ with type" + r" <class 'numpy\.ndarray'>" + ) + with pytest.raises(TypeError, match=msg): + cat > s + with pytest.raises(TypeError, match=msg): + cat_rev > s + with pytest.raises(TypeError, match=msg): + cat > a + with pytest.raises(TypeError, match=msg): + cat_rev > a + + with pytest.raises(TypeError, match=msg): + s < cat + with pytest.raises(TypeError, match=msg): + s < cat_rev + + with pytest.raises(TypeError, match=msg): + a < cat + with pytest.raises(TypeError, match=msg): + a < cat_rev + + @pytest.mark.parametrize( + "ctor", + [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ], + ) + def test_unordered_different_order_equal(self, ctor): + # https://github.com/pandas-dev/pandas/issues/16014 + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + assert (c1 == c2).all() + + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + result = c1 == c2 + tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) + + def test_unordered_different_categories_raises(self): + c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False) + + with pytest.raises(TypeError, match=("Categoricals can only be compared")): + c1 == c2 + + def test_compare_different_lengths(self): + c1 = Categorical([], categories=["a", "b"]) + c2 = Categorical([], categories=["a"]) + + msg = "Categories are different lengths" + with pytest.raises(TypeError, match=msg): + c1 == c2 + + def test_compare_unordered_different_order(self): + # https://github.com/pandas-dev/pandas/issues/16603#issuecomment- + # 349290078 + a = pd.Categorical(["a"], categories=["a", "b"]) + b = pd.Categorical(["b"], categories=["b", "a"]) + assert not a.equals(b) + + def test_numeric_like_ops(self): + + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + + # numeric ops should not succeed + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) + with pytest.raises(TypeError, match=msg): + getattr(df, op)(df) + + # reduction ops should not succeed (unless specifically defined, e.g.
+ # min/max) + s = df["value_group"] + for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: + msg = "Categorical cannot perform the operation {}".format(op) + with pytest.raises(TypeError, match=msg): + getattr(s, op)(numeric_only=False) + + # mad technically works because it takes always the numeric data + + # numpy ops + s = Series(Categorical([1, 2, 3, 4])) + with pytest.raises( + TypeError, match="Categorical cannot perform the operation sum" + ): + np.sum(s) + + # numeric ops on a Series + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) + with pytest.raises(TypeError, match=msg): + getattr(s, op)(2) + + # invalid ufunc + msg = "Object with dtype category cannot perform the numpy op log" + with pytest.raises(TypeError, match=msg): + np.log(s) + + def test_contains(self): + # GH21508 + c = pd.Categorical(list("aabbca"), categories=list("cab")) + + assert "b" in c + assert "z" not in c + assert np.nan not in c + with pytest.raises(TypeError, match="unhashable type: 'list'"): + assert [1] in c + + # assert codes NOT in index + assert 0 not in c + assert 1 not in c + + c = pd.Categorical(list("aabbca") + [np.nan], categories=list("cab")) + assert np.nan in c + + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH 23705 + cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) + result = item in cat + assert result is expected + + def test_contains_list(self): + # GH#21729 + cat = Categorical([1, 2, 3]) + + assert "a" not in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in cat diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_repr.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_repr.py new file mode 100644 index 0000000..d08c4b4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_repr.py @@ -0,0 +1,525 @@ +import numpy as np + +from pandas import ( + Categorical, + CategoricalIndex, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) +from pandas.tests.arrays.categorical.common import TestCategorical + + +class TestCategoricalReprWithFactor(TestCategorical): + def test_print(self): + expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"] + expected = "\n".join(expected) + actual = repr(self.factor) + assert actual == expected + + +class TestCategoricalRepr: + def test_big_print(self): + factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True) + expected = [ + "[a, b, c, a, b, ..., b, c, a, b, c]", + "Length: 600", + "Categories (3, object): [a, b, c]", + ] + expected = "\n".join(expected) + + actual = repr(factor) + + assert actual == expected + + def test_empty_print(self): + factor = Categorical([], ["a", "b", "c"]) + expected = "[], Categories (3, object): [a, b, c]" + actual = repr(factor) + assert actual == expected + + assert expected == actual + factor = Categorical([], ["a", "b", "c"], ordered=True) + expected = "[], Categories (3, object): [a < b < c]" + actual = repr(factor) + assert expected == actual + + factor = Categorical([], []) + 
expected = "[], Categories (0, object): []" + assert expected == repr(factor) + + def test_print_none_width(self): + # GH10087 + a = Series(Categorical([1, 2, 3, 4])) + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) + + with option_context("display.width", None): + assert exp == repr(a) + + def test_unicode_print(self): + c = Categorical(["aaaaa", "bb", "cccc"] * 20) + expected = """\ +[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc] +Length: 60 +Categories (3, object): [aaaaa, bb, cccc]""" + + assert repr(c) == expected + + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) + expected = """\ +[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] +Length: 60 +Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa + + assert repr(c) == expected + + # unicode option should not affect to Categorical, as it doesn't care + # the repr width + with option_context("display.unicode.east_asian_width", True): + + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) + expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] +Length: 60 +Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa + + assert repr(c) == expected + + def test_categorical_repr(self): + c = Categorical([1, 2, 3]) + exp = """[1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 4, 5] * 10) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1, 2, 3, 4, 5]""" + + assert repr(c) == exp + + c = Categorical(np.arange(20)) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" + + assert repr(c) == exp + + def test_categorical_repr_ordered(self): + c = Categorical([1, 2, 3], ordered=True) + exp = """[1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" + + assert repr(c) == exp + + c = Categorical(np.arange(20), ordered=True) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0 < 1 < 2 < 3 ... 
16 < 17 < 18 < 19]""" + + assert repr(c) == exp + + def test_categorical_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + c = Categorical(idx) + + exp = ( + "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " + "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n" + "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" + " 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]" + "" + ) + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = ( + "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " + "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]\n" + "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" + " 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]" + ) + + assert repr(c) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + c = Categorical(idx) + exp = ( + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, " + "2011-01-01 13:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n" + " " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" + " " + "2011-01-01 13:00:00-05:00]" + ) + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = ( + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, " + "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, " + "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, " + "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n" + " " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" + " " + "2011-01-01 13:00:00-05:00]" + ) + + assert repr(c) == exp + + def test_categorical_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa + + assert repr(c) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, 
ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa + + assert repr(c) == exp + + def test_categorical_repr_int_with_nan(self): + c = Categorical([1, 2, np.nan]) + c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" + assert repr(c) == c_exp + + s = Series([1, 2, np.nan], dtype="object").astype("category") + s_exp = """0 1\n1 2\n2 NaN +dtype: category +Categories (2, int64): [1, 2]""" + assert repr(s) == s_exp + + def test_categorical_repr_period(self): + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + c = Categorical(idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa + + assert repr(c) == exp + + idx = period_range("2011-01", freq="M", periods=5) + c = Categorical(idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa + + assert repr(c) == exp + + def test_categorical_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa + + assert repr(c) == exp + + idx = period_range("2011-01", freq="M", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa + + assert repr(c) == exp + + def 
test_categorical_repr_timedelta(self): + idx = timedelta_range("1 days", periods=5) + c = Categorical(idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa + + assert repr(c) == exp + + idx = timedelta_range("1 hours", periods=20) + c = Categorical(idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" # noqa + + assert repr(c) == exp + + def test_categorical_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + c = Categorical(idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa + + assert repr(c) == exp + + idx = timedelta_range("1 hours", periods=20) + c = Categorical(idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" # noqa + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 
16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" # noqa + + assert repr(c) == exp + + def test_categorical_index_repr(self): + idx = CategoricalIndex(Categorical([1, 2, 3])) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa + assert repr(idx) == exp + + i = CategoricalIndex(Categorical(np.arange(10))) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + def test_categorical_index_repr_ordered(self): + i = CategoricalIndex(Categorical([1, 2, 3], ordered=True)) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa + assert repr(i) == exp + + i = CategoricalIndex(Categorical(np.arange(10), ordered=True)) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, 4, 5, 6, 7, ...], ordered=True, dtype='category')""" # noqa + assert repr(i) == exp + + def test_categorical_index_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa + + assert repr(i) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa + + assert repr(i) == exp + + def test_categorical_index_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa + + assert repr(i) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa + + assert repr(i) == exp + + i = CategoricalIndex(Categorical(idx.append(idx), ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', + '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', + '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 
2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa + + assert repr(i) == exp + + def test_categorical_index_repr_period(self): + # test all length + idx = period_range("2011-01-01 09:00", freq="H", periods=1) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="H", periods=2) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="H", periods=3) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa + + assert repr(i) == exp + + i = CategoricalIndex(Categorical(idx.append(idx))) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', + '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', + '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa + + assert repr(i) == exp + + idx = period_range("2011-01", freq="M", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + def test_categorical_index_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa + + assert repr(i) == exp + + idx = period_range("2011-01", freq="M", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa + assert repr(i) == exp + + def test_categorical_index_repr_timedelta(self): + idx = timedelta_range("1 days", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa + assert repr(i) == exp + + idx = timedelta_range("1 hours", periods=10) + i = CategoricalIndex(Categorical(idx)) + exp = 
"""CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=False, dtype='category')""" # noqa + + assert repr(i) == exp + + def test_categorical_index_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa + assert repr(i) == exp + + idx = timedelta_range("1 hours", periods=10) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa + + assert repr(i) == exp diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_sorting.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_sorting.py new file mode 100644 index 0000000..2a0ef04 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_sorting.py @@ -0,0 +1,124 @@ +import numpy as np +import pytest + +from pandas import Categorical, Index +import pandas._testing as tm + + +class TestCategoricalSort: + def test_argsort(self): + c = Categorical([5, 3, 1, 4, 2], ordered=True) + + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal( + c.argsort(ascending=True), expected, check_dtype=False + ) + + expected = expected[::-1] + tm.assert_numpy_array_equal( + c.argsort(ascending=False), expected, check_dtype=False + ) + + def test_numpy_argsort(self): + c = Categorical([5, 3, 1, 4, 2], ordered=True) + + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False) + + tm.assert_numpy_array_equal( + np.argsort(c, kind="mergesort"), expected, check_dtype=False + ) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(c, axis=0) + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(c, order="C") + + def test_sort_values(self): + + # unordered cats are sortable + cat = Categorical(["a", "b", "b", "a"], ordered=False) + cat.sort_values() + + cat = Categorical(["a", "c", "b", "d"], ordered=True) + + # sort_values + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + cat = Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + res = cat.sort_values(ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), 
exp) + tm.assert_index_equal(res.categories, cat.categories) + + # sort (inplace order) + cat1 = cat.copy() + cat1.sort_values(inplace=True) + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(cat1.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + # reverse + cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) + res = cat.sort_values(ascending=False) + exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) + + def test_sort_values_na_position(self): + # see gh-12882 + cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True) + exp_categories = Index([2, 5]) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values() # default arguments + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) + res = cat.sort_values(ascending=True, na_position="first") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) + res = cat.sort_values(ascending=False, na_position="first") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values(ascending=True, na_position="last") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) + res = cat.sort_values(ascending=False, na_position="last") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position="last") + exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position="first") + exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_subclass.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_subclass.py new file mode 100644 index 0000000..b80d0ff --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_subclass.py @@ -0,0 +1,22 @@ +from pandas import Categorical +import pandas._testing as tm + + +class TestCategoricalSubclassing: + def test_constructor(self): + sc = tm.SubclassedCategorical(["a", "b", "c"]) + assert isinstance(sc, tm.SubclassedCategorical) + tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"])) + + def test_from_codes(self): + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) + assert isinstance(sc, tm.SubclassedCategorical) + exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"]) + tm.assert_categorical_equal(sc, exp) + + def test_map(self): + sc = tm.SubclassedCategorical(["a", "b", "c"]) + res = sc.map(lambda x: x.upper()) + 
assert isinstance(res, tm.SubclassedCategorical) + exp = Categorical(["A", "B", "C"]) + tm.assert_categorical_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_warnings.py b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_warnings.py new file mode 100644 index 0000000..9e164a2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/categorical/test_warnings.py @@ -0,0 +1,29 @@ +import pytest + +from pandas.util._test_decorators import async_mark + +import pandas._testing as tm + + +class TestCategoricalWarnings: + @async_mark() + async def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; c = Categorical([])" + await ip.run_code(code) + + # GH 31324 newer jedi version raises Deprecation warning + import jedi + + if jedi.__version__ < "0.16.0": + warning = tm.assert_produces_warning(None) + else: + warning = tm.assert_produces_warning( + DeprecationWarning, check_stacklevel=False + ) + with warning: + with provisionalcompleter("ignore"): + list(ip.Completer.completions("c.", 1)) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/interval/__init__.py b/venv/Lib/site-packages/pandas/tests/arrays/interval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/arrays/interval/test_interval.py b/venv/Lib/site-packages/pandas/tests/arrays/interval/test_interval.py new file mode 100644 index 0000000..e046d87 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/interval/test_interval.py @@ -0,0 +1,214 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Index, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])), + (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)), + (date_range("20170101", periods=3), date_range("20170102", periods=3)), + ( + date_range("20170101", periods=3, tz="US/Eastern"), + date_range("20170102", periods=3, tz="US/Eastern"), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +class TestAttributes: + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + ), + ], + ) + @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex]) + def test_is_empty(self, constructor, left, right, closed): + # GH27219 + tuples = [(left, left), (left, right), np.nan] + expected = np.array([closed != "both", False, False]) + result = constructor.from_tuples(tuples, closed=closed).is_empty + tm.assert_numpy_array_equal(result, expected) + + +class TestMethods: + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) + def test_set_closed(self, closed, new_closed): + # GH 21670 + array = IntervalArray.from_breaks(range(10), closed=closed) + result = array.set_closed(new_closed) + expected = 
IntervalArray.from_breaks(range(10), closed=new_closed) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + Interval(0, 1, closed="right"), + IntervalArray.from_breaks([1, 2, 3, 4], closed="right"), + ], + ) + def test_where_raises(self, other): + ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], closed="left")) + match = "'value.closed' is 'right', expected 'left'." + with pytest.raises(ValueError, match=match): + ser.where([True, False, True], other=other) + + + class TestSetitem: + def test_set_na(self, left_right_dtypes): + left, right = left_right_dtypes + result = IntervalArray.from_arrays(left, right) + result[0] = np.nan + + expected_left = Index([left._na_value] + list(left[1:])) + expected_right = Index([right._na_value] + list(right[1:])) + expected = IntervalArray.from_arrays(expected_left, expected_right) + + tm.assert_extension_array_equal(result, expected) + + + def test_repr(): + # GH 25022 + arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) + result = repr(arr) + expected = ( + "<IntervalArray>\n" + "[(0, 1], (1, 2]]\n" + "Length: 2, closed: right, dtype: interval[int64]" + ) + assert result == expected + + + # ---------------------------------------------------------------------------- + # Arrow interaction + + + pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + + @pyarrow_skip + def test_arrow_extension_type(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + + @pyarrow_skip + def test_arrow_array(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="different 'subtype'"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + + @pyarrow_skip + def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right":
3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pyarrow_skip +@pytest.mark.parametrize( + "breaks", + [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], + ids=["int", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/interval/test_ops.py b/venv/Lib/site-packages/pandas/tests/arrays/interval/test_ops.py new file mode 100644 index 0000000..b4de80d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/interval/test_ops.py @@ -0,0 +1,88 @@ +"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" +import numpy as np +import pytest + +from pandas import Interval, IntervalIndex, Timedelta, Timestamp +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture(params=[IntervalArray, IntervalIndex]) +def constructor(request): + """ + Fixture for testing both interval container classes. + """ + return request.param + + +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) +def start_shift(request): + """ + Fixture for generating intervals of different types from a start value + and a shift value that can be added to start to generate an endpoint. 
+ """ + return request.param + + +class TestOverlaps: + def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): + start, shift = start_shift + interval = Interval(start, start + 3 * shift, other_closed) + + # intervals: identical, nested, spanning, partial, adjacent, disjoint + tuples = [ + (start, start + 3 * shift), + (start + shift, start + 2 * shift), + (start - shift, start + 4 * shift), + (start + 2 * shift, start + 4 * shift), + (start + 3 * shift, start + 4 * shift), + (start + 4 * shift, start + 5 * shift), + ] + interval_container = constructor.from_tuples(tuples, closed) + + adjacent = interval.closed_right and interval_container.closed_left + expected = np.array([True, True, True, True, adjacent, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) + def test_overlaps_interval_container(self, constructor, other_constructor): + # TODO: modify this test when implemented + interval_container = constructor.from_breaks(range(5)) + other_container = other_constructor.from_breaks(range(5)) + with pytest.raises(NotImplementedError): + interval_container.overlaps(other_container) + + def test_overlaps_na(self, constructor, start_shift): + """NA values are marked as False""" + start, shift = start_shift + interval = Interval(start, start + shift) + + tuples = [ + (start, start + shift), + np.nan, + (start + 2 * shift, start + 3 * shift), + ] + interval_container = constructor.from_tuples(tuples) + + expected = np.array([True, False, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) + def test_overlaps_invalid_type(self, constructor, other): + interval_container = constructor.from_breaks(range(5)) + msg = f"`other` must be Interval-like, got {type(other).__name__}" + with pytest.raises(TypeError, match=msg): + interval_container.overlaps(other) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/__init__.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_accessor.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_accessor.py new file mode 100644 index 0000000..d8a1831 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_accessor.py @@ -0,0 +1,123 @@ +import string + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype + + +class TestSeriesAccessor: + # TODO: collect other Series accessor tests + def test_to_dense(self): + s = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") + result = s.sparse.to_dense() + expected = pd.Series([0, 1, 0, 10]) + tm.assert_series_equal(result, expected) + + +class TestFrameAccessor: + def test_accessor_raises(self): + df = pd.DataFrame({"A": [0, 1]}) + with pytest.raises(AttributeError, match="sparse"): + df.sparse + + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) + @td.skip_if_no_scipy + def test_from_spmatrix(self, format, labels, dtype): + import 
scipy.sparse + + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + + mat = scipy.sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + expected = pd.DataFrame( + np.eye(10, dtype=dtype), index=labels, columns=labels + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", + [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], + ) + @td.skip_if_no_scipy + def test_from_spmatrix_columns(self, columns): + import scipy.sparse + + dtype = SparseDtype("float64", 0.0) + + mat = scipy.sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) + expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + def test_to_coo(self): + import scipy.sparse + + df = pd.DataFrame({"A": [0, 1, 0], "B": [1, 0, 0]}, dtype="Sparse[int64, 0]") + result = df.sparse.to_coo() + expected = scipy.sparse.coo_matrix(np.asarray(df)) + assert (result != expected).nnz == 0 + + def test_to_dense(self): + df = pd.DataFrame( + { + "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)), + "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)), + "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)), + }, + index=["b", "a"], + ) + result = df.sparse.to_dense() + expected = pd.DataFrame( + {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"] + ) + tm.assert_frame_equal(result, expected) + + def test_density(self): + df = pd.DataFrame( + { + "A": SparseArray([1, 0, 2, 1], fill_value=0), + "B": SparseArray([0, 1, 1, 1], fill_value=0), + } + ) + res = df.sparse.density + expected = 0.75 + assert res == expected + + @pytest.mark.parametrize("dtype", ["int64", "float64"]) + @pytest.mark.parametrize("dense_index", [True, False]) + @td.skip_if_no_scipy + def test_series_from_coo(self, dtype, dense_index): + import scipy.sparse + + A = scipy.sparse.eye(3, format="coo", dtype=dtype) + result = pd.Series.sparse.from_coo(A, dense_index=dense_index) + index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) + expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index) + if dense_index: + expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) + + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_series_from_coo_incorrect_format_raises(self): + # gh-26554 + import scipy.sparse + + m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]])) + with pytest.raises( + TypeError, match="Expected coo_matrix. Got csr_matrix instead." 
+ ): + pd.Series.sparse.from_coo(m) + + def test_with_column_named_sparse(self): + # https://github.com/pandas-dev/pandas/issues/30758 + df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py new file mode 100644 index 0000000..73652da --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py @@ -0,0 +1,487 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core import ops +from pandas.core.arrays.sparse import SparseArray, SparseDtype + + +@pytest.fixture(params=["integer", "block"]) +def kind(request): + """kind kwarg to pass to SparseArray/SparseSeries""" + return request.param + + +@pytest.fixture(params=[True, False]) +def mix(request): + # whether to operate op(sparse, dense) instead of op(sparse, sparse) + return request.param + + +class TestSparseArrayArithmetics: + + _base = np.array + _klass = SparseArray + + def _assert(self, a, b): + tm.assert_numpy_array_equal(a, b) + + def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op): + with np.errstate(invalid="ignore", divide="ignore"): + if op in [operator.floordiv, ops.rfloordiv]: + # FIXME: GH#13843 + if self._base == pd.Series and a.dtype.subtype == np.dtype("int64"): + pytest.xfail("Not defined/working. See GH#13843") + + if mix: + result = op(a, b_dense).to_dense() + else: + result = op(a, b).to_dense() + + if op in [operator.truediv, ops.rtruediv]: + # pandas uses future division + expected = op(a_dense * 1.0, b_dense) + else: + expected = op(a_dense, b_dense) + + if op in [operator.floordiv, ops.rfloordiv]: + # Series sets 1//0 to np.inf, which SparseArray does not do (yet) + mask = np.isinf(expected) + if mask.any(): + expected[mask] = np.nan + + self._assert(result, expected) + + def _check_bool_result(self, res): + assert isinstance(res, self._klass) + assert isinstance(res.dtype, SparseDtype) + assert res.dtype.subtype == np.bool + assert isinstance(res.fill_value, bool) + + def _check_comparison_ops(self, a, b, a_dense, b_dense): + with np.errstate(invalid="ignore"): + # Unfortunately, trying to wrap the computation of each expected + # value is with np.errstate() is too tedious. 
+ # + # sparse & sparse + self._check_bool_result(a == b) + self._assert((a == b).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b) + self._assert((a != b).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b) + self._assert((a >= b).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b) + self._assert((a <= b).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b) + self._assert((a > b).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b) + self._assert((a < b).to_dense(), a_dense < b_dense) + + # sparse & dense + self._check_bool_result(a == b_dense) + self._assert((a == b_dense).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b_dense) + self._assert((a != b_dense).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b_dense) + self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b_dense) + self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b_dense) + self._assert((a > b_dense).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b_dense) + self._assert((a < b_dense).to_dense(), a_dense < b_dense) + + def _check_logical_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._check_bool_result(a & b) + self._assert((a & b).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b) + self._assert((a | b).to_dense(), a_dense | b_dense) + # sparse & dense + self._check_bool_result(a & b_dense) + self._assert((a & b_dense).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b_dense) + self._assert((a | b_dense).to_dense(), a_dense | b_dense) + + @pytest.mark.parametrize("scalar", [0, 1, 3]) + @pytest.mark.parametrize("fill_value", [None, 0, 2]) + def test_float_scalar( + self, kind, mix, all_arithmetic_functions, fill_value, scalar + ): + op = all_arithmetic_functions + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + a = self._klass(values, kind=kind, fill_value=fill_value) + self._check_numeric_ops(a, scalar, values, scalar, mix, op) + + def test_float_scalar_comparison(self, kind): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + a = self._klass(values, kind=kind) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=0) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=2) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + def test_float_same_index(self, kind, mix, all_arithmetic_functions): + # when sp_index are the same + op = all_arithmetic_functions + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_same_index_comparison(self, 
kind): + # when sp_index are the same + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + def test_float_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_array_different_kind(self, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = self._klass(values, kind="integer") + b = self._klass(rvalues, kind="block") + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block") + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block", fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind="integer", fill_value=1) + b = self._klass(rvalues, kind="block", fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_array_comparison(self, kind): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_int_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + # have to specify dtype explicitly until fixing GH 667 + dtype = np.int64 + + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 
0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + a = self._klass(values, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype, fill_value=1) + b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_int_array_comparison(self, kind): + dtype = "int64" + # int32 NI ATM + + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + a = self._klass(values, dtype=dtype, kind=kind) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_same_index(self, kind, fill_value): + # GH 14000 + # when sp_index are the same + values = self._base([True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, True, True], dtype=np.bool) + + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_array_logical(self, kind, fill_value): + # GH 14000 + # when sp_index are the same + values = self._base([True, False, True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) + + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + rdtype = "int64" + + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + assert 
b.dtype == SparseDtype(rdtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_mixed_array_comparison(self, kind): + rdtype = "int64" + # int32 NI ATM + + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_xor(self): + s = SparseArray([True, True, False, False]) + t = SparseArray([True, False, True, False]) + result = s ^ t + sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32")) + expected = SparseArray([False, True, True], sparse_index=sp_index) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("op", [operator.eq, operator.add]) +def test_with_list(op): + arr = SparseArray([0, 1], fill_value=0) + result = op(arr, [0, 1]) + expected = op(arr, SparseArray([0, 1])) + tm.assert_sp_array_equal(result, expected) + + +def test_with_dataframe(): + # GH#27910 + arr = SparseArray([0, 1], fill_value=0) + df = pd.DataFrame([[1, 2], [3, 4]]) + result = arr.__add__(df) + assert result is NotImplemented + + +def test_with_zerodim_ndarray(): + # GH#27910 + arr = SparseArray([0, 1], fill_value=0) + + result = arr * np.array(2) + expected = arr * 2 + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.abs, np.exp]) +@pytest.mark.parametrize( + "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])] +) +def test_ufuncs(ufunc, arr): + result = ufunc(arr) + fill_value = ufunc(arr.fill_value) + expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + (SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + ], +) +@pytest.mark.parametrize("ufunc", [np.add, np.greater]) +def 
test_binary_ufuncs(ufunc, a, b): + # can't say anything about fill value here. + result = ufunc(a, b) + expected = ufunc(np.asarray(a), np.asarray(b)) + assert isinstance(result, SparseArray) + tm.assert_numpy_array_equal(np.asarray(result), expected) + + +def test_ndarray_inplace(): + sparray = SparseArray([0, 2, 0, 0]) + ndarray = np.array([0, 1, 2, 3]) + ndarray += sparray + expected = np.array([0, 3, 2, 3]) + tm.assert_numpy_array_equal(ndarray, expected) + + +def test_sparray_inplace(): + sparray = SparseArray([0, 2, 0, 0]) + ndarray = np.array([0, 1, 2, 3]) + sparray += ndarray + expected = SparseArray([0, 3, 2, 3], fill_value=0) + tm.assert_sp_array_equal(sparray, expected) + + +@pytest.mark.parametrize("fill_value", [True, False]) +def test_invert(fill_value): + arr = np.array([True, False, False, True]) + sparray = SparseArray(arr, fill_value=fill_value) + result = ~sparray + expected = SparseArray(~arr, fill_value=not fill_value) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [0, np.nan]) +@pytest.mark.parametrize("op", [operator.pos, operator.neg]) +def test_unary_op(op, fill_value): + arr = np.array([0, 1, np.nan, 2]) + sparray = SparseArray(arr, fill_value=fill_value) + result = op(sparray) + expected = SparseArray(op(arr), fill_value=op(fill_value)) + tm.assert_sp_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_array.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_array.py new file mode 100644 index 0000000..baca182 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_array.py @@ -0,0 +1,1249 @@ +import operator +import re +import warnings + +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import isna +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype + + +@pytest.fixture(params=["integer", "block"]) +def kind(request): + return request.param + + +class TestSparseArray: + def setup_method(self, method): + self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) + self.arr = SparseArray(self.arr_data) + self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + def test_constructor_dtype(self): + arr = SparseArray([np.nan, 1, 2, np.nan]) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert arr.dtype.subtype == np.float64 + assert np.isnan(arr.fill_value) + + arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) + assert arr.dtype == SparseDtype(np.float64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=np.float64) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert np.isnan(arr.fill_value) + + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + def test_constructor_dtype_str(self): + result = SparseArray([1, 2, 3], dtype="int") + expected = SparseArray([1, 2, 3], dtype=int) + tm.assert_sp_array_equal(result, expected) + + def 
test_constructor_sparse_dtype(self): + result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) + expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int64") + + def test_constructor_sparse_dtype_str(self): + result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") + expected = SparseArray([1, 0, 0, 1], dtype=np.int32) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int32") + + def test_constructor_object_dtype(self): + # GH 11856 + arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object) + assert arr.dtype == SparseDtype(np.object) + assert np.isnan(arr.fill_value) + + arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object, fill_value="A") + assert arr.dtype == SparseDtype(np.object, "A") + assert arr.fill_value == "A" + + # GH 17574 + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=np.object, fill_value=False) + assert arr.dtype == SparseDtype(np.object, False) + assert arr.fill_value is False + arr_expected = np.array(data, dtype=np.object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool).all() + + @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) + def test_constructor_na_dtype(self, dtype): + with pytest.raises(ValueError, match="Cannot convert"): + SparseArray([0, 1, np.nan], dtype=dtype) + + def test_constructor_spindex_dtype(self): + arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) + # XXX: Behavior change: specifying SparseIndex no longer changes the + # fill_value + expected = SparseArray([0, 1, 2, 0], kind="integer") + tm.assert_sp_array_equal(arr, expected) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) + def test_constructor_spindex_dtype_scalar(self, sparse_index): + # scalar input + arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + def test_constructor_spindex_dtype_scalar_broadcasts(self): + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == 
SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize( + "data, fill_value", + [ + (np.array([1, 2]), 0), + (np.array([1.0, 2.0]), np.nan), + ([True, False], False), + ([pd.Timestamp("2017-01-01")], pd.NaT), + ], + ) + def test_constructor_inferred_fill_value(self, data, fill_value): + result = SparseArray(data).fill_value + + if pd.isna(fill_value): + assert pd.isna(result) + else: + assert result == fill_value + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize( + "size", + [pytest.param(0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10], + ) + @td.skip_if_no_scipy + def test_from_spmatrix(self, size, format): + import scipy.sparse + + mat = scipy.sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + @td.skip_if_no_scipy + def test_from_spmatrix_raises(self): + import scipy.sparse + + mat = scipy.sparse.eye(5, 4, format="csc") + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + + @pytest.mark.parametrize( + "scalar,dtype", + [ + (False, SparseDtype(bool, False)), + (0.0, SparseDtype("float64", 0)), + (1, SparseDtype("int64", 1)), + ("z", SparseDtype("object", "z")), + ], + ) + def test_scalar_with_index_infer_dtype(self, scalar, dtype): + # GH 19163 + arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) + exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) + + tm.assert_sp_array_equal(arr, exp) + + assert arr.dtype == dtype + assert exp.dtype == dtype + + def test_get_item(self): + + assert np.isnan(self.arr[1]) + assert self.arr[2] == 1 + assert self.arr[7] == 5 + + assert self.zarr[0] == 0 + assert self.zarr[2] == 1 + assert self.zarr[7] == 5 + + errmsg = re.compile("bounds") + + with pytest.raises(IndexError, match=errmsg): + self.arr[11] + + with pytest.raises(IndexError, match=errmsg): + self.arr[-11] + + assert self.arr[-1] == self.arr[len(self.arr) - 1] + + def test_take_scalar_raises(self): + msg = "'indices' must be an array, not a scalar '2'." 
+ with pytest.raises(ValueError, match=msg): + self.arr.take(2) + + def test_take(self): + exp = SparseArray(np.take(self.arr_data, [2, 3])) + tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) + + exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) + tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) + + def test_take_fill_value(self): + data = np.array([1, np.nan, 0, 3, 0]) + sparse = SparseArray(data, fill_value=0) + + exp = SparseArray(np.take(data, [0]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([0]), exp) + + exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) + + def test_take_negative(self): + exp = SparseArray(np.take(self.arr_data, [-1])) + tm.assert_sp_array_equal(self.arr.take([-1]), exp) + + exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) + tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) + + @pytest.mark.parametrize("fill_value", [0, None, np.nan]) + def test_shift_fill_value(self, fill_value): + # GH #24128 + sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0) + res = sparse.shift(1, fill_value=fill_value) + if isna(fill_value): + fill_value = res.dtype.na_value + exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) + tm.assert_sp_array_equal(res, exp) + + def test_bad_take(self): + with pytest.raises(IndexError, match="bounds"): + self.arr.take([11]) + + def test_take_filling(self): + # similar tests as GH 12631 + sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + # XXX: test change: fill_value=True -> allow_fill=True + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + expected = SparseArray([np.nan, np.nan, np.nan]) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'" + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), allow_fill=True) + + def test_take_filling_fill_value(self): + # same tests as GH 12631 + sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # fill_value + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + # XXX: behavior change. + # the old way of filling self.fill_value doesn't follow EA rules. + # It's supposed to be self.dtype.na_value (nan in this case) + expected = SparseArray([0, np.nan, np.nan], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'." 
+ with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + def test_take_filling_all_nan(self): + sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) + # XXX: did the default kind from take change? + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") + tm.assert_sp_array_equal(result, expected) + + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") + tm.assert_sp_array_equal(result, expected) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + def test_set_item(self): + def setitem(): + self.arr[5] = 3 + + def setslice(): + self.arr[1:5] = 2 + + with pytest.raises(TypeError, match="assignment via setitem"): + setitem() + + with pytest.raises(TypeError, match="assignment via setitem"): + setslice() + + def test_constructor_from_too_large_array(self): + with pytest.raises(TypeError, match="expected dimension <= 1 data"): + SparseArray(np.arange(10).reshape((2, 5))) + + def test_constructor_from_sparse(self): + res = SparseArray(self.zarr) + assert res.fill_value == 0 + tm.assert_almost_equal(res.sp_values, self.zarr.sp_values) + + def test_constructor_copy(self): + cp = SparseArray(self.arr, copy=True) + cp.sp_values[:3] = 0 + assert not (self.arr.sp_values[:3] == 0).any() + + not_copy = SparseArray(self.arr) + not_copy.sp_values[:3] = 0 + assert (self.arr.sp_values[:3] == 0).all() + + def test_constructor_bool(self): + # GH 10648 + data = np.array([False, False, True, True, False, False]) + arr = SparseArray(data, fill_value=False, dtype=bool) + + assert arr.dtype == SparseDtype(bool) + tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) + # Behavior change: np.asarray densifies. + # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) + + dense = arr.to_dense() + assert dense.dtype == bool + tm.assert_numpy_array_equal(dense, data) + + def test_constructor_bool_fill_value(self): + arr = SparseArray([True, False, True], dtype=None) + assert arr.dtype == SparseDtype(np.bool) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool) + assert arr.dtype == SparseDtype(np.bool) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True) + assert arr.dtype == SparseDtype(np.bool, True) + assert arr.fill_value + + def test_constructor_float32(self): + # GH 10648 + data = np.array([1.0, np.nan, 3], dtype=np.float32) + arr = SparseArray(data, dtype=np.float32) + + assert arr.dtype == SparseDtype(np.float32) + tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) + # Behavior change: np.asarray densifies. 
+ # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal( + arr.sp_index.indices, np.array([0, 2], dtype=np.int32) + ) + + dense = arr.to_dense() + assert dense.dtype == np.float32 + tm.assert_numpy_array_equal(dense, data) + + def test_astype(self): + # float -> float + arr = SparseArray([None, None, 0, 2]) + result = arr.astype("Sparse[float32]") + expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("float64", fill_value=0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("int64", 0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + with pytest.raises(ValueError, match="NA"): + arr.astype("Sparse[i8]") + + def test_astype_bool(self): + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + result = a.astype(bool) + expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) + tm.assert_sp_array_equal(result, expected) + + # update fill value + result = a.astype(SparseDtype(bool, False)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) + tm.assert_sp_array_equal(result, expected) + + def test_astype_all(self, any_real_dtype): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + typ = np.dtype(any_real_dtype) + res = arr.astype(typ) + assert res.dtype == SparseDtype(typ, 1) + assert res.sp_values.dtype == typ + + tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) + + @pytest.mark.parametrize( + "array, dtype, expected", + [ + ( + SparseArray([0, 1]), + "float", + SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), + ), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + ( + SparseArray([0, 1], fill_value=1), + bool, + SparseArray([False, True], dtype=SparseDtype(bool, True)), + ), + pytest.param( + SparseArray([0, 1]), + "datetime64[ns]", + SparseArray( + np.array([0, 1], dtype="datetime64[ns]"), + dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), + ), + marks=[pytest.mark.xfail(reason="NumPy-7619")], + ), + ( + SparseArray([0, 1, 10]), + str, + SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + ), + (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), + ( + SparseArray([0, 1, 0]), + object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), + ), + ], + ) + def test_astype_more(self, array, dtype, expected): + result = array.astype(dtype) + tm.assert_sp_array_equal(result, expected) + + def test_astype_nan_raises(self): + arr = SparseArray([1.0, np.nan]) + with pytest.raises(ValueError, match="Cannot convert non-finite"): + arr.astype(int) + + def test_set_fill_value(self): + arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) + arr.fill_value = 2 + assert arr.fill_value == 2 + + arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) + arr.fill_value = 2 + assert arr.fill_value == 2 + + # XXX: this seems fine? You can construct an integer + # sparsearray with NaN fill value, why not update one? 
+ # coerces to int + # msg = "unable to set fill_value 3\\.1 to int64 dtype" + # with pytest.raises(ValueError, match=msg): + arr.fill_value = 3.1 + assert arr.fill_value == 3.1 + + # msg = "unable to set fill_value nan to int64 dtype" + # with pytest.raises(ValueError, match=msg): + arr.fill_value = np.nan + assert np.isnan(arr.fill_value) + + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr.fill_value = True + assert arr.fill_value + + # coerces to bool + # msg = "unable to set fill_value 0 to bool dtype" + # with pytest.raises(ValueError, match=msg): + arr.fill_value = 0 + assert arr.fill_value == 0 + + # msg = "unable to set fill_value nan to bool dtype" + # with pytest.raises(ValueError, match=msg): + arr.fill_value = np.nan + assert np.isnan(arr.fill_value) + + @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]) + def test_set_fill_invalid_non_scalar(self, val): + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + msg = "fill_value must be a scalar" + + with pytest.raises(ValueError, match=msg): + arr.fill_value = val + + def test_copy(self): + arr2 = self.arr.copy() + assert arr2.sp_values is not self.arr.sp_values + assert arr2.sp_index is self.arr.sp_index + + def test_values_asarray(self): + tm.assert_almost_equal(self.arr.to_dense(), self.arr_data) + + @pytest.mark.parametrize( + "data,shape,dtype", + [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (["A", "A", np.nan, "B"], (4,), np.object), + ], + ) + def test_shape(self, data, shape, dtype): + # GH 21126 + out = SparseArray(data, dtype=dtype) + assert out.shape == shape + + @pytest.mark.parametrize( + "vals", + [ + [np.nan, np.nan, np.nan, np.nan, np.nan], + [1, np.nan, np.nan, 3, np.nan], + [1, np.nan, 0, 3, 0], + ], + ) + @pytest.mark.parametrize("fill_value", [None, 0]) + def test_dense_repr(self, vals, fill_value): + vals = np.array(vals) + arr = SparseArray(vals, fill_value=fill_value) + + res = arr.to_dense() + tm.assert_numpy_array_equal(res, vals) + + res2 = arr._internal_get_values() + + tm.assert_numpy_array_equal(res2, vals) + + def test_getitem(self): + def _checkit(i): + tm.assert_almost_equal(self.arr[i], self.arr.to_dense()[i]) + + for i in range(len(self.arr)): + _checkit(i) + _checkit(-i) + + def test_getitem_arraylike_mask(self): + arr = SparseArray([0, 1, 2]) + result = arr[[True, False, True]] + expected = SparseArray([0, 2]) + tm.assert_sp_array_equal(result, expected) + + def test_getslice(self): + result = self.arr[:-3] + exp = SparseArray(self.arr.to_dense()[:-3]) + tm.assert_sp_array_equal(result, exp) + + result = self.arr[-4:] + exp = SparseArray(self.arr.to_dense()[-4:]) + tm.assert_sp_array_equal(result, exp) + + # two corner cases from Series + result = self.arr[-12:] + exp = SparseArray(self.arr) + tm.assert_sp_array_equal(result, exp) + + result = self.arr[:-12] + exp = SparseArray(self.arr.to_dense()[:0]) + tm.assert_sp_array_equal(result, exp) + + def test_getslice_tuple(self): + dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) + + sparse = SparseArray(dense) + res = sparse[ + 4:, + ] # noqa: E231 + exp = SparseArray(dense[4:,]) # noqa: E231 + tm.assert_sp_array_equal(res, exp) + + sparse = SparseArray(dense, fill_value=0) + res = sparse[ + 4:, + ] # noqa: E231 + exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 + tm.assert_sp_array_equal(res, exp) + + msg = "too many indices for array" + with pytest.raises(IndexError, match=msg): + sparse[4:, :] + + with 
pytest.raises(IndexError, match=msg): + # check numpy compat + dense[4:, :] + + def test_boolean_slice_empty(self): + arr = SparseArray([0, 1, 2]) + res = arr[[False, False, False]] + assert res.dtype == arr.dtype + + @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) + def test_binary_operators(self, op): + op = getattr(operator, op) + data1 = np.random.randn(20) + data2 = np.random.randn(20) + + data1[::2] = np.nan + data2[::3] = np.nan + + arr1 = SparseArray(data1) + arr2 = SparseArray(data2) + + data1[::2] = 3 + data2[::3] = 3 + farr1 = SparseArray(data1, fill_value=3) + farr2 = SparseArray(data2, fill_value=3) + + def _check_op(op, first, second): + res = op(first, second) + exp = SparseArray( + op(first.to_dense(), second.to_dense()), fill_value=first.fill_value + ) + assert isinstance(res, SparseArray) + tm.assert_almost_equal(res.to_dense(), exp.to_dense()) + + res2 = op(first, second.to_dense()) + assert isinstance(res2, SparseArray) + tm.assert_sp_array_equal(res, res2) + + res3 = op(first.to_dense(), second) + assert isinstance(res3, SparseArray) + tm.assert_sp_array_equal(res, res3) + + res4 = op(first, 4) + assert isinstance(res4, SparseArray) + + # Ignore this if the actual op raises (e.g. pow). + try: + exp = op(first.to_dense(), 4) + exp_fv = op(first.fill_value, 4) + except ValueError: + pass + else: + tm.assert_almost_equal(res4.fill_value, exp_fv) + tm.assert_almost_equal(res4.to_dense(), exp) + + with np.errstate(all="ignore"): + for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]: + _check_op(op, first_arr, second_arr) + + def test_pickle(self): + def _check_roundtrip(obj): + unpickled = tm.round_trip_pickle(obj) + tm.assert_sp_array_equal(unpickled, obj) + + _check_roundtrip(self.arr) + _check_roundtrip(self.zarr) + + def test_generator_warnings(self): + sp_arr = SparseArray([1, 2, 3]) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings(action="always", category=DeprecationWarning) + warnings.filterwarnings(action="always", category=PendingDeprecationWarning) + for _ in sp_arr: + pass + assert len(w) == 0 + + def test_fillna(self): + s = SparseArray([1, np.nan, np.nan, 3, np.nan]) + res = s.fillna(-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, 0, 3, 0]) + res = s.fillna(-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([np.nan, np.nan, np.nan, np.nan]) + res = s.fillna(-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + # float dtype's fill_value is np.nan, replaced by -1 + s = SparseArray([0.0, 0.0, 0.0, 0.0]) + res = s.fillna(-1) + exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + + # int dtype shouldn't have missing. No changes. 
+ s = SparseArray([0, 0, 0, 0]) + assert s.dtype == SparseDtype(np.int64) + assert s.fill_value == 0 + res = s.fillna(-1) + tm.assert_sp_array_equal(res, s) + + s = SparseArray([0, 0, 0, 0], fill_value=0) + assert s.dtype == SparseDtype(np.int64) + assert s.fill_value == 0 + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + # fill_value can be nan if there is no missing hole. + # only fill_value will be changed + s = SparseArray([0, 0, 0, 0], fill_value=np.nan) + assert s.dtype == SparseDtype(np.int64, fill_value=np.nan) + assert np.isnan(s.fill_value) + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + + def test_fillna_overlap(self): + s = SparseArray([1, np.nan, np.nan, 3, np.nan]) + # filling with existing value doesn't replace existing value with + # fill_value, i.e. existing 3 remains in sp_values + res = s.fillna(3) + exp = np.array([1, 3, 3, 3, 3], dtype=np.float64) + tm.assert_numpy_array_equal(res.to_dense(), exp) + + s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) + res = s.fillna(3) + exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + def test_nonzero(self): + # Tests regression #21172. + sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + expected = np.array([2, 5, 9], dtype=np.int32) + (result,) = sa.nonzero() + tm.assert_numpy_array_equal(expected, result) + + sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + (result,) = sa.nonzero() + tm.assert_numpy_array_equal(expected, result) + + +class TestSparseArrayAnalytics: + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + def test_all(self, data, pos, neg): + # GH 17570 + out = SparseArray(data).all() + assert out + + out = SparseArray(data, fill_value=pos).all() + assert out + + data[1] = neg + out = SparseArray(data).all() + assert not out + + out = SparseArray(data, fill_value=pos).all() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + @td.skip_if_np_lt("1.15") # prior didn't dispatch + def test_numpy_all(self, data, pos, neg): + # GH 17570 + out = np.all(SparseArray(data)) + assert out + + out = np.all(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.all(SparseArray(data)) + assert not out + + out = np.all(SparseArray(data, fill_value=pos)) + assert not out + + # raises with a different message on py2. 
+ msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.all(SparseArray(data), out=np.array([])) + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + def test_any(self, data, pos, neg): + # GH 17570 + out = SparseArray(data).any() + assert out + + out = SparseArray(data, fill_value=pos).any() + assert out + + data[1] = neg + out = SparseArray(data).any() + assert not out + + out = SparseArray(data, fill_value=pos).any() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + @td.skip_if_np_lt("1.15") # prior didn't dispatch + def test_numpy_any(self, data, pos, neg): + # GH 17570 + out = np.any(SparseArray(data)) + assert out + + out = np.any(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.any(SparseArray(data)) + assert not out + + out = np.any(SparseArray(data, fill_value=pos)) + assert not out + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.any(SparseArray(data), out=out) + + def test_sum(self): + data = np.arange(10).astype(float) + out = SparseArray(data).sum() + assert out == 45.0 + + data[5] = np.nan + out = SparseArray(data, fill_value=2).sum() + assert out == 40.0 + + out = SparseArray(data, fill_value=np.nan).sum() + assert out == 40.0 + + def test_numpy_sum(self): + data = np.arange(10).astype(float) + out = np.sum(SparseArray(data)) + assert out == 45.0 + + data[5] = np.nan + out = np.sum(SparseArray(data, fill_value=2)) + assert out == 40.0 + + out = np.sum(SparseArray(data, fill_value=np.nan)) + assert out == 40.0 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), out=out) + + @pytest.mark.parametrize( + "data,expected", + [ + ( + np.array([1, 2, 3, 4, 5], dtype=float), # non-null data + SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])), + ), + ( + np.array([1, 2, np.nan, 4, 5], dtype=float), # null data + SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])), + ), + ], + ) + @pytest.mark.parametrize("numpy", [True, False]) + def test_cumsum(self, data, expected, numpy): + cumsum = np.cumsum if numpy else lambda s: s.cumsum() + + out = cumsum(SparseArray(data)) + tm.assert_sp_array_equal(out, expected) + + out = cumsum(SparseArray(data, fill_value=np.nan)) + tm.assert_sp_array_equal(out, expected) + + out = cumsum(SparseArray(data, fill_value=2)) + tm.assert_sp_array_equal(out, expected) + + if numpy: # numpy compatibility checks. + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.cumsum(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.cumsum(SparseArray(data), out=out) + else: + axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid. 
+ msg = re.escape(f"axis(={axis}) out of bounds") + with pytest.raises(ValueError, match=msg): + SparseArray(data).cumsum(axis=axis) + + def test_mean(self): + data = np.arange(10).astype(float) + out = SparseArray(data).mean() + assert out == 4.5 + + data[5] = np.nan + out = SparseArray(data).mean() + assert out == 40.0 / 9 + + def test_numpy_mean(self): + data = np.arange(10).astype(float) + out = np.mean(SparseArray(data)) + assert out == 4.5 + + data[5] = np.nan + out = np.mean(SparseArray(data)) + assert out == 40.0 / 9 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), out=out) + + def test_ufunc(self): + # GH 13853 make sure ufunc is applied to fill_value + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([1, np.nan, 2, np.nan, 2]) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=-1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2])) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + def test_ufunc_args(self): + # GH 13853 make sure ufunc is applied to fill_value, including its arg + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([2, np.nan, 3, np.nan, -1]) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([2, 0, 3, -1], fill_value=2) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray([2, 0, 1, -1], fill_value=1) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + @pytest.mark.parametrize("fill_value", [0.0, np.nan]) + def test_modf(self, fill_value): + # https://github.com/pandas-dev/pandas/issues/26946 + sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) + r1, r2 = np.modf(sparse) + e1, e2 = np.modf(np.asarray(sparse)) + tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value)) + + def test_nbytes_integer(self): + arr = SparseArray([1, 0, 0, 0, 2], kind="integer") + result = arr.nbytes + # (2 * 8) + 2 * 4 + assert result == 24 + + def test_nbytes_block(self): + arr = SparseArray([1, 2, 0, 0, 0], kind="block") + result = arr.nbytes + # (2 * 8) + 4 + 4 + # sp_values, blocs, blenghts + assert result == 24 + + def test_asarray_datetime64(self): + s = SparseArray(pd.to_datetime(["2012", None, None, "2013"])) + np.asarray(s) + + def test_density(self): 
+ arr = SparseArray([0, 1]) + assert arr.density == 0.5 + + def test_npoints(self): + arr = SparseArray([0, 1]) + assert arr.npoints == 1 + + +class TestAccessor: + @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) + def test_get_attributes(self, attr): + arr = SparseArray([0, 1]) + ser = pd.Series(arr) + + result = getattr(ser.sparse, attr) + expected = getattr(arr, attr) + assert result == expected + + @td.skip_if_no_scipy + def test_from_coo(self): + import scipy.sparse + + row = [0, 3, 1, 0] + col = [0, 3, 1, 2] + data = [4, 5, 7, 9] + sp_array = scipy.sparse.coo_matrix((data, (row, col))) + result = pd.Series.sparse.from_coo(sp_array) + + index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) + expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_to_coo(self): + import scipy.sparse + + ser = pd.Series( + [1, 2, 3], + index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=["a", "b"]), + dtype="Sparse[int]", + ) + A, _, _ = ser.sparse.to_coo() + assert isinstance(A, scipy.sparse.coo.coo_matrix) + + def test_non_sparse_raises(self): + ser = pd.Series([1, 2, 3]) + with pytest.raises(AttributeError, match=".sparse"): + ser.sparse.density + + +def test_setting_fill_value_fillna_still_works(): + # This is why letting users update fill_value / dtype is bad + # astype has the same problem. + arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0) + arr.fill_value = np.nan + result = arr.isna() + # Can't do direct comparison, since the sp_index will be different + # So let's convert to ndarray and check there. + result = np.asarray(result) + + expected = np.array([False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + +def test_setting_fill_value_updates(): + arr = SparseArray([0.0, np.nan], fill_value=0) + arr.fill_value = np.nan + # use private constructor to get the index right + # otherwise both nans would be un-stored. 
+ expected = SparseArray._simple_new( + sparse_array=np.array([np.nan]), + sparse_index=IntIndex(2, [1]), + dtype=SparseDtype(float, np.nan), + ) + tm.assert_sp_array_equal(arr, expected) + + +@pytest.mark.parametrize( + "arr, loc", + [ + ([None, 1, 2], 0), + ([0, None, 2], 1), + ([0, 1, None], 2), + ([0, 1, 1, None, None], 3), + ([1, 1, 1, 2], -1), + ([], -1), + ], +) +def test_first_fill_value_loc(arr, loc): + result = SparseArray(arr)._first_fill_value_loc() + assert result == loc + + +@pytest.mark.parametrize( + "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]] +) +@pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) +def test_unique_na_fill(arr, fill_value): + a = SparseArray(arr, fill_value=fill_value).unique() + b = pd.Series(arr).unique() + assert isinstance(a, SparseArray) + a = np.asarray(a) + tm.assert_numpy_array_equal(a, b) + + +def test_unique_all_sparse(): + # https://github.com/pandas-dev/pandas/issues/23168 + arr = SparseArray([0, 0]) + result = arr.unique() + expected = SparseArray([0]) + tm.assert_sp_array_equal(result, expected) + + +def test_map(): + arr = SparseArray([0, 1, 2]) + expected = SparseArray([10, 11, 12], fill_value=10) + + # dict + result = arr.map({0: 10, 1: 11, 2: 12}) + tm.assert_sp_array_equal(result, expected) + + # series + result = arr.map(pd.Series({0: 10, 1: 11, 2: 12})) + tm.assert_sp_array_equal(result, expected) + + # function + result = arr.map(pd.Series({0: 10, 1: 11, 2: 12})) + expected = SparseArray([10, 11, 12], fill_value=10) + tm.assert_sp_array_equal(result, expected) + + +def test_map_missing(): + arr = SparseArray([0, 1, 2]) + expected = SparseArray([10, 11, None], fill_value=10) + + result = arr.map({0: 10, 1: 11}) + tm.assert_sp_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py new file mode 100644 index 0000000..f1697dc --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py @@ -0,0 +1,31 @@ +import numpy as np +import pytest + +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +class TestSparseArrayConcat: + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_basic(self, kind): + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=kind) + + result = SparseArray._concat_same_type([a, b]) + # Can't make any assertions about the sparse index itself + # since we aren't don't merge sparse blocs across arrays + # in to_concat + expected = np.array([1, 2, 1, 2, 2], dtype="int64") + tm.assert_numpy_array_equal(result.sp_values, expected) + assert result.kind == kind + + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_uses_first_kind(self, kind): + other = "integer" if kind == "block" else "block" + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=other) + + result = SparseArray._concat_same_type([a, b]) + expected = np.array([1, 2, 1, 2, 2], dtype="int64") + tm.assert_numpy_array_equal(result.sp_values, expected) + assert result.kind == kind diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_dtype.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_dtype.py new file mode 100644 index 0000000..5e9e2d8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_dtype.py @@ -0,0 +1,198 @@ +import re + +import numpy as np +import pytest + +import pandas as pd +from 
diff --git a/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_libsparse.py b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_libsparse.py
new file mode 100644
index 0000000..a2f861d
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/arrays/sparse/test_libsparse.py
@@ -0,0 +1,601 @@
diff --git a/venv/Lib/site-packages/pandas/tests/arrays/string_/__init__.py b/venv/Lib/site-packages/pandas/tests/arrays/string_/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/venv/Lib/site-packages/pandas/tests/arrays/string_/test_string.py b/venv/Lib/site-packages/pandas/tests/arrays/string_/test_string.py
new file mode 100644
index 0000000..5e2f14a
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/arrays/string_/test_string.py
@@ -0,0 +1,271 @@
diff --git a/venv/Lib/site-packages/pandas/tests/arrays/test_array.py b/venv/Lib/site-packages/pandas/tests/arrays/test_array.py
new file mode 100644
index 0000000..b1b5a94
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/arrays/test_array.py
@@ -0,0 +1,385 @@
diff --git a/venv/Lib/site-packages/pandas/tests/arrays/test_boolean.py b/venv/Lib/site-packages/pandas/tests/arrays/test_boolean.py
new file mode 100644
index 0000000..200446f
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/arrays/test_boolean.py
@@ -0,0 +1,913 @@
[True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/test_datetimelike.py b/venv/Lib/site-packages/pandas/tests/arrays/test_datetimelike.py new file mode 100644 index 0000000..fa45db9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/test_datetimelike.py @@ -0,0 +1,796 @@ +from typing import Type, Union + +import numpy as np +import pytest + +from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + + +# TODO: more freq variants +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) +def period_index(request): + """ + A fixture to provide PeriodIndex objects with different frequencies. + + Most PeriodArray behavior is already tested in PeriodIndex tests, + so here we just test that the PeriodArray behavior matches + the PeriodIndex behavior. + """ + freqstr = request.param + # TODO: non-monotone indexes; NaTs, different start dates + pi = pd.period_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) + return pi + + +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) +def datetime_index(request): + """ + A fixture to provide DatetimeIndex objects with different frequencies. + + Most DatetimeArray behavior is already tested in DatetimeIndex tests, + so here we just test that the DatetimeArray behavior matches + the DatetimeIndex behavior. + """ + freqstr = request.param + # TODO: non-monotone indexes; NaTs, different start dates, timezones + dti = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) + return dti + + +@pytest.fixture +def timedelta_index(request): + """ + A fixture to provide TimedeltaIndex objects with different frequencies. + Most TimedeltaArray behavior is already tested in TimedeltaIndex tests, + so here we just test that the TimedeltaArray behavior matches + the TimedeltaIndex behavior. 
+ """ + # TODO: flesh this out + return pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"]) + + +class SharedTests: + index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + + def test_compare_len1_raises(self): + # make sure we raise when comparing with different lengths, specific + # to the case where one has length-1, which numpy would broadcast + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + + idx = self.index_cls._simple_new(data, freq="D") + arr = self.array_cls(idx) + + with pytest.raises(ValueError, match="Lengths must match"): + arr == arr[:1] + + # test the index classes while we're at it, GH#23078 + with pytest.raises(ValueError, match="Lengths must match"): + idx <= idx[[0]] + + def test_take(self): + data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 + np.random.shuffle(data) + + idx = self.index_cls._simple_new(data, freq="D") + arr = self.array_cls(idx) + + takers = [1, 4, 94] + result = arr.take(takers) + expected = idx.take(takers) + + tm.assert_index_equal(self.index_cls(result), expected) + + takers = np.array([1, 4, 94]) + result = arr.take(takers) + expected = idx.take(takers) + + tm.assert_index_equal(self.index_cls(result), expected) + + def test_take_fill(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + + idx = self.index_cls._simple_new(data, freq="D") + arr = self.array_cls(idx) + + result = arr.take([-1, 1], allow_fill=True, fill_value=None) + assert result[0] is pd.NaT + + result = arr.take([-1, 1], allow_fill=True, fill_value=np.nan) + assert result[0] is pd.NaT + + result = arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT) + assert result[0] is pd.NaT + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, fill_value=2) + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, fill_value=2.0) + + with pytest.raises(ValueError): + arr.take([0, 1], allow_fill=True, fill_value=pd.Timestamp.now().time) + + def test_concat_same_type(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + + idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) + arr = self.array_cls(idx) + + result = arr._concat_same_type([arr[:-1], arr[1:], arr]) + expected = idx._concat_same_dtype([idx[:-1], idx[1:], idx], None) + + tm.assert_index_equal(self.index_cls(result), expected) + + def test_unbox_scalar(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + result = arr._unbox_scalar(arr[0]) + assert isinstance(result, int) + + result = arr._unbox_scalar(pd.NaT) + assert isinstance(result, int) + + with pytest.raises(ValueError): + arr._unbox_scalar("foo") + + def test_check_compatible_with(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + arr._check_compatible_with(arr[0]) + arr._check_compatible_with(arr[:1]) + arr._check_compatible_with(pd.NaT) + + def test_scalar_from_string(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + result = arr._scalar_from_string(str(arr[0])) + assert result == arr[0] + + def test_reduce_invalid(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + with pytest.raises(TypeError, match="cannot perform"): + arr._reduce("not a method") + + @pytest.mark.parametrize("method", ["pad", "backfill"]) + def test_fillna_method_doesnt_change_orig(self, method): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + 
arr[4] = pd.NaT + + fill_value = arr[3] if method == "pad" else arr[5] + + result = arr.fillna(method=method) + assert result[4] == fill_value + + # check that the original was not changed + assert arr[4] is pd.NaT + + def test_searchsorted(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + # scalar + result = arr.searchsorted(arr[1]) + assert result == 1 + + result = arr.searchsorted(arr[2], side="right") + assert result == 3 + + # own-type + result = arr.searchsorted(arr[1:3]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = arr.searchsorted(arr[1:3], side="right") + expected = np.array([2, 3], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + result = arr.searchsorted(pd.NaT) + assert result == 0 + + def test_setitem(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + arr[0] = arr[1] + expected = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + expected[0] = expected[1] + + tm.assert_numpy_array_equal(arr.asi8, expected) + + arr[:2] = arr[-2:] + expected[:2] = expected[-2:] + tm.assert_numpy_array_equal(arr.asi8, expected) + + def test_setitem_raises(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + val = arr[0] + + with pytest.raises(IndexError, match="index 12 is out of bounds"): + arr[12] = val + + with pytest.raises(TypeError, match="'value' should be a.* 'object'"): + arr[0] = object() + + def test_inplace_arithmetic(self): + # GH#24115 check that iadd and isub are actually in-place + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + expected = arr + pd.Timedelta(days=1) + arr += pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + + expected = arr - pd.Timedelta(days=1) + arr -= pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + + +class TestDatetimeArray(SharedTests): + index_cls = pd.DatetimeIndex + array_cls = DatetimeArray + + def test_round(self, tz_naive_fixture): + # GH#24064 + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + + result = dti.round(freq="2T") + expected = dti - pd.Timedelta(minutes=1) + tm.assert_index_equal(result, expected) + + def test_array_interface(self, datetime_index): + arr = DatetimeArray(datetime_index) + + # default asarray gives the same underlying data (for tz naive) + result = np.asarray(arr) + expected = arr._data + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, copy=False) + assert result is expected + tm.assert_numpy_array_equal(result, expected) + + # specifying M8[ns] gives the same result as default + result = np.asarray(arr, dtype="datetime64[ns]") + expected = arr._data + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, dtype="datetime64[ns]", copy=False) + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, dtype="datetime64[ns]") + assert result is not expected + tm.assert_numpy_array_equal(result, expected) + + # to object dtype + result = np.asarray(arr, dtype=object) + expected = np.array(list(arr), dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # to other dtype always copies + result = np.asarray(arr, dtype="int64") + assert result 
is not arr.asi8 + assert not np.may_share_memory(arr, result) + expected = arr.asi8.copy() + tm.assert_numpy_array_equal(result, expected) + + # other dtypes handled by numpy + for dtype in ["float64", str]: + result = np.asarray(arr, dtype=dtype) + expected = np.asarray(arr).astype(dtype) + tm.assert_numpy_array_equal(result, expected) + + def test_array_object_dtype(self, tz_naive_fixture): + # GH#23524 + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = np.array(list(dti)) + + result = np.array(arr, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # also test the DatetimeIndex method while we're at it + result = np.array(dti, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_array_tz(self, tz_naive_fixture): + # GH#23524 + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = dti.asi8.view("M8[ns]") + result = np.array(arr, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + result = np.array(arr, dtype="datetime64[ns]") + tm.assert_numpy_array_equal(result, expected) + + # check that we are not making copies when setting copy=False + result = np.array(arr, dtype="M8[ns]", copy=False) + assert result.base is expected.base + assert result.base is not None + result = np.array(arr, dtype="datetime64[ns]", copy=False) + assert result.base is expected.base + assert result.base is not None + + def test_array_i8_dtype(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + arr = DatetimeArray(dti) + + expected = dti.asi8 + result = np.array(arr, dtype="i8") + tm.assert_numpy_array_equal(result, expected) + + result = np.array(arr, dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + # check that we are still making copies when setting copy=False + result = np.array(arr, dtype="i8", copy=False) + assert result.base is not expected.base + assert result.base is None + + def test_from_array_keeps_base(self): + # Ensure that DatetimeArray._data.base isn't lost. 
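test_array_interface and test_array_i8_dtype above describe how a tz-naive DatetimeArray converts to NumPy: the default and "datetime64[ns]" conversions reuse the stored data, object dtype yields Timestamp objects, and "int64" always copies the epoch nanoseconds. A short sketch of those conversions (illustrative, not taken from the patch):

    import numpy as np
    import pandas as pd

    arr = pd.date_range("2000-01-01", periods=3, freq="D").array  # tz-naive DatetimeArray

    m8 = np.asarray(arr)                 # datetime64[ns], backed by the stored data
    obj = np.asarray(arr, dtype=object)  # pd.Timestamp objects
    i8 = np.asarray(arr, dtype="int64")  # epoch nanoseconds, always a fresh copy

    assert m8.dtype == "datetime64[ns]"
    assert obj[0] == pd.Timestamp("2000-01-01")
    assert i8[0] == arr.asi8[0]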
+ arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") + dta = DatetimeArray(arr) + + assert dta._data is arr + dta = DatetimeArray(arr[:0]) + assert dta._data.base is arr + + def test_from_dti(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + arr = DatetimeArray(dti) + assert list(dti) == list(arr) + + # Check that Index.__new__ knows what to do with DatetimeArray + dti2 = pd.Index(arr) + assert isinstance(dti2, pd.DatetimeIndex) + assert list(dti2) == list(arr) + + def test_astype_object(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + arr = DatetimeArray(dti) + asobj = arr.astype("O") + assert isinstance(asobj, np.ndarray) + assert asobj.dtype == "O" + assert list(asobj) == list(dti) + + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) + def test_to_perioddelta(self, datetime_index, freqstr): + # GH#23113 + dti = datetime_index + arr = DatetimeArray(dti) + + expected = dti.to_perioddelta(freq=freqstr) + result = arr.to_perioddelta(freq=freqstr) + assert isinstance(result, TimedeltaArray) + + # placeholder until these become actual EA subclasses and we can use + # an EA-specific tm.assert_ function + tm.assert_index_equal(pd.Index(result), pd.Index(expected)) + + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) + def test_to_period(self, datetime_index, freqstr): + dti = datetime_index + arr = DatetimeArray(dti) + + expected = dti.to_period(freq=freqstr) + result = arr.to_period(freq=freqstr) + assert isinstance(result, PeriodArray) + + # placeholder until these become actual EA subclasses and we can use + # an EA-specific tm.assert_ function + tm.assert_index_equal(pd.Index(result), pd.Index(expected)) + + @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops) + def test_bool_properties(self, datetime_index, propname): + # in this case _bool_ops is just `is_leap_year` + dti = datetime_index + arr = DatetimeArray(dti) + assert dti.freq == arr.freq + + result = getattr(arr, propname) + expected = np.array(getattr(dti, propname), dtype=result.dtype) + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("propname", pd.DatetimeIndex._field_ops) + def test_int_properties(self, datetime_index, propname): + dti = datetime_index + arr = DatetimeArray(dti) + + result = getattr(arr, propname) + expected = np.array(getattr(dti, propname), dtype=result.dtype) + + tm.assert_numpy_array_equal(result, expected) + + def test_take_fill_valid(self, datetime_index, tz_naive_fixture): + dti = datetime_index.tz_localize(tz_naive_fixture) + arr = DatetimeArray(dti) + + now = pd.Timestamp.now().tz_localize(dti.tz) + result = arr.take([-1, 1], allow_fill=True, fill_value=now) + assert result[0] == now + + with pytest.raises(ValueError): + # fill_value Timedelta invalid + arr.take([-1, 1], allow_fill=True, fill_value=now - now) + + with pytest.raises(ValueError): + # fill_value Period invalid + arr.take([-1, 1], allow_fill=True, fill_value=pd.Period("2014Q1")) + + tz = None if dti.tz is not None else "US/Eastern" + now = pd.Timestamp.now().tz_localize(tz) + with pytest.raises(TypeError): + # Timestamp with mismatched tz-awareness + arr.take([-1, 1], allow_fill=True, fill_value=now) + + with pytest.raises(ValueError): + # require NaT, not iNaT, as it could be confused with an integer + arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT.value) + + def test_concat_same_type_invalid(self, datetime_index): + # different 
timezones + dti = datetime_index + arr = DatetimeArray(dti) + + if arr.tz is None: + other = arr.tz_localize("UTC") + else: + other = arr.tz_localize(None) + + with pytest.raises(AssertionError): + arr._concat_same_type([arr, other]) + + def test_concat_same_type_different_freq(self): + # we *can* concatenate DTI with different freqs. + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) + result = DatetimeArray._concat_same_type([a, b]) + expected = DatetimeArray( + pd.to_datetime( + [ + "2000-01-01 00:00:00", + "2000-01-02 00:00:00", + "2000-01-01 00:00:00", + "2000-01-01 01:00:00", + ] + ).tz_localize("US/Central") + ) + + tm.assert_datetime_array_equal(result, expected) + + def test_strftime(self, datetime_index): + arr = DatetimeArray(datetime_index) + + result = arr.strftime("%Y %b") + expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = DatetimeArray(DatetimeIndex(["2019-01-01", pd.NaT])) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +class TestTimedeltaArray(SharedTests): + index_cls = pd.TimedeltaIndex + array_cls = TimedeltaArray + + def test_from_tdi(self): + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) + arr = TimedeltaArray(tdi) + assert list(arr) == list(tdi) + + # Check that Index.__new__ knows what to do with TimedeltaArray + tdi2 = pd.Index(arr) + assert isinstance(tdi2, pd.TimedeltaIndex) + assert list(tdi2) == list(arr) + + def test_astype_object(self): + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) + arr = TimedeltaArray(tdi) + asobj = arr.astype("O") + assert isinstance(asobj, np.ndarray) + assert asobj.dtype == "O" + assert list(asobj) == list(tdi) + + def test_to_pytimedelta(self, timedelta_index): + tdi = timedelta_index + arr = TimedeltaArray(tdi) + + expected = tdi.to_pytimedelta() + result = arr.to_pytimedelta() + + tm.assert_numpy_array_equal(result, expected) + + def test_total_seconds(self, timedelta_index): + tdi = timedelta_index + arr = TimedeltaArray(tdi) + + expected = tdi.total_seconds() + result = arr.total_seconds() + + tm.assert_numpy_array_equal(result, expected.values) + + @pytest.mark.parametrize("propname", pd.TimedeltaIndex._field_ops) + def test_int_properties(self, timedelta_index, propname): + tdi = timedelta_index + arr = TimedeltaArray(tdi) + + result = getattr(arr, propname) + expected = np.array(getattr(tdi, propname), dtype=result.dtype) + + tm.assert_numpy_array_equal(result, expected) + + def test_array_interface(self, timedelta_index): + arr = TimedeltaArray(timedelta_index) + + # default asarray gives the same underlying data + result = np.asarray(arr) + expected = arr._data + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, copy=False) + assert result is expected + tm.assert_numpy_array_equal(result, expected) + + # specifying m8[ns] gives the same result as default + result = np.asarray(arr, dtype="timedelta64[ns]") + expected = arr._data + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, dtype="timedelta64[ns]", copy=False) + assert result is expected + tm.assert_numpy_array_equal(result, expected) + result = np.array(arr, dtype="timedelta64[ns]") + assert result is not expected + 
tm.assert_numpy_array_equal(result, expected) + + # to object dtype + result = np.asarray(arr, dtype=object) + expected = np.array(list(arr), dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # to other dtype always copies + result = np.asarray(arr, dtype="int64") + assert result is not arr.asi8 + assert not np.may_share_memory(arr, result) + expected = arr.asi8.copy() + tm.assert_numpy_array_equal(result, expected) + + # other dtypes handled by numpy + for dtype in ["float64", str]: + result = np.asarray(arr, dtype=dtype) + expected = np.asarray(arr).astype(dtype) + tm.assert_numpy_array_equal(result, expected) + + def test_take_fill_valid(self, timedelta_index): + tdi = timedelta_index + arr = TimedeltaArray(tdi) + + td1 = pd.Timedelta(days=1) + result = arr.take([-1, 1], allow_fill=True, fill_value=td1) + assert result[0] == td1 + + now = pd.Timestamp.now() + with pytest.raises(ValueError): + # fill_value Timestamp invalid + arr.take([0, 1], allow_fill=True, fill_value=now) + + with pytest.raises(ValueError): + # fill_value Period invalid + arr.take([0, 1], allow_fill=True, fill_value=now.to_period("D")) + + +class TestPeriodArray(SharedTests): + index_cls = pd.PeriodIndex + array_cls = PeriodArray + + def test_from_pi(self, period_index): + pi = period_index + arr = PeriodArray(pi) + assert list(arr) == list(pi) + + # Check that Index.__new__ knows what to do with PeriodArray + pi2 = pd.Index(arr) + assert isinstance(pi2, pd.PeriodIndex) + assert list(pi2) == list(arr) + + def test_astype_object(self, period_index): + pi = period_index + arr = PeriodArray(pi) + asobj = arr.astype("O") + assert isinstance(asobj, np.ndarray) + assert asobj.dtype == "O" + assert list(asobj) == list(pi) + + @pytest.mark.parametrize("how", ["S", "E"]) + def test_to_timestamp(self, how, period_index): + pi = period_index + arr = PeriodArray(pi) + + expected = DatetimeArray(pi.to_timestamp(how=how)) + result = arr.to_timestamp(how=how) + assert isinstance(result, DatetimeArray) + + # placeholder until these become actual EA subclasses and we can use + # an EA-specific tm.assert_ function + tm.assert_index_equal(pd.Index(result), pd.Index(expected)) + + def test_to_timestamp_out_of_bounds(self): + # GH#19643 previously overflowed silently + pi = pd.period_range("1500", freq="Y", periods=3) + with pytest.raises(OutOfBoundsDatetime): + pi.to_timestamp() + + with pytest.raises(OutOfBoundsDatetime): + pi._data.to_timestamp() + + @pytest.mark.parametrize("propname", PeriodArray._bool_ops) + def test_bool_properties(self, period_index, propname): + # in this case _bool_ops is just `is_leap_year` + pi = period_index + arr = PeriodArray(pi) + + result = getattr(arr, propname) + expected = np.array(getattr(pi, propname)) + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("propname", PeriodArray._field_ops) + def test_int_properties(self, period_index, propname): + pi = period_index + arr = PeriodArray(pi) + + result = getattr(arr, propname) + expected = np.array(getattr(pi, propname)) + + tm.assert_numpy_array_equal(result, expected) + + def test_array_interface(self, period_index): + arr = PeriodArray(period_index) + + # default asarray gives objects + result = np.asarray(arr) + expected = np.array(list(arr), dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # to object dtype (same as default) + result = np.asarray(arr, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # to other dtypes + with pytest.raises(TypeError): + np.asarray(arr, 
dtype="int64") + + with pytest.raises(TypeError): + np.asarray(arr, dtype="float64") + + result = np.asarray(arr, dtype="S20") + expected = np.asarray(arr).astype("S20") + tm.assert_numpy_array_equal(result, expected) + + def test_strftime(self, period_index): + arr = PeriodArray(period_index) + + result = arr.strftime("%Y") + expected = np.array([per.strftime("%Y") for per in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = PeriodArray(PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]")) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "array,casting_nats", + [ + ( + pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data, + (pd.NaT, np.timedelta64("NaT", "ns")), + ), + ( + pd.date_range("2000-01-01", periods=3, freq="D")._data, + (pd.NaT, np.datetime64("NaT", "ns")), + ), + (pd.period_range("2000-01-01", periods=3, freq="D")._data, (pd.NaT,)), + ], + ids=lambda x: type(x).__name__, +) +def test_casting_nat_setitem_array(array, casting_nats): + expected = type(array)._from_sequence([pd.NaT, array[1], array[2]]) + + for nat in casting_nats: + arr = array.copy() + arr[0] = nat + tm.assert_equal(arr, expected) + + +@pytest.mark.parametrize( + "array,non_casting_nats", + [ + ( + pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data, + (np.datetime64("NaT", "ns"), pd.NaT.value), + ), + ( + pd.date_range("2000-01-01", periods=3, freq="D")._data, + (np.timedelta64("NaT", "ns"), pd.NaT.value), + ), + ( + pd.period_range("2000-01-01", periods=3, freq="D")._data, + (np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns"), pd.NaT.value), + ), + ], + ids=lambda x: type(x).__name__, +) +def test_invalid_nat_setitem_array(array, non_casting_nats): + for nat in non_casting_nats: + with pytest.raises(TypeError): + array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/venv/Lib/site-packages/pandas/tests/arrays/test_datetimes.py b/venv/Lib/site-packages/pandas/tests/arrays/test_datetimes.py new file mode 100644 index 0000000..5608ab5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/test_datetimes.py @@ -0,0 +1,406 @@ +""" +Tests for DatetimeArray +""" +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray +from pandas.core.arrays.datetimes import sequence_to_dt64ns + + +class TestDatetimeArrayConstructor: + def test_from_sequence_invalid_type(self): + mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)]) + with pytest.raises(TypeError, match="Cannot create a DatetimeArray"): + 
DatetimeArray._from_sequence(mi) + + def test_only_1dim_accepted(self): + arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + DatetimeArray(arr[[0]].squeeze()) + + def test_freq_validation(self): + # GH#24623 check that invalid instances cannot be created with the + # public constructor + arr = np.arange(5, dtype=np.int64) * 3600 * 10 ** 9 + + msg = ( + "Inferred frequency H from passed values does not " + "conform to passed frequency W-SUN" + ) + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, freq="W") + + @pytest.mark.parametrize( + "meth", + [ + DatetimeArray._from_sequence, + sequence_to_dt64ns, + pd.to_datetime, + pd.DatetimeIndex, + ], + ) + def test_mixing_naive_tzaware_raises(self, meth): + # GH#24569 + arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) + + msg = ( + "Cannot mix tz-aware with tz-naive values|" + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True" + ) + + for obj in [arr, arr[::-1]]: + # check that we raise regardless of whether naive is found + # before aware or vice-versa + with pytest.raises(ValueError, match=msg): + meth(obj) + + def test_from_pandas_array(self): + arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10 ** 9 + + result = DatetimeArray._from_sequence(arr, freq="infer") + + expected = pd.date_range("1970-01-01", periods=5, freq="H")._data + tm.assert_datetime_array_equal(result, expected) + + def test_mismatched_timezone_raises(self): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) + dtype = DatetimeTZDtype(tz="US/Eastern") + with pytest.raises(TypeError, match="Timezone of the array"): + DatetimeArray(arr, dtype=dtype) + + def test_non_array_raises(self): + with pytest.raises(ValueError, match="list"): + DatetimeArray([1, 2, 3]) + + def test_other_type_raises(self): + with pytest.raises( + ValueError, match="The dtype of 'values' is incorrect.*bool" + ): + DatetimeArray(np.array([1, 2, 3], dtype="bool")) + + def test_incorrect_dtype_raises(self): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + + def test_freq_infer_raises(self): + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") + + def test_copy(self): + data = np.array([1, 2, 3], dtype="M8[ns]") + arr = DatetimeArray(data, copy=False) + assert arr._data is data + + arr = DatetimeArray(data, copy=True) + assert arr._data is not data + + +class TestDatetimeArrayComparisons: + # TODO: merge this into tests/arithmetic/test_datetime64 once it is + # sufficiently robust + + def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): + # arbitrary tz-naive DatetimeIndex + opname = all_compare_operators.strip("_") + op = getattr(operator, opname) + + dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None) + arr = DatetimeArray(dti) + assert arr.freq == dti.freq + assert arr.tz == dti.tz + + right = dti + + expected = np.ones(len(arr), dtype=bool) + if opname in ["ne", "gt", "lt"]: + # for these the comparisons should be all-False + expected = ~expected + + result = op(arr, arr) + tm.assert_numpy_array_equal(result, expected) + for other 
in [right, np.array(right)]: + # TODO: add list and tuple, and object-dtype once those + # are fixed in the constructor + result = op(arr, other) + tm.assert_numpy_array_equal(result, expected) + + result = op(other, arr) + tm.assert_numpy_array_equal(result, expected) + + +class TestDatetimeArray: + def test_astype_to_same(self): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) + assert result is arr + + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) + def test_astype_int(self, dtype): + arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) + result = arr.astype(dtype) + + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") + else: + expected_dtype = np.dtype("int64") + expected = arr.astype(expected_dtype) + + assert result.dtype == expected_dtype + tm.assert_numpy_array_equal(result, expected) + + def test_tz_setter_raises(self): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(AttributeError, match="tz_localize"): + arr.tz = "UTC" + + def test_setitem_different_tz_raises(self): + data = np.array([1, 2, 3], dtype="M8[ns]") + arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): + arr[0] = pd.Timestamp("2000") + + with pytest.raises(ValueError, match="US/Central"): + arr[0] = pd.Timestamp("2000", tz="US/Eastern") + + def test_setitem_clears_freq(self): + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + a[0] = pd.Timestamp("2000", tz="US/Central") + assert a.freq is None + + @pytest.mark.parametrize( + "obj", + [ + pd.Timestamp.now(), + pd.Timestamp.now().to_datetime64(), + pd.Timestamp.now().to_pydatetime(), + ], + ) + def test_setitem_objects(self, obj): + # make sure we accept datetime64 and datetime in addition to Timestamp + dti = pd.date_range("2000", periods=2, freq="D") + arr = dti._data + + arr[0] = obj + assert arr[0] == obj + + def test_repeat_preserves_tz(self): + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") + arr = DatetimeArray(dti) + + repeated = arr.repeat([1, 1]) + + # preserves tz and values, but not freq + expected = DatetimeArray(arr.asi8, freq=None, dtype=arr.dtype) + tm.assert_equal(repeated, expected) + + def test_value_counts_preserves_tz(self): + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") + arr = DatetimeArray(dti).repeat([4, 3]) + + result = arr.value_counts() + + # Note: not tm.assert_index_equal, since `freq`s do not match + assert result.index.equals(dti) + + arr[-2] = pd.NaT + result = arr.value_counts() + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("method", ["pad", "backfill"]) + def test_fillna_preserves_tz(self, method): + dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") + arr = DatetimeArray(dti, copy=True) + arr[2] = pd.NaT + + fill_val = dti[1] if method == "pad" else dti[3] + expected = DatetimeArray._from_sequence( + [dti[0], dti[1], fill_val, dti[3], dti[4]], freq=None, tz="US/Central" + ) + + result = arr.fillna(method=method) + tm.assert_extension_array_equal(result, expected) + + # assert that arr and dti were not modified in-place + assert arr[2] is pd.NaT + assert dti[2] == pd.Timestamp("2000-01-03", tz="US/Central") + + def test_array_interface_tz(self): + tz = 
"US/Central" + data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) + result = np.asarray(data) + + expected = np.array( + [ + pd.Timestamp("2017-01-01T00:00:00", tz=tz), + pd.Timestamp("2017-01-02T00:00:00", tz=tz), + ], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(data, dtype="M8[ns]") + + expected = np.array( + ["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]" + ) + tm.assert_numpy_array_equal(result, expected) + + def test_array_interface(self): + data = DatetimeArray(pd.date_range("2017", periods=2)) + expected = np.array( + ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]" + ) + + result = np.asarray(data) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(data, dtype=object) + expected = np.array( + [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_different_tz(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + if index: + arr = pd.Index(arr) + + expected = arr.searchsorted(arr[2]) + result = arr.searchsorted(arr[2].tz_convert("UTC")) + assert result == expected + + expected = arr.searchsorted(arr[2:6]) + result = arr.searchsorted(arr[2:6].tz_convert("UTC")) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_tzawareness_compat(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + mismatch = arr.tz_localize("Asia/Tokyo") + + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch[0]) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch) + + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr[0]) + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr) + + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.timedelta64("NaT"), + pd.Timedelta(days=2), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10 ** 9, + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize( + "index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail( + reason="Raises ValueError instead of TypeError", raises=ValueError + ), + ), + ], + ) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "searchsorted requires compatible dtype or scalar" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + + +class TestSequenceToDT64NS: + def test_tz_dtype_mismatch_raises(self): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(TypeError, match="data is already tz-aware"): + sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC")) + + def test_tz_dtype_matches(self): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central")) + tm.assert_numpy_array_equal(arr._data, result) + + +class 
TestReductions: + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_min_max(self, tz): + arr = DatetimeArray._from_sequence( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + tz=tz, + ) + + result = arr.min() + expected = pd.Timestamp("2000-01-02", tz=tz) + assert result == expected + + result = arr.max() + expected = pd.Timestamp("2000-01-05", tz=tz) + assert result == expected + + result = arr.min(skipna=False) + assert result is pd.NaT + + result = arr.max(skipna=False) + assert result is pd.NaT + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_empty(self, skipna, tz): + arr = DatetimeArray._from_sequence([], tz=tz) + result = arr.min(skipna=skipna) + assert result is pd.NaT + + result = arr.max(skipna=skipna) + assert result is pd.NaT diff --git a/venv/Lib/site-packages/pandas/tests/arrays/test_integer.py b/venv/Lib/site-packages/pandas/tests/arrays/test_integer.py new file mode 100644 index 0000000..0c5ae50 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/arrays/test_integer.py @@ -0,0 +1,1079 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.generic import ABCIndexClass + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar +from pandas.core.arrays import IntegerArray, integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) +from pandas.tests.extension.base import BaseOpsUtil + + +def make_data(): + return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return integer_array([np.nan, 1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (Int8Dtype(), "Int8Dtype()"), + (Int16Dtype(), "Int16Dtype()"), + (Int32Dtype(), "Int32Dtype()"), + (Int64Dtype(), "Int64Dtype()"), + (UInt8Dtype(), "UInt8Dtype()"), + (UInt16Dtype(), "UInt16Dtype()"), + (UInt32Dtype(), "UInt32Dtype()"), + (UInt64Dtype(), "UInt64Dtype()"), + ], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(integer_array([1, None, 3])) + expected = "\n[1, , 3]\nLength: 3, dtype: Int64" + assert result == expected + + +def test_repr_array_long(): + data = integer_array([1, 2, None] * 1000) + expected = ( + "\n" + "[ 1, 2, , 1, 2, , 1, 2, , 1,\n" + " ...\n" + " , 1, 2, , 1, 2, , 1, 2, ]\n" + "Length: 3000, dtype: Int64" + ) + result = repr(data) + assert result == expected + + +class TestConstructors: + def 
test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + + def test_from_dtype_from_float(self, data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series( + data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) + ) + tm.assert_series_equal(result, expected) + + # from int / list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / array + expected = pd.Series(data).dropna().reset_index(drop=True) + dropped = np.array(data.dropna()).astype(np.dtype((dtype.type))) + result = pd.Series(dropped, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + +class TestArithmeticOps(BaseOpsUtil): + def _check_divmod_op(self, s, op, other, exc=None): + super()._check_divmod_op(s, op, other, None) + + def _check_op(self, s, op_name, other, exc=None): + op = self.get_op_from_name(op_name) + result = op(s, other) + + # compute expected + mask = s.isna() + + # if s is a DataFrame, squeeze to a Series + # for comparison + if isinstance(s, pd.DataFrame): + result = result.squeeze() + s = s.squeeze() + mask = mask.squeeze() + + # other array is an Integer + if isinstance(other, IntegerArray): + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) + if omask is not None: + mask |= omask + + # 1 ** na is na, so need to unmask those + if op_name == "__pow__": + mask = np.where(~s.isna() & (s == 1), False, mask) + + elif op_name == "__rpow__": + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) + + # float result type or float op + if ( + is_float_dtype(other) + or is_float(other) + or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] + ): + rs = s.astype("float") + expected = op(rs, other) + self._check_op_float(result, expected, mask, s, op_name, other) + + # integer result type + else: + rs = pd.Series(s.values._data, name=s.name) + expected = op(rs, other) + self._check_op_integer(result, expected, mask, s, op_name, other) + + def _check_op_float(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in float dtypes + + expected[mask] = np.nan + if "floordiv" in op_name: + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + mask2 = np.isinf(expected) & np.isnan(result) + expected[mask2] = np.nan + tm.assert_series_equal(result, expected) + + def _check_op_integer(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in integer dtypes + + # to compare properly, we convert the expected + # to float, mask to nans and convert infs + # if we have uints then we process as uints + # then convert to float + # and we ultimately want to create a IntArray + # for comparisons + + fill_value = 0 + + # mod/rmod turn floating 0 into NaN while + # integer works as expected (no nan) + if op_name in ["__mod__", "__rmod__"]: + if is_scalar(other): + if other == 0: + expected[s.values == 0] = 0 + else: + expected = expected.fillna(0) + else: + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 + try: + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value + original = expected + expected = expected.astype(s.dtype) + + except ValueError: + + expected = 
= r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/venv/Lib/site-packages/pandas/tests/base/test_ops.py b/venv/Lib/site-packages/pandas/tests/base/test_ops.py new file mode 100644 index 0000000..2693eb1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/base/test_ops.py @@ -0,0 +1,899 @@ +from datetime import datetime, timedelta +from io import StringIO +import sys + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT +from pandas.compat import PYPY +from pandas.compat.numpy import np_array_datetime64_compat + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_datetime64tz_dtype, + is_object_dtype, + needs_i8_conversion, +) + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, +) +import pandas._testing as tm +from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin + + +class Ops: + def _allow_na_ops(self, obj): + """Whether to skip test cases including NaN""" + if (isinstance(obj, Index) and obj.is_boolean()) or not obj._can_hold_na: + # don't test boolean / integer dtypes + return False + return True + + def setup_method(self, method): + self.bool_index = tm.makeBoolIndex(10, name="a") + self.int_index = tm.makeIntIndex(10, name="a") + self.float_index = tm.makeFloatIndex(10, name="a") + self.dt_index = tm.makeDateIndex(10, name="a") + self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern") + self.period_index = tm.makePeriodIndex(10, name="a") + self.string_index = tm.makeStringIndex(10, name="a") + self.unicode_index = tm.makeUnicodeIndex(10, name="a") + + arr = np.random.randn(10) + self.bool_series = Series(arr, index=self.bool_index, name="a") + self.int_series = Series(arr, index=self.int_index, name="a") + self.float_series = Series(arr, index=self.float_index, name="a") + self.dt_series = Series(arr, index=self.dt_index, name="a") + self.dt_tz_series = self.dt_tz_index.to_series() + self.period_series = Series(arr, index=self.period_index, name="a") + self.string_series = Series(arr, index=self.string_index, name="a") + self.unicode_series = Series(arr, index=self.unicode_index, name="a") + + types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] + self.indexes = [getattr(self, f"{t}_index") for t in types] + self.series = [getattr(self, f"{t}_series") for t in types] + + # To test narrow dtypes, we use narrower *data* elements, not *index* elements + index = self.int_index + self.float32_series = Series(arr.astype(np.float32), index=index, name="a") + + arr_int = np.random.choice(10, size=10, replace=False) + self.int8_series = Series(arr_int.astype(np.int8), index=index, name="a") + self.int16_series = Series(arr_int.astype(np.int16), index=index, name="a") + self.int32_series = Series(arr_int.astype(np.int32), index=index, name="a") + + self.uint8_series = Series(arr_int.astype(np.uint8), index=index, name="a") + self.uint16_series = Series(arr_int.astype(np.uint16), index=index, name="a") + self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") + + nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] + self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] + + self.objs = 
self.indexes + self.series + self.narrow_series + + def check_ops_properties(self, props, filter=None, ignore_failures=False): + for op in props: + for o in self.is_valid_objs: + + # if a filter, skip if it doesn't match + if filter is not None: + filt = o.index if isinstance(o, Series) else o + if not filter(filt): + continue + + try: + if isinstance(o, Series): + expected = Series(getattr(o.index, op), index=o.index, name="a") + else: + expected = getattr(o, op) + except (AttributeError): + if ignore_failures: + continue + + result = getattr(o, op) + + # these could be series, arrays or scalars + if isinstance(result, Series) and isinstance(expected, Series): + tm.assert_series_equal(result, expected) + elif isinstance(result, Index) and isinstance(expected, Index): + tm.assert_index_equal(result, expected) + elif isinstance(result, np.ndarray) and isinstance( + expected, np.ndarray + ): + tm.assert_numpy_array_equal(result, expected) + else: + assert result == expected + + # freq raises AttributeError on an Int64Index because its not + # defined we mostly care about Series here anyhow + if not ignore_failures: + for o in self.not_valid_objs: + + # an object that is datetimelike will raise a TypeError, + # otherwise an AttributeError + err = AttributeError + if issubclass(type(o), DatetimeIndexOpsMixin): + err = TypeError + + with pytest.raises(err): + getattr(o, op) + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_binary_ops_docs(self, klass): + op_map = { + "add": "+", + "sub": "-", + "mul": "*", + "mod": "%", + "pow": "**", + "truediv": "/", + "floordiv": "//", + } + for op_name in op_map: + operand1 = klass.__name__.lower() + operand2 = "other" + op = op_map[op_name] + expected_str = " ".join([operand1, op, operand2]) + assert expected_str in getattr(klass, op_name).__doc__ + + # reverse version of the binary ops + expected_str = " ".join([operand2, op, operand1]) + assert expected_str in getattr(klass, "r" + op_name).__doc__ + + +class TestTranspose(Ops): + errmsg = "the 'axes' parameter is not supported" + + def test_transpose(self): + for obj in self.objs: + tm.assert_equal(obj.transpose(), obj) + + def test_transpose_non_default_axes(self): + for obj in self.objs: + with pytest.raises(ValueError, match=self.errmsg): + obj.transpose(1) + with pytest.raises(ValueError, match=self.errmsg): + obj.transpose(axes=1) + + def test_numpy_transpose(self): + for obj in self.objs: + tm.assert_equal(np.transpose(obj), obj) + + with pytest.raises(ValueError, match=self.errmsg): + np.transpose(obj, axes=1) + + +class TestIndexOps(Ops): + def setup_method(self, method): + super().setup_method(method) + self.is_valid_objs = self.objs + self.not_valid_objs = [] + + def test_none_comparison(self): + + # bug brought up by #1079 + # changed from TypeError in 0.17.0 + for o in self.is_valid_objs: + if isinstance(o, Series): + + o[0] = np.nan + + # noinspection PyComparisonWithNone + result = o == None # noqa + assert not result.iat[0] + assert not result.iat[1] + + # noinspection PyComparisonWithNone + result = o != None # noqa + assert result.iat[0] + assert result.iat[1] + + result = None == o # noqa + assert not result.iat[0] + assert not result.iat[1] + + result = None != o # noqa + assert result.iat[0] + assert result.iat[1] + + if is_datetime64_dtype(o) or is_datetime64tz_dtype(o): + # Following DatetimeIndex (and Timestamp) convention, + # inequality comparisons with Series[datetime64] raise + with pytest.raises(TypeError): + None > o + with pytest.raises(TypeError): + o > 
None + else: + result = None > o + assert not result.iat[0] + assert not result.iat[1] + + result = o < None + assert not result.iat[0] + assert not result.iat[1] + + def test_ndarray_compat_properties(self): + + for o in self.objs: + # Check that we work. + for p in ["shape", "dtype", "T", "nbytes"]: + assert getattr(o, p, None) is not None + + # deprecated properties + for p in ["flags", "strides", "itemsize", "base", "data"]: + assert not hasattr(o, p) + + with pytest.raises(ValueError): + o.item() # len > 1 + + assert o.ndim == 1 + assert o.size == len(o) + + assert Index([1]).item() == 1 + assert Series([1]).item() == 1 + + def test_value_counts_unique_nunique(self): + for orig in self.objs: + o = orig.copy() + klass = type(o) + values = o._values + + if isinstance(values, Index): + # reset name not to affect latter process + values.name = None + + # create repeated values, 'n'th element is repeated by n+1 times + # skip boolean, because it only has 2 values at most + if isinstance(o, Index) and o.is_boolean(): + continue + elif isinstance(o, Index): + expected_index = Index(o[::-1]) + expected_index.name = None + o = o.repeat(range(1, len(o) + 1)) + o.name = "a" + else: + expected_index = Index(values[::-1]) + idx = o.index.repeat(range(1, len(o) + 1)) + # take-based repeat + indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1)) + rep = values.take(indices) + o = klass(rep, index=idx, name="a") + + # check values has the same dtype as the original + assert o.dtype == orig.dtype + + expected_s = Series( + range(10, 0, -1), index=expected_index, dtype="int64", name="a" + ) + + result = o.value_counts() + tm.assert_series_equal(result, expected_s) + assert result.index.name is None + assert result.name == "a" + + result = o.unique() + if isinstance(o, Index): + assert isinstance(result, type(o)) + tm.assert_index_equal(result, orig) + assert result.dtype == orig.dtype + elif is_datetime64tz_dtype(o): + # datetimetz Series returns array of Timestamp + assert result[0] == orig[0] + for r in result: + assert isinstance(r, Timestamp) + + tm.assert_numpy_array_equal( + result.astype(object), orig._values.astype(object) + ) + else: + tm.assert_numpy_array_equal(result, orig.values) + assert result.dtype == orig.dtype + + assert o.nunique() == len(np.unique(o.values)) + + @pytest.mark.parametrize("null_obj", [np.nan, None]) + def test_value_counts_unique_nunique_null(self, null_obj): + + for orig in self.objs: + o = orig.copy() + klass = type(o) + values = o._ndarray_values + + if not self._allow_na_ops(o): + continue + + # special assign to the numpy array + if is_datetime64tz_dtype(o): + if isinstance(o, DatetimeIndex): + v = o.asi8 + v[0:2] = iNaT + values = o._shallow_copy(v) + else: + o = o.copy() + o[0:2] = pd.NaT + values = o._values + + elif needs_i8_conversion(o): + values[0:2] = iNaT + values = o._shallow_copy(values) + else: + values[0:2] = null_obj + # check values has the same dtype as the original + + assert values.dtype == o.dtype + + # create repeated values, 'n'th element is repeated by n+1 + # times + if isinstance(o, (DatetimeIndex, PeriodIndex)): + expected_index = o.copy() + expected_index.name = None + + # attach name to klass + o = klass(values.repeat(range(1, len(o) + 1))) + o.name = "a" + else: + if isinstance(o, DatetimeIndex): + expected_index = orig._values._shallow_copy(values) + else: + expected_index = Index(values) + expected_index.name = None + o = o.repeat(range(1, len(o) + 1)) + o.name = "a" + + # check values has the same dtype as the original + 
assert o.dtype == orig.dtype + # check values correctly have NaN + nanloc = np.zeros(len(o), dtype=np.bool) + nanloc[:3] = True + if isinstance(o, Index): + tm.assert_numpy_array_equal(pd.isna(o), nanloc) + else: + exp = Series(nanloc, o.index, name="a") + tm.assert_series_equal(pd.isna(o), exp) + + expected_s_na = Series( + list(range(10, 2, -1)) + [3], + index=expected_index[9:0:-1], + dtype="int64", + name="a", + ) + expected_s = Series( + list(range(10, 2, -1)), + index=expected_index[9:1:-1], + dtype="int64", + name="a", + ) + + result_s_na = o.value_counts(dropna=False) + tm.assert_series_equal(result_s_na, expected_s_na) + assert result_s_na.index.name is None + assert result_s_na.name == "a" + result_s = o.value_counts() + tm.assert_series_equal(o.value_counts(), expected_s) + assert result_s.index.name is None + assert result_s.name == "a" + + result = o.unique() + if isinstance(o, Index): + tm.assert_index_equal(result, Index(values[1:], name="a")) + elif is_datetime64tz_dtype(o): + # unable to compare NaT / nan + tm.assert_extension_array_equal(result[1:], values[2:]) + assert result[0] is pd.NaT + else: + tm.assert_numpy_array_equal(result[1:], values[2:]) + + assert pd.isna(result[0]) + assert result.dtype == orig.dtype + + assert o.nunique() == 8 + assert o.nunique(dropna=False) == 9 + + def test_value_counts_inferred(self, index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + + assert s.nunique() == 4 + # don't sort, have to sort after the fact as not sorting is + # platform-dep + hist = s.value_counts(sort=False).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + tm.assert_series_equal(hist, expected) + + # sort ascending + hist = s.value_counts(ascending=True) + expected = Series([1, 2, 3, 4], index=list("cdab")) + tm.assert_series_equal(hist, expected) + + # relative histogram. 
+ hist = s.value_counts(normalize=True) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(hist, expected) + + def test_value_counts_bins(self, index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + + # bins + with pytest.raises(TypeError): + s.value_counts(bins=1) + + s1 = Series([1, 1, 2, 3]) + res1 = s1.value_counts(bins=1) + exp1 = Series({Interval(0.997, 3.0): 4}) + tm.assert_series_equal(res1, exp1) + res1n = s1.value_counts(bins=1, normalize=True) + exp1n = Series({Interval(0.997, 3.0): 1.0}) + tm.assert_series_equal(res1n, exp1n) + + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + + assert s1.nunique() == 3 + + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4n = s1.value_counts(bins=4, normalize=True) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4n, exp4n) + + # handle NA's properly + s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] + s = klass(s_values) + expected = Series([4, 3, 2], index=["b", "a", "d"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(["a", "b", np.nan, "d"]) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(["a", "b", np.nan, "d"], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) + assert s.nunique() == 3 + + s = klass({}) if klass is dict else klass({}, dtype=object) + expected = Series([], dtype=np.int64) + tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) + # returned dtype differs depending on original + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), Index([]), exact=False) + else: + tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) + + assert s.nunique() == 0 + + def test_value_counts_datetime64(self, index_or_series): + klass = index_or_series + + # GH 3002, datetime64[ns] + # don't test names though + txt = "\n".join( + [ + "xxyyzz20100101PIE", + "xxyyzz20100101GUM", + "xxyyzz20100101EGG", + "xxyyww20090101EGG", + "foofoo20080909PIE", + "foofoo20080909GUM", + ] + ) + f = StringIO(txt) + df = pd.read_fwf( + f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] + ) + + s = klass(df["dt"].copy()) + s.name = None + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] + ) + expected_s = Series([3, 2, 1], index=idx) + tm.assert_series_equal(s.value_counts(), expected_s) + + expected = np_array_datetime64_compat( + ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], + dtype="datetime64[ns]", + ) + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) + else: + tm.assert_numpy_array_equal(s.unique(), expected) + + assert s.nunique() == 3 + + # with NaT + s = df["dt"].copy() + s = klass(list(s.values) + [pd.NaT]) + + result = s.value_counts() + assert result.index.dtype 
== "datetime64[ns]" + tm.assert_series_equal(result, expected_s) + + result = s.value_counts(dropna=False) + expected_s[pd.NaT] = 1 + tm.assert_series_equal(result, expected_s) + + unique = s.unique() + assert unique.dtype == "datetime64[ns]" + + # numpy_array_equal cannot compare pd.NaT + if isinstance(s, Index): + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + tm.assert_index_equal(unique, exp_idx) + else: + tm.assert_numpy_array_equal(unique[:3], expected) + assert pd.isna(unique[3]) + + assert s.nunique() == 3 + assert s.nunique(dropna=False) == 4 + + # timedelta64[ns] + td = df.dt - df.dt + timedelta(1) + td = klass(td, name="dt") + + result = td.value_counts() + expected_s = Series([6], index=[Timedelta("1day")], name="dt") + tm.assert_series_equal(result, expected_s) + + expected = TimedeltaIndex(["1 days"], name="dt") + if isinstance(td, Index): + tm.assert_index_equal(td.unique(), expected) + else: + tm.assert_numpy_array_equal(td.unique(), expected.values) + + td2 = timedelta(1) + (df.dt - df.dt) + td2 = klass(td2, name="dt") + result2 = td2.value_counts() + tm.assert_series_equal(result2, expected_s) + + def test_factorize(self): + for orig in self.objs: + o = orig.copy() + + if isinstance(o, Index) and o.is_boolean(): + exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) + exp_uniques = o + exp_uniques = Index([False, True]) + else: + exp_arr = np.array(range(len(o)), dtype=np.intp) + exp_uniques = o + codes, uniques = o.factorize() + + tm.assert_numpy_array_equal(codes, exp_arr) + if isinstance(o, Series): + tm.assert_index_equal(uniques, Index(orig), check_names=False) + else: + # factorize explicitly resets name + tm.assert_index_equal(uniques, exp_uniques, check_names=False) + + def test_factorize_repeated(self): + for orig in self.objs: + o = orig.copy() + + # don't test boolean + if isinstance(o, Index) and o.is_boolean(): + continue + + # sort by value, and create duplicates + if isinstance(o, Series): + o = o.sort_values() + n = o.iloc[5:].append(o) + else: + indexer = o.argsort() + o = o.take(indexer) + n = o[5:].append(o) + + exp_arr = np.array( + [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp + ) + codes, uniques = n.factorize(sort=True) + + tm.assert_numpy_array_equal(codes, exp_arr) + if isinstance(o, Series): + tm.assert_index_equal( + uniques, Index(orig).sort_values(), check_names=False + ) + else: + tm.assert_index_equal(uniques, o, check_names=False) + + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) + codes, uniques = n.factorize(sort=False) + tm.assert_numpy_array_equal(codes, exp_arr) + + if isinstance(o, Series): + expected = Index(o.iloc[5:10].append(o.iloc[:5])) + tm.assert_index_equal(uniques, expected, check_names=False) + else: + expected = o[5:10].append(o[:5]) + tm.assert_index_equal(uniques, expected, check_names=False) + + def test_duplicated_drop_duplicates_index(self): + # GH 4060 + for original in self.objs: + if isinstance(original, Index): + + # special case + if original.is_boolean(): + result = original.drop_duplicates() + expected = Index([False, True], name="a") + tm.assert_index_equal(result, expected) + continue + + # original doesn't have duplicates + expected = np.array([False] * len(original), dtype=bool) + duplicated = original.duplicated() + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + result = original.drop_duplicates() + tm.assert_index_equal(result, original) + assert result is not original + + # has_duplicates + assert not 
original.has_duplicates + + # create repeated values, 3rd and 5th values are duplicated + idx = original[list(range(len(original))) + [5, 3]] + expected = np.array([False] * len(original) + [True, True], dtype=bool) + duplicated = idx.duplicated() + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + tm.assert_index_equal(idx.drop_duplicates(), original) + + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + result = idx.drop_duplicates(keep="last") + tm.assert_index_equal(result, idx[~expected]) + + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = np.array(base) + + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + result = idx.drop_duplicates(keep=False) + tm.assert_index_equal(result, idx[~expected]) + + with pytest.raises( + TypeError, + match=r"drop_duplicates\(\) got an unexpected keyword argument", + ): + idx.drop_duplicates(inplace=True) + + else: + expected = Series( + [False] * len(original), index=original.index, name="a" + ) + tm.assert_series_equal(original.duplicated(), expected) + result = original.drop_duplicates() + tm.assert_series_equal(result, original) + assert result is not original + + idx = original.index[list(range(len(original))) + [5, 3]] + values = original._values[list(range(len(original))) + [5, 3]] + s = Series(values, index=idx, name="a") + + expected = Series( + [False] * len(original) + [True, True], index=idx, name="a" + ) + tm.assert_series_equal(s.duplicated(), expected) + tm.assert_series_equal(s.drop_duplicates(), original) + + base = [False] * len(idx) + base[3] = True + base[5] = True + expected = Series(base, index=idx, name="a") + + tm.assert_series_equal(s.duplicated(keep="last"), expected) + tm.assert_series_equal( + s.drop_duplicates(keep="last"), s[~np.array(base)] + ) + + base = [False] * len(original) + [True, True] + base[3] = True + base[5] = True + expected = Series(base, index=idx, name="a") + + tm.assert_series_equal(s.duplicated(keep=False), expected) + tm.assert_series_equal( + s.drop_duplicates(keep=False), s[~np.array(base)] + ) + + s.drop_duplicates(inplace=True) + tm.assert_series_equal(s, original) + + def test_drop_duplicates_series_vs_dataframe(self): + # GH 14192 + df = pd.DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + pd.NaT, + pd.NaT, + ], + } + ) + for column in df.columns: + for keep in ["first", "last", False]: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) + + def test_fillna(self): + # # GH 11343 + # though Index.fillna and Series.fillna has separate impl, + # test here to confirm these works as the same + + for orig in self.objs: + + o = orig.copy() + values = o.values + + # values will not be changed + result = o.fillna(o.astype(object).values[0]) + if isinstance(o, Index): + tm.assert_index_equal(o, result) + else: + tm.assert_series_equal(o, result) + # check shallow_copied + assert o is not result + + for null_obj in [np.nan, None]: + for orig in self.objs: + o = orig.copy() + 
klass = type(o) + + if not self._allow_na_ops(o): + continue + + if needs_i8_conversion(o): + + values = o.astype(object).values + fill_value = values[0] + values[0:2] = pd.NaT + else: + values = o.values.copy() + fill_value = o.values[0] + values[0:2] = null_obj + + expected = [fill_value] * 2 + list(values[2:]) + + expected = klass(expected, dtype=orig.dtype) + o = klass(values) + + # check values has the same dtype as the original + assert o.dtype == orig.dtype + + result = o.fillna(fill_value) + if isinstance(o, Index): + tm.assert_index_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + # check shallow_copied + assert o is not result + + @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") + def test_memory_usage(self): + for o in self.objs: + res = o.memory_usage() + res_deep = o.memory_usage(deep=True) + + if is_object_dtype(o) or ( + isinstance(o, Series) and is_object_dtype(o.index) + ): + # if there are objects, only deep will pick them up + assert res_deep > res + else: + assert res == res_deep + + if isinstance(o, Series): + assert ( + o.memory_usage(index=False) + o.index.memory_usage() + ) == o.memory_usage(index=True) + + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = res_deep - sys.getsizeof(o) + assert abs(diff) < 100 + + def test_searchsorted(self): + # See gh-12238 + for o in self.objs: + index = np.searchsorted(o, max(o)) + assert 0 <= index <= len(o) + + index = np.searchsorted(o, max(o), sorter=range(len(o))) + assert 0 <= index <= len(o) + + def test_validate_bool_args(self): + invalid_values = [1, "True", [1, 2, 3], 5.0] + + for value in invalid_values: + with pytest.raises(ValueError): + self.int_series.drop_duplicates(inplace=value) + + def test_getitem(self): + for i in self.indexes: + s = pd.Series(i) + + assert i[0] == s.iloc[0] + assert i[5] == s.iloc[5] + assert i[-1] == s.iloc[-1] + + assert i[-1] == i[9] + + with pytest.raises(IndexError): + i[20] + with pytest.raises(IndexError): + s.iloc[20] + + @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) + @pytest.mark.parametrize( + "indexer", + [ + [True] * 10, + [False] * 10, + [True, False, True, True, False, False, True, True, False, True], + ], + ) + def test_bool_indexing(self, indexer_klass, indexer): + # GH 22533 + for idx in self.indexes: + exp_idx = [i for i in range(len(indexer)) if indexer[i]] + tm.assert_index_equal(idx[indexer_klass(indexer)], idx[exp_idx]) + s = pd.Series(idx) + tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) + + def test_get_indexer_non_unique_dtype_mismatch(self): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/venv/Lib/site-packages/pandas/tests/computation/__init__.py b/venv/Lib/site-packages/pandas/tests/computation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/computation/test_compat.py b/venv/Lib/site-packages/pandas/tests/computation/test_compat.py new file mode 100644 index 0000000..b3fbd8c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/computation/test_compat.py @@ -0,0 +1,49 @@ +from distutils.version import LooseVersion + +import pytest + +from pandas.compat._optional import VERSIONS + +import pandas as pd +from pandas.core.computation.engines import _engines +import pandas.core.computation.expr as 
expr + + +def test_compat(): + # test we have compat with our version of nu + + from pandas.core.computation.check import _NUMEXPR_INSTALLED + + try: + import numexpr as ne + + ver = ne.__version__ + if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]): + assert not _NUMEXPR_INSTALLED + else: + assert _NUMEXPR_INSTALLED + except ImportError: + pytest.skip("not testing numexpr version compat") + + +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", expr._parsers) +def test_invalid_numexpr_version(engine, parser): + def testit(): + a, b = 1, 2 # noqa + res = pd.eval("a + b", engine=engine, parser=parser) + assert res == 3 + + if engine == "numexpr": + try: + import numexpr as ne + except ImportError: + pytest.skip("no numexpr") + else: + if LooseVersion(ne.__version__) < LooseVersion(VERSIONS["numexpr"]): + with pytest.raises(ImportError): + testit() + else: + testit() + else: + testit() diff --git a/venv/Lib/site-packages/pandas/tests/computation/test_eval.py b/venv/Lib/site-packages/pandas/tests/computation/test_eval.py new file mode 100644 index 0000000..7f68abb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/computation/test_eval.py @@ -0,0 +1,2051 @@ +from distutils.version import LooseVersion +from functools import reduce +from itertools import product +import operator +from typing import Dict, Type +import warnings + +import numpy as np +from numpy.random import rand, randint, randn +import pytest + +from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar + +import pandas as pd +from pandas import DataFrame, Series, compat, date_range +import pandas._testing as tm +from pandas.core.computation import pytables +from pandas.core.computation.check import _NUMEXPR_VERSION +from pandas.core.computation.engines import NumExprClobberingError, _engines +import pandas.core.computation.expr as expr +from pandas.core.computation.expr import ( + BaseExprVisitor, + PandasExprVisitor, + PythonExprVisitor, +) +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR +from pandas.core.computation.ops import ( + _arith_ops_syms, + _binary_math_ops, + _binary_ops_dict, + _special_case_arith_ops_syms, + _unary_math_ops, +) + + +@pytest.fixture( + params=( + pytest.param( + engine, + marks=pytest.mark.skipif( + engine == "numexpr" and not _USE_NUMEXPR, + reason=f"numexpr enabled->{_USE_NUMEXPR}, " + f"installed->{_NUMEXPR_INSTALLED}", + ), + ) + for engine in _engines + ) +) # noqa +def engine(request): + return request.param + + +@pytest.fixture(params=expr._parsers) +def parser(request): + return request.param + + +@pytest.fixture +def ne_lt_2_6_9(): + if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + pytest.skip("numexpr is >= 2.6.9") + return "numexpr" + + +@pytest.fixture +def unary_fns_for_ne(): + if _NUMEXPR_INSTALLED: + if _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + return _unary_math_ops + else: + return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) + else: + pytest.skip("numexpr is not present") + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def _eval_single_bin(lhs, cmp1, rhs, engine): + c = _binary_ops_dict[cmp1] + if engine_has_neg_frac(engine): + try: + return c(lhs, rhs) + except ValueError as e: + if str(e).startswith( + "negative number cannot be raised to a fractional power" + ): + return np.nan + raise + return c(lhs, rhs) + + +def 
_series_and_2d_ndarray(lhs, rhs): + return ( + isinstance(lhs, Series) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + ) or (isinstance(rhs, Series) and isinstance(lhs, np.ndarray) and lhs.ndim > 1) + + +def _series_and_frame(lhs, rhs): + return (isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or ( + isinstance(rhs, Series) and isinstance(lhs, DataFrame) + ) + + +def _bool_and_frame(lhs, rhs): + return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) + + +def _is_py3_complex_incompat(result, expected): + return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result) + + +_good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms) + + +@td.skip_if_no_ne +class TestEvalNumexprPandas: + @classmethod + def setup_class(cls): + import numexpr as ne + + cls.ne = ne + cls.engine = "numexpr" + cls.parser = "pandas" + + @classmethod + def teardown_class(cls): + del cls.engine, cls.parser + if hasattr(cls, "ne"): + del cls.ne + + def setup_data(self): + nan_df1 = DataFrame(rand(10, 5)) + nan_df1[nan_df1 > 0.5] = np.nan + nan_df2 = DataFrame(rand(10, 5)) + nan_df2[nan_df2 > 0.5] = np.nan + + self.pandas_lhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), + nan_df1, + ) + self.pandas_rhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), + nan_df2, + ) + self.scalar_lhses = (randn(),) + self.scalar_rhses = (randn(),) + + self.lhses = self.pandas_lhses + self.scalar_lhses + self.rhses = self.pandas_rhses + self.scalar_rhses + + def setup_ops(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = "-", "~", "not " + + def setup_method(self, method): + self.setup_ops() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, _engines) + + def teardown_method(self, method): + del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses + del self.pandas_rhses, self.pandas_lhses, self.current_engines + + @pytest.mark.slow + @pytest.mark.parametrize( + "cmp1", + ["!=", "==", "<=", ">=", "<", ">"], + ids=["ne", "eq", "le", "ge", "lt", "gt"], + ) + @pytest.mark.parametrize("cmp2", [">", "<"], ids=["gt", "lt"]) + def test_complex_cmp_ops(self, cmp1, cmp2): + for lhs, rhs, binop in product(self.lhses, self.rhses, self.bin_ops): + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + + ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + self.check_equal(result, expected) + + def test_simple_cmp_ops(self): + bool_lhses = ( + DataFrame(tm.randbool(size=(10, 5))), + Series(tm.randbool((5,))), + tm.randbool(), + ) + bool_rhses = ( + DataFrame(tm.randbool(size=(10, 5))), + Series(tm.randbool((5,))), + tm.randbool(), + ) + for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): + self.check_simple_cmp_op(lhs, cmp_op, rhs) + + @pytest.mark.slow + def test_binary_arith_ops(self): + for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): + self.check_binary_arith_op(lhs, op, rhs) + + def test_modulus(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_modulus(lhs, "%", rhs) + + def test_floor_division(self): + for lhs, rhs in product(self.lhses, 
self.rhses): + self.check_floor_division(lhs, "//", rhs) + + @td.skip_if_windows + def test_pow(self): + # odd failure on win32 platform, so skip + for lhs, rhs in product(self.lhses, self.rhses): + self.check_pow(lhs, "**", rhs) + + @pytest.mark.slow + def test_single_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_single_invert_op(lhs, op, rhs) + + @pytest.mark.slow + def test_compound_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_compound_invert_op(lhs, op, rhs) + + @pytest.mark.slow + def test_chained_cmp_op(self): + mids = self.lhses + cmp_ops = "<", ">" + for lhs, cmp1, mid, cmp2, rhs in product( + self.lhses, cmp_ops, mids, cmp_ops, self.rhses + ): + self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + + def check_equal(self, result, expected): + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + elif isinstance(result, Series): + tm.assert_series_equal(result, expected) + elif isinstance(result, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + else: + assert result == expected + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + def check_operands(left, right, cmp_op): + return _eval_single_bin(left, cmp_op, right, self.engine) + + lhs_new = check_operands(lhs, mid, cmp1) + rhs_new = check_operands(mid, rhs, cmp2) + + if lhs_new is not None and rhs_new is not None: + ex1 = f"lhs {cmp1} mid {cmp2} rhs" + ex2 = f"lhs {cmp1} mid and mid {cmp2} rhs" + ex3 = f"(lhs {cmp1} mid) & (mid {cmp2} rhs)" + expected = _eval_single_bin(lhs_new, "&", rhs_new, self.engine) + + for ex in (ex1, ex2, ex3): + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + tm.assert_almost_equal(result, expected) + + def check_simple_cmp_op(self, lhs, cmp1, rhs): + ex = f"lhs {cmp1} rhs" + msg = ( + r"only list-like( or dict-like)? 
objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')bool(\]|')|" + "argument of type 'bool' is not iterable" + ) + if cmp1 in ("in", "not in") and not is_list_like(rhs): + with pytest.raises(TypeError, match=msg): + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) + else: + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + self.check_equal(result, expected) + + def check_binary_arith_op(self, lhs, arith1, rhs): + ex = f"lhs {arith1} rhs" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = _eval_single_bin(lhs, arith1, rhs, self.engine) + + tm.assert_almost_equal(result, expected) + ex = f"lhs {arith1} rhs {arith1} rhs" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + nlhs = _eval_single_bin(lhs, arith1, rhs, self.engine) + self.check_alignment(result, nlhs, rhs, arith1) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + + # direct numpy comparison + expected = self.ne.evaluate(f"nlhs {op} ghs") + tm.assert_numpy_array_equal(result.values, expected) + + # modulus, pow, and floor division require special casing + + def check_modulus(self, lhs, arith1, rhs): + ex = f"lhs {arith1} rhs" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs % rhs + + tm.assert_almost_equal(result, expected) + expected = self.ne.evaluate(f"expected {arith1} rhs") + if isinstance(result, (DataFrame, Series)): + tm.assert_almost_equal(result.values, expected) + else: + tm.assert_almost_equal(result, expected.item()) + + def check_floor_division(self, lhs, arith1, rhs): + ex = f"lhs {arith1} rhs" + + if self.engine == "python": + res = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs // rhs + self.check_equal(res, expected) + else: + msg = ( + r"unsupported operand type\(s\) for //: 'VariableNode' and " + "'VariableNode'" + ) + with pytest.raises(TypeError, match=msg): + pd.eval( + ex, + local_dict={"lhs": lhs, "rhs": rhs}, + engine=self.engine, + parser=self.parser, + ) + + def get_expected_pow_result(self, lhs, rhs): + try: + expected = _eval_single_bin(lhs, "**", rhs, self.engine) + except ValueError as e: + if str(e).startswith( + "negative number cannot be raised to a fractional power" + ): + if self.engine == "python": + pytest.skip(str(e)) + else: + expected = np.nan + else: + raise + return expected + + def check_pow(self, lhs, arith1, rhs): + ex = f"lhs {arith1} rhs" + expected = self.get_expected_pow_result(lhs, rhs) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + if ( + is_scalar(lhs) + and is_scalar(rhs) + and _is_py3_complex_incompat(result, expected) + ): + with pytest.raises(AssertionError): + tm.assert_numpy_array_equal(result, expected) + else: + tm.assert_almost_equal(result, expected) + + ex = f"(lhs {arith1} rhs) {arith1} rhs" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = self.get_expected_pow_result( + self.get_expected_pow_result(lhs, rhs), rhs + ) + tm.assert_almost_equal(result, expected) + + def check_single_invert_op(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + 
result = pd.eval("~elb", engine=self.engine, parser=self.parser) + tm.assert_almost_equal(expected, result) + + for engine in self.current_engines: + tm.assert_almost_equal( + result, pd.eval("~elb", engine=engine, parser=self.parser) + ) + + def check_compound_invert_op(self, lhs, cmp1, rhs): + skip_these = ["in", "not in"] + ex = f"~(lhs {cmp1} rhs)" + + msg = ( + r"only list-like( or dict-like)? objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')float(\]|')|" + "argument of type 'float' is not iterable" + ) + if is_scalar(rhs) and cmp1 in skip_these: + with pytest.raises(TypeError, match=msg): + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) + else: + # compound + if is_scalar(lhs) and is_scalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + if is_scalar(expected): + expected = not expected + else: + expected = ~expected + result = pd.eval(ex, engine=self.engine, parser=self.parser) + tm.assert_almost_equal(expected, result) + + # make sure the other engines work the same as this one + for engine in self.current_engines: + ev = pd.eval(ex, engine=self.engine, parser=self.parser) + tm.assert_almost_equal(ev, result) + + def ex(self, op, var_name="lhs"): + return f"{op}{var_name}" + + def test_frame_invert(self): + expr = self.ex("~") + + # ~ ## + # frame + # float always raises + lhs = DataFrame(randn(5, 2)) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with pytest.raises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + # int raises on numexpr + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # bool always works + lhs = DataFrame(rand(5, 2) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # object raises + lhs = DataFrame({"b": ["a", 1, 2.0], "c": rand(3) > 0.5}) + if self.engine == "numexpr": + with pytest.raises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with pytest.raises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_series_invert(self): + # ~ #### + expr = self.ex("~") + + # series + # float raises + lhs = Series(randn(5)) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with pytest.raises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + # int raises on numexpr + lhs = Series(randint(5, size=5)) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # bool + lhs = Series(rand(5) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # float + # int + # bool + + # object + lhs = Series(["a", 1, 2.0]) + if self.engine == "numexpr": + with 
pytest.raises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with pytest.raises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_frame_negate(self): + expr = self.ex("-") + + # float + lhs = DataFrame(randn(5, 2)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + def test_series_negate(self): + expr = self.ex("-") + + # float + lhs = Series(randn(5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == "numexpr": + with pytest.raises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + def test_frame_pos(self): + expr = self.ex("+") + + # float + lhs = DataFrame(randn(5, 2)) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(expect, result) + + def test_series_pos(self): + expr = self.ex("+") + + # float + lhs = Series(randn(5)) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_series_equal(expect, result) + + def test_scalar_unary(self): + with pytest.raises(TypeError): + pd.eval("~1.0", engine=self.engine, parser=self.parser) + + assert pd.eval("-1.0", parser=self.parser, engine=self.engine) == -1.0 + assert pd.eval("+1.0", parser=self.parser, engine=self.engine) == +1.0 + assert pd.eval("~1", parser=self.parser, engine=self.engine) == ~1 + assert pd.eval("-1", parser=self.parser, engine=self.engine) == -1 + assert pd.eval("+1", parser=self.parser, engine=self.engine) == +1 + assert pd.eval("~True", parser=self.parser, engine=self.engine) == ~True + assert pd.eval("~False", parser=self.parser, engine=self.engine) == ~False + assert pd.eval("-True", parser=self.parser, 
engine=self.engine) == -True + assert pd.eval("-False", parser=self.parser, engine=self.engine) == -False + assert pd.eval("+True", parser=self.parser, engine=self.engine) == +True + assert pd.eval("+False", parser=self.parser, engine=self.engine) == +False + + def test_unary_in_array(self): + # GH 11235 + tm.assert_numpy_array_equal( + pd.eval( + "[-True, True, ~True, +True," + "-False, False, ~False, +False," + "-37, 37, ~37, +37]" + ), + np.array( + [ + -True, + True, + ~True, + +True, + -False, + False, + ~False, + +False, + -37, + 37, + ~37, + +37, + ], + dtype=np.object_, + ), + ) + + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + def test_float_comparison_bin_op(self, dtype): + # GH 16363 + df = pd.DataFrame({"x": np.array([0], dtype=dtype)}) + res = df.eval("x < -0.1") + assert res.values == np.array([False]) + + res = df.eval("-5 > x") + assert res.values == np.array([False]) + + def test_disallow_scalar_bool_ops(self): + exprs = "1 or 2", "1 and 2" + exprs += "a and b", "a or b" + exprs += ("1 or 2 and (3 + 2) > 3",) + exprs += ("2 * x > 2 or 1 and 2",) + exprs += ("2 * df > 3 and 1 or a",) + + x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa + for ex in exprs: + with pytest.raises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + + def test_identical(self): + # see gh-10546 + x = 1 + result = pd.eval("x", engine=self.engine, parser=self.parser) + assert result == 1 + assert is_scalar(result) + + x = 1.5 + result = pd.eval("x", engine=self.engine, parser=self.parser) + assert result == 1.5 + assert is_scalar(result) + + x = False + result = pd.eval("x", engine=self.engine, parser=self.parser) + assert not result + assert is_bool(result) + assert is_scalar(result) + + x = np.array([1]) + result = pd.eval("x", engine=self.engine, parser=self.parser) + tm.assert_numpy_array_equal(result, np.array([1])) + assert result.shape == (1,) + + x = np.array([1.5]) + result = pd.eval("x", engine=self.engine, parser=self.parser) + tm.assert_numpy_array_equal(result, np.array([1.5])) + assert result.shape == (1,) + + x = np.array([False]) # noqa + result = pd.eval("x", engine=self.engine, parser=self.parser) + tm.assert_numpy_array_equal(result, np.array([False])) + assert result.shape == (1,) + + def test_line_continuation(self): + # GH 11149 + exp = """1 + 2 * \ + 5 - 1 + 2 """ + result = pd.eval(exp, engine=self.engine, parser=self.parser) + assert result == 12 + + def test_float_truncation(self): + # GH 14241 + exp = "1000000000.006" + result = pd.eval(exp, engine=self.engine, parser=self.parser) + expected = np.float64(exp) + assert result == expected + + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + cutoff = 1000000000.0006 + result = df.query(f"A < {cutoff:.4f}") + assert result.empty + + cutoff = 1000000000.0010 + result = df.query(f"A > {cutoff:.4f}") + expected = df.loc[[1, 2], :] + tm.assert_frame_equal(expected, result) + + exact = 1000000000.0011 + result = df.query(f"A == {exact:.4f}") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + def test_disallow_python_keywords(self): + # GH 18221 + df = pd.DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"]) + msg = "Python keyword not valid identifier in numexpr query" + with pytest.raises(SyntaxError, match=msg): + df.query("class == 0") + + df = pd.DataFrame() + df.index.name = "lambda" + with pytest.raises(SyntaxError, match=msg): + df.query("lambda == 0") + + +@td.skip_if_no_ne +class 
TestEvalNumexprPython(TestEvalNumexprPandas): + @classmethod + def setup_class(cls): + super().setup_class() + import numexpr as ne + + cls.ne = ne + cls.engine = "numexpr" + cls.parser = "python" + + def setup_ops(self): + self.cmp_ops = list( + filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms) + ) + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")] + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = "+", "-", "~" + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + ex1 = f"lhs {cmp1} mid {cmp2} rhs" + with pytest.raises(NotImplementedError): + pd.eval(ex1, engine=self.engine, parser=self.parser) + + +class TestEvalPythonPython(TestEvalNumexprPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "python" + cls.parser = "python" + + def check_modulus(self, lhs, arith1, rhs): + ex = f"lhs {arith1} rhs" + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + expected = lhs % rhs + tm.assert_almost_equal(result, expected) + + expected = _eval_single_bin(expected, arith1, rhs, self.engine) + tm.assert_almost_equal(result, expected) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = eval(f"nlhs {op} ghs") + tm.assert_almost_equal(result, expected) + + +class TestEvalPythonPandas(TestEvalPythonPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "python" + cls.parser = "pandas" + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs) + + +f = lambda *args, **kwargs: np.random.randn() + + +# ------------------------------------- +# gh-12388: Typecasting rules consistency with python + + +class TestTypeCasting: + @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) + # maybe someday... 
numexpr has too many upcasting rules now + # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) + @pytest.mark.parametrize("dt", [np.float32, np.float64]) + def test_binop_typecasting(self, engine, parser, op, dt): + df = tm.makeCustomDataframe(5, 3, data_gen_f=f, dtype=dt) + s = f"df {op} 3" + res = pd.eval(s, engine=engine, parser=parser) + assert df.values.dtype == dt + assert res.values.dtype == dt + tm.assert_frame_equal(res, eval(s)) + + s = f"3 {op} df" + res = pd.eval(s, engine=engine, parser=parser) + assert df.values.dtype == dt + assert res.values.dtype == dt + tm.assert_frame_equal(res, eval(s)) + + +# ------------------------------------- +# Basic and complex alignment + + +def _is_datetime(x): + return issubclass(x.dtype.type, np.datetime64) + + +def should_warn(*args): + not_mono = not any(map(operator.attrgetter("is_monotonic"), args)) + only_one_dt = reduce(operator.xor, map(_is_datetime, args)) + return not_mono and only_one_dt + + +class TestAlignment: + + index_types = "i", "u", "dt" + lhs_index_types = index_types + ("s",) # 'p' + + def test_align_nested_unary_op(self, engine, parser): + s = "df * ~2" + df = tm.makeCustomDataframe(5, 3, data_gen_f=f) + res = pd.eval(s, engine=engine, parser=parser) + tm.assert_frame_equal(res, df * ~2) + + def test_basic_frame_alignment(self, engine, parser): + args = product(self.lhs_index_types, self.index_types, self.index_types) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always", RuntimeWarning) + for lr_idx_type, rr_idx_type, c_idx_type in args: + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type + ) + df2 = tm.makeCustomDataframe( + 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type + ) + # only warns if not monotonic and not sortable + if should_warn(df.index, df2.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("df + df2", engine=engine, parser=parser) + else: + res = pd.eval("df + df2", engine=engine, parser=parser) + tm.assert_frame_equal(res, df + df2) + + def test_frame_comparison(self, engine, parser): + args = product(self.lhs_index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + res = pd.eval("df < 2", engine=engine, parser=parser) + tm.assert_frame_equal(res, df < 2) + + df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) + res = pd.eval("df < df3", engine=engine, parser=parser) + tm.assert_frame_equal(res, df < df3) + + @pytest.mark.slow + def test_medium_complex_frame_alignment(self, engine, parser): + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("always", RuntimeWarning) + + for r1, c1, r2, c2 in args: + df = tm.makeCustomDataframe( + 3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 + ) + df2 = tm.makeCustomDataframe( + 4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + df3 = tm.makeCustomDataframe( + 5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + if should_warn(df.index, df2.index, df3.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) + else: + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) + tm.assert_frame_equal(res, df + df2 + df3) + + def test_basic_frame_series_alignment(self, engine, parser): + def testit(r_idx_type, c_idx_type, 
index_name): + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("df + s", engine=engine, parser=parser) + else: + res = pd.eval("df + s", engine=engine, parser=parser) + + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else df + s + else: + expected = df + s + tm.assert_frame_equal(res, expected) + + args = product(self.lhs_index_types, self.index_types, ("index", "columns")) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always", RuntimeWarning) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + def test_basic_series_frame_alignment(self, engine, parser): + def testit(r_idx_type, c_idx_type, index_name): + df = tm.makeCustomDataframe( + 10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if should_warn(s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("s + df", engine=engine, parser=parser) + else: + res = pd.eval("s + df", engine=engine, parser=parser) + + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else s + df + else: + expected = s + df + tm.assert_frame_equal(res, expected) + + # only test dt with dt, otherwise weird joins result + args = product(["i", "u", "s"], ["i", "u", "s"], ("index", "columns")) + with warnings.catch_warnings(record=True): + # avoid warning about comparing strings and ints + warnings.simplefilter("ignore", RuntimeWarning) + + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + # dt with dt + args = product(["dt"], ["dt"], ("index", "columns")) + with warnings.catch_warnings(record=True): + # avoid warning about comparing strings and ints + warnings.simplefilter("ignore", RuntimeWarning) + + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + def test_series_frame_commutativity(self, engine, parser): + args = product( + self.lhs_index_types, self.index_types, ("+", "*"), ("index", "columns") + ) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("always", RuntimeWarning) + for r_idx_type, c_idx_type, op, index_name in args: + df = tm.makeCustomDataframe( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = f"s {op} df" + rhs = f"df {op} s" + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + a = pd.eval(lhs, engine=engine, parser=parser) + with tm.assert_produces_warning(RuntimeWarning): + b = pd.eval(rhs, engine=engine, parser=parser) + else: + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) + + if r_idx_type != "dt" and c_idx_type != "dt": + if engine == "numexpr": + tm.assert_frame_equal(a, b) + + @pytest.mark.slow + def test_complex_series_frame_alignment(self, engine, parser): + import random + + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) + n = 3 + m1 = 5 + m2 = 2 * m1 + + with warnings.catch_warnings(record=True): + warnings.simplefilter("always", RuntimeWarning) + for r1, r2, c1, c2 
in args: + index_name = random.choice(["index", "columns"]) + obj_name = random.choice(["df", "df2"]) + + df = tm.makeCustomDataframe( + m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1 + ) + df2 = tm.makeCustomDataframe( + m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 + ) + index = getattr(locals().get(obj_name), index_name) + s = Series(np.random.randn(n), index[:n]) + + if r2 == "dt" or c2 == "dt": + if engine == "numexpr": + expected2 = df2.add(s) + else: + expected2 = df2 + s + else: + expected2 = df2 + s + + if r1 == "dt" or c1 == "dt": + if engine == "numexpr": + expected = expected2.add(df) + else: + expected = expected2 + df + else: + expected = expected2 + df + + if should_warn(df2.index, s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval("df2 + s + df", engine=engine, parser=parser) + else: + res = pd.eval("df2 + s + df", engine=engine, parser=parser) + assert res.shape == expected.shape + tm.assert_frame_equal(res, expected) + + def test_performance_warning_for_poor_alignment(self, engine, parser): + df = DataFrame(randn(1000, 10)) + s = Series(randn(10000)) + if engine == "numexpr": + seen = PerformanceWarning + else: + seen = False + + with tm.assert_produces_warning(seen): + pd.eval("df + s", engine=engine, parser=parser) + + s = Series(randn(1000)) + with tm.assert_produces_warning(False): + pd.eval("df + s", engine=engine, parser=parser) + + df = DataFrame(randn(10, 10000)) + s = Series(randn(10000)) + with tm.assert_produces_warning(False): + pd.eval("df + s", engine=engine, parser=parser) + + df = DataFrame(randn(10, 10)) + s = Series(randn(10000)) + + is_python_engine = engine == "python" + + if not is_python_engine: + wrn = PerformanceWarning + else: + wrn = False + + with tm.assert_produces_warning(wrn) as w: + pd.eval("df + s", engine=engine, parser=parser) + + if not is_python_engine: + assert len(w) == 1 + msg = str(w[0].message) + loged = np.log10(s.size - df.shape[1]) + expected = ( + f"Alignment difference on axis 1 is larger " + f"than an order of magnitude on term 'df', " + f"by more than {loged:.4g}; performance may suffer" + ) + assert msg == expected + + +# ------------------------------------ +# Slightly more complex ops + + +@td.skip_if_no_ne +class TestOperationsNumExprPandas: + @classmethod + def setup_class(cls): + cls.engine = "numexpr" + cls.parser = "pandas" + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + @classmethod + def teardown_class(cls): + del cls.engine, cls.parser + + def eval(self, *args, **kwargs): + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 + return pd.eval(*args, **kwargs) + + def test_simple_arith_ops(self): + ops = self.arith_ops + + for op in filter(lambda x: x != "//", ops): + ex = f"1 {op} 1" + ex2 = f"x {op} 1" + ex3 = f"1 {op} (x + 1)" + + if op in ("in", "not in"): + msg = "argument of type 'int' is not iterable" + with pytest.raises(TypeError, match=msg): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + expec = _eval_single_bin(1, op, 1, self.engine) + x = self.eval(ex, engine=self.engine, parser=self.parser) + assert x == expec + + expec = _eval_single_bin(x, op, 1, self.engine) + y = self.eval( + ex2, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) + assert y == expec + + expec = _eval_single_bin(1, op, x + 1, self.engine) + y = self.eval( + ex3, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) + assert y == expec + + def test_simple_bool_ops(self): + for 
op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = f"{lhs} {op} {rhs}" + res = self.eval(ex) + exp = eval(ex) + assert res == exp + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = f"{lhs} {op} {rhs}" + res = self.eval(ex) + exp = eval(ex) + assert res == exp + + def test_4d_ndarray_fails(self): + x = randn(3, 4, 5, 6) + y = Series(randn(10)) + with pytest.raises(NotImplementedError): + self.eval("x + y", local_dict={"x": x, "y": y}) + + def test_constant(self): + x = self.eval("1") + assert x == 1 + + def test_single_variable(self): + df = DataFrame(randn(10, 2)) + df2 = self.eval("df", local_dict={"df": df}) + tm.assert_frame_equal(df, df2) + + def test_truediv(self): + s = np.array([1]) + ex = "s / 1" + d = {"s": s} # noqa + + # FutureWarning: The `truediv` parameter in pd.eval is deprecated and will be + # removed in a future version. + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=False) + tm.assert_numpy_array_equal(res, np.array([1.0])) + + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=True) + tm.assert_numpy_array_equal(res, np.array([1.0])) + + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=True) + expec = 0.5 + assert res == expec + + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=False) + expec = 0.5 + assert res == expec + + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=False) + expec = 0.5 + assert res == expec + + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=True) + expec = 0.5 + assert res == expec + + def test_failing_subscript_with_name_error(self): + df = DataFrame(np.random.randn(5, 3)) # noqa + with pytest.raises(NameError): + self.eval("df[x > 2] > 2") + + def test_lhs_expression_subscript(self): + df = DataFrame(np.random.randn(5, 3)) + result = self.eval("(df + 1)[df > 2]", local_dict={"df": df}) + expected = (df + 1)[df > 2] + tm.assert_frame_equal(result, expected) + + def test_attr_expression(self): + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) + expr1 = "df.a < df.b" + expec1 = df.a < df.b + expr2 = "df.a + df.b + df.c" + expec2 = df.a + df.b + df.c + expr3 = "df.a + df.b + df.c[df.b < 0]" + expec3 = df.a + df.b + df.c[df.b < 0] + exprs = expr1, expr2, expr3 + expecs = expec1, expec2, expec3 + for e, expec in zip(exprs, expecs): + tm.assert_series_equal(expec, self.eval(e, local_dict={"df": df})) + + def test_assignment_fails(self): + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) + df2 = DataFrame(np.random.randn(5, 3)) + expr1 = "df = df2" + msg = "cannot assign without a target object" + with pytest.raises(ValueError, match=msg): + self.eval(expr1, local_dict={"df": df, "df2": df2}) + + def test_assignment_column(self): + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) + orig_df = df.copy() + + # multiple assignees + with pytest.raises(SyntaxError, match="invalid syntax"): + df.eval("d c = a + b") + + # invalid assignees + msg = "left hand side of an assignment must be a single name" + with pytest.raises(SyntaxError, match=msg): + df.eval("d,c = a + b") + if compat.PY38: + msg = "cannot assign to function call" + else: + msg = "can't assign to function call" + with pytest.raises(SyntaxError, match=msg): + df.eval('Timestamp("20131001") = a + b') + + # single assignment - existing variable + 
expected = orig_df.copy() + expected["a"] = expected["a"] + expected["b"] + df = orig_df.copy() + df.eval("a = a + b", inplace=True) + tm.assert_frame_equal(df, expected) + + # single assignment - new variable + expected = orig_df.copy() + expected["c"] = expected["a"] + expected["b"] + df = orig_df.copy() + df.eval("c = a + b", inplace=True) + tm.assert_frame_equal(df, expected) + + # with a local name overlap + def f(): + df = orig_df.copy() + a = 1 # noqa + df.eval("a = 1 + b", inplace=True) + return df + + df = f() + expected = orig_df.copy() + expected["a"] = 1 + expected["b"] + tm.assert_frame_equal(df, expected) + + df = orig_df.copy() + + def f(): + a = 1 # noqa + old_a = df.a.copy() + df.eval("a = a + b", inplace=True) + result = old_a + df.b + tm.assert_series_equal(result, df.a, check_names=False) + assert result.name is None + + f() + + # multiple assignment + df = orig_df.copy() + df.eval("c = a + b", inplace=True) + msg = "can only assign a single expression" + with pytest.raises(SyntaxError, match=msg): + df.eval("c = a = b") + + # explicit targets + df = orig_df.copy() + self.eval("c = df.a + df.b", local_dict={"df": df}, target=df, inplace=True) + expected = orig_df.copy() + expected["c"] = expected["a"] + expected["b"] + tm.assert_frame_equal(df, expected) + + def test_column_in(self): + # GH 11235 + df = DataFrame({"a": [11], "b": [-32]}) + result = df.eval("a in [11, -32]") + expected = Series([True]) + tm.assert_series_equal(result, expected) + + def assignment_not_inplace(self): + # see gh-9297 + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) + + actual = df.eval("c = a + b", inplace=False) + assert actual is not None + + expected = df.copy() + expected["c"] = expected["a"] + expected["b"] + tm.assert_frame_equal(df, expected) + + def test_multi_line_expression(self): + # GH 11149 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = df.copy() + + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + ans = df.eval( + """ + c = a + b + d = c + b""", + inplace=True, + ) + tm.assert_frame_equal(expected, df) + assert ans is None + + expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + ans = df.eval( + """ + a = a - 1 + e = a + 2""", + inplace=True, + ) + tm.assert_frame_equal(expected, df) + assert ans is None + + # multi-line not valid if not all assignments + with pytest.raises(ValueError): + df.eval( + """ + a = b + 2 + b - 2""", + inplace=False, + ) + + def test_multi_line_expression_not_inplace(self): + # GH 11149 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = df.copy() + + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + df = df.eval( + """ + c = a + b + d = c + b""", + inplace=False, + ) + tm.assert_frame_equal(expected, df) + + expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + df = df.eval( + """ + a = a - 1 + e = a + 2""", + inplace=False, + ) + tm.assert_frame_equal(expected, df) + + def test_multi_line_expression_local_variable(self): + # GH 15342 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = df.copy() + + local_var = 7 + expected["c"] = expected["a"] * local_var + expected["d"] = expected["c"] + local_var + ans = df.eval( + """ + c = a * @local_var + d = c + @local_var + """, + inplace=True, + ) + tm.assert_frame_equal(expected, df) + assert ans is None + + def test_multi_line_expression_callable_local_variable(self): + # 26426 + df = pd.DataFrame({"a": [1, 2, 3], 
"b": [4, 5, 6]}) + + def local_func(a, b): + return b + + expected = df.copy() + expected["c"] = expected["a"] * local_func(1, 7) + expected["d"] = expected["c"] + local_func(1, 7) + ans = df.eval( + """ + c = a * @local_func(1, 7) + d = c + @local_func(1, 7) + """, + inplace=True, + ) + tm.assert_frame_equal(expected, df) + assert ans is None + + def test_multi_line_expression_callable_local_variable_with_kwargs(self): + # 26426 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + def local_func(a, b): + return b + + expected = df.copy() + expected["c"] = expected["a"] * local_func(b=7, a=1) + expected["d"] = expected["c"] + local_func(b=7, a=1) + ans = df.eval( + """ + c = a * @local_func(b=7, a=1) + d = c + @local_func(b=7, a=1) + """, + inplace=True, + ) + tm.assert_frame_equal(expected, df) + assert ans is None + + def test_assignment_in_query(self): + # GH 8664 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + with pytest.raises(ValueError): + df.query("a = 1") + tm.assert_frame_equal(df, df_orig) + + def test_query_inplace(self): + # see gh-11149 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = df.copy() + expected = expected[expected["a"] == 2] + df.query("a == 2", inplace=True) + tm.assert_frame_equal(expected, df) + + df = {} + expected = {"a": 3} + + self.eval("a = 1 + 2", target=df, inplace=True) + tm.assert_dict_equal(df, expected) + + @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], np.array([]), (1, 3)]) + @pytest.mark.filterwarnings("ignore::FutureWarning") + def test_cannot_item_assign(self, invalid_target): + msg = "Cannot assign expression output to target" + expression = "a = 1 + 2" + + with pytest.raises(ValueError, match=msg): + self.eval(expression, target=invalid_target, inplace=True) + + if hasattr(invalid_target, "copy"): + with pytest.raises(ValueError, match=msg): + self.eval(expression, target=invalid_target, inplace=False) + + @pytest.mark.parametrize("invalid_target", [1, "cat", (1, 3)]) + def test_cannot_copy_item(self, invalid_target): + msg = "Cannot return a copy of the target" + expression = "a = 1 + 2" + + with pytest.raises(ValueError, match=msg): + self.eval(expression, target=invalid_target, inplace=False) + + @pytest.mark.parametrize("target", [1, "cat", [1, 2], np.array([]), (1, 3), {1: 2}]) + def test_inplace_no_assignment(self, target): + expression = "1 + 2" + + assert self.eval(expression, target=target, inplace=False) == 3 + + msg = "Cannot operate inplace if there is no assignment" + with pytest.raises(ValueError, match=msg): + self.eval(expression, target=target, inplace=True) + + def test_basic_period_index_boolean_expression(self): + df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + + e = df < 2 + r = self.eval("df < 2", local_dict={"df": df}) + x = df < 2 + + tm.assert_frame_equal(r, e) + tm.assert_frame_equal(x, e) + + def test_basic_period_index_subscript_expression(self): + df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df < 2 + 3]", local_dict={"df": df}) + e = df[df < 2 + 3] + tm.assert_frame_equal(r, e) + + def test_nested_period_index_subscript_expression(self): + df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df[df < 2] < 2] + df * 2", local_dict={"df": df}) + e = df[df[df < 2] < 2] + df * 2 + tm.assert_frame_equal(r, e) + + def test_date_boolean(self): + df = DataFrame(randn(5, 3)) + df["dates1"] = date_range("1/1/2012", periods=5) 
+ res = self.eval( + "df.dates1 < 20130101", + local_dict={"df": df}, + engine=self.engine, + parser=self.parser, + ) + expec = df.dates1 < "20130101" + tm.assert_series_equal(res, expec, check_names=False) + + def test_simple_in_ops(self): + if self.parser != "python": + res = pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) + assert not res + + res = pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("[3] not in (1, 2)", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("[3] in ([3], 2)", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("[[3]] in [[[3]], 2]", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval("(3,) in [(3,), 2]", engine=self.engine, parser=self.parser) + assert res + + res = pd.eval( + "(3,) not in [(3,), 2]", engine=self.engine, parser=self.parser + ) + assert not res + + res = pd.eval( + "[(3,)] in [[(3,)], 2]", engine=self.engine, parser=self.parser + ) + assert res + else: + with pytest.raises(NotImplementedError): + pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError): + pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError): + pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError): + pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError): + pd.eval( + "[(3,)] in (1, 2, [(3,)])", engine=self.engine, parser=self.parser + ) + with pytest.raises(NotImplementedError): + pd.eval( + "[3] not in (1, 2, [[3]])", engine=self.engine, parser=self.parser + ) + + +@td.skip_if_no_ne +class TestOperationsNumExprPython(TestOperationsNumExprPandas): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "numexpr" + cls.parser = "python" + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + + def test_check_many_exprs(self): + a = 1 # noqa + expr = " * ".join("a" * 33) + expected = 1 + res = pd.eval(expr, engine=self.engine, parser=self.parser) + assert res == expected + + def test_fails_and(self): + df = DataFrame(np.random.randn(5, 3)) + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval( + "df > 2 and df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) + + def test_fails_or(self): + df = DataFrame(np.random.randn(5, 3)) + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval( + "df > 2 or df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) + + def test_fails_not(self): + df = DataFrame(np.random.randn(5, 3)) + msg = "'Not' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval( + "not df > 2", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) + + def test_fails_ampersand(self): + df = DataFrame(np.random.randn(5, 3)) # noqa + ex = "(df + 2)[df > 1] > 0 & (df > 0)" + with pytest.raises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_fails_pipe(self): + df = DataFrame(np.random.randn(5, 3)) # noqa + ex = "(df + 2)[df > 
1] > 0 | (df > 0)" + with pytest.raises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = f"{lhs} {op} {rhs}" + if op in ("and", "or"): + with pytest.raises(NotImplementedError): + self.eval(ex) + else: + res = self.eval(ex) + exp = eval(ex) + assert res == exp + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = f"lhs {op} rhs" + if op in ("and", "or"): + with pytest.raises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + res = pd.eval(ex, engine=self.engine, parser=self.parser) + exp = eval(ex) + assert res == exp + + +class TestOperationsPythonPython(TestOperationsNumExprPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = cls.parser = "python" + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + + +class TestOperationsPythonPandas(TestOperationsNumExprPandas): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "python" + cls.parser = "pandas" + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + +@td.skip_if_no_ne +class TestMathPythonPython: + @classmethod + def setup_class(cls): + cls.engine = "python" + cls.parser = "pandas" + cls.unary_fns = _unary_math_ops + cls.binary_fns = _binary_math_ops + + @classmethod + def teardown_class(cls): + del cls.engine, cls.parser + + def eval(self, *args, **kwargs): + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 + return pd.eval(*args, **kwargs) + + def test_unary_functions(self, unary_fns_for_ne): + df = DataFrame({"a": np.random.randn(10)}) + a = df.a + + for fn in unary_fns_for_ne: + expr = f"{fn}(a)" + got = self.eval(expr) + with np.errstate(all="ignore"): + expect = getattr(np, fn)(a) + tm.assert_series_equal(got, expect, check_names=False) + + def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, unary_fns_for_ne): + for fn in ("floor", "ceil"): + msg = f'"{fn}" is not a supported function' + with pytest.raises(ValueError, match=msg): + expr = f"{fn}(100)" + self.eval(expr) + + def test_binary_functions(self): + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + a = df.a + b = df.b + for fn in self.binary_fns: + expr = f"{fn}(a, b)" + got = self.eval(expr) + with np.errstate(all="ignore"): + expect = getattr(np, fn)(a, b) + tm.assert_almost_equal(got, expect, check_names=False) + + def test_df_use_case(self): + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval( + "e = arctan2(sin(a), b)", + engine=self.engine, + parser=self.parser, + inplace=True, + ) + got = df.e + expect = np.arctan2(np.sin(df.a), df.b) + tm.assert_series_equal(got, expect, check_names=False) + + def test_df_arithmetic_subexpression(self): + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("e = sin(a + b)", engine=self.engine, parser=self.parser, inplace=True) + got = df.e + expect = np.sin(df.a + df.b) + tm.assert_series_equal(got, expect, check_names=False) + + def check_result_type(self, dtype, expect_dtype): + df = DataFrame({"a": np.random.randn(10).astype(dtype)}) + assert df.a.dtype == dtype + df.eval("b = sin(a)", engine=self.engine, parser=self.parser, inplace=True) + got = df.b 
+ expect = np.sin(df.a) + assert expect.dtype == got.dtype + assert expect_dtype == got.dtype + tm.assert_series_equal(got, expect, check_names=False) + + def test_result_types(self): + self.check_result_type(np.int32, np.float64) + self.check_result_type(np.int64, np.float64) + self.check_result_type(np.float32, np.float32) + self.check_result_type(np.float64, np.float64) + + @td.skip_if_windows + def test_result_complex128(self): + # xref https://github.com/pandas-dev/pandas/issues/12293 + # this fails on Windows, apparently a floating point precision issue + + # Did not test complex64 because DataFrame is converting it to + # complex128. Due to https://github.com/pandas-dev/pandas/issues/10952 + self.check_result_type(np.complex128, np.complex128) + + def test_undefined_func(self): + df = DataFrame({"a": np.random.randn(10)}) + msg = '"mysin" is not a supported function' + + with pytest.raises(ValueError, match=msg): + df.eval("mysin(a)", engine=self.engine, parser=self.parser) + + def test_keyword_arg(self): + df = DataFrame({"a": np.random.randn(10)}) + msg = 'Function "sin" does not support keyword arguments' + + with pytest.raises(TypeError, match=msg): + df.eval("sin(x=a)", engine=self.engine, parser=self.parser) + + +class TestMathPythonPandas(TestMathPythonPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "python" + cls.parser = "pandas" + + +class TestMathNumExprPandas(TestMathPythonPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "numexpr" + cls.parser = "pandas" + + +class TestMathNumExprPython(TestMathPythonPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "numexpr" + cls.parser = "python" + + +_var_s = randn(10) + + +class TestScope: + def test_global_scope(self, engine, parser): + e = "_var_s * 2" + tm.assert_numpy_array_equal( + _var_s * 2, pd.eval(e, engine=engine, parser=parser) + ) + + def test_no_new_locals(self, engine, parser): + x = 1 # noqa + lcls = locals().copy() + pd.eval("x + 1", local_dict=lcls, engine=engine, parser=parser) + lcls2 = locals().copy() + lcls2.pop("lcls") + assert lcls == lcls2 + + def test_no_new_globals(self, engine, parser): + x = 1 # noqa + gbls = globals().copy() + pd.eval("x + 1", engine=engine, parser=parser) + gbls2 = globals().copy() + assert gbls == gbls2 + + +@td.skip_if_no_ne +def test_invalid_engine(): + msg = "Invalid engine 'asdf' passed" + with pytest.raises(KeyError, match=msg): + pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf") + + +@td.skip_if_no_ne +def test_invalid_parser(): + msg = "Invalid parser 'asdf' passed" + with pytest.raises(KeyError, match=msg): + pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") + + +_parsers: Dict[str, Type[BaseExprVisitor]] = { + "python": PythonExprVisitor, + "pytables": pytables.PyTablesExprVisitor, + "pandas": PandasExprVisitor, +} + + +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", _parsers) +def test_disallowed_nodes(engine, parser): + VisitorClass = _parsers[parser] + uns_ops = VisitorClass.unsupported_nodes + inst = VisitorClass("x + 1", engine, parser) + + for ops in uns_ops: + with pytest.raises(NotImplementedError): + getattr(inst, ops)() + + +def test_syntax_error_exprs(engine, parser): + e = "s +" + with pytest.raises(SyntaxError): + pd.eval(e, engine=engine, parser=parser) + + +def test_name_error_exprs(engine, parser): + e = "s + t" + with pytest.raises(NameError): + pd.eval(e, engine=engine, parser=parser) + + +def 
test_invalid_local_variable_reference(engine, parser): + a, b = 1, 2 # noqa + exprs = "a + @b", "@a + b", "@a + @b" + + for _expr in exprs: + if parser != "pandas": + with pytest.raises(SyntaxError, match="The '@' prefix is only"): + pd.eval(_expr, engine=engine, parser=parser) + else: + with pytest.raises(SyntaxError, match="The '@' prefix is not"): + pd.eval(_expr, engine=engine, parser=parser) + + +def test_numexpr_builtin_raises(engine, parser): + sin, dotted_line = 1, 2 + if engine == "numexpr": + msg = "Variables in expression .+" + with pytest.raises(NumExprClobberingError, match=msg): + pd.eval("sin + dotted_line", engine=engine, parser=parser) + else: + res = pd.eval("sin + dotted_line", engine=engine, parser=parser) + assert res == sin + dotted_line + + +def test_bad_resolver_raises(engine, parser): + cannot_resolve = 42, 3.0 + with pytest.raises(TypeError, match="Resolver of type .+"): + pd.eval("1 + 2", resolvers=cannot_resolve, engine=engine, parser=parser) + + +def test_empty_string_raises(engine, parser): + # GH 13139 + with pytest.raises(ValueError, match="expr cannot be an empty string"): + pd.eval("", engine=engine, parser=parser) + + +def test_more_than_one_expression_raises(engine, parser): + with pytest.raises(SyntaxError, match=("only a single expression is allowed")): + pd.eval("1 + 1; 2 + 2", engine=engine, parser=parser) + + +@pytest.mark.parametrize("cmp", ("and", "or")) +@pytest.mark.parametrize("lhs", (int, float)) +@pytest.mark.parametrize("rhs", (int, float)) +def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): + gen = {int: lambda: np.random.randint(10), float: np.random.randn} + + mid = gen[lhs]() # noqa + lhs = gen[lhs]() # noqa + rhs = gen[rhs]() # noqa + + ex1 = f"lhs {cmp} mid {cmp} rhs" + ex2 = f"lhs {cmp} mid and mid {cmp} rhs" + ex3 = f"(lhs {cmp} mid) & (mid {cmp} rhs)" + for ex in (ex1, ex2, ex3): + with pytest.raises(NotImplementedError): + pd.eval(ex, engine=engine, parser=parser) + + +@pytest.mark.parametrize( + "other", + [ + "'x'", + pytest.param( + "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116") + ), + ], +) +def test_equals_various(other): + df = DataFrame({"A": ["a", "b", "c"]}) + result = df.eval(f"A == {other}") + expected = Series([False, False, False], name="A") + if _USE_NUMEXPR: + # https://github.com/pandas-dev/pandas/issues/10239 + # lose name with numexpr engine. Remove when that's fixed. 
+ expected.name = None + tm.assert_series_equal(result, expected) + + +def test_inf(engine, parser): + s = "inf + 1" + expected = np.inf + result = pd.eval(s, engine=engine, parser=parser) + assert result == expected + + +def test_truediv_deprecated(engine, parser): + # GH#29182 + match = "The `truediv` parameter in pd.eval is deprecated" + + with tm.assert_produces_warning(FutureWarning) as m: + pd.eval("1+1", engine=engine, parser=parser, truediv=True) + + assert len(m) == 1 + assert match in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + pd.eval("1+1", engine=engine, parser=parser, truediv=False) + + assert len(m) == 1 + assert match in str(m[0].message) + + +def test_negate_lt_eq_le(engine, parser): + df = pd.DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) + expected = df[~(df.cat > 0)] + + result = df.query("~(cat > 0)", engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + if parser == "python": + with pytest.raises(NotImplementedError): + df.query("not (cat > 0)", engine=engine, parser=parser) + else: + result = df.query("not (cat > 0)", engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + +class TestValidate: + def test_validate_bool_args(self): + invalid_values = [1, "True", [1, 2, 3], 5.0] + + for value in invalid_values: + with pytest.raises(ValueError): + pd.eval("2+2", inplace=value) diff --git a/venv/Lib/site-packages/pandas/tests/config/__init__.py b/venv/Lib/site-packages/pandas/tests/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/config/test_config.py b/venv/Lib/site-packages/pandas/tests/config/test_config.py new file mode 100644 index 0000000..5164064 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/config/test_config.py @@ -0,0 +1,470 @@ +import warnings + +import pytest + +from pandas._config import config as cf +from pandas._config.config import OptionError + +import pandas as pd + + +class TestConfig: + @classmethod + def setup_class(cls): + from copy import deepcopy + + cls.cf = cf + cls.gc = deepcopy(getattr(cls.cf, "_global_config")) + cls.do = deepcopy(getattr(cls.cf, "_deprecated_options")) + cls.ro = deepcopy(getattr(cls.cf, "_registered_options")) + + def setup_method(self, method): + setattr(self.cf, "_global_config", {}) + setattr(self.cf, "options", self.cf.DictWrapper(self.cf._global_config)) + setattr(self.cf, "_deprecated_options", {}) + setattr(self.cf, "_registered_options", {}) + + # Our test fixture in conftest.py sets "chained_assignment" + # to "raise" only after all test methods have been setup. + # However, after this setup, there is no longer any + # "chained_assignment" option, so re-register it. 
+ self.cf.register_option("chained_assignment", "raise") + + def teardown_method(self, method): + setattr(self.cf, "_global_config", self.gc) + setattr(self.cf, "_deprecated_options", self.do) + setattr(self.cf, "_registered_options", self.ro) + + def test_api(self): + + # the pandas object exposes the user API + assert hasattr(pd, "get_option") + assert hasattr(pd, "set_option") + assert hasattr(pd, "reset_option") + assert hasattr(pd, "describe_option") + + def test_is_one_of_factory(self): + v = self.cf.is_one_of_factory([None, 12]) + + v(12) + v(None) + msg = r"Value must be one of None\|12" + with pytest.raises(ValueError, match=msg): + v(1.1) + + def test_register_option(self): + self.cf.register_option("a", 1, "doc") + + # can't register an already registered option + msg = "Option 'a' has already been registered" + with pytest.raises(OptionError, match=msg): + self.cf.register_option("a", 1, "doc") + + # can't register an already registered option + msg = "Path prefix to option 'a' is already an option" + with pytest.raises(OptionError, match=msg): + self.cf.register_option("a.b.c.d1", 1, "doc") + with pytest.raises(OptionError, match=msg): + self.cf.register_option("a.b.c.d2", 1, "doc") + + # no python keywords + msg = "for is a python keyword" + with pytest.raises(ValueError, match=msg): + self.cf.register_option("for", 0) + with pytest.raises(ValueError, match=msg): + self.cf.register_option("a.for.b", 0) + # must be valid identifier (ensure attribute access works) + msg = "oh my goddess! is not a valid identifier" + with pytest.raises(ValueError, match=msg): + self.cf.register_option("Oh my Goddess!", 0) + + # we can register options several levels deep + # without predefining the intermediate steps + # and we can define differently named options + # in the same namespace + self.cf.register_option("k.b.c.d1", 1, "doc") + self.cf.register_option("k.b.c.d2", 1, "doc") + + def test_describe_option(self): + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b", 1, "doc2") + self.cf.deprecate_option("b") + + self.cf.register_option("c.d.e1", 1, "doc3") + self.cf.register_option("c.d.e2", 1, "doc4") + self.cf.register_option("f", 1) + self.cf.register_option("g.h", 1) + self.cf.register_option("k", 2) + self.cf.deprecate_option("g.h", rkey="k") + self.cf.register_option("l", "foo") + + # non-existent keys raise KeyError + msg = r"No such keys\(s\)" + with pytest.raises(OptionError, match=msg): + self.cf.describe_option("no.such.key") + + # we can get the description for any key we registered + assert "doc" in self.cf.describe_option("a", _print_desc=False) + assert "doc2" in self.cf.describe_option("b", _print_desc=False) + assert "precated" in self.cf.describe_option("b", _print_desc=False) + assert "doc3" in self.cf.describe_option("c.d.e1", _print_desc=False) + assert "doc4" in self.cf.describe_option("c.d.e2", _print_desc=False) + + # if no doc is specified we get a default message + # saying "description not available" + assert "vailable" in self.cf.describe_option("f", _print_desc=False) + assert "vailable" in self.cf.describe_option("g.h", _print_desc=False) + assert "precated" in self.cf.describe_option("g.h", _print_desc=False) + assert "k" in self.cf.describe_option("g.h", _print_desc=False) + + # default is reported + assert "foo" in self.cf.describe_option("l", _print_desc=False) + # current value is reported + assert "bar" not in self.cf.describe_option("l", _print_desc=False) + self.cf.set_option("l", "bar") + assert "bar" in self.cf.describe_option("l", 
_print_desc=False) + + def test_case_insensitive(self): + self.cf.register_option("KanBAN", 1, "doc") + + assert "doc" in self.cf.describe_option("kanbaN", _print_desc=False) + assert self.cf.get_option("kanBaN") == 1 + self.cf.set_option("KanBan", 2) + assert self.cf.get_option("kAnBaN") == 2 + + # gets of non-existent keys fail + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + self.cf.get_option("no_such_option") + self.cf.deprecate_option("KanBan") + + assert self.cf._is_deprecated("kAnBaN") + + def test_get_option(self): + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") + + # gets of existing keys succeed + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None + + # gets of non-existent keys fail + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + self.cf.get_option("no_such_option") + + def test_set_option(self): + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") + + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None + + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + self.cf.set_option("b.b", 1.1) + + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + assert self.cf.get_option("b.b") == 1.1 + + msg = r"No such keys\(s\): 'no.such.key'" + with pytest.raises(OptionError, match=msg): + self.cf.set_option("no.such.key", None) + + def test_set_option_empty_args(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + self.cf.set_option() + + def test_set_option_uneven_args(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + self.cf.set_option("a.b", 2, "b.c") + + def test_set_option_invalid_single_argument_type(self): + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + self.cf.set_option(2) + + def test_set_option_multiple(self): + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") + + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None + + self.cf.set_option("a", "2", "b.c", None, "b.b", 10.0) + + assert self.cf.get_option("a") == "2" + assert self.cf.get_option("b.c") is None + assert self.cf.get_option("b.b") == 10.0 + + def test_validation(self): + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("d", 1, "doc", validator=self.cf.is_nonnegative_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_text) + + msg = "Value must have type ''" + with pytest.raises(ValueError, match=msg): + self.cf.register_option("a.b.c.d2", "NO", "doc", validator=self.cf.is_int) + + self.cf.set_option("a", 2) # int is_int + self.cf.set_option("b.c", "wurld") # str is_str + self.cf.set_option("d", 2) + self.cf.set_option("d", None) # non-negative int can be None + + # None not is_int + with pytest.raises(ValueError, match=msg): + self.cf.set_option("a", None) + with pytest.raises(ValueError, match=msg): + self.cf.set_option("a", "ab") + + msg = "Value must be 
a nonnegative integer or None" + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", "NO", "doc", validator=self.cf.is_nonnegative_int + ) + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", -2, "doc", validator=self.cf.is_nonnegative_int + ) + + msg = r"Value must be an instance of \|" + with pytest.raises(ValueError, match=msg): + self.cf.set_option("b.c", 1) + + validator = self.cf.is_one_of_factory([None, self.cf.is_callable]) + self.cf.register_option("b", lambda: None, "doc", validator=validator) + self.cf.set_option("b", "%.1f".format) # Formatter is callable + self.cf.set_option("b", None) # Formatter is none (default) + with pytest.raises(ValueError, match="Value must be a callable"): + self.cf.set_option("b", "%.1f") + + def test_reset_option(self): + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + + self.cf.reset_option("a") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "wurld" + self.cf.reset_option("b.c") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + def test_reset_option_all(self): + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + + self.cf.reset_option("all") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + def test_deprecate_option(self): + # we can deprecate non-existent options + self.cf.deprecate_option("foo") + + assert self.cf._is_deprecated("foo") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + with pytest.raises(KeyError, match="No such keys.s.: 'foo'"): + self.cf.get_option("foo") + assert len(w) == 1 # should have raised one warning + assert "deprecated" in str(w[-1]) # we get the default message + + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("foo", "hullo", "doc2") + + self.cf.deprecate_option("a", removal_ver="nifty_ver") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + self.cf.get_option("a") + + assert len(w) == 1 # should have raised one warning + assert "eprecated" in str(w[-1]) # we get the default message + assert "nifty_ver" in str(w[-1]) # with the removal_ver quoted + + msg = "Option 'a' has already been defined as deprecated" + with pytest.raises(OptionError, match=msg): + self.cf.deprecate_option("a") + + self.cf.deprecate_option("b.c", "zounds!") + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + self.cf.get_option("b.c") + + assert len(w) == 1 # should have raised one warning + assert "zounds!" 
in str(w[-1]) # we get the custom message + + # test rerouting keys + self.cf.register_option("d.a", "foo", "doc2") + self.cf.register_option("d.dep", "bar", "doc2") + assert self.cf.get_option("d.a") == "foo" + assert self.cf.get_option("d.dep") == "bar" + + self.cf.deprecate_option("d.dep", rkey="d.a") # reroute d.dep to d.a + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "foo" + + assert len(w) == 1 # should have raised one warning + assert "eprecated" in str(w[-1]) # we get the custom message + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + self.cf.set_option("d.dep", "baz") # should overwrite "d.a" + + assert len(w) == 1 # should have raised one warning + assert "eprecated" in str(w[-1]) # we get the custom message + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "baz" + + assert len(w) == 1 # should have raised one warning + assert "eprecated" in str(w[-1]) # we get the custom message + + def test_config_prefix(self): + with self.cf.config_prefix("base"): + self.cf.register_option("a", 1, "doc1") + self.cf.register_option("b", 2, "doc2") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 + + self.cf.set_option("a", 3) + self.cf.set_option("b", 4) + assert self.cf.get_option("a") == 3 + assert self.cf.get_option("b") == 4 + + assert self.cf.get_option("base.a") == 3 + assert self.cf.get_option("base.b") == 4 + assert "doc1" in self.cf.describe_option("base.a", _print_desc=False) + assert "doc2" in self.cf.describe_option("base.b", _print_desc=False) + + self.cf.reset_option("base.a") + self.cf.reset_option("base.b") + + with self.cf.config_prefix("base"): + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 + + def test_callback(self): + k = [None] + v = [None] + + def callback(key): + k.append(key) + v.append(self.cf.get_option(key)) + + self.cf.register_option("d.a", "foo", cb=callback) + self.cf.register_option("d.b", "foo", cb=callback) + + del k[-1], v[-1] + self.cf.set_option("d.a", "fooz") + assert k[-1] == "d.a" + assert v[-1] == "fooz" + + del k[-1], v[-1] + self.cf.set_option("d.b", "boo") + assert k[-1] == "d.b" + assert v[-1] == "boo" + + del k[-1], v[-1] + self.cf.reset_option("d.b") + assert k[-1] == "d.b" + + def test_set_ContextManager(self): + def eq(val): + assert self.cf.get_option("a") == val + + self.cf.register_option("a", 0) + eq(0) + with self.cf.option_context("a", 15): + eq(15) + with self.cf.option_context("a", 25): + eq(25) + eq(15) + eq(0) + + self.cf.set_option("a", 17) + eq(17) + + def test_attribute_access(self): + holder = [] + + def f3(key): + holder.append(True) + + self.cf.register_option("a", 0) + self.cf.register_option("c", 0, cb=f3) + options = self.cf.options + + assert options.a == 0 + with self.cf.option_context("a", 15): + assert options.a == 15 + + options.a = 500 + assert self.cf.get_option("a") == 500 + + self.cf.reset_option("a") + assert options.a == self.cf.get_option("a", 0) + + msg = "You can only set the value of existing options" + with pytest.raises(OptionError, match=msg): + options.b = 1 + with pytest.raises(OptionError, match=msg): + options.display = 1 + + # make sure callback kicks when using this form of setting + options.c = 1 + assert len(holder) == 1 + + def test_option_context_scope(self): + # Ensure that creating a context does not affect the existing + # environment as it is supposed 
to be used with the `with` statement. + # See https://github.com/pandas-dev/pandas/issues/8514 + + original_value = 60 + context_value = 10 + option_name = "a" + + self.cf.register_option(option_name, original_value) + + # Ensure creating contexts didn't affect the current context. + ctx = self.cf.option_context(option_name, context_value) + assert self.cf.get_option(option_name) == original_value + + # Ensure the correct value is available inside the context. + with ctx: + assert self.cf.get_option(option_name) == context_value + + # Ensure the current context is reset + assert self.cf.get_option(option_name) == original_value + + def test_dictwrapper_getattr(self): + options = self.cf.options + # GH 19789 + with pytest.raises(OptionError, match="No such option"): + options.bananas + assert not hasattr(options, "bananas") diff --git a/venv/Lib/site-packages/pandas/tests/config/test_localization.py b/venv/Lib/site-packages/pandas/tests/config/test_localization.py new file mode 100644 index 0000000..e815a90 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/config/test_localization.py @@ -0,0 +1,105 @@ +import codecs +import locale +import os + +import pytest + +from pandas._config.localization import can_set_locale, get_locales, set_locale + +from pandas.compat import is_platform_windows + +import pandas as pd + +_all_locales = get_locales() or [] +_current_locale = locale.getlocale() + +# Don't run any of these tests if we are on Windows or have no locales. +pytestmark = pytest.mark.skipif( + is_platform_windows() or not _all_locales, reason="Need non-Windows and locales" +) + +_skip_if_only_one_locale = pytest.mark.skipif( + len(_all_locales) <= 1, reason="Need multiple locales for meaningful test" +) + + +def test_can_set_locale_valid_set(): + # Can set the default locale. + assert can_set_locale("") + + +def test_can_set_locale_invalid_set(): + # Cannot set an invalid locale. + assert not can_set_locale("non-existent_locale") + + +def test_can_set_locale_invalid_get(monkeypatch): + # see GH#22129 + # In some cases, an invalid locale can be set, + # but a subsequent getlocale() raises a ValueError. + + def mock_get_locale(): + raise ValueError() + + with monkeypatch.context() as m: + m.setattr(locale, "getlocale", mock_get_locale) + assert not can_set_locale("") + + +def test_get_locales_at_least_one(): + # see GH#9744 + assert len(_all_locales) > 0 + + +@_skip_if_only_one_locale +def test_get_locales_prefix(): + first_locale = _all_locales[0] + assert len(get_locales(prefix=first_locale[:2])) > 0 + + +@_skip_if_only_one_locale +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): + if all(x is None for x in _current_locale): + # Not sure why, but on some Travis runs with pytest, + # getlocale() returned (None, None). + pytest.skip("Current locale is not set.") + + enc = codecs.lookup(enc).name + new_locale = lang, enc + + if not can_set_locale(new_locale): + msg = "unsupported locale setting" + + with pytest.raises(locale.Error, match=msg): + with set_locale(new_locale): + pass + else: + with set_locale(new_locale) as normalized_locale: + new_lang, new_enc = normalized_locale.split(".") + new_enc = codecs.lookup(enc).name + + normalized_locale = new_lang, new_enc + assert normalized_locale == new_locale + + # Once we exit the "with" statement, locale should be back to what it was. 
+ current_locale = locale.getlocale() + assert current_locale == _current_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/__init__.py b/venv/Lib/site-packages/pandas/tests/dtypes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/__init__.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py new file mode 100644 index 0000000..cc823a3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -0,0 +1,20 @@ +from pandas.core.dtypes.cast import construct_1d_arraylike_from_scalar +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import Categorical +import pandas._testing as tm + + +def test_cast_1d_array_like_from_scalar_categorical(): + # see gh-19565 + # + # Categorical result from scalar did not maintain + # categories and ordering of the passed dtype. + cats = ["a", "b", "c"] + cat_type = CategoricalDtype(categories=cats, ordered=False) + expected = Categorical(["a", "a"], categories=cats) + + result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) + tm.assert_categorical_equal( + result, expected, check_category_order=True, check_dtype=True + ) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py new file mode 100644 index 0000000..fe27139 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -0,0 +1,21 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import construct_1d_ndarray_preserving_na + +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, dtype, expected", + [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (["1", "2", None], None, np.array(["1", "2", None])), + (["1", "2", None], np.dtype("str"), np.array(["1", "2", None])), + ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])), + ], +) +def test_construct_1d_ndarray_preserving_na(values, dtype, expected): + result = construct_1d_ndarray_preserving_na(values, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py new file mode 100644 index 0000000..cb44f91 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_construct_object_arr.py @@ -0,0 +1,20 @@ +import pytest + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + + +@pytest.mark.parametrize("datum1", [1, 2.0, "3", (4, 5), [6, 7], None]) +@pytest.mark.parametrize("datum2", [8, 9.0, "10", (11, 12), [13, 14], None]) +def test_cast_1d_array(datum1, datum2): + data = [datum1, datum2] + result = construct_1d_object_array_from_listlike(data) + + # Direct comparison fails: https://github.com/numpy/numpy/issues/10218 + assert result.dtype == "object" + assert list(result) == data + 
+ +@pytest.mark.parametrize("val", [1, 2.0, None]) +def test_cast_1d_array_invalid_scalar(val): + with pytest.raises(TypeError, match="has no len()"): + construct_1d_object_array_from_listlike(val) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_convert_objects.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_convert_objects.py new file mode 100644 index 0000000..a28d554 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_convert_objects.py @@ -0,0 +1,12 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_convert_objects + + +@pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]]) +def test_maybe_convert_objects_copy(data): + arr = np.array(data) + out = maybe_convert_objects(arr) + + assert arr is not out diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_downcast.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_downcast.py new file mode 100644 index 0000000..d6e6ed3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_downcast.py @@ -0,0 +1,99 @@ +import decimal + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype + +from pandas import DatetimeIndex, Series, Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize( + "arr,dtype,expected", + [ + ( + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + "infer", + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 8.9999999999995]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 9.0000000000005]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + # This is a judgement call, but we do _not_ downcast Decimal + # objects + np.array([decimal.Decimal(0.0)]), + "int64", + np.array([decimal.Decimal(0.0)]), + ), + ], +) +def test_downcast(arr, expected, dtype): + result = maybe_downcast_to_dtype(arr, dtype) + tm.assert_numpy_array_equal(result, expected) + + +def test_downcast_booleans(): + # see gh-16875: coercing of booleans. 
+ ser = Series([True, True, False]) + result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) + + expected = ser + tm.assert_series_equal(result, expected) + + +def test_downcast_conversion_no_nan(any_real_dtype): + dtype = any_real_dtype + expected = np.array([1, 2]) + arr = np.array([1.0, 2.0], dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected, check_dtype=False) + + +def test_downcast_conversion_nan(float_dtype): + dtype = float_dtype + data = [1.0, 2.0, np.nan] + + expected = np.array(data, dtype=dtype) + arr = np.array(data, dtype=dtype) + + result = maybe_downcast_to_dtype(arr, "infer") + tm.assert_almost_equal(result, expected) + + +def test_downcast_conversion_empty(any_real_dtype): + dtype = any_real_dtype + arr = np.array([], dtype=dtype) + result = maybe_downcast_to_dtype(arr, "int64") + tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) + + +@pytest.mark.parametrize("klass", [np.datetime64, np.timedelta64]) +def test_datetime_likes_nan(klass): + dtype = klass.__name__ + "[ns]" + arr = np.array([1, 2, np.nan]) + + exp = np.array([1, 2, klass("NaT")], dtype) + res = maybe_downcast_to_dtype(arr, dtype) + tm.assert_numpy_array_equal(res, exp) + + +@pytest.mark.parametrize("as_asi", [True, False]) +def test_datetime_with_timezone(as_asi): + # see gh-15426 + ts = Timestamp("2016-01-01 12:00:00", tz="US/Pacific") + exp = DatetimeIndex([ts, ts]) + + obj = exp.asi8 if as_asi else exp + res = maybe_downcast_to_dtype(obj, exp.dtype) + + tm.assert_index_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py new file mode 100644 index 0000000..ac7a522 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_find_common_type.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype + + +@pytest.mark.parametrize( + "source_dtypes,expected_common_dtype", + [ + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + # Into ints. + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + # Into floats. + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + # Into others. + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + # Bool with int. + ((np.dtype("bool"), np.int64), np.object), + ((np.dtype("bool"), np.int32), np.object), + ((np.dtype("bool"), np.int16), np.object), + ((np.dtype("bool"), np.int8), np.object), + ((np.dtype("bool"), np.uint64), np.object), + ((np.dtype("bool"), np.uint32), np.object), + ((np.dtype("bool"), np.uint16), np.object), + ((np.dtype("bool"), np.uint8), np.object), + # Bool with float. 
+ ((np.dtype("bool"), np.float64), np.object), + ((np.dtype("bool"), np.float32), np.object), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), + ((np.dtype("datetime64[ns]"), np.int64), np.object), + ], +) +def test_numpy_dtypes(source_dtypes, expected_common_dtype): + assert find_common_type(source_dtypes) == expected_common_dtype + + +def test_raises_empty_input(): + with pytest.raises(ValueError, match="no types given"): + find_common_type([]) + + +@pytest.mark.parametrize( + "dtypes,exp_type", + [ + ([CategoricalDtype()], "category"), + ([np.object, CategoricalDtype()], np.object), + ([CategoricalDtype(), CategoricalDtype()], "category"), + ], +) +def test_categorical_dtype(dtypes, exp_type): + assert find_common_type(dtypes) == exp_type + + +def test_datetimetz_dtype_match(): + dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") + assert find_common_type([dtype, dtype]) == "datetime64[ns, US/Eastern]" + + +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) +def test_datetimetz_dtype_mismatch(dtype2): + dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") + assert find_common_type([dtype, dtype2]) == np.object + assert find_common_type([dtype2, dtype]) == np.object + + +def test_period_dtype_match(): + dtype = PeriodDtype(freq="D") + assert find_common_type([dtype, dtype]) == "period[D]" + + +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + PeriodDtype(freq="2D"), + PeriodDtype(freq="H"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) +def test_period_dtype_mismatch(dtype2): + dtype = PeriodDtype(freq="D") + assert find_common_type([dtype, dtype2]) == np.object + assert find_common_type([dtype2, dtype]) == np.object diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py new file mode 100644 index 0000000..f4253e9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_datetimelike.py @@ -0,0 +1,23 @@ +import numpy as np +import pytest + +from pandas import DataFrame, NaT, Series, Timestamp + + +@pytest.mark.parametrize( + "data,exp_size", + [ + # see gh-16362. + ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), + ([[NaT, "a", 0], [NaT, "b", 1]], 6), + ], +) +def test_maybe_infer_to_datetimelike_df_construct(data, exp_size): + result = DataFrame(np.array(data)) + assert result.size == exp_size + + +def test_maybe_infer_to_datetimelike_ser_construct(): + # see gh-19671. 
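+    # mixing a plain string with a Timestamp should not be inferred as datetime; dtype stays object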
+ result = Series(["M1701", Timestamp("20130101")]) + assert result.dtype.kind == "O" diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py new file mode 100644 index 0000000..2744cfa --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -0,0 +1,198 @@ +from datetime import date, datetime, timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import ( + cast_scalar_to_array, + infer_dtype_from_array, + infer_dtype_from_scalar, +) +from pandas.core.dtypes.common import is_dtype_equal + +from pandas import ( + Categorical, + Interval, + Period, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def pandas_dtype(request): + return request.param + + +def test_infer_dtype_from_int_scalar(any_int_dtype): + # Test that infer_dtype_from_scalar is + # returning correct dtype for int and float. + data = np.dtype(any_int_dtype).type(12) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == type(data) + + +def test_infer_dtype_from_float_scalar(float_dtype): + float_dtype = np.dtype(float_dtype).type + data = float_dtype(12) + + dtype, val = infer_dtype_from_scalar(data) + assert dtype == float_dtype + + +@pytest.mark.parametrize("data,exp_dtype", [(12, np.int64), (np.float(12), np.float64)]) +def test_infer_dtype_from_python_scalar(data, exp_dtype): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == exp_dtype + + +@pytest.mark.parametrize("bool_val", [True, False]) +def test_infer_dtype_from_boolean(bool_val): + dtype, val = infer_dtype_from_scalar(bool_val) + assert dtype == np.bool_ + + +def test_infer_dtype_from_complex(complex_dtype): + data = np.dtype(complex_dtype).type(1) + dtype, val = infer_dtype_from_scalar(data) + assert dtype == np.complex_ + + +@pytest.mark.parametrize( + "data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)] +) +def test_infer_dtype_from_datetime(data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == "M8[ns]" + + +@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)]) +def test_infer_dtype_from_timedelta(data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == "m8[ns]" + + +@pytest.mark.parametrize("freq", ["M", "D"]) +def test_infer_dtype_from_period(freq, pandas_dtype): + p = Period("2011-01-01", freq=freq) + dtype, val = infer_dtype_from_scalar(p, pandas_dtype=pandas_dtype) + + if pandas_dtype: + exp_dtype = f"period[{freq}]" + exp_val = p.ordinal + else: + exp_dtype = np.object_ + exp_val = p + + assert dtype == exp_dtype + assert val == exp_val + + +@pytest.mark.parametrize( + "data", [date(2000, 1, 1), "foo", Timestamp(1, tz="US/Eastern")] +) +def test_infer_dtype_misc(data): + dtype, val = infer_dtype_from_scalar(data) + assert dtype == np.object_ + + +@pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo"]) +def test_infer_from_scalar_tz(tz, pandas_dtype): + dt = Timestamp(1, tz=tz) + dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=pandas_dtype) + + if pandas_dtype: + exp_dtype = f"datetime64[ns, {tz}]" + exp_val = dt.value + else: + exp_dtype = np.object_ + exp_val = dt + + assert dtype == exp_dtype + assert val == exp_val + + +@pytest.mark.parametrize( + "left, right, subtype", + [ + (0, 1, "int64"), + (0.0, 1.0, "float64"), + (Timestamp(0), Timestamp(1), "datetime64[ns]"), + (Timestamp(0, tz="UTC"), Timestamp(1, 
tz="UTC"), "datetime64[ns, UTC]"), + (Timedelta(0), Timedelta(1), "timedelta64[ns]"), + ], +) +def test_infer_from_interval(left, right, subtype, closed, pandas_dtype): + # GH 30337 + interval = Interval(left, right, closed) + result_dtype, result_value = infer_dtype_from_scalar(interval, pandas_dtype) + expected_dtype = f"interval[{subtype}]" if pandas_dtype else np.object_ + assert result_dtype == expected_dtype + assert result_value == interval + + +def test_infer_dtype_from_scalar_errors(): + msg = "invalid ndarray passed to infer_dtype_from_scalar" + + with pytest.raises(ValueError, match=msg): + infer_dtype_from_scalar(np.array([1])) + + +@pytest.mark.parametrize( + "arr, expected, pandas_dtype", + [ + ("foo", np.object_, False), + (b"foo", np.object_, False), + (1, np.int_, False), + (1.5, np.float_, False), + ([1], np.int_, False), + (np.array([1], dtype=np.int64), np.int64, False), + ([np.nan, 1, ""], np.object_, False), + (np.array([[1.0, 2.0]]), np.float_, False), + (Categorical(list("aabc")), np.object_, False), + (Categorical([1, 2, 3]), np.int64, False), + (Categorical(list("aabc")), "category", True), + (Categorical([1, 2, 3]), "category", True), + (Timestamp("20160101"), np.object_, False), + (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False), + (date_range("20160101", periods=3), np.dtype("=M8[ns]"), False), + ( + date_range("20160101", periods=3, tz="US/Eastern"), + "datetime64[ns, US/Eastern]", + True, + ), + (Series([1.0, 2, 3]), np.float64, False), + (Series(list("abc")), np.object_, False), + ( + Series(date_range("20160101", periods=3, tz="US/Eastern")), + "datetime64[ns, US/Eastern]", + True, + ), + ], +) +def test_infer_dtype_from_array(arr, expected, pandas_dtype): + dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype) + assert is_dtype_equal(dtype, expected) + + +@pytest.mark.parametrize( + "obj,dtype", + [ + (1, np.int64), + (1.1, np.float64), + (Timestamp("2011-01-01"), "datetime64[ns]"), + (Timestamp("2011-01-01", tz="US/Eastern"), np.object), + (Period("2011-01-01", freq="D"), np.object), + ], +) +def test_cast_scalar_to_array(obj, dtype): + shape = (3, 2) + + exp = np.empty(shape, dtype=dtype) + exp.fill(obj) + + arr = cast_scalar_to_array(shape, obj, dtype=dtype) + tm.assert_numpy_array_equal(arr, exp) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_promote.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_promote.py new file mode 100644 index 0000000..69f8f46 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_promote.py @@ -0,0 +1,631 @@ +""" +These test the method maybe_promote from core/dtypes/cast.py +""" + +import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import NaT + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + is_complex_dtype, + is_datetime64_dtype, + is_datetime_or_timedelta_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna + +import pandas as pd + + +@pytest.fixture( + params=[ + bool, + "uint8", + "int32", + "uint64", + "float32", + "float64", + "complex64", + "complex128", + "M8[ns]", + "m8[ns]", + str, + bytes, + object, + ] +) +def any_numpy_dtype_reduced(request): + """ + Parameterized fixture for numpy dtypes, reduced from any_numpy_dtype. 
+
+    * bool
+    * 'uint8'
+    * 'int32'
+    * 'uint64'
+    * 'float32'
+    * 'float64'
+    * 'complex64'
+    * 'complex128'
+    * 'M8[ns]'
+    * 'm8[ns]'
+    * str
+    * bytes
+    * object
+    """
+    return request.param
+
+
+def _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar=None):
+    """
+    Auxiliary function to unify testing of scalar/array promotion.
+
+    Parameters
+    ----------
+    dtype : dtype
+        The value to pass on as the first argument to maybe_promote.
+    fill_value : scalar
+        The value to pass on as the second argument to maybe_promote as
+        a scalar.
+    expected_dtype : dtype
+        The expected dtype returned by maybe_promote (by design this is the
+        same regardless of whether fill_value was passed as a scalar or in an
+        array!).
+    exp_val_for_scalar : scalar
+        The expected value for the (potentially upcast) fill_value returned by
+        maybe_promote.
+    """
+    assert is_scalar(fill_value)
+
+    # here, we pass on fill_value as a scalar directly; the expected value
+    # returned from maybe_promote is fill_value, potentially upcast to the
+    # returned dtype.
+    result_dtype, result_fill_value = maybe_promote(dtype, fill_value)
+    expected_fill_value = exp_val_for_scalar
+
+    assert result_dtype == expected_dtype
+    _assert_match(result_fill_value, expected_fill_value)
+
+
+def _assert_match(result_fill_value, expected_fill_value):
+    # GH#23982/25425 require the same type in addition to equality/NA-ness
+    res_type = type(result_fill_value)
+    ex_type = type(expected_fill_value)
+    if res_type.__name__ == "uint64":
+        # No idea why, but these (sometimes) do not compare as equal
+        assert ex_type.__name__ == "uint64"
+    elif res_type.__name__ == "ulonglong":
+        # On some builds we get this instead of np.uint64
+        # Note: cant check res_type.dtype.itemsize directly on numpy 1.18
+        assert res_type(0).itemsize == 8
+        assert ex_type == res_type or ex_type == np.uint64
+    else:
+        # On some builds, type comparison fails, e.g. 
np.int32 != np.int32 + assert res_type == ex_type or res_type.__name__ == ex_type.__name__ + + match_value = result_fill_value == expected_fill_value + + # Note: type check above ensures that we have the _same_ NA value + # for missing values, None == None (which is checked + # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT + match_missing = isna(result_fill_value) and isna(expected_fill_value) + + assert match_value or match_missing + + +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # size 8 + ("int8", 1, "int8"), + ("int8", np.iinfo("int8").max + 1, "int16"), + ("int8", np.iinfo("int16").max + 1, "int32"), + ("int8", np.iinfo("int32").max + 1, "int64"), + ("int8", np.iinfo("int64").max + 1, "object"), + ("int8", -1, "int8"), + ("int8", np.iinfo("int8").min - 1, "int16"), + ("int8", np.iinfo("int16").min - 1, "int32"), + ("int8", np.iinfo("int32").min - 1, "int64"), + ("int8", np.iinfo("int64").min - 1, "object"), + # keep signed-ness as long as possible + ("uint8", 1, "uint8"), + ("uint8", np.iinfo("int8").max + 1, "uint8"), + ("uint8", np.iinfo("uint8").max + 1, "uint16"), + ("uint8", np.iinfo("int16").max + 1, "uint16"), + ("uint8", np.iinfo("uint16").max + 1, "uint32"), + ("uint8", np.iinfo("int32").max + 1, "uint32"), + ("uint8", np.iinfo("uint32").max + 1, "uint64"), + ("uint8", np.iinfo("int64").max + 1, "uint64"), + ("uint8", np.iinfo("uint64").max + 1, "object"), + # max of uint8 cannot be contained in int8 + ("uint8", -1, "int16"), + ("uint8", np.iinfo("int8").min - 1, "int16"), + ("uint8", np.iinfo("int16").min - 1, "int32"), + ("uint8", np.iinfo("int32").min - 1, "int64"), + ("uint8", np.iinfo("int64").min - 1, "object"), + # size 16 + ("int16", 1, "int16"), + ("int16", np.iinfo("int8").max + 1, "int16"), + ("int16", np.iinfo("int16").max + 1, "int32"), + ("int16", np.iinfo("int32").max + 1, "int64"), + ("int16", np.iinfo("int64").max + 1, "object"), + ("int16", -1, "int16"), + ("int16", np.iinfo("int8").min - 1, "int16"), + ("int16", np.iinfo("int16").min - 1, "int32"), + ("int16", np.iinfo("int32").min - 1, "int64"), + ("int16", np.iinfo("int64").min - 1, "object"), + ("uint16", 1, "uint16"), + ("uint16", np.iinfo("int8").max + 1, "uint16"), + ("uint16", np.iinfo("uint8").max + 1, "uint16"), + ("uint16", np.iinfo("int16").max + 1, "uint16"), + ("uint16", np.iinfo("uint16").max + 1, "uint32"), + ("uint16", np.iinfo("int32").max + 1, "uint32"), + ("uint16", np.iinfo("uint32").max + 1, "uint64"), + ("uint16", np.iinfo("int64").max + 1, "uint64"), + ("uint16", np.iinfo("uint64").max + 1, "object"), + ("uint16", -1, "int32"), + ("uint16", np.iinfo("int8").min - 1, "int32"), + ("uint16", np.iinfo("int16").min - 1, "int32"), + ("uint16", np.iinfo("int32").min - 1, "int64"), + ("uint16", np.iinfo("int64").min - 1, "object"), + # size 32 + ("int32", 1, "int32"), + ("int32", np.iinfo("int8").max + 1, "int32"), + ("int32", np.iinfo("int16").max + 1, "int32"), + ("int32", np.iinfo("int32").max + 1, "int64"), + ("int32", np.iinfo("int64").max + 1, "object"), + ("int32", -1, "int32"), + ("int32", np.iinfo("int8").min - 1, "int32"), + ("int32", np.iinfo("int16").min - 1, "int32"), + ("int32", np.iinfo("int32").min - 1, "int64"), + ("int32", np.iinfo("int64").min - 1, "object"), + ("uint32", 1, "uint32"), + ("uint32", np.iinfo("int8").max + 1, "uint32"), + ("uint32", np.iinfo("uint8").max + 1, "uint32"), + ("uint32", np.iinfo("int16").max + 1, "uint32"), + ("uint32", np.iinfo("uint16").max + 1, "uint32"), + ("uint32", np.iinfo("int32").max + 
1, "uint32"), + ("uint32", np.iinfo("uint32").max + 1, "uint64"), + ("uint32", np.iinfo("int64").max + 1, "uint64"), + ("uint32", np.iinfo("uint64").max + 1, "object"), + ("uint32", -1, "int64"), + ("uint32", np.iinfo("int8").min - 1, "int64"), + ("uint32", np.iinfo("int16").min - 1, "int64"), + ("uint32", np.iinfo("int32").min - 1, "int64"), + ("uint32", np.iinfo("int64").min - 1, "object"), + # size 64 + ("int64", 1, "int64"), + ("int64", np.iinfo("int8").max + 1, "int64"), + ("int64", np.iinfo("int16").max + 1, "int64"), + ("int64", np.iinfo("int32").max + 1, "int64"), + ("int64", np.iinfo("int64").max + 1, "object"), + ("int64", -1, "int64"), + ("int64", np.iinfo("int8").min - 1, "int64"), + ("int64", np.iinfo("int16").min - 1, "int64"), + ("int64", np.iinfo("int32").min - 1, "int64"), + ("int64", np.iinfo("int64").min - 1, "object"), + ("uint64", 1, "uint64"), + ("uint64", np.iinfo("int8").max + 1, "uint64"), + ("uint64", np.iinfo("uint8").max + 1, "uint64"), + ("uint64", np.iinfo("int16").max + 1, "uint64"), + ("uint64", np.iinfo("uint16").max + 1, "uint64"), + ("uint64", np.iinfo("int32").max + 1, "uint64"), + ("uint64", np.iinfo("uint32").max + 1, "uint64"), + ("uint64", np.iinfo("int64").max + 1, "uint64"), + ("uint64", np.iinfo("uint64").max + 1, "object"), + ("uint64", -1, "object"), + ("uint64", np.iinfo("int8").min - 1, "object"), + ("uint64", np.iinfo("int16").min - 1, "object"), + ("uint64", np.iinfo("int32").min - 1, "object"), + ("uint64", np.iinfo("int64").min - 1, "object"), + ], +) +def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype): + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + # output is not a generic int, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_int_with_float(any_int_dtype, float_dtype): + dtype = np.dtype(any_int_dtype) + fill_dtype = np.dtype(float_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling int with float always upcasts to float64 + expected_dtype = np.float64 + # fill_value can be different float type + exp_val_for_scalar = np.float64(fill_value) + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_float_with_int(float_dtype, any_int_dtype): + + dtype = np.dtype(float_dtype) + fill_dtype = np.dtype(any_int_dtype) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling float with int always keeps float dtype + # because: np.finfo('float32').max > np.iinfo('uint64').max + expected_dtype = dtype + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "dtype, fill_value, expected_dtype", + [ + # float filled with float + ("float32", 1, "float32"), + ("float32", np.finfo("float32").max * 1.1, "float64"), + ("float64", 1, "float64"), + ("float64", np.finfo("float32").max * 1.1, "float64"), + # complex filled with float + ("complex64", 1, "complex64"), + ("complex64", np.finfo("float32").max * 1.1, "complex128"), + ("complex128", 1, "complex128"), + ("complex128", np.finfo("float32").max * 1.1, "complex128"), + # float filled with complex + ("float32", 1 + 1j, 
"complex64"), + ("float32", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float64", 1 + 1j, "complex128"), + ("float64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + # complex filled with complex + ("complex64", 1 + 1j, "complex64"), + ("complex64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex128", 1 + 1j, "complex128"), + ("complex128", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ], +) +def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype): + + dtype = np.dtype(dtype) + expected_dtype = np.dtype(expected_dtype) + + # output is not a generic float, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced): + dtype = np.dtype(bool) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling bool with anything but bool casts to object + expected_dtype = np.dtype(object) if fill_dtype != bool else fill_dtype + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_value = True + + # filling anything but bool with bool casts to object + expected_dtype = np.dtype(object) if dtype != bool else dtype + # output is not a generic bool, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced): + dtype = np.dtype(bytes_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype + fill_value = b"abc" + + # we never use bytes dtype internally, always promote to object + expected_dtype = np.dtype(np.object_) + # output is not a generic bytes, but corresponds to expected_dtype + exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype_reduced): + dtype = np.dtype(datetime64_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(fill_dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to to_datetime64 + exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + 
np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def test_maybe_promote_any_with_datetime64( + any_numpy_dtype_reduced, datetime64_dtype, fill_value +): + dtype = np.dtype(any_numpy_dtype_reduced) + + # filling datetime with anything but datetime casts to object + if is_datetime64_dtype(dtype): + expected_dtype = dtype + # for datetime dtypes, scalar values get cast to pd.Timestamp.value + exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_datetimetz_with_any_numpy_dtype( + tz_aware_fixture, any_numpy_dtype_reduced +): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling datetimetz with any numpy dtype casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fixture2): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) + + # create array of given dtype; casts "1" to correct dtype + fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] + + # filling datetimetz with datetimetz casts to object, unless tz matches + exp_val_for_scalar = fill_value + if dtype.tz == fill_dtype.tz: + expected_dtype = dtype + else: + expected_dtype = np.dtype(object) + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize("fill_value", [None, np.nan, NaT]) +def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value): + + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + + expected_dtype = dtype + exp_val_for_scalar = NaT + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def test_maybe_promote_any_numpy_dtype_with_datetimetz( + any_numpy_dtype_reduced, tz_aware_fixture, fill_value +): + dtype = np.dtype(any_numpy_dtype_reduced) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) + + fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] + + # filling any numpy dtype with datetimetz casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype_reduced): + dtype = np.dtype(timedelta64_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling timedelta with anything but timedelta casts to object + if is_timedelta64_dtype(fill_dtype): + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, 
expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize( + "fill_value", + [pd.Timedelta(days=1), np.timedelta64(24, "h"), datetime.timedelta(1)], + ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], +) +def test_maybe_promote_any_with_timedelta64( + any_numpy_dtype_reduced, timedelta64_dtype, fill_value +): + dtype = np.dtype(any_numpy_dtype_reduced) + + # filling anything but timedelta with timedelta casts to object + if is_timedelta64_dtype(dtype): + expected_dtype = dtype + # for timedelta dtypes, scalar values get cast to pd.Timedelta.value + exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() + else: + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced): + dtype = np.dtype(string_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling string with anything casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype + fill_value = "abc" + + # filling anything with a string casts to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced): + dtype = np.dtype(object_dtype) + fill_dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of given dtype; casts "1" to correct dtype + fill_value = np.array([1], dtype=fill_dtype)[0] + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create array of object dtype from a scalar value (i.e. passing + # dtypes.common.is_scalar), which can however not be cast to int/float etc. 
+ fill_value = pd.DateOffset(1) + + # filling object with anything stays object + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize("fill_value", [None, np.nan, NaT]) +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value): + dtype = np.dtype(any_numpy_dtype_reduced) + + if is_integer_dtype(dtype) and fill_value is not NaT: + # integer + other missing value (np.nan / None) casts to float + expected_dtype = np.float64 + exp_val_for_scalar = np.nan + elif is_object_dtype(dtype) and fill_value is NaT: + # inserting into object does not cast the value + # but *does* cast None to np.nan + expected_dtype = np.dtype(object) + exp_val_for_scalar = fill_value + elif is_datetime_or_timedelta_dtype(dtype): + # datetime / timedelta cast all missing values to dtyped-NaT + expected_dtype = dtype + exp_val_for_scalar = dtype.type("NaT", "ns") + elif fill_value is NaT: + # NaT upcasts everything that's not datetime/timedelta to object + expected_dtype = np.dtype(object) + exp_val_for_scalar = NaT + elif is_float_dtype(dtype) or is_complex_dtype(dtype): + # float / complex + missing value (!= NaT) stays the same + expected_dtype = dtype + exp_val_for_scalar = np.nan + else: + # all other cases cast to object, and use np.nan as missing value + expected_dtype = np.dtype(object) + exp_val_for_scalar = np.nan + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +@pytest.mark.parametrize("dim", [0, 2, 3]) +def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): + dtype = np.dtype(any_numpy_dtype_reduced) + + # create 0-dim array of given dtype; casts "1" to correct dtype + fill_array = np.array(1, dtype=dtype) + + # expand to desired dimension: + for _ in range(dim): + fill_array = np.expand_dims(fill_array, 0) + + if dtype != object: + # test against 1-dimensional case + with pytest.raises(ValueError, match="fill_value must be a scalar"): + maybe_promote(dtype, np.array([1], dtype=dtype)) + + with pytest.raises(ValueError, match="fill_value must be a scalar"): + maybe_promote(dtype, fill_array) + + else: + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype) + ) + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + assert result_dtype == expected_dtype + _assert_match(result_missing_value, expected_missing_value) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_upcast.py b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_upcast.py new file mode 100644 index 0000000..bb7a7d0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/cast/test_upcast.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import maybe_upcast_putmask + +from pandas import Series +import pandas._testing as tm + + +@pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) +def test_upcast_error(result): + # GH23823 require result arg to be ndarray + mask = np.array([False, True, False]) + other = np.array([61, 62, 63]) + with pytest.raises(ValueError): + result, _ = maybe_upcast_putmask(result, mask, other) + + +@pytest.mark.parametrize( + "arr, other", + [ + (np.arange(1, 6), np.array([61, 62, 63])), + (np.arange(1, 6), np.array([61.1, 62.2, 63.3])), + (np.arange(10, 15), np.array([61, 62])), + (np.arange(10, 15), np.array([61, np.nan])), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + 
np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), + ), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), + ), + ], +) +def test_upcast_scalar_other(arr, other): + # for now we do not support non-scalar `other` + mask = np.array([False, True, False, True, True]) + with pytest.raises(ValueError, match="other must be a scalar"): + maybe_upcast_putmask(arr, mask, other) + + +def test_upcast(): + # GH23823 + arr = np.arange(1, 6) + mask = np.array([False, True, False, True, True]) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) + + expected = np.array([1, np.nan, 3, np.nan, np.nan]) + assert changed + tm.assert_numpy_array_equal(result, expected) + + +def test_upcast_datetime(): + # GH23823 + arr = np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]") + mask = np.array([False, True, False, True, True]) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) + + expected = np.array( + [ + "2019-01-01", + np.datetime64("NaT"), + "2019-01-03", + np.datetime64("NaT"), + np.datetime64("NaT"), + ], + dtype="datetime64[D]", + ) + assert not changed + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/test_common.py b/venv/Lib/site-packages/pandas/tests/dtypes/test_common.py new file mode 100644 index 0000000..097e83d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/test_common.py @@ -0,0 +1,755 @@ +from datetime import datetime +from typing import List + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.cast import astype_nansafe +import pandas.core.dtypes.common as com +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + CategoricalDtypeType, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray +from pandas.conftest import ( + ALL_EA_INT_DTYPES, + ALL_INT_DTYPES, + SIGNED_EA_INT_DTYPES, + SIGNED_INT_DTYPES, + UNSIGNED_EA_INT_DTYPES, + UNSIGNED_INT_DTYPES, +) + + +# EA & Actual Dtypes +def to_ea_dtypes(dtypes): + """ convert list of string dtypes to EA dtype """ + return [getattr(pd, dt + "Dtype") for dt in dtypes] + + +def to_numpy_dtypes(dtypes): + """ convert list of string dtypes to numpy dtype """ + return [getattr(np, dt) for dt in dtypes if isinstance(dt, str)] + + +class TestPandasDtype: + + # Passing invalid dtype, both as a string or object, must raise TypeError + # Per issue GH15520 + @pytest.mark.parametrize("box", [pd.Timestamp, "pd.Timestamp", list]) + def test_invalid_dtype_error(self, box): + with pytest.raises(TypeError, match="not understood"): + com.pandas_dtype(box) + + @pytest.mark.parametrize( + "dtype", + [ + object, + "float64", + np.object_, + np.dtype("object"), + "O", + np.float64, + float, + np.dtype("float64"), + ], + ) + def test_pandas_dtype_valid(self, dtype): + assert com.pandas_dtype(dtype) == dtype + + @pytest.mark.parametrize( + "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"] + ) + def test_numpy_dtype(self, dtype): + assert com.pandas_dtype(dtype) == np.dtype(dtype) + + def test_numpy_string_dtype(self): + # do not parse freq-like string as period dtype + assert com.pandas_dtype("U") == np.dtype("U") + assert com.pandas_dtype("S") == np.dtype("S") + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns, US/Eastern]", + "datetime64[ns, Asia/Tokyo]", + 
"datetime64[ns, UTC]", + ], + ) + def test_datetimetz_dtype(self, dtype): + assert com.pandas_dtype(dtype) == DatetimeTZDtype.construct_from_string(dtype) + assert com.pandas_dtype(dtype) == dtype + + def test_categorical_dtype(self): + assert com.pandas_dtype("category") == CategoricalDtype() + + @pytest.mark.parametrize( + "dtype", + [ + "period[D]", + "period[3M]", + "period[U]", + "Period[D]", + "Period[3M]", + "Period[U]", + ], + ) + def test_period_dtype(self, dtype): + assert com.pandas_dtype(dtype) is PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == dtype + + +dtypes = dict( + datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"), + datetime=com.pandas_dtype("datetime64[ns]"), + timedelta=com.pandas_dtype("timedelta64[ns]"), + period=PeriodDtype("D"), + integer=np.dtype(np.int64), + float=np.dtype(np.float64), + object=np.dtype(np.object), + category=com.pandas_dtype("category"), +) + + +@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) +@pytest.mark.parametrize("name2,dtype2", list(dtypes.items()), ids=lambda x: str(x)) +def test_dtype_equal(name1, dtype1, name2, dtype2): + + # match equal to self, but not equal to other + assert com.is_dtype_equal(dtype1, dtype1) + if name1 != name2: + assert not com.is_dtype_equal(dtype1, dtype2) + + +@pytest.mark.parametrize( + "dtype1,dtype2", + [ + (np.int8, np.int64), + (np.int16, np.int64), + (np.int32, np.int64), + (np.float32, np.float64), + (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType + ( + com.pandas_dtype("datetime64[ns, US/Eastern]"), + com.pandas_dtype("datetime64[ns, CET]"), + ), # Datetime + (None, None), # gh-15941: no exception should be raised. + ], +) +def test_dtype_equal_strict(dtype1, dtype2): + assert not com.is_dtype_equal(dtype1, dtype2) + + +def get_is_dtype_funcs(): + """ + Get all functions in pandas.core.dtypes.common that + begin with 'is_' and end with 'dtype' + + """ + + fnames = [f for f in dir(com) if (f.startswith("is_") and f.endswith("dtype"))] + return [getattr(com, fname) for fname in fnames] + + +@pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__) +def test_get_dtype_error_catch(func): + # see gh-15941 + # + # No exception should be raised. 
+ + assert not func(None) + + +def test_is_object(): + assert com.is_object_dtype(object) + assert com.is_object_dtype(np.array([], dtype=object)) + + assert not com.is_object_dtype(int) + assert not com.is_object_dtype(np.array([], dtype=int)) + assert not com.is_object_dtype([1, 2, 3]) + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) +def test_is_sparse(check_scipy): + assert com.is_sparse(SparseArray([1, 2, 3])) + + assert not com.is_sparse(np.array([1, 2, 3])) + + if check_scipy: + import scipy.sparse + + assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) + + +@td.skip_if_no_scipy +def test_is_scipy_sparse(): + from scipy.sparse import bsr_matrix + + assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) + + assert not com.is_scipy_sparse(SparseArray([1, 2, 3])) + + +def test_is_categorical(): + cat = pd.Categorical([1, 2, 3]) + assert com.is_categorical(cat) + assert com.is_categorical(pd.Series(cat)) + assert com.is_categorical(pd.CategoricalIndex([1, 2, 3])) + + assert not com.is_categorical([1, 2, 3]) + + +def test_is_datetime64_dtype(): + assert not com.is_datetime64_dtype(object) + assert not com.is_datetime64_dtype([1, 2, 3]) + assert not com.is_datetime64_dtype(np.array([], dtype=int)) + + assert com.is_datetime64_dtype(np.datetime64) + assert com.is_datetime64_dtype(np.array([], dtype=np.datetime64)) + + +def test_is_datetime64tz_dtype(): + assert not com.is_datetime64tz_dtype(object) + assert not com.is_datetime64tz_dtype([1, 2, 3]) + assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) + assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + +def test_is_timedelta64_dtype(): + assert not com.is_timedelta64_dtype(object) + assert not com.is_timedelta64_dtype(None) + assert not com.is_timedelta64_dtype([1, 2, 3]) + assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) + assert not com.is_timedelta64_dtype("0 days") + assert not com.is_timedelta64_dtype("0 days 00:00:00") + assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) + assert not com.is_timedelta64_dtype("NO DATE") + + assert com.is_timedelta64_dtype(np.timedelta64) + assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) + assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"])) + + +def test_is_period_dtype(): + assert not com.is_period_dtype(object) + assert not com.is_period_dtype([1, 2, 3]) + assert not com.is_period_dtype(pd.Period("2017-01-01")) + + assert com.is_period_dtype(PeriodDtype(freq="D")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) + + +def test_is_interval_dtype(): + assert not com.is_interval_dtype(object) + assert not com.is_interval_dtype([1, 2, 3]) + + assert com.is_interval_dtype(IntervalDtype()) + + interval = pd.Interval(1, 2, closed="right") + assert not com.is_interval_dtype(interval) + assert com.is_interval_dtype(pd.IntervalIndex([interval])) + + +def test_is_categorical_dtype(): + assert not com.is_categorical_dtype(object) + assert not com.is_categorical_dtype([1, 2, 3]) + + assert com.is_categorical_dtype(CategoricalDtype()) + assert com.is_categorical_dtype(pd.Categorical([1, 2, 3])) + assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) + + +def test_is_string_dtype(): + assert not com.is_string_dtype(int) + assert not com.is_string_dtype(pd.Series([1, 2])) + + assert com.is_string_dtype(str) + assert com.is_string_dtype(object) + assert com.is_string_dtype(np.array(["a", "b"])) + assert 
com.is_string_dtype(pd.StringDtype()) + assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) + + +def test_is_period_arraylike(): + assert not com.is_period_arraylike([1, 2, 3]) + assert not com.is_period_arraylike(pd.Index([1, 2, 3])) + assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) + + +def test_is_datetime_arraylike(): + assert not com.is_datetime_arraylike([1, 2, 3]) + assert not com.is_datetime_arraylike(pd.Index([1, 2, 3])) + assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) + + +integer_dtypes: List = [] + + +@pytest.mark.parametrize( + "dtype", + integer_dtypes + + [pd.Series([1, 2])] + + ALL_INT_DTYPES + + to_numpy_dtypes(ALL_INT_DTYPES) + + ALL_EA_INT_DTYPES + + to_ea_dtypes(ALL_EA_INT_DTYPES), +) +def test_is_integer_dtype(dtype): + assert com.is_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ], +) +def test_is_not_integer_dtype(dtype): + assert not com.is_integer_dtype(dtype) + + +signed_integer_dtypes: List = [] + + +@pytest.mark.parametrize( + "dtype", + signed_integer_dtypes + + [pd.Series([1, 2])] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) +def test_is_signed_integer_dtype(dtype): + assert com.is_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) +def test_is_not_signed_integer_dtype(dtype): + assert not com.is_signed_integer_dtype(dtype) + + +unsigned_integer_dtypes: List = [] + + +@pytest.mark.parametrize( + "dtype", + unsigned_integer_dtypes + + [pd.Series([1, 2], dtype=np.uint32)] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) +def test_is_unsigned_integer_dtype(dtype): + assert com.is_unsigned_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) +def test_is_not_unsigned_integer_dtype(dtype): + assert not com.is_unsigned_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", [np.int64, np.array([1, 2], dtype=np.int64), "Int64", pd.Int64Dtype] +) +def test_is_int64_dtype(dtype): + assert com.is_int64_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.int32, + np.uint64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([1, 2], dtype=np.uint32), + "int8", + "Int8", + pd.Int8Dtype, + ], +) +def test_is_not_int64_dtype(dtype): + assert not com.is_int64_dtype(dtype) + + +def test_is_datetime64_any_dtype(): + assert not com.is_datetime64_any_dtype(int) + assert not com.is_datetime64_any_dtype(str) + assert not com.is_datetime64_any_dtype(np.array([1, 2])) + assert not com.is_datetime64_any_dtype(np.array(["a", "b"])) + + assert com.is_datetime64_any_dtype(np.datetime64) + assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64)) + assert 
com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) + assert com.is_datetime64_any_dtype( + pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ) + + +def test_is_datetime64_ns_dtype(): + assert not com.is_datetime64_ns_dtype(int) + assert not com.is_datetime64_ns_dtype(str) + assert not com.is_datetime64_ns_dtype(np.datetime64) + assert not com.is_datetime64_ns_dtype(np.array([1, 2])) + assert not com.is_datetime64_ns_dtype(np.array(["a", "b"])) + assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) + + # This datetime array has the wrong unit (ps instead of ns) + assert not com.is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]")) + + assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) + assert com.is_datetime64_ns_dtype( + pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]")) + ) + + +def test_is_timedelta64_ns_dtype(): + assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]")) + assert not com.is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + + assert com.is_timedelta64_ns_dtype(np.dtype("m8[ns]")) + assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) + + +def test_is_datetime_or_timedelta_dtype(): + assert not com.is_datetime_or_timedelta_dtype(int) + assert not com.is_datetime_or_timedelta_dtype(str) + assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) + assert not com.is_datetime_or_timedelta_dtype(np.array(["a", "b"])) + + # TODO(jreback), this is slightly suspect + assert not com.is_datetime_or_timedelta_dtype(DatetimeTZDtype("ns", "US/Eastern")) + + assert com.is_datetime_or_timedelta_dtype(np.datetime64) + assert com.is_datetime_or_timedelta_dtype(np.timedelta64) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64)) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64)) + + +def test_is_numeric_v_string_like(): + assert not com.is_numeric_v_string_like(1, 1) + assert not com.is_numeric_v_string_like(1, "foo") + assert not com.is_numeric_v_string_like("foo", "foo") + assert not com.is_numeric_v_string_like(np.array([1]), np.array([2])) + assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) + + assert com.is_numeric_v_string_like(np.array([1]), "foo") + assert com.is_numeric_v_string_like("foo", np.array([1])) + assert com.is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) + assert com.is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) + + +def test_is_datetimelike_v_numeric(): + dt = np.datetime64(datetime(2017, 1, 1)) + + assert not com.is_datetimelike_v_numeric(1, 1) + assert not com.is_datetimelike_v_numeric(dt, dt) + assert not com.is_datetimelike_v_numeric(np.array([1]), np.array([2])) + assert not com.is_datetimelike_v_numeric(np.array([dt]), np.array([dt])) + + assert com.is_datetimelike_v_numeric(1, dt) + assert com.is_datetimelike_v_numeric(1, dt) + assert com.is_datetimelike_v_numeric(np.array([dt]), 1) + assert com.is_datetimelike_v_numeric(np.array([1]), dt) + assert com.is_datetimelike_v_numeric(np.array([dt]), np.array([1])) + + +def test_needs_i8_conversion(): + assert not com.needs_i8_conversion(str) + assert not com.needs_i8_conversion(np.int64) + assert not com.needs_i8_conversion(pd.Series([1, 2])) + assert not com.needs_i8_conversion(np.array(["a", "b"])) + + assert com.needs_i8_conversion(np.datetime64) + assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) + assert com.needs_i8_conversion(pd.DatetimeIndex(["2000"], 
tz="US/Eastern")) + + +def test_is_numeric_dtype(): + assert not com.is_numeric_dtype(str) + assert not com.is_numeric_dtype(np.datetime64) + assert not com.is_numeric_dtype(np.timedelta64) + assert not com.is_numeric_dtype(np.array(["a", "b"])) + assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64)) + + assert com.is_numeric_dtype(int) + assert com.is_numeric_dtype(float) + assert com.is_numeric_dtype(np.uint64) + assert com.is_numeric_dtype(pd.Series([1, 2])) + assert com.is_numeric_dtype(pd.Index([1, 2.0])) + + +def test_is_string_like_dtype(): + assert not com.is_string_like_dtype(object) + assert not com.is_string_like_dtype(pd.Series([1, 2])) + + assert com.is_string_like_dtype(str) + assert com.is_string_like_dtype(np.array(["a", "b"])) + + +def test_is_float_dtype(): + assert not com.is_float_dtype(str) + assert not com.is_float_dtype(int) + assert not com.is_float_dtype(pd.Series([1, 2])) + assert not com.is_float_dtype(np.array(["a", "b"])) + + assert com.is_float_dtype(float) + assert com.is_float_dtype(pd.Index([1, 2.0])) + + +def test_is_bool_dtype(): + assert not com.is_bool_dtype(int) + assert not com.is_bool_dtype(str) + assert not com.is_bool_dtype(pd.Series([1, 2])) + assert not com.is_bool_dtype(np.array(["a", "b"])) + assert not com.is_bool_dtype(pd.Index(["a", "b"])) + + assert com.is_bool_dtype(bool) + assert com.is_bool_dtype(np.bool) + assert com.is_bool_dtype(np.array([True, False])) + assert com.is_bool_dtype(pd.Index([True, False])) + + assert com.is_bool_dtype(pd.BooleanDtype()) + assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + + +@pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) +def test_is_extension_type(check_scipy): + assert not com.is_extension_type([1, 2, 3]) + assert not com.is_extension_type(np.array([1, 2, 3])) + assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3])) + + cat = pd.Categorical([1, 2, 3]) + assert com.is_extension_type(cat) + assert com.is_extension_type(pd.Series(cat)) + assert com.is_extension_type(SparseArray([1, 2, 3])) + assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + dtype = DatetimeTZDtype("ns", tz="US/Eastern") + s = pd.Series([], dtype=dtype) + assert com.is_extension_type(s) + + if check_scipy: + import scipy.sparse + + assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) + + +def test_is_extension_type_deprecation(): + with tm.assert_produces_warning(FutureWarning): + com.is_extension_type([1, 2, 3]) + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) +def test_is_extension_array_dtype(check_scipy): + assert not com.is_extension_array_dtype([1, 2, 3]) + assert not com.is_extension_array_dtype(np.array([1, 2, 3])) + assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3])) + + cat = pd.Categorical([1, 2, 3]) + assert com.is_extension_array_dtype(cat) + assert com.is_extension_array_dtype(pd.Series(cat)) + assert com.is_extension_array_dtype(SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + dtype = DatetimeTZDtype("ns", tz="US/Eastern") + s = pd.Series([], dtype=dtype) + assert com.is_extension_array_dtype(s) + + if check_scipy: + import scipy.sparse + + assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3])) + + +def test_is_complex_dtype(): + assert not 
com.is_complex_dtype(int) + assert not com.is_complex_dtype(str) + assert not com.is_complex_dtype(pd.Series([1, 2])) + assert not com.is_complex_dtype(np.array(["a", "b"])) + + assert com.is_complex_dtype(np.complex) + assert com.is_complex_dtype(np.array([1 + 1j, 5])) + + +@pytest.mark.parametrize( + "input_param,result", + [ + (int, np.dtype(int)), + ("int32", np.dtype("int32")), + (float, np.dtype(float)), + ("float64", np.dtype("float64")), + (np.dtype("float64"), np.dtype("float64")), + (str, np.dtype(str)), + (pd.Series([1, 2], dtype=np.dtype("int16")), np.dtype("int16")), + (pd.Series(["a", "b"]), np.dtype(object)), + (pd.Index([1, 2]), np.dtype("int64")), + (pd.Index(["a", "b"]), np.dtype(object)), + ("category", "category"), + (pd.Categorical(["a", "b"]).dtype, CategoricalDtype(["a", "b"])), + (pd.Categorical(["a", "b"]), CategoricalDtype(["a", "b"])), + (pd.CategoricalIndex(["a", "b"]).dtype, CategoricalDtype(["a", "b"])), + (pd.CategoricalIndex(["a", "b"]), CategoricalDtype(["a", "b"])), + (CategoricalDtype(), CategoricalDtype()), + (CategoricalDtype(["a", "b"]), CategoricalDtype()), + (pd.DatetimeIndex([1, 2]), np.dtype("=M8[ns]")), + (pd.DatetimeIndex([1, 2]).dtype, np.dtype("=M8[ns]")), + (" df.two.sum() + + with catch_warnings(record=True) as w: + # successfully modify column in place + # this should not raise a warning + df.one += 1 + assert len(w) == 0 + assert df.one.iloc[0] == 2 + + with catch_warnings(record=True) as w: + # successfully add an attribute to a series + # this should not raise a warning + df.two.not_an_index = [1, 2] + assert len(w) == 0 + + with tm.assert_produces_warning(UserWarning): + # warn when setting column to nonexistent name + df.four = df.two + 2 + assert df.four.sum() > df.two.sum() diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/test_inference.py b/venv/Lib/site-packages/pandas/tests/dtypes/test_inference.py new file mode 100644 index 0000000..5eb85de --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/test_inference.py @@ -0,0 +1,1460 @@ +""" +These the test the public routines exposed in types/common.py +related to inference and not otherwise tested in types/test_common.py + +""" +import collections +from collections import namedtuple +from datetime import date, datetime, time, timedelta +from decimal import Decimal +from fractions import Fraction +from io import StringIO +from numbers import Number +import re + +import numpy as np +import pytest +import pytz + +from pandas._libs import iNaT, lib, missing as libmissing +import pandas.util._test_decorators as td + +from pandas.core.dtypes import inference +from pandas.core.dtypes.common import ( + ensure_categorical, + ensure_int32, + is_bool, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_number, + is_scalar, + is_scipy_sparse, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, +) + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DateOffset, + DatetimeIndex, + Index, + Interval, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) +import pandas._testing as tm +from pandas.core.arrays import IntegerArray + + +@pytest.fixture(params=[True, False], ids=str) +def coerce(request): + return request.param + + +# collect all objects to be tested for list-like-ness; use tuples of objects, +# whether they are list-like or not (special casing for sets), and their ID +ll_params = [ + ([1], True, "list"), + ([], True, "list-empty"), + ((1,), True, 
"tuple"), + (tuple(), True, "tuple-empty"), + ({"a": 1}, True, "dict"), + (dict(), True, "dict-empty"), + ({"a", 1}, "set", "set"), + (set(), "set", "set-empty"), + (frozenset({"a", 1}), "set", "frozenset"), + (frozenset(), "set", "frozenset-empty"), + (iter([1, 2]), True, "iterator"), + (iter([]), True, "iterator-empty"), + ((x for x in [1, 2]), True, "generator"), + ((_ for _ in []), True, "generator-empty"), + (Series([1]), True, "Series"), + (Series([], dtype=object), True, "Series-empty"), + (Series(["a"]).str, True, "StringMethods"), + (Series([], dtype="O").str, True, "StringMethods-empty"), + (Index([1]), True, "Index"), + (Index([]), True, "Index-empty"), + (DataFrame([[1]]), True, "DataFrame"), + (DataFrame(), True, "DataFrame-empty"), + (np.ndarray((2,) * 1), True, "ndarray-1d"), + (np.array([]), True, "ndarray-1d-empty"), + (np.ndarray((2,) * 2), True, "ndarray-2d"), + (np.array([[]]), True, "ndarray-2d-empty"), + (np.ndarray((2,) * 3), True, "ndarray-3d"), + (np.array([[[]]]), True, "ndarray-3d-empty"), + (np.ndarray((2,) * 4), True, "ndarray-4d"), + (np.array([[[[]]]]), True, "ndarray-4d-empty"), + (np.array(2), False, "ndarray-0d"), + (1, False, "int"), + (b"123", False, "bytes"), + (b"", False, "bytes-empty"), + ("123", False, "string"), + ("", False, "string-empty"), + (str, False, "string-type"), + (object(), False, "object"), + (np.nan, False, "NaN"), + (None, False, "None"), +] +objs, expected, ids = zip(*ll_params) + + +@pytest.fixture(params=zip(objs, expected), ids=ids) +def maybe_list_like(request): + return request.param + + +def test_is_list_like(maybe_list_like): + obj, expected = maybe_list_like + expected = True if expected == "set" else expected + assert inference.is_list_like(obj) == expected + + +def test_is_list_like_disallow_sets(maybe_list_like): + obj, expected = maybe_list_like + expected = False if expected == "set" else expected + assert inference.is_list_like(obj, allow_sets=False) == expected + + +def test_is_sequence(): + is_seq = inference.is_sequence + assert is_seq((1, 2)) + assert is_seq([1, 2]) + assert not is_seq("abcd") + assert not is_seq(np.int64) + + class A: + def __getitem__(self): + return 1 + + assert not is_seq(A()) + + +def test_is_array_like(): + assert inference.is_array_like(Series([], dtype=object)) + assert inference.is_array_like(Series([1, 2])) + assert inference.is_array_like(np.array(["a", "b"])) + assert inference.is_array_like(Index(["2016-01-01"])) + + class DtypeList(list): + dtype = "special" + + assert inference.is_array_like(DtypeList()) + + assert not inference.is_array_like([1, 2, 3]) + assert not inference.is_array_like(tuple()) + assert not inference.is_array_like("foo") + assert not inference.is_array_like(123) + + +@pytest.mark.parametrize( + "inner", + [ + [], + [1], + (1,), + (1, 2), + {"a": 1}, + {1, "a"}, + Series([1]), + Series([], dtype=object), + Series(["a"]).str, + (x for x in range(5)), + ], +) +@pytest.mark.parametrize("outer", [list, Series, np.array, tuple]) +def test_is_nested_list_like_passes(inner, outer): + result = outer([inner for _ in range(5)]) + assert inference.is_list_like(result) + + +@pytest.mark.parametrize( + "obj", + [ + "abc", + [], + [1], + (1,), + ["a"], + "a", + {"a"}, + [1, 2, 3], + Series([1]), + DataFrame({"A": [1]}), + ([1, 2] for _ in range(5)), + ], +) +def test_is_nested_list_like_fails(obj): + assert not inference.is_nested_list_like(obj) + + +@pytest.mark.parametrize("ll", [{}, {"A": 1}, Series([1]), collections.defaultdict()]) +def test_is_dict_like_passes(ll): + 
assert inference.is_dict_like(ll) + + +@pytest.mark.parametrize( + "ll", + [ + "1", + 1, + [1, 2], + (1, 2), + range(2), + Index([1]), + dict, + collections.defaultdict, + Series, + ], +) +def test_is_dict_like_fails(ll): + assert not inference.is_dict_like(ll) + + +@pytest.mark.parametrize("has_keys", [True, False]) +@pytest.mark.parametrize("has_getitem", [True, False]) +@pytest.mark.parametrize("has_contains", [True, False]) +def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains): + class DictLike: + def __init__(self, d): + self.d = d + + if has_keys: + + def keys(self): + return self.d.keys() + + if has_getitem: + + def __getitem__(self, key): + return self.d.__getitem__(key) + + if has_contains: + + def __contains__(self, key) -> bool: + return self.d.__contains__(key) + + d = DictLike({1: 2}) + result = inference.is_dict_like(d) + expected = has_keys and has_getitem and has_contains + + assert result is expected + + +def test_is_file_like(): + class MockFile: + pass + + is_file = inference.is_file_like + + data = StringIO("data") + assert is_file(data) + + # No read / write attributes + # No iterator attributes + m = MockFile() + assert not is_file(m) + + MockFile.write = lambda self: 0 + + # Write attribute but not an iterator + m = MockFile() + assert not is_file(m) + + # gh-16530: Valid iterator just means we have the + # __iter__ attribute for our purposes. + MockFile.__iter__ = lambda self: self + + # Valid write-only file + m = MockFile() + assert is_file(m) + + del MockFile.write + MockFile.read = lambda self: 0 + + # Valid read-only file + m = MockFile() + assert is_file(m) + + # Iterator but no read / write attributes + data = [1, 2, 3] + assert not is_file(data) + + +test_tuple = collections.namedtuple("Test", ["a", "b", "c"]) + + +@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)]) +def test_is_names_tuple_passes(ll): + assert inference.is_named_tuple(ll) + + +@pytest.mark.parametrize("ll", [(1, 2, 3), "a", Series({"pi": 3.14})]) +def test_is_names_tuple_fails(ll): + assert not inference.is_named_tuple(ll) + + +def test_is_hashable(): + + # all new-style classes are hashable by default + class HashableClass: + pass + + class UnhashableClass1: + __hash__ = None + + class UnhashableClass2: + def __hash__(self): + raise TypeError("Not hashable") + + hashable = (1, 3.14, np.float64(3.14), "a", tuple(), (1,), HashableClass()) + not_hashable = ([], UnhashableClass1()) + abc_hashable_not_really_hashable = (([],), UnhashableClass2()) + + for i in hashable: + assert inference.is_hashable(i) + for i in not_hashable: + assert not inference.is_hashable(i) + for i in abc_hashable_not_really_hashable: + assert not inference.is_hashable(i) + + # numpy.array is no longer collections.abc.Hashable as of + # https://github.com/numpy/numpy/pull/5326, just test + # is_hashable() + assert not inference.is_hashable(np.array([])) + + +@pytest.mark.parametrize("ll", [re.compile("ad")]) +def test_is_re_passes(ll): + assert inference.is_re(ll) + + +@pytest.mark.parametrize("ll", ["x", 2, 3, object()]) +def test_is_re_fails(ll): + assert not inference.is_re(ll) + + +@pytest.mark.parametrize( + "ll", [r"a", "x", r"asdf", re.compile("adsf"), r"\u2233\s*", re.compile(r"")] +) +def test_is_recompilable_passes(ll): + assert inference.is_re_compilable(ll) + + +@pytest.mark.parametrize("ll", [1, [], object()]) +def test_is_recompilable_fails(ll): + assert not inference.is_re_compilable(ll) + + +class TestInference: + def test_infer_dtype_bytes(self): + compare = "bytes" + + # string array 
of bytes + arr = np.array(list("abc"), dtype="S1") + assert lib.infer_dtype(arr, skipna=True) == compare + + # object array of bytes + arr = arr.astype(object) + assert lib.infer_dtype(arr, skipna=True) == compare + + # object array of bytes with missing values + assert lib.infer_dtype([b"a", np.nan, b"c"], skipna=True) == compare + + def test_isinf_scalar(self): + # GH 11352 + assert libmissing.isposinf_scalar(float("inf")) + assert libmissing.isposinf_scalar(np.inf) + assert not libmissing.isposinf_scalar(-np.inf) + assert not libmissing.isposinf_scalar(1) + assert not libmissing.isposinf_scalar("a") + + assert libmissing.isneginf_scalar(float("-inf")) + assert libmissing.isneginf_scalar(-np.inf) + assert not libmissing.isneginf_scalar(np.inf) + assert not libmissing.isneginf_scalar(1) + assert not libmissing.isneginf_scalar("a") + + @pytest.mark.parametrize("maybe_int", [True, False]) + @pytest.mark.parametrize( + "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] + ) + def test_maybe_convert_numeric_infinities(self, infinity, maybe_int): + # see gh-13274 + na_values = {"", "NULL", "nan"} + + pos = np.array(["inf"], dtype=np.float64) + neg = np.array(["-inf"], dtype=np.float64) + + msg = "Unable to parse string" + + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), na_values, maybe_int + ) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(["-" + infinity], dtype=object), na_values, maybe_int + ) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), na_values, maybe_int + ) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(["+" + infinity], dtype=object), na_values, maybe_int + ) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with pytest.raises(ValueError, match=msg): + lib.maybe_convert_numeric( + np.array(["foo_" + infinity], dtype=object), na_values, maybe_int + ) + + def test_maybe_convert_numeric_post_floatify_nan(self, coerce): + # see gh-13314 + data = np.array(["1.200", "-999.000", "4.500"], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = {-999, -999.0} + + out = lib.maybe_convert_numeric(data, nan_values, coerce) + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(["inf", "inf", "inf"], dtype="O") + result = lib.maybe_convert_numeric(arr, set(), False) + assert result.dtype == np.float64 + + arr = np.array(["-inf", "-inf", "-inf"], dtype="O") + result = lib.maybe_convert_numeric(arr, set(), False) + assert result.dtype == np.float64 + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(["42E", "2E", "99e", "6e"], dtype="O") + result = lib.maybe_convert_numeric(arr, set(), False, True) + assert np.all(np.isnan(result)) + + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object) + result = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + + def test_convert_numeric_uint64(self): + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + arr = np.array([str(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + arr = 
np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + @pytest.mark.parametrize( + "arr", + [ + np.array([2 ** 63, np.nan], dtype=object), + np.array([str(2 ** 63), np.nan], dtype=object), + np.array([np.nan, 2 ** 63], dtype=object), + np.array([np.nan, str(2 ** 63)], dtype=object), + ], + ) + def test_convert_numeric_uint64_nan(self, coerce, arr): + expected = arr.astype(float) if coerce else arr.copy() + result = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + def test_convert_numeric_uint64_nan_values(self, coerce): + arr = np.array([2 ** 63, 2 ** 63 + 1], dtype=object) + na_values = {2 ** 63} + + expected = ( + np.array([np.nan, 2 ** 63 + 1], dtype=float) if coerce else arr.copy() + ) + result = lib.maybe_convert_numeric(arr, na_values, coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "case", + [ + np.array([2 ** 63, -1], dtype=object), + np.array([str(2 ** 63), -1], dtype=object), + np.array([str(2 ** 63), str(-1)], dtype=object), + np.array([-1, 2 ** 63], dtype=object), + np.array([-1, str(2 ** 63)], dtype=object), + np.array([str(-1), str(2 ** 63)], dtype=object), + ], + ) + def test_convert_numeric_int64_uint64(self, case, coerce): + expected = case.astype(float) if coerce else case.copy() + result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) + def test_convert_int_overflow(self, value): + # see gh-18584 + arr = np.array([value], dtype=object) + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(arr, result) + + def test_maybe_convert_objects_uint64(self): + # see gh-4471 + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + # NumPy bug: can't compare uint64 to int64, as that + # results in both casting to float64, so we should + # make sure that this function is robust against it + arr = np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + arr = np.array([2, -1], dtype=object) + exp = np.array([2, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + arr = np.array([2 ** 63, -1], dtype=object) + exp = np.array([2 ** 63, -1], dtype=object) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + def test_maybe_convert_objects_datetime(self): + # GH27438 + arr = np.array( + [np.datetime64("2000-01-01"), np.timedelta64(1, "s")], dtype=object + ) + exp = arr.copy() + out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([pd.NaT, np.timedelta64(1, "s")], dtype=object) + exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[ns]") + out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([np.timedelta64(1, "s"), np.nan], dtype=object) + exp = arr.copy() + out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) + tm.assert_numpy_array_equal(out, exp) + + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + 
IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_objects_nullable_integer(self, exp): + # GH27335 + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1) + + tm.assert_extension_array_equal(result, exp) + + def test_mixed_dtypes_remain_object_array(self): + # GH14956 + array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) + result = lib.maybe_convert_objects(array, convert_datetime=1) + tm.assert_numpy_array_equal(result, array) + + +class TestTypeInference: + + # Dummy class used for testing with Python objects + class Dummy: + pass + + def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): + # see pandas/conftest.py + inferred_dtype, values = any_skipna_inferred_dtype + + # make sure the inferred dtype of the fixture is as requested + assert inferred_dtype == lib.infer_dtype(values, skipna=True) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_length_zero(self, skipna): + result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna) + assert result == "integer" + + result = lib.infer_dtype([], skipna=skipna) + assert result == "empty" + + # GH 18004 + arr = np.array([np.array([], dtype=object), np.array([], dtype=object)]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "empty" + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "integer" + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed-integer" + + arr = np.array([1, 2, 3, 4, 5], dtype="i4") + result = lib.infer_dtype(arr, skipna=True) + assert result == "integer" + + @pytest.mark.parametrize( + "arr, skipna", + [ + (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), False), + (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), True), + (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), False), + (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), True), + ], + ) + def test_integer_na(self, arr, skipna): + # GH 27392 + result = lib.infer_dtype(arr, skipna=skipna) + expected = "integer" if skipna else "integer-na" + assert result == expected + + def test_infer_dtype_skipna_default(self): + # infer_dtype `skipna` default deprecated in GH#24050, + # changed to True in GH#29876 + arr = np.array([1, 2, 3, np.nan], dtype=object) + + result = lib.infer_dtype(arr) + assert result == "integer" + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([np.bool_(True), np.bool_(False)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([True, False, True, "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([True, np.nan, False], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + result = lib.infer_dtype(arr, skipna=False) + assert result == "mixed" + + def test_floats(self): + arr = np.array([1.0, 2.0, 3.0, np.float64(4), np.float32(5)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + arr = np.array([1, 2, 3, 
np.float64(4), np.float32(5), "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed-integer" + + arr = np.array([1, 2, 3, 4, 5], dtype="f4") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + arr = np.array([1, 2, 3, 4, 5], dtype="f8") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + def test_decimals(self): + # GH15690 + arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + arr = np.array([1.0, 2.0, Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + + arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + # complex is compatible with nan, so skipna has no effect + @pytest.mark.parametrize("skipna", [True, False]) + def test_complex(self, skipna): + # gets cast to complex on array construction + arr = np.array([1.0, 2.0, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + arr = np.array([1.0, 2.0, 1 + 1j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "mixed" + + # gets cast to complex on array construction + arr = np.array([1, np.nan, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + arr = np.array([1.0, np.nan, 1 + 1j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "mixed" + + # complex with nans stays complex + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + # test smaller complex dtype; will pass through _try_infer_map fastpath + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + def test_string(self): + pass + + def test_unicode(self): + arr = ["a", np.nan, "c"] + result = lib.infer_dtype(arr, skipna=False) + # This currently returns "mixed", but it's not clear that's optimal. 
+ # This could also return "string" or "mixed-string" + assert result == "mixed" + + arr = ["a", np.nan, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "string" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" + + @pytest.mark.parametrize( + "dtype, missing, skipna, expected", + [ + (float, np.nan, False, "floating"), + (float, np.nan, True, "floating"), + (object, np.nan, False, "floating"), + (object, np.nan, True, "empty"), + (object, None, False, "mixed"), + (object, None, True, "empty"), + ], + ) + @pytest.mark.parametrize("box", [pd.Series, np.array]) + def test_object_empty(self, box, missing, dtype, skipna, expected): + # GH 23421 + arr = box([missing, missing], dtype=dtype) + + result = lib.infer_dtype(arr, skipna=skipna) + assert result == expected + + def test_datetime(self): + + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + assert index.inferred_type == "datetime64" + + def test_infer_dtype_datetime(self): + + arr = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + arr = np.array( + [np.datetime64("2011-01-01"), np.datetime64("2011-01-01")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + arr = np.array([n, np.datetime64("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + arr = np.array([n, datetime(2011, 1, 1)]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + arr = np.array([n, pd.Timestamp("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + arr = np.array([n, np.datetime64("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + arr = np.array([n, datetime(2011, 1, 1), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + # different type of nat + arr = np.array( + [np.timedelta64("nat"), np.datetime64("2011-01-02")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array( + [np.datetime64("2011-01-02"), np.timedelta64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + # should be datetime? 
+ arr = np.array([np.datetime64("2011-01-01"), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" + + arr = np.array([pd.Timestamp("2011-01-02"), np.datetime64("2011-01-01")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" + + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed-integer" + + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1.1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" + + arr = np.array([np.nan, "2011-01-01", pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta("1 days"), pd.Timedelta("2 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([np.timedelta64(1, "D"), np.timedelta64(2, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([timedelta(1), timedelta(2)]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, Timedelta("1 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([n, np.timedelta64(1, "D")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([n, timedelta(1)]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([n, pd.Timedelta("1 days"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([n, np.timedelta64(1, "D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([n, timedelta(1), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + # different type of nat + arr = np.array([np.datetime64("nat"), np.timedelta64(1, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([np.timedelta64(1, "D"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="M")]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Period("2011-01", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + arr = np.array([n, pd.Period("2011-01", freq="D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + # different type of nat + arr = np.array( + [np.datetime64("nat"), pd.Period("2011-01", freq="M")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array( + [pd.Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + @pytest.mark.parametrize( + "data", + [ + [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)], + [Timestamp("20170612"), Timestamp("20170311")], + [ + Timestamp("20170612", tz="US/Eastern"), + Timestamp("20170311", tz="US/Eastern"), + ], + [date(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], + [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], + [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)], + ], + ) + def test_infer_datetimelike_array_datetime(self, data): + assert lib.infer_datetimelike_array(data) == "datetime" + + @pytest.mark.parametrize( + "data", + [ + 
[timedelta(2017, 6, 12), timedelta(2017, 3, 11)], + [timedelta(2017, 6, 12), date(2017, 3, 11)], + [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], + [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)], + ], + ) + def test_infer_datetimelike_array_timedelta(self, data): + assert lib.infer_datetimelike_array(data) == "timedelta" + + def test_infer_datetimelike_array_date(self): + arr = [date(2017, 6, 12), date(2017, 3, 11)] + assert lib.infer_datetimelike_array(arr) == "date" + + @pytest.mark.parametrize( + "data", + [ + ["2017-06-12", "2017-03-11"], + [20170612, 20170311], + [20170612.5, 20170311.8], + [Dummy(), Dummy()], + [Timestamp("20170612"), Timestamp("20170311", tz="US/Eastern")], + [Timestamp("20170612"), 20170311], + [timedelta(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], + ], + ) + def test_infer_datetimelike_array_mixed(self, data): + assert lib.infer_datetimelike_array(data) == "mixed" + + @pytest.mark.parametrize( + "first, expected", + [ + [[None], "mixed"], + [[np.nan], "mixed"], + [[pd.NaT], "nat"], + [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"], + [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], + [[date(2017, 6, 12), pd.NaT], "date"], + [[timedelta(2017, 6, 12), pd.NaT], "timedelta"], + [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"], + ], + ) + @pytest.mark.parametrize("second", [None, np.nan]) + def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): + first.append(second) + assert lib.infer_datetimelike_array(first) == expected + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + assert lib.infer_dtype(arr, skipna=True) == "floating" + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([None, np.nan, np.nan]) + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + # pd.NaT + arr = np.array([pd.NaT]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([pd.NaT, np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([np.nan, pd.NaT]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([np.nan, pd.NaT, np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([None, pd.NaT, None]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + # np.datetime64(nat) + arr = np.array([np.datetime64("nat")]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + arr = np.array([pd.NaT, n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + arr = np.array([np.timedelta64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + arr = np.array([pd.NaT, n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + # datetime / timedelta mixed + arr = np.array([pd.NaT, np.datetime64("nat"), np.timedelta64("nat"), np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([np.timedelta64("nat"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, 
skipna=False) == "mixed" + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64("nat")]) + assert lib.is_datetime_array(arr) + assert lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT, np.timedelta64("nat")]) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT, np.datetime64("nat"), np.timedelta64("nat")]) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT]) + assert lib.is_datetime_array(arr) + assert lib.is_datetime64_array(arr) + assert lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, np.nan], dtype=object) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + assert lib.is_datetime_with_singletz_array( + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="US/Eastern"), + ], + dtype=object, + ) + ) + assert not lib.is_datetime_with_singletz_array( + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="CET"), + ], + dtype=object, + ) + ) + + @pytest.mark.parametrize( + "func", + [ + "is_datetime_array", + "is_datetime64_array", + "is_bool_array", + "is_timedelta_or_timedelta64_array", + "is_date_array", + "is_time_array", + "is_interval_array", + "is_period_array", + ], + ) + def test_other_dtypes_for_array(self, func): + func = getattr(lib, func) + arr = np.array(["foo", "bar"]) + assert not func(arr) + + arr = np.array([1, 2]) + assert not func(arr) + + def test_date(self): + + dates = [date(2012, 1, day) for day in range(1, 20)] + index = Index(dates) + assert index.inferred_type == "date" + + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates, skipna=False) + assert result == "mixed" + + result = lib.infer_dtype(dates, skipna=True) + assert result == "date" + + def test_is_numeric_array(self): + + assert lib.is_float_array(np.array([1, 2.0])) + assert lib.is_float_array(np.array([1, 2.0, np.nan])) + assert not lib.is_float_array(np.array([1, 2])) + + assert lib.is_integer_array(np.array([1, 2])) + assert not lib.is_integer_array(np.array([1, 2.0])) + + def test_is_string_array(self): + + assert lib.is_string_array(np.array(["foo", "bar"])) + assert not lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False + ) + assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # NaN is not valid for string array, just NA + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=True + ) + + assert not lib.is_string_array(np.array([1, 2])) + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + lib.to_object_array_tuples(values) + + # make sure record array works + record = namedtuple("record", "x y") + r = record(5, 6) + values = [r] + lib.to_object_array_tuples(values) + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype="O") + result = lib.infer_dtype(arr, skipna=False) + assert result == "mixed" + result = lib.infer_dtype(arr, skipna=True) + assert result == "empty" + + def test_to_object_array_width(self): + # see 
gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array( + [[1, 2, 3, None, None], [4, 5, 6, None, None]], dtype=object + ) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + + def test_is_period(self): + assert lib.is_period(pd.Period("2011-01", freq="M")) + assert not lib.is_period(pd.PeriodIndex(["2011-01"], freq="M")) + assert not lib.is_period(pd.Timestamp("2011-01")) + assert not lib.is_period(1) + assert not lib.is_period(np.nan) + + def test_categorical(self): + + # GH 8974 + arr = Categorical(list("abc")) + result = lib.infer_dtype(arr, skipna=True) + assert result == "categorical" + + result = lib.infer_dtype(Series(arr), skipna=True) + assert result == "categorical" + + arr = Categorical(list("abc"), categories=["cegfab"], ordered=True) + result = lib.infer_dtype(arr, skipna=True) + assert result == "categorical" + + result = lib.infer_dtype(Series(arr), skipna=True) + assert result == "categorical" + + def test_interval(self): + idx = pd.IntervalIndex.from_breaks(range(5), closed="both") + inferred = lib.infer_dtype(idx, skipna=False) + assert inferred == "interval" + + inferred = lib.infer_dtype(idx._data, skipna=False) + assert inferred == "interval" + + inferred = lib.infer_dtype(pd.Series(idx), skipna=False) + assert inferred == "interval" + + +class TestNumberScalar: + def test_is_number(self): + + assert is_number(True) + assert is_number(1) + assert is_number(1.1) + assert is_number(1 + 3j) + assert is_number(np.bool(False)) + assert is_number(np.int64(1)) + assert is_number(np.float64(1.1)) + assert is_number(np.complex128(1 + 3j)) + assert is_number(np.nan) + + assert not is_number(None) + assert not is_number("x") + assert not is_number(datetime(2011, 1, 1)) + assert not is_number(np.datetime64("2011-01-01")) + assert not is_number(Timestamp("2011-01-01")) + assert not is_number(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_number(timedelta(1000)) + assert not is_number(Timedelta("1 days")) + + # questionable + assert not is_number(np.bool_(False)) + assert is_number(np.timedelta64(1, "D")) + + def test_is_bool(self): + assert is_bool(True) + assert is_bool(np.bool(False)) + assert is_bool(np.bool_(False)) + + assert not is_bool(1) + assert not is_bool(1.1) + assert not is_bool(1 + 3j) + assert not is_bool(np.int64(1)) + assert not is_bool(np.float64(1.1)) + assert not is_bool(np.complex128(1 + 3j)) + assert not is_bool(np.nan) + assert not is_bool(None) + assert not is_bool("x") + assert not is_bool(datetime(2011, 1, 1)) + assert not is_bool(np.datetime64("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_bool(timedelta(1000)) + assert not is_bool(np.timedelta64(1, "D")) + assert not is_bool(Timedelta("1 days")) + + def test_is_integer(self): + assert is_integer(1) + assert is_integer(np.int64(1)) + + assert not is_integer(True) + assert not is_integer(1.1) + assert not is_integer(1 + 3j) + assert not is_integer(np.bool(False)) + assert not is_integer(np.bool_(False)) + assert not is_integer(np.float64(1.1)) + assert not is_integer(np.complex128(1 + 3j)) + assert not is_integer(np.nan) + assert not is_integer(None) + assert not is_integer("x") + assert not 
is_integer(datetime(2011, 1, 1)) + assert not is_integer(np.datetime64("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_integer(timedelta(1000)) + assert not is_integer(Timedelta("1 days")) + assert not is_integer(np.timedelta64(1, "D")) + + def test_is_float(self): + assert is_float(1.1) + assert is_float(np.float64(1.1)) + assert is_float(np.nan) + + assert not is_float(True) + assert not is_float(1) + assert not is_float(1 + 3j) + assert not is_float(np.bool(False)) + assert not is_float(np.bool_(False)) + assert not is_float(np.int64(1)) + assert not is_float(np.complex128(1 + 3j)) + assert not is_float(None) + assert not is_float("x") + assert not is_float(datetime(2011, 1, 1)) + assert not is_float(np.datetime64("2011-01-01")) + assert not is_float(Timestamp("2011-01-01")) + assert not is_float(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_float(timedelta(1000)) + assert not is_float(np.timedelta64(1, "D")) + assert not is_float(Timedelta("1 days")) + + def test_is_datetime_dtypes(self): + + ts = pd.date_range("20130101", periods=3) + tsa = pd.date_range("20130101", periods=3, tz="US/Eastern") + + assert is_datetime64_dtype("datetime64") + assert is_datetime64_dtype("datetime64[ns]") + assert is_datetime64_dtype(ts) + assert not is_datetime64_dtype(tsa) + + assert not is_datetime64_ns_dtype("datetime64") + assert is_datetime64_ns_dtype("datetime64[ns]") + assert is_datetime64_ns_dtype(ts) + assert is_datetime64_ns_dtype(tsa) + + assert is_datetime64_any_dtype("datetime64") + assert is_datetime64_any_dtype("datetime64[ns]") + assert is_datetime64_any_dtype(ts) + assert is_datetime64_any_dtype(tsa) + + assert not is_datetime64tz_dtype("datetime64") + assert not is_datetime64tz_dtype("datetime64[ns]") + assert not is_datetime64tz_dtype(ts) + assert is_datetime64tz_dtype(tsa) + + for tz in ["US/Eastern", "UTC"]: + dtype = f"datetime64[ns, {tz}]" + assert not is_datetime64_dtype(dtype) + assert is_datetime64tz_dtype(dtype) + assert is_datetime64_ns_dtype(dtype) + assert is_datetime64_any_dtype(dtype) + + def test_is_timedelta(self): + assert is_timedelta64_dtype("timedelta64") + assert is_timedelta64_dtype("timedelta64[ns]") + assert not is_timedelta64_ns_dtype("timedelta64") + assert is_timedelta64_ns_dtype("timedelta64[ns]") + + tdi = TimedeltaIndex([1e14, 2e14], dtype="timedelta64[ns]") + assert is_timedelta64_dtype(tdi) + assert is_timedelta64_ns_dtype(tdi) + assert is_timedelta64_ns_dtype(tdi.astype("timedelta64[ns]")) + + # Conversion to Int64Index: + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64")) + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64[h]")) + + +class TestIsScalar: + def test_is_scalar_builtin_scalars(self): + assert is_scalar(None) + assert is_scalar(True) + assert is_scalar(False) + assert is_scalar(Number()) + assert is_scalar(Fraction()) + assert is_scalar(0.0) + assert is_scalar(np.nan) + assert is_scalar("foobar") + assert is_scalar(b"foobar") + assert is_scalar(datetime(2014, 1, 1)) + assert is_scalar(date(2014, 1, 1)) + assert is_scalar(time(12, 0)) + assert is_scalar(timedelta(hours=1)) + assert is_scalar(pd.NaT) + + def test_is_scalar_builtin_nonscalars(self): + assert not is_scalar({}) + assert not is_scalar([]) + assert not is_scalar([1]) + assert not is_scalar(()) + assert not is_scalar((1,)) + assert not is_scalar(slice(None)) + assert not is_scalar(Ellipsis) + + def test_is_scalar_numpy_array_scalars(self): + assert 
is_scalar(np.int64(1)) + assert is_scalar(np.float64(1.0)) + assert is_scalar(np.int32(1)) + assert is_scalar(np.object_("foobar")) + assert is_scalar(np.str_("foobar")) + assert is_scalar(np.unicode_("foobar")) + assert is_scalar(np.bytes_(b"foobar")) + assert is_scalar(np.datetime64("2014-01-01")) + assert is_scalar(np.timedelta64(1, "h")) + + def test_is_scalar_numpy_zerodim_arrays(self): + for zerodim in [ + np.array(1), + np.array("foobar"), + np.array(np.datetime64("2014-01-01")), + np.array(np.timedelta64(1, "h")), + np.array(np.datetime64("NaT")), + ]: + assert not is_scalar(zerodim) + assert is_scalar(lib.item_from_zerodim(zerodim)) + + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") + def test_is_scalar_numpy_arrays(self): + assert not is_scalar(np.array([])) + assert not is_scalar(np.array([[]])) + assert not is_scalar(np.matrix("1; 2")) + + def test_is_scalar_pandas_scalars(self): + assert is_scalar(Timestamp("2014-01-01")) + assert is_scalar(Timedelta(hours=1)) + assert is_scalar(Period("2014-01-01")) + assert is_scalar(Interval(left=0, right=1)) + assert is_scalar(DateOffset(days=1)) + + def test_is_scalar_pandas_containers(self): + assert not is_scalar(Series(dtype=object)) + assert not is_scalar(Series([1])) + assert not is_scalar(DataFrame()) + assert not is_scalar(DataFrame([[1]])) + assert not is_scalar(Index([])) + assert not is_scalar(Index([1])) + + +def test_datetimeindex_from_empty_datetime64_array(): + for unit in ["ms", "us", "ns"]: + idx = DatetimeIndex(np.array([], dtype=f"datetime64[{unit}]")) + assert len(idx) == 0 + + +def test_nan_to_nat_conversions(): + + df = DataFrame( + dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) + ) + df.iloc[3:6, :] = np.nan + result = df.loc[4, "B"].value + assert result == iNaT + + s = df["B"].copy() + s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) + assert isna(s[8]) + + assert s[8].value == np.datetime64("NaT").astype(np.int64) + + +@td.skip_if_no_scipy +@pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") +def test_is_scipy_sparse(spmatrix): # noqa: F811 + assert is_scipy_sparse(spmatrix([[0, 1]])) + assert not is_scipy_sparse(np.array([1])) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = ensure_int32(values) + assert result.dtype == np.int32 + + values = np.arange(10, dtype=np.int64) + result = ensure_int32(values) + assert result.dtype == np.int32 + + +def test_ensure_categorical(): + values = np.arange(10, dtype=np.int32) + result = ensure_categorical(values) + assert result.dtype == "category" + + values = Categorical(values) + result = ensure_categorical(values) + tm.assert_categorical_equal(result, values) diff --git a/venv/Lib/site-packages/pandas/tests/dtypes/test_missing.py b/venv/Lib/site-packages/pandas/tests/dtypes/test_missing.py new file mode 100644 index 0000000..7ba5978 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/dtypes/test_missing.py @@ -0,0 +1,586 @@ +from datetime import datetime +from decimal import Decimal + +import numpy as np +import pytest + +from pandas._config import config as cf + +from pandas._libs import missing as libmissing +from pandas._libs.tslibs import iNaT, is_null_datetimelike + +from pandas.core.dtypes.common import is_scalar +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype +from pandas.core.dtypes.missing import ( + array_equivalent, + isna, + isnull, + na_value_for_dtype, + notna, + notnull, +) + +import pandas as pd +from 
pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range +import pandas._testing as tm + +now = pd.Timestamp.now() +utcnow = pd.Timestamp.now("UTC") + + +@pytest.mark.parametrize("notna_f", [notna, notnull]) +def test_notna_notnull(notna_f): + assert notna_f(1.0) + assert not notna_f(None) + assert not notna_f(np.NaN) + + with cf.option_context("mode.use_inf_as_na", False): + assert notna_f(np.inf) + assert notna_f(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notna_f(arr) + assert result.all() + + with cf.option_context("mode.use_inf_as_na", True): + assert not notna_f(np.inf) + assert not notna_f(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notna_f(arr) + assert result.sum() == 2 + + with cf.option_context("mode.use_inf_as_na", False): + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: + assert isinstance(notna_f(s), Series) + + +class TestIsNA: + def test_0d_array(self): + assert isna(np.array(np.nan)) + assert not isna(np.array(0.0)) + assert not isna(np.array(0)) + # test object dtype + assert isna(np.array(np.nan, dtype=object)) + assert not isna(np.array(0.0, dtype=object)) + assert not isna(np.array(0, dtype=object)) + + def test_empty_object(self): + + for shape in [(4, 0), (4,)]: + arr = np.empty(shape=shape, dtype=object) + result = isna(arr) + expected = np.ones(shape=shape, dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("isna_f", [isna, isnull]) + def test_isna_isnull(self, isna_f): + assert not isna_f(1.0) + assert isna_f(None) + assert isna_f(np.NaN) + assert float("nan") + assert not isna_f(np.inf) + assert not isna_f(-np.inf) + + # type + assert not isna_f(type(pd.Series(dtype=object))) + assert not isna_f(type(pd.Series(dtype=np.float64))) + assert not isna_f(type(pd.DataFrame())) + + # series + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: + assert isinstance(isna_f(s), Series) + + # frame + for df in [ + tm.makeTimeDataFrame(), + tm.makePeriodFrame(), + tm.makeMixedDataFrame(), + ]: + result = isna_f(df) + expected = df.apply(isna_f) + tm.assert_frame_equal(result, expected) + + def test_isna_lists(self): + result = isna([[False]]) + exp = np.array([[False]]) + tm.assert_numpy_array_equal(result, exp) + + result = isna([[1], [2]]) + exp = np.array([[False], [False]]) + tm.assert_numpy_array_equal(result, exp) + + # list of strings / unicode + result = isna(["foo", "bar"]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = isna(["foo", "bar"]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + # GH20675 + result = isna([np.NaN, "world"]) + exp = np.array([True, False]) + tm.assert_numpy_array_equal(result, exp) + + def test_isna_nat(self): + result = isna([NaT]) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + result = isna(np.array([NaT], dtype=object)) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + def test_isna_numpy_nat(self): + arr = np.array( + [ + NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + np.datetime64("NaT", "s"), + ] + ) + result = isna(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + def test_isna_datetime(self): + assert not isna(datetime.now()) + assert notna(datetime.now()) + + idx = 
date_range("1/1/1990", periods=20) + exp = np.ones(len(idx), dtype=bool) + tm.assert_numpy_array_equal(notna(idx), exp) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isna(idx) + assert mask[0] + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + # GH 9129 + pidx = idx.to_period(freq="M") + mask = isna(pidx) + assert mask[0] + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + mask = isna(pidx[1:]) + exp = np.zeros(len(mask), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + @pytest.mark.parametrize( + "value, expected", + [ + (np.complex128(np.nan), True), + (np.float64(1), False), + (np.array([1, 1 + 0j, np.nan, 3]), np.array([False, False, True, False])), + ( + np.array([1, 1 + 0j, np.nan, 3], dtype=object), + np.array([False, False, True, False]), + ), + ( + np.array([1, 1 + 0j, np.nan, 3]).astype(object), + np.array([False, False, True, False]), + ), + ], + ) + def test_complex(self, value, expected): + result = isna(value) + if is_scalar(result): + assert result is expected + else: + tm.assert_numpy_array_equal(result, expected) + + def test_datetime_other_units(self): + idx = pd.DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"]) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + tm.assert_numpy_array_equal(isna(idx.values), exp) + tm.assert_numpy_array_equal(notna(idx.values), ~exp) + + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(values), exp) + tm.assert_numpy_array_equal(notna(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + + def test_timedelta_other_units(self): + idx = pd.TimedeltaIndex(["1 days", "NaT", "2 days"]) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + tm.assert_numpy_array_equal(isna(idx.values), exp) + tm.assert_numpy_array_equal(notna(idx.values), ~exp) + + for dtype in [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ]: + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(values), exp) + tm.assert_numpy_array_equal(notna(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + + def test_period(self): + idx = pd.PeriodIndex(["2011-01", "NaT", "2012-01"], freq="M") + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(idx) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = pd.Series(idx, dtype=object) + tm.assert_series_equal(isna(s), exp) + 
tm.assert_series_equal(notna(s), ~exp) + + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan])) + assert array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan]) + ) + assert array_equivalent( + np.array([np.nan, None], dtype="object"), + np.array([np.nan, None], dtype="object"), + ) + # Check the handling of nested arrays in array_equivalent_object + assert array_equivalent( + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + ) + assert array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 1j], dtype="complex"), + ) + assert not array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 2j], dtype="complex"), + ) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]) + ) + assert not array_equivalent(np.array(["a", "b", "c", "d"]), np.array(["e", "e"])) + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + assert not array_equivalent( + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) + ) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) + + assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + + +@pytest.mark.parametrize( + "lvalue, rvalue", + [ + # There are 3 variants for each of lvalue and rvalue. We include all + # three for the tz-naive `now` and exclude the datetim64 variant + # for utcnow because it drops tzinfo. 
+ (now, utcnow), + (now.to_datetime64(), utcnow), + (now.to_pydatetime(), utcnow), + (now, utcnow), + (now.to_datetime64(), utcnow.to_pydatetime()), + (now.to_pydatetime(), utcnow.to_pydatetime()), + ], +) +def test_array_equivalent_tzawareness(lvalue, rvalue): + # we shouldn't raise if comparing tzaware and tznaive datetimes + left = np.array([lvalue], dtype=object) + right = np.array([rvalue], dtype=object) + + assert not array_equivalent(left, right, strict_nan=True) + assert not array_equivalent(left, right, strict_nan=False) + + +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + assert array_equivalent(m, n, strict_nan=True) + assert array_equivalent(m, n, strict_nan=False) + + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (4, 3)], dtype=[("a", int), ("b", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) + + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("b", int), ("a", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) + + +def test_array_equivalent_str(): + for dtype in ["O", "S", "U"]: + assert array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "B"], dtype=dtype) + ) + assert not array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "X"], dtype=dtype) + ) + + +def test_array_equivalent_nested(): + # reached in groupby aggregations, make sure we use np.any when checking + # if the comparison is truthy + left = np.array([np.array([50, 70, 90]), np.array([20, 30, 40])], dtype=object) + right = np.array([np.array([50, 70, 90]), np.array([20, 30, 40])], dtype=object) + + assert array_equivalent(left, right, strict_nan=True) + assert not array_equivalent(left, right[::-1], strict_nan=True) + + left = np.array([np.array([50, 50, 50]), np.array([40, 40, 40])], dtype=object) + right = np.array([50, 40]) + assert not array_equivalent(left, right, strict_nan=True) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + # Datetime-like + (np.dtype("M8[ns]"), NaT), + (np.dtype("m8[ns]"), NaT), + (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT), + (PeriodDtype("M"), NaT), + # Integer + ("u1", 0), + ("u2", 0), + ("u4", 0), + ("u8", 0), + ("i1", 0), + ("i2", 0), + ("i4", 0), + ("i8", 0), + # Bool + ("bool", False), + # Float + ("f2", np.nan), + ("f4", np.nan), + ("f8", np.nan), + # Object + ("O", np.nan), + # Interval + (IntervalDtype(), np.nan), + ], +) +def test_na_value_for_dtype(dtype, na_value): + result = na_value_for_dtype(dtype) + assert result is na_value + + +class TestNAObj: + + _1d_methods = ["isnaobj", "isnaobj_old"] + _2d_methods = ["isnaobj2d", "isnaobj2d_old"] + + def _check_behavior(self, arr, expected): + for method in TestNAObj._1d_methods: + result = getattr(libmissing, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + arr = np.atleast_2d(arr) + expected = np.atleast_2d(expected) + + for method in TestNAObj._2d_methods: + result = getattr(libmissing, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_basic(self): + arr = np.array([1, None, "foo", -5.1, pd.NaT, np.nan]) + expected = np.array([False, True, False, False, True, True]) + + self._check_behavior(arr, expected) + + def test_non_obj_dtype(self): + arr = 
np.array([1, 3, np.nan, 5], dtype=float) + expected = np.array([False, False, True, False]) + + self._check_behavior(arr, expected) + + def test_empty_arr(self): + arr = np.array([]) + expected = np.array([], dtype=bool) + + self._check_behavior(arr, expected) + + def test_empty_str_inp(self): + arr = np.array([""]) # empty but not na + expected = np.array([False]) + + self._check_behavior(arr, expected) + + def test_empty_like(self): + # see gh-13717: no segfaults! + arr = np.empty_like([None]) + expected = np.array([True]) + + self._check_behavior(arr, expected) + + +m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"] + +na_vals = ( + [ + None, + NaT, + float("NaN"), + complex("NaN"), + np.nan, + np.float64("NaN"), + np.float32("NaN"), + np.complex64(np.nan), + np.complex128(np.nan), + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + + [np.datetime64("NaT", unit) for unit in m8_units] + + [np.timedelta64("NaT", unit) for unit in m8_units] +) + +inf_vals = [ + float("inf"), + float("-inf"), + complex("inf"), + complex("-inf"), + np.inf, + np.NINF, +] + +int_na_vals = [ + # Values that match iNaT, which we treat as null in specific cases + np.int64(NaT.value), + int(NaT.value), +] + +sometimes_na_vals = [Decimal("NaN")] + +never_na_vals = [ + # float/complex values that when viewed as int64 match iNaT + -0.0, + np.float64("-0.0"), + -0j, + np.complex64(-0j), +] + + +class TestLibMissing: + def test_checknull(self): + for value in na_vals: + assert libmissing.checknull(value) + + for value in inf_vals: + assert not libmissing.checknull(value) + + for value in int_na_vals: + assert not libmissing.checknull(value) + + for value in sometimes_na_vals: + assert not libmissing.checknull(value) + + for value in never_na_vals: + assert not libmissing.checknull(value) + + def checknull_old(self): + for value in na_vals: + assert libmissing.checknull_old(value) + + for value in inf_vals: + assert libmissing.checknull_old(value) + + for value in int_na_vals: + assert not libmissing.checknull_old(value) + + for value in sometimes_na_vals: + assert not libmissing.checknull_old(value) + + for value in never_na_vals: + assert not libmissing.checknull_old(value) + + def test_is_null_datetimelike(self): + for value in na_vals: + assert is_null_datetimelike(value) + assert is_null_datetimelike(value, False) + + for value in inf_vals: + assert not is_null_datetimelike(value) + assert not is_null_datetimelike(value, False) + + for value in int_na_vals: + assert is_null_datetimelike(value) + assert not is_null_datetimelike(value, False) + + for value in sometimes_na_vals: + assert not is_null_datetimelike(value) + assert not is_null_datetimelike(value, False) + + for value in never_na_vals: + assert not is_null_datetimelike(value) diff --git a/venv/Lib/site-packages/pandas/tests/extension/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/extension/arrow/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/arrow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py b/venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py new file mode 100644 index 0000000..b0e5a6f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/arrow/arrays.py @@ -0,0 +1,190 @@ +"""Rudimentary Apache Arrow-backed ExtensionArray. + +At the moment, just a boolean array / type is implemented. 
+Eventually, we'll want to parametrize the type and support +multiple dtypes. Not all methods are implemented yet, and the +current implementation is not efficient. +""" +import copy +import itertools + +import numpy as np +import pyarrow as pa + +import pandas as pd +from pandas.api.extensions import ( + ExtensionArray, + ExtensionDtype, + register_extension_dtype, + take, +) + + +@register_extension_dtype +class ArrowBoolDtype(ExtensionDtype): + + type = np.bool_ + kind = "b" + name = "arrow_bool" + na_value = pa.NULL + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowBoolArray + + def _is_boolean(self): + return True + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + + type = str + kind = "U" + name = "arrow_string" + na_value = pa.NULL + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowStringArray + + +class ArrowExtensionArray(ExtensionArray): + @classmethod + def from_scalars(cls, values): + arr = pa.chunked_array([pa.array(np.asarray(values))]) + return cls(arr) + + @classmethod + def from_array(cls, arr): + assert isinstance(arr, pa.Array) + return cls(pa.chunked_array([arr])) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls.from_scalars(scalars) + + def __repr__(self): + return f"{type(self).__name__}({repr(self._data)})" + + def __getitem__(self, item): + if pd.api.types.is_scalar(item): + return self._data.to_pandas()[item] + else: + vals = self._data.to_pandas()[item] + return type(self).from_scalars(vals) + + def __len__(self): + return len(self._data) + + def astype(self, dtype, copy=True): + # needed to fix this astype for the Series constructor. 
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + return super().astype(dtype, copy) + + @property + def dtype(self): + return self._dtype + + @property + def nbytes(self): + return sum( + x.size + for chunk in self._data.chunks + for x in chunk.buffers() + if x is not None + ) + + def isna(self): + nas = pd.isna(self._data.to_pandas()) + return type(self).from_scalars(nas) + + def take(self, indices, allow_fill=False, fill_value=None): + data = self._data.to_pandas() + + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) + return self._from_sequence(result, dtype=self.dtype) + + def copy(self): + return type(self)(copy.copy(self._data)) + + @classmethod + def _concat_same_type(cls, to_concat): + chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) + arr = pa.chunked_array(chunks) + return cls(arr) + + def __invert__(self): + return type(self).from_scalars(~self._data.to_pandas()) + + def _reduce(self, method, skipna=True, **kwargs): + if skipna: + arr = self[~self.isna()] + else: + arr = self + + try: + op = getattr(arr, method) + except AttributeError: + raise TypeError + return op(**kwargs) + + def any(self, axis=0, out=None): + return self._data.to_pandas().any() + + def all(self, axis=0, out=None): + return self._data.to_pandas().all() + + +class ArrowBoolArray(ArrowExtensionArray): + def __init__(self, values): + if not isinstance(values, pa.ChunkedArray): + raise ValueError + + assert values.type == pa.bool_() + self._data = values + self._dtype = ArrowBoolDtype() + + +class ArrowStringArray(ArrowExtensionArray): + def __init__(self, values): + if not isinstance(values, pa.ChunkedArray): + raise ValueError + + assert values.type == pa.string() + self._data = values + self._dtype = ArrowStringDtype() diff --git a/venv/Lib/site-packages/pandas/tests/extension/arrow/test_bool.py b/venv/Lib/site-packages/pandas/tests/extension/arrow/test_bool.py new file mode 100644 index 0000000..94dd09d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/arrow/test_bool.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +pytest.importorskip("pyarrow", minversion="0.13.0") + +from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip + + +@pytest.fixture +def dtype(): + return ArrowBoolDtype() + + +@pytest.fixture +def data(): + values = np.random.randint(0, 2, size=100, dtype=bool) + values[1] = ~values[0] + return ArrowBoolArray.from_scalars(values) + + +@pytest.fixture +def data_missing(): + return ArrowBoolArray.from_scalars([None, True]) + + +class BaseArrowTests: + pass + + +class TestDtype(BaseArrowTests, base.BaseDtypeTests): + def test_array_type_with_arg(self, data, dtype): + pytest.skip("GH-22666") + + +class TestInterface(BaseArrowTests, base.BaseInterfaceTests): + def test_copy(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.copy() + + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + + +class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): + def test_from_dtype(self, data): + pytest.skip("GH-22666") + + # seems like some bug in isna on empty BoolArray returning floats. 
+    @pytest.mark.xfail(reason="bad is-na for empty data")
+    def test_from_sequence_from_cls(self, data):
+        super().test_from_sequence_from_cls(data)
+
+
+class TestReduce(base.BaseNoReduceTests):
+    def test_reduce_series_boolean(self):
+        pass
+
+
+class TestReduceBoolean(base.BaseBooleanReduceTests):
+    pass
+
+
+def test_is_bool_dtype(data):
+    assert pd.api.types.is_bool_dtype(data)
+    assert pd.core.common.is_bool_indexer(data)
+    s = pd.Series(range(len(data)))
+    result = s[data]
+    expected = s[np.asarray(data)]
+    tm.assert_series_equal(result, expected)
diff --git a/venv/Lib/site-packages/pandas/tests/extension/arrow/test_string.py b/venv/Lib/site-packages/pandas/tests/extension/arrow/test_string.py
new file mode 100644
index 0000000..abd5c1f
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/extension/arrow/test_string.py
@@ -0,0 +1,13 @@
+import pytest
+
+import pandas as pd
+
+pytest.importorskip("pyarrow", minversion="0.13.0")
+
+from .arrays import ArrowStringDtype  # isort:skip
+
+
+def test_constructor_from_list():
+    # GH 27673
+    result = pd.Series(["E"], dtype=ArrowStringDtype())
+    assert isinstance(result.dtype, ArrowStringDtype)
diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/base/__init__.py
new file mode 100644
index 0000000..090df35
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/extension/base/__init__.py
@@ -0,0 +1,60 @@
+"""Base test suite for extension arrays.
+
+These tests are intended for third-party libraries to subclass to validate
+that their extension arrays and dtypes satisfy the interface. Moving or
+renaming the tests should not be done lightly.
+
+Libraries are expected to implement a few pytest fixtures to provide data
+for the tests. The fixtures may be located in either
+
+* The same module as your test class.
+* A ``conftest.py`` in the same directory as your test class.
+
+The full list of fixtures may be found in the ``conftest.py`` next to this
+file.
+
+.. code-block:: python
+
+    import pytest
+    from pandas.tests.extension.base import BaseDtypeTests
+
+
+    @pytest.fixture
+    def dtype():
+        return MyDtype()
+
+
+    class TestMyDtype(BaseDtypeTests):
+        pass
+
+
+Your class ``TestMyDtype`` will inherit all the tests defined on
+``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
+wherever the test requires it. You're free to implement additional tests.
+
+All the tests in these modules use ``self.assert_frame_equal`` or
+``self.assert_series_equal`` for DataFrame or Series comparisons. By default,
+they use the usual ``pandas.testing.assert_frame_equal`` and
+``pandas.testing.assert_series_equal``. You can override the checks used
+by defining the staticmethods ``assert_frame_equal`` and
+``assert_series_equal`` on your base test class.
+ +""" +from .casting import BaseCastingTests # noqa +from .constructors import BaseConstructorsTests # noqa +from .dtype import BaseDtypeTests # noqa +from .getitem import BaseGetitemTests # noqa +from .groupby import BaseGroupbyTests # noqa +from .interface import BaseInterfaceTests # noqa +from .io import BaseParsingTests # noqa +from .methods import BaseMethodsTests # noqa +from .missing import BaseMissingTests # noqa +from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa +from .printing import BasePrintingTests # noqa +from .reduce import ( # noqa + BaseBooleanReduceTests, + BaseNoReduceTests, + BaseNumericReduceTests, +) +from .reshaping import BaseReshapingTests # noqa +from .setitem import BaseSetitemTests # noqa diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/base.py b/venv/Lib/site-packages/pandas/tests/extension/base/base.py new file mode 100644 index 0000000..144b082 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/base.py @@ -0,0 +1,9 @@ +import pandas._testing as tm + + +class BaseExtensionTests: + + assert_equal = staticmethod(tm.assert_equal) + assert_series_equal = staticmethod(tm.assert_series_equal) + assert_frame_equal = staticmethod(tm.assert_frame_equal) + assert_extension_array_equal = staticmethod(tm.assert_extension_array_equal) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/casting.py b/venv/Lib/site-packages/pandas/tests/extension/base/casting.py new file mode 100644 index 0000000..58859fc --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/casting.py @@ -0,0 +1,34 @@ +import numpy as np + +import pandas as pd +from pandas.core.internals import ObjectBlock + +from .base import BaseExtensionTests + + +class BaseCastingTests(BaseExtensionTests): + """Casting to and from ExtensionDtypes""" + + def test_astype_object_series(self, all_data): + ser = pd.Series({"A": all_data}) + result = ser.astype(object) + assert isinstance(result._data.blocks[0], ObjectBlock) + + def test_tolist(self, data): + result = pd.Series(data).tolist() + expected = list(data) + assert result == expected + + def test_astype_str(self, data): + result = pd.Series(data[:5]).astype(str) + expected = pd.Series(data[:5].astype(str)) + self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/constructors.py b/venv/Lib/site-packages/pandas/tests/extension/base/constructors.py new file mode 100644 index 0000000..c40646c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/constructors.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.internals import ExtensionBlock + +from .base import BaseExtensionTests + + +class BaseConstructorsTests(BaseExtensionTests): + def test_from_sequence_from_cls(self, data): + result = type(data)._from_sequence(data, dtype=data.dtype) + self.assert_extension_array_equal(result, data) + + data = data[:0] + result = type(data)._from_sequence(data, dtype=data.dtype) + self.assert_extension_array_equal(result, data) + + def test_array_from_scalars(self, data): + scalars = [data[0], data[1], data[2]] + result = data._from_sequence(scalars) + assert isinstance(result, type(data)) + + def test_series_constructor(self, data): + result = pd.Series(data) + assert 
result.dtype == data.dtype + assert len(result) == len(data) + assert isinstance(result._data.blocks[0], ExtensionBlock) + assert result._data.blocks[0].values is data + + # Series[EA] is unboxed / boxed correctly + result2 = pd.Series(result) + assert result2.dtype == data.dtype + assert isinstance(result2._data.blocks[0], ExtensionBlock) + + @pytest.mark.parametrize("from_series", [True, False]) + def test_dataframe_constructor_from_dict(self, data, from_series): + if from_series: + data = pd.Series(data) + result = pd.DataFrame({"A": data}) + assert result.dtypes["A"] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_dataframe_from_series(self, data): + result = pd.DataFrame(pd.Series(data)) + assert result.dtypes[0] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_series_given_mismatched_index_raises(self, data): + msg = "Length of passed values is 3, index implies 5" + with pytest.raises(ValueError, match=msg): + pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + + def test_from_dtype(self, data): + # construct from our dtype & string dtype + dtype = data.dtype + + expected = pd.Series(data) + result = pd.Series(list(data), dtype=dtype) + self.assert_series_equal(result, expected) + + result = pd.Series(list(data), dtype=str(dtype)) + self.assert_series_equal(result, expected) + + # gh-30280 + + expected = pd.DataFrame(data).astype(dtype) + result = pd.DataFrame(list(data), dtype=dtype) + self.assert_frame_equal(result, expected) + + result = pd.DataFrame(list(data), dtype=str(dtype)) + self.assert_frame_equal(result, expected) + + def test_pandas_array(self, data): + # pd.array(extension_array) should be idempotent... + result = pd.array(data) + self.assert_extension_array_equal(result, data) + + def test_pandas_array_dtype(self, data): + # ... 
but specifying dtype will override idempotency + result = pd.array(data, dtype=np.dtype(object)) + expected = pd.arrays.PandasArray(np.asarray(data, dtype=object)) + self.assert_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/dtype.py b/venv/Lib/site-packages/pandas/tests/extension/base/dtype.py new file mode 100644 index 0000000..b6c12b5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/dtype.py @@ -0,0 +1,107 @@ +import warnings + +import numpy as np +import pytest + +import pandas as pd + +from .base import BaseExtensionTests + + +class BaseDtypeTests(BaseExtensionTests): + """Base class for ExtensionDtype classes""" + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set("biufcmMOSUV") + assert dtype.kind in valid + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + + def test_is_not_string_type(self, dtype): + return not pd.api.types.is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + return not pd.api.types.is_object_dtype(dtype) + + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + "-suffix" + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype("object") + + def test_eq_with_self(self, dtype): + assert dtype == dtype + assert dtype != object() + + def test_array_type(self, data, dtype): + assert dtype.construct_array_type() is type(data) + + def test_check_dtype(self, data): + dtype = data.dtype + + # check equivalency for using .dtypes + df = pd.DataFrame( + {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + ) + + # np.dtype('int64') == 'Int64' == 'int64' + # so can't distinguish + if dtype.name == "Int64": + expected = pd.Series([True, True, False, True], index=list("ABCD")) + else: + expected = pd.Series([True, True, False, False], index=list("ABCD")) + + # XXX: This should probably be *fixed* not ignored. 
+ # See libops.scalar_compare + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + result = df.dtypes == str(dtype) + + self.assert_series_equal(result, expected) + + expected = pd.Series([True, True, False, False], index=list("ABCD")) + result = df.dtypes.apply(str) == str(dtype) + self.assert_series_equal(result, expected) + + def test_hashable(self, dtype): + hash(dtype) # no error + + def test_str(self, dtype): + assert str(dtype) == dtype.name + + def test_eq(self, dtype): + assert dtype == dtype.name + assert dtype != "anonther_type" + + def test_construct_from_string(self, dtype): + dtype_instance = type(dtype).construct_from_string(dtype.name) + assert isinstance(dtype_instance, type(dtype)) + + def test_construct_from_string_another_type_raises(self, dtype): + msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" + with pytest.raises(TypeError, match=msg): + type(dtype).construct_from_string("another_type") diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/getitem.py b/venv/Lib/site-packages/pandas/tests/extension/base/getitem.py new file mode 100644 index 0000000..8615a8d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/getitem.py @@ -0,0 +1,370 @@ +import numpy as np +import pytest + +import pandas as pd + +from .base import BaseExtensionTests + + +class BaseGetitemTests(BaseExtensionTests): + """Tests for ExtensionArray.__getitem__.""" + + def test_iloc_series(self, data): + ser = pd.Series(data) + result = ser.iloc[:4] + expected = pd.Series(data[:4]) + self.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + self.assert_series_equal(result, expected) + + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + self.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 1, 2, 3], [0]] + self.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name="A") + + # slice -> series + result = df.iloc[:4, 0] + self.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + self.assert_series_equal(result, expected) + + def test_loc_series(self, data): + ser = pd.Series(data) + result = ser.loc[:3] + expected = pd.Series(data[:4]) + self.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + self.assert_series_equal(result, expected) + + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ["A"]] + self.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ["A"]] + self.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name="A") + + # slice -> series + result = df.loc[:3, "A"] + self.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, "A"] + self.assert_series_equal(result, expected) + + def test_loc_iloc_frame_single_dtype(self, data): + # GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly + # return a scalar + df = pd.DataFrame({"A": data}) + expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype) + + result = df.loc[2] + self.assert_series_equal(result, expected) + + expected = pd.Series( + [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype + ) + result = 
df.iloc[-1] + self.assert_series_equal(result, expected) + + def test_getitem_scalar(self, data): + result = data[0] + assert isinstance(result, data.dtype.type) + + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) + + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + result = data_missing[0] + assert na_cmp(result, na_value) + + def test_getitem_empty(self, data): + # Indexing with empty list + result = data[[]] + assert len(result) == 0 + assert isinstance(result, type(data)) + + expected = data[np.array([], dtype="int64")] + self.assert_extension_array_equal(result, expected) + + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolean_array_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + + msg = ( + "Cannot mask with a boolean indexer containing NA values|" + "cannot mask with array containing NA / NaN values" + ) + with pytest.raises(ValueError, match=msg): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError): + s[mask] + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_getitem_integer_array(self, data, idx): + result = data[idx] + assert len(result) == 3 + assert isinstance(result, type(data)) + expected = data.take([0, 1, 2]) + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[idx] + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")], + ids=["list", "integer-array"], + ) + def test_getitem_integer_with_missing_raises(self, data, idx): + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + data[idx] + + # TODO this raises KeyError about labels not found (it tries label-based) + # import pandas._testing as tm + # s = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + # with pytest.raises(ValueError, match=msg): + # s[idx] + + def test_getitem_slice(self, data): + # 
getitem[slice] should return an array + result = data[slice(0)] # empty + assert isinstance(result, type(data)) + + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) + + def test_get(self, data): + # GH 20882 + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + assert s.get(4) == s.iloc[2] + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + self.assert_series_equal(result, expected) + + result = s.get(slice(2)) + expected = s.iloc[[0, 1]] + self.assert_series_equal(result, expected) + + assert s.get(-1) is None + assert s.get(s.index.max() + 1) is None + + s = pd.Series(data[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] + + result = s.get(slice("b", "d")) + expected = s.iloc[[1, 2, 3]] + self.assert_series_equal(result, expected) + + result = s.get("Z") + assert result is None + + assert s.get(4) == s.iloc[4] + assert s.get(-1) == s.iloc[-1] + assert s.get(len(s)) is None + + # GH 21257 + s = pd.Series(data) + s2 = s[::2] + assert s2.get(1) is None + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] + + def test_take(self, data, na_value, na_cmp): + result = data.take([0, -1]) + assert result.dtype == data.dtype + assert result[0] == data[0] + assert result[1] == data[-1] + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + assert result[0] == data[0] + assert na_cmp(result[1], na_value) + + with pytest.raises(IndexError, match="out of bounds"): + data.take([len(data) + 1]) + + def test_take_empty(self, data, na_value, na_cmp): + empty = data[:0] + + result = empty.take([-1], allow_fill=True) + assert na_cmp(result[0], na_value) + + with pytest.raises(IndexError): + empty.take([-1]) + + with pytest.raises(IndexError, match="cannot do a non-empty take"): + empty.take([0, 1]) + + def test_take_negative(self, data): + # https://github.com/pandas-dev/pandas/issues/20640 + n = len(data) + result = data.take([0, -n, n - 1, -1]) + expected = data.take([0, 0, n - 1, n - 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_non_na_fill_value(self, data_missing): + fill_value = data_missing[1] # valid + na = data_missing[0] + + array = data_missing._from_sequence( + [na, fill_value, na], dtype=data_missing.dtype + ) + result = array.take([-1, 1], fill_value=fill_value, allow_fill=True) + expected = array.take([1, 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_pandas_style_negative_raises(self, data, na_value): + with pytest.raises(ValueError): + data.take([0, -2], fill_value=na_value, allow_fill=True) + + @pytest.mark.parametrize("allow_fill", [True, False]) + def test_take_out_of_bounds_raises(self, data, allow_fill): + arr = data[:3] + with pytest.raises(IndexError): + arr.take(np.asarray([0, 3]), allow_fill=allow_fill) + + def test_take_series(self, data): + s = pd.Series(data) + result = s.take([0, -1]) + expected = pd.Series( + data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), + index=[0, len(data) - 1], + ) + self.assert_series_equal(result, expected) + + def test_reindex(self, data, na_value): + s = pd.Series(data) + result = s.reindex([0, 1, 3]) + expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) + self.assert_series_equal(result, expected) + + n = len(data) + result = s.reindex([-1, 0, n]) + expected = pd.Series( + data._from_sequence([na_value, data[0], na_value], dtype=s.dtype), + index=[-1, 0, n], + ) + 
self.assert_series_equal(result, expected) + + result = s.reindex([n, n + 1]) + expected = pd.Series( + data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + ) + self.assert_series_equal(result, expected) + + def test_reindex_non_na_fill_value(self, data_missing): + valid = data_missing[1] + na = data_missing[0] + + array = data_missing._from_sequence([na, valid], dtype=data_missing.dtype) + ser = pd.Series(array) + result = ser.reindex([0, 1, 2], fill_value=valid) + expected = pd.Series( + data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype) + ) + + self.assert_series_equal(result, expected) + + def test_loc_len1(self, data): + # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim + df = pd.DataFrame({"A": data}) + res = df.loc[[0], "A"] + assert res._data._block.ndim == 1 + + def test_item(self, data): + # https://github.com/pandas-dev/pandas/pull/30175 + s = pd.Series(data) + result = s[:1].item() + assert result == data[0] + + msg = "can only convert an array of size 1 to a Python scalar" + with pytest.raises(ValueError, match=msg): + s[:0].item() + + with pytest.raises(ValueError, match=msg): + s.item() diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/groupby.py b/venv/Lib/site-packages/pandas/tests/extension/base/groupby.py new file mode 100644 index 0000000..94d0ef7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/groupby.py @@ -0,0 +1,91 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + +from .base import BaseExtensionTests + + +class BaseGroupbyTests(BaseExtensionTests): + """Groupby-specific tests.""" + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + ) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + index = pd.Index(index, name="B") + expected = pd.Series([3, 1, 4], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + expected = pd.Series([1, 3, 4], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") + + self.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + df.groupby("B").apply(groupby_apply_op) + df.groupby("B").A.apply(groupby_apply_op) + 
df.groupby("A").apply(groupby_apply_op) + df.groupby("A").B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array, + ], + index=pd.Index([1, 2, 3, 4], name="A"), + name="B", + ) + self.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1], + } + ) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(["B", "C"]) + else: + expected = pd.Index(["C"]) + + tm.assert_index_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/interface.py b/venv/Lib/site-packages/pandas/tests/extension/base/interface.py new file mode 100644 index 0000000..cdea963 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/interface.py @@ -0,0 +1,92 @@ +import numpy as np + +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + +import pandas as pd +import pandas._testing as tm + +from .base import BaseExtensionTests + + +class BaseInterfaceTests(BaseExtensionTests): + """Tests that the basic interface is satisfied.""" + + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + + def test_len(self, data): + assert len(data) == 100 + + def test_ndim(self, data): + assert data.ndim == 1 + + def test_can_hold_na_valid(self, data): + # GH-20761 + assert data._can_hold_na is True + + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + result = np.array(data, dtype=object) + expected = np.array(list(data), dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) + + def test_no_values_attribute(self, data): + # GH-20735: EA's with .values attribute give problems with internal + # code, disallowing this for now until solved + assert not hasattr(data, "values") + assert not hasattr(data, "_values") + + def test_is_numeric_honored(self, data): + result = pd.Series(data) + assert result._data.blocks[0].is_numeric is data.dtype._is_numeric + + def test_isna_extension_array(self, data_missing): + # If your `isna` returns an ExtensionArray, you must also implement + # _reduce. 
At the *very* least, you must implement any and all + na = data_missing.isna() + if is_extension_array_dtype(na): + assert na._reduce("any") + assert na.any() + + assert not na._reduce("all") + assert not na.all() + + assert na.dtype._is_boolean + + def test_copy(self, data): + # GH#27083 removing deep keyword from EA.copy + assert data[0] != data[1] + result = data.copy() + + data[1] = data[0] + assert result[1] != result[0] + + def test_view(self, data): + # view with no dtype should return a shallow copy, *not* the same + # object + assert data[1] != data[0] + + result = data.view() + assert result is not data + assert type(result) == type(data) + + result[1] = result[0] + assert data[1] == data[0] + + # check specifically that the `dtype` kwarg is accepted + data.view(dtype=None) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/io.py b/venv/Lib/site-packages/pandas/tests/extension/base/io.py new file mode 100644 index 0000000..3de752a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/io.py @@ -0,0 +1,20 @@ +from io import StringIO + +import numpy as np +import pytest + +import pandas as pd + +from .base import BaseExtensionTests + + +class BaseParsingTests(BaseExtensionTests): + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data): + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) + csv_output = df.to_csv(index=False, na_rep=np.nan) + result = pd.read_csv( + StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine + ) + expected = df + self.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/methods.py b/venv/Lib/site-packages/pandas/tests/extension/base/methods.py new file mode 100644 index 0000000..24ab7fe --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/methods.py @@ -0,0 +1,390 @@ +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_bool_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.sorting import nargsort + +from .base import BaseExtensionTests + + +class BaseMethodsTests(BaseExtensionTests): + """Various Series and DataFrame methods.""" + + @pytest.mark.parametrize("dropna", [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + self.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis="columns") + expected = pd.Series([0, 1]) + self.assert_series_equal(result, expected) + + def test_series_count(self, data_missing): + # GH#26835 + ser = pd.Series(data_missing) + result = ser.count() + expected = 1 + assert result == expected + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) + + def test_argsort(self, data_for_sorting): + result = pd.Series(data_for_sorting).argsort() + expected = pd.Series(np.array([2, 0, 1], dtype=np.int64)) + self.assert_series_equal(result, expected) + + def test_argsort_missing_array(self, data_missing_for_sorting): + result = data_missing_for_sorting.argsort() + expected = np.array([2, 0, 1], dtype=np.dtype("int")) + # we don't care whether it's int32 or int64 + result 
= result.astype("int64", casting="safe") + expected = expected.astype("int64", casting="safe") + tm.assert_numpy_array_equal(result, expected) + + def test_argsort_missing(self, data_missing_for_sorting): + result = pd.Series(data_missing_for_sorting).argsort() + expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "na_position, expected", + [ + ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))), + ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))), + ], + ) + def test_nargsort(self, data_missing_for_sorting, na_position, expected): + # GH 25439 + result = nargsort(data_missing_for_sorting, na_position=na_position) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values(self, data_for_sorting, ascending): + ser = pd.Series(data_for_sorting) + result = ser.sort_values(ascending=ascending) + expected = ser.iloc[[2, 0, 1]] + if not ascending: + expected = expected[::-1] + + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + ser = pd.Series(data_missing_for_sorting) + result = ser.sort_values(ascending=ascending) + if ascending: + expected = ser.iloc[[2, 0, 1]] + else: + expected = ser.iloc[[0, 2, 1]] + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending): + df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting}) + result = df.sort_values(["A", "B"]) + expected = pd.DataFrame( + {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1] + ) + self.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + duplicated = box(data._from_sequence([data[0], data[0]])) + + result = method(duplicated) + + assert len(result) == 1 + assert isinstance(result, type(data)) + assert result[0] == duplicated[0] + + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_codes = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp + ) + expected_uniques = data_for_grouping.take([0, 4, 7]) + + tm.assert_numpy_array_equal(codes, expected_codes) + self.assert_extension_array_equal(uniques, expected_uniques) + + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize_equivalence(self, data_for_grouping, na_sentinel): + codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + + tm.assert_numpy_array_equal(codes_1, codes_2) + self.assert_extension_array_equal(uniques_1, uniques_2) + + def test_factorize_empty(self, data): + codes, uniques = pd.factorize(data[:0]) + expected_codes = np.array([], dtype=np.intp) + expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype) + + tm.assert_numpy_array_equal(codes, expected_codes) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_fillna_copy_frame(self, data_missing): + arr = data_missing.take([1, 1]) + df = pd.DataFrame({"A": arr}) + + filled_val = df.iloc[0, 0] + result = df.fillna(filled_val) + + 
assert df.A.values is not result.A.values + + def test_fillna_copy_series(self, data_missing): + arr = data_missing.take([1, 1]) + ser = pd.Series(arr) + + filled_val = ser[0] + result = ser.fillna(filled_val) + + assert ser._values is not result._values + assert ser._values is arr + + def test_fillna_length_mismatch(self, data_missing): + msg = "Length of 'value' does not match." + with pytest.raises(ValueError, match=msg): + data_missing.fillna(data_missing.take([1])) + + def test_combine_le(self, data_repeated): + # GH 20825 + # Test that combine works when doing a <= (le) comparison + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)]) + self.assert_series_equal(result, expected) + + def test_combine_add(self, data_repeated): + # GH 20825 + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 + x2) + with np.errstate(over="ignore"): + expected = pd.Series( + orig_data1._from_sequence( + [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series( + orig_data1._from_sequence([a + val for a in list(orig_data1)]) + ) + self.assert_series_equal(result, expected) + + def test_combine_first(self, data): + # https://github.com/pandas-dev/pandas/issues/24147 + a = pd.Series(data[:3]) + b = pd.Series(data[2:5], index=[2, 3, 4]) + result = a.combine_first(b) + expected = pd.Series(data[:5]) + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize("frame", [True, False]) + @pytest.mark.parametrize( + "periods, indices", + [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], + ) + def test_container_shift(self, data, frame, periods, indices): + # https://github.com/pandas-dev/pandas/issues/22386 + subset = data[:5] + data = pd.Series(subset, name="A") + expected = pd.Series(subset.take(indices, allow_fill=True), name="A") + + if frame: + result = data.to_frame(name="A").assign(B=1).shift(periods) + expected = pd.concat( + [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1 + ) + compare = self.assert_frame_equal + else: + result = data.shift(periods) + compare = self.assert_series_equal + + compare(result, expected) + + @pytest.mark.parametrize("periods", [1, -2]) + def test_diff(self, data, periods): + data = data[:5] + if is_bool_dtype(data.dtype): + op = operator.xor + else: + op = operator.sub + try: + # does this array implement ops? 
+ op(data, data) + except Exception: + pytest.skip(f"{type(data)} does not support diff") + s = pd.Series(data) + result = s.diff(periods) + expected = pd.Series(op(data, data.shift(periods))) + self.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": data, "B": [1.0] * 5}) + result = df.diff(periods) + if periods == 1: + b = [np.nan, 0, 0, 0, 0] + else: + b = [0, 0, 0, np.nan, np.nan] + expected = pd.DataFrame({"A": expected, "B": b}) + self.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "periods, indices", + [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], + ) + def test_shift_non_empty_array(self, data, periods, indices): + # https://github.com/pandas-dev/pandas/issues/23911 + subset = data[:2] + result = subset.shift(periods) + expected = subset.take(indices, allow_fill=True) + self.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4]) + def test_shift_empty_array(self, data, periods): + # https://github.com/pandas-dev/pandas/issues/23911 + empty = data[:0] + result = empty.shift(periods) + expected = empty + self.assert_extension_array_equal(result, expected) + + def test_shift_fill_value(self, data): + arr = data[:4] + fill_value = data[0] + result = arr.shift(1, fill_value=fill_value) + expected = data.take([0, 0, 1, 2]) + self.assert_extension_array_equal(result, expected) + + result = arr.shift(-2, fill_value=fill_value) + expected = data.take([2, 3, 0, 0]) + self.assert_extension_array_equal(result, expected) + + def test_hash_pandas_object_works(self, data, as_frame): + # https://github.com/pandas-dev/pandas/issues/23066 + data = pd.Series(data) + if as_frame: + data = data.to_frame() + a = pd.util.hash_pandas_object(data) + b = pd.util.hash_pandas_object(data) + self.assert_equal(a, b) + + def test_searchsorted(self, data_for_sorting, as_series): + b, c, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b, c]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + assert arr.searchsorted(c) == 2 + assert arr.searchsorted(c, side="right") == 3 + + result = arr.searchsorted(arr.take([0, 2])) + expected = np.array([0, 2], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 2, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + def test_where_series(self, data, na_value, as_frame): + assert data[0] != data[1] + cls = type(data) + a, b = data[:2] + + ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype)) + cond = np.array([True, True, False, False]) + + if as_frame: + ser = ser.to_frame(name="a") + cond = cond.reshape(-1, 1) + + result = ser.where(cond) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype) + ) + + if as_frame: + expected = expected.to_frame(name="a") + self.assert_equal(result, expected) + + # array other + cond = np.array([True, False, True, True]) + other = cls._from_sequence([a, b, a, b], dtype=data.dtype) + if as_frame: + other = pd.DataFrame({"a": other}) + cond = pd.DataFrame({"a": cond}) + result = ser.where(cond, other) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) + if as_frame: + expected = expected.to_frame(name="a") + self.assert_equal(result, expected) + + @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) + def 
test_repeat(self, data, repeats, as_series, use_numpy): + arr = type(data)._from_sequence(data[:3], dtype=data.dtype) + if as_series: + arr = pd.Series(arr) + + result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats) + + repeats = [repeats] * 3 if isinstance(repeats, int) else repeats + expected = [x for x, n in zip(arr, repeats) for _ in range(n)] + expected = type(data)._from_sequence(expected, dtype=data.dtype) + if as_series: + expected = pd.Series(expected, index=arr.index.repeat(repeats)) + + self.assert_equal(result, expected) + + @pytest.mark.parametrize( + "repeats, kwargs, error, msg", + [ + (2, dict(axis=1), ValueError, "'axis"), + (-1, dict(), ValueError, "negative"), + ([1, 2], dict(), ValueError, "shape"), + (2, dict(foo="bar"), TypeError, "'foo'"), + ], + ) + def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): + with pytest.raises(error, match=msg): + if use_numpy: + np.repeat(data, repeats, **kwargs) + else: + data.repeat(repeats, **kwargs) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/missing.py b/venv/Lib/site-packages/pandas/tests/extension/base/missing.py new file mode 100644 index 0000000..2393d2e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/missing.py @@ -0,0 +1,129 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + +from .base import BaseExtensionTests + + +class BaseMissingTests(BaseExtensionTests): + def test_isna(self, data_missing): + expected = np.array([True, False]) + + result = pd.isna(data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + self.assert_series_equal(result, expected) + + # GH 21189 + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=bool) + self.assert_series_equal(result, expected) + + def test_dropna_array(self, data_missing): + result = data_missing.dropna() + expected = data_missing[[1]] + self.assert_extension_array_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + self.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + + # defaults + result = df.dropna() + expected = df.iloc[[1]] + self.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis="columns") + expected = pd.DataFrame(index=[0, 1]) + self.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + self.assert_frame_equal(result, expected) + + def test_fillna_scalar(self, data_missing): + valid = data_missing[1] + result = data_missing.fillna(valid) + expected = data_missing.fillna(valid) + self.assert_extension_array_equal(result, expected) + + def test_fillna_limit_pad(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).fillna(method="ffill", limit=2) + expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) + self.assert_series_equal(result, expected) + + def test_fillna_limit_backfill(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).fillna(method="backfill", limit=2) + expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) + self.assert_series_equal(result, expected) + + def test_fillna_series(self, data_missing): + fill_value = data_missing[1] + ser = 
pd.Series(data_missing) + + result = ser.fillna(fill_value) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) + self.assert_series_equal(result, expected) + + # Fill with a series + result = ser.fillna(expected) + self.assert_series_equal(result, expected) + + # Fill with a series not affecting the missing values + result = ser.fillna(ser) + self.assert_series_equal(result, ser) + + def test_fillna_series_method(self, data_missing, fillna_method): + fill_value = data_missing[1] + + if fillna_method == "ffill": + data_missing = data_missing[::-1] + + result = pd.Series(data_missing).fillna(method=fillna_method) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) + + self.assert_series_equal(result, expected) + + def test_fillna_frame(self, data_missing): + fill_value = data_missing[1] + + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) + + expected = pd.DataFrame( + { + "A": data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ), + "B": [1, 2], + } + ) + + self.assert_frame_equal(result, expected) + + def test_fillna_fill_other(self, data): + result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0}) + + expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) + + self.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/ops.py b/venv/Lib/site-packages/pandas/tests/extension/base/ops.py new file mode 100644 index 0000000..20d06ef --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/ops.py @@ -0,0 +1,170 @@ +import operator +from typing import Optional, Type + +import pytest + +import pandas as pd +from pandas.core import ops + +from .base import BaseExtensionTests + + +class BaseOpsUtil(BaseExtensionTests): + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def check_opname(self, s, op_name, other, exc=Exception): + op = self.get_op_from_name(op_name) + + self._check_op(s, op, other, op_name, exc) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + expected = s.combine(other, op) + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=Exception): + # divmod has multiple return values, so check separately + if exc is None: + result_div, result_mod = op(s, other) + if op is divmod: + expected_div, expected_mod = s // other, s % other + else: + expected_div, expected_mod = other // s, other % s + self.assert_series_equal(result_div, expected_div) + self.assert_series_equal(result_mod, expected_mod) + else: + with pytest.raises(exc): + divmod(s, other) + + +class BaseArithmeticOpsTests(BaseOpsUtil): + """Various Series and DataFrame arithmetic ops methods. 
+ + Subclasses supporting various ops should set the class variables + to indicate that they support ops of that kind + + * series_scalar_exc = TypeError + * frame_scalar_exc = TypeError + * series_array_exc = TypeError + * divmod_exc = TypeError + """ + + series_scalar_exc: Optional[Type[TypeError]] = TypeError + frame_scalar_exc: Optional[Type[TypeError]] = TypeError + series_array_exc: Optional[Type[TypeError]] = TypeError + divmod_exc: Optional[Type[TypeError]] = TypeError + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # series & scalar + op_name = all_arithmetic_operators + s = pd.Series(data) + self.check_opname(s, op_name, s.iloc[0], exc=self.series_scalar_exc) + + @pytest.mark.xfail(run=False, reason="_reduce needs implementation") + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op_name = all_arithmetic_operators + df = pd.DataFrame({"A": data}) + self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + # ndarray & other series + op_name = all_arithmetic_operators + s = pd.Series(data) + self.check_opname( + s, op_name, pd.Series([s.iloc[0]] * len(s)), exc=self.series_array_exc + ) + + def test_divmod(self, data): + s = pd.Series(data) + self._check_divmod_op(s, divmod, 1, exc=self.divmod_exc) + self._check_divmod_op(1, ops.rdivmod, s, exc=self.divmod_exc) + + def test_divmod_series_array(self, data, data_for_twos): + s = pd.Series(data) + self._check_divmod_op(s, divmod, data) + + other = data_for_twos + self._check_divmod_op(other, ops.rdivmod, s) + + other = pd.Series(other) + self._check_divmod_op(other, ops.rdivmod, s) + + def test_add_series_with_extension_array(self, data): + s = pd.Series(data) + result = s + data + expected = pd.Series(data + data) + self.assert_series_equal(result, expected) + + def test_error(self, data, all_arithmetic_operators): + # invalid ops + op_name = all_arithmetic_operators + with pytest.raises(AttributeError): + getattr(data, op_name) + + def test_direct_arith_with_series_returns_not_implemented(self, data): + # EAs should return NotImplemented for ops with Series. + # Pandas takes care of unboxing the series and calling the EA's op. 
+ other = pd.Series(data) + if hasattr(data, "__add__"): + result = data.__add__(other) + assert result is NotImplemented + else: + raise pytest.skip(f"{type(data).__name__} does not implement add") + + +class BaseComparisonOpsTests(BaseOpsUtil): + """Various Series and DataFrame comparison ops methods.""" + + def _compare_other(self, s, data, op_name, other): + op = self.get_op_from_name(op_name) + if op_name == "__eq__": + assert getattr(data, op_name)(other) is NotImplemented + assert not op(s, other).all() + elif op_name == "__ne__": + assert getattr(data, op_name)(other) is NotImplemented + assert op(s, other).all() + + else: + + # array + assert getattr(data, op_name)(other) is NotImplemented + + # series + s = pd.Series(data) + with pytest.raises(TypeError): + op(s, other) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, 0) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + other = pd.Series([data[0]] * len(data)) + self._compare_other(s, data, op_name, other) + + def test_direct_arith_with_series_returns_not_implemented(self, data): + # EAs should return NotImplemented for ops with Series. + # Pandas takes care of unboxing the series and calling the EA's op. + other = pd.Series(data) + if hasattr(data, "__eq__"): + result = data.__eq__(other) + assert result is NotImplemented + else: + raise pytest.skip(f"{type(data).__name__} does not implement __eq__") diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/printing.py b/venv/Lib/site-packages/pandas/tests/extension/base/printing.py new file mode 100644 index 0000000..ad34a83 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/printing.py @@ -0,0 +1,43 @@ +import io + +import pytest + +import pandas as pd + +from .base import BaseExtensionTests + + +class BasePrintingTests(BaseExtensionTests): + """Tests checking the formatting of your EA when printed.""" + + @pytest.mark.parametrize("size", ["big", "small"]) + def test_array_repr(self, data, size): + if size == "small": + data = data[:5] + else: + data = type(data)._concat_same_type([data] * 5) + + result = repr(data) + assert type(data).__name__ in result + assert f"Length: {len(data)}" in result + assert str(data.dtype) in result + if size == "big": + assert "..." in result + + def test_array_repr_unicode(self, data): + result = str(data) + assert isinstance(result, str) + + def test_series_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + def test_dataframe_repr(self, data): + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = io.StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/reduce.py b/venv/Lib/site-packages/pandas/tests/extension/base/reduce.py new file mode 100644 index 0000000..6f433d6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/reduce.py @@ -0,0 +1,60 @@ +import warnings + +import pytest + +import pandas as pd +import pandas._testing as tm + +from .base import BaseExtensionTests + + +class BaseReduceTests(BaseExtensionTests): + """ + Reduction specific tests. Generally these only + make sense for numeric/boolean operations. 
+ """ + + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class BaseNoReduceTests(BaseReduceTests): + """ we don't define any reductions """ + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + +class BaseNumericReduceTests(BaseReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + s = pd.Series(data) + + # min/max with empty produce numpy warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_reduce(s, op_name, skipna) + + +class BaseBooleanReduceTests(BaseReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series(self, data, all_boolean_reductions, skipna): + op_name = all_boolean_reductions + s = pd.Series(data) + self.check_reduce(s, op_name, skipna) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/reshaping.py b/venv/Lib/site-packages/pandas/tests/extension/base/reshaping.py new file mode 100644 index 0000000..ec21898 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/reshaping.py @@ -0,0 +1,326 @@ +import itertools + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.internals import ExtensionBlock + +from .base import BaseExtensionTests + + +class BaseReshapingTests(BaseExtensionTests): + """Tests for reshaping and concatenation.""" + + @pytest.mark.parametrize("in_frame", [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + + assert len(result) == len(data) * 2 + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = result.dtype + + assert dtype == data.dtype + assert isinstance(result._data.blocks[0], ExtensionBlock) + + @pytest.mark.parametrize("in_frame", [True, False]) + def test_concat_all_na_block(self, data_missing, in_frame): + valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) + na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) + if in_frame: + valid_block = pd.DataFrame({"a": valid_block}) + na_block = pd.DataFrame({"a": na_block}) + result = pd.concat([valid_block, na_block]) + if in_frame: + expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])}) + self.assert_frame_equal(result, expected) + else: + expected = pd.Series(data_missing.take([1, 1, 0, 0])) + self.assert_series_equal(result, expected) + + def test_concat_mixed_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/20762 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"A": [1, 2, 3]}) + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") + dfs = [df1, df2, df3] + + # dataframes + result = pd.concat(dfs) + expected = pd.concat([x.astype(object) for x in dfs]) + self.assert_frame_equal(result, expected) + + # series + result 
= pd.concat([x["A"] for x in dfs]) + expected = pd.concat([x["A"].astype(object) for x in dfs]) + self.assert_series_equal(result, expected) + + # simple test for just EA and one other + result = pd.concat([df1, df2]) + expected = pd.concat([df1.astype("object"), df2.astype("object")]) + self.assert_frame_equal(result, expected) + + result = pd.concat([df1["A"], df2["A"]]) + expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) + self.assert_series_equal(result, expected) + + def test_concat_columns(self, data, na_value): + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}) + + expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]}) + result = pd.concat([df1, df2], axis=1) + self.assert_frame_equal(result, expected) + result = pd.concat([df1["A"], df2["B"]], axis=1) + self.assert_frame_equal(result, expected) + + # non-aligned + df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3]) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": [np.nan, 1, 2, 3], + } + ) + + result = pd.concat([df1, df2], axis=1) + self.assert_frame_equal(result, expected) + result = pd.concat([df1["A"], df2["B"]], axis=1) + self.assert_frame_equal(result, expected) + + def test_concat_extension_arrays_copy_false(self, data, na_value): + # GH 20756 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": data[3:7]}) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": data[3:7], + } + ) + result = pd.concat([df1, df2], axis=1, copy=False) + self.assert_frame_equal(result, expected) + + def test_align(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype)) + e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype)) + self.assert_series_equal(r1, e1) + self.assert_series_equal(r2, e2) + + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame( + {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)} + ) + e2 = pd.DataFrame( + {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)} + ) + self.assert_frame_equal(r1, e1) + self.assert_frame_equal(r2, e2) + + def test_align_series_frame(self, data, na_value): + # https://github.com/pandas-dev/pandas/issues/20576 + ser = pd.Series(data, name="a") + df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) + r1, r2 = ser.align(df) + + e1 = pd.Series( + data._from_sequence(list(data) + [na_value], dtype=data.dtype), + name=ser.name, + ) + + self.assert_series_equal(r1, e1) + self.assert_frame_equal(r2, df) + + def test_set_frame_expand_regular_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + df["B"] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + self.assert_frame_equal(df, expected) + + def test_set_frame_expand_extension_with_regular(self, data): + df = pd.DataFrame({"A": data}) + df["B"] = [1] * len(data) + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + self.assert_frame_equal(df, expected) + + def test_set_frame_overwrite_object(self, data): + # https://github.com/pandas-dev/pandas/issues/20555 + df = pd.DataFrame({"A": 
[1] * len(data)}, dtype=object) + df["A"] = data + assert df.dtypes["A"] == data.dtype + + def test_merge(self, data, na_value): + # GH-20743 + df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) + df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) + + res = pd.merge(df1, df2) + exp = pd.DataFrame( + { + "int1": [1, 1, 2], + "int2": [1, 2, 3], + "key": [0, 0, 1], + "ext": data._from_sequence( + [data[0], data[0], data[1]], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) + + res = pd.merge(df1, df2, how="outer") + exp = pd.DataFrame( + { + "int1": [1, 1, 2, 3, np.nan], + "int2": [1, 2, 3, np.nan, 4], + "key": [0, 0, 1, 2, 3], + "ext": data._from_sequence( + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) + + def test_merge_on_extension_array(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b], dtype=data.dtype) + + df = pd.DataFrame({"key": key, "val": [1, 2]}) + result = pd.merge(df, df, on="key") + expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]}) + self.assert_frame_equal(result, expected) + + # order + result = pd.merge(df.iloc[[1, 0]], df, on="key") + expected = expected.iloc[[1, 0]].reset_index(drop=True) + self.assert_frame_equal(result, expected) + + def test_merge_on_extension_array_duplicates(self, data): + # GH 23020 + a, b = data[:2] + key = type(data)._from_sequence([a, b, a], dtype=data.dtype) + df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) + + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame( + { + "key": key.take([0, 0, 0, 0, 1]), + "val_x": [1, 1, 3, 3, 2], + "val_y": [1, 3, 1, 3, 2], + } + ) + self.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) + def test_stack(self, data, columns): + df = pd.DataFrame({"A": data[:5], "B": data[:5]}) + df.columns = columns + result = df.stack() + expected = df.astype(object).stack() + # we need a second astype(object), in case the constructor inferred + # object -> specialized, as is done for period. + expected = expected.astype(object) + + if isinstance(expected, pd.Series): + assert result.dtype == df.iloc[:, 0].dtype + else: + assert all(result.dtypes == df.iloc[:, 0].dtype) + + result = result.astype(object) + self.assert_equal(result, expected) + + @pytest.mark.parametrize( + "index", + [ + # Two levels, uniform. 
+ pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]), + # non-uniform + pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]), + # three levels, non-uniform + pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]), + pd.MultiIndex.from_tuples( + [ + ("A", "a", 1), + ("A", "b", 0), + ("A", "a", 0), + ("B", "a", 0), + ("B", "c", 1), + ] + ), + ], + ) + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, data, index, obj): + data = data[: len(index)] + if obj == "series": + ser = pd.Series(data, index=index) + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) + + n = index.nlevels + levels = list(range(n)) + # [0, 1, 2] + # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)] + combinations = itertools.chain.from_iterable( + itertools.permutations(levels, i) for i in range(1, n) + ) + + for level in combinations: + result = ser.unstack(level=level) + assert all( + isinstance(result[col].array, type(data)) for col in result.columns + ) + expected = ser.astype(object).unstack(level=level) + result = result.astype(object) + + self.assert_frame_equal(result, expected) + + def test_ravel(self, data): + # as long as EA is 1D-only, ravel is a no-op + result = data.ravel() + assert type(result) == type(data) + + # Check that we have a view, not a copy + result[0] = result[1] + assert data[0] == data[1] + + def test_transpose(self, data): + df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"]) + result = df.T + expected = pd.DataFrame( + { + "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype), + "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype), + "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype), + "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype), + }, + index=["A", "B"], + ) + self.assert_frame_equal(result, expected) + self.assert_frame_equal(np.transpose(np.transpose(df)), df) + self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]]) diff --git a/venv/Lib/site-packages/pandas/tests/extension/base/setitem.py b/venv/Lib/site-packages/pandas/tests/extension/base/setitem.py new file mode 100644 index 0000000..0bb8aed --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/base/setitem.py @@ -0,0 +1,197 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd + +from .base import BaseExtensionTests + + +class BaseSetitemTests(BaseExtensionTests): + def test_setitem_scalar_series(self, data, box_in_series): + if box_in_series: + data = pd.Series(data) + data[0] = data[1] + assert data[0] == data[1] + + def test_setitem_sequence(self, data, box_in_series): + if box_in_series: + data = pd.Series(data) + original = data.copy() + + data[[0, 1]] = [data[1], data[0]] + assert data[0] == original[1] + assert data[1] == original[0] + + def test_setitem_sequence_mismatched_length_raises(self, data, as_array): + ser = pd.Series(data) + original = ser.copy() + value = [data[0]] + if as_array: + value = data._from_sequence(value) + + xpr = "cannot set using a {} indexer with a different length" + with pytest.raises(ValueError, match=xpr.format("list-like")): + ser[[0, 1]] = value + # Ensure no modifications made before the exception + self.assert_series_equal(ser, original) + + with pytest.raises(ValueError, match=xpr.format("slice")): + ser[slice(3)] = value + self.assert_series_equal(ser, original) + + def test_setitem_empty_indxer(self, data, box_in_series): + if box_in_series: + data = 
pd.Series(data) + original = data.copy() + data[np.array([], dtype=int)] = [] + self.assert_equal(data, original) + + def test_setitem_sequence_broadcasts(self, data, box_in_series): + if box_in_series: + data = pd.Series(data) + data[[0, 1]] = data[2] + assert data[0] == data[2] + assert data[1] == data[2] + + @pytest.mark.parametrize("setter", ["loc", "iloc"]) + def test_setitem_scalar(self, data, setter): + arr = pd.Series(data) + setter = getattr(arr, setter) + operator.setitem(setter, 0, data[1]) + assert arr[0] == data[1] + + def test_setitem_loc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.loc[0, "B"] = data[1] + assert df.loc[0, "B"] == data[1] + + def test_setitem_loc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] + + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] + + def test_setitem_iloc_scalar_mixed(self, data): + df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) + df.iloc[0, 1] = data[1] + assert df.loc[0, "B"] == data[1] + + def test_setitem_iloc_scalar_single(self, data): + df = pd.DataFrame({"B": data}) + df.iloc[10, 0] = data[1] + assert df.loc[10, "B"] == data[1] + + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + df.iloc[10, 1] = data[1] + assert df.loc[10, "B"] == data[1] + + @pytest.mark.parametrize("as_callable", [True, False]) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_aligned(self, data, as_callable, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + if as_callable: + mask2 = lambda x: mask + else: + mask2 = mask + + if setter: + # loc + target = getattr(ser, setter) + else: + # Series.__setitem__ + target = ser + + operator.setitem(target, mask2, data[5:7]) + + ser[mask2] = data[5:7] + assert ser[0] == data[5] + assert ser[1] == data[6] + + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True + + if setter: # loc + target = getattr(ser, setter) + else: # __setitem__ + target = ser + + operator.setitem(target, mask, data[10]) + assert ser[0] == data[10] + assert ser[1] == data[10] + + def test_setitem_expand_columns(self, data): + df = pd.DataFrame({"A": data}) + result = df.copy() + result["B"] = 1 + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + self.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[:, "B"] = 1 + self.assert_frame_equal(result, expected) + + # overwrite with new type + result["B"] = data + expected = pd.DataFrame({"A": data, "B": data}) + self.assert_frame_equal(result, expected) + + def test_setitem_expand_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + result = df.copy() + result["B"] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + self.assert_frame_equal(result, expected) + + result = df.copy() + result.loc[:, "B"] = data + self.assert_frame_equal(result, expected) + + def test_setitem_frame_invalid_length(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + xpr = "Length of values does not match length of index" + with pytest.raises(ValueError, match=xpr): + df["B"] = data[:5] + + @pytest.mark.xfail(reason="GH#20441: setitem on 
extension types.") + def test_setitem_tuple_index(self, data): + s = pd.Series(data[:2], index=[(0, 0), (0, 1)]) + expected = pd.Series(data.take([1, 1]), index=s.index) + s[(0, 1)] = data[1] + self.assert_series_equal(s, expected) + + def test_setitem_slice_mismatch_length_raises(self, data): + arr = data[:5] + with pytest.raises(ValueError): + arr[:1] = arr[:2] + + def test_setitem_slice_array(self, data): + arr = data[:5].copy() + arr[:5] = data[-5:] + self.assert_extension_array_equal(arr, data[-5:]) + + def test_setitem_scalar_key_sequence_raise(self, data): + arr = data[:5].copy() + with pytest.raises(ValueError): + arr[0] = arr[[0, 1]] + + def test_setitem_preserves_views(self, data): + # GH#28150 setitem shouldn't swap the underlying data + view1 = data.view() + view2 = data[:] + + data[0] = data[1] + assert view1[0] == data[1] + assert view2[0] == data[1] diff --git a/venv/Lib/site-packages/pandas/tests/extension/conftest.py b/venv/Lib/site-packages/pandas/tests/extension/conftest.py new file mode 100644 index 0000000..d37638d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/conftest.py @@ -0,0 +1,178 @@ +import operator + +import pytest + +from pandas import Series + + +@pytest.fixture +def dtype(): + """A fixture providing the ExtensionDtype to validate.""" + raise NotImplementedError + + +@pytest.fixture +def data(): + """Length-100 array for this type. + + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not be equal + """ + raise NotImplementedError + + +@pytest.fixture +def data_for_twos(): + """Length-100 array in which all the elements are two.""" + raise NotImplementedError + + +@pytest.fixture +def data_missing(): + """Length-2 array with [NA, Valid]""" + raise NotImplementedError + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing + + +@pytest.fixture +def data_repeated(data): + """ + Generate many datasets. + + Parameters + ---------- + data : fixture implementing `data` + + Returns + ------- + Callable[[int], Generator]: + A callable that takes a `count` argument and + returns a generator yielding `count` datasets. + """ + + def gen(count): + for _ in range(count): + yield data + + return gen + + +@pytest.fixture +def data_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + raise NotImplementedError + + +@pytest.fixture +def data_missing_for_sorting(): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + raise NotImplementedError + + +@pytest.fixture +def na_cmp(): + """Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. + + By default, uses ``operator.is_`` + """ + return operator.is_ + + +@pytest.fixture +def na_value(): + """The scalar missing value for this type. Default 'None'""" + return None + + +@pytest.fixture +def data_for_grouping(): + """Data for factorization, grouping, and unique tests. 
+ + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + raise NotImplementedError + + +@pytest.fixture(params=[True, False]) +def box_in_series(request): + """Whether to box the data in a Series""" + return request.param + + +@pytest.fixture( + params=[ + lambda x: 1, + lambda x: [1] * len(x), + lambda x: Series([1] * len(x)), + lambda x: x, + ], + ids=["scalar", "list", "series", "object"], +) +def groupby_apply_op(request): + """ + Functions to test groupby.apply(). + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def as_frame(request): + """ + Boolean fixture to support Series and Series.to_frame() comparison testing. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def as_series(request): + """ + Boolean fixture to support arr and Series(arr) comparison testing. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def use_numpy(request): + """ + Boolean fixture to support comparison testing of ExtensionDtype array + and numpy array. + """ + return request.param + + +@pytest.fixture(params=["ffill", "bfill"]) +def fillna_method(request): + """ + Parametrized fixture giving method parameters 'ffill' and 'bfill' for + Series.fillna(method=) testing. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def as_array(request): + """ + Boolean fixture to support ExtensionDtype _from_sequence method testing. + """ + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/extension/decimal/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/decimal/__init__.py new file mode 100644 index 0000000..8194327 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/decimal/__init__.py @@ -0,0 +1,3 @@ +from .array import DecimalArray, DecimalDtype, make_data, to_decimal + +__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] diff --git a/venv/Lib/site-packages/pandas/tests/extension/decimal/array.py b/venv/Lib/site-packages/pandas/tests/extension/decimal/array.py new file mode 100644 index 0000000..743852c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/decimal/array.py @@ -0,0 +1,203 @@ +import decimal +import numbers +import random +import sys + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.api.extensions import no_default, register_extension_dtype +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin + + +@register_extension_dtype +class DecimalDtype(ExtensionDtype): + type = decimal.Decimal + name = "decimal" + na_value = decimal.Decimal("NaN") + _metadata = ("context",) + + def __init__(self, context=None): + self.context = context or decimal.getcontext() + + def __repr__(self) -> str: + return f"DecimalDtype(context={self.context})" + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return DecimalArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @property + def _is_numeric(self): + return True + + +class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin): + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False, context=None): + for val in values: + if not isinstance(val, decimal.Decimal): + raise TypeError("All values must be of type " + str(decimal.Decimal)) + values = np.asarray(values, dtype=object) + + self._data = values + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self.data = self._data + # those aliases are currently not working due to assumptions + # in internal code (GH-20735) + # self._values = self.values = self.data + self._dtype = DecimalDtype(context) + + @property + def dtype(self): + return self._dtype + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + + def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # + if not all( + isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs + ): + return NotImplemented + + inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + + def reconstruct(x): + if isinstance(x, (decimal.Decimal, numbers.Number)): + return x + else: + return DecimalArray._from_sequence(x) + + if isinstance(result, tuple): + return tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self._data[item] + else: + # array, slice. 
+ item = pd.api.indexers.check_array_indexer(self, item) + return type(self)(self._data[item]) + + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self._data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) + return self._from_sequence(result) + + def copy(self): + return type(self)(self._data.copy()) + + def astype(self, dtype, copy=True): + if isinstance(dtype, type(self.dtype)): + return type(self)(self._data, context=dtype.context) + return np.asarray(self, dtype=dtype) + + def __setitem__(self, key, value): + if pd.api.types.is_list_like(value): + if pd.api.types.is_scalar(key): + raise ValueError("setting an array element with a sequence.") + value = [decimal.Decimal(v) for v in value] + else: + value = decimal.Decimal(value) + self._data[key] = value + + def __len__(self) -> int: + return len(self._data) + + @property + def nbytes(self) -> int: + n = len(self) + if n: + return n * sys.getsizeof(self[0]) + return 0 + + def isna(self): + return np.array([x.is_nan() for x in self._data], dtype=bool) + + @property + def _na_value(self): + return decimal.Decimal("NaN") + + def _formatter(self, boxed=False): + if boxed: + return "Decimal: {0}".format + return repr + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([x._data for x in to_concat])) + + def _reduce(self, name, skipna=True, **kwargs): + + if skipna: + # If we don't have any NAs, we can ignore skipna + if self.isna().any(): + other = self[~self.isna()] + return other._reduce(name, **kwargs) + + if name == "sum" and len(self) == 0: + # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy + return decimal.Decimal(0) + + try: + op = getattr(self.data, name) + except AttributeError: + raise NotImplementedError(f"decimal does not support the {name} operation") + return op(axis=0) + + +def to_decimal(values, context=None): + return DecimalArray([decimal.Decimal(x) for x in values], context=context) + + +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] + + +DecimalArray._add_arithmetic_ops() +DecimalArray._add_comparison_ops() diff --git a/venv/Lib/site-packages/pandas/tests/extension/decimal/test_decimal.py b/venv/Lib/site-packages/pandas/tests/extension/decimal/test_decimal.py new file mode 100644 index 0000000..de7c98a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/decimal/test_decimal.py @@ -0,0 +1,515 @@ +import decimal +import math +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +from .array import DecimalArray, DecimalDtype, make_data, to_decimal + + +@pytest.fixture +def dtype(): + return DecimalDtype() + + +@pytest.fixture +def data(): + return DecimalArray(make_data()) + + +@pytest.fixture +def data_for_twos(): + return DecimalArray([decimal.Decimal(2) for _ in range(100)]) + + +@pytest.fixture +def data_missing(): + return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)]) + + +@pytest.fixture +def data_for_sorting(): + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("2"), decimal.Decimal("0")] + ) + + +@pytest.fixture +def data_missing_for_sorting(): + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("NaN"), decimal.Decimal("0")] + ) + + +@pytest.fixture +def na_cmp(): + return lambda x, y: x.is_nan() and y.is_nan() + + 
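Editorial aside (not part of the diff): the `na_cmp` fixture just above has to be overridden for decimals because two `decimal.Decimal("NaN")` values are neither equal to each other nor the same object, so the default `operator.is_` comparison cannot detect matching NAs. A minimal sketch illustrating this:

```python
# Why the decimal fixtures override na_cmp: Decimal NaNs compare unequal and
# are distinct objects, so neither `==` nor the default `operator.is_` works.
import decimal
import operator

a, b = decimal.Decimal("NaN"), decimal.Decimal("NaN")
assert a != b                     # NaN never compares equal, even to itself
assert not operator.is_(a, b)     # distinct objects, so the default na_cmp fails
assert a.is_nan() and b.is_nan()  # the comparison the fixture returns instead
```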
+@pytest.fixture +def na_value(): + return decimal.Decimal("NaN") + + +@pytest.fixture +def data_for_grouping(): + b = decimal.Decimal("1.0") + a = decimal.Decimal("0.0") + c = decimal.Decimal("2.0") + na = decimal.Decimal("NaN") + return DecimalArray([b, b, na, na, a, a, b, c]) + + +class BaseDecimal: + def assert_series_equal(self, left, right, *args, **kwargs): + def convert(x): + # need to convert array([Decimal(NaN)], dtype='object') to np.NaN + # because Series[object].isnan doesn't recognize decimal(NaN) as + # NA. + try: + return math.isnan(x) + except TypeError: + return False + + if left.dtype == "object": + left_na = left.apply(convert) + else: + left_na = left.isna() + if right.dtype == "object": + right_na = right.apply(convert) + else: + right_na = right.isna() + + tm.assert_series_equal(left_na, right_na) + return tm.assert_series_equal(left[~left_na], right[~right_na], *args, **kwargs) + + def assert_frame_equal(self, left, right, *args, **kwargs): + # TODO(EA): select_dtypes + tm.assert_index_equal( + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + ) + + decimals = (left.dtypes == "decimal").index + + for col in decimals: + self.assert_series_equal(left[col], right[col], *args, **kwargs) + + left = left.drop(columns=decimals) + right = right.drop(columns=decimals) + tm.assert_frame_equal(left, right, *args, **kwargs) + + +class TestDtype(BaseDecimal, base.BaseDtypeTests): + def test_hashable(self, dtype): + pass + + +class TestInterface(BaseDecimal, base.BaseInterfaceTests): + pass + + +class TestConstructors(BaseDecimal, base.BaseConstructorsTests): + @pytest.mark.skip(reason="not implemented constructor from dtype") + def test_from_dtype(self, data): + # construct from our dtype & string dtype + pass + + +class TestReshaping(BaseDecimal, base.BaseReshapingTests): + pass + + +class TestGetitem(BaseDecimal, base.BaseGetitemTests): + def test_take_na_value_other_decimal(self): + arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) + result = arr.take([0, -1], allow_fill=True, fill_value=decimal.Decimal("-1.0")) + expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")]) + self.assert_extension_array_equal(result, expected) + + +class TestMissing(BaseDecimal, base.BaseMissingTests): + pass + + +class Reduce: + def check_reduce(self, s, op_name, skipna): + + if op_name in ["median", "skew", "kurt"]: + with pytest.raises(NotImplementedError): + getattr(s, op_name)(skipna=skipna) + + else: + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(np.asarray(s), op_name)() + tm.assert_almost_equal(result, expected) + + +class TestNumericReduce(Reduce, base.BaseNumericReduceTests): + pass + + +class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests): + pass + + +class TestMethods(BaseDecimal, base.BaseMethodsTests): + @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.xfail(reason="value_counts not implemented yet.") + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + 
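For orientation, a small usage sketch (not part of the diff) of the behaviour the `Reduce.check_reduce` override above verifies: a reduction on a decimal-backed Series should agree with the same reduction over the plain object ndarray. It assumes the `DecimalArray` added earlier in this diff is importable as `pandas.tests.extension.decimal.DecimalArray`.

```python
# Sketch of what check_reduce compares for the decimal array: the Series
# reduction should match the reduction over the underlying object ndarray.
import decimal

import numpy as np
import pandas as pd
from pandas.tests.extension.decimal import DecimalArray  # added in this diff

s = pd.Series(DecimalArray([decimal.Decimal("1.5"), decimal.Decimal("2.5")]))
result = s.sum()                # dispatched to DecimalArray._reduce("sum")
expected = np.asarray(s).sum()  # same reduction on the object ndarray
assert result == expected == decimal.Decimal("4.0")
```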
+class TestCasting(BaseDecimal, base.BaseCastingTests): + pass + + +class TestGroupby(BaseDecimal, base.BaseGroupbyTests): + @pytest.mark.xfail( + reason="needs to correctly define __eq__ to handle nans, xref #27081." + ) + def test_groupby_apply_identity(self, data_for_grouping): + super().test_groupby_apply_identity(data_for_grouping) + + +class TestSetitem(BaseDecimal, base.BaseSetitemTests): + pass + + +class TestPrinting(BaseDecimal, base.BasePrintingTests): + def test_series_repr(self, data): + # Overriding this base test to explicitly test that + # the custom _formatter is used + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + assert "Decimal: " in repr(ser) + + +# TODO(extension) +@pytest.mark.xfail( + reason=( + "raising AssertionError as this is not implemented, though easy enough to do" + ) +) +def test_series_constructor_coerce_data_to_extension_dtype_raises(): + xpr = ( + "Cannot cast data to extension dtype 'decimal'. Pass the " + "extension array directly." + ) + with pytest.raises(ValueError, match=xpr): + pd.Series([0, 1, 2], dtype=DecimalDtype()) + + +def test_series_constructor_with_dtype(): + arr = DecimalArray([decimal.Decimal("10.0")]) + result = pd.Series(arr, dtype=DecimalDtype()) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) + + result = pd.Series(arr, dtype="int64") + expected = pd.Series([10]) + tm.assert_series_equal(result, expected) + + +def test_dataframe_constructor_with_dtype(): + arr = DecimalArray([decimal.Decimal("10.0")]) + + result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) + expected = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, expected) + + arr = DecimalArray([decimal.Decimal("10.0")]) + result = pd.DataFrame({"A": arr}, dtype="int64") + expected = pd.DataFrame({"A": [10]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_astype_dispatches(frame): + # This is a dtype-specific test that ensures Series[decimal].astype + # gets all the way through to ExtensionArray.astype + # Designing a reliable smoke test that works for arbitrary data types + # is difficult. 
+ data = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a") + ctx = decimal.Context() + ctx.prec = 5 + + if frame: + data = data.to_frame() + + result = data.astype(DecimalDtype(ctx)) + + if frame: + result = result["a"] + + assert result.dtype.context.prec == ctx.prec + + +class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + super().check_opname(s, op_name, other, exc=None) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + op_name = all_arithmetic_operators + s = pd.Series(data) + + context = decimal.getcontext() + divbyzerotrap = context.traps[decimal.DivisionByZero] + invalidoptrap = context.traps[decimal.InvalidOperation] + context.traps[decimal.DivisionByZero] = 0 + context.traps[decimal.InvalidOperation] = 0 + + # Decimal supports ops with int, but not float + other = pd.Series([int(d * 100) for d in data]) + self.check_opname(s, op_name, other) + + if "mod" not in op_name: + self.check_opname(s, op_name, s * 2) + + self.check_opname(s, op_name, 0) + self.check_opname(s, op_name, 5) + context.traps[decimal.DivisionByZero] = divbyzerotrap + context.traps[decimal.InvalidOperation] = invalidoptrap + + def _check_divmod_op(self, s, op, other, exc=NotImplementedError): + # We implement divmod + super()._check_divmod_op(s, op, other, exc=None) + + def test_error(self): + pass + + +class TestComparisonOps(BaseDecimal, base.BaseComparisonOpsTests): + def check_opname(self, s, op_name, other, exc=None): + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, 0.5) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + + alter = np.random.choice([-1, 0, 1], len(data)) + # Randomly double, halve or keep same value + other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) for i in alter] + self._compare_other(s, data, op_name, other) + + +class DecimalArrayWithoutFromSequence(DecimalArray): + """Helper class for testing error handling in _from_sequence.""" + + def _from_sequence(cls, scalars, dtype=None, copy=False): + raise KeyError("For the test") + + +class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence): + @classmethod + def _create_arithmetic_method(cls, op): + return cls._create_method(op, coerce_to_dtype=False) + + +DecimalArrayWithoutCoercion._add_arithmetic_ops() + + +def test_combine_from_sequence_raises(): + # https://github.com/pandas-dev/pandas/issues/22850 + ser = pd.Series( + DecimalArrayWithoutFromSequence( + [decimal.Decimal("1.0"), decimal.Decimal("2.0")] + ) + ) + result = ser.combine(ser, operator.add) + + # note: object dtype + expected = pd.Series( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion] +) +def test_scalar_ops_from_sequence_raises(class_): + # op(EA, EA) should return an EA, or an ndarray if it's not possible + # to return an EA with the return values. 
+ arr = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) + result = arr + arr + expected = np.array( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "reverse, expected_div, expected_mod", + [(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])], +) +def test_divmod_array(reverse, expected_div, expected_mod): + # https://github.com/pandas-dev/pandas/issues/22930 + arr = to_decimal([1, 2, 3, 4]) + if reverse: + div, mod = divmod(2, arr) + else: + div, mod = divmod(arr, 2) + expected_div = to_decimal(expected_div) + expected_mod = to_decimal(expected_mod) + + tm.assert_extension_array_equal(div, expected_div) + tm.assert_extension_array_equal(mod, expected_mod) + + +def test_ufunc_fallback(data): + a = data[:5] + s = pd.Series(a, index=range(3, 8)) + result = np.abs(s) + expected = pd.Series(np.abs(a), index=range(3, 8)) + tm.assert_series_equal(result, expected) + + +def test_array_ufunc(): + a = to_decimal([1, 2, 3]) + result = np.exp(a) + expected = to_decimal(np.exp(a._data)) + tm.assert_extension_array_equal(result, expected) + + +def test_array_ufunc_series(): + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + result = np.exp(s) + expected = pd.Series(to_decimal(np.exp(a._data))) + tm.assert_series_equal(result, expected) + + +def test_array_ufunc_series_scalar_other(): + # check _HANDLED_TYPES + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + result = np.add(s, decimal.Decimal(1)) + expected = pd.Series(np.add(a, decimal.Decimal(1))) + tm.assert_series_equal(result, expected) + + +def test_array_ufunc_series_defer(): + a = to_decimal([1, 2, 3]) + s = pd.Series(a) + + expected = pd.Series(to_decimal([2, 4, 6])) + r1 = np.add(s, a) + r2 = np.add(a, s) + + tm.assert_series_equal(r1, expected) + tm.assert_series_equal(r2, expected) + + +def test_groupby_agg(): + # Ensure that the result of agg is inferred to be decimal dtype + # https://github.com/pandas-dev/pandas/issues/29141 + + data = make_data()[:5] + df = pd.DataFrame( + {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} + ) + + # single key, selected column + expected = pd.Series(to_decimal([data[0], data[3]])) + result = df.groupby("id1")["decimals"].agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + result = df["decimals"].groupby(df["id1"]).agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + + # multiple keys, selected column + expected = pd.Series( + to_decimal([data[0], data[1], data[3]]), + index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]), + ) + result = df.groupby(["id1", "id2"])["decimals"].agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + result = df["decimals"].groupby([df["id1"], df["id2"]]).agg(lambda x: x.iloc[0]) + tm.assert_series_equal(result, expected, check_names=False) + + # multiple columns + expected = pd.DataFrame({"id2": [0, 1], "decimals": to_decimal([data[0], data[3]])}) + result = df.groupby("id1").agg(lambda x: x.iloc[0]) + tm.assert_frame_equal(result, expected, check_names=False) + + +def test_groupby_agg_ea_method(monkeypatch): + # Ensure that the result of agg is inferred to be decimal dtype + # https://github.com/pandas-dev/pandas/issues/29141 + + def DecimalArray__my_sum(self): + return np.sum(np.array(self)) + + monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False) + + data = make_data()[:5] + df 
= pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)}) + expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]])) + + result = df.groupby("id")["decimals"].agg(lambda x: x.values.my_sum()) + tm.assert_series_equal(result, expected, check_names=False) + s = pd.Series(DecimalArray(data)) + result = s.groupby(np.array([0, 0, 0, 1, 1])).agg(lambda x: x.values.my_sum()) + tm.assert_series_equal(result, expected, check_names=False) + + +def test_indexing_no_materialize(monkeypatch): + # See https://github.com/pandas-dev/pandas/issues/29708 + # Ensure that indexing operations do not materialize (convert to a numpy + # array) the ExtensionArray unnecessary + + def DecimalArray__array__(self, dtype=None): + raise Exception("tried to convert a DecimalArray to a numpy array") + + monkeypatch.setattr(DecimalArray, "__array__", DecimalArray__array__, raising=False) + + data = make_data() + s = pd.Series(DecimalArray(data)) + df = pd.DataFrame({"a": s, "b": range(len(s))}) + + # ensure the following operations do not raise an error + s[s > 0.5] + df[s > 0.5] + s.at[0] + df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/extension/json/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/json/__init__.py new file mode 100644 index 0000000..e205c7e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/json/__init__.py @@ -0,0 +1,3 @@ +from .array import JSONArray, JSONDtype, make_data + +__all__ = ["JSONArray", "JSONDtype", "make_data"] diff --git a/venv/Lib/site-packages/pandas/tests/extension/json/array.py b/venv/Lib/site-packages/pandas/tests/extension/json/array.py new file mode 100644 index 0000000..ded513b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/json/array.py @@ -0,0 +1,207 @@ +"""Test extension array for storing nested data in a pandas container. + +The JSONArray stores lists of dictionaries. The storage mechanism is a list, +not an ndarray. + +Note: + +We currently store lists of UserDicts. Pandas has a few places +internally that specifically check for dicts, and does non-scalar things +in that case. We *want* the dictionaries to be treated as scalars, so we +hack around pandas by using UserDicts. +""" +from collections import UserDict, abc +import itertools +import numbers +import random +import string +import sys + +import numpy as np + +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype + + +class JSONDtype(ExtensionDtype): + type = abc.Mapping + name = "json" + na_value = UserDict() + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return JSONArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + +class JSONArray(ExtensionArray): + dtype = JSONDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False): + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError("All values must be of type " + str(self.dtype.type)) + self.data = values + + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self._data = self.data + # those aliases are currently not working due to assumptions + # in internal code (GH-20735) + # self._values = self.values = self.data + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls([UserDict(x) for x in values if x != ()]) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) + elif isinstance(item, slice): + # slice + return type(self)(self.data[item]) + else: + item = pd.api.indexers.check_array_indexer(self, item) + if pd.api.types.is_bool_dtype(item.dtype): + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) + + def __setitem__(self, key, value): + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), abc.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == "bool": + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self) -> int: + return len(self.data) + + def __array__(self, dtype=None): + if dtype is None: + dtype = object + return np.asarray(self.data, dtype=dtype) + + @property + def nbytes(self) -> int: + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError: + raise IndexError(msg) + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + + return self._from_sequence(output) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + # NumPy has issues when all the dicts are the same length. + # np.array([UserDict(...), UserDict(...)]) fails, + # but np.array([{...}, {...}]) works, so cast. 
+ + # needed to add this check for the Series constructor + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + + def unique(self): + # Parent method doesn't work since np.array will try to infer + # a 2-dim object. + return type(self)( + [dict(x) for x in list({tuple(d.items()) for d in self.data})] + ) + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable([x.data for x in to_concat])) + return cls(data) + + def _values_for_factorize(self): + frozen = self._values_for_argsort() + if len(frozen) == 0: + # _factorize_array expects 1-d array, this is a len-0 2-d array. + frozen = frozen.ravel() + return frozen, () + + def _values_for_argsort(self): + # Disable NumPy's shape inference by including an empty tuple... + # If all the elements of self are the same size P, NumPy will + # cast them to an (N, P) array, instead of an (N,) array of tuples. + frozen = [()] + [tuple(x.items()) for x in self] + return np.array(frozen, dtype=object)[1:] + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + return [ + UserDict( + [ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10)) + ] + ) + for _ in range(100) + ] diff --git a/venv/Lib/site-packages/pandas/tests/extension/json/test_json.py b/venv/Lib/site-packages/pandas/tests/extension/json/test_json.py new file mode 100644 index 0000000..dc03a1f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/json/test_json.py @@ -0,0 +1,303 @@ +import collections +import operator + +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +from .array import JSONArray, JSONDtype, make_data + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercible to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. + + while len(data[0]) == len(data[1]): + data = make_data() + + return JSONArray(data) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {"a": 10}]) + + +@pytest.fixture +def data_for_sorting(): + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{"b": 1}, {}, {"a": 4}]) + + +@pytest.fixture +def na_value(dtype): + return dtype.na_value + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +@pytest.fixture +def data_for_grouping(): + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) + + +class BaseJSON: + # NumPy doesn't handle an array of equal-length UserDicts. + # The default assert_series_equal eventually does a + # Series.values, which raises. We work around it by + # converting the UserDicts to dicts. 
+ def assert_series_equal(self, left, right, **kwargs): + if left.dtype.name == "json": + assert left.dtype == right.dtype + left = pd.Series( + JSONArray(left.values.astype(object)), index=left.index, name=left.name + ) + right = pd.Series( + JSONArray(right.values.astype(object)), + index=right.index, + name=right.name, + ) + tm.assert_series_equal(left, right, **kwargs) + + def assert_frame_equal(self, left, right, *args, **kwargs): + obj_type = kwargs.get("obj", "DataFrame") + tm.assert_index_equal( + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj=f"{obj_type}.columns", + ) + + jsons = (left.dtypes == "json").index + + for col in jsons: + self.assert_series_equal(left[col], right[col], *args, **kwargs) + + left = left.drop(columns=jsons) + right = right.drop(columns=jsons) + tm.assert_frame_equal(left, right, *args, **kwargs) + + +class TestDtype(BaseJSON, base.BaseDtypeTests): + pass + + +class TestInterface(BaseJSON, base.BaseInterfaceTests): + def test_custom_asserts(self): + # This would always trigger the KeyError from trying to put + # an array of equal-length UserDicts inside an ndarray. + data = JSONArray( + [ + collections.UserDict({"a": 1}), + collections.UserDict({"b": 2}), + collections.UserDict({"c": 3}), + ] + ) + a = pd.Series(data) + self.assert_series_equal(a, a) + self.assert_frame_equal(a.to_frame(), a.to_frame()) + + b = pd.Series(data.take([0, 0, 1])) + with pytest.raises(AssertionError): + self.assert_series_equal(a, b) + + with pytest.raises(AssertionError): + self.assert_frame_equal(a.to_frame(), b.to_frame()) + + +class TestConstructors(BaseJSON, base.BaseConstructorsTests): + @pytest.mark.skip(reason="not implemented constructor from dtype") + def test_from_dtype(self, data): + # construct from our dtype & string dtype + pass + + +class TestReshaping(BaseJSON, base.BaseReshapingTests): + @pytest.mark.skip(reason="Different definitions of NA") + def test_stack(self): + """ + The test does .astype(object).stack(). If we happen to have + any missing values in `data`, then we'll end up with different + rows since we consider `{}` NA, but `.astype(object)` doesn't. + """ + + @pytest.mark.xfail(reason="dict for NA") + def test_unstack(self, data, index): + # The base test has NaN for the expected NA value. + # this matches otherwise + return super().test_unstack(data, index) + + +class TestGetitem(BaseJSON, base.BaseGetitemTests): + pass + + +class TestMissing(BaseJSON, base.BaseMissingTests): + @pytest.mark.skip(reason="Setting a dict as a scalar") + def test_fillna_series(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + + @pytest.mark.skip(reason="Setting a dict as a scalar") + def test_fillna_frame(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + + +unhashable = pytest.mark.skip(reason="Unhashable") + + +class TestReduce(base.BaseNoReduceTests): + pass + + +class TestMethods(BaseJSON, base.BaseMethodsTests): + @unhashable + def test_value_counts(self, all_data, dropna): + pass + + @unhashable + def test_sort_values_frame(self): + # TODO (EA.factorize): see if _values_for_factorize allows this. 
+ pass + + def test_argsort(self, data_for_sorting): + super().test_argsort(data_for_sorting) + + def test_argsort_missing(self, data_missing_for_sorting): + super().test_argsort_missing(data_missing_for_sorting) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values(self, data_for_sorting, ascending): + super().test_sort_values(data_for_sorting, ascending) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + super().test_sort_values_missing(data_missing_for_sorting, ascending) + + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + pass + + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_add(self, data_repeated): + pass + + @pytest.mark.skip(reason="combine for JSONArray not supported") + def test_combine_first(self, data): + pass + + @unhashable + def test_hash_pandas_object_works(self, data, kind): + super().test_hash_pandas_object_works(data, kind) + + @pytest.mark.skip(reason="broadcasting error") + def test_where_series(self, data, na_value): + # Fails with + # *** ValueError: operands could not be broadcast together + # with shapes (4,) (4,) (0,) + super().test_where_series(data, na_value) + + @pytest.mark.skip(reason="Can't compare dicts.") + def test_searchsorted(self, data_for_sorting): + super().test_searchsorted(data_for_sorting) + + +class TestCasting(BaseJSON, base.BaseCastingTests): + @pytest.mark.skip(reason="failing on np.array(self, dtype=str)") + def test_astype_str(self): + """This currently fails in NumPy on np.array(self, dtype=str) with + + *** ValueError: setting an array element with a sequence + """ + + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. + + +class TestGroupby(BaseJSON, base.BaseGroupbyTests): + @unhashable + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + + @unhashable + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + + > hash(val) + E TypeError: unhashable type: 'UserDict' with + + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. 
+ """ + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + super().test_groupby_extension_agg(as_index, data_for_grouping) + + +class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): + def test_error(self, data, all_arithmetic_operators): + pass + + def test_add_series_with_extension_array(self, data): + ser = pd.Series(data) + with pytest.raises(TypeError, match="unsupported"): + ser + data + + def test_divmod_series_array(self): + # GH 23287 + # skipping because it is not implemented + pass + + def _check_divmod_op(self, s, op, other, exc=NotImplementedError): + return super()._check_divmod_op(s, op, other, exc=TypeError) + + +class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests): + pass + + +class TestPrinting(BaseJSON, base.BasePrintingTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/list/__init__.py b/venv/Lib/site-packages/pandas/tests/extension/list/__init__.py new file mode 100644 index 0000000..108f193 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/list/__init__.py @@ -0,0 +1,3 @@ +from .array import ListArray, ListDtype, make_data + +__all__ = ["ListArray", "ListDtype", "make_data"] diff --git a/venv/Lib/site-packages/pandas/tests/extension/list/array.py b/venv/Lib/site-packages/pandas/tests/extension/list/array.py new file mode 100644 index 0000000..6dd00ad --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/list/array.py @@ -0,0 +1,133 @@ +""" +Test extension array for storing nested data in a pandas container. + +The ListArray stores an ndarray of lists. +""" +import numbers +import random +import string + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + + +class ListDtype(ExtensionDtype): + type = list + name = "list" + na_value = np.nan + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ListArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") + + +class ListArray(ExtensionArray): + dtype = ListDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False): + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array as values") + for val in values: + if not isinstance(val, self.dtype.type) and not pd.isna(val): + raise TypeError("All values must be of type " + str(self.dtype.type)) + self.data = values + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + data = np.empty(len(scalars), dtype=object) + data[:] = scalars + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + return type(self)(self.data[item]) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.array( + [not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool + ) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." 
+ ) + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError: + raise IndexError(msg) + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + + return self._from_sequence(output) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype( + dtype + ): + # numpy has problems with astype(str) for nested elements + return np.array([str(x) for x in self.data], dtype=dtype) + return np.array(self.data, dtype=dtype, copy=copy) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + return cls(data) + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + data = np.empty(100, dtype=object) + data[:] = [ + [random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))] + for _ in range(100) + ] + return data diff --git a/venv/Lib/site-packages/pandas/tests/extension/list/test_list.py b/venv/Lib/site-packages/pandas/tests/extension/list/test_list.py new file mode 100644 index 0000000..c5c4417 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/list/test_list.py @@ -0,0 +1,30 @@ +import pytest + +import pandas as pd + +from .array import ListArray, ListDtype, make_data + + +@pytest.fixture +def dtype(): + return ListDtype() + + +@pytest.fixture +def data(): + """Length-100 ListArray for semantics test.""" + data = make_data() + + while len(data[0]) == len(data[1]): + data = make_data() + + return ListArray(data) + + +def test_to_csv(data): + # https://github.com/pandas-dev/pandas/issues/28840 + # array with list-likes fail when doing astype(str) on the numpy array + # which was done in to_native_types + df = pd.DataFrame({"a": data}) + res = df.to_csv() + assert str(data[0]) in res diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_boolean.py b/venv/Lib/site-packages/pandas/tests/extension/test_boolean.py new file mode 100644 index 0000000..c489445 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_boolean.py @@ -0,0 +1,347 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. 
+ +""" +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p14 + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype +from pandas.tests.extension import base + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_for_twos(dtype): + return pd.array(np.ones(100), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, True], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(dtype): + return pd.array([True, True, False], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return pd.array([True, np.nan, False], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(dtype): + b = True + a = False + na = np.nan + return pd.array([b, b, na, na, a, a, b], dtype=dtype) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + if op_name in ("__sub__", "__rsub__"): + # subtraction for bools raises TypeError (but not yet in 1.13) + if _np_version_under1p14: + pytest.skip("__sub__ does not yet raise in numpy 1.13") + with pytest.raises(TypeError): + op(s, other) + + return + + result = op(s, other) + expected = s.combine(other, op) + + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + expected = expected.astype("Int8") + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for div is to regard the bools as numeric) + expected = s.astype(float).combine(other, op) + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + expected[result.isna()] = np.nan + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=None): + # override to not raise an error + super()._check_divmod_op(s, op, other, None) + + @pytest.mark.skip(reason="BooleanArray does not error on ops") + def test_error(self, data, all_arithmetic_operators): + # other specific errors tested in the boolean array specific tests + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_scalar(self, data, all_compare_operators): + pass + 
+ @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_array(self, data, all_compare_operators): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + # override because we only have 2 unique values + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp + ) + expected_uniques = data_for_grouping.take([0, 4]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_combine_le(self, data_repeated): + # override because expected needs to be boolean instead of bool dtype + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") + self.assert_series_equal(result, expected) + + def test_searchsorted(self, data_for_sorting, as_series): + # override because we only have 2 unique values + data_for_sorting = pd.array([True, False], dtype="boolean") + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestGroupby(base.BaseGroupbyTests): + """ + Groupby-specific tests are overridden because boolean only has 2 + unique values, base tests uses 3 groups. 
+ """ + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping} + ) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + index = pd.Index(index, name="B") + expected = pd.Series([3, 1], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + expected = pd.Series([1, 3], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3], name="A") + + self.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + df.groupby("B").apply(groupby_apply_op) + df.groupby("B").A.apply(groupby_apply_op) + df.groupby("A").apply(groupby_apply_op) + df.groupby("A").B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + ], + index=pd.Index([1, 2, 3], name="A"), + name="B", + ) + self.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1], + } + ) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(["B", "C"]) + else: + expected = pd.Index(["C"]) + + tm.assert_index_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + # override parent function to cast to bool for min/max + if np.isnan(expected): + expected = pd.NA + elif op_name in ("min", "max"): + expected = bool(expected) + tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +# TODO parsing not yet supported +# class TestParsing(base.BaseParsingTests): +# pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_categorical.py b/venv/Lib/site-packages/pandas/tests/extension/test_categorical.py new file mode 100644 index 
0000000..336b23e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_categorical.py @@ -0,0 +1,286 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, CategoricalIndex, Timestamp +import pandas._testing as tm +from pandas.api.types import CategoricalDtype +from pandas.tests.extension import base + + +def make_data(): + while True: + values = np.random.choice(list(string.ascii_letters), size=100) + # ensure we meet the requirements + # 1. first two not null + # 2. first and second are different + if values[0] != values[1]: + break + return values + + +@pytest.fixture +def dtype(): + return CategoricalDtype() + + +@pytest.fixture +def data(): + """Length-100 array for this type. + + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not gbe equal + """ + return Categorical(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return Categorical([np.nan, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True) + + +@pytest.fixture +def data_missing_for_sorting(): + return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(): + return Categorical(["a", "a", None, None, "b", "b", "a", "c"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.skip(reason="Memory usage doesn't match") + def test_memory_usage(self, data): + # Is this deliberate? + super().test_memory_usage(data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + skip_take = pytest.mark.skip(reason="GH-20664.") + + @pytest.mark.skip(reason="Backwards compatibility") + def test_getitem_scalar(self, data): + # CategoricalDtype.type isn't "correct" since it should + # be a parent of the elements (object). But don't want + # to break things by changing. + super().test_getitem_scalar(data) + + @skip_take + def test_take(self, data, na_value, na_cmp): + # TODO remove this once Categorical.take is fixed + super().test_take(data, na_value, na_cmp) + + @skip_take + def test_take_negative(self, data): + super().test_take_negative(data) + + @skip_take + def test_take_pandas_style_negative_raises(self, data, na_value): + super().test_take_pandas_style_negative_raises(data, na_value) + + @skip_take + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + @skip_take + def test_take_out_of_bounds_raises(self, data, allow_fill): + return super().test_take_out_of_bounds_raises(data, allow_fill) + + @pytest.mark.skip(reason="GH-20747. 
Unobserved categories.") + def test_take_series(self, data): + super().test_take_series(data) + + @skip_take + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) + + @pytest.mark.skip(reason="Categorical.take buggy") + def test_take_empty(self, data, na_value, na_cmp): + super().test_take_empty(data, na_value, na_cmp) + + @pytest.mark.skip(reason="test not written correctly for categorical") + def test_reindex(self, data, na_value): + super().test_reindex(data, na_value) + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + @pytest.mark.skip(reason="Not implemented") + def test_fillna_limit_pad(self, data_missing): + super().test_fillna_limit_pad(data_missing) + + @pytest.mark.skip(reason="Not implemented") + def test_fillna_limit_backfill(self, data_missing): + super().test_fillna_limit_backfill(data_missing) + + +class TestReduce(base.BaseNoReduceTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="Unobserved categories included") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + + def test_combine_add(self, data_repeated): + # GH 20825 + # When adding categoricals in combine, result is a string + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 + x2) + expected = pd.Series( + ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]) + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 + x2) + expected = pd.Series([a + val for a in list(orig_data1)]) + self.assert_series_equal(result, expected) + + @pytest.mark.skip(reason="Not Applicable") + def test_fillna_length_mismatch(self, data_missing): + super().test_fillna_length_mismatch(data_missing) + + def test_searchsorted(self, data_for_sorting): + if not data_for_sorting.ordered: + raise pytest.skip(reason="searchsorted requires ordered data.") + + +class TestCasting(base.BaseCastingTests): + @pytest.mark.parametrize("cls", [Categorical, CategoricalIndex]) + @pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), pd.NaT]]) + def test_cast_nan_to_int(self, cls, values): + # GH 28406 + s = cls(values) + + msg = "Cannot (cast|convert)" + with pytest.raises((ValueError, TypeError), match=msg): + s.astype(int) + + @pytest.mark.parametrize( + "expected", + [ + pd.Series(["2019", "2020"], dtype="datetime64[ns, UTC]"), + pd.Series([0, 0], dtype="timedelta64[ns]"), + pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"), + pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"), + pd.Series([1, np.nan], dtype="Int64"), + ], + ) + def test_cast_category_to_extension_dtype(self, expected): + # GH 28668 + result = expected.astype("category").astype(expected.dtype) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, expected", + [ + ( + "datetime64[ns]", + np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"), + ), + ( + "datetime64[ns, MET]", + pd.DatetimeIndex( + [pd.Timestamp("2015-01-01 00:00:00+0100", tz="MET")] + ).array, + ), + ], + ) + def test_consistent_casting(self, dtype, expected): + # GH 28448 + result = pd.Categorical("2015-01-01").astype(dtype) + assert result == expected + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def test_arith_series_with_scalar(self, data, 
all_arithmetic_operators): + + op_name = all_arithmetic_operators + if op_name != "__rmod__": + super().test_arith_series_with_scalar(data, op_name) + else: + pytest.skip("rmod never called when string is first argument") + + def test_add_series_with_extension_array(self, data): + ser = pd.Series(data) + with pytest.raises(TypeError, match="cannot perform|unsupported operand"): + ser + data + + def test_divmod_series_array(self): + # GH 23287 + # skipping because it is not implemented + pass + + def _check_divmod_op(self, s, op, other, exc=NotImplementedError): + return super()._check_divmod_op(s, op, other, exc=TypeError) + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + op = self.get_op_from_name(op_name) + if op_name == "__eq__": + result = op(s, other) + expected = s.combine(other, lambda x, y: x == y) + assert (result == expected).all() + + elif op_name == "__ne__": + result = op(s, other) + expected = s.combine(other, lambda x, y: x != y) + assert (result == expected).all() + + else: + with pytest.raises(TypeError): + op(data, other) + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_common.py b/venv/Lib/site-packages/pandas/tests/extension/test_common.py new file mode 100644 index 0000000..e43650c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_common.py @@ -0,0 +1,81 @@ +import numpy as np +import pytest + +from pandas.core.dtypes import dtypes +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import ExtensionArray + + +class DummyDtype(dtypes.ExtensionDtype): + pass + + +class DummyArray(ExtensionArray): + def __init__(self, data): + self.data = data + + def __array__(self, dtype): + return self.data + + @property + def dtype(self): + return DummyDtype() + + def astype(self, dtype, copy=True): + # we don't support anything but a single dtype + if isinstance(dtype, DummyDtype): + if copy: + return type(self)(self.data) + return self + + return np.array(self, dtype=dtype, copy=copy) + + +class TestExtensionArrayDtype: + @pytest.mark.parametrize( + "values", + [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ], + ) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) + + +def test_astype(): + + arr = DummyArray(np.array([1, 2, 3])) + expected = np.array([1, 2, 3], dtype=object) + + result = arr.astype(object) + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("object") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_no_copy(): + arr = DummyArray(np.array([1, 2, 3], dtype=np.int64)) + result = arr.astype(arr.dtype, copy=False) + + assert arr is result + + result = arr.astype(arr.dtype) + assert arr is not result + + +@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()]) +def test_is_extension_array_dtype(dtype): + assert isinstance(dtype, dtypes.ExtensionDtype) + assert is_extension_array_dtype(dtype) diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_datetime.py b/venv/Lib/site-packages/pandas/tests/extension/test_datetime.py new file mode 100644 index 
0000000..a60607d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_datetime.py @@ -0,0 +1,214 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas.core.arrays import DatetimeArray +from pandas.tests.extension import base + + +@pytest.fixture(params=["US/Central"]) +def dtype(request): + return DatetimeTZDtype(unit="ns", tz=request.param) + + +@pytest.fixture +def data(dtype): + data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype) + return data + + +@pytest.fixture +def data_missing(dtype): + return DatetimeArray( + np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype + ) + + +@pytest.fixture +def data_for_sorting(dtype): + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + return DatetimeArray(np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + return DatetimeArray(np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype) + + +@pytest.fixture +def data_for_grouping(dtype): + """ + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + na = "NaT" + return DatetimeArray( + np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype + ) + + +@pytest.fixture +def na_cmp(): + def cmp(a, b): + return a is pd.NaT and a is b + + return cmp + + +@pytest.fixture +def na_value(): + return pd.NaT + + +# ---------------------------------------------------------------------------- +class BaseDatetimeTests: + pass + + +# ---------------------------------------------------------------------------- +# Tests +class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests): + pass + + +class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests): + pass + + +class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): + pass + + +class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): + @pytest.mark.skip(reason="Incorrect expected") + def test_value_counts(self, all_data, dropna): + pass + + def test_combine_add(self, data_repeated): + # Timestamp.__add__(Timestamp) not defined + pass + + +class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): + def test_array_interface(self, data): + if data.tz: + # np.asarray(DTA) is currently always tz-naive. + pytest.skip("GH-23569") + else: + super().test_array_interface(data) + + +class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): + implements = {"__sub__", "__rsub__"} + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) + else: + # ... but not the rest. + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def test_add_series_with_extension_array(self, data): + # Datetime + Datetime not implemented + s = pd.Series(data) + msg = "cannot add DatetimeArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): + s + data + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) + else: + # ... 
but not the rest. + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def test_error(self, data, all_arithmetic_operators): + pass + + def test_divmod_series_array(self): + # GH 23287 + # skipping because it is not implemented + pass + + +class TestCasting(BaseDatetimeTests, base.BaseCastingTests): + pass + + +class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + # the base test is not appropriate for us. We raise on comparison + # with (some) integers, depending on the value. + pass + + +class TestMissing(BaseDatetimeTests, base.BaseMissingTests): + pass + + +class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): + @pytest.mark.skip(reason="We have DatetimeTZBlock") + def test_concat(self, data, in_frame): + pass + + def test_concat_mixed_dtypes(self, data): + # concat(Series[datetimetz], Series[category]) uses a + # plain np.array(values) on the DatetimeArray, which + # drops the tz. + super().test_concat_mixed_dtypes(data) + + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, obj): + # GH-13287: can't use base test, since building the expected fails. + data = DatetimeArray._from_sequence( + ["2000", "2001", "2002", "2003"], tz="US/Central" + ) + index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) + + if obj == "series": + ser = pd.Series(data, index=index) + expected = pd.DataFrame( + {"A": data.take([0, 1]), "B": data.take([2, 3])}, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.name = "a" + + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) + expected = pd.DataFrame( + { + ("A", "A"): data.take([0, 1]), + ("A", "B"): data.take([2, 3]), + ("B", "A"): data.take([0, 1]), + ("B", "B"): data.take([2, 3]), + }, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.names = [None, "a"] + + result = ser.unstack(0) + self.assert_equal(result, expected) + + +class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): + pass + + +class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): + pass + + +class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_external_block.py b/venv/Lib/site-packages/pandas/tests/extension/test_external_block.py new file mode 100644 index 0000000..6311070 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_external_block.py @@ -0,0 +1,55 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.internals import BlockManager +from pandas.core.internals.blocks import Block, NonConsolidatableMixIn + + +class CustomBlock(NonConsolidatableMixIn, Block): + + _holder = np.ndarray + + def concat_same_type(self, to_concat, placement=None): + """ + Always concatenate disregarding self.ndim as the values are + always 1D in this custom Block + """ + values = np.concatenate([blk.values for blk in to_concat]) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1) + ) + + +@pytest.fixture +def df(): + df1 = pd.DataFrame({"a": [1, 2, 3]}) + blocks = df1._data.blocks + values = np.arange(3, dtype="int64") + custom_block = CustomBlock(values, placement=slice(1, 2)) + blocks = blocks + (custom_block,) + block_manager = BlockManager(blocks, [pd.Index(["a", "b"]), df1.index]) + return pd.DataFrame(block_manager) + + +def test_concat_series(): + # GH17728 + values = np.arange(3, dtype="int64") + block = CustomBlock(values, 
placement=slice(0, 3)) + s = pd.Series(block, pd.RangeIndex(3), fastpath=True) + + res = pd.concat([s, s]) + assert isinstance(res._data.blocks[0], CustomBlock) + + +def test_concat_dataframe(df): + # GH17728 + res = pd.concat([df, df]) + assert isinstance(res._data.blocks[1], CustomBlock) + + +def test_concat_axis1(df): + # GH17954 + df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]}) + res = pd.concat([df, df2], axis=1) + assert isinstance(res._data.blocks[1], CustomBlock) diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_integer.py b/venv/Lib/site-packages/pandas/tests/extension/test_integer.py new file mode 100644 index 0000000..f55ec75 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_integer.py @@ -0,0 +1,256 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) +from pandas.tests.extension import base + + +def make_data(): + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_for_twos(dtype): + return integer_array(np.ones(100) * 2, dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return integer_array([pd.NA, 1], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(dtype): + return integer_array([1, 2, 0], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return integer_array([1, pd.NA, 0], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(dtype): + b = 1 + a = 0 + c = 2 + na = pd.NA + return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) + + +class TestDtype(base.BaseDtypeTests): + @pytest.mark.skip(reason="using multiple dtypes") + def test_is_dtype_unboxes_dtype(self): + # we have multiple dtypes, so skip + pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + if s.dtype.is_unsigned_integer and (op_name == "__rsub__"): + # TODO see https://github.com/pandas-dev/pandas/issues/22023 + pytest.skip("unsigned subtraction gives 
negative values") + + if ( + hasattr(other, "dtype") + and not is_extension_array_dtype(other.dtype) + and pd.api.types.is_integer_dtype(other.dtype) + ): + # other is np.int64 and would therefore always result in + # upcasting, so keeping other as same numpy_dtype + other = other.astype(s.dtype.numpy_dtype) + + result = op(s, other) + expected = s.combine(other, op) + + if op_name in ("__rtruediv__", "__truediv__", "__div__"): + expected = expected.fillna(np.nan).astype(float) + if op_name == "__rtruediv__": + # TODO reverse operators result in object dtype + result = result.astype(float) + elif op_name.startswith("__r"): + # TODO reverse operators result in object dtype + # see https://github.com/pandas-dev/pandas/issues/22024 + expected = expected.astype(s.dtype) + result = result.astype(s.dtype) + else: + # combine method result in 'biggest' (int64) dtype + expected = expected.astype(s.dtype) + pass + + if (op_name == "__rpow__") and isinstance(other, pd.Series): + # TODO pow on Int arrays gives different result with NA + # see https://github.com/pandas-dev/pandas/issues/22022 + result = result.fillna(1) + + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=None): + super()._check_divmod_op(s, op, other, None) + + @pytest.mark.skip(reason="intNA does not error on ops") + def test_error(self, data, all_arithmetic_operators): + # other specific errors tested in the integer array specific tests + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def check_opname(self, s, op_name, other, exc=None): + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + # for test_concat_mixed_dtypes test + # concat of an Integer and Int coerces to object dtype + # TODO(jreback) once integrated this would + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + expected.index = expected.index.astype(all_data.dtype) + + self.assert_series_equal(result, expected) + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestGroupby(base.BaseGroupbyTests): + pass + + +class TestNumericReduce(base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + if np.isnan(expected): + expected = 
pd.NA + tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_interval.py b/venv/Lib/site-packages/pandas/tests/extension/test_interval.py new file mode 100644 index 0000000..2411f6c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_interval.py @@ -0,0 +1,166 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import IntervalDtype + +from pandas import Interval +from pandas.core.arrays import IntervalArray +from pandas.tests.extension import base + + +def make_data(): + N = 100 + left = np.random.uniform(size=N).cumsum() + right = left + np.random.uniform(size=N) + return [Interval(l, r) for l, r in zip(left, right)] + + +@pytest.fixture +def dtype(): + return IntervalDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return IntervalArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return IntervalArray.from_tuples([None, (0, 1)]) + + +@pytest.fixture +def data_for_sorting(): + return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)]) + + +@pytest.fixture +def data_missing_for_sorting(): + return IntervalArray.from_tuples([(1, 2), None, (0, 1)]) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(): + a = (0, 1) + b = (1, 2) + c = (2, 3) + return IntervalArray.from_tuples([b, b, None, None, a, a, b, c]) + + +class BaseInterval: + pass + + +class TestDtype(BaseInterval, base.BaseDtypeTests): + pass + + +class TestCasting(BaseInterval, base.BaseCastingTests): + pass + + +class TestConstructors(BaseInterval, base.BaseConstructorsTests): + pass + + +class TestGetitem(BaseInterval, base.BaseGetitemTests): + pass + + +class TestGrouping(BaseInterval, base.BaseGroupbyTests): + pass + + +class TestInterface(BaseInterval, base.BaseInterfaceTests): + def test_view(self, data): + # __setitem__ incorrectly makes a copy (GH#27147), so we only + # have a smoke-test + data.view() + + +class TestReduce(base.BaseNoReduceTests): + pass + + +class TestMethods(BaseInterval, base.BaseMethodsTests): + @pytest.mark.skip(reason="addition is not defined for intervals") + def test_combine_add(self, data_repeated): + pass + + @pytest.mark.skip(reason="Not Applicable") + def test_fillna_length_mismatch(self, data_missing): + pass + + +class TestMissing(BaseInterval, base.BaseMissingTests): + # Index.fillna only accepts scalar `value`, so we have to skip all + # non-scalar fill tests. 
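+    # For example, filling with a scalar Interval is fine, while list-like
+    # values raise TypeError (see test_non_scalar_raises below) and
+    # method/limit based filling is unsupported, hence the skips.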
+ unsupported_fill = pytest.mark.skip("Unsupported fillna option.") + + @unsupported_fill + def test_fillna_limit_pad(self): + pass + + @unsupported_fill + def test_fillna_series_method(self): + pass + + @unsupported_fill + def test_fillna_limit_backfill(self): + pass + + @unsupported_fill + def test_fillna_series(self): + pass + + def test_non_scalar_raises(self, data_missing): + msg = "Got a 'list' instead." + with pytest.raises(TypeError, match=msg): + data_missing.fillna([1, 1]) + + +class TestReshaping(BaseInterval, base.BaseReshapingTests): + pass + + +class TestSetitem(BaseInterval, base.BaseSetitemTests): + @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) + + +class TestPrinting(BaseInterval, base.BasePrintingTests): + @pytest.mark.skip(reason="custom repr") + def test_array_repr(self, data, size): + pass + + +class TestParsing(BaseInterval, base.BaseParsingTests): + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data): + expected_msg = r".*must implement _from_sequence_of_strings.*" + with pytest.raises(NotImplementedError, match=expected_msg): + super().test_EA_types(engine, data) diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_numpy.py b/venv/Lib/site-packages/pandas/tests/extension/test_numpy.py new file mode 100644 index 0000000..8a820c8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_numpy.py @@ -0,0 +1,402 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p16 + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype + +from . import base + + +@pytest.fixture(params=["float", "object"]) +def dtype(request): + return PandasDtype(np.dtype(request.param)) + + +@pytest.fixture +def allow_in_pandas(monkeypatch): + """ + A monkeypatch to tells pandas to let us in. + + By default, passing a PandasArray to an index / series / frame + constructor will unbox that PandasArray to an ndarray, and treat + it as a non-EA column. We don't want people using EAs without + reason. + + The mechanism for this is a check against ABCPandasArray + in each constructor. + + But, for testing, we need to allow them in pandas. So we patch + the _typ of PandasArray, so that we evade the ABCPandasArray + check. + """ + with monkeypatch.context() as m: + m.setattr(PandasArray, "_typ", "extension") + yield + + +@pytest.fixture +def data(allow_in_pandas, dtype): + if dtype.numpy_dtype == "object": + return pd.Series([(i,) for i in range(100)]).array + return PandasArray(np.arange(1, 101, dtype=dtype._dtype)) + + +@pytest.fixture +def data_missing(allow_in_pandas, dtype): + # For NumPy <1.16, np.array([np.nan, (1,)]) raises + # ValueError: setting an array element with a sequence. + if dtype.numpy_dtype == "object": + if _np_version_under1p16: + raise pytest.skip("Skipping for NumPy <1.16") + return PandasArray(np.array([np.nan, (1,)], dtype=object)) + return PandasArray(np.array([np.nan, 1.0])) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def na_cmp(): + def cmp(a, b): + return np.isnan(a) and np.isnan(b) + + return cmp + + +@pytest.fixture +def data_for_sorting(allow_in_pandas, dtype): + """Length-3 array with a known sort order. 
+ + This should be three items [B, C, A] with + A < B < C + """ + if dtype.numpy_dtype == "object": + # Use an empty tuple for first element, then remove, + # to disable np.array's shape inference. + return PandasArray(np.array([(), (2,), (3,), (1,)], dtype=object)[1:]) + return PandasArray(np.array([1, 2, 0])) + + +@pytest.fixture +def data_missing_for_sorting(allow_in_pandas, dtype): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + if dtype.numpy_dtype == "object": + return PandasArray(np.array([(1,), np.nan, (0,)], dtype=object)) + return PandasArray(np.array([1, np.nan, 0])) + + +@pytest.fixture +def data_for_grouping(allow_in_pandas, dtype): + """Data for factorization, grouping, and unique tests. + + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + if dtype.numpy_dtype == "object": + a, b, c = (1,), (2,), (3,) + else: + a, b, c = np.arange(3) + return PandasArray( + np.array([b, b, np.nan, np.nan, a, a, b, c], dtype=dtype.numpy_dtype) + ) + + +@pytest.fixture +def skip_numpy_object(dtype): + """ + Tests for PandasArray with nested data. Users typically won't create + these objects via `pd.array`, but they can show up through `.array` + on a Series with nested data. Many of the base tests fail, as they aren't + appropriate for nested data. + + This fixture allows these tests to be skipped when used as a usefixtures + marker to either an individual test or a test class. + """ + if dtype == "object": + raise pytest.skip("Skipping for object dtype.") + + +skip_nested = pytest.mark.usefixtures("skip_numpy_object") + + +class BaseNumPyTests: + pass + + +class TestCasting(BaseNumPyTests, base.BaseCastingTests): + @skip_nested + def test_astype_str(self, data): + # ValueError: setting an array element with a sequence + super().test_astype_str(data) + + +class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): + @pytest.mark.skip(reason="We don't register our dtype") + # We don't want to register. This test should probably be split in two. + def test_from_dtype(self, data): + pass + + @skip_nested + def test_array_from_scalars(self, data): + # ValueError: PandasArray must be 1-dimensional. + super().test_array_from_scalars(data) + + +class TestDtype(BaseNumPyTests, base.BaseDtypeTests): + @pytest.mark.skip(reason="Incorrect expected.") + # we unsurprisingly clash with a NumPy name. + def test_check_dtype(self, data): + pass + + +class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): + @skip_nested + def test_getitem_scalar(self, data): + # AssertionError + super().test_getitem_scalar(data) + + @skip_nested + def test_take_series(self, data): + # ValueError: PandasArray must be 1-dimensional. 
+ super().test_take_series(data) + + @pytest.mark.xfail(reason="astype doesn't recognize data.dtype") + def test_loc_iloc_frame_single_dtype(self, data): + super().test_loc_iloc_frame_single_dtype(data) + + +class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): + @skip_nested + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + # ValueError: Names should be list-like for a MultiIndex + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + + +class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): + @skip_nested + def test_array_interface(self, data): + # NumPy array shape inference + super().test_array_interface(data) + + +class TestMethods(BaseNumPyTests, base.BaseMethodsTests): + @pytest.mark.skip(reason="TODO: remove?") + def test_value_counts(self, all_data, dropna): + pass + + @pytest.mark.skip(reason="Incorrect expected") + # We have a bool dtype, so the result is an ExtensionArray + # but expected is not + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @skip_nested + def test_combine_add(self, data_repeated): + # Not numeric + super().test_combine_add(data_repeated) + + @skip_nested + def test_shift_fill_value(self, data): + # np.array shape inference. Shift implementation fails. + super().test_shift_fill_value(data) + + @skip_nested + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + # Fails creating expected + super().test_unique(data, box, method) + + @skip_nested + def test_fillna_copy_frame(self, data_missing): + # The "scalar" for this array isn't a scalar. + super().test_fillna_copy_frame(data_missing) + + @skip_nested + def test_fillna_copy_series(self, data_missing): + # The "scalar" for this array isn't a scalar. + super().test_fillna_copy_series(data_missing) + + @skip_nested + def test_hash_pandas_object_works(self, data, as_frame): + # ndarray of tuples not hashable + super().test_hash_pandas_object_works(data, as_frame) + + @skip_nested + def test_searchsorted(self, data_for_sorting, as_series): + # Test setup fails. + super().test_searchsorted(data_for_sorting, as_series) + + @skip_nested + def test_where_series(self, data, na_value, as_frame): + # Test setup fails. 
+ super().test_where_series(data, na_value, as_frame) + + @skip_nested + @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) + def test_repeat(self, data, repeats, as_series, use_numpy): + # Fails creating expected + super().test_repeat(data, repeats, as_series, use_numpy) + + @pytest.mark.xfail(reason="PandasArray.diff may fail on dtype") + def test_diff(self, data, periods): + return super().test_diff(data, periods) + + +@skip_nested +class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests): + divmod_exc = None + series_scalar_exc = None + frame_scalar_exc = None + series_array_exc = None + + def test_divmod_series_array(self, data): + s = pd.Series(data) + self._check_divmod_op(s, divmod, data, exc=None) + + @pytest.mark.skip("We implement ops") + def test_error(self, data, all_arithmetic_operators): + pass + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + super().test_arith_series_with_array(data, all_arithmetic_operators) + + +class TestPrinting(BaseNumPyTests, base.BasePrintingTests): + pass + + +@skip_nested +class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + # avoid coercing int -> float. Just cast to the actual numpy type. + expected = getattr(s.astype(s.dtype._dtype), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +@skip_nested +class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests): + pass + + +class TestMissing(BaseNumPyTests, base.BaseMissingTests): + @skip_nested + def test_fillna_scalar(self, data_missing): + # Non-scalar "scalar" values. + super().test_fillna_scalar(data_missing) + + @skip_nested + def test_fillna_series_method(self, data_missing, fillna_method): + # Non-scalar "scalar" values. + super().test_fillna_series_method(data_missing, fillna_method) + + @skip_nested + def test_fillna_series(self, data_missing): + # Non-scalar "scalar" values. + super().test_fillna_series(data_missing) + + @skip_nested + def test_fillna_frame(self, data_missing): + # Non-scalar "scalar" values. + super().test_fillna_frame(data_missing) + + +class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): + @pytest.mark.skip("Incorrect parent test") + # not actually a mixed concat, since we concat int and int. 
+ def test_concat_mixed_dtypes(self, data): + super().test_concat_mixed_dtypes(data) + + @skip_nested + def test_merge(self, data, na_value): + # Fails creating expected + super().test_merge(data, na_value) + + @skip_nested + def test_merge_on_extension_array(self, data): + # Fails creating expected + super().test_merge_on_extension_array(data) + + @skip_nested + def test_merge_on_extension_array_duplicates(self, data): + # Fails creating expected + super().test_merge_on_extension_array_duplicates(data) + + @skip_nested + def test_transpose(self, data): + super().test_transpose(data) + + +class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): + @skip_nested + def test_setitem_scalar_series(self, data, box_in_series): + # AssertionError + super().test_setitem_scalar_series(data, box_in_series) + + @skip_nested + def test_setitem_sequence(self, data, box_in_series): + # ValueError: shape mismatch: value array of shape (2,1) could not + # be broadcast to indexing result of shape (2,) + super().test_setitem_sequence(data, box_in_series) + + @skip_nested + def test_setitem_sequence_mismatched_length_raises(self, data, as_array): + # ValueError: PandasArray must be 1-dimensional. + super().test_setitem_sequence_mismatched_length_raises(data, as_array) + + @skip_nested + def test_setitem_sequence_broadcasts(self, data, box_in_series): + # ValueError: cannot set using a list-like indexer with a different + # length than the value + super().test_setitem_sequence_broadcasts(data, box_in_series) + + @skip_nested + def test_setitem_loc_scalar_mixed(self, data): + # AssertionError + super().test_setitem_loc_scalar_mixed(data) + + @skip_nested + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + # AssertionError + super().test_setitem_loc_scalar_multiple_homogoneous(data) + + @skip_nested + def test_setitem_iloc_scalar_mixed(self, data): + # AssertionError + super().test_setitem_iloc_scalar_mixed(data) + + @skip_nested + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + # AssertionError + super().test_setitem_iloc_scalar_multiple_homogoneous(data) + + @skip_nested + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + # ValueError: cannot set using a list-like indexer with a different + # length than the value + super().test_setitem_mask_broadcast(data, setter) + + @skip_nested + def test_setitem_scalar_key_sequence_raise(self, data): + # Failed: DID NOT RAISE + super().test_setitem_scalar_key_sequence_raise(data) + + +@skip_nested +class TestParsing(BaseNumPyTests, base.BaseParsingTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_period.py b/venv/Lib/site-packages/pandas/tests/extension/test_period.py new file mode 100644 index 0000000..c439b8b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_period.py @@ -0,0 +1,161 @@ +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.dtypes import PeriodDtype + +import pandas as pd +from pandas.core.arrays import PeriodArray +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return PeriodDtype(freq="D") + + +@pytest.fixture +def data(dtype): + return PeriodArray(np.arange(1970, 2070), freq=dtype.freq) + + +@pytest.fixture +def data_for_twos(dtype): + return PeriodArray(np.ones(100) * 2, freq=dtype.freq) + + +@pytest.fixture +def data_for_sorting(dtype): + return PeriodArray([2018, 2019, 2017], freq=dtype.freq) + + +@pytest.fixture +def data_missing(dtype): 
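+    # Constructed from integer ordinals; iNaT is the int64 sentinel that maps
+    # to pd.NaT (the `na_value` fixture below).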
+ return PeriodArray([iNaT, 2017], freq=dtype.freq) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return PeriodArray([2018, iNaT, 2017], freq=dtype.freq) + + +@pytest.fixture +def data_for_grouping(dtype): + B = 2018 + NA = iNaT + A = 2017 + C = 2019 + return PeriodArray([B, B, NA, NA, A, A, B, C], freq=dtype.freq) + + +@pytest.fixture +def na_value(): + return pd.NaT + + +class BasePeriodTests: + pass + + +class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests): + pass + + +class TestConstructors(BasePeriodTests, base.BaseConstructorsTests): + pass + + +class TestGetitem(BasePeriodTests, base.BaseGetitemTests): + pass + + +class TestMethods(BasePeriodTests, base.BaseMethodsTests): + def test_combine_add(self, data_repeated): + # Period + Period is not defined. + pass + + +class TestInterface(BasePeriodTests, base.BaseInterfaceTests): + + pass + + +class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests): + implements = {"__sub__", "__rsub__"} + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + # we implement substitution... + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) + else: + # ... but not the rest. + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) + else: + # ... but not the rest. + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def _check_divmod_op(self, s, op, other, exc=NotImplementedError): + super()._check_divmod_op(s, op, other, exc=TypeError) + + def test_add_series_with_extension_array(self, data): + # we don't implement + for Period + s = pd.Series(data) + msg = ( + r"unsupported operand type\(s\) for \+: " + r"\'PeriodArray\' and \'PeriodArray\'" + ) + with pytest.raises(TypeError, match=msg): + s + data + + def test_error(self): + pass + + def test_direct_arith_with_series_returns_not_implemented(self, data): + # Override to use __sub__ instead of __add__ + other = pd.Series(data) + result = data.__sub__(other) + assert result is NotImplemented + + +class TestCasting(BasePeriodTests, base.BaseCastingTests): + pass + + +class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + # the base test is not appropriate for us. We raise on comparison + # with (some) integers, depending on the value. 
+ pass + + +class TestMissing(BasePeriodTests, base.BaseMissingTests): + pass + + +class TestReshaping(BasePeriodTests, base.BaseReshapingTests): + pass + + +class TestSetitem(BasePeriodTests, base.BaseSetitemTests): + pass + + +class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): + pass + + +class TestPrinting(BasePeriodTests, base.BasePrintingTests): + pass + + +class TestParsing(BasePeriodTests, base.BaseParsingTests): + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data): + super().test_EA_types(engine, data) diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_sparse.py b/venv/Lib/site-packages/pandas/tests/extension/test_sparse.py new file mode 100644 index 0000000..198a228 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_sparse.py @@ -0,0 +1,379 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import SparseDtype +import pandas._testing as tm +from pandas.arrays import SparseArray +from pandas.tests.extension import base + + +def make_data(fill_value): + if np.isnan(fill_value): + data = np.random.uniform(size=100) + else: + data = np.random.randint(1, 100, size=100) + if data[0] == data[1]: + data[0] += 1 + + data[2::3] = fill_value + return data + + +@pytest.fixture +def dtype(): + return SparseDtype() + + +@pytest.fixture(params=[0, np.nan]) +def data(request): + """Length-100 PeriodArray for semantics test.""" + res = SparseArray(make_data(request.param), fill_value=request.param) + return res + + +@pytest.fixture +def data_for_twos(request): + return SparseArray(np.ones(100) * 2) + + +@pytest.fixture(params=[0, np.nan]) +def data_missing(request): + """Length 2 array with [NA, Valid]""" + return SparseArray([np.nan, 1], fill_value=request.param) + + +@pytest.fixture(params=[0, np.nan]) +def data_repeated(request): + """Return different versions of data for count times""" + + def gen(count): + for _ in range(count): + yield SparseArray(make_data(request.param), fill_value=request.param) + + yield gen + + +@pytest.fixture(params=[0, np.nan]) +def data_for_sorting(request): + return SparseArray([2, 3, 1], fill_value=request.param) + + +@pytest.fixture(params=[0, np.nan]) +def data_missing_for_sorting(request): + return SparseArray([2, np.nan, 1], fill_value=request.param) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def na_cmp(): + return lambda left, right: pd.isna(left) and pd.isna(right) + + +@pytest.fixture(params=[0, np.nan]) +def data_for_grouping(request): + return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], fill_value=request.param) + + +class BaseSparseTests: + def _check_unsupported(self, data): + if data.dtype == SparseDtype(int, 0): + pytest.skip("Can't store nan in int array.") + + @pytest.mark.xfail(reason="SparseArray does not support setitem") + def test_ravel(self, data): + super().test_ravel(data) + + +class TestDtype(BaseSparseTests, base.BaseDtypeTests): + def test_array_type_with_arg(self, data, dtype): + assert dtype.construct_array_type() is SparseArray + + +class TestInterface(BaseSparseTests, base.BaseInterfaceTests): + def test_no_values_attribute(self, data): + pytest.skip("We have values") + + def test_copy(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.copy() + + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + + +class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): + 
pass + + +class TestReshaping(BaseSparseTests, base.BaseReshapingTests): + def test_concat_mixed_dtypes(self, data): + # https://github.com/pandas-dev/pandas/issues/20762 + # This should be the same, aside from concat([sparse, float]) + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"A": [1, 2, 3]}) + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") + dfs = [df1, df2, df3] + + # dataframes + result = pd.concat(dfs) + expected = pd.concat( + [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs] + ) + self.assert_frame_equal(result, expected) + + def test_concat_columns(self, data, na_value): + self._check_unsupported(data) + super().test_concat_columns(data, na_value) + + def test_concat_extension_arrays_copy_false(self, data, na_value): + self._check_unsupported(data) + super().test_concat_extension_arrays_copy_false(data, na_value) + + def test_align(self, data, na_value): + self._check_unsupported(data) + super().test_align(data, na_value) + + def test_align_frame(self, data, na_value): + self._check_unsupported(data) + super().test_align_frame(data, na_value) + + def test_align_series_frame(self, data, na_value): + self._check_unsupported(data) + super().test_align_series_frame(data, na_value) + + def test_merge(self, data, na_value): + self._check_unsupported(data) + super().test_merge(data, na_value) + + +class TestGetitem(BaseSparseTests, base.BaseGetitemTests): + def test_get(self, data): + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + if np.isnan(s.values.fill_value): + assert np.isnan(s.get(4)) and np.isnan(s.iloc[2]) + else: + assert s.get(4) == s.iloc[2] + assert s.get(2) == s.iloc[1] + + def test_reindex(self, data, na_value): + self._check_unsupported(data) + super().test_reindex(data, na_value) + + +# Skipping TestSetitem, since we don't implement it. + + +class TestMissing(BaseSparseTests, base.BaseMissingTests): + def test_isna(self, data_missing): + expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) + expected = SparseArray([True, False], dtype=expected_dtype) + + result = pd.isna(data_missing) + self.assert_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + self.assert_series_equal(result, expected) + + # GH 21189 + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=expected_dtype) + self.assert_series_equal(result, expected) + + def test_fillna_limit_pad(self, data_missing): + with tm.assert_produces_warning(PerformanceWarning): + super().test_fillna_limit_pad(data_missing) + + def test_fillna_limit_backfill(self, data_missing): + with tm.assert_produces_warning(PerformanceWarning): + super().test_fillna_limit_backfill(data_missing) + + def test_fillna_series_method(self, data_missing): + with tm.assert_produces_warning(PerformanceWarning): + super().test_fillna_limit_backfill(data_missing) + + @pytest.mark.skip(reason="Unsupported") + def test_fillna_series(self): + # this one looks doable. + pass + + def test_fillna_frame(self, data_missing): + # Have to override to specify that fill_value will change. 
+ fill_value = data_missing[1] + + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) + + if pd.isna(data_missing.fill_value): + dtype = SparseDtype(data_missing.dtype, fill_value) + else: + dtype = data_missing.dtype + + expected = pd.DataFrame( + { + "A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype), + "B": [1, 2], + } + ) + + self.assert_frame_equal(result, expected) + + +class TestMethods(BaseSparseTests, base.BaseMethodsTests): + def test_combine_le(self, data_repeated): + # We return a Series[SparseArray].__le__ returns a + # Series[Sparse[bool]] + # rather than Series[bool] + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + SparseArray( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + fill_value=False, + ) + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series( + SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + ) + self.assert_series_equal(result, expected) + + def test_fillna_copy_frame(self, data_missing): + arr = data_missing.take([1, 1]) + df = pd.DataFrame({"A": arr}) + + filled_val = df.iloc[0, 0] + result = df.fillna(filled_val) + + assert df.values.base is not result.values.base + assert df.A._values.to_dense() is arr.to_dense() + + def test_fillna_copy_series(self, data_missing): + arr = data_missing.take([1, 1]) + ser = pd.Series(arr) + + filled_val = ser[0] + result = ser.fillna(filled_val) + + assert ser._values is not result._values + assert ser._values.to_dense() is arr.to_dense() + + @pytest.mark.skip(reason="Not Applicable") + def test_fillna_length_mismatch(self, data_missing): + pass + + def test_where_series(self, data, na_value): + assert data[0] != data[1] + cls = type(data) + a, b = data[:2] + + ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype)) + + cond = np.array([True, True, False, False]) + result = ser.where(cond) + + new_dtype = SparseDtype("float", 0.0) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) + ) + self.assert_series_equal(result, expected) + + other = cls._from_sequence([a, b, a, b], dtype=data.dtype) + cond = np.array([True, False, True, True]) + result = ser.where(cond, other) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) + self.assert_series_equal(result, expected) + + def test_combine_first(self, data): + if data.dtype.subtype == "int": + # Right now this is upcasted to float, just like combine_first + # for Series[int] + pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") + super().test_combine_first(data) + + def test_searchsorted(self, data_for_sorting, as_series): + with tm.assert_produces_warning(PerformanceWarning): + super().test_searchsorted(data_for_sorting, as_series) + + +class TestCasting(BaseSparseTests, base.BaseCastingTests): + pass + + +class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): + series_scalar_exc = None + frame_scalar_exc = None + divmod_exc = None + series_array_exc = None + + def _skip_if_different_combine(self, data): + if data.fill_value == 0: + # arith ops call on dtype.fill_value so that the sparsity + # is maintained. Combine can't be called on a dtype in + # general, so we can't make the expected. 
This is tested elsewhere + raise pytest.skip("Incorrected expected from Series.combine") + + def test_error(self, data, all_arithmetic_operators): + pass + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + self._skip_if_different_combine(data) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + self._skip_if_different_combine(data) + super().test_arith_series_with_array(data, all_arithmetic_operators) + + +class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + # hard to test the fill value, since we don't know what expected + # is in general. + # Rely on tests in `tests/sparse` to validate that. + assert isinstance(result.dtype, SparseDtype) + assert result.dtype.subtype == np.dtype("bool") + + with np.errstate(all="ignore"): + expected = pd.Series( + SparseArray( + op(np.asarray(data), np.asarray(other)), + fill_value=result.values.fill_value, + ) + ) + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + tm.assert_series_equal(result, expected) + + +class TestPrinting(BaseSparseTests, base.BasePrintingTests): + @pytest.mark.xfail(reason="Different repr", strict=True) + def test_array_repr(self, data, size): + super().test_array_repr(data, size) + + +class TestParsing(BaseSparseTests, base.BaseParsingTests): + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data): + expected_msg = r".*must implement _from_sequence_of_strings.*" + with pytest.raises(NotImplementedError, match=expected_msg): + super().test_EA_types(engine, data) diff --git a/venv/Lib/site-packages/pandas/tests/extension/test_string.py b/venv/Lib/site-packages/pandas/tests/extension/test_string.py new file mode 100644 index 0000000..86aed67 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/extension/test_string.py @@ -0,0 +1,114 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return StringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return StringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return StringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return StringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return StringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class 
TestNoReduce(base.BaseNoReduceTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + + +class TestParsing(base.BaseParsingTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +class TestGroupBy(base.BaseGroupbyTests): + pass diff --git a/venv/Lib/site-packages/pandas/tests/frame/__init__.py b/venv/Lib/site-packages/pandas/tests/frame/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/frame/common.py b/venv/Lib/site-packages/pandas/tests/frame/common.py new file mode 100644 index 0000000..463a140 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/common.py @@ -0,0 +1,31 @@ +def _check_mixed_float(df, dtype=None): + # float16 are most likely to be upcasted to float32 + dtypes = dict(A="float32", B="float32", C="float16", D="float64") + if isinstance(dtype, str): + dtypes = {k: dtype for k, v in dtypes.items()} + elif isinstance(dtype, dict): + dtypes.update(dtype) + if dtypes.get("A"): + assert df.dtypes["A"] == dtypes["A"] + if dtypes.get("B"): + assert df.dtypes["B"] == dtypes["B"] + if dtypes.get("C"): + assert df.dtypes["C"] == dtypes["C"] + if dtypes.get("D"): + assert df.dtypes["D"] == dtypes["D"] + + +def _check_mixed_int(df, dtype=None): + dtypes = dict(A="int32", B="uint64", C="uint8", D="int64") + if isinstance(dtype, str): + dtypes = {k: dtype for k, v in dtypes.items()} + elif isinstance(dtype, dict): + dtypes.update(dtype) + if dtypes.get("A"): + assert df.dtypes["A"] == dtypes["A"] + if dtypes.get("B"): + assert df.dtypes["B"] == dtypes["B"] + if dtypes.get("C"): + assert df.dtypes["C"] == dtypes["C"] + if dtypes.get("D"): + assert df.dtypes["D"] == dtypes["D"] diff --git a/venv/Lib/site-packages/pandas/tests/frame/conftest.py b/venv/Lib/site-packages/pandas/tests/frame/conftest.py new file mode 100644 index 0000000..774eb44 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/conftest.py @@ -0,0 +1,330 @@ +import numpy as np +import pytest + +from pandas import DataFrame, NaT, date_range +import pandas._testing as tm + + +@pytest.fixture +def float_frame_with_na(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 + DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 + neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 + 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 + 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 + soujjZ0A08 NaN NaN NaN NaN + 7W6NLGsjB9 NaN NaN NaN NaN + ... ... ... ... ... 
+ uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 + n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 + ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 + uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 + 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 + 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 + sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) + # set some NAs + df.loc[5:10] = np.nan + df.loc[15:20, -2:] = np.nan + return df + + +@pytest.fixture +def bool_frame_with_na(): + """ + Fixture for DataFrame of booleans with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + zBZxY2IDGd False False False False + IhBWBMWllt False True True True + ctjdvZSR6R True False True True + AVTujptmxb False True False True + G9lrImrSWq False False False True + sFFwdIUfz2 NaN NaN NaN NaN + s15ptEJnRb NaN NaN NaN NaN + ... ... ... ... ... + UW41KkDyZ4 True True False False + l9l6XkOdqV True False False False + X2MeZfzDYA False True False False + xWkIKU7vfX False True False True + QOhL6VmpGU False False False True + 22PwkRJdat False True False False + kfboQ3VeIK True False True False + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) > 0 + df = df.astype(object) + # set some NAs + df.loc[5:10] = np.nan + df.loc[15:20, -2:] = np.nan + return df + + +@pytest.fixture +def int_frame(): + """ + Fixture for DataFrame of ints with index of unique strings + + Columns are ['A', 'B', 'C', 'D'] + + A B C D + vpBeWjM651 1 0 1 0 + 5JyxmrP1En -1 0 0 0 + qEDaoD49U2 -1 1 0 0 + m66TkTfsFe 0 0 0 0 + EHPaNzEUFm -1 0 -1 0 + fpRJCevQhi 2 0 0 0 + OlQvnmfi3Q 0 0 -2 0 + ... .. .. .. .. + uB1FPlz4uP 0 0 0 1 + EcSe6yNzCU 0 0 -1 0 + L50VudaiI8 -1 1 -2 0 + y3bpw4nwIp 0 -1 0 0 + H0RdLLwrCT 1 1 0 0 + rY82K0vMwm 0 0 0 0 + 1OPIUjnkjk 2 0 0 0 + + [30 rows x 4 columns] + """ + df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) + # force these all to int64 to avoid platform testing issues + return DataFrame({c: s for c, s in df.items()}, dtype=np.int64) + + +@pytest.fixture +def datetime_frame(): + """ + Fixture for DataFrame of floats with DatetimeIndex + + Columns are ['A', 'B', 'C', 'D'] + + A B C D + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 + ... ... ... ... ... + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getTimeSeriesData()) + + +@pytest.fixture +def float_string_frame(): + """ + Fixture for DataFrame of floats and strings with index of unique strings + + Columns are ['A', 'B', 'C', 'D', 'foo']. 
+ + A B C D foo + w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar + PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar + ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar + 3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar + khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar + LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar + HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar + ... ... ... ... ... ... + 9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar + h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar + mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar + oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar + 9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar + jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar + lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar + + [30 rows x 5 columns] + """ + df = DataFrame(tm.getSeriesData()) + df["foo"] = "bar" + return df + + +@pytest.fixture +def mixed_float_frame(): + """ + Fixture for DataFrame of different float types with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993 + KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588 + VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731 + kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607 + CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266 + 0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541 + tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710 + ... ... ... ... ... + 7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237 + 4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612 + B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653 + hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427 + 1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827 + 9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204 + xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502 + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) + df.A = df.A.astype("float32") + df.B = df.B.astype("float32") + df.C = df.C.astype("float16") + df.D = df.D.astype("float64") + return df + + +@pytest.fixture +def mixed_int_frame(): + """ + Fixture for DataFrame of different int types with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + mUrCZ67juP 0 1 2 2 + rw99ACYaKS 0 1 0 0 + 7QsEcpaaVU 0 1 1 1 + xkrimI2pcE 0 1 0 0 + dz01SuzoS8 0 1 255 255 + ccQkqOHX75 -1 1 0 0 + DN0iXaoDLd 0 1 0 0 + ... .. .. ... ... + Dfb141wAaQ 1 1 254 254 + IPD8eQOVu5 0 1 0 0 + CcaKulsCmv 0 1 0 0 + rIBa8gu7E5 0 1 0 0 + RP6peZmh5o 0 1 1 1 + NMb9pipQWQ 0 1 0 0 + PqgbJEzjib 0 1 3 3 + + [30 rows x 4 columns] + """ + df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) + df.A = df.A.astype("int32") + df.B = np.ones(len(df.B), dtype="uint64") + df.C = df.C.astype("uint8") + df.D = df.C.astype("int64") + return df + + +@pytest.fixture +def mixed_type_frame(): + """ + Fixture for DataFrame of float/int/string columns with RangeIndex + Columns are ['a', 'b', 'c', 'float32', 'int32']. 
+ """ + return DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + + +@pytest.fixture +def timezone_frame(): + """ + Fixture for DataFrame of date_range Series with different time zones + + Columns are ['A', 'B', 'C']; some entries are missing + + A B C + 0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00 + 1 2013-01-02 NaT NaT + 2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00 + """ + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) + df.iloc[1, 1] = NaT + df.iloc[1, 2] = NaT + return df + + +@pytest.fixture +def uint64_frame(): + """ + Fixture for DataFrame with uint64 values + + Columns are ['A', 'B'] + """ + return DataFrame( + {"A": np.arange(3), "B": [2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10]}, dtype=np.uint64 + ) + + +@pytest.fixture +def simple_frame(): + """ + Fixture for simple 3x3 DataFrame + + Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. + + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 + """ + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) + + +@pytest.fixture +def frame_of_index_cols(): + """ + Fixture for DataFrame of columns that can be used for indexing + + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. + + A B C D E (tuple, as, label) + 0 foo one a 0.608477 -0.012500 -1.664297 + 1 foo two b -0.633460 0.249614 -0.364411 + 2 foo three c 0.615256 2.154968 -0.834666 + 3 bar one d 0.234246 1.085675 0.718445 + 4 bar two e 0.533841 -0.005702 -3.533912 + """ + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.randn(5), + "E": np.random.randn(5), + ("tuple", "as", "label"): np.random.randn(5), + } + ) + return df diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/__init__.py b/venv/Lib/site-packages/pandas/tests/frame/methods/__init__.py new file mode 100644 index 0000000..245594b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) DataFrame methods + +Ideally these files/tests should correspond 1-to-1 with tests.series.methods + +These may also present opportunities for sharing/de-duplicating test code. 
+""" diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_append.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_append.py new file mode 100644 index 0000000..d128a51 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_append.py @@ -0,0 +1,195 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameAppend: + def test_append_empty_list(self): + # GH 28769 + df = DataFrame() + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df + + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df # .append() should return a new object + + def test_append_series_dict(self): + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + series = df.loc[4] + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + df.append(series, verify_integrity=True) + + series.name = None + msg = "Can only append a Series if ignore_index=True" + with pytest.raises(TypeError, match=msg): + df.append(series, verify_integrity=True) + + result = df.append(series[::-1], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True + ) + tm.assert_frame_equal(result, expected) + + # dict + result = df.append(series.to_dict(), ignore_index=True) + tm.assert_frame_equal(result, expected) + + result = df.append(series[::-1][:3], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True + ) + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + # can append when name set + row = df.loc[4] + row.name = 5 + result = df.append(row) + expected = df.append(df[-1:], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_append_list_of_series_dicts(self): + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts = [x.to_dict() for idx, x in df.iterrows()] + + result = df.append(dicts, ignore_index=True) + expected = df.append(df, ignore_index=True) + tm.assert_frame_equal(result, expected) + + # different columns + dicts = [ + {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, + {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, + ] + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_missing_cols(self): + # GH22252 + # exercise the conditional branch in append method where the data + # to be appended is a list and does not contain all columns that are in + # the target DataFrame + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts = [{"foo": 9}, {"bar": 10}] + with tm.assert_produces_warning(None): + result = df.append(dicts, ignore_index=True, sort=True) + + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_empty_dataframe(self): + + # Empty df append empty df + df1 = DataFrame() + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-empty df append empty df + df1 = DataFrame(np.random.randn(5, 2)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + 
tm.assert_frame_equal(result, expected) + + # Empty df with columns append empty df + df1 = DataFrame(columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-Empty df with columns append empty df + df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + def test_append_dtypes(self): + + # GH 5754 + # row appends of different dtypes (so need to do by-item) + # can sometimes infer the correct type + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": np.nan}, index=range(1)) + df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] + ) + def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): + # GH 30238 + tz = tz_naive_fixture + df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, dtype", + [ + ([1], pd.Int64Dtype()), + ([1], pd.CategoricalDtype()), + ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), + ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), + ([1], pd.SparseDtype()), + ], + ) + def test_other_dtypes(self, data, dtype): + df = pd.DataFrame(data, dtype=dtype) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(data, name=0, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_asof.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_asof.py new file mode 100644 index 0000000..e2b4179 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_asof.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +import pandas._testing as tm + + +@pytest.fixture +def 
date_range_frame(): + """ + Fixture for DataFrame of ints with date_range index + + Columns are ['A', 'B']. + """ + N = 50 + rng = date_range("1/1/1990", periods=N, freq="53s") + return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng) + + +class TestFrameAsof: + def test_basic(self, date_range_frame): + df = date_range_frame + N = 50 + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + assert result.notna().all(1).all() + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + + result = df.asof(dates) + assert result.notna().all(1).all() + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + assert (rs == 14).all(1).all() + + def test_subset(self, date_range_frame): + N = 10 + df = date_range_frame.iloc[:N].copy() + df.loc[4:8, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + # with a subset of A should be the same + result = df.asof(dates, subset="A") + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=["A", "B"]) + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # B gives df.asof + result = df.asof(dates, subset="B") + expected = df.resample("25s", closed="right").ffill().reindex(dates) + expected.iloc[20:] = 9 + + tm.assert_frame_equal(result, expected) + + def test_missing(self, date_range_frame): + # GH 15118 + # no match found - `where` value before earliest date in index + N = 10 + df = date_range_frame.iloc[:N].copy() + + result = df.asof("1989-12-31") + + expected = Series( + index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64 + ) + tm.assert_series_equal(result, expected) + + result = df.asof(to_datetime(["1989-12-31"])) + expected = DataFrame( + index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64" + ) + tm.assert_frame_equal(result, expected) + + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + + def test_all_nans(self, date_range_frame): + # GH 15713 + # DataFrame is all nans + result = DataFrame([np.nan]).asof([0]) + expected = DataFrame([np.nan]) + tm.assert_frame_equal(result, expected) + + # testing non-default indexes, multiple inputs + N = 150 + rng = date_range_frame.index + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A"]) + tm.assert_frame_equal(result, expected) + + # testing multiple columns + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + # testing scalar input + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3]) + expected = DataFrame(np.nan, index=[3], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3) + expected = Series(np.nan, index=["A", "B"], name=3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "stamp,expected", + [ + ( + Timestamp("2018-01-01 23:22:43.325+00:00"), + Series(2.0, name=Timestamp("2018-01-01 23:22:43.325+00:00")), + ), + ( + Timestamp("2018-01-01 22:33:20.682+01:00"), + Series(1.0, 
name=Timestamp("2018-01-01 22:33:20.682+01:00")), + ), + ], + ) + def test_time_zone_aware_index(self, stamp, expected): + # GH21194 + # Testing awareness of DataFrame index considering different + # UTC and timezone + df = DataFrame( + data=[1, 2], + index=[ + Timestamp("2018-01-01 21:00:05.001+00:00"), + Timestamp("2018-01-01 22:35:10.550+00:00"), + ], + ) + + result = df.asof(stamp) + tm.assert_series_equal(result, expected) + + def test_is_copy(self, date_range_frame): + # GH-27357, GH-30784: ensure the result of asof is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = date_range_frame + N = 50 + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + + with tm.assert_produces_warning(None): + result["C"] = 1 diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_clip.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_clip.py new file mode 100644 index 0000000..34727da --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_clip.py @@ -0,0 +1,157 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameClip: + def test_clip(self, float_frame): + median = float_frame.median().median() + original = float_frame.copy() + + double = float_frame.clip(upper=median, lower=median) + assert not (double.values != median).any() + + # Verify that float_frame was not changed inplace + assert (float_frame.values == original.values).all() + + def test_inplace_clip(self, float_frame): + # GH#15388 + median = float_frame.median().median() + frame_copy = float_frame.copy() + + frame_copy.clip(upper=median, lower=median, inplace=True) + assert not (frame_copy.values != median).any() + + def test_dataframe_clip(self): + # GH#2747 + df = DataFrame(np.random.randn(1000, 2)) + + for lb, ub in [(-1, 1), (1, -1)]: + clipped_df = df.clip(lb, ub) + + lb, ub = min(lb, ub), max(ub, lb) + lb_mask = df.values <= lb + ub_mask = df.values >= ub + mask = ~lb_mask & ~ub_mask + assert (clipped_df.values[lb_mask] == lb).all() + assert (clipped_df.values[ub_mask] == ub).all() + assert (clipped_df.values[mask] == df.values[mask]).all() + + def test_clip_mixed_numeric(self): + # TODO(jreback) + # clip on mixed integer or floats + # with integer clippers coerces to float + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) + result = df.clip(1, 2) + expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) + tm.assert_frame_equal(result, expected, check_like=True) + + # GH#24162, clipping now preserves numeric types per column + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) + expected = df.dtypes + result = df.clip(upper=3).dtypes + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + def test_clip_against_series(self, inplace): + # GH#6966 + + df = DataFrame(np.random.randn(1000, 2)) + lb = Series(np.random.randn(1000)) + ub = lb + 1 + + original = df.copy() + clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) + + if inplace: + clipped_df = df + + for i in range(2): + lb_mask = original.iloc[:, i] <= lb + ub_mask = original.iloc[:, i] >= ub + mask = ~lb_mask & ~ub_mask + + result = clipped_df.loc[lb_mask, i] + tm.assert_series_equal(result, lb[lb_mask], check_names=False) + assert result.name == i + + result = clipped_df.loc[ub_mask, i] + tm.assert_series_equal(result, ub[ub_mask], 
check_names=False) + assert result.name == i + + tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + # GH#15390 + original = simple_frame.copy(deep=True) + + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) + + expected = pd.DataFrame(res, columns=original.columns, index=original.index) + if inplace: + result = original + tm.assert_frame_equal(result, expected, check_exact=True) + + @pytest.mark.parametrize("axis", [0, 1, None]) + def test_clip_against_frame(self, axis): + df = DataFrame(np.random.randn(1000, 2)) + lb = DataFrame(np.random.randn(1000, 2)) + ub = lb + 1 + + clipped_df = df.clip(lb, ub, axis=axis) + + lb_mask = df <= lb + ub_mask = df >= ub + mask = ~lb_mask & ~ub_mask + + tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + tm.assert_frame_equal(clipped_df[mask], df[mask]) + + def test_clip_against_unordered_columns(self): + # GH#20911 + df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) + df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) + result_upper = df1.clip(lower=0, upper=df2) + expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) + result_lower = df1.clip(lower=df3, upper=3) + expected_lower = df1.clip(lower=df3[df1.columns], upper=3) + result_lower_upper = df1.clip(lower=df3, upper=df2) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) + tm.assert_frame_equal(result_upper, expected_upper) + tm.assert_frame_equal(result_lower, expected_lower) + tm.assert_frame_equal(result_lower_upper, expected_lower_upper) + + def test_clip_with_na_args(self, float_frame): + """Should process np.nan argument as None """ + # GH#17276 + tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) + + # GH#19992 + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame( + {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_count.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_count.py new file mode 100644 index 0000000..13a93e3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_count.py @@ -0,0 +1,36 @@ +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameCount: + def test_count(self): + # corner case + frame = DataFrame() + ct1 = frame.count(1) + assert isinstance(ct1, Series) + + ct2 = frame.count(0) + assert isinstance(ct2, Series) + + # GH#423 + df = DataFrame(index=range(10)) + result = df.count(1) + expected = Series(0, index=df.index) + tm.assert_series_equal(result, 
expected) + + df = DataFrame(columns=range(10)) + result = df.count(0) + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + df = DataFrame() + result = df.count() + expected = Series(0, index=[]) + tm.assert_series_equal(result, expected) + + def test_count_objects(self, float_string_frame): + dm = DataFrame(float_string_frame._series) + df = DataFrame(float_string_frame._series) + + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_cov_corr.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_cov_corr.py new file mode 100644 index 0000000..5c13b60 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_cov_corr.py @@ -0,0 +1,272 @@ +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, isna +import pandas._testing as tm + + +class TestDataFrameCov: + def test_cov(self, float_frame, float_string_frame): + # min_periods no NAs (corner case) + expected = float_frame.cov() + result = float_frame.cov(min_periods=len(float_frame)) + + tm.assert_frame_equal(expected, result) + + result = float_frame.cov(min_periods=len(float_frame) + 1) + assert isna(result.values).all() + + # with NAs + frame = float_frame.copy() + frame["A"][:5] = np.nan + frame["B"][5:10] = np.nan + result = float_frame.cov(min_periods=len(float_frame) - 8) + expected = float_frame.cov() + expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan + + # regular + float_frame["A"][:5] = np.nan + float_frame["B"][:10] = np.nan + cov = float_frame.cov() + + tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) + + # exclude non-numeric types + result = float_string_frame.cov() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() + tm.assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0, 1.0, 10)) + result = df.cov() + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) + tm.assert_frame_equal(result, expected) + df.loc[0] = np.nan + result = df.cov() + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameCorr: + # DataFrame.corr(), as opposed to DataFrame.corrwith + + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) + @td.skip_if_no_scipy + def test_corr_scipy_method(self, float_frame, method): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + correls = float_frame.corr(method=method) + expected = float_frame["A"].corr(float_frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) + + # --------------------------------------------------------------------- + + @td.skip_if_no_scipy + def test_corr_non_numeric(self, float_frame, float_string_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + # exclude non-numeric types + result = float_string_frame.corr() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 
np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + rs = df.corr(meth) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) + def test_corr_constant(self, meth): + # constant --> all NA + + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) + rs = df.corr(meth) + assert isna(rs.values).all() + + @td.skip_if_no_scipy + def test_corr_int_and_boolean(self): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + for meth in ["pearson", "kendall", "spearman"]: + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) + + def test_corr_cov_independent_index_column(self): + # GH#14617 + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + for method in ["cov", "corr"]: + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + + def test_corr_invalid_method(self): + # GH#22298 + df = pd.DataFrame(np.random.normal(size=(10, 2))) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + df.corr(method="____") + + def test_corr_int(self): + # dtypes other than float64 GH#1761 + df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + df3.cov() + df3.corr() + + +class TestDataFrameCorrWith: + def test_corrwith(self, datetime_frame): + a = datetime_frame + noise = Series(np.random.randn(len(a)), index=a.index) + + b = datetime_frame.add(noise, axis=0) + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b["B"] + + colcorr = a.corrwith(b, axis=0) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) + + rowcorr = a.corrwith(b, axis=1) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped + + dropped = a.corrwith(b, axis=1, drop=True) + assert a.index[-1] not in dropped.index + + # non time-series data + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] + df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) + df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ["A", "B", "C", "D"] + + df1["obj"] = "foo" + df2["obj"] = "bar" + + result = df1.corrwith(df2) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) + tm.assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) + tm.assert_series_equal(result, expected) + + def test_corrwith_series(self, datetime_frame): + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = 
datetime_frame.apply(datetime_frame["A"].corr) + + tm.assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] + + tm.assert_almost_equal(c1, c2) + assert c1 < 1 + + def test_corrwith_mixed_dtypes(self): + # GH#18570 + df = pd.DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) + s = pd.Series([0, 6, 7, 3]) + result = df.corrwith(s) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = pd.Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_index_intersection(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=True).index.sort_values() + expected = df1.columns.intersection(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_index_union(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=False).index.sort_values() + expected = df1.columns.union(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_dup_cols(self): + # GH#21925 + df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_spearman(self): + # GH#21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_kendall(self): + # GH#21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_describe.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_describe.py new file mode 100644 index 0000000..251563e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_describe.py @@ -0,0 +1,333 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameDescribe: + def test_describe_bool_in_mixed_frame(self): + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) + + # Integer data are included in .describe() output, + # Boolean and string data are not. 
+ result = df.describe() + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + # Top value is a boolean value that is False + result = df.describe(include=["bool"]) + + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) + tm.assert_frame_equal(result, expected) + + def test_describe_empty_object(self): + # GH#27183 + df = pd.DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].describe() + tm.assert_frame_equal(result, expected) + + def test_describe_bool_frame(self): + # GH#13891 + df = pd.DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) + result = df.describe() + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) + result = df.describe() + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) + result = df.describe() + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_categorical(self): + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + cat = df + + # Categoricals should not show up together with numerical columns + result = cat.describe() + assert len(result.columns) == 1 + + # In a frame, describe() for the cat should be the same as for string + # arrays (count, unique, top, freq) + + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) + s = Series(cat) + result = s.describe() + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "b", "c", "c"])) + df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) + result = df3.describe() + tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + + def test_describe_empty_categorical_column(self): + # GH#26397 + # Ensure the index of an an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) + tm.assert_frame_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) + + def test_describe_categorical_columns(self): + # GH#11558 + 
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) + result = df.describe() + + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) + + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) + + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) + df.columns = columns + result = df.describe() + + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + expected.columns = exp_columns + tm.assert_frame_equal(result, expected) + assert result.columns.freq == "MS" + assert result.columns.tz == expected.columns.tz + + def test_describe_timedelta_values(self): + # GH#6145 + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + + result = df.describe() + tm.assert_frame_equal(result, expected) + + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) + assert repr(result) == exp_repr + + def test_describe_tz_values(self, tz_naive_fixture): + # GH#21332 + tz = tz_naive_fixture + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { + "s1": [ + 5, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2, + 1.581139, + 0, + 1, + 2, + 3, + 4, + ], + "s2": [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + }, + index=[ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ], + ) + result = df.describe(include="all") + 
tm.assert_frame_equal(result, expected) + + def test_describe_percentiles_integer_idx(self): + # GH#26660 + df = pd.DataFrame({"x": [1]}) + pct = np.linspace(0, 1, 10 + 1) + result = df.describe(percentiles=pct) + + expected = DataFrame( + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_diff.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_diff.py new file mode 100644 index 0000000..43c25f4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_diff.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameDiff: + def test_diff(self, datetime_frame): + the_diff = datetime_frame.diff(1) + + tm.assert_series_equal( + the_diff["A"], datetime_frame["A"] - datetime_frame["A"].shift(1) + ) + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = DataFrame({"s": s}).diff() + assert rs.s[1] == 1 + + # mixed numeric + tf = datetime_frame.astype("float32") + the_diff = tf.diff(1) + tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) + + # GH#10907 + df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) + df.insert(0, "x", 1) + result = df.diff(axis=1) + expected = pd.DataFrame( + {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)} + ).astype("float64") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=0) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis1(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + if tz is None: + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=1) + + def test_diff_timedelta(self): + # GH#4533 + df = DataFrame( + dict( + time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + value=[1.0, 2.0], + ) + ) + + res = df.diff() + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) + tm.assert_frame_equal(res, exp) + + def test_diff_mixed_dtype(self): + df = DataFrame(np.random.randn(5, 3)) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) + + result = df.diff() + assert result[0].dtype == np.float64 + + def test_diff_neg_n(self, datetime_frame): + rs = datetime_frame.diff(-1) + xp = datetime_frame - datetime_frame.shift(-1) + tm.assert_frame_equal(rs, xp) + + def test_diff_float_n(self, datetime_frame): + rs = datetime_frame.diff(1.0) + xp = datetime_frame.diff(1) + tm.assert_frame_equal(rs, xp) + + def test_diff_axis(self): + # GH#9727 + df = DataFrame([[1.0, 2.0], 
[3.0, 4.0]]) + tm.assert_frame_equal( + df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]) + ) + tm.assert_frame_equal( + df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) + ) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py new file mode 100644 index 0000000..fd4bae2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py @@ -0,0 +1,420 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_drop_duplicates_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") + + with pytest.raises(KeyError, match=msg): + df.drop_duplicates(subset) + + +def test_drop_duplicates(): + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(["AAA", "B"])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), keep="last") + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ["AAA", "B", "C"]] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep="last") + expected = df2.drop_duplicates(["AAA", "B"], keep="last") + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(["AAA", "B"], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates("C") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df["E"] = df["C"].astype("int8") + result = df.drop_duplicates("E") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("E", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = DataFrame([[-x, x], [0, x + 4]]) + 
tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = DataFrame([i] * 9 for i in range(16)) + df = df.append([[1] + [0] * 8], ignore_index=True) + + for keep in ["first", "last", False]: + assert df.duplicated(keep=keep).sum() == 0 + + +def test_drop_duplicates_with_duplicate_column_names(): + # GH17836 + df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates("a") + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + +def test_drop_duplicates_for_take_all(): + df = DataFrame( + { + "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(["AAA", "B"]) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep="last") + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_tuple(): + df = DataFrame( + { + ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates(("AA", "AB")) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates((("AA", "AB"), "B")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=["A", "B", "C"]), + DataFrame(index=[]), + DataFrame(index=["A", "B", "C"]), + ], +) +def test_drop_duplicates_empty(df): + # GH 20516 + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + result = df.copy() + result.drop_duplicates(inplace=True) + tm.assert_frame_equal(result, df) + + +def test_drop_duplicates_NA(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("A") + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.loc[[]] # 
empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["A", "B"]) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep="last") + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("C") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.loc[[3, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["C", "B"]) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep="last") + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA_for_take_all(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], + } + ) + + # single column + result = df.drop_duplicates("A") + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates("C") + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_inplace(): + orig = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + df = orig.copy() + df.drop_duplicates("A", inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates("A", keep="last", inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates("A", keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + + # multi column + df = orig.copy() + df.drop_duplicates(["A", "B"], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(["A", "B"], keep="last", inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + 
tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(["A", "B"], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.loc[:, ["A", "B", "C"]].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(["A", "B"]) + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep="last", inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep="last") + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep=False) + result = df2 + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + inplace, origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(origin_dict)) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_duplicated.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000..72eec87 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,100 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } + df = DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. 
Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_explode.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_explode.py new file mode 100644 index 0000000..76c87ed --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_explode.py @@ -0,0 +1,164 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + with pytest.raises(ValueError): + df.explode(list("AA")) + + df.columns = list("AA") + with pytest.raises(ValueError): + df.explode("A") + + +def test_basic(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_rows(): + df = pd.DataFrame( + {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), + ) + + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.MultiIndex.from_tuples( + [ + ("a", 1), + ("a", 1), + ("a", 1), + ("a", 2), + ("b", 1), + ("b", 2), + ("b", 2), + ] + ), + dtype=object, + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_columns(): + df = pd.DataFrame( + {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 
4)], dtype=object), ("A", 2): 1} + ) + + result = df.explode(("A", 1)) + expected = pd.DataFrame( + { + ("A", 1): pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.Index([0, 0, 0, 1, 2, 3, 3]), + dtype=object, + ), + ("A", 2): 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") + ).set_index("C") + result = df.explode("B") + + expected = pd.DataFrame( + { + "A": [11, 11, 11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode("text") + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_dict, input_index, expected_dict, expected_index", + [ + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + [0, 0], + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + [0, 0, 0, 0], + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.Index([0, 0], name="my_index"), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.Index([0, 0, 0, 0], name="my_index"), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"] + ), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], + names=["my_first_index", "my_second_index"], + ), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None] + ), + ), + ], +) +def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): + # GH 28005 + df = pd.DataFrame(input_dict, index=input_index) + result = df.explode("col1") + expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_isin.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_isin.py new file mode 100644 index 0000000..0eb94af --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_isin.py @@ -0,0 +1,186 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestDataFrameIsIn: + def test_isin(self): + # GH#4211 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] + + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # GH#16991 + df = 
DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + expected = DataFrame(False, df.index, df.columns) + + result = df.isin(empty) + tm.assert_frame_equal(result, expected) + + def test_isin_dict(self): + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} + + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + # non unique columns + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + def test_isin_with_string_scalar(self): + # GH#4763 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + with pytest.raises(TypeError): + df.isin("a") + + with pytest.raises(TypeError): + df.isin("aaa") + + def test_isin_df(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected["A"].loc[[1, 3]] = True + expected["B"].loc[[0, 2]] = True + tm.assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ["A", "C"] + result = df1.isin(df2) + expected["B"] = False + tm.assert_frame_equal(result, expected) + + def test_isin_tuples(self): + # GH#16394 + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) + with pytest.raises(ValueError): + df1.isin(df2) + + # just index duped + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) + with pytest.raises(ValueError): + df1.isin(df2) + + # cols and index: + df2.columns = ["B", "B"] + with pytest.raises(ValueError): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + tm.assert_frame_equal(result, expected) + + def test_isin_against_series(self): + df = pd.DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected["A"].loc["a"] = True + expected.loc["d"] = True + result = df.isin(s) + tm.assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) + # against regular index + 
expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(np.bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=["A", "B"], index=idx) + + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + def test_isin_empty_datetimelike(self): + # GH#15473 + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) + df3 = DataFrame() + + expected = DataFrame({"date": [False, False]}) + + result = df1_ts.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_ts.isin(df3) + tm.assert_frame_equal(result, expected) + + result = df1_td.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_td.isin(df3) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_nlargest.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_nlargest.py new file mode 100644 index 0000000..4ce4742 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_nlargest.py @@ -0,0 +1,211 @@ +""" +Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" +but are implicitly also testing nsmallest_foo. +""" +from string import ascii_lowercase + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def df_duplicates(): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) + + +@pytest.fixture +def df_strings(): + return pd.DataFrame( + { + "a": np.random.permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.permutation(10).astype("float64"), + } + ) + + +@pytest.fixture +def df_main_dtypes(): + return pd.DataFrame( + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + +class TestNLargestNSmallest: + + # ---------------------------------------------------------------------- + # Top / bottom + @pytest.mark.parametrize( + "order", + [ + ["a"], + ["c"], + ["a", "b"], + ["a", "c"], + ["b", "a"], + ["b", "c"], + ["a", "b", "c"], + ["c", "a", "b"], + ["c", "b", "a"], + ["b", "c", "a"], + ["b", "a", "c"], + # dups! 
+ ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) + def test_nlargest_n(self, df_strings, nselect_method, n, order): + # GH#10393 + df = df_strings + if "b" in order: + + error_msg = ( + f"Column 'b' has dtype object, " + f"cannot use method '{nselect_method}' with this dtype" + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(n, order) + else: + ascending = nselect_method == "nsmallest" + result = getattr(df, nselect_method)(n, order) + expected = df.sort_values(order, ascending=ascending).head(n) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) + def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): + df = df_main_dtypes + col = columns[1] + error_msg = ( + f"Column '{col}' has dtype {df[col].dtype}, " + f"cannot use method '{nselect_method}' with this dtype" + ) + # escape some characters that may be in the repr + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(2, columns) + + def test_nlargest_all_dtypes(self, df_main_dtypes): + df = df_main_dtypes + df.nsmallest(2, list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + def test_nlargest_duplicates_on_starter_columns(self): + # regression test for GH#22752 + + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) + + result = df.nlargest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_n_identical_values(self): + # GH#15297 + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) + + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): + # GH#13412 + + df = df_duplicates + result = df.nsmallest(n, order) + expected = df.sort_values(order).head(n) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(n, order) + expected = df.sort_values(order, ascending=False).head(n) + tm.assert_frame_equal(result, expected) + + def test_nlargest_duplicate_keep_all_ties(self): + # GH#16818 + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_multiindex_column_lookup(self): + # Check whether tuples 
are correctly treated as multi-level lookups. + # GH#23033 + df = pd.DataFrame( + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) + + # nsmallest + result = df.nsmallest(3, ("x", "a")) + expected = df.iloc[[2, 0, 3]] + tm.assert_frame_equal(result, expected) + + # nlargest + result = df.nlargest(3, ("x", "b")) + expected = df.iloc[[3, 2, 1]] + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_pct_change.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_pct_change.py new file mode 100644 index 0000000..8f3f37f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_pct_change.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFramePctChange: + def test_pct_change_numeric(self): + # GH#11150 + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) + pnl.iat[1, 0] = np.nan + pnl.iat[1, 1] = np.nan + pnl.iat[2, 3] = 60 + + for axis in range(2): + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") + + tm.assert_frame_equal(result, expected) + + def test_pct_change(self, datetime_frame): + rs = datetime_frame.pct_change(fill_method=None) + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) + + rs = datetime_frame.pct_change(2) + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.fillna(method="bfill", limit=1) + tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + df = DataFrame({"a": s, "b": s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) + tm.assert_frame_equal(chg, edf) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): + # GH#7292 + rs_freq = datetime_frame.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_frame.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_frame_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + data = DataFrame( + {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 + ) + result = data.pct_change(fill_method=fill_method) + if fill_method is None: + second_column = [np.nan, np.inf, 
np.nan, np.nan, 2.0, 1.0] + else: + second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + expected = DataFrame( + {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, + index=["a", "b"] * 3, + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_quantile.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_quantile.py new file mode 100644 index 0000000..64461c0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_quantile.py @@ -0,0 +1,492 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameQuantile: + def test_quantile_sparse(self): + # GH#17198 + s = pd.Series(pd.arrays.SparseArray([1, 2])) + s1 = pd.Series(pd.arrays.SparseArray([3, 4])) + df = pd.DataFrame({0: s, 1: s1}) + result = df.quantile() + + expected = pd.Series([1.5, 3.5], name=0.5) + tm.assert_series_equal(result, expected) + + def test_quantile(self, datetime_frame): + from numpy import percentile + + df = datetime_frame + q = df.quantile(0.1, axis=0) + assert q["A"] == percentile(df["A"], 10) + tm.assert_index_equal(q.index, df.columns) + + q = df.quantile(0.9, axis=1) + assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90) + tm.assert_index_equal(q.index, df.index) + + # test degenerate case + q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0) + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + # non-numeric exclusion + df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) + rs = df.quantile(0.5) + xp = df.median().rename(0.5) + tm.assert_series_equal(rs, xp) + + # axis + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + result = df.quantile([0.5, 0.75], axis=1) + expected = DataFrame( + {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75] + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + # We may want to break API in the future to change this + # so that we exclude non-numeric along the same axis + # See GH #7312 + df = DataFrame([[1, 2, 3], ["a", "b", 4]]) + result = df.quantile(0.5, axis=1) + expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + tm.assert_series_equal(result, expected) + + def test_quantile_axis_mixed(self): + + # mixed on axis=1 + df = DataFrame( + { + "A": [1, 2, 3], + "B": [2.0, 3.0, 4.0], + "C": pd.date_range("20130101", periods=3), + "D": ["foo", "bar", "baz"], + } + ) + result = df.quantile(0.5, axis=1) + expected = Series([1.5, 2.5, 3.5], name=0.5) + tm.assert_series_equal(result, expected) + + # must raise + with pytest.raises(TypeError): + df.quantile(0.5, axis=1, numeric_only=False) + + def test_quantile_axis_parameter(self): + # GH 9543/9544 + + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + + result = df.quantile(0.5, axis=0) + + expected = Series([2.0, 3.0], index=["A", "B"], name=0.5) + tm.assert_series_equal(result, expected) + + expected = df.quantile(0.5, axis="index") + tm.assert_series_equal(result, expected) + + result = df.quantile(0.5, axis=1) + + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + result = df.quantile(0.5, axis="columns") + tm.assert_series_equal(result, expected) + + msg = "No axis named -1 for object type " + with pytest.raises(ValueError, match=msg): + 
df.quantile(0.1, axis=-1) + msg = ( + "No axis named column for object type " + "" + ) + with pytest.raises(ValueError, match=msg): + df.quantile(0.1, axis="column") + + def test_quantile_interpolation(self): + # see gh-10174 + + # interpolation method other than default linear + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + # cross-check interpolation=nearest results in original dtype + exp = np.percentile( + np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest" + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64") + tm.assert_series_equal(result, expected) + + # float + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + exp = np.percentile( + np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), + 0.5, + axis=0, + interpolation="nearest", + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") + tm.assert_series_equal(result, expected) + + # axis + result = df.quantile([0.5, 0.75], axis=1, interpolation="lower") + expected = DataFrame( + {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] + ) + tm.assert_frame_equal(result, expected) + + # test degenerate case + df = DataFrame({"x": [], "y": []}) + q = df.quantile(0.1, axis=0, interpolation="higher") + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + # multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5], interpolation="midpoint") + + # https://github.com/numpy/numpy/issues/7163 + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_interpolation_datetime(self, datetime_frame): + # see gh-10174 + + # interpolation = linear (default case) + df = datetime_frame + q = df.quantile(0.1, axis=0, interpolation="linear") + assert q["A"] == np.percentile(df["A"], 10) + + def test_quantile_interpolation_int(self, int_frame): + # see gh-10174 + + df = int_frame + # interpolation = linear (default case) + q = df.quantile(0.1) + assert q["A"] == np.percentile(df["A"], 10) + + # test with and without interpolation keyword + q1 = df.quantile(0.1, axis=0, interpolation="linear") + assert q1["A"] == np.percentile(df["A"], 10) + tm.assert_series_equal(q, q1) + + def test_quantile_multi(self): + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5]) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.quantile([0.25, 0.5], axis=1) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2] + ) + + # empty + result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0) + expected = DataFrame( + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_datetime(self): + df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) + + # exclude datetime + result = df.quantile(0.5) + expected = Series([2.5], index=["b"]) + + # datetime 
+ result = df.quantile(0.5, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5 + ) + tm.assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantile([0.5], numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"] + ) + tm.assert_frame_equal(result, expected) + + # axis = 1 + df["c"] = pd.to_datetime(["2011", "2012"]) + result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], + index=[0, 1], + name=0.5, + ) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], + index=[0.5], + columns=[0, 1], + ) + tm.assert_frame_equal(result, expected) + + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + + def test_quantile_invalid(self, datetime_frame): + msg = "percentiles should all be in the interval \\[0, 1\\]" + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with pytest.raises(ValueError, match=msg): + datetime_frame.quantile(invalid) + + def test_quantile_box(self): + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + } + ) + + res = df.quantile(0.5, numeric_only=False) + + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=["A", "B", "C"], + ) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(res, exp) + + # DatetimeBlock may be consolidated and contain NaT in different loc + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "a": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.NaT, + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + "c": [ + pd.NaT, + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + columns=list("AaBbCc"), + ) + + res = df.quantile(0.5, numeric_only=False) + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=list("AaBbCc"), + ) + 
tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=list("AaBbCc"), + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df["b"] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=["a"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame( + { + "a": [ + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-03"), + ], + "b": [pd.NaT, pd.NaT, pd.NaT], + } + ) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame( + [[pd.Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty_no_rows(self): + + # floats + df = DataFrame(columns=["a", "b"], dtype="float64") + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=["a", "b"], dtype="int64") + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) + + def test_quantile_empty_no_columns(self): + # GH#23925 _get_numeric_data may drop all columns + df = pd.DataFrame(pd.date_range("1/1/18", periods=5)) + df.columns.name = "captain tightpants" + result = df.quantile(0.5) + expected = pd.Series([], index=[], name=0.5, dtype=np.float64) + expected.index.name = "captain tightpants" + tm.assert_series_equal(result, expected) + + 
result = df.quantile([0.5]) + expected = pd.DataFrame([], index=[0.5], columns=[]) + expected.columns.name = "captain tightpants" + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_rank.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_rank.py new file mode 100644 index 0000000..bab2db3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_rank.py @@ -0,0 +1,331 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestRank: + s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) + df = DataFrame({"A": s, "B": s}) + + results = { + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), + } + + @pytest.fixture(params=["average", "min", "max", "first", "dense"]) + def method(self, request): + """ + Fixture for trying all rank methods + """ + return request.param + + @td.skip_if_no_scipy + def test_rank(self, float_frame): + import scipy.stats # noqa:F401 + from scipy.stats import rankdata + + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan + + ranks0 = float_frame.rank() + ranks1 = float_frame.rank(1) + mask = np.isnan(float_frame.values) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(rankdata, 1, fvals) + exp1[mask] = np.nan + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # integers + df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4))) + + result = df.rank() + exp = df.astype(float).rank() + tm.assert_frame_equal(result, exp) + + result = df.rank(1) + exp = df.astype(float).rank(1) + tm.assert_frame_equal(result, exp) + + def test_rank2(self): + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", "c", "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] + df = DataFrame(data) + + # check the rank + expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]]) + result 
= df.rank(1, numeric_only=False, ascending=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]]) + result = df.rank(1, numeric_only=False, ascending=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) + tm.assert_frame_equal(df.rank(), exp) + + def test_rank_does_not_mutate(self): + # GH#18521 + # Check rank does not mutate DataFrame + df = DataFrame(np.random.randn(10, 3), dtype="float64") + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) + + def test_rank_mixed_frame(self, float_string_frame): + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + + result = float_string_frame.rank(1) + expected = float_string_frame.rank(1, numeric_only=True) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + def test_rank_na_option(self, float_frame): + import scipy.stats # noqa:F401 + from scipy.stats import rankdata + + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan + + # bottom + ranks0 = float_frame.rank(na_option="bottom") + ranks1 = float_frame.rank(1, na_option="bottom") + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp1 = np.apply_along_axis(rankdata, 1, fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # top + ranks0 = float_frame.rank(na_option="top") + ranks1 = float_frame.rank(1, na_option="top") + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fval0) + exp1 = np.apply_along_axis(rankdata, 1, fval1) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # bottom + ranks0 = float_frame.rank(na_option="top", ascending=False) + ranks1 = float_frame.rank(1, na_option="top", ascending=False) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fvals) + exp1 = np.apply_along_axis(rankdata, 1, -fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # top + ranks0 = float_frame.rank(na_option="bottom", ascending=False) + ranks1 = float_frame.rank(1, na_option="bottom", ascending=False) + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fval0) + exp1 = np.apply_along_axis(rankdata, 1, -fval1) + + tm.assert_numpy_array_equal(ranks0.values, exp0) + tm.assert_numpy_array_equal(ranks1.values, exp1) + + # bad values throw error + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option="bad", ascending=False) + + # invalid type + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option=True, ascending=False) + + def test_rank_axis(self): + # check if using axes' names gives the same result + df = DataFrame([[2, 1], [4, 3]]) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) + 
tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) + + @td.skip_if_no_scipy + def test_rank_methods_frame(self): + import scipy.stats # noqa:F401 + from scipy.stats import rankdata + + xs = np.random.randint(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord("z") - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + for ax in [0, 1]: + for m in ["average", "min", "max", "first", "dense"]: + result = df.rank(axis=ax, method=m) + sprank = np.apply_along_axis( + rankdata, ax, vals, m if m != "first" else "ordinal" + ) + sprank = sprank.astype(np.float64) + expected = DataFrame(sprank, columns=cols).astype("float64") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + def test_rank_descending(self, method, dtype): + + if "i" in dtype: + df = self.df.dropna() + else: + df = self.df.astype(dtype) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + tm.assert_frame_equal(res, expected) + + if method == "first" and dtype == "O": + return + + expected = (df.max() - df).rank(method=method) + + if dtype != "O": + res2 = df.rank(method=method, ascending=False, numeric_only=True) + tm.assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, numeric_only=False) + tm.assert_frame_equal(res3, expected) + + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("dtype", [None, object]) + def test_rank_2d_tie_methods(self, method, axis, dtype): + df = self.df + + def _check2d(df, expected, method="average", axis=0): + exp_df = DataFrame({"A": expected, "B": expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + tm.assert_frame_equal(result, exp_df) + + disabled = {(object, "first")} + if (dtype, method) in disabled: + return + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, self.results[method], method=method, axis=axis) + + @pytest.mark.parametrize( + "method,exp", + [ + ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), + ( + "min", + [ + [1.0 / 3, 1.0, 1.0], + [1.0 / 3, 1.0 / 3, 2.0 / 3], + [1.0 / 3, 1.0 / 3, 1.0 / 3], + ], + ), + ( + "max", + [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]], + ), + ( + "average", + [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]], + ), + ( + "first", + [ + [1.0 / 3, 1.0, 1.0], + [2.0 / 3, 1.0 / 3, 2.0 / 3], + [3.0 / 3, 2.0 / 3, 1.0 / 3], + ], + ), + ], + ) + def test_rank_pct_true(self, method, exp): + # see gh-15630. 
+ + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) + + @pytest.mark.single + @pytest.mark.high_memory + def test_pct_max_many_rows(self): + # GH 18271 + df = DataFrame( + {"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)} + ) + result = df.rank(pct=True).max() + assert (result == 1).all() diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_replace.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_replace.py new file mode 100644 index 0000000..aa91e7a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_replace.py @@ -0,0 +1,1358 @@ +from datetime import datetime +from io import StringIO +import re +from typing import Dict, List, Union + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, date_range +import pandas._testing as tm + + +@pytest.fixture +def mix_ab() -> Dict[str, List[Union[int, str]]]: + return {"a": list(range(4)), "b": list("ab..")} + + +@pytest.fixture +def mix_abc() -> Dict[str, List[Union[float, str]]]: + return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + + +class TestDataFrameReplace: + def test_replace_inplace(self, datetime_frame, float_string_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + + tsframe = datetime_frame.copy() + tsframe.replace(np.nan, 0, inplace=True) + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + # mixed type + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, 0) + expected = float_string_frame.fillna(value=0) + tm.assert_frame_equal(result, expected) + + tsframe = datetime_frame.copy() + tsframe.replace([np.nan], [0], inplace=True) + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + def test_regex_replace_scalar(self, mix_ab): + obj = {"a": list("ab.."), "b": list("efgh")} + dfobj = DataFrame(obj) + dfmix = DataFrame(mix_ab) + + # simplest cases + # regex -> value + # obj frame + res = dfobj.replace(r"\s*\.\s*", np.nan, regex=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.replace(r"\s*\.\s*", np.nan, regex=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) 
+ + res = dfmix.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1") + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + res = dfmix.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1") + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_scalar_inplace(self, mix_ab): + obj = {"a": list("ab.."), "b": list("efgh")} + dfobj = DataFrame(obj) + dfmix = DataFrame(mix_ab) + + # simplest cases + # regex -> value + # obj frame + res = dfobj.copy() + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.copy() + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.copy() + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.copy() + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.copy() + res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + res = dfobj.copy() + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.copy() + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + tm.assert_frame_equal(dfobj, res.fillna(".")) + + # mixed + res = dfmix.copy() + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + tm.assert_frame_equal(dfmix, res.fillna(".")) + + # regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) + objc = obj.copy() + objc["a"] = ["a", "b", "...", "..."] + expec = DataFrame(objc) + tm.assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() 
+ res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) + mixc = mix_ab.copy() + mixc["b"] = ["a", "b", "...", "..."] + expec = DataFrame(mixc) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_list_obj(self): + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} + dfobj = DataFrame(obj) + + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] + res = dfobj.replace(value=values, regex=to_replace_res) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_list_obj_inplace(self): + # same as above with inplace=True + # lists of regexes and values + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} + dfobj = DataFrame(obj) + + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] + res = dfobj.copy() + res.replace(value=values, regex=to_replace_res, inplace=True) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + def 
test_regex_replace_list_mixed(self, mix_ab): + # mixed frame to make sure this doesn't break things + dfmix = DataFrame(mix_ab) + + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} + dfmix2 = DataFrame(mix2) + res = dfmix2.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": mix2["a"], + "b": ["crap", "b", np.nan, np.nan], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(regex=to_replace_res, value=values) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed_inplace(self, mix_ab): + dfmix = DataFrame(mix_ab) + # the same inplace + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + res.replace(regex=to_replace_res, value=values, inplace=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_dict_mixed(self, mix_abc): + dfmix = DataFrame(mix_abc) + + # dicts + # single dict {re1: v1}, search the whole frame + # need test for this... 
+ + # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole + # frame + res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) + res2 = dfmix.copy() + res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the + # whole frame + res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) + res2 = dfmix.copy() + res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) + res2 = dfmix.copy() + res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # scalar -> dict + # to_replace regex, {value: value} + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + res2.replace("a", {"b": np.nan}, regex=True, inplace=True) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + res2.replace(regex="a", value={"b": np.nan}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + def test_regex_replace_dict_nested(self, mix_abc): + # nested dicts will not work until this is implemented for Series + dfmix = DataFrame(mix_abc) + res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) + res2 = dfmix.copy() + res4 = dfmix.copy() + res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True) + res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) + res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + tm.assert_frame_equal(res4, expec) + + def test_regex_replace_dict_nested_non_first_character(self): + # GH 25259 + df = pd.DataFrame({"first": ["abc", "bca", "cab"]}) + expected = pd.DataFrame({"first": [".bc", "bc.", "c.b"]}) + result = df.replace({"a": "."}, regex=True) + tm.assert_frame_equal(result, expected) + + def test_regex_replace_dict_nested_gh4115(self): + df = pd.DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) + result = df.replace({"Type": {"Q": 0, "T": 1}}) + tm.assert_frame_equal(result, expected) + + def test_regex_replace_list_to_scalar(self, mix_abc): + df = DataFrame(mix_abc) + expec = DataFrame( + { + "a": mix_abc["a"], + "b": np.array([np.nan] * 4), + "c": [np.nan, np.nan, np.nan, "d"], + } + ) + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + res2 = df.copy() + res3 = df.copy() + res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True) + res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, 
inplace=True) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_str_to_numeric(self, mix_abc): + # what happens when you try to replace a numeric value with a regex? + df = DataFrame(mix_abc) + res = df.replace(r"\s*\.\s*", 0, regex=True) + res2 = df.copy() + res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_regex_list_to_numeric(self, mix_abc): + df = DataFrame(mix_abc) + res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) + res2 = df.copy() + res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) + res3 = df.copy() + res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_series_of_regexes(self, mix_abc): + df = DataFrame(mix_abc) + s1 = Series({"b": r"\s*\.\s*"}) + s2 = Series({"b": np.nan}) + res = df.replace(s1, s2, regex=True) + res2 = df.copy() + res2.replace(s1, s2, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=s1, value=s2, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_numeric_to_object_conversion(self, mix_abc): + df = DataFrame(mix_abc) + expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) + res = df.replace(0, "a") + tm.assert_frame_equal(res, expec) + assert res.a.dtype == np.object_ + + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) + def test_replace_regex_metachar(self, metachar): + df = DataFrame({"a": [metachar, "else"]}) + result = df.replace({"a": {metachar: "paren"}}) + expected = DataFrame({"a": ["paren", "else"]}) + tm.assert_frame_equal(result, expected) + + def test_replace(self, datetime_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + + zero_filled = datetime_frame.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame) + + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + datetime_frame["B"][:5] = -1e8 + + # empty + df = DataFrame(index=["a", "b"]) + tm.assert_frame_equal(df, df.replace(5, 7)) + + # GH 11698 + # test for mixed data types. 
+ df = pd.DataFrame( + [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + df1 = df.replace("-", np.nan) + expected_df = pd.DataFrame( + [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + tm.assert_frame_equal(df1, expected_df) + + def test_replace_list(self): + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} + dfobj = DataFrame(obj) + + # lists of regexes and values + # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] + to_replace_res = [r".", r"e"] + values = [np.nan, "crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] + to_replace_res = [r".", r"f"] + values = [r"..", r"crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e", "crap", "g", "h"], + "c": ["h", "e", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + def test_replace_with_empty_list(self): + # GH 21977 + s = pd.Series([["a", "b"], [], np.nan, [1]]) + df = pd.DataFrame({"col": s}) + expected = df + result = df.replace([], np.nan) + tm.assert_frame_equal(result, expected) + + # GH 19266 + with pytest.raises(ValueError, match="cannot assign mismatch"): + df.replace({np.nan: []}) + with pytest.raises(ValueError, match="cannot assign mismatch"): + df.replace({np.nan: ["dummy", "alt"]}) + + def test_replace_series_dict(self): + # from GH 3064 + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + result = df.replace(0, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(0, df.mean()) + tm.assert_frame_equal(result, expected) + + # series to series/dict + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + s = Series({"zero": 0.0, "one": 2.0}) + result = df.replace(s, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(s, df.mean()) + tm.assert_frame_equal(result, expected) + + def test_replace_convert(self): + # gh 3907 + df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + m = {"foo": 1, "bar": 2, "bah": 3} + rep = df.replace(m) + expec = Series([np.int64] * 3) + res = rep.dtypes + tm.assert_series_equal(expec, res) + + def test_replace_mixed(self, float_string_frame): + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, -18) + expected = float_string_frame.fillna(value=-18) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + + result = float_string_frame.replace(np.nan, -1e8) + expected = float_string_frame.fillna(value=-1e8) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + + # int block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + df.replace(0, 0.5, 
inplace=True) + tm.assert_frame_equal(df, expected) + + # int block splitting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + "C": Series([1, 2], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + "C": Series([1, 2], dtype="int64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + # to object block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1, "foo"], dtype="object"), + "B": Series([0, 1], dtype="int64"), + } + ) + result = df.replace(2, "foo") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "A": Series(["foo", "bar"], dtype="object"), + "B": Series([0, "foo"], dtype="object"), + } + ) + result = df.replace([1, 2], ["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + # test case from + df = DataFrame( + {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} + ) + result = df.replace(3, df.mean().to_dict()) + expected = df.copy().astype("float64") + m = df.mean() + expected.iloc[0, 0] = m[0] + expected.iloc[1, 1] = m[1] + tm.assert_frame_equal(result, expected) + + def test_replace_simple_nested_dict(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({"col": {1: "a", 4: "b"}}) + tm.assert_frame_equal(expected, result) + + # in this case, should be the same as the not nested version + result = df.replace({1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + def test_replace_simple_nested_dict_with_nonexistent_value(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({-1: "-", 1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) + tm.assert_frame_equal(expected, result) + + def test_replace_value_is_none(self, datetime_frame): + orig_value = datetime_frame.iloc[0, 0] + orig2 = datetime_frame.iloc[1, 0] + + datetime_frame.iloc[0, 0] = np.nan + datetime_frame.iloc[1, 0] = 1 + + result = datetime_frame.replace(to_replace={np.nan: 0}) + expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T + tm.assert_frame_equal(result, expected) + + result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8}) + tsframe = datetime_frame.copy() + tsframe.iloc[0, 0] = 0 + tsframe.iloc[1, 0] = -1e8 + expected = tsframe + tm.assert_frame_equal(expected, result) + datetime_frame.iloc[0, 0] = orig_value + datetime_frame.iloc[1, 0] = orig2 + + def test_replace_for_new_dtypes(self, datetime_frame): + + # dtypes + tsframe = datetime_frame.copy().astype(np.float32) + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan + + zero_filled = tsframe.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) + + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan + tsframe["B"][:5] = -1e8 + + b = tsframe["B"] + b[b == -1e8] = np.nan + tsframe["B"] = b + result = tsframe.fillna(method="bfill") + tm.assert_frame_equal(result, tsframe.fillna(method="bfill")) + + @pytest.mark.parametrize( + "frame, to_replace, value, expected", + [ + (DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), + 1, + 0, 
+ DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), + ), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), + ), + ( + DataFrame({"bools": [True, False, True]}), + False, + True, + DataFrame({"bools": [True, True, True]}), + ), + ( + DataFrame({"complex": [1j, 2j, 3j]}), + 1j, + 0, + DataFrame({"complex": [0j, 2j, 3j]}), + ), + ( + DataFrame( + { + "datetime64": Index( + [ + datetime(2018, 5, 28), + datetime(2018, 7, 28), + datetime(2018, 5, 28), + ] + ) + } + ), + datetime(2018, 5, 28), + datetime(2018, 7, 28), + DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), + ), + # GH 20380 + ( + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), + "foo", + "bar", + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), + ), + ( + DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ), + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ), + ), + ], + ) + def test_replace_dtypes(self, frame, to_replace, value, expected): + result = getattr(frame, "replace")(to_replace, value) + tm.assert_frame_equal(result, expected) + + def test_replace_input_formats_listlike(self): + # both dicts + to_rep = {"A": np.nan, "B": 0, "C": ""} + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(to_rep, values) + expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + result = df.replace([0, 2, 5], [5, 2, 0]) + expected = DataFrame( + {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} + ) + tm.assert_frame_equal(result, expected) + + # scalar to dict + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(np.nan, values) + expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + # list to list + to_rep = [np.nan, 0, ""] + values = [-2, -1, "missing"] + result = df.replace(to_rep, values) + expected = df.copy() + for i in range(len(to_rep)): + expected.replace(to_rep[i], values[i], inplace=True) + tm.assert_frame_equal(result, expected) + + msg = r"Replacement lists must match in length\. 
Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + df.replace(to_rep, values[1:]) + + def test_replace_input_formats_scalar(self): + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + + # dict to scalar + to_rep = {"A": np.nan, "B": 0, "C": ""} + filled = df.replace(to_rep, 0) + expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + msg = "value argument must be scalar, dict, or Series" + with pytest.raises(TypeError, match=msg): + df.replace(to_rep, [np.nan, 0, ""]) + + # list to scalar + to_rep = [np.nan, 0, ""] + result = df.replace(to_rep, -1) + expected = df.copy() + for i in range(len(to_rep)): + expected.replace(to_rep[i], -1, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_replace_limit(self): + pass + + def test_replace_dict_no_regex(self): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_series_no_regex(self): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = Series( + { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + ) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_dict_tuple_list_ordering_remains_the_same(self): + df = DataFrame(dict(A=[np.nan, 1])) + res1 = df.replace(to_replace={np.nan: 0, 1: -1e8}) + res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) + res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) + + expected = DataFrame({"A": [0, -1e8]}) + tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res2, res3) + tm.assert_frame_equal(res3, expected) + + def test_replace_doesnt_replace_without_regex(self): + raw = """fol T_opp T_Dir T_Enh + 0 1 0 0 vo + 1 2 vr 0 0 + 2 2 0 0 0 + 3 3 0 bt 0""" + df = pd.read_csv(StringIO(raw), sep=r"\s+") + res = df.replace({r"\D": 1}) + tm.assert_frame_equal(df, res) + + def test_replace_bool_with_string(self): + df = DataFrame({"a": [True, False], "b": list("ab")}) + result = df.replace(True, "a") + expected = DataFrame({"a": ["a", False], "b": df.b}) + tm.assert_frame_equal(result, expected) + + def test_replace_pure_bool_with_string_no_op(self): + df = DataFrame(np.random.rand(2, 2) > 0.5) + result = df.replace("asdf", "fdsa") + tm.assert_frame_equal(df, result) + + def test_replace_bool_with_bool(self): + df = DataFrame(np.random.rand(2, 2) > 0.5) + result = df.replace(False, True) + expected = DataFrame(np.ones((2, 2), dtype=bool)) + tm.assert_frame_equal(result, expected) + + def test_replace_with_dict_with_bool_keys(self): + df = DataFrame({0: [True, False], 1: [False, True]}) + with pytest.raises(TypeError, match="Cannot compare types .+"): + df.replace({"asdf": "asdb", True: "yes"}) + + def test_replace_truthy(self): + df = DataFrame({"a": [True, True]}) + r = df.replace([np.inf, -np.inf], np.nan) + e = df + tm.assert_frame_equal(r, e) + + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement + 
df = DataFrame({"a": list(range(1, 5))}) + + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + tm.assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({"a": astr}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + tm.assert_frame_equal(result, expected) + + def test_replace_swapping_bug(self): + df = pd.DataFrame({"a": [True, False, True]}) + res = df.replace({"a": {True: "Y", False: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) + tm.assert_frame_equal(res, expect) + + df = pd.DataFrame({"a": [0, 1, 0]}) + res = df.replace({"a": {0: "Y", 1: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) + tm.assert_frame_equal(res, expect) + + def test_replace_period(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), + "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), + "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), + "out_augmented_SUBSIDY_WEEK.json": pd.Period( + year=2011, month=4, freq="M" + ), + "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), + "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), + "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + # We don't support converting object -> specialized EA in + # replace yet. 
+ expected = DataFrame( + {"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object + ) + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetime(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Timestamp("2011-08"), + "out_augmented_JAN_2011.json": pd.Timestamp("2011-01"), + "out_augmented_MAY_2012.json": pd.Timestamp("2012-05"), + "out_augmented_SUBSIDY_WEEK.json": pd.Timestamp("2011-04"), + "out_augmented_AUG_2012.json": pd.Timestamp("2012-08"), + "out_augmented_MAY_2011.json": pd.Timestamp("2011-05"), + "out_augmented_SEP_2013.json": pd.Timestamp("2013-09"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetimetz(self): + + # GH 11326 + # behaving poorly when presented with a datetime64[ns, tz] + df = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ) + result = df.replace(np.nan, 1) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": Series([0, 1, 2], dtype="float64"), + } + ) + tm.assert_frame_equal(result, expected) + + result = df.fillna(1) + tm.assert_frame_equal(result, expected) + + result = df.replace(0, np.nan) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [np.nan, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + result = df.replace( + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + ) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern")) + tm.assert_frame_equal(result, expected) + + # coerce to object + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Pacific"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": np.nan}, Timestamp("20130104")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + def test_replace_with_empty_dictlike(self, mix_abc): + # GH 15289 + df = DataFrame(mix_abc) + tm.assert_frame_equal(df, df.replace({})) + tm.assert_frame_equal(df, df.replace(Series([], dtype=object))) + + tm.assert_frame_equal(df, df.replace({"b": {}})) + tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) + + @pytest.mark.parametrize( + "to_replace, method, expected", + [ + (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", 
"b", "c"]}), + ( + np.nan, + "bfill", + {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, + ), + ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + [0, 2], + "bfill", + {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + [1, 2], + "pad", + {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + (1, 2), + "bfill", + {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + ["b", "c"], + "ffill", + {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, + ), + ], + ) + def test_replace_method(self, to_replace, method, expected): + # GH 19632 + df = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) + + result = df.replace(to_replace=to_replace, value=None, method=method) + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + expected = DataFrame(final_data, columns=["a", "b"], dtype="category") + expected["a"] = expected["a"].cat.set_categories([1, 2, 3]) + expected["b"] = expected["b"].cat.set_categories([1, 2, 3]) + result = df.replace(replace_dict, 3) + tm.assert_frame_equal(result, expected) + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + df.replace(replace_dict, 3, inplace=True) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "df, to_replace, exp", + [ + ( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + {4: 5, 5: 6, 6: 7}, + {"col1": [1, 2, 3], "col2": [5, 6, 7]}, + ), + ( + {"col1": [1, 2, 3], "col2": ["4", "5", "6"]}, + {"4": "5", "5": "6", "6": "7"}, + {"col1": [1, 2, 3], "col2": ["5", "6", "7"]}, + ), + ], + ) + def test_replace_commutative(self, df, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + # also added to data frame whilst issue was for series + + df = pd.DataFrame(df) + + expected = pd.DataFrame(exp) + result = df.replace(to_replace) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "replacer", + [ + pd.Timestamp("20170827"), + np.int8(1), + np.int16(1), + np.float32(1), + np.float64(1), + ], + ) + def test_replace_replacer_dtype(self, replacer): + # GH26632 + df = pd.DataFrame(["a"]) + result = df.replace({"a": replacer, "b": replacer}) + expected = pd.DataFrame([replacer]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_round.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_round.py new file mode 100644 index 0000000..0865e03 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_round.py @@ -0,0 +1,217 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, date_range +import pandas._testing as tm + + +class TestDataFrameRound: + def test_round(self): + # GH#2665 + + # Test that rounding an empty DataFrame does nothing + df = DataFrame() + tm.assert_frame_equal(df, df.round()) + + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. 
decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(df.round(), expected_rounded) + + # Round with an integer + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # This should also work with np.round (since np.round dispatches to + # df.round) + tm.assert_frame_equal(np.round(df, decimals), expected_rounded) + + # Round with a list + round_list = [1, 2] + with pytest.raises(TypeError): + df.round(round_list) + + # Round with a dictionary + expected_rounded = DataFrame( + {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} + ) + round_dict = {"col1": 1, "col2": 2} + tm.assert_frame_equal(df.round(round_dict), expected_rounded) + + # Incomplete dict + expected_partially_rounded = DataFrame( + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + partial_round_dict = {"col2": 1} + tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) + + # Dict with unknown elements + wrong_round_dict = {"col3": 2, "col2": 1} + tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) + + # float input to `decimals` + non_int_round_dict = {"col1": 1, "col2": 0.5} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + # String input + non_int_round_dict = {"col1": 1, "col2": "foo"} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # List input + non_int_round_dict = {"col1": 1, "col2": [1, 2]} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # Non integer Series inputs + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # Negative numbers + negative_round_dict = {"col1": -1, "col2": -2} + big_df = df * 100 + expected_neg_rounded = DataFrame( + {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} + ) + tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) + + # nan in Series round + nan_round_Series = Series({"col1": np.nan, "col2": 1}) + + # TODO(wesm): unused? 
+ expected_nan_round = DataFrame( # noqa + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + + with pytest.raises(TypeError): + df.round(nan_round_Series) + + # Make sure this doesn't break existing Series.round + tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) + + # named columns + # GH#11986 + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + df.columns.name = "cols" + expected_rounded.columns.name = "cols" + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # interaction of named columns & series + tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) + tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) + + def test_round_numpy(self): + # GH#12600 + df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) + out = np.round(df, decimals=0) + expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) + tm.assert_frame_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(df, decimals=0, out=df) + + def test_round_numpy_with_nan(self): + # See GH#14197 + df = Series([1.53, np.nan, 0.06]).to_frame() + with tm.assert_produces_warning(None): + result = df.round() + expected = Series([2.0, np.nan, 0.0]).to_frame() + tm.assert_frame_equal(result, expected) + + def test_round_mixed_type(self): + # GH#11885 + df = DataFrame( + { + "col1": [1.1, 2.2, 3.3, 4.4], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + round_0 = DataFrame( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + tm.assert_frame_equal(df.round(), round_0) + tm.assert_frame_equal(df.round(1), df) + tm.assert_frame_equal(df.round({"col1": 1}), df) + tm.assert_frame_equal(df.round({"col1": 0}), round_0) + tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) + tm.assert_frame_equal(df.round({"col3": 1}), df) + + def test_round_with_duplicate_columns(self): + # GH#11611 + + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) + + dfs = pd.concat((df, df), axis=1) + rounded = dfs.round() + tm.assert_index_equal(rounded.index, dfs.index) + + decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) + msg = "Index of decimals must be unique" + with pytest.raises(ValueError, match=msg): + df.round(decimals) + + def test_round_builtin(self): + # GH#11763 + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. 
decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(round(df), expected_rounded) + + def test_round_nonunique_categorical(self): + # See GH#21809 + idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) + df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) + + expected = df.round(3) + expected.index = idx + + df_categorical = df.copy().set_index(idx) + assert df_categorical.shape == (6, 3) + result = df_categorical.round(3) + assert result.shape == (6, 3) + + tm.assert_frame_equal(result, expected) + + def test_round_interval_category_columns(self): + # GH#30063 + columns = pd.CategoricalIndex(pd.interval_range(0, 2)) + df = DataFrame([[0.66, 1.1], [0.3, 0.25]], columns=columns) + + result = df.round() + expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_shift.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_shift.py new file mode 100644 index 0000000..cfb17de --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_shift.py @@ -0,0 +1,187 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, date_range, offsets +import pandas._testing as tm + + +class TestDataFrameShift: + def test_shift(self, datetime_frame, int_frame): + # naive shift + shiftedFrame = datetime_frame.shift(5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) + + shiftedSeries = datetime_frame["A"].shift(5) + tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) + + shiftedFrame = datetime_frame.shift(-5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) + + shiftedSeries = datetime_frame["A"].shift(-5) + tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) + + # shift by 0 + unshifted = datetime_frame.shift(0) + tm.assert_frame_equal(unshifted, datetime_frame) + + # shift by DateOffset + shiftedFrame = datetime_frame.shift(5, freq=offsets.BDay()) + assert len(shiftedFrame) == len(datetime_frame) + + shiftedFrame2 = datetime_frame.shift(5, freq="B") + tm.assert_frame_equal(shiftedFrame, shiftedFrame2) + + d = datetime_frame.index[0] + shifted_d = d + offsets.BDay(5) + tm.assert_series_equal( + datetime_frame.xs(d), shiftedFrame.xs(shifted_d), check_names=False + ) + + # shift int frame + int_shifted = int_frame.shift(1) # noqa + + # Shifting with PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal( + unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values + ) + + shifted2 = ps.shift(1, "B") + shifted3 = ps.shift(1, offsets.BDay()) + tm.assert_frame_equal(shifted2, shifted3) + tm.assert_frame_equal(ps, shifted2.shift(-1, "B")) + + msg = "does not match PeriodIndex freq" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="D") + + # shift other axis + # GH#6371 + df = DataFrame(np.random.rand(10, 5)) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, expected) + + # shift named axis + df = DataFrame(np.random.rand(10, 5)) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis="columns") + 
tm.assert_frame_equal(result, expected) + + def test_shift_bool(self): + df = DataFrame({"high": [True, False], "low": [False, False]}) + rs = df.shift(1) + xp = DataFrame( + np.array([[np.nan, np.nan], [True, False]], dtype=object), + columns=["high", "low"], + ) + tm.assert_frame_equal(rs, xp) + + def test_shift_categorical(self): + # GH#9416 + s1 = pd.Series(["a", "b", "c"], dtype="category") + s2 = pd.Series(["A", "B", "C"], dtype="category") + df = DataFrame({"one": s1, "two": s2}) + rs = df.shift(1) + xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) + tm.assert_frame_equal(rs, xp) + + def test_shift_fill_value(self): + # GH#24128 + df = DataFrame( + [1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H") + ) + exp = DataFrame( + [0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = df.shift(1, fill_value=0) + tm.assert_frame_equal(result, exp) + + exp = DataFrame( + [0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = df.shift(2, fill_value=0) + tm.assert_frame_equal(result, exp) + + def test_shift_empty(self): + # Regression test for GH#8019 + df = DataFrame({"foo": []}) + rs = df.shift(-1) + + tm.assert_frame_equal(df, rs) + + def test_shift_duplicate_columns(self): + # GH#9092; verify that position-based shifting works + # in the presence of duplicate columns + column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]] + data = np.random.randn(20, 5) + + shifted = [] + for columns in column_lists: + df = pd.DataFrame(data.copy(), columns=columns) + for s in range(5): + df.iloc[:, s] = df.iloc[:, s].shift(s + 1) + df.columns = range(5) + shifted.append(df) + + # sanity check the base case + nulls = shifted[0].isna().sum() + tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) + + # check all answers are the same + tm.assert_frame_equal(shifted[0], shifted[1]) + tm.assert_frame_equal(shifted[0], shifted[2]) + + def test_tshift(self, datetime_frame): + # PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_frame_equal(unshifted, ps) + + shifted2 = ps.tshift(freq="B") + tm.assert_frame_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=offsets.BDay()) + tm.assert_frame_equal(shifted, shifted3) + + with pytest.raises(ValueError, match="does not match"): + ps.tshift(freq="M") + + # DatetimeIndex + shifted = datetime_frame.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_frame_equal(datetime_frame, unshifted) + + shifted2 = datetime_frame.tshift(freq=datetime_frame.index.freq) + tm.assert_frame_equal(shifted, shifted2) + + inferred_ts = DataFrame( + datetime_frame.values, + Index(np.asarray(datetime_frame.index)), + columns=datetime_frame.columns, + ) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + tm.assert_frame_equal(shifted, datetime_frame.tshift(1)) + tm.assert_frame_equal(unshifted, inferred_ts) + + no_freq = datetime_frame.iloc[[0, 5, 7], :] + msg = "Freq was not given and was not set in the index" + with pytest.raises(ValueError, match=msg): + no_freq.tshift() diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_index.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_index.py new file mode 100644 index 0000000..2c25e1f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_index.py @@ -0,0 +1,320 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series 
+import pandas._testing as tm + + +class TestDataFrameSortIndex: + def test_sort_index_nan(self): + # GH#3917 + + # Test DataFrame with nan label + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4]}, + index=[np.nan, 1, 2, 3, 4, 5, 6], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort_index( + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9]}, + index=[np.nan, 6, 5, 4, 3, 2, 1], + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_index_multi_index(self): + # GH#25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) + + # axis=0 + unordered = frame.loc[[3, 2, 4, 1]] + a_id = id(unordered["A"]) + df = unordered.copy() + df.sort_index(inplace=True) + expected = frame + tm.assert_frame_equal(df, expected) + assert a_id != id(df["A"]) + + df = unordered.copy() + df.sort_index(ascending=False, inplace=True) + expected = frame[::-1] + tm.assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.loc[:, ["D", "B", "C", "A"]] + df = unordered.copy() + df.sort_index(axis=1, inplace=True) + expected = frame + tm.assert_frame_equal(df, expected) + + df = unordered.copy() + df.sort_index(axis=1, ascending=False, inplace=True) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(df, expected) + + def test_sort_index_different_sortorder(self): + A = np.arange(20).repeat(5) + B = np.tile(np.arange(5), 20) + + indexer = np.random.permutation(100) + A = A.take(indexer) + B = B.take(indexer) + + df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) + + ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) + expected = df.take(ex_indexer) + + # test with multiindex, too + idf = df.set_index(["A", "B"]) + + result = idf.sort_index(ascending=[1, 0]) + expected = idf.take(ex_indexer) + tm.assert_frame_equal(result, expected) + + # also, Series! 
+ result = idf["C"].sort_index(ascending=[1, 0]) + tm.assert_series_equal(result, expected["C"]) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + df = DataFrame([[1, 2], [3, 4]], mi) + + result = df.sort_index(level="A", sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["A", "B"], sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + # Error thrown by sort_index when + # first index is sorted last (GH#26053) + result = df.sort_index(level=["C", "B", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["B", "C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_categorical_index(self): + + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + + result = df.sort_index() + expected = df.iloc[[4, 0, 1, 5, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + def test_sort_index(self): + # GH#13496 + + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 : sort rows by index labels + unordered = frame.loc[[3, 2, 4, 1]] + result = unordered.sort_index(axis=0) + expected = frame + tm.assert_frame_equal(result, expected) + + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + tm.assert_frame_equal(result, expected) + + # axis=1 : sort columns by column names + unordered = frame.iloc[:, [2, 1, 3, 0]] + result = unordered.sort_index(axis=1) + tm.assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + # GH#13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + tm.assert_frame_equal(result, expected) + + # sort_remaining=False + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) + result = df.sort_index(level=level, sort_remaining=False) + tm.assert_frame_equal(result, expected) + + def test_sort_index_intervalindex(self): + # this is a de-facto sort via unstack + # confirming that we sort in the order of the bins + y = Series(np.random.randn(100)) + x1 = Series(np.sign(np.random.randn(100))) + x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) + + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() + expected = IntervalIndex.from_tuples( + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) + result = 
result.columns.levels[1].categories + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MulitIndex of index + mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_values.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_values.py new file mode 100644 index 0000000..96f4d6e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_sort_values.py @@ -0,0 +1,518 @@ +import random + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, NaT, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameSortValues: + def test_sort_values(self): + frame = DataFrame( + [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") + ) + + # by column (axis=0) + sorted_df = frame.sort_values(by="A") + indexer = frame["A"].argsort().values + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", ascending=False) + indexer = indexer[::-1] + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", 
ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + # GH4839 + sorted_df = frame.sort_values(by=["A"], ascending=[False]) + tm.assert_frame_equal(sorted_df, expected) + + # multiple bys + sorted_df = frame.sort_values(by=["B", "C"]) + expected = frame.loc[[2, 1, 3]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=["B", "C"], ascending=False) + tm.assert_frame_equal(sorted_df, expected[::-1]) + + sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=2, inplace=True) + + # by row (axis=1): GH#10806 + sorted_df = frame.sort_values(by=3, axis=1) + expected = frame + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=3, axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 2], axis="columns") + expected = frame.reindex(columns=["B", "A", "C"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + msg = r"Length of ascending \(5\) != length of by \(2\)" + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5) + + def test_sort_values_inplace(self): + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) + + sorted_df = frame.copy() + sorted_df.sort_values(by="A", inplace=True) + expected = frame.sort_values(by="A") + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by=1, axis=1, inplace=True) + expected = frame.sort_values(by=1, axis=1) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by="A", ascending=False, inplace=True) + expected = frame.sort_values(by="A", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True) + expected = frame.sort_values(by=["A", "B"], ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_multicolumn(self): + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + random.shuffle(A) + random.shuffle(B) + frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) + + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + def test_sort_values_multicolumn_uint64(self): + # GH#9918 + # uint64 multicolumn sort + + df = pd.DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + } + ) + df["a"] = df["a"].astype(np.uint64) + result = 
df.sort_values(["a", "b"]) + + expected = pd.DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + }, + index=pd.Index([1, 0]), + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nan(self): + # GH#3917 + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) + + # sort one column only + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values(["A"], na_position="first", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + expected = df.reindex(columns=["B", "A"]) + sorted_df = df.sort_values(by=1, axis=1, na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', order + expected = DataFrame( + {"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2], + ) + sorted_df = df.sort_values(["A", "B"]) + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', not order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', not order + expected = DataFrame( + {"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_descending_sort(self): + # GH#6399 + df = DataFrame( + [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], + columns=["sort_col", "order"], + ) + sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) + tm.assert_frame_equal(df, sorted_df) + + def test_sort_values_stable_descending_multicolumn_sort(self): + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) + # test stable mergesort + expected = DataFrame( + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]}, + index=[2, 5, 4, 6, 1, 3, 0], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort" + ) + tm.assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort" + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_categorial(self): + # GH#16793 + df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) + expected = df.copy() + sorted_df = df.sort_values("x", kind="mergesort") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_datetimes(self): + + # GH#3461, argsort / lexsort differences for a datetime column + df = DataFrame( + 
["a", "a", "a", "b", "c", "d", "e", "f", "g"], + columns=["A"], + index=date_range("20130101", periods=9), + ) + dts = [ + Timestamp(x) + for x in [ + "2004-02-11", + "2004-01-21", + "2004-01-26", + "2005-09-20", + "2010-10-04", + "2009-05-12", + "2008-11-12", + "2010-09-28", + "2010-09-28", + ] + ] + df["B"] = dts[::2] + dts[1::2] + df["C"] = 2.0 + df["A1"] = 3.0 + + df1 = df.sort_values(by="A") + df2 = df.sort_values(by=["A"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + df2 = df.sort_values(by=["B"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + + df2 = df.sort_values(by=["C", "B"]) + tm.assert_frame_equal(df1, df2) + + def test_sort_values_frame_column_inplace_sort_exception(self, float_frame): + s = float_frame["A"] + with pytest.raises(ValueError, match="This Series is a view"): + s.sort_values(inplace=True) + + cp = s.copy() + cp.sort_values() # it works! + + def test_sort_values_nat_values_in_int_column(self): + + # GH#14922: "sorting with large float and multiple columns incorrect" + + # cause was that the int64 value NaT was considered as "na". Which is + # only correct for datetime64 columns. + + int_values = (2, int(NaT)) + float_values = (2.0, -1.797693e308) + + df = DataFrame( + dict(int=int_values, float=float_values), columns=["int", "float"] + ) + + df_reversed = DataFrame( + dict(int=int_values[::-1], float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0], + ) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame( + dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), + columns=["datetime", "float"], + ) + + df_reversed = DataFrame( + dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0], + ) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df) + + # Ascending should not affect the results. + df_sorted = df.sort_values(["datetime", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + def test_sort_values_na_position_with_categories(self): + # GH#22556 + # Positioning missing value properly when column is Categorical. 
+ categories = ["A", "B", "C"] + category_indices = [0, 2, 4] + list_of_nans = [np.nan, np.nan] + na_indices = [1, 3] + na_position_first = "first" + na_position_last = "last" + column_name = "c" + + reversed_categories = sorted(categories, reverse=True) + reversed_category_indices = sorted(category_indices, reverse=True) + reversed_na_indices = sorted(na_indices) + + df = pd.DataFrame( + { + column_name: pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True + ) + } + ) + # sort ascending with na first + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + categories, categories=categories, ordered=True + ) + }, + index=na_indices + category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort ascending with na last + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + categories + list_of_nans, categories=categories, ordered=True + ) + }, + index=category_indices + na_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na first + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + reversed_categories, + categories=categories, + ordered=True, + ) + }, + index=reversed_na_indices + reversed_category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na last + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + reversed_categories + list_of_nans, + categories=categories, + ordered=True, + ) + }, + index=reversed_category_indices + reversed_na_indices, + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nat(self): + + # GH#16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories_raises(self): + df = pd.DataFrame( + { + "c": pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], + categories=["A", "B", "C"], + ordered=True, + ) + } + ) + + with pytest.raises(ValueError): + df.sort_values(by="c", ascending=False, na_position="bad_position") + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + True, + [0, 1, 2], + ), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + False, + [2, 1, 0], + ), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_dict, sorted_dict, ignore_index, output_index + ): + # GH 30114 + df = DataFrame(original_dict) + expected = 
DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(original_dict)) + + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = pd.DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } + ) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_dict.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_dict.py new file mode 100644 index 0000000..7b0adce --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_dict.py @@ -0,0 +1,258 @@ +from collections import OrderedDict, defaultdict +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameToDict: + def test_to_dict_timestamp(self): + + # GH#11247 + # split/records producing np.datetime64 rather than Timestamps + # on datetime64[ns] dtypes only + + tsmp = Timestamp("20130101") + test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) + test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) + + expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] + expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] + + assert test_data.to_dict(orient="records") == expected_records + assert test_data_mixed.to_dict(orient="records") == expected_records_mixed + + expected_series = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([tsmp, tsmp], name="B"), + } + expected_series_mixed = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([1, 2], name="B"), + } + + tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="series"), expected_series_mixed + ) + + expected_split = { + "index": [0, 1], + "data": [[tsmp, tsmp], [tsmp, tsmp]], + "columns": ["A", "B"], + } + expected_split_mixed = { + "index": [0, 1], + "data": [[tsmp, 1], [tsmp, 2]], + "columns": ["A", "B"], + } + + tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="split"), expected_split_mixed + ) + + def test_to_dict_index_not_unique_with_index_orient(self): + # GH#22801 + # Data loss when indexes are not unique. Raise ValueError. 
+ df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) + msg = "DataFrame index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="index") + + def test_to_dict_invalid_orient(self): + df = DataFrame({"A": [0, 1]}) + msg = "orient 'xinvalid' not understood" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="xinvalid") + + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) + def test_to_dict(self, mapping): + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + + # GH#16122 + recons_data = DataFrame(test_data).to_dict(into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("l", mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][int(k2) - 1] + + recons_data = DataFrame(test_data).to_dict("s", mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("sp", mapping) + expected_split = { + "columns": ["A", "B"], + "index": ["1", "2", "3"], + "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], + } + tm.assert_dict_equal(recons_data, expected_split) + + recons_data = DataFrame(test_data).to_dict("r", mapping) + expected_records = [ + {"A": 1.0, "B": "1"}, + {"A": 2.0, "B": "2"}, + {"A": np.nan, "B": "3"}, + ] + assert isinstance(recons_data, list) + assert len(recons_data) == 3 + for l, r in zip(recons_data, expected_records): + tm.assert_dict_equal(l, r) + + # GH#10844 + recons_data = DataFrame(test_data).to_dict("i") + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + df = DataFrame(test_data) + df["duped"] = df[df.columns[0]] + recons_data = df.to_dict("i") + comp_data = test_data.copy() + comp_data["duped"] = comp_data[df.columns[0]] + for k, v in comp_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + @pytest.mark.parametrize("mapping", [list, defaultdict, []]) + def test_to_dict_errors(self, mapping): + # GH#16122 + df = DataFrame(np.random.randn(3, 3)) + with pytest.raises(TypeError): + df.to_dict(into=mapping) + + def test_to_dict_not_unique_warning(self): + # GH#16927: When converting to a dict, if a column has a non-unique name + # it will be dropped, throwing a warning. 
+ df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) + with tm.assert_produces_warning(UserWarning): + df.to_dict() + + # orient - orient argument to to_dict function + # item_getter - function for extracting value from + # the resulting dict using column name and index + @pytest.mark.parametrize( + "orient,item_getter", + [ + ("dict", lambda d, col, idx: d[col][idx]), + ("records", lambda d, col, idx: d[idx][col]), + ("list", lambda d, col, idx: d[col][idx]), + ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), + ("index", lambda d, col, idx: d[idx][col]), + ], + ) + def test_to_dict_box_scalars(self, orient, item_getter): + # GH#14216, GH#23753 + # make sure that we are boxing properly + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, "a", 0), int) + assert isinstance(item_getter(result, "b", 0), float) + + def test_to_dict_tz(self): + # GH#18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [ + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + ] + df = DataFrame(list(data), columns=["d"]) + + result = df.to_dict(orient="records") + expected = [ + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) + + @pytest.mark.parametrize( + "into, expected", + [ + ( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ( + OrderedDict, + OrderedDict( + [ + (0, {"int_col": 1, "float_col": 1.0}), + (1, {"int_col": 2, "float_col": 2.0}), + (2, {"int_col": 3, "float_col": 3.0}), + ] + ), + ), + ( + defaultdict(dict), + defaultdict( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ), + ], + ) + def test_to_dict_index_dtypes(self, into, expected): + # GH#18580 + # When using to_dict(orient='index') on a dataframe with int + # and float columns only the int columns were cast to float + + df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) + + result = df.to_dict(orient="index", into=into) + cols = ["int_col", "float_col"] + result = DataFrame.from_dict(result, orient="index")[cols] + expected = DataFrame.from_dict(expected, orient="index")[cols] + tm.assert_frame_equal(result, expected) + + def test_to_dict_numeric_names(self): + # GH#24940 + df = DataFrame({str(i): [i] for i in range(5)}) + result = set(df.to_dict("records")[0].keys()) + expected = set(df.columns) + assert result == expected + + def test_to_dict_wide(self): + # GH#24939 + df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)}) + result = df.to_dict("records")[0] + expected = {"A_{:d}".format(i): i for i in range(256)} + assert result == expected + + def test_to_dict_orient_dtype(self): + # GH#22620 + # Input Data + input_data = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["X", "Y", "Z"]} + df = DataFrame(input_data) + # Expected Dtypes + expected = {"a": int, "b": float, "c": str} + # Extracting dtypes out of to_dict operation + for df_dict in df.to_dict("records"): + result = { + "a": type(df_dict["a"]), + "b": type(df_dict["b"]), + "c": type(df_dict["c"]), + } + assert result == expected diff 
--git a/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_records.py b/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_records.py new file mode 100644 index 0000000..d0181f0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/methods/test_to_records.py @@ -0,0 +1,360 @@ +from collections import abc + +import numpy as np +import pytest + +from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range +import pandas._testing as tm + + +class TestDataFrameToRecords: + def test_to_records_dt64(self): + df = DataFrame( + [["one", "two", "three"], ["four", "five", "six"]], + index=date_range("2012-01-01", "2012-01-02"), + ) + + expected = df.index.values[0] + result = df.to_records()["index"][0] + assert expected == result + + def test_to_records_with_multindex(self): + # GH#3189 + index = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + data = np.zeros((8, 4)) + df = DataFrame(data, index=index) + r = df.to_records(index=True)["level_0"] + assert "bar" in r + assert "one" not in r + + def test_to_records_with_Mapping_type(self): + import email + from email.parser import Parser + + abc.Mapping.register(email.message.Message) + + headers = Parser().parsestr( + "From: \n" + "To: \n" + "Subject: Test message\n" + "\n" + "Body would go here\n" + ) + + frame = DataFrame.from_records([headers]) + all(x in frame for x in ["Type", "Subject", "From"]) + + def test_to_records_floats(self): + df = DataFrame(np.random.rand(10, 10)) + df.to_records() + + def test_to_records_index_name(self): + df = DataFrame(np.random.randn(3, 3)) + df.index.name = "X" + rs = df.to_records() + assert "X" in rs.dtype.fields + + df = DataFrame(np.random.randn(3, 3)) + rs = df.to_records() + assert "index" in rs.dtype.fields + + df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + df.index.names = ["A", None] + rs = df.to_records() + assert "level_0" in rs.dtype.fields + + def test_to_records_with_unicode_index(self): + # GH#13172 + # unicode_literals conflict with to_records + result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() + expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_unicode_column_names(self): + # xref issue: https://github.com/numpy/numpy/issues/2407 + # Issue GH#11879. to_records used to raise an exception when used + # with column names containing non-ascii characters in Python 2 + result = DataFrame(data={"accented_name_é": [1.0]}).to_records() + + # Note that numpy allows for unicode field names but dtypes need + # to be specified using dictionary instead of list of tuples. 
+ expected = np.rec.array( + [(0, 1.0)], + dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]}, + ) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_categorical(self): + # GH#8626 + + # dict creation + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) + + # list-like creation + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) + tm.assert_series_equal(df[0], expected) + + # to record array + # this coerces + result = df.to_records() + expected = np.rec.array( + [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")] + ) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,expected", + [ + # No dtypes --> default to array dtypes. + ( + dict(), + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[("index", " bool: + return key in self.d + + def keys(self): + return self.d.keys() + + df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + dtype_mappings = dict( + column_dtypes=DictLike(**{"A": np.int8, "B": np.float32}), + index_dtypes="= 1)] + result = df2.set_index("key") + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # Add list-of-list constructor because list is ambiguous -> lambda + # also test index name if append=True (name is duplicate here for B) + @pytest.mark.parametrize( + "box", + [ + Series, + Index, + np.array, + list, + lambda x: [list(x)], + lambda x: MultiIndex.from_arrays([x]), + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_single_array( + self, frame_of_index_cols, drop, append, index_name, box + ): + df = frame_of_index_cols + df.index.name = index_name + + key = box(df["B"]) + if box == list: + # list of strings gets interpreted as list of keys + msg = "['one', 'two', 'three', 'one', 'two']" + with pytest.raises(KeyError, match=msg): + df.set_index(key, drop=drop, append=append) + else: + # np.array/list-of-list "forget" the name of B + name_mi = getattr(key, "names", None) + name = [getattr(key, "name", None)] if name_mi is None else name_mi + + result = df.set_index(key, drop=drop, append=append) + + # only valid column keys are dropped + # since B is always passed as array above, nothing is dropped + expected = df.set_index(["B"], drop=False, append=append) + expected.index.names = [index_name] + name if append else name + + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # also test index name if append=True (name is duplicate here for A & B) + @pytest.mark.parametrize( + "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])] + ) + @pytest.mark.parametrize( + "append, index_name", + [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)], + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays( + self, frame_of_index_cols, drop, append, index_name, box + ): + df = frame_of_index_cols + df.index.name = index_name + + keys = ["A", box(df["B"])] + # np.array/list "forget" the name of B + names = ["A", None if box in [np.array, list, tuple, iter] else "B"] + + result = df.set_index(keys, drop=drop, append=append) + + # only valid column keys are 
dropped + # since B is always passed as array above, only A is dropped, if at all + expected = df.set_index(["A", "B"], drop=False, append=append) + expected = expected.drop("A", axis=1) if drop else expected + expected.index.names = [index_name] + names if append else names + + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # We also emulate a "constructor" for the label -> lambda + # also test index name if append=True (name is duplicate here for A) + @pytest.mark.parametrize( + "box2", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "box1", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays_duplicate( + self, frame_of_index_cols, drop, append, index_name, box1, box2 + ): + df = frame_of_index_cols + df.index.name = index_name + + keys = [box1(df["A"]), box2(df["A"])] + result = df.set_index(keys, drop=drop, append=append) + + # if either box is iter, it has been consumed; re-read + keys = [box1(df["A"]), box2(df["A"])] + + # need to adapt first drop for case that both keys are 'A' -- + # cannot drop the same column twice; + # use "is" because == would give ambiguous Boolean error for containers + first_drop = ( + False if (keys[0] is "A" and keys[1] is "A") else drop # noqa: F632 + ) + # to test against already-tested behaviour, we add sequentially, + # hence second append always True; must wrap keys in list, otherwise + # box = list would be interpreted as keys + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append): + df = frame_of_index_cols + keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"]) + + result = df.set_index(keys, drop=drop, append=append) + + # setting with a MultiIndex will never drop columns + expected = df.set_index(["A", "B"], drop=False, append=append) + + tm.assert_frame_equal(result, expected) + + def test_set_index_verify_integrity(self, frame_of_index_cols): + df = frame_of_index_cols + + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index("A", verify_integrity=True) + # with MultiIndex + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index([df["A"], df["A"]], verify_integrity=True) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): + df = frame_of_index_cols + + with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): + # column names are A-E, as well as one tuple + df.set_index(["foo", "bar", "baz"], drop=drop, append=append) + + # non-existent key in list with arrays + with pytest.raises(KeyError, match="X"): + df.set_index([df["A"], df["B"], "X"], drop=drop, append=append) + + msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" + # tuples always raise KeyError + with pytest.raises(KeyError, match=msg): + df.set_index(tuple(df["A"]), 
drop=drop, append=append)
+
+        # also within a list
+        with pytest.raises(KeyError, match=msg):
+            df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)
+
+    @pytest.mark.parametrize("append", [True, False])
+    @pytest.mark.parametrize("drop", [True, False])
+    @pytest.mark.parametrize("box", [set], ids=["set"])
+    def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):
+        df = frame_of_index_cols
+
+        msg = 'The parameter "keys" may be a column key, .*'
+        # forbidden type, e.g. set
+        with pytest.raises(TypeError, match=msg):
+            df.set_index(box(df["A"]), drop=drop, append=append)
+
+        # forbidden type in list, e.g. set
+        with pytest.raises(TypeError, match=msg):
+            df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)
+
+    # MultiIndex constructor does not work directly on Series -> lambda
+    @pytest.mark.parametrize(
+        "box",
+        [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],
+        ids=["Series", "Index", "np.array", "iter", "MultiIndex"],
+    )
+    @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])
+    @pytest.mark.parametrize("append", [True, False])
+    @pytest.mark.parametrize("drop", [True, False])
+    def test_set_index_raise_on_len(
+        self, frame_of_index_cols, box, length, drop, append
+    ):
+        # GH 24984
+        df = frame_of_index_cols  # has length 5
+
+        values = np.random.randint(0, 10, (length,))
+
+        msg = "Length mismatch: Expected 5 rows, received array of length.*"
+
+        # wrong length directly
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(box(values), drop=drop, append=append)
+
+        # wrong length in list
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(["A", df.A, box(values)], drop=drop, append=append)
+
+    def test_set_index_custom_label_type(self):
+        # GH 24969
+
+        class Thing:
+            def __init__(self, name, color):
+                self.name = name
+                self.color = color
+
+            def __str__(self) -> str:
+                return f"<Thing {repr(self.name)}>"
+
+            # necessary for pretty KeyError
+            __repr__ = __str__
+
+        thing1 = Thing("One", "red")
+        thing2 = Thing("Two", "blue")
+        df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
+        expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
+
+        # use custom label directly
+        result = df.set_index(thing2)
+        tm.assert_frame_equal(result, expected)
+
+        # custom label wrapped in list
+        result = df.set_index([thing2])
+        tm.assert_frame_equal(result, expected)
+
+        # missing key
+        thing3 = Thing("Three", "pink")
+        msg = "<Thing 'Three'>"
+        with pytest.raises(KeyError, match=msg):
+            # missing label directly
+            df.set_index(thing3)
+
+        with pytest.raises(KeyError, match=msg):
+            # missing label in list
+            df.set_index([thing3])
+
+    def test_set_index_custom_label_hashable_iterable(self):
+        # GH 24969
+
+        # actual example discussed in GH 24984 was e.g. for shapely.geometry
+        # objects (e.g.
a collection of Points) that can be both hashable and + # iterable; using frozenset as a stand-in for testing here + + class Thing(frozenset): + # need to stabilize repr for KeyError (due to random order in sets) + def __repr__(self) -> str: + tmp = sorted(self) + # double curly brace prints one brace in format string + return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) + + thing1 = Thing(["One", "red"]) + thing2 = Thing(["Two", "blue"]) + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing(["Three", "pink"]) + msg = r"frozenset\(\{'Three', 'pink'\}\)" + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_type_raises(self): + # GH 24969 + + # purposefully inherit from something unhashable + class Thing(set): + def __init__(self, name, color): + self.name = name + self.color = color + + def __str__(self) -> str: + return f"" + + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") + df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) + + msg = 'The parameter "keys" may be a column key, .*' + + with pytest.raises(TypeError, match=msg): + # use custom label directly + df.set_index(thing2) + + with pytest.raises(TypeError, match=msg): + # custom label wrapped in list + df.set_index([thing2]) + + def test_construction_with_categorical_index(self): + ci = tm.makeCategoricalIndex(10) + ci.name = "B" + + # with Categorical + df = DataFrame({"A": np.random.randn(10), "B": ci.values}) + idf = df.set_index("B") + tm.assert_index_equal(idf.index, ci) + + # from a CategoricalIndex + df = DataFrame({"A": np.random.randn(10), "B": ci}) + idf = df.set_index("B") + tm.assert_index_equal(idf.index, ci) + + # round-trip + idf = idf.reset_index().set_index("B") + tm.assert_index_equal(idf.index, ci) + + def test_set_index_cast_datetimeindex(self): + df = DataFrame( + { + "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], + "B": np.random.randn(1000), + } + ) + + idf = df.set_index("A") + assert isinstance(idf.index, DatetimeIndex) + + def test_convert_dti_to_series(self): + # don't cast a DatetimeIndex WITH a tz, leave as object + # GH 6032 + idx = DatetimeIndex( + to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]), name="B" + ).tz_localize("US/Pacific") + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + + expected = Series( + np.array( + [ + Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ], + dtype="object", + ), + name="B", + ) + + # convert index to series + result = Series(idx) + tm.assert_series_equal(result, expected) + + # assign to frame + df["B"] = idx + result = df["B"] + tm.assert_series_equal(result, expected) + + # convert to series while keeping the timezone + msg = "stop passing 'keep_tz'" + with tm.assert_produces_warning(FutureWarning) as m: + result = idx.to_series(keep_tz=True, index=[0, 1]) + tm.assert_series_equal(result, expected) + assert msg in str(m[0].message) + + # convert to utc + with tm.assert_produces_warning(FutureWarning) as m: + df["B"] = idx.to_series(keep_tz=False, index=[0, 
1]) + result = df["B"] + comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") + tm.assert_series_equal(result, comp) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) + + result = idx.to_series(index=[0, 1]) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning) as m: + result = idx.to_series(keep_tz=False, index=[0, 1]) + tm.assert_series_equal(result, expected.dt.tz_convert(None)) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) + + # list of datetimes with a tz + df["B"] = idx.to_pydatetime() + result = df["B"] + tm.assert_series_equal(result, expected) + + # GH 6785 + # set the index manually + import pytz + + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + expected = df.set_index("ts") + df.index = df["ts"] + df.pop("ts") + tm.assert_frame_equal(df, expected) + + def test_reset_index_tz(self, tz_aware_fixture): + # GH 3950 + # reset_index with single level + tz = tz_aware_fixture + idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx") + df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx) + + expected = DataFrame( + { + "idx": [ + datetime(2011, 1, 1), + datetime(2011, 1, 2), + datetime(2011, 1, 3), + datetime(2011, 1, 4), + datetime(2011, 1, 5), + ], + "a": range(5), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx", "a", "b"], + ) + expected["idx"] = expected["idx"].apply(lambda d: Timestamp(d, tz=tz)) + tm.assert_frame_equal(df.reset_index(), expected) + + def test_set_index_timezone(self): + # GH 12358 + # tz-aware Series should retain the tz + idx = to_datetime(["2014-01-01 10:10:10"], utc=True).tz_convert("Europe/Rome") + df = DataFrame({"A": idx}) + assert df.set_index(idx).index[0].hour == 11 + assert DatetimeIndex(Series(df.A))[0].hour == 11 + assert df.set_index(df.A).index[0].hour == 11 + + def test_set_index_dst(self): + di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") + + df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() + # single level + res = df.set_index("index") + exp = DataFrame( + data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index") + ) + tm.assert_frame_equal(res, exp) + + # GH 12920 + res = df.set_index(["index", "a"]) + exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"]) + exp = DataFrame({"b": [3, 4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp) + + def test_reset_index_with_intervals(self): + idx = IntervalIndex.from_breaks(np.arange(11), name="x") + original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]] + + result = original.set_index("x") + expected = DataFrame({"y": np.arange(10)}, index=idx) + tm.assert_frame_equal(result, expected) + + result2 = result.reset_index() + tm.assert_frame_equal(result2, original) + + def test_set_index_multiindexcolumns(self): + columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)]) + df = DataFrame(np.random.randn(3, 3), columns=columns) + result = df.set_index(df.columns[0]) + expected = df.iloc[:, 1:] + expected.index = df.iloc[:, 0].values + expected.index.names = [df.columns[0]] + tm.assert_frame_equal(result, expected) + + def test_set_index_empty_column(self): + # GH 1971 + df = DataFrame( + [ + {"a": 1, "p": 0}, + {"a": 2, "m": 10}, + {"a": 3, "m": 11, "p": 20}, + {"a": 4, "m": 12, "p": 21}, + ], + columns=("a", "m", "p", "x"), + ) + + result = df.set_index(["a", "x"]) + expected = df[["m", 
"p"]] + expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"]) + tm.assert_frame_equal(result, expected) + + def test_set_columns(self, float_string_frame): + cols = Index(np.arange(len(float_string_frame.columns))) + float_string_frame.columns = cols + with pytest.raises(ValueError, match="Length mismatch"): + float_string_frame.columns = cols[::2] + + def test_dti_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.reindex(idx2) + tm.assert_index_equal(df.index, idx2) + + # GH 11314 + # with tz + index = date_range( + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + ) + df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index) + new_index = date_range( + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + ) + + result = df.set_index(new_index) + assert result.index.freq == index.freq + + # Renaming + + def test_rename(self, float_frame): + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} + + renamed = float_frame.rename(columns=mapping) + renamed2 = float_frame.rename(columns=str.lower) + + tm.assert_frame_equal(renamed, renamed2) + tm.assert_frame_equal( + renamed2.rename(columns=str.upper), float_frame, check_names=False + ) + + # index + data = {"A": {"foo": 0, "bar": 1}} + + # gets sorted alphabetical + df = DataFrame(data) + renamed = df.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["foo", "bar"])) + + renamed = df.rename(index=str.upper) + tm.assert_index_equal(renamed.index, Index(["BAR", "FOO"])) + + # have to pass something + with pytest.raises(TypeError, match="must pass an index to rename"): + float_frame.rename() + + # partial columns + renamed = float_frame.rename(columns={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"])) + + # other axis + renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"])) + + # index with name + index = Index(["foo", "bar"], name="name") + renamer = DataFrame(data, index=index) + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) + assert renamed.index.name == renamer.index.name + + @pytest.mark.parametrize( + "args,kwargs", + [ + ((ChainMap({"A": "a"}, {"B": "b"}),), dict(axis="columns")), + ((), dict(columns=ChainMap({"A": "a"}, {"B": "b"}))), + ], + ) + def test_rename_chainmap(self, args, kwargs): + # see gh-23859 + colAData = range(1, 11) + colBdata = np.random.randn(10) + + df = DataFrame({"A": colAData, "B": colBdata}) + result = df.rename(*args, **kwargs) + + expected = DataFrame({"a": colAData, "b": colBdata}) + tm.assert_frame_equal(result, expected) + + def test_rename_axis_inplace(self, float_frame): + # GH 15704 + expected = float_frame.rename_axis("foo") + result = float_frame.copy() + no_return = result.rename_axis("foo", inplace=True) + + assert no_return is None + tm.assert_frame_equal(result, expected) + + expected = float_frame.rename_axis("bar", axis=1) + result = float_frame.copy() + no_return = result.rename_axis("bar", axis=1, inplace=True) + + assert no_return is None + tm.assert_frame_equal(result, expected) + + def test_rename_axis_raises(self): + # 
https://github.com/pandas-dev/pandas/issues/17833 + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis({0: 10, 1: 20}, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=1) + + with pytest.raises(ValueError, match="Use `.rename`"): + df["A"].rename_axis(id) + + def test_rename_axis_mapper(self): + # GH 19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + df = DataFrame( + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi + ) + + # Test for rename of the Index object of columns + result = df.rename_axis("cols", axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) + + # Test for rename of the Index object of columns using dict + result = result.rename_axis(columns={"cols": "new"}, axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) + + # Test for renaming index using dict + result = df.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + # Test for renaming index using a function + result = df.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + # Test for renaming index providing complete list + result = df.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + # Test for changing index and columns at same time + sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) + result = sdf.rename_axis(index="foo", columns="meh") + assert result.index.name == "foo" + assert result.columns.name == "meh" + + # Test different error cases + with pytest.raises(TypeError, match="Must pass"): + df.rename_axis(index="wrong") + + with pytest.raises(ValueError, match="Length of names"): + df.rename_axis(index=["wrong"]) + + with pytest.raises(TypeError, match="bogus"): + df.rename_axis(bogus=None) + + @pytest.mark.parametrize( + "kwargs, rename_index, rename_columns", + [ + ({"mapper": None, "axis": 0}, True, False), + ({"mapper": None, "axis": 1}, False, True), + ({"index": None}, True, False), + ({"columns": None}, False, True), + ({"index": None, "columns": None}, True, True), + ({}, False, False), + ], + ) + def test_rename_axis_none(self, kwargs, rename_index, rename_columns): + # GH 25034 + index = Index(list("abc"), name="foo") + columns = Index(["col1", "col2"], name="bar") + data = np.arange(6).reshape(3, 2) + df = DataFrame(data, index, columns) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if rename_index else index + expected_columns = columns.rename(None) if rename_columns else columns + expected = DataFrame(data, expected_index, expected_columns) + tm.assert_frame_equal(result, expected) + + def test_rename_multiindex(self): + + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) + df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) + + # + # without specifying level -> across all levels + + renamed = df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] + ) + new_columns = MultiIndex.from_tuples( + 
[("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + tm.assert_index_equal(renamed.index, new_index) + tm.assert_index_equal(renamed.columns, new_columns) + assert renamed.index.names == df.index.names + assert renamed.columns.names == df.columns.names + + # + # with specifying a level (GH13766) + + # dict + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") + tm.assert_index_equal(renamed.columns, new_columns) + + new_columns = MultiIndex.from_tuples( + [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") + tm.assert_index_equal(renamed.columns, new_columns) + + # function + func = str.upper + new_columns = MultiIndex.from_tuples( + [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns=func, level=0) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns=func, level="fizz") + tm.assert_index_equal(renamed.columns, new_columns) + + new_columns = MultiIndex.from_tuples( + [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns=func, level=1) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns=func, level="buzz") + tm.assert_index_equal(renamed.columns, new_columns) + + # index + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] + ) + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) + tm.assert_index_equal(renamed.index, new_index) + + def test_rename_nocopy(self, float_frame): + renamed = float_frame.rename(columns={"C": "foo"}, copy=False) + renamed["foo"] = 1.0 + assert (float_frame["C"] == 1.0).all() + + def test_rename_inplace(self, float_frame): + float_frame.rename(columns={"C": "foo"}) + assert "C" in float_frame + assert "foo" not in float_frame + + c_id = id(float_frame["C"]) + float_frame = float_frame.copy() + float_frame.rename(columns={"C": "foo"}, inplace=True) + + assert "C" not in float_frame + assert "foo" in float_frame + assert id(float_frame["foo"]) != c_id + + def test_rename_bug(self): + # GH 5344 + # rename set ref_locs, and set_index was not resetting + df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) + df = df.set_index(["a", "b"]) + df.columns = ["2001-01-01"] + expected = DataFrame( + [[1], [2]], + index=MultiIndex.from_tuples( + [("foo", "bah"), ("bar", "bas")], names=["a", "b"] + ), + columns=["2001-01-01"], + ) + tm.assert_frame_equal(df, expected) + + def test_rename_bug2(self): + # GH 19497 + # rename was changing Index to MultiIndex if Index contained tuples + + df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"]) + df = df.rename({(1, 1): (5, 4)}, axis="index") + expected = DataFrame( + data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"] + ) + tm.assert_frame_equal(df, expected) + + def test_rename_errors_raises(self): + df = DataFrame(columns=["A", "B", "C", "D"]) + with pytest.raises(KeyError, match="'E'] not 
found in axis"): + df.rename(columns={"A": "a", "E": "e"}, errors="raise") + + @pytest.mark.parametrize( + "mapper, errors, expected_columns", + [ + ({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]), + ({"A": "a"}, "raise", ["a", "B", "C", "D"]), + (str.lower, "raise", ["a", "b", "c", "d"]), + ], + ) + def test_rename_errors(self, mapper, errors, expected_columns): + # GH 13473 + # rename now works with errors parameter + df = DataFrame(columns=["A", "B", "C", "D"]) + result = df.rename(columns=mapper, errors=errors) + expected = DataFrame(columns=expected_columns) + tm.assert_frame_equal(result, expected) + + def test_reorder_levels(self): + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) + + # no change, position + result = df.reorder_levels([0, 1, 2]) + tm.assert_frame_equal(df, result) + + # no change, labels + result = df.reorder_levels(["L0", "L1", "L2"]) + tm.assert_frame_equal(df, result) + + # rotate, position + result = df.reorder_levels([1, 2, 0]) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + tm.assert_frame_equal(result, expected) + + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex( + levels=[["bar"], ["bar"], ["bar"]], + codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], + names=["L0", "L0", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + tm.assert_frame_equal(result, expected) + + result = df.reorder_levels(["L0", "L0", "L0"]) + tm.assert_frame_equal(result, expected) + + def test_reset_index(self, float_frame): + stacked = float_frame.stack()[::2] + stacked = DataFrame({"foo": stacked, "bar": stacked}) + + names = ["first", "second"] + stacked.index.names = names + deleveled = stacked.reset_index() + for i, (lev, level_codes) in enumerate( + zip(stacked.index.levels, stacked.index.codes) + ): + values = lev.take(level_codes) + name = names[i] + tm.assert_index_equal(values, Index(deleveled[name])) + + stacked.index.names = [None, None] + deleveled2 = stacked.reset_index() + tm.assert_series_equal( + deleveled["first"], deleveled2["level_0"], check_names=False + ) + tm.assert_series_equal( + deleveled["second"], deleveled2["level_1"], check_names=False + ) + + # default name assigned + rdf = float_frame.reset_index() + exp = Series(float_frame.index.values, name="index") + tm.assert_series_equal(rdf["index"], exp) + + # default name assigned, corner case + df = float_frame.copy() + df["index"] = "foo" + rdf = df.reset_index() + exp = Series(float_frame.index.values, name="level_0") + tm.assert_series_equal(rdf["level_0"], exp) + + # but this is ok + float_frame.index.name = "index" + deleveled = float_frame.reset_index() + tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) + tm.assert_index_equal(deleveled.index, Index(np.arange(len(deleveled)))) + + # preserve column names + float_frame.columns.name = "columns" + resetted = float_frame.reset_index() + assert resetted.columns.name == "columns" + + # only remove certain columns + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index(["A", "B"]) + + # TODO should reset_index check_names ? 
+ tm.assert_frame_equal(rs, float_frame, check_names=False) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) + + rs = df.reset_index("A") + xp = float_frame.reset_index().set_index(["index", "B"]) + tm.assert_frame_equal(rs, xp, check_names=False) + + # test resetting in place + df = float_frame.copy() + resetted = float_frame.reset_index() + df.reset_index(inplace=True) + tm.assert_frame_equal(df, resetted, check_names=False) + + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index("A", drop=True) + xp = float_frame.copy() + del xp["A"] + xp = xp.set_index(["B"], append=True) + tm.assert_frame_equal(rs, xp, check_names=False) + + def test_reset_index_name(self): + df = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + index=Index(range(2), name="x"), + ) + assert df.reset_index().index.name is None + assert df.reset_index(drop=True).index.name is None + df.reset_index(inplace=True) + assert df.index.name is None + + def test_reset_index_level(self): + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) + + for levels in ["A", "B"], [0, 1]: + # With MultiIndex + result = df.set_index(["A", "B"]).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = df.set_index(["A", "B"]).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = df.set_index(["A", "B"]).reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C", "D"]]) + + # With single-level Index (GH 16263) + result = df.set_index("A").reset_index(level=levels[0]) + tm.assert_frame_equal(result, df) + + result = df.set_index("A").reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A"]).reset_index(level=levels[0], drop=True) + tm.assert_frame_equal(result, df[["B", "C", "D"]]) + + # Missing levels - for both MultiIndex and single-level Index: + for idx_lev in ["A", "B"], ["A"]: + with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"): + df.set_index(idx_lev).reset_index(level=["A", "E"]) + with pytest.raises(IndexError, match="Too many levels"): + df.set_index(idx_lev).reset_index(level=[0, 1, 2]) + + def test_reset_index_right_dtype(self): + time = np.arange(0.0, 10, np.sqrt(2) / 2) + s1 = Series( + (9.81 * time ** 2) / 2, index=Index(time, name="time"), name="speed" + ) + df = DataFrame(s1) + + resetted = s1.reset_index() + assert resetted["time"].dtype == np.float64 + + resetted = df.reset_index() + assert resetted["time"].dtype == np.float64 + + def test_reset_index_multiindex_col(self): + vals = np.random.randn(3, 3).astype(object) + idx = ["x", "y", "z"] + full = np.hstack(([[x] for x in idx], vals)) + df = DataFrame( + vals, + Index(idx, name="a"), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index() + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_fill=None) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_level=1, col_fill="blah") + xp = DataFrame( + full, columns=[["blah", "b", "b", "c"], 
["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + df = DataFrame( + vals, + MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index("a") + xp = DataFrame( + full, + Index([0, 1, 2], name="d"), + columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill=None) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill="blah", col_level=1) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + def test_reset_index_multiindex_nan(self): + # GH6322, testing reset_index on MultiIndexes + # when we have a nan or all nan + df = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 1, np.nan], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + {"A": [np.nan, "b", "c"], "B": [0, 1, 2], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]}) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [np.nan, np.nan, np.nan], + "C": np.random.rand(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + def test_reset_index_with_datetimeindex_cols(self): + # GH5818 + # + df = DataFrame( + [[1, 2], [3, 4]], + columns=date_range("1/1/2013", "1/2/2013"), + index=["A", "B"], + ) + + result = df.reset_index() + expected = DataFrame( + [["A", 1, 2], ["B", 3, 4]], + columns=["index", datetime(2013, 1, 1), datetime(2013, 1, 2)], + ) + tm.assert_frame_equal(result, expected) + + def test_reset_index_range(self): + # GH 12071 + df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2)) + result = df.reset_index() + assert isinstance(result.index, RangeIndex) + expected = DataFrame( + [[0, 0, 0], [1, 1, 1]], + columns=["index", "A", "B"], + index=RangeIndex(stop=2), + ) + tm.assert_frame_equal(result, expected) + + def test_set_index_names(self): + df = tm.makeDataFrame() + df.index.name = "name" + + assert df.set_index(df.index).index.names == ["name"] + + mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"]) + mi2 = MultiIndex.from_arrays( + df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"] + ) + + df = df.set_index(["A", "B"]) + + assert df.set_index(df.index).index.names == ["A", "B"] + + # Check that set_index isn't converting a MultiIndex into an Index + assert isinstance(df.set_index(df.index).index, MultiIndex) + + # Check actual equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + idx2 = df.index.rename(["C", "D"]) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2) + + def test_rename_objects(self, float_string_frame): + renamed = float_string_frame.rename(columns=str.upper) + + assert "FOO" in renamed + assert "foo" not in renamed + + def test_rename_axis_style(self): + # 
https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) + + result = df.rename(str.lower, axis=1) + tm.assert_frame_equal(result, expected) + + result = df.rename(str.lower, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.rename({"A": "a", "B": "b"}, axis=1) + tm.assert_frame_equal(result, expected) + + result = df.rename({"A": "a", "B": "b"}, axis="columns") + tm.assert_frame_equal(result, expected) + + # Index + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) + result = df.rename(str.lower, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.rename(str.lower, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.rename({"X": "x", "Y": "y"}, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.rename({"X": "x", "Y": "y"}, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.rename(mapper=str.lower, axis="index") + tm.assert_frame_equal(result, expected) + + def test_rename_mapper_multi(self): + df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index( + ["A", "B"] + ) + result = df.rename(str.upper) + expected = df.rename(index=str.upper) + tm.assert_frame_equal(result, expected) + + def test_rename_positional_named(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) + result = df.rename(index=str.lower, columns=str.upper) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) + tm.assert_frame_equal(result, expected) + + def test_rename_axis_style_raises(self): + # see gh-12392 + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"]) + + # Named target and axis + over_spec_msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis=1) + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis="columns") + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(columns=str.lower, axis="columns") + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis=0) + + # Multiple targets and axis + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(str.lower, index=str.lower, axis="columns") + + # Too many targets + over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(str.lower, index=str.lower, columns=str.lower) + + # Duplicates + with pytest.raises(TypeError, match="multiple values"): + df.rename(id, mapper=id) + + def test_reindex_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) + for res in [res2, res3, res4, res5]: + tm.assert_frame_equal(res1, res) + + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + res1 = df.reindex(index=["b", "a"], columns=["e", "d"]) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) 
+ res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + def test_rename_positional_raises(self): + # GH 29136 + df = DataFrame(columns=["A", "B"]) + msg = r"rename\(\) takes from 1 to 2 positional arguments" + + with pytest.raises(TypeError, match=msg): + df.rename(None, str.lower) + + def test_rename_no_mappings_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "must pass an index to rename" + with pytest.raises(TypeError, match=msg): + df.rename() + + with pytest.raises(TypeError, match=msg): + df.rename(None, index=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None, index=None) + + def test_rename_mapper_and_positional_arguments_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=msg): + df.rename({}, index={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}, index={}) + + def test_assign_columns(self, float_frame): + float_frame["hi"] = "there" + + df = float_frame.copy() + df.columns = ["foo", "bar", "baz", "quux", "foo2"] + tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) + tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) + + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) + + def test_rename_signature(self): + sig = inspect.signature(DataFrame.rename) + parameters = set(sig.parameters) + assert parameters == { + "self", + "mapper", + "index", + "columns", + "axis", + "inplace", + "copy", + "level", + "errors", + } + + def test_reindex_signature(self): + sig = inspect.signature(DataFrame.reindex) + parameters = set(sig.parameters) + assert parameters == { + "self", + "labels", + "index", + "columns", + "axis", + "limit", + "copy", + "level", + "method", + "fill_value", + "tolerance", + } + + def test_droplevel(self): + # GH20342 + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + df = df.set_index([0, 1]).rename_axis(["a", "b"]) + df.columns = MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) + + # test that dropping of a level in index works + expected = df.reset_index("a", drop=True) + result = df.droplevel("a", axis="index") + tm.assert_frame_equal(result, expected) + + # test that dropping of a level in columns works + expected = df.copy() + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") + tm.assert_frame_equal(result, expected) + + +class TestIntervalIndex: + def test_setitem(self): + + df = DataFrame({"A": range(10)}) + s = cut(df.A, 5) + assert isinstance(s.cat.categories, IntervalIndex) + + # B & D end up as Categoricals + # the remainer are converted to in-line objects + # contining an IntervalIndex.values + df["B"] = s + df["C"] = np.array(s) + df["D"] = s.values + 
df["E"] = np.array(s.values) + + assert is_categorical_dtype(df["B"]) + assert is_interval_dtype(df["B"].cat.categories) + assert is_categorical_dtype(df["D"]) + assert is_interval_dtype(df["D"].cat.categories) + + assert is_object_dtype(df["C"]) + assert is_object_dtype(df["E"]) + + # they compare equal as Index + # when converted to numpy objects + c = lambda x: Index(np.array(x)) + tm.assert_index_equal(c(df.B), c(df.B), check_names=False) + tm.assert_index_equal(c(df.B), c(df.C), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + + # B & D are the same Series + tm.assert_series_equal(df["B"], df["B"], check_names=False) + tm.assert_series_equal(df["B"], df["D"], check_names=False) + + # C & E are the same Series + tm.assert_series_equal(df["C"], df["C"], check_names=False) + tm.assert_series_equal(df["C"], df["E"], check_names=False) + + def test_set_reset_index(self): + + df = DataFrame({"A": range(10)}) + s = cut(df.A, 5) + df["B"] = s + df = df.set_index("B") + + df = df.reset_index() + + def test_set_axis_inplace(self): + # GH14636 + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + + expected = {0: df.copy(), 1: df.copy()} + expected[0].index = list("abc") + expected[1].columns = list("abc") + expected["index"] = expected[0] + expected["columns"] = expected[1] + + for axis in expected: + result = df.copy() + result.set_axis(list("abc"), axis=axis, inplace=True) + tm.assert_frame_equal(result, expected[axis]) + + # inplace=False + result = df.set_axis(list("abc"), axis=axis) + tm.assert_frame_equal(expected[axis], result) + + # omitting the "axis" parameter + with tm.assert_produces_warning(None): + result = df.set_axis(list("abc")) + tm.assert_frame_equal(result, expected[0]) + + # wrong values for the "axis" parameter + for axis in 3, "foo": + with pytest.raises(ValueError, match="No axis named"): + df.set_axis(list("abc"), axis=axis) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_analytics.py b/venv/Lib/site-packages/pandas/tests/frame/test_analytics.py new file mode 100644 index 0000000..25b2997 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_analytics.py @@ -0,0 +1,1272 @@ +from datetime import timedelta +from decimal import Decimal +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, + to_datetime, + to_timedelta, +) +import pandas._testing as tm +import pandas.core.algorithms as algorithms +import pandas.core.nanops as nanops + + +def assert_stat_op_calc( + opname, + alternative, + frame, + has_skipna=True, + check_dtype=True, + check_dates=False, + check_less_precise=False, + skipna_alternative=None, +): + """ + Check that operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + alternative : function + Function that opname is tested against; i.e. "frame.opname()" should + equal "alternative(frame)". + frame : DataFrame + The object that the tests are executed on + has_skipna : bool, default True + Whether the method "opname" has the kwarg "skip_na" + check_dtype : bool, default True + Whether the dtypes of the result of "frame.opname()" and + "alternative(frame)" should be checked. 
+ check_dates : bool, default false + Whether opname should be tested on a Datetime Series + check_less_precise : bool, default False + Whether results should only be compared approximately; + passed on to tm.assert_series_equal + skipna_alternative : function, default None + NaN-safe version of alternative + """ + + f = getattr(frame, opname) + + if check_dates: + df = DataFrame({"b": date_range("1/1/2001", periods=2)}) + result = getattr(df, opname)() + assert isinstance(result, Series) + + df["a"] = range(len(df)) + result = getattr(df, opname)() + assert isinstance(result, Series) + assert len(result) + + if has_skipna: + + def wrapper(x): + return alternative(x.values) + + skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + tm.assert_series_equal( + result0, + frame.apply(wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) + # HACK: win32 + tm.assert_series_equal( + result1, + frame.apply(wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise, + ) + else: + skipna_wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + tm.assert_series_equal( + result0, + frame.apply(skipna_wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) + + if opname in ["sum", "prod"]: + expected = frame.apply(skipna_wrapper, axis=1) + tm.assert_series_equal( + result1, expected, check_dtype=False, check_less_precise=check_less_precise + ) + + # check dtypes + if check_dtype: + lcd_dtype = frame.values.dtype + assert lcd_dtype == result0.dtype + assert lcd_dtype == result1.dtype + + # bad axis + with pytest.raises(ValueError, match="No axis named 2"): + f(axis=2) + + # all NA case + if has_skipna: + all_na = frame * np.NaN + r0 = getattr(all_na, opname)(axis=0) + r1 = getattr(all_na, opname)(axis=1) + if opname in ["sum", "prod"]: + unit = 1 if opname == "prod" else 0 # result for empty sum/prod + expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) + tm.assert_series_equal(r0, expected) + expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) + tm.assert_series_equal(r1, expected) + + +def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False): + """ + Check that API for operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + float_frame : DataFrame + DataFrame with columns of type float + float_string_frame : DataFrame + DataFrame with both float and string columns + has_numeric_only : bool, default False + Whether the method "opname" has the kwarg "numeric_only" + """ + + # make sure works on mixed-type frame + getattr(float_string_frame, opname)(axis=0) + getattr(float_string_frame, opname)(axis=1) + + if has_numeric_only: + getattr(float_string_frame, opname)(axis=0, numeric_only=True) + getattr(float_string_frame, opname)(axis=1, numeric_only=True) + getattr(float_frame, opname)(axis=0, numeric_only=False) + getattr(float_frame, opname)(axis=1, numeric_only=False) + + +def assert_bool_op_calc(opname, alternative, frame, has_skipna=True): + """ + Check that bool operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + alternative : function + Function that opname is tested against; i.e. "frame.opname()" should + equal "alternative(frame)". 
+ frame : DataFrame + The object that the tests are executed on + has_skipna : bool, default True + Whether the method "opname" has the kwarg "skip_na" + """ + + f = getattr(frame, opname) + + if has_skipna: + + def skipna_wrapper(x): + nona = x.dropna().values + return alternative(nona) + + def wrapper(x): + return alternative(x.values) + + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + + tm.assert_series_equal(result0, frame.apply(wrapper)) + tm.assert_series_equal( + result1, frame.apply(wrapper, axis=1), check_dtype=False + ) # HACK: win32 + else: + skipna_wrapper = alternative + wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + + tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) + tm.assert_series_equal( + result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False + ) + + # bad axis + with pytest.raises(ValueError, match="No axis named 2"): + f(axis=2) + + # all NA case + if has_skipna: + all_na = frame * np.NaN + r0 = getattr(all_na, opname)(axis=0) + r1 = getattr(all_na, opname)(axis=1) + if opname == "any": + assert not r0.any() + assert not r1.any() + else: + assert r0.all() + assert r1.all() + + +def assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=False +): + """ + Check that API for boolean operator opname works as advertised on frame + + Parameters + ---------- + opname : string + Name of the operator to test on frame + float_frame : DataFrame + DataFrame with columns of type float + float_string_frame : DataFrame + DataFrame with both float and string columns + has_bool_only : bool, default False + Whether the method "opname" has the kwarg "bool_only" + """ + # make sure op works on mixed-type frame + mixed = float_string_frame + mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5 + getattr(mixed, opname)(axis=0) + getattr(mixed, opname)(axis=1) + + if has_bool_only: + getattr(mixed, opname)(axis=0, bool_only=True) + getattr(mixed, opname)(axis=1, bool_only=True) + getattr(bool_frame_with_na, opname)(axis=0, bool_only=False) + getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) + + +class TestDataFrameAnalytics: + + # --------------------------------------------------------------------- + # Reductions + + def test_stat_op_api(self, float_frame, float_string_frame): + assert_stat_op_api( + "count", float_frame, float_string_frame, has_numeric_only=True + ) + assert_stat_op_api( + "sum", float_frame, float_string_frame, has_numeric_only=True + ) + + assert_stat_op_api("nunique", float_frame, float_string_frame) + assert_stat_op_api("mean", float_frame, float_string_frame) + assert_stat_op_api("product", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) + assert_stat_op_api("min", float_frame, float_string_frame) + assert_stat_op_api("max", float_frame, float_string_frame) + assert_stat_op_api("mad", float_frame, float_string_frame) + assert_stat_op_api("var", float_frame, float_string_frame) + assert_stat_op_api("std", float_frame, float_string_frame) + assert_stat_op_api("sem", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) + + try: + from scipy.stats import skew, kurtosis # noqa:F401 + + assert_stat_op_api("skew", float_frame, float_string_frame) + assert_stat_op_api("kurt", float_frame, float_string_frame) + except ImportError: + pass + + def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame): + def count(s): + return notna(s).sum() + + def nunique(s): + return 
len(algorithms.unique1d(s.dropna())) + + def mad(x): + return np.abs(x - x.mean()).mean() + + def var(x): + return np.var(x, ddof=1) + + def std(x): + return np.std(x, ddof=1) + + def sem(x): + return np.std(x, ddof=1) / np.sqrt(len(x)) + + def skewness(x): + from scipy.stats import skew # noqa:F811 + + if len(x) < 3: + return np.nan + return skew(x, bias=False) + + def kurt(x): + from scipy.stats import kurtosis # noqa:F811 + + if len(x) < 4: + return np.nan + return kurtosis(x, bias=False) + + assert_stat_op_calc( + "nunique", + nunique, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) + + # mixed types (with upcasting happening) + assert_stat_op_calc( + "sum", + np.sum, + mixed_float_frame.astype("float32"), + check_dtype=False, + check_less_precise=True, + ) + + assert_stat_op_calc( + "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum + ) + assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) + assert_stat_op_calc("product", np.prod, float_frame_with_na) + + assert_stat_op_calc("mad", mad, float_frame_with_na) + assert_stat_op_calc("var", var, float_frame_with_na) + assert_stat_op_calc("std", std, float_frame_with_na) + assert_stat_op_calc("sem", sem, float_frame_with_na) + + assert_stat_op_calc( + "count", + count, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) + + try: + from scipy import skew, kurtosis # noqa:F401 + + assert_stat_op_calc("skew", skewness, float_frame_with_na) + assert_stat_op_calc("kurt", kurt, float_frame_with_na) + except ImportError: + pass + + # TODO: Ensure warning isn't emitted in the first place + @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning") + def test_median(self, float_frame_with_na, int_frame): + def wrapper(x): + if isna(x).any(): + return np.nan + return np.median(x) + + assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True) + assert_stat_op_calc( + "median", wrapper, int_frame, check_dtype=False, check_dates=True + ) + + @pytest.mark.parametrize( + "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] + ) + def test_stat_operators_attempt_obj_array(self, method): + # GH#676 + data = { + "a": [ + -0.00049987540199591344, + -0.0016467257772919831, + 0.00067695870775883013, + ], + "b": [-0, -0, 0.0], + "c": [ + 0.00031111847529610595, + 0.0014902627951905339, + -0.00094099200035979691, + ], + } + df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O") + + df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) + + for df in [df1, df2]: + assert df.values.dtype == np.object_ + result = getattr(df, method)(1) + expected = getattr(df.astype("f8"), method)(1) + + if method in ["sum", "prod"]: + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) + def test_mixed_ops(self, op): + # GH#16116 + df = DataFrame( + { + "int": [1, 2, 3, 4], + "float": [1.0, 2.0, 3.0, 4.0], + "str": ["a", "b", "c", "d"], + } + ) + + result = getattr(df, op)() + assert len(result) == 2 + + with pd.option_context("use_bottleneck", False): + result = getattr(df, op)() + assert len(result) == 2 + + def test_reduce_mixed_frame(self): + # GH 6806 + df = DataFrame( + { + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + "string_data": ["a", "b", "c", "d", "e"], + } + ) + df.reindex(columns=["bool_data", "int_data", "string_data"]) + test = df.sum(axis=0) + tm.assert_numpy_array_equal( + 
test.values, np.array([2, 150, "abcde"], dtype=object) + ) + tm.assert_series_equal(test, df.T.sum(axis=1)) + + def test_nunique(self): + df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) + tm.assert_series_equal( + df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) + ) + tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) + tm.assert_series_equal( + df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) + ) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_mean_mixed_datetime_numeric(self, tz): + # https://github.com/pandas-dev/pandas/issues/24752 + df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) + result = df.mean() + expected = pd.Series([1.0], index=["A"]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_mean_excludes_datetimes(self, tz): + # https://github.com/pandas-dev/pandas/issues/24752 + # Our long-term desired behavior is unclear, but the behavior in + # 0.24.0rc1 was buggy. + df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) + result = df.mean() + expected = pd.Series(dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_mean_mixed_string_decimal(self): + # GH 11670 + # possible bug when calculating mean of DataFrame? + + d = [ + {"A": 2, "B": None, "C": Decimal("628.00")}, + {"A": 1, "B": None, "C": Decimal("383.00")}, + {"A": 3, "B": None, "C": Decimal("651.00")}, + {"A": 2, "B": None, "C": Decimal("575.00")}, + {"A": 4, "B": None, "C": Decimal("1114.00")}, + {"A": 1, "B": "TEST", "C": Decimal("241.00")}, + {"A": 2, "B": None, "C": Decimal("572.00")}, + {"A": 4, "B": None, "C": Decimal("609.00")}, + {"A": 3, "B": None, "C": Decimal("820.00")}, + {"A": 5, "B": None, "C": Decimal("1223.00")}, + ] + + df = pd.DataFrame(d) + + result = df.mean() + expected = pd.Series([2.7, 681.6], index=["A", "C"]) + tm.assert_series_equal(result, expected) + + def test_var_std(self, datetime_frame): + result = datetime_frame.std(ddof=4) + expected = datetime_frame.apply(lambda x: x.std(ddof=4)) + tm.assert_almost_equal(result, expected) + + result = datetime_frame.var(ddof=4) + expected = datetime_frame.apply(lambda x: x.var(ddof=4)) + tm.assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nanvar(arr, axis=0) + assert not (result < 0).any() + + with pd.option_context("use_bottleneck", False): + result = nanops.nanvar(arr, axis=0) + assert not (result < 0).any() + + @pytest.mark.parametrize("meth", ["sem", "var", "std"]) + def test_numeric_only_flag(self, meth): + # GH 9201 + df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) + # set one entry to a number in str format + df1.loc[0, "foo"] = "100" + + df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) + # set one entry to a non-number str + df2.loc[0, "foo"] = "a" + + result = getattr(df1, meth)(axis=1, numeric_only=True) + expected = getattr(df1[["bar", "baz"]], meth)(axis=1) + tm.assert_series_equal(expected, result) + + result = getattr(df2, meth)(axis=1, numeric_only=True) + expected = getattr(df2[["bar", "baz"]], meth)(axis=1) + tm.assert_series_equal(expected, result) + + # df1 has all numbers, df2 has a letter inside + msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" + with pytest.raises(TypeError, match=msg): + getattr(df1, meth)(axis=1, numeric_only=False) + msg = "could not convert 
string to float: 'a'" + with pytest.raises(TypeError, match=msg): + getattr(df2, meth)(axis=1, numeric_only=False) + + def test_sem(self, datetime_frame): + result = datetime_frame.sem(ddof=4) + expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) + tm.assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nansem(arr, axis=0) + assert not (result < 0).any() + + with pd.option_context("use_bottleneck", False): + result = nanops.nansem(arr, axis=0) + assert not (result < 0).any() + + @td.skip_if_no_scipy + def test_kurt(self): + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + df = DataFrame(np.random.randn(6, 3), index=index) + + kurt = df.kurt() + kurt2 = df.kurt(level=0).xs("bar") + tm.assert_series_equal(kurt, kurt2, check_names=False) + assert kurt.name is None + assert kurt2.name == "bar" + + @pytest.mark.parametrize( + "dropna, expected", + [ + ( + True, + { + "A": [12], + "B": [10.0], + "C": [1.0], + "D": ["a"], + "E": Categorical(["a"], categories=["a"]), + "F": to_datetime(["2000-1-2"]), + "G": to_timedelta(["1 days"]), + }, + ), + ( + False, + { + "A": [12], + "B": [10.0], + "C": [np.nan], + "D": np.array([np.nan], dtype=object), + "E": Categorical([np.nan], categories=["a"]), + "F": [pd.NaT], + "G": to_timedelta([pd.NaT]), + }, + ), + ( + True, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), + "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ( + False, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), + "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["nan", "1 days", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ], + ) + def test_mode_dropna(self, dropna, expected): + + df = DataFrame( + { + "A": [12, 12, 19, 11], + "B": [10, 10, np.nan, 3], + "C": [1, np.nan, np.nan, np.nan], + "D": [np.nan, np.nan, "a", np.nan], + "E": Categorical([np.nan, np.nan, "a", np.nan]), + "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "G": to_timedelta(["1 days", "nan", "nan", "nan"]), + "H": [8, 8, 9, 9], + "I": [9, 9, 8, 8], + "J": [1, 1, np.nan, np.nan], + "K": Categorical(["a", np.nan, "a", np.nan]), + "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), + "N": np.arange(4, dtype="int64"), + } + ) + + result = df[sorted(expected.keys())].mode(dropna=dropna) + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) + expected = DataFrame({"A": ["a", np.nan]}) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = df.mode(dropna=False) + result = result.sort_values(by="A").reset_index(drop=True) + + tm.assert_frame_equal(result, expected) + + def test_operators_timedelta64(self): + df = DataFrame( + dict( + A=date_range("2012-1-1", periods=3, freq="D"), + B=date_range("2012-1-2", periods=3, freq="D"), + C=Timestamp("20120101") - timedelta(minutes=5, seconds=5), + ) + ) + + diffs 
= DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"])) + + # min + result = diffs.min() + assert result[0] == diffs.loc[0, "A"] + assert result[1] == diffs.loc[0, "B"] + + result = diffs.min(axis=1) + assert (result == diffs.loc[0, "B"]).all() + + # max + result = diffs.max() + assert result[0] == diffs.loc[2, "A"] + assert result[1] == diffs.loc[2, "B"] + + result = diffs.max(axis=1) + assert (result == diffs["A"]).all() + + # abs + result = diffs.abs() + result2 = abs(diffs) + expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"])) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + # mixed frame + mixed = diffs.copy() + mixed["C"] = "foo" + mixed["D"] = 1 + mixed["E"] = 1.0 + mixed["F"] = Timestamp("20130101") + + # results in an object array + result = mixed.min() + expected = Series( + [ + pd.Timedelta(timedelta(seconds=5 * 60 + 5)), + pd.Timedelta(timedelta(days=-1)), + "foo", + 1, + 1.0, + Timestamp("20130101"), + ], + index=mixed.columns, + ) + tm.assert_series_equal(result, expected) + + # excludes numeric + result = mixed.min(axis=1) + expected = Series([1, 1, 1.0], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + # works when only those columns are selected + result = mixed[["A", "B"]].min(1) + expected = Series([timedelta(days=-1)] * 3) + tm.assert_series_equal(result, expected) + + result = mixed[["A", "B"]].min() + expected = Series( + [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"] + ) + tm.assert_series_equal(result, expected) + + # GH 3106 + df = DataFrame( + { + "time": date_range("20130102", periods=5), + "time2": date_range("20130105", periods=5), + } + ) + df["off1"] = df["time2"] - df["time"] + assert df["off1"].dtype == "timedelta64[ns]" + + df["off2"] = df["time"] - df["time2"] + df._consolidate_inplace() + assert df["off1"].dtype == "timedelta64[ns]" + assert df["off2"].dtype == "timedelta64[ns]" + + def test_sum_corner(self): + empty_frame = DataFrame() + + axis0 = empty_frame.sum(0) + axis1 = empty_frame.sum(1) + assert isinstance(axis0, Series) + assert isinstance(axis1, Series) + assert len(axis0) == 0 + assert len(axis1) == 0 + + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) + def test_sum_prod_nanops(self, method, unit): + idx = ["a", "b", "c"] + df = pd.DataFrame( + {"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]} + ) + # The default + result = getattr(df, method) + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") + + # min_count=1 + result = getattr(df, method)(min_count=1) + expected = pd.Series([unit, unit, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(df, method)(min_count=0) + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") + tm.assert_series_equal(result, expected) + + result = getattr(df.iloc[1:], method)(min_count=1) + expected = pd.Series([unit, np.nan, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + # min_count > 1 + df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) + result = getattr(df, method)(min_count=5) + expected = pd.Series(result, index=["A", "B"]) + tm.assert_series_equal(result, expected) + + result = getattr(df, method)(min_count=6) + expected = pd.Series(result, index=["A", "B"]) + tm.assert_series_equal(result, expected) + + def test_sum_nanops_timedelta(self): + # prod isn't defined on timedeltas + idx = ["a", "b", "c"] + df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": 
[np.nan, np.nan]}) + + df2 = df.apply(pd.to_timedelta) + + # 0 by default + result = df2.sum() + expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df2.sum(min_count=0) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df2.sum(min_count=1) + expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx) + tm.assert_series_equal(result, expected) + + def test_sum_object(self, float_frame): + values = float_frame.values.astype(int) + frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns) + deltas = frame * timedelta(1) + deltas.sum() + + def test_sum_bool(self, float_frame): + # ensure this works, bug report + bools = np.isnan(float_frame) + bools.sum(1) + bools.sum(0) + + def test_sum_mixed_datetime(self): + # GH#30886 + df = pd.DataFrame( + {"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]} + ).reindex([2, 3, 4]) + result = df.sum() + + expected = pd.Series({"B": 7.0}) + tm.assert_series_equal(result, expected) + + def test_mean_corner(self, float_frame, float_string_frame): + # unit test when have object data + the_mean = float_string_frame.mean(axis=0) + the_sum = float_string_frame.sum(axis=0, numeric_only=True) + tm.assert_index_equal(the_sum.index, the_mean.index) + assert len(the_mean.index) < len(float_string_frame.columns) + + # xs sum mixed type, just want to know it works... + the_mean = float_string_frame.mean(axis=1) + the_sum = float_string_frame.sum(axis=1, numeric_only=True) + tm.assert_index_equal(the_sum.index, the_mean.index) + + # take mean of boolean column + float_frame["bool"] = float_frame["A"] > 0 + means = float_frame.mean(0) + assert means["bool"] == float_frame["bool"].values.mean() + + def test_mean_datetimelike(self): + # GH#24757 check that datetimelike are excluded by default, handled + # correctly with numeric_only=True + + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) + result = df.mean(numeric_only=True) + expected = pd.Series({"A": 1.0}) + tm.assert_series_equal(result, expected) + + result = df.mean() + expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail( + reason="casts to object-dtype and then tries to add timestamps", + raises=TypeError, + strict=True, + ) + def test_mean_datetimelike_numeric_only_false(self): + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) + + result = df.mean(numeric_only=False) + expected = pd.Series( + {"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"], "D": df.loc[1, "D"]} + ) + tm.assert_series_equal(result, expected) + + def test_stats_mixed_type(self, float_string_frame): + # don't blow up + float_string_frame.std(1) + float_string_frame.var(1) + float_string_frame.mean(1) + float_string_frame.skew(1) + + def test_sum_bools(self): + df = DataFrame(index=range(1), columns=range(10)) + bools = isna(df) + assert bools.sum(axis=1)[0] == 10 + + # ---------------------------------------------------------------------- + # Index of max / min + + def test_idxmin(self, float_frame, int_frame): + frame = float_frame + frame.loc[5:10] = np.nan + frame.loc[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in 
[frame, int_frame]: + result = df.idxmin(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) + tm.assert_series_equal(result, expected) + + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + frame.idxmin(axis=2) + + def test_idxmax(self, float_frame, int_frame): + frame = float_frame + frame.loc[5:10] = np.nan + frame.loc[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in [frame, int_frame]: + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) + tm.assert_series_equal(result, expected) + + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + frame.idxmax(axis=2) + + # ---------------------------------------------------------------------- + # Logical reductions + + @pytest.mark.parametrize("opname", ["any", "all"]) + def test_any_all(self, opname, bool_frame_with_na, float_string_frame): + assert_bool_op_calc( + opname, getattr(np, opname), bool_frame_with_na, has_skipna=True + ) + assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=True + ) + + def test_any_all_extra(self): + df = DataFrame( + { + "A": [True, False, False], + "B": [True, True, False], + "C": [True, True, True], + }, + index=["a", "b", "c"], + ) + result = df[["A", "B"]].any(1) + expected = Series([True, True, False], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + result = df[["A", "B"]].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result = df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[["C"]].all(axis=None).item() + assert result is True + + def test_any_datetime(self): + + # GH 23070 + float_data = [1, np.nan, 3, np.nan] + datetime_data = [ + pd.Timestamp("1960-02-15"), + pd.Timestamp("1960-02-16"), + pd.NaT, + pd.NaT, + ] + df = DataFrame({"A": float_data, "B": datetime_data}) + + result = df.any(1) + expected = Series([True, True, True, False]) + tm.assert_series_equal(result, expected) + + def test_any_all_bool_only(self): + + # GH 25101 + df = DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + ) + + result = df.all(bool_only=True) + expected = Series(dtype=np.bool) + tm.assert_series_equal(result, expected) + + df = DataFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + "col3": [None, None, None], + "col4": [False, False, True], + } + ) + + result = df.all(bool_only=True) + expected = Series({"col4": False}) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "func, data, expected", + [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {"A": []}, False), + (np.all, {"A": []}, True), + (np.any, {"A": [False, False]}, False), + (np.all, {"A": [False, False]}, False), + (np.any, {"A": [True, False]}, True), + (np.all, {"A": [True, False]}, False), + (np.any, {"A": [True, True]}, True), + (np.all, {"A": [True, True]}, True), + (np.any, {"A": [False], "B": [False]}, False), + (np.all, {"A": [False], "B": [False]}, False), + (np.any, {"A": [False, False], "B": [False, True]}, True), + (np.all, {"A": [False, False], "B": [False, True]}, False), + # other types + (np.all, {"A": 
pd.Series([0.0, 1.0], dtype="float")}, False), + (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), + (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), + (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), + pytest.param( + np.all, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([0, 1], dtype="m8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), + (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), + (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), + (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), + # Mix GH#21484 + pytest.param( + np.all, + { + "A": pd.Series([10, 20], dtype="M8[ns]"), + "B": pd.Series([10, 20], dtype="m8[ns]"), + }, + True, + # In 1.13.3 and 1.14 np.all(df) returns a Timedelta here + marks=[td.skip_if_np_lt("1.15")], + ), + ], + ) + def test_any_all_np_func(self, func, data, expected): + # GH 19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # GH 19976 + result = np.all(DataFrame(columns=["a", "b"])).item() + assert result is True + + result = np.any(DataFrame(columns=["a", "b"])).item() + assert result is False + + @pytest.mark.parametrize("method", ["any", "all"]) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product( + [["A", "B"], ["a", "b"]], names=["out", "in"] + ), + ) + xpr = "Must specify 'axis' when aggregating by level." 
+        with pytest.raises(ValueError, match=xpr):
+            getattr(df, method)(axis=None, level="out")
+
+    # ---------------------------------------------------------------------
+    # Matrix-like
+
+    def test_dot(self):
+        a = DataFrame(
+            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
+        )
+        b = DataFrame(
+            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
+        )
+
+        result = a.dot(b)
+        expected = DataFrame(
+            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
+        )
+        # Check alignment
+        b1 = b.reindex(index=reversed(b.index))
+        result = a.dot(b)
+        tm.assert_frame_equal(result, expected)
+
+        # Check series argument
+        result = a.dot(b["one"])
+        tm.assert_series_equal(result, expected["one"], check_names=False)
+        assert result.name is None
+
+        result = a.dot(b1["one"])
+        tm.assert_series_equal(result, expected["one"], check_names=False)
+        assert result.name is None
+
+        # can pass correct-length arrays
+        row = a.iloc[0].values
+
+        result = a.dot(row)
+        expected = a.dot(a.iloc[0])
+        tm.assert_series_equal(result, expected)
+
+        with pytest.raises(ValueError, match="Dot product shape mismatch"):
+            a.dot(row[:-1])
+
+        a = np.random.rand(1, 5)
+        b = np.random.rand(5, 1)
+        A = DataFrame(a)
+
+        # TODO(wesm): unused
+        B = DataFrame(b)  # noqa
+
+        # it works
+        result = A.dot(b)
+
+        # unaligned
+        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
+        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])
+
+        with pytest.raises(ValueError, match="aligned"):
+            df.dot(df2)
+
+    def test_matmul(self):
+        # matmul test is for GH 10259
+        a = DataFrame(
+            np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"]
+        )
+        b = DataFrame(
+            np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"]
+        )
+
+        # DataFrame @ DataFrame
+        result = operator.matmul(a, b)
+        expected = DataFrame(
+            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # DataFrame @ Series
+        result = operator.matmul(a, b.one)
+        expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"])
+        tm.assert_series_equal(result, expected)
+
+        # np.array @ DataFrame
+        result = operator.matmul(a.values, b)
+        assert isinstance(result, DataFrame)
+        assert result.columns.equals(b.columns)
+        assert result.index.equals(pd.Index(range(3)))
+        expected = np.dot(a.values, b.values)
+        tm.assert_almost_equal(result.values, expected)
+
+        # nested list @ DataFrame (__rmatmul__)
+        result = operator.matmul(a.values.tolist(), b)
+        expected = DataFrame(
+            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
+        )
+        tm.assert_almost_equal(result.values, expected.values)
+
+        # mixed dtype DataFrame @ DataFrame
+        a["q"] = a.q.round().astype(int)
+        result = operator.matmul(a, b)
+        expected = DataFrame(
+            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # different dtypes DataFrame @ DataFrame
+        a = a.astype(int)
+        result = operator.matmul(a, b)
+        expected = DataFrame(
+            np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # unaligned
+        df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
+        df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])
+
+        with pytest.raises(ValueError, match="aligned"):
+            operator.matmul(df, df2)
+
+    # ---------------------------------------------------------------------
+    # Unsorted
+
+    def test_series_broadcasting(self):
+        # smoke test for numpy warnings
+        # GH 16378, GH 16306
+        df = DataFrame([1.0, 1.0, 1.0])
+        df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
+        s = Series([1, 1, 1])
+        s_nan = Series([np.nan, np.nan, 1])
+
+        with tm.assert_produces_warning(None):
+            df_nan.clip(lower=s, axis=0)
+            for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
+                getattr(df, op)(s_nan, axis=0)
diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_api.py b/venv/Lib/site-packages/pandas/tests/frame/test_api.py
new file mode 100644
index 0000000..9263409
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/frame/test_api.py
@@ -0,0 +1,561 @@
+from copy import deepcopy
+import datetime
+import pydoc
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY37
+from pandas.util._test_decorators import async_mark
+
+import pandas as pd
+from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range
+import pandas._testing as tm
+
+
+class TestDataFrameMisc:
+    def test_copy_index_name_checking(self, float_frame):
+        # don't want to be able to modify the index stored elsewhere after
+        # making a copy
+        for attr in ("index", "columns"):
+            ind = getattr(float_frame, attr)
+            ind.name = None
+            cp = float_frame.copy()
+            getattr(cp, attr).name = "foo"
+            assert getattr(float_frame, attr).name is None
+
+    def test_getitem_pop_assign_name(self, float_frame):
+        s = float_frame["A"]
+        assert s.name == "A"
+
+        s = float_frame.pop("A")
+        assert s.name == "A"
+
+        s = float_frame.loc[:, "B"]
+        assert s.name == "B"
+
+        s2 = s.loc[:]
+        assert s2.name == "B"
+
+    def test_get_value(self, float_frame):
+        for idx in float_frame.index:
+            for col in float_frame.columns:
+                result = float_frame._get_value(idx, col)
+                expected = float_frame[col][idx]
+                tm.assert_almost_equal(result, expected)
+
+    def test_add_prefix_suffix(self, float_frame):
+        with_prefix = float_frame.add_prefix("foo#")
+        expected = pd.Index(["foo#{c}".format(c=c) for c in float_frame.columns])
+        tm.assert_index_equal(with_prefix.columns, expected)
+
+        with_suffix = float_frame.add_suffix("#foo")
+        expected = pd.Index(["{c}#foo".format(c=c) for c in float_frame.columns])
+        tm.assert_index_equal(with_suffix.columns, expected)
+
+        with_pct_prefix = float_frame.add_prefix("%")
+        expected = pd.Index(["%{c}".format(c=c) for c in float_frame.columns])
+        tm.assert_index_equal(with_pct_prefix.columns, expected)
+
+        with_pct_suffix = float_frame.add_suffix("%")
+        expected = pd.Index(["{c}%".format(c=c) for c in float_frame.columns])
+        tm.assert_index_equal(with_pct_suffix.columns, expected)
+
+    def test_get_axis(self, float_frame):
+        f = float_frame
+        assert f._get_axis_number(0) == 0
+        assert f._get_axis_number(1) == 1
+        assert f._get_axis_number("index") == 0
+        assert f._get_axis_number("rows") == 0
+        assert f._get_axis_number("columns") == 1
+
+        assert f._get_axis_name(0) == "index"
+        assert f._get_axis_name(1) == "columns"
+        assert f._get_axis_name("index") == "index"
+        assert f._get_axis_name("rows") == "index"
+        assert f._get_axis_name("columns") == "columns"
+
+        assert f._get_axis(0) is f.index
+        assert f._get_axis(1) is f.columns
+
+        with pytest.raises(ValueError, match="No axis named"):
+            f._get_axis_number(2)
+
+        with pytest.raises(ValueError, match="No axis.*foo"):
+            f._get_axis_name("foo")
+
+        with pytest.raises(ValueError, match="No axis.*None"):
+            f._get_axis_name(None)
+
+        with pytest.raises(ValueError, match="No axis 
named"): + f._get_axis_number(None) + + def test_keys(self, float_frame): + getkeys = float_frame.keys + assert getkeys() is float_frame.columns + + def test_column_contains_raises(self, float_frame): + with pytest.raises(TypeError, match="unhashable type: 'Index'"): + float_frame.columns in float_frame + + def test_tab_completion(self): + # DataFrame whose columns are identifiers shall have them in __dir__. + df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD")) + for key in list("ABCD"): + assert key in dir(df) + assert isinstance(df.__getitem__("A"), pd.Series) + + # DataFrame whose first-level columns are identifiers shall have + # them in __dir__. + df = pd.DataFrame( + [list("abcd"), list("efgh")], + columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))), + ) + for key in list("ABCD"): + assert key in dir(df) + for key in list("EFGH"): + assert key not in dir(df) + assert isinstance(df.__getitem__("A"), pd.DataFrame) + + def test_not_hashable(self): + empty_frame = DataFrame() + + df = DataFrame([1]) + msg = "'DataFrame' objects are mutable, thus they cannot be hashed" + with pytest.raises(TypeError, match=msg): + hash(df) + with pytest.raises(TypeError, match=msg): + hash(empty_frame) + + def test_new_empty_index(self): + df1 = DataFrame(np.random.randn(0, 3)) + df2 = DataFrame(np.random.randn(0, 3)) + df1.index.name = "foo" + assert df2.index.name is None + + def test_array_interface(self, float_frame): + with np.errstate(all="ignore"): + result = np.sqrt(float_frame) + assert isinstance(result, type(float_frame)) + assert result.index is float_frame.index + assert result.columns is float_frame.columns + + tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) + + def test_get_agg_axis(self, float_frame): + cols = float_frame._get_agg_axis(0) + assert cols is float_frame.columns + + idx = float_frame._get_agg_axis(1) + assert idx is float_frame.index + + msg = r"Axis must be 0 or 1 \(got 2\)" + with pytest.raises(ValueError, match=msg): + float_frame._get_agg_axis(2) + + def test_nonzero(self, float_frame, float_string_frame): + empty_frame = DataFrame() + assert empty_frame.empty + + assert not float_frame.empty + assert not float_string_frame.empty + + # corner case + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3)) + del df["A"] + assert not df.empty + + def test_iteritems(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + for k, v in df.items(): + assert isinstance(v, DataFrame._constructor_sliced) + + def test_items(self): + # GH 17213, GH 13918 + cols = ["a", "b", "c"] + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) + for c, (k, v) in zip(cols, df.items()): + assert c == k + assert isinstance(v, Series) + assert (df[k] == v).all() + + def test_iter(self, float_frame): + assert tm.equalContents(list(float_frame), float_frame.columns) + + def test_iterrows(self, float_frame, float_string_frame): + for k, v in float_frame.iterrows(): + exp = float_frame.loc[k] + tm.assert_series_equal(v, exp) + + for k, v in float_string_frame.iterrows(): + exp = float_string_frame.loc[k] + tm.assert_series_equal(v, exp) + + def test_iterrows_iso8601(self): + # GH 19671 + s = DataFrame( + { + "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], + "iso8601": date_range("2000-01-01", periods=4, freq="M"), + } + ) + for k, v in s.iterrows(): + exp = s.loc[k] + tm.assert_series_equal(v, exp) + + def test_iterrows_corner(self): + # gh-12222 + df = DataFrame( + { + "a": [datetime.datetime(2015, 1, 1)], + "b": 
[None], + "c": [None], + "d": [""], + "e": [[]], + "f": [set()], + "g": [{}], + } + ) + expected = Series( + [datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}], + index=list("abcdefg"), + name=0, + dtype="object", + ) + _, result = next(df.iterrows()) + tm.assert_series_equal(result, expected) + + def test_itertuples(self, float_frame): + for i, tup in enumerate(float_frame.itertuples()): + s = DataFrame._constructor_sliced(tup[1:]) + s.name = tup[0] + expected = float_frame.iloc[i, :].reset_index(drop=True) + tm.assert_series_equal(s, expected) + + df = DataFrame( + {"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"] + ) + + for tup in df.itertuples(index=False): + assert isinstance(tup[1], int) + + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) + dfaa = df[["a", "a"]] + + assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)] + + # repr with int on 32-bit/windows + if not (compat.is_platform_windows() or compat.is_platform_32bit()): + assert ( + repr(list(df.itertuples(name=None))) + == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" + ) + + tup = next(df.itertuples(name="TestName")) + assert tup._fields == ("Index", "a", "b") + assert (tup.Index, tup.a, tup.b) == tup + assert type(tup).__name__ == "TestName" + + df.columns = ["def", "return"] + tup2 = next(df.itertuples(name="TestName")) + assert tup2 == (0, 1, 4) + assert tup2._fields == ("Index", "_1", "_2") + + df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) + # will raise SyntaxError if trying to create namedtuple + tup3 = next(df3.itertuples()) + assert isinstance(tup3, tuple) + if PY37: + assert hasattr(tup3, "_fields") + else: + assert not hasattr(tup3, "_fields") + + # GH 28282 + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert hasattr(result_254_columns, "_fields") + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 + if PY37: + assert hasattr(result_255_columns, "_fields") + else: + assert not hasattr(result_255_columns, "_fields") + + def test_sequence_like_with_categorical(self): + + # GH 7839 + # make sure can iterate + df = DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + df["grade"] = Categorical(df["raw_grade"]) + + # basic sequencing testing + result = list(df.grade.values) + expected = np.array(df.grade.values).tolist() + tm.assert_almost_equal(result, expected) + + # iteration + for t in df.itertuples(index=False): + str(t) + + for row, s in df.iterrows(): + str(s) + + for c, col in df.items(): + str(s) + + def test_len(self, float_frame): + assert len(float_frame) == len(float_frame.index) + + def test_values_mixed_dtypes(self, float_frame, float_string_frame): + frame = float_frame + arr = frame.values + + frame_cols = frame.columns + for i, row in enumerate(arr): + for j, value in enumerate(row): + col = frame_cols[j] + if np.isnan(value): + assert np.isnan(frame[col][i]) + else: + assert value == frame[col][i] + + # mixed type + arr = float_string_frame[["foo", "A"]].values + assert arr[0, 0] == "bar" + + df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) + arr = df.values + assert arr[0, 0] == 1j + + # single block corner case + arr = 
float_frame[["A", "B"]].values + expected = float_frame.reindex(columns=["A", "B"]).values + tm.assert_almost_equal(arr, expected) + + def test_to_numpy(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4.5]]) + result = df.to_numpy() + tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_dtype(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4]], dtype="int64") + result = df.to_numpy(dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_copy(self): + arr = np.random.randn(4, 3) + df = pd.DataFrame(arr) + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr + assert df.to_numpy(copy=True).base is None + + def test_transpose(self, float_frame): + frame = float_frame + dft = frame.T + for idx, series in dft.items(): + for col, value in series.items(): + if np.isnan(value): + assert np.isnan(frame[col][idx]) + else: + assert value == frame[col][idx] + + # mixed type + index, data = tm.getMixedTypeDict() + mixed = DataFrame(data, index=index) + + mixed_T = mixed.T + for col, s in mixed_T.items(): + assert s.dtype == np.object_ + + def test_swapaxes(self): + df = DataFrame(np.random.randn(10, 5)) + tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) + tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) + tm.assert_frame_equal(df, df.swapaxes(0, 0)) + msg = ( + "No axis named 2 for object type" + r" " + ) + with pytest.raises(ValueError, match=msg): + df.swapaxes(2, 5) + + def test_axis_aliases(self, float_frame): + f = float_frame + + # reg name + expected = f.sum(axis=0) + result = f.sum(axis="index") + tm.assert_series_equal(result, expected) + + expected = f.sum(axis=1) + result = f.sum(axis="columns") + tm.assert_series_equal(result, expected) + + def test_class_axis(self): + # GH 18147 + # no exception and no empty docstring + assert pydoc.getdoc(DataFrame.index) + assert pydoc.getdoc(DataFrame.columns) + + def test_more_values(self, float_string_frame): + values = float_string_frame.values + assert values.shape[1] == len(float_string_frame.columns) + + def test_repr_with_mi_nat(self, float_string_frame): + df = DataFrame( + {"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]] + ) + result = repr(df) + expected = " X\nNaT a 1\n2013-01-01 b 2" + assert result == expected + + def test_items_names(self, float_string_frame): + for k, v in float_string_frame.items(): + assert v.name == k + + def test_series_put_names(self, float_string_frame): + series = float_string_frame._series + for k, v in series.items(): + assert v.name == k + + def test_empty_nonzero(self): + df = DataFrame([1, 2, 3]) + assert not df.empty + df = DataFrame(index=[1], columns=[1]) + assert not df.empty + df = DataFrame(index=["a", "b"], columns=["c", "d"]).dropna() + assert df.empty + assert df.T.empty + empty_frames = [ + DataFrame(), + DataFrame(index=[1]), + DataFrame(columns=[1]), + DataFrame({1: []}), + ] + for df in empty_frames: + assert df.empty + assert df.T.empty + + def test_with_datetimelikes(self): + + df = DataFrame( + { + "A": date_range("20130101", periods=10), + "B": timedelta_range("1 day", periods=10), + } + ) + t = df.T + + result = t.dtypes.value_counts() + expected = Series({np.dtype("object"): 10}) + tm.assert_series_equal(result, expected) + + def test_values(self, float_frame): + float_frame.values[:, 0] = 5.0 + assert (float_frame.values[:, 0] == 5).all() + + def test_deepcopy(self, float_frame): + cp = deepcopy(float_frame) + series = 
cp["A"] + series[:] = 10 + for idx, value in series.items(): + assert float_frame["A"][idx] != value + + def test_transpose_get_view(self, float_frame): + dft = float_frame.T + dft.values[:, 5:10] = 5 + + assert (float_frame.values[5:10] == 5).all() + + def test_inplace_return_self(self): + # GH 1893 + + data = DataFrame( + {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]} + ) + + def _check_f(base, f): + result = f(base) + assert result is None + + # -----DataFrame----- + + # set_index + f = lambda x: x.set_index("a", inplace=True) + _check_f(data.copy(), f) + + # reset_index + f = lambda x: x.reset_index(inplace=True) + _check_f(data.set_index("a"), f) + + # drop_duplicates + f = lambda x: x.drop_duplicates(inplace=True) + _check_f(data.copy(), f) + + # sort + f = lambda x: x.sort_values("b", inplace=True) + _check_f(data.copy(), f) + + # sort_index + f = lambda x: x.sort_index(inplace=True) + _check_f(data.copy(), f) + + # fillna + f = lambda x: x.fillna(0, inplace=True) + _check_f(data.copy(), f) + + # replace + f = lambda x: x.replace(1, 0, inplace=True) + _check_f(data.copy(), f) + + # rename + f = lambda x: x.rename({1: "foo"}, inplace=True) + _check_f(data.copy(), f) + + # -----Series----- + d = data.copy()["c"] + + # reset_index + f = lambda x: x.reset_index(inplace=True, drop=True) + _check_f(data.set_index("a")["c"], f) + + # fillna + f = lambda x: x.fillna(0, inplace=True) + _check_f(d.copy(), f) + + # replace + f = lambda x: x.replace(1, 0, inplace=True) + _check_f(d.copy(), f) + + # rename + f = lambda x: x.rename({1: "foo"}, inplace=True) + _check_f(d.copy(), f) + + @async_mark() + async def test_tab_complete_warning(self, ip): + # GH 16409 + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; df = pd.DataFrame()" + await ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter("ignore"): + list(ip.Completer.completions("df.", 1)) + + def test_attrs(self): + df = pd.DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["version"] = 1 + + result = df.rename(columns=str) + assert result.attrs == {"version": 1} diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_apply.py b/venv/Lib/site-packages/pandas/tests/frame/test_apply.py new file mode 100644 index 0000000..e98f74e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_apply.py @@ -0,0 +1,1406 @@ +from collections import OrderedDict +from datetime import datetime +from itertools import chain +import operator +import warnings + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna +import pandas._testing as tm +from pandas.conftest import _get_cython_table_params +from pandas.core.apply import frame_apply +from pandas.core.base import SpecificationError + + +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + return df + + +class TestDataFrameApply: + def test_apply(self, float_frame): + with np.errstate(all="ignore"): + # ufunc + applied = float_frame.apply(np.sqrt) + tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) + + # aggregator + applied = 
float_frame.apply(np.mean) + assert applied["A"] == np.mean(float_frame["A"]) + + d = float_frame.index[0] + applied = float_frame.apply(np.mean, axis=1) + assert applied[d] == np.mean(float_frame.xs(d)) + assert applied.index is float_frame.index # want this + + # invalid axis + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + with pytest.raises(ValueError): + df.apply(lambda x: x, 2) + + # GH 9573 + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + df = df.apply(lambda ts: ts.astype("category")) + + assert df.shape == (4, 2) + assert isinstance(df["c0"].dtype, CategoricalDtype) + assert isinstance(df["c1"].dtype, CategoricalDtype) + + def test_apply_mixed_datetimelike(self): + # mixed datetimelike + # GH 7778 + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": pd.to_timedelta(np.arange(3), unit="s"), + } + ) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + + def test_apply_empty(self, float_frame): + # empty + empty_frame = DataFrame() + + applied = empty_frame.apply(np.sqrt) + assert applied.empty + + applied = empty_frame.apply(np.mean) + assert applied.empty + + no_rows = float_frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + no_cols = float_frame.loc[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=float_frame.index) + tm.assert_series_equal(result, expected) + + # GH 2476 + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1) + tm.assert_frame_equal(expected, result) + + def test_apply_with_reduce_empty(self): + # reduce with an empty DataFrame + empty_frame = DataFrame() + + x = [] + result = empty_frame.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_frame) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") + tm.assert_frame_equal(result, empty_with_cols) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) + + # Ensure that x.append hasn't been called + assert x == [] + + @pytest.mark.parametrize("func", ["sum", "prod", "any", "all"]) + def test_apply_funcs_over_empty(self, func): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.apply(getattr(np, func)) + expected = getattr(df, func)() + tm.assert_series_equal(result, expected) + + def test_nunique_empty(self): + # GH 28213 + df = DataFrame(columns=["a", "b", "c"]) + + result = df.nunique() + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + result = df.T.nunique() + expected = Series([], index=pd.Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_apply_standard_nonunique(self): + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) + + result = df.apply(lambda s: s[0], axis=1) + expected = Series([1, 4, 7], ["a", "a", "c"]) + tm.assert_series_equal(result, expected) + + result = df.T.apply(lambda s: s[0], axis=0) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["sum", 
"mean", "min", "max", "std"]) + @pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], + ) + def test_apply_with_string_funcs(self, float_frame, func, args, kwds): + result = float_frame.apply(func, *args, **kwds) + expected = getattr(float_frame, func)(*args, **kwds) + tm.assert_series_equal(result, expected) + + def test_apply_broadcast(self, float_frame, int_frame_const_col): + + # scalars + result = float_frame.apply(np.mean, result_type="broadcast") + expected = DataFrame([float_frame.mean()], index=float_frame.index) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(np.mean, axis=1, result_type="broadcast") + m = float_frame.mean(axis=1) + expected = DataFrame({c: m for c in float_frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = float_frame.apply( + lambda x: list(range(len(float_frame.columns))), + axis=1, + result_type="broadcast", + ) + m = list(range(len(float_frame.columns))) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) + m = list(range(len(float_frame.index))) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) + tm.assert_frame_equal(result, expected) + + # preserve columns + df = int_frame_const_col + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + tm.assert_frame_equal(result, df) + + df = int_frame_const_col + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + def test_apply_broadcast_error(self, int_frame_const_col): + df = int_frame_const_col + + # > 1 ndim + with pytest.raises(ValueError): + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type="broadcast", + ) + + # cannot broadcast + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") + + with pytest.raises(ValueError): + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") + + def test_apply_raw(self, float_frame): + result0 = float_frame.apply(np.mean, raw=True) + result1 = float_frame.apply(np.mean, axis=1, raw=True) + + expected0 = float_frame.apply(lambda x: x.values.mean()) + expected1 = float_frame.apply(lambda x: x.values.mean(), axis=1) + + tm.assert_series_equal(result0, expected0) + tm.assert_series_equal(result1, expected1) + + # no reduction + result = float_frame.apply(lambda x: x * 2, raw=True) + expected = float_frame * 2 + tm.assert_frame_equal(result, expected) + + def test_apply_axis1(self, float_frame): + d = float_frame.index[0] + tapplied = float_frame.apply(np.mean, axis=1) + assert tapplied[d] == np.mean(float_frame.xs(d)) + + def test_apply_ignore_failures(self, float_string_frame): + result = frame_apply( + float_string_frame, np.mean, 0, ignore_failures=True + ).apply_standard() + expected = float_string_frame._get_numeric_data().apply(np.mean) + tm.assert_series_equal(result, expected) + + def test_apply_mixed_dtype_corner(self): + df = 
DataFrame({"A": ["foo"], "B": [1.0]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? + expected = Series(np.nan, index=pd.Index([], dtype="int64")) + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) + tm.assert_series_equal(result, expected) + + def test_apply_empty_infer_type(self): + no_cols = DataFrame(index=["a", "b", "c"]) + no_index = DataFrame(columns=["a", "b", "c"]) + + def _check(df, f): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + test_res = f(np.array([], dtype="f8")) + is_reduction = not isinstance(test_res, np.ndarray) + + def _checkit(axis=0, raw=False): + result = df.apply(f, axis=axis, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + assert isinstance(result, Series) + assert result.index is agg_axis + else: + assert isinstance(result, DataFrame) + + _checkit() + _checkit(axis=1) + _checkit(raw=True) + _checkit(axis=0, raw=True) + + with np.errstate(all="ignore"): + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) + + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") + assert isinstance(result, DataFrame) + + def test_apply_with_args_kwds(self, float_frame): + def add_some(x, howmuch=0): + return x + howmuch + + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch + + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + + result = float_frame.apply(add_some, howmuch=2) + expected = float_frame.apply(lambda x: x + 2) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(agg_and_add, howmuch=2) + expected = float_frame.apply(lambda x: x.mean() + 2) + tm.assert_series_equal(result, expected) + + result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) + expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) + tm.assert_frame_equal(result, expected) + + def test_apply_yield_list(self, float_frame): + result = float_frame.apply(list) + tm.assert_frame_equal(result, float_frame) + + def test_apply_reduce_Series(self, float_frame): + float_frame.loc[::2, "A"] = np.nan + expected = float_frame.mean(1) + result = float_frame.apply(np.mean, axis=1) + tm.assert_series_equal(result, expected) + + def test_apply_reduce_rows_to_dict(self): + # GH 25196 + data = pd.DataFrame([[1, 2], [3, 4]]) + expected = pd.Series([{0: 1, 1: 3}, {0: 2, 1: 4}]) + result = data.apply(dict) + tm.assert_series_equal(result, expected) + + def test_apply_differently_indexed(self): + df = DataFrame(np.random.randn(20, 10)) + + result0 = df.apply(Series.describe, axis=0) + expected0 = DataFrame( + {i: v.describe() for i, v in df.items()}, columns=df.columns + ) + tm.assert_frame_equal(result0, expected0) + + result1 = df.apply(Series.describe, axis=1) + expected1 = DataFrame( + {i: v.describe() for i, v in df.T.items()}, columns=df.index + ).T + tm.assert_frame_equal(result1, expected1) + + def test_apply_modify_traceback(self): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + 
"two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + data.loc[4, "C"] = np.nan + + def transform(row): + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + def transform2(row): + if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 + return row + + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): + data.apply(transform, axis=1) + + def test_apply_bug(self): + + # GH 6125 + positions = pd.DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + + def f(r): + return r["market"] + + expected = positions.apply(f, axis=1) + + positions = DataFrame( + [ + [datetime(2013, 1, 1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) + result = positions.apply(f, axis=1) + tm.assert_series_equal(result, expected) + + def test_apply_convert_objects(self): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + result = data.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result._convert(datetime=True), data) + + def test_apply_attach_name(self, float_frame): + result = float_frame.apply(lambda x: x.name) + expected = Series(float_frame.columns, index=float_frame.columns) + tm.assert_series_equal(result, expected) + + result = float_frame.apply(lambda x: x.name, axis=1) + expected = Series(float_frame.index, index=float_frame.index) + tm.assert_series_equal(result, expected) + + # non-reductions + result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) + expected.index = float_frame.index + tm.assert_series_equal(result, expected) + + def test_apply_multi_index(self, float_frame): + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"] + ) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_apply_dict(self): + + # GH 8735 + A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) + A_dicts = Series( + [dict([(0, "foo"), (1, "spam")]), dict([(0, "bar"), (1, "eggs")])] + ) + B = DataFrame([[0, 1], [2, 3]]) + B_dicts = 
Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) + fn = lambda x: x.to_dict() + + for df, dicts in [(A, A_dicts), (B, B_dicts)]: + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") + reduce_none = df.apply(fn) + + tm.assert_series_equal(reduce_true, dicts) + tm.assert_frame_equal(reduce_false, df) + tm.assert_series_equal(reduce_none, dicts) + + def test_applymap(self, float_frame): + applied = float_frame.applymap(lambda x: x * 2) + tm.assert_frame_equal(applied, float_frame * 2) + float_frame.applymap(type) + + # GH 465: function returning tuples + result = float_frame.applymap(lambda x: (x, x)) + assert isinstance(result["A"][0], tuple) + + # GH 2909: object conversion to float in constructor? + df = DataFrame(data=[1, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object + + df = DataFrame(data=[1.0, "a"]) + result = df.applymap(lambda x: x) + assert result.dtypes[0] == object + + # GH 2786 + df = DataFrame(np.random.random((3, 4))) + df2 = df.copy() + cols = ["a", "a", "a", "a"] + df.columns = cols + + expected = df2.applymap(str) + expected.columns = cols + result = df.applymap(str) + tm.assert_frame_equal(result, expected) + + # datetime/timedelta + df["datetime"] = Timestamp("20130101") + df["timedelta"] = pd.Timedelta("1 min") + result = df.applymap(str) + for f in ["datetime", "timedelta"]: + assert result.loc[0, f] == str(df.loc[0, f]) + + # GH 8222 + empty_frames = [ + pd.DataFrame(), + pd.DataFrame(columns=list("ABC")), + pd.DataFrame(index=list("ABC")), + pd.DataFrame({"A": [], "B": [], "C": []}), + ] + for frame in empty_frames: + for func in [round, lambda x: x]: + result = frame.applymap(func) + tm.assert_frame_equal(result, frame) + + def test_applymap_box_timestamps(self): + # GH 2689, GH 2627 + ser = pd.Series(date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + pd.DataFrame(ser).applymap(func) + + def test_applymap_box(self): + # ufunc will not be boxed. 
Same test cases as the test_map_box + df = pd.DataFrame( + { + "a": [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) + + result = df.applymap(lambda x: type(x).__name__) + expected = pd.DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) + tm.assert_frame_equal(result, expected) + + def test_frame_apply_dont_convert_datetime64(self): + from pandas.tseries.offsets import BDay + + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) + + df = df.applymap(lambda x: x + BDay()) + df = df.applymap(lambda x: x + BDay()) + + assert df.x1.dtype == "M8[ns]" + + def test_apply_non_numpy_dtype(self): + # GH 12244 + df = DataFrame( + {"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")} + ) + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") + result = df.apply(lambda x: x) + tm.assert_frame_equal(result, df) + + def test_apply_dup_names_multi_agg(self): + # GH 21063 + df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = pd.DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) + + tm.assert_frame_equal(result, expected) + + def test_apply_nested_result_axis_1(self): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + + df = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) + + +class TestInferOutputShape: + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # GH 17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + tm.assert_series_equal(result, expected) + + # compose a series + result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) + tm.assert_series_equal(result, expected) + + # GH 18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + tm.assert_series_equal(result, expected) + + def 
test_with_dictlike_columns_with_infer(self): + # GH 17602 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) + expected = DataFrame({"s": [3, 3]}) + tm.assert_frame_equal(result, expected) + + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) + tm.assert_frame_equal(result, expected) + + def test_with_listlike_columns(self): + # GH 17348 + df = DataFrame( + { + "a": Series(np.random.randn(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="H"), + } + ) + + result = df[["a", "b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) + tm.assert_series_equal(result, expected) + + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 18919 + df = DataFrame( + {"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])} + ) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) + + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) + tm.assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # GH 18573 + + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + pd.Timestamp("2017-11-29 03:30:00"), + pd.Timestamp("2017-11-29 03:45:00"), + ], + } + ) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([(t.number, t.string) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # GH 16353 + + df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + # GH 17970 + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) + tm.assert_series_equal(result, expected) + + # GH 17892 + df = pd.DataFrame( + { + "a": [ + pd.Timestamp("2010-02-01"), + pd.Timestamp("2010-02-04"), + pd.Timestamp("2010-02-05"), + pd.Timestamp("2010-02-06"), + ], + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + 
tm.assert_series_equal(result, expected) + + def test_consistent_names(self, int_frame_const_col): + # if a Series is returned, we should use the resulting index names + df = int_frame_const_col + + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] + tm.assert_frame_equal(result, expected) + + def test_result_type(self, int_frame_const_col): + # result_type should be consistent no matter which + # path we take in the code + df = int_frame_const_col + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + expected = df.copy() + expected.columns = [0, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + expected = df[["A", "B"]].copy() + expected.columns = [0, 1] + tm.assert_frame_equal(result, expected) + + # broadcast result + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + expected = df.copy() + tm.assert_frame_equal(result, expected) + + columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + ) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + tm.assert_frame_equal(result, expected) + + # series result with other index + columns = ["other", "col", "names"] + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + expected = df.copy() + expected.columns = columns + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("result_type", ["foo", 1]) + def test_result_type_error(self, result_type, int_frame_const_col): + # allowed result_type + df = int_frame_const_col + + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) + + @pytest.mark.parametrize( + "box", + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], + ) + def test_consistency_for_boxed(self, box, int_frame_const_col): + # passing an array or list should not affect the output shape + df = int_frame_const_col + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + tm.assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) + tm.assert_frame_equal(result, expected) + + +def zip_frames(frames, axis=1): + """ + take a list of frames, zip them together under the + assumption that these all have the first frames' index/columns. 
+ + Returns + ------- + new_frame : DataFrame + """ + if axis == 1: + columns = frames[0].columns + zipped = [f.loc[:, c] for c in columns for f in frames] + return pd.concat(zipped, axis=1) + else: + index = frames[0].index + zipped = [f.loc[i, :] for i in index for f in frames] + return pd.DataFrame(zipped) + + +class TestDataFrameAggregate: + def test_agg_transform(self, axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt.copy() + tm.assert_frame_equal(result, expected) + + result = float_frame.apply(np.sqrt, axis=axis) + tm.assert_frame_equal(result, expected) + + result = float_frame.transform(np.sqrt, axis=axis) + tm.assert_frame_equal(result, expected) + + # list-like + result = float_frame.apply([np.sqrt], axis=axis) + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product( + [float_frame.index, ["sqrt"]] + ) + tm.assert_frame_equal(result, expected) + + result = float_frame.transform([np.sqrt], axis=axis) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + result = float_frame.apply([np.abs, np.sqrt], axis=axis) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + tm.assert_frame_equal(result, expected) + + result = float_frame.transform([np.abs, "sqrt"], axis=axis) + tm.assert_frame_equal(result, expected) + + def test_transform_and_agg_err(self, axis, float_frame): + # cannot both transform and agg + with pytest.raises(ValueError): + float_frame.transform(["max", "min"], axis=axis) + + with pytest.raises(ValueError): + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) + + with pytest.raises(ValueError): + with np.errstate(all="ignore"): + float_frame.transform(["max", "sqrt"], axis=axis) + + df = pd.DataFrame({"A": range(5), "B": 5}) + + def f(): + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) + + @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) + def test_transform_method_name(self, method): + # GH 19760 + df = pd.DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) + + def test_demo(self): + # demonstration tests + df = pd.DataFrame({"A": range(5), "B": 5}) + + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) + tm.assert_frame_equal(result, expected) + + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) + tm.assert_frame_equal(result.reindex_like(expected), expected) + + def test_agg_multiple_mixed_no_warning(self): + # GH 20909 + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", 
[... remainder of the vendored pandas DataFrame aggregation/apply test module (text flattened in extraction) omitted ...]
diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_arithmetic.py b/venv/Lib/site-packages/pandas/tests/frame/test_arithmetic.py
new file mode 100644
index 0000000..659b557
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/frame/test_arithmetic.py
@@ -0,0 +1,739 @@
[... 739 lines of vendored pandas DataFrame comparison/arithmetic tests, added with the committed virtual environment; contents omitted ...]
diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_axis_select_reindex.py b/venv/Lib/site-packages/pandas/tests/frame/test_axis_select_reindex.py
new file mode 100644
index 0000000..7effa98
--- /dev/null
+++ b/venv/Lib/site-packages/pandas/tests/frame/test_axis_select_reindex.py
@@ -0,0 +1,1154 @@
[... 1154 lines of vendored pandas drop/reindex/align/filter tests, added with the committed virtual environment; contents omitted ...]
GH 21494 + with pytest.raises(KeyError, match="not found in axis"): + pd.DataFrame(index=index).drop(drop_labels) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_block_internals.py b/venv/Lib/site-packages/pandas/tests/frame/test_block_internals.py new file mode 100644 index 0000000..d301ed9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_block_internals.py @@ -0,0 +1,624 @@ +from datetime import datetime, timedelta +from io import StringIO +import itertools + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Series, + Timestamp, + compat, + date_range, + option_context, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray, integer_array +from pandas.core.internals import ObjectBlock +from pandas.core.internals.blocks import IntBlock + +# Segregated collection of methods that require the BlockManager internal data +# structure + + +class TestDataFrameBlockInternals: + def test_setitem_invalidates_datetime_index_freq(self): + # GH#24096 altering a datetime64tz column inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex + + dti = date_range("20130101", periods=3, tz="US/Eastern") + ts = dti[1] + + df = DataFrame({"B": dti}) + assert df["B"]._values.freq == "D" + + df.iloc[1, 0] = pd.NaT + assert df["B"]._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert dti.freq == "D" + assert dti[1] == ts + + def test_cast_internals(self, float_frame): + casted = DataFrame(float_frame._data, dtype=int) + expected = DataFrame(float_frame._series, dtype=int) + tm.assert_frame_equal(casted, expected) + + casted = DataFrame(float_frame._data, dtype=np.int32) + expected = DataFrame(float_frame._series, dtype=np.int32) + tm.assert_frame_equal(casted, expected) + + def test_consolidate(self, float_frame): + float_frame["E"] = 7.0 + consolidated = float_frame._consolidate() + assert len(consolidated._data.blocks) == 1 + + # Ensure copy, do I want this? 
+ recons = consolidated._consolidate() + assert recons is not consolidated + tm.assert_frame_equal(recons, consolidated) + + float_frame["F"] = 8.0 + assert len(float_frame._data.blocks) == 3 + + float_frame._consolidate(inplace=True) + assert len(float_frame._data.blocks) == 1 + + def test_consolidate_inplace(self, float_frame): + frame = float_frame.copy() # noqa + + # triggers in-place consolidation + for letter in range(ord("A"), ord("Z")): + float_frame[chr(letter)] = chr(letter) + + def test_values_consolidate(self, float_frame): + float_frame["E"] = 7.0 + assert not float_frame._data.is_consolidated() + _ = float_frame.values # noqa + assert float_frame._data.is_consolidated() + + def test_modify_values(self, float_frame): + float_frame.values[5] = 5 + assert (float_frame.values[5] == 5).all() + + # unconsolidated + float_frame["E"] = 7.0 + float_frame.values[6] = 6 + assert (float_frame.values[6] == 6).all() + + def test_boolean_set_uncons(self, float_frame): + float_frame["E"] = 7.0 + + expected = float_frame.values.copy() + expected[expected > 1] = 2 + + float_frame[float_frame > 1] = 2 + tm.assert_almost_equal(expected, float_frame.values) + + def test_values_numeric_cols(self, float_frame): + float_frame["foo"] = "bar" + + values = float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + def test_values_lcd(self, mixed_float_frame, mixed_int_frame): + + # mixed lcd + values = mixed_float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_float_frame[["A", "B", "C"]].values + assert values.dtype == np.float32 + + values = mixed_float_frame[["C"]].values + assert values.dtype == np.float16 + + # GH 10364 + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_int_frame[["A", "D"]].values + assert values.dtype == np.int64 + + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C"]].values + assert values.dtype == np.float64 + + # as B and C are both unsigned, no forcing to float is needed + values = mixed_int_frame[["B", "C"]].values + assert values.dtype == np.uint64 + + values = mixed_int_frame[["A", "C"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C", "D"]].values + assert values.dtype == np.int64 + + values = mixed_int_frame[["A"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C"]].values + assert values.dtype == np.uint8 + + def test_constructor_with_convert(self): + # this is actually mostly a test of lib.maybe_convert_objects + # #2845 + df = DataFrame({"A": [2 ** 63 - 1]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63 - 1], np.int64), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [2 ** 63]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63], np.uint64), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [datetime(2005, 1, 1), True]}) + result = df["A"] + expected = Series( + np.asarray([datetime(2005, 1, 1), True], np.object_), name="A" + ) + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [None, 1]}) + result = df["A"] + expected = Series(np.asarray([np.nan, 1], np.float_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1.0, 2]}) + result = df["A"] + expected = Series(np.asarray([1.0, 2], np.float_), name="A") + tm.assert_series_equal(result, expected) + + 
df = DataFrame({"A": [1.0 + 2.0j, 3]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1.0 + 2.0j, 3.0]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1.0 + 2.0j, True]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1.0, None]}) + result = df["A"] + expected = Series(np.asarray([1.0, np.nan], np.float_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [1.0 + 2.0j, None]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [2.0, 1, True, None]}) + result = df["A"] + expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A") + tm.assert_series_equal(result, expected) + + df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]}) + result = df["A"] + expected = Series( + np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A" + ) + tm.assert_series_equal(result, expected) + + def test_construction_with_mixed(self, float_string_frame): + # test construction edge cases with mixed types + + # f7u12, this does not work without extensive workaround + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] + df = DataFrame(data) + + # check dtypes + result = df.dtypes + expected = Series({"datetime64[ns]": 3}) + + # mixed-type frames + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + assert float_string_frame["datetime"].dtype == "M8[ns]" + assert float_string_frame["timedelta"].dtype == "m8[ns]" + result = float_string_frame.dtypes + expected = Series( + [np.dtype("float64")] * 4 + + [ + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + ], + index=list("ABCD") + ["foo", "datetime", "timedelta"], + ) + tm.assert_series_equal(result, expected) + + def test_construction_with_conversions(self): + + # convert from a numpy array of non-ns timedelta64 + arr = np.array([1, 2, 3], dtype="timedelta64[s]") + df = DataFrame(index=range(3)) + df["A"] = arr + expected = DataFrame( + {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) + ) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + { + "dt1": Timestamp("20130101"), + "dt2": date_range("20130101", periods=3), + # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), + }, + index=range(3), + ) + + df = DataFrame(index=range(3)) + df["dt1"] = np.datetime64("2013-01-01") + df["dt2"] = np.array( + ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" + ) + + # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 + # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') + + tm.assert_frame_equal(df, expected) + + def test_constructor_compound_dtypes(self): + # GH 5191 + # compound dtypes should raise not-implementederror + + def f(dtype): + data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9)) + return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype) + + msg = "compound dtypes are not implemented in the DataFrame constructor" + with pytest.raises(NotImplementedError, match=msg): + f([("A", 
"datetime64[h]"), ("B", "str"), ("C", "int32")]) + + # these work (though results may be unexpected) + f("int64") + f("float64") + + # 10822 + # invalid error message on dt inference + if not compat.is_platform_windows(): + f("M8[ns]") + + def test_equals_different_blocks(self): + # GH 9330 + df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) + df1 = df0.reset_index()[["A", "B", "C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + assert df0._data.blocks[0].dtype != df1._data.blocks[0].dtype + + # do the real tests + tm.assert_frame_equal(df0, df1) + assert df0.equals(df1) + assert df1.equals(df0) + + def test_copy_blocks(self, float_frame): + # API/ENH 9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the default copy=True, change a column + blocks = df._to_dict_of_blocks(copy=True) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did not change the original DataFrame + assert not _df[column].equals(df[column]) + + def test_no_copy_blocks(self, float_frame): + # API/ENH 9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + # use the copy=False, change a column + blocks = df._to_dict_of_blocks(copy=False) + for dtype, _df in blocks.items(): + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + # make sure we did change the original DataFrame + assert _df[column].equals(df[column]) + + def test_copy(self, float_frame, float_string_frame): + cop = float_frame.copy() + cop["E"] = cop["A"] + assert "E" not in float_frame + + # copy objects + copy = float_string_frame.copy() + assert copy._data is not float_string_frame._data + + def test_pickle(self, float_string_frame, timezone_frame): + empty_frame = DataFrame() + + unpickled = tm.round_trip_pickle(float_string_frame) + tm.assert_frame_equal(float_string_frame, unpickled) + + # buglet + float_string_frame._data.ndim + + # empty + unpickled = tm.round_trip_pickle(empty_frame) + repr(unpickled) + + # tz frame + unpickled = tm.round_trip_pickle(timezone_frame) + tm.assert_frame_equal(timezone_frame, unpickled) + + def test_consolidate_datetime64(self): + # numpy vstack bug + + data = """\ +starting,ending,measure +2012-06-21 00:00,2012-06-23 07:00,77 +2012-06-23 07:00,2012-06-23 16:30,65 +2012-06-23 16:30,2012-06-25 08:00,77 +2012-06-25 08:00,2012-06-26 12:00,0 +2012-06-26 12:00,2012-06-27 08:00,77 +""" + df = pd.read_csv(StringIO(data), parse_dates=[0, 1]) + + ser_starting = df.starting + ser_starting.index = ser_starting.values + ser_starting = ser_starting.tz_localize("US/Eastern") + ser_starting = ser_starting.tz_convert("UTC") + ser_starting.index.name = "starting" + + ser_ending = df.ending + ser_ending.index = ser_ending.values + ser_ending = ser_ending.tz_localize("US/Eastern") + ser_ending = ser_ending.tz_convert("UTC") + ser_ending.index.name = "ending" + + df.starting = ser_starting.index + df.ending = ser_ending.index + + tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index) + tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index) + + def test_is_mixed_type(self, float_frame, float_string_frame): + assert not float_frame._is_mixed_type + assert float_string_frame._is_mixed_type + + def test_get_numeric_data(self): + # TODO(wesm): unused? 
+ intname = np.dtype(np.int_).name # noqa + floatname = np.dtype(np.float_).name # noqa + + datetime64name = np.dtype("M8[ns]").name + objectname = np.dtype(np.object_).name + + df = DataFrame( + {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, + index=np.arange(10), + ) + result = df.dtypes + expected = Series( + [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype(objectname), + np.dtype(datetime64name), + ], + index=["a", "b", "c", "f"], + ) + tm.assert_series_equal(result, expected) + + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "d": np.array([1.0] * 10, dtype="float32"), + "e": np.array([1] * 10, dtype="int32"), + "f": np.array([1] * 10, dtype="int16"), + "g": Timestamp("20010102"), + }, + index=np.arange(10), + ) + + result = df._get_numeric_data() + expected = df.loc[:, ["a", "b", "d", "e", "f"]] + tm.assert_frame_equal(result, expected) + + only_obj = df.loc[:, ["c", "g"]] + result = only_obj._get_numeric_data() + expected = df.loc[:, []] + tm.assert_frame_equal(result, expected) + + df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) + result = df._get_numeric_data() + expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) + tm.assert_frame_equal(result, expected) + + df = result.copy() + result = df._get_numeric_data() + expected = df + tm.assert_frame_equal(result, expected) + + def test_get_numeric_data_extension_dtype(self): + # GH 22290 + df = DataFrame( + { + "A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"), + "B": Categorical(list("abcabc")), + "C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"), + "D": IntervalArray.from_breaks(range(7)), + } + ) + result = df._get_numeric_data() + expected = df.loc[:, ["A", "C"]] + tm.assert_frame_equal(result, expected) + + def test_convert_objects(self, float_string_frame): + + oops = float_string_frame.T.T + converted = oops._convert(datetime=True) + tm.assert_frame_equal(converted, float_string_frame) + assert converted["A"].dtype == np.float64 + + # force numeric conversion + float_string_frame["H"] = "1." + float_string_frame["I"] = "1" + + # add in some items that will be nan + length = len(float_string_frame) + float_string_frame["J"] = "1." 
+ float_string_frame["K"] = "1" + float_string_frame.loc[0:5, ["J", "K"]] = "garbled" + converted = float_string_frame._convert(datetime=True, numeric=True) + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + assert converted["J"].dtype == "float64" + assert converted["K"].dtype == "float64" + assert len(converted["J"].dropna()) == length - 5 + assert len(converted["K"].dropna()) == length - 5 + + # via astype + converted = float_string_frame.copy() + converted["H"] = converted["H"].astype("float64") + converted["I"] = converted["I"].astype("int64") + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + + # via astype, but errors + converted = float_string_frame.copy() + with pytest.raises(ValueError, match="invalid literal"): + converted["H"].astype("int32") + + # mixed in a single column + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) + result = df._convert(datetime=True, numeric=True) + expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) + tm.assert_frame_equal(result, expected) + + def test_convert_objects_no_conversion(self): + mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) + mixed2 = mixed1._convert(datetime=True) + tm.assert_frame_equal(mixed1, mixed2) + + def test_infer_objects(self): + # GH 11221 + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + df = df.iloc[1:].infer_objects() + + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + # reconstruct frame to verify inference is same + tm.assert_frame_equal(df.reset_index(drop=True), expected) + + def test_stale_cached_series_bug_473(self): + + # this is chained, but ok + with option_context("chained_assignment", None): + Y = DataFrame( + np.random.random((4, 4)), + index=("a", "b", "c", "d"), + columns=("e", "f", "g", "h"), + ) + repr(Y) + Y["e"] = Y["e"].astype("object") + Y["g"]["c"] = np.NaN + repr(Y) + result = Y.sum() # noqa + exp = Y["g"].sum() # noqa + assert pd.isna(Y["g"]["c"]) + + def test_get_X_columns(self): + # numeric and object columns + + df = DataFrame( + { + "a": [1, 2, 3], + "b": [True, False, True], + "c": ["foo", "bar", "baz"], + "d": [None, None, None], + "e": [3.14, 0.577, 2.773], + } + ) + + tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"])) + + def test_strange_column_corruption_issue(self): + # (wesm) Unclear how exactly this is related to internal matters + df = DataFrame(index=[0, 1]) + df[0] = np.nan + wasCol = {} + + for i, dt in enumerate(df.index): + for col in range(100, 200): + if col not in wasCol: + wasCol[col] = 1 + df[col] = np.nan + df[col][dt] = i + + myid = 100 + + first = len(df.loc[pd.isna(df[myid]), [myid]]) + second = len(df.loc[pd.isna(df[myid]), [myid]]) + assert first == second == 0 + + def test_constructor_no_pandas_array(self): + # Ensure that PandasArray isn't allowed inside Series + # See https://github.com/pandas-dev/pandas/issues/23995 for more. 
+ arr = pd.Series([1, 2, 3]).array + result = pd.DataFrame({"A": arr}) + expected = pd.DataFrame({"A": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + assert isinstance(result._data.blocks[0], IntBlock) + + def test_add_column_with_pandas_array(self): + # GH 26390 + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) + df2 = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), + } + ) + assert type(df["c"]._data.blocks[0]) == ObjectBlock + assert type(df2["c"]._data.blocks[0]) == ObjectBlock + tm.assert_frame_equal(df, df2) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_combine_concat.py b/venv/Lib/site-packages/pandas/tests/frame/test_combine_concat.py new file mode 100644 index 0000000..9bad54b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_combine_concat.py @@ -0,0 +1,798 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameConcatCommon: + def test_concat_multiple_frames_dtypes(self): + + # GH 2759 + A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) + B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) + results = pd.concat((A, B), axis=1).dtypes + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2, + index=["foo", "bar", 0, 1], + ) + tm.assert_series_equal(results, expected) + + @pytest.mark.parametrize( + "data", + [ + pd.date_range("2000", periods=4), + pd.date_range("2000", periods=4, tz="US/Central"), + pd.period_range("2000", periods=4), + pd.timedelta_range(0, periods=4), + ], + ) + def test_combine_datetlike_udf(self, data): + # https://github.com/pandas-dev/pandas/issues/23079 + df = pd.DataFrame({"A": data}) + other = df.copy() + df.iloc[1, 0] = None + + def combiner(a, b): + return b + + result = df.combine(other, combiner) + tm.assert_frame_equal(result, other) + + def test_concat_multiple_tzs(self): + # GH 12467 + # combining datetime tz-aware and naive DataFrames + ts1 = Timestamp("2015-01-01", tz=None) + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="EST") + + df1 = DataFrame(dict(time=[ts1])) + df2 = DataFrame(dict(time=[ts2])) + df3 = DataFrame(dict(time=[ts3])) + + results = pd.concat([df1, df2]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts2]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df1, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts1, ts3]), dtype=object) + tm.assert_frame_equal(results, expected) + + results = pd.concat([df2, df3]).reset_index(drop=True) + expected = DataFrame(dict(time=[ts2, ts3])) + tm.assert_frame_equal(results, expected) + + @pytest.mark.parametrize( + "t1", + [ + "2015-01-01", + pytest.param( + pd.NaT, + marks=pytest.mark.xfail( + reason="GH23037 incorrect dtype when concatenating" + ), + ), + ], + ) + def test_concat_tz_NaT(self, t1): + # GH 22796 + # Concating tz-aware multicolumn DataFrames + ts1 = Timestamp(t1, tz="UTC") + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="UTC") + + df1 = DataFrame([[ts1, ts2]]) + df2 = DataFrame([[ts3]]) + + result = pd.concat([df1, df2]) + expected = DataFrame([[ts1, ts2], [ts3, pd.NaT]], index=[0, 0]) + + tm.assert_frame_equal(result, expected) 
+ + def test_concat_tz_not_aligned(self): + # GH 22796 + ts = pd.to_datetime([1, 2]).tz_localize("UTC") + a = pd.DataFrame({"A": ts}) + b = pd.DataFrame({"A": ts, "B": ts}) + result = pd.concat([a, b], sort=True, ignore_index=True) + expected = pd.DataFrame( + {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} + ) + tm.assert_frame_equal(result, expected) + + def test_concat_tuple_keys(self): + # GH 14438 + df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB")) + df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) + results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) + expected = pd.DataFrame( + { + "A": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + "B": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + } + ) + tm.assert_frame_equal(results, expected) + + def test_update(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_dtypes(self): + + # gh 3016 + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + df.update(other) + + expected = DataFrame( + [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + tm.assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.0]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) + with pytest.raises(ValueError, match="Data overlaps"): + df.update(other, errors="raise") + + def test_update_from_non_df(self): + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} + df = DataFrame(d) + + d["a"] = Series([5, 6, 7, 8]) + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, 
expected) + + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} + df = DataFrame(d) + + d["a"] = [5, 6, 7, 8] + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + def test_update_datetime_tz(self): + # GH 25807 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + result.update(result) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) + tm.assert_frame_equal(result, expected) + + def test_join_str_datetime(self): + str_dates = ["20120209", "20120222"] + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + A = DataFrame(str_dates, index=range(2), columns=["aa"]) + C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) + + tst = A.join(C, on="aa") + + assert len(tst.columns) == 3 + + def test_join_multiindex_leftright(self): + # GH 10741 + df1 = pd.DataFrame( + [ + ["a", "x", 0.471780], + ["a", "y", 0.774908], + ["a", "z", 0.563634], + ["b", "x", -0.353756], + ["b", "y", 0.368062], + ["b", "z", -1.721840], + ["c", "x", 1], + ["c", "y", 2], + ["c", "z", 3], + ], + columns=["first", "second", "value1"], + ).set_index(["first", "second"]) + + df2 = pd.DataFrame( + [["a", 10], ["b", 20]], columns=["first", "value2"] + ).set_index(["first"]) + + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + [1.000000, np.nan], + [2.000000, np.nan], + [3.000000, np.nan], + ], + index=df1.index, + columns=["value1", "value2"], + ) + + # these must be the same results (but columns are flipped) + tm.assert_frame_equal(df1.join(df2, how="left"), exp) + tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) + + exp_idx = pd.MultiIndex.from_product( + [["a", "b"], ["x", "y", "z"]], names=["first", "second"] + ) + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + ], + index=exp_idx, + columns=["value1", "value2"], + ) + + tm.assert_frame_equal(df1.join(df2, how="right"), exp) + tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) + + def test_concat_named_keys(self): + # GH 14252 + df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) + index = Index(["a", "b"], name="baz") + concatted_named_from_keys = pd.concat([df, df], keys=index) + expected_named = pd.DataFrame( + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), + ) + tm.assert_frame_equal(concatted_named_from_keys, expected_named) + + index_no_name = Index(["a", "b"], name=None) + concatted_named_from_names = pd.concat( + [df, df], keys=index_no_name, names=["baz"] + ) + tm.assert_frame_equal(concatted_named_from_names, expected_named) + + concatted_unnamed = pd.concat([df, df], keys=index_no_name) + expected_unnamed = pd.DataFrame( + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), + ) + tm.assert_frame_equal(concatted_unnamed, expected_unnamed) + + def test_concat_axis_parameter(self): + # GH 14369 + df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2)) + df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2)) + + # Index/row/0 DataFrame + expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) + + concatted_index = pd.concat([df1, df2], axis="index") + tm.assert_frame_equal(concatted_index, expected_index) + + concatted_row = pd.concat([df1, df2], axis="rows") + tm.assert_frame_equal(concatted_row, 
expected_index) + + concatted_0 = pd.concat([df1, df2], axis=0) + tm.assert_frame_equal(concatted_0, expected_index) + + # Columns/1 DataFrame + expected_columns = pd.DataFrame( + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] + ) + + concatted_columns = pd.concat([df1, df2], axis="columns") + tm.assert_frame_equal(concatted_columns, expected_columns) + + concatted_1 = pd.concat([df1, df2], axis=1) + tm.assert_frame_equal(concatted_1, expected_columns) + + series1 = pd.Series([0.1, 0.2]) + series2 = pd.Series([0.3, 0.4]) + + # Index/row/0 Series + expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) + + concatted_index_series = pd.concat([series1, series2], axis="index") + tm.assert_series_equal(concatted_index_series, expected_index_series) + + concatted_row_series = pd.concat([series1, series2], axis="rows") + tm.assert_series_equal(concatted_row_series, expected_index_series) + + concatted_0_series = pd.concat([series1, series2], axis=0) + tm.assert_series_equal(concatted_0_series, expected_index_series) + + # Columns/1 Series + expected_columns_series = pd.DataFrame( + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] + ) + + concatted_columns_series = pd.concat([series1, series2], axis="columns") + tm.assert_frame_equal(concatted_columns_series, expected_columns_series) + + concatted_1_series = pd.concat([series1, series2], axis=1) + tm.assert_frame_equal(concatted_1_series, expected_columns_series) + + # Testing ValueError + with pytest.raises(ValueError, match="No axis named"): + pd.concat([series1, series2], axis="something") + + def test_concat_numerical_names(self): + # #15262 # #12223 + df = pd.DataFrame( + {"col": range(9)}, + dtype="int32", + index=( + pd.MultiIndex.from_product( + [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2] + ) + ), + ) + result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) + expected = pd.DataFrame( + {"col": [0, 1, 7, 8]}, + dtype="int32", + index=pd.MultiIndex.from_tuples( + [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_astype_dup_col(self): + # gh 23049 + df = pd.DataFrame([{"a": "b"}]) + df = pd.concat([df, df], axis=1) + + result = df.astype("category") + expected = pd.DataFrame( + np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] + ).astype("category") + tm.assert_frame_equal(result, expected) + + +class TestDataFrameCombineFirst: + def test_combine_first_mixed(self): + a = Series(["a", "b"], index=range(2)) + b = Series(range(2), index=range(2)) + f = DataFrame({"A": a, "B": b}) + + a = Series(["a", "b"], index=range(5, 7)) + b = Series(range(2), index=range(5, 7)) + g = DataFrame({"A": a, "B": b}) + + exp = pd.DataFrame( + {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] + ) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self, float_frame): + # disjoint + head, tail = float_frame[:5], float_frame[5:] + + combined = head.combine_first(tail) + reordered_frame = float_frame.reindex(combined.index) + tm.assert_frame_equal(combined, reordered_frame) + assert tm.equalContents(combined.columns, float_frame.columns) + tm.assert_series_equal(combined["A"], reordered_frame["A"]) + + # same index + fcopy = float_frame.copy() + fcopy["A"] = 1 + del fcopy["C"] + + fcopy2 = float_frame.copy() + fcopy2["B"] = 0 + del fcopy2["D"] + + combined = fcopy.combine_first(fcopy2) + + assert (combined["A"] == 1).all() + tm.assert_series_equal(combined["B"], 
fcopy["B"]) + tm.assert_series_equal(combined["C"], fcopy2["C"]) + tm.assert_series_equal(combined["D"], fcopy["D"]) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head["A"] = 1 + + combined = head.combine_first(tail) + assert (combined["A"][:10] == 1).all() + + # reverse overlap + tail["A"][:10] = 0 + combined = tail.combine_first(head) + assert (combined["A"][:10] == 0).all() + + # no overlap + f = float_frame[:10] + g = float_frame[10:] + combined = f.combine_first(g) + tm.assert_series_equal(combined["A"].reindex(f.index), f["A"]) + tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) + + # corner cases + comb = float_frame.combine_first(DataFrame()) + tm.assert_frame_equal(comb, float_frame) + + comb = DataFrame().combine_first(float_frame) + tm.assert_frame_equal(comb, float_frame) + + comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) + assert "faz" in comb.index + + # #2525 + df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame(columns=["b"]) + result = df.combine_first(df2) + assert "b" in result + + def test_combine_first_mixed_bug(self): + idx = Index(["a", "b", "c", "e"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "e"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) + + idx = Index(["a", "b", "c", "f"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "f"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) + + combined = frame1.combine_first(frame2) + assert len(combined.columns) == 5 + + # gh 3016 (same as in update) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + result = df.combine_first(other) + tm.assert_frame_equal(result, df) + + df.loc[0, "A"] = np.nan + result = df.combine_first(other) + df.loc[0, "A"] = 45 + tm.assert_frame_equal(result, df) + + # doc example + df1 = DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + + df2 = DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) + + result = df1.combine_first(df2) + expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) + tm.assert_frame_equal(result, expected) + + # GH3552, return object dtype with bools + df1 = DataFrame( + [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] + ) + df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + tm.assert_series_equal(result, expected) + + # GH 3593, converting datetime64[ns] incorrectly + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + tm.assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) + + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, :] + tm.assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + tm.assert_frame_equal(df2, df0) + 
+ def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = pd.DataFrame([[4], [5]], columns=["b"]) + assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["b"].dtype == "int64" + + res = dfa.combine_first(dfb) + exp = pd.DataFrame( + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, + columns=["a", "b"], + ) + tm.assert_frame_equal(res, exp) + assert res["a"].dtype == "datetime64[ns]" + # ToDo: this must be int64 + assert res["b"].dtype == "float64" + + res = dfa.iloc[:0].combine_first(dfb) + exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + tm.assert_frame_equal(res, exp) + # ToDo: this must be datetime64 + assert res["a"].dtype == "float64" + # ToDo: this must be int64 + assert res["b"].dtype == "int64" + + def test_combine_first_timezone(self): + # see gh-7630 + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + df1 = pd.DataFrame( + columns=["UTCdatetime", "abc"], + data=data1, + index=pd.date_range("20140627", periods=1), + ) + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + df2 = pd.DataFrame( + columns=["UTCdatetime", "xyz"], + data=data2, + index=pd.date_range("20140628", periods=1), + ) + res = df2[["UTCdatetime"]].combine_first(df1) + exp = pd.DataFrame( + { + "UTCdatetime": [ + pd.Timestamp("2010-01-01 01:01", tz="UTC"), + pd.Timestamp("2012-12-12 12:12", tz="UTC"), + ], + "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], + }, + columns=["UTCdatetime", "abc"], + index=pd.date_range("20140627", periods=2, freq="D"), + ) + tm.assert_frame_equal(res, exp) + assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" + assert res["abc"].dtype == "datetime64[ns, UTC]" + + # see gh-10567 + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + df2 = pd.DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == "datetime64[ns, UTC]" + + dts1 = pd.DatetimeIndex( + ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" + ) + df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex( + ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" + ) + df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex( + [ + "2011-01-01", + "2012-01-01", + "NaT", + "2012-01-02", + "2011-01-03", + "2011-01-04", + ], + tz="US/Eastern", + ) + exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # different tz + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05") + df2 = pd.DataFrame({"DATE": dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-01", "2015-01-03") + df2 = pd.DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-03"), + ] + exp = pd.DataFrame({"DATE": exp_dts}) + tm.assert_frame_equal(res, exp) + assert res["DATE"].dtype == 
"object" + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) + df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) + df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex( + ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] + ) + exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["TD"].dtype == "timedelta64[ns]" + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") + df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") + df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex( + ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" + ) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == data1.dtype + + # different freq + dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") + df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Period("2011-01", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.NaT, + pd.Period("2012-01-02", freq="D"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == "object" + + def test_combine_first_int(self): + # GH14687 - integer series that do no align exactly + + df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["a"].dtype == "int64" + + @pytest.mark.parametrize("val", [1, 1.0]) + def test_combine_first_with_asymmetric_other(self, val): + # see gh-20699 + df1 = pd.DataFrame({"isNum": [val]}) + df2 = pd.DataFrame({"isBool": [True]}) + + res = df1.combine_first(df2) + exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) + + tm.assert_frame_equal(res, exp) + + def test_concat_datetime_datetime64_frame(self): + # #2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), "hi"]) + + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({"date": ind, "test": range(10)}) + + # it works! 
+ pd.concat([df1, df2_obj]) + + +class TestDataFrameUpdate: + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_constructors.py b/venv/Lib/site-packages/pandas/tests/frame/test_constructors.py new file mode 100644 index 0000000..ea1e339 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_constructors.py @@ -0,0 +1,2569 @@ +from collections import OrderedDict, abc +from datetime import date, datetime, timedelta +import functools +import itertools + +import numpy as np +import numpy.ma as ma +import numpy.ma.mrecords as mrecords +import pytest + +from pandas.compat import is_platform_little_endian + +from pandas.core.dtypes.common import is_integer_dtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, + Timedelta, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm +from pandas.arrays import IntervalArray, PeriodArray, SparseArray +from pandas.core.construction import create_series_with_explicit_dtype + +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] + + +class TestDataFrameConstructors: + def test_series_with_name_not_matching_column(self): + # GH#9232 + x = pd.Series(range(5), name=1) + y = pd.Series(range(5), name=0) + + result = pd.DataFrame(x, columns=[0]) + expected = pd.DataFrame([], columns=[0]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(y, columns=[1]) + expected = pd.DataFrame([], columns=[1]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "constructor", + [ + lambda: DataFrame(), + lambda: DataFrame(None), + lambda: DataFrame({}), + lambda: DataFrame(()), + lambda: DataFrame([]), + lambda: DataFrame((_ for _ in [])), + lambda: DataFrame(range(0)), + lambda: DataFrame(data=None), + lambda: DataFrame(data={}), + lambda: DataFrame(data=()), + lambda: DataFrame(data=[]), + lambda: DataFrame(data=(_ for _ in [])), + lambda: DataFrame(data=range(0)), + ], + ) + def test_empty_constructor(self, constructor): + expected = DataFrame() + result = constructor() + assert len(result.index) == 0 + assert len(result.columns) == 0 + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "emptylike,expected_index,expected_columns", + [ + ([[]], RangeIndex(1), RangeIndex(0)), + ([[], []], RangeIndex(2), RangeIndex(0)), + ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)), + ], + ) + def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): + expected = DataFrame(index=expected_index, columns=expected_columns) + result = DataFrame(emptylike) + tm.assert_frame_equal(result, expected) + + def test_constructor_mixed(self, float_string_frame): + index, data = tm.getMixedTypeDict() + + # TODO(wesm), incomplete test? 
+ indexed_frame = DataFrame(data, index=index) # noqa + unindexed_frame = DataFrame(data) # noqa + + assert float_string_frame["foo"].dtype == np.object_ + + def test_constructor_cast_failure(self): + foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) + assert foo["a"].dtype == object + + # GH 3010, constructing with odd arrays + df = DataFrame(np.ones((4, 2))) + + # this is ok + df["foo"] = np.ones((4, 2)).tolist() + + # this is not ok + msg = "Wrong number of items passed 2, placement implies 1" + with pytest.raises(ValueError, match=msg): + df["test"] = np.ones((4, 2)) + + # this is ok + df["foo2"] = np.ones((4, 2)).tolist() + + def test_constructor_dtype_copy(self): + orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) + + new_df = pd.DataFrame(orig_df, dtype=float, copy=True) + + new_df["col1"] = 200.0 + assert orig_df["col1"][0] == 1.0 + + def test_constructor_dtype_nocast_view(self): + df = DataFrame([[1, 2]]) + should_be_view = DataFrame(df, dtype=df[0].dtype) + should_be_view[0][0] = 99 + assert df.values[0, 0] == 99 + + should_be_view = DataFrame(df.values, dtype=df[0].dtype) + should_be_view[0][0] = 97 + assert df.values[0, 0] == 97 + + def test_constructor_dtype_list_data(self): + df = DataFrame([[1, "2"], [None, "a"]], dtype=object) + assert df.loc[1, 0] is None + assert df.loc[0, 1] == "2" + + def test_constructor_list_frames(self): + # see gh-3243 + result = DataFrame([DataFrame()]) + assert result.shape == (1, 0) + + result = DataFrame([DataFrame(dict(A=np.arange(5)))]) + assert isinstance(result.iloc[0, 0], DataFrame) + + def test_constructor_mixed_dtypes(self): + def _make_mixed_dtypes_df(typ, ad=None): + + if typ == "int": + dtypes = MIXED_INT_DTYPES + arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes] + elif typ == "float": + dtypes = MIXED_FLOAT_DTYPES + arrays = [ + np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes + ] + + for d, a in zip(dtypes, arrays): + assert a.dtype == d + if ad is None: + ad = dict() + ad.update({d: a for d, a in zip(dtypes, arrays)}) + return DataFrame(ad) + + def _check_mixed_dtypes(df, dtypes=None): + if dtypes is None: + dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES + for d in dtypes: + if d in df: + assert df.dtypes[d] == d + + # mixed floating and integer coexist in the same frame + df = _make_mixed_dtypes_df("float") + _check_mixed_dtypes(df) + + # add lots of types + df = _make_mixed_dtypes_df("float", dict(A=1, B="foo", C="bar")) + _check_mixed_dtypes(df) + + # GH 622 + df = _make_mixed_dtypes_df("int") + _check_mixed_dtypes(df) + + def test_constructor_complex_dtypes(self): + # GH10952 + a = np.random.rand(10).astype(np.complex64) + b = np.random.rand(10).astype(np.complex128) + + df = DataFrame({"a": a, "b": b}) + assert a.dtype == df.a.dtype + assert b.dtype == df.b.dtype + + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({"A": ["x", None]}, dtype=string_dtype) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + + df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype) + assert np.isnan(df.iloc[1, 0]) + + def test_constructor_rec(self, float_frame): + rec = float_frame.to_records(index=False) + rec.dtype.names = list(rec.dtype.names)[::-1] + + index = float_frame.index + + df = DataFrame(rec) + tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) + + df2 = DataFrame(rec, index=index) + 
tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df2.index, index) + + rng = np.arange(len(rec))[::-1] + df3 = DataFrame(rec, index=rng, columns=["C", "B"]) + expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"]) + tm.assert_frame_equal(df3, expected) + + def test_constructor_bool(self): + df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)}) + assert df.values.dtype == np.bool_ + + def test_constructor_overflow_int64(self): + # see gh-14881 + values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) + + result = DataFrame({"a": values}) + assert result["a"].dtype == np.uint64 + + # see gh-2355 + data_scores = [ + (6311132704823138710, 273), + (2685045978526272070, 23), + (8921811264899370420, 45), + (17019687244989530680, 270), + (9930107427299601010, 273), + ] + dtype = [("uid", "u8"), ("score", "u8")] + data = np.zeros((len(data_scores),), dtype=dtype) + data[:] = data_scores + df_crawls = DataFrame(data) + assert df_crawls["uid"].dtype == np.uint64 + + @pytest.mark.parametrize( + "values", + [ + np.array([2 ** 64], dtype=object), + np.array([2 ** 65]), + [2 ** 64 + 1], + np.array([-(2 ** 63) - 4], dtype=object), + np.array([-(2 ** 64) - 1]), + [-(2 ** 65) - 2], + ], + ) + def test_constructor_int_overflow(self, values): + # see gh-18584 + value = values[0] + result = DataFrame(values) + + assert result[0].dtype == object + assert result[0][0] == value + + def test_constructor_ordereddict(self): + import random + + nitems = 100 + nums = list(range(nitems)) + random.shuffle(nums) + expected = ["A{i:d}".format(i=i) for i in nums] + df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) + assert expected == list(df.columns) + + def test_constructor_dict(self): + datetime_series = tm.makeTimeSeries(nper=30) + # test expects index shifted by 5 + datetime_series_short = tm.makeTimeSeries(nper=30)[5:] + + frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short}) + + # col2 is padded with NaN + assert len(datetime_series) == 30 + assert len(datetime_series_short) == 25 + + tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) + + exp = pd.Series( + np.concatenate([[np.nan] * 5, datetime_series_short.values]), + index=datetime_series.index, + name="col2", + ) + tm.assert_series_equal(exp, frame["col2"]) + + frame = DataFrame( + {"col1": datetime_series, "col2": datetime_series_short}, + columns=["col2", "col3", "col4"], + ) + + assert len(frame) == len(datetime_series_short) + assert "col1" not in frame + assert isna(frame["col3"]).all() + + # Corner cases + assert len(DataFrame()) == 0 + + # mix dict and array, wrong size - no spec for which error should raise + # first + with pytest.raises(ValueError): + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) + + # Length-one dict micro-optimization + frame = DataFrame({"A": {"1": 1, "2": 2}}) + tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) + + # empty dict plus index + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx) + assert frame.index is idx + + # empty dict with index and columns + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx, columns=idx) + assert frame.index is idx + assert frame.columns is idx + assert len(frame._series) == 3 + + # with dict of empty list and Series + frame = DataFrame({"A": [], "B": []}, columns=["A", "B"]) + tm.assert_index_equal(frame.index, Index([], dtype=np.int64)) + + # GH 14381 + # Dict with None value + frame_none = DataFrame(dict(a=None), index=[0]) + 
frame_none_list = DataFrame(dict(a=[None]), index=[0]) + assert frame_none._get_value(0, "a") is None + assert frame_none_list._get_value(0, "a") is None + tm.assert_frame_equal(frame_none, frame_none_list) + + # GH10856 + # dict with scalar values should raise error, even if columns passed + msg = "If using all scalar values, you must pass an index" + with pytest.raises(ValueError, match=msg): + DataFrame({"a": 0.7}) + + with pytest.raises(ValueError, match=msg): + DataFrame({"a": 0.7}, columns=["a"]) + + @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"]) + def test_constructor_invalid_items_unused(self, scalar): + # No error if invalid (scalar) value is in fact not used: + result = DataFrame({"a": scalar}, columns=["b"]) + expected = DataFrame(columns=["b"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) + def test_constructor_dict_nan_key(self, value): + # GH 18455 + cols = [1, value, 3] + idx = ["a", value] + values = [[0, 3], [1, 4], [2, 5]] + data = {cols[c]: Series(values[c], index=idx) for c in range(3)} + result = DataFrame(data).sort_values(1).sort_values("a", axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) + tm.assert_frame_equal(result, expected) + + result = DataFrame(data, index=idx).sort_values("a", axis=1) + tm.assert_frame_equal(result, expected) + + result = DataFrame(data, index=idx, columns=cols) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("value", [np.nan, None, float("nan")]) + def test_constructor_dict_nan_tuple_key(self, value): + # GH 18455 + cols = Index([(11, 21), (value, 22), (13, value)]) + idx = Index([("a", value), (value, 2)]) + values = [[0, 3], [1, 4], [2, 5]] + data = {cols[c]: Series(values[c], index=idx) for c in range(3)} + result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) + tm.assert_frame_equal(result, expected) + + result = DataFrame(data, index=idx).sort_values(("a", value), axis=1) + tm.assert_frame_equal(result, expected) + + result = DataFrame(data, index=idx, columns=cols) + tm.assert_frame_equal(result, expected) + + def test_constructor_dict_order_insertion(self): + datetime_series = tm.makeTimeSeries(nper=30) + datetime_series_short = tm.makeTimeSeries(nper=25) + + # GH19018 + # initialization ordering: by insertion order if python>= 3.6 + d = {"b": datetime_series_short, "a": datetime_series} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list("ba")) + tm.assert_frame_equal(frame, expected) + + def test_constructor_multi_index(self): + # GH 4078 + # construction error with mi and all-nan frame + tuples = [(2, 3), (3, 3), (3, 3)] + mi = MultiIndex.from_tuples(tuples) + df = DataFrame(index=mi, columns=mi) + assert pd.isna(df).values.ravel().all() + + tuples = [(3, 3), (2, 3), (3, 3)] + mi = MultiIndex.from_tuples(tuples) + df = DataFrame(index=mi, columns=mi) + assert pd.isna(df).values.ravel().all() + + def test_constructor_2d_index(self): + # GH 25416 + # handling of 2d index in construction + df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = pd.DataFrame( + [1, 1], + index=pd.Int64Index([1, 2], dtype="int64"), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = pd.DataFrame( + [1, 1], + 
index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + + def test_constructor_error_msgs(self): + msg = "Empty data passed with indices specified." + # passing an empty array with columns specified. + with pytest.raises(ValueError, match=msg): + DataFrame(np.empty(0), columns=list("abc")) + + msg = "Mixing dicts with non-Series may lead to ambiguous ordering." + # mix dict and array, wrong size + with pytest.raises(ValueError, match=msg): + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) + + # wrong size ndarray, GH 3105 + msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)" + with pytest.raises(ValueError, match=msg): + DataFrame( + np.arange(12).reshape((4, 3)), + columns=["foo", "bar", "baz"], + index=pd.date_range("2000-01-01", periods=3), + ) + + arr = np.array([[4, 5, 6]]) + msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" + with pytest.raises(ValueError, match=msg): + DataFrame(index=[0], columns=range(0, 4), data=arr) + + arr = np.array([4, 5, 6]) + msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" + with pytest.raises(ValueError, match=msg): + DataFrame(index=[0], columns=range(0, 4), data=arr) + + # higher dim raise exception + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) + + # wrong size axis labels + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + with pytest.raises(ValueError, match=msg): + DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) + + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" + with pytest.raises(ValueError, match=msg): + DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) + + # gh-26429 + msg = "2 columns passed, passed data had 10 columns" + with pytest.raises(ValueError, match=msg): + DataFrame((range(10), range(10, 20)), columns=("ones", "twos")) + + msg = "If using all scalar values, you must pass an index" + with pytest.raises(ValueError, match=msg): + DataFrame({"a": False, "b": True}) + + def test_constructor_with_embedded_frames(self): + + # embedded data frames + df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + df2 = DataFrame([df1, df1 + 10]) + + df2.dtypes + str(df2) + + result = df2.loc[0, 0] + tm.assert_frame_equal(result, df1) + + result = df2.loc[1, 0] + tm.assert_frame_equal(result, df1 + 10) + + def test_constructor_subclass_dict(self, float_frame, dict_subclass): + # Test for passing dict subclass to constructor + data = { + "col1": dict_subclass((x, 10.0 * x) for x in range(10)), + "col2": dict_subclass((x, 20.0 * x) for x in range(10)), + } + df = DataFrame(data) + refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) + tm.assert_frame_equal(refdf, df) + + data = dict_subclass(data.items()) + df = DataFrame(data) + tm.assert_frame_equal(refdf, df) + + # try with defaultdict + from collections import defaultdict + + data = {} + float_frame["B"][:10] = np.nan + for k, v in float_frame.items(): + dct = defaultdict(dict) + dct.update(v.to_dict()) + data[k] = dct + frame = DataFrame(data) + expected = frame.reindex(index=float_frame.index) + tm.assert_frame_equal(float_frame, expected) + + def test_constructor_dict_block(self): + expected = np.array([[4.0, 3.0, 2.0, 1.0]]) + df = DataFrame( + {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]}, + columns=["d", "c", "b", "a"], + ) + 
tm.assert_numpy_array_equal(df.values, expected) + + def test_constructor_dict_cast(self): + # cast float tests + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + frame = DataFrame(test_data, dtype=float) + assert len(frame) == 3 + assert frame["B"].dtype == np.float64 + assert frame["A"].dtype == np.float64 + + frame = DataFrame(test_data) + assert len(frame) == 3 + assert frame["B"].dtype == np.object_ + assert frame["A"].dtype == np.float64 + + # can't cast to float + test_data = { + "A": dict(zip(range(20), tm.makeStringIndex(20))), + "B": dict(zip(range(15), np.random.randn(15))), + } + frame = DataFrame(test_data, dtype=float) + assert len(frame) == 20 + assert frame["A"].dtype == np.object_ + assert frame["B"].dtype == np.float64 + + def test_constructor_dict_dont_upcast(self): + d = {"Col1": {"Row1": "A String", "Row2": np.nan}} + df = DataFrame(d) + assert isinstance(df["Col1"]["Row2"], float) + + dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2]) + assert isinstance(dm[1][1], int) + + def test_constructor_dict_of_tuples(self): + # GH #1491 + data = {"a": (1, 2, 3), "b": (4, 5, 6)} + + result = DataFrame(data) + expected = DataFrame({k: list(v) for k, v in data.items()}) + tm.assert_frame_equal(result, expected, check_dtype=False) + + def test_constructor_dict_of_ranges(self): + # GH 26356 + data = {"a": range(3), "b": range(3, 6)} + + result = DataFrame(data) + expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]}) + tm.assert_frame_equal(result, expected) + + def test_constructor_dict_of_iterators(self): + # GH 26349 + data = {"a": iter(range(3)), "b": reversed(range(3))} + + result = DataFrame(data) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) + tm.assert_frame_equal(result, expected) + + def test_constructor_dict_of_generators(self): + # GH 26349 + data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))} + result = DataFrame(data) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) + tm.assert_frame_equal(result, expected) + + def test_constructor_dict_multiindex(self): + def check(result, expected): + return tm.assert_frame_equal( + result, + expected, + check_dtype=True, + check_index_type=True, + check_column_type=True, + check_names=True, + ) + + d = { + ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2}, + ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4}, + ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9}, + } + _d = sorted(d.items()) + df = DataFrame(d) + expected = DataFrame( + [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) + ).T + expected.index = MultiIndex.from_tuples(expected.index) + check(df, expected) + + d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111} + _d.insert(0, ("z", d["z"])) + expected = DataFrame( + [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) + ).T + expected.index = Index(expected.index, tupleize_cols=False) + df = DataFrame(d) + df = df.reindex(columns=expected.columns, index=expected.index) + check(df, expected) + + def test_constructor_dict_datetime64_index(self): + # GH 10160 + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] + + def create_data(constructor): + return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)} + + data_datetime64 = create_data(np.datetime64) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) + data_Timestamp = create_data(Timestamp) + + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: 
None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timestamp(dt) for dt in dates_as_str], + ) + + result_datetime64 = DataFrame(data_datetime64) + result_datetime = DataFrame(data_datetime) + result_Timestamp = DataFrame(data_Timestamp) + tm.assert_frame_equal(result_datetime64, expected) + tm.assert_frame_equal(result_datetime, expected) + tm.assert_frame_equal(result_Timestamp, expected) + + def test_constructor_dict_timedelta64_index(self): + # GH 10160 + td_as_int = [1, 2, 3, 4] + + def create_data(constructor): + return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} + + data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D")) + data_timedelta = create_data(lambda x: timedelta(days=x)) + data_Timedelta = create_data(lambda x: Timedelta(x, "D")) + + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timedelta(td, "D") for td in td_as_int], + ) + + result_timedelta64 = DataFrame(data_timedelta64) + result_timedelta = DataFrame(data_timedelta) + result_Timedelta = DataFrame(data_Timedelta) + tm.assert_frame_equal(result_timedelta64, expected) + tm.assert_frame_equal(result_timedelta, expected) + tm.assert_frame_equal(result_Timedelta, expected) + + def test_constructor_period(self): + # PeriodIndex + a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") + b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") + df = pd.DataFrame({"a": a, "b": b}) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype + + # list of periods + df = pd.DataFrame( + {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} + ) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype + + def test_nested_dict_frame_constructor(self): + rng = pd.period_range("1/1/2000", periods=5) + df = DataFrame(np.random.randn(10, 5), columns=rng) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(col, {})[row] = df._get_value(row, col) + + result = DataFrame(data, columns=rng) + tm.assert_frame_equal(result, df) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(row, {})[col] = df._get_value(row, col) + + result = DataFrame(data, index=rng).T + tm.assert_frame_equal(result, df) + + def _check_basic_constructor(self, empty): + # mat: 2d matrix with shape (3, 2) to input. 
empty - makes sized + # objects + mat = empty((2, 3), dtype=float) + # 2-D input + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + + # 1-D input + frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3]) + assert len(frame.index) == 3 + assert len(frame.columns) == 1 + + # cast type + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + assert frame.values.dtype == np.int64 + + # wrong size axis labels + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + with pytest.raises(ValueError, match=msg): + DataFrame(mat, columns=["A", "B", "C"], index=[1]) + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" + with pytest.raises(ValueError, match=msg): + DataFrame(mat, columns=["A", "B"], index=[1, 2]) + + # higher dim raise exception + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1]) + + # automatic labeling + frame = DataFrame(mat) + tm.assert_index_equal(frame.index, pd.Int64Index(range(2))) + tm.assert_index_equal(frame.columns, pd.Int64Index(range(3))) + + frame = DataFrame(mat, index=[1, 2]) + tm.assert_index_equal(frame.columns, pd.Int64Index(range(3))) + + frame = DataFrame(mat, columns=["A", "B", "C"]) + tm.assert_index_equal(frame.index, pd.Int64Index(range(2))) + + # 0-length axis + frame = DataFrame(empty((0, 3))) + assert len(frame.index) == 0 + + frame = DataFrame(empty((3, 0))) + assert len(frame.columns) == 0 + + def test_constructor_ndarray(self): + self._check_basic_constructor(np.ones) + + frame = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"]) + assert len(frame) == 2 + + def test_constructor_maskedarray(self): + self._check_basic_constructor(ma.masked_all) + + # Check non-masked values + mat = ma.masked_all((2, 3), dtype=float) + mat[0, 0] = 1.0 + mat[1, 2] = 2.0 + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + assert 1.0 == frame["A"][1] + assert 2.0 == frame["C"][2] + + # what is this even checking?? 
+ mat = ma.masked_all((2, 3), dtype=float) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + assert np.all(~np.asarray(frame == frame)) + + def test_constructor_maskedarray_nonfloat(self): + # masked int promoted to float + mat = ma.masked_all((2, 3), dtype=int) + # 2-D input + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + assert np.all(~np.asarray(frame == frame)) + + # cast type + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64) + assert frame.values.dtype == np.float64 + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = 1 + mat2[1, 2] = 2 + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"][1] + assert 2 == frame["C"][2] + + # masked np.datetime64 stays (use NaT as null) + mat = ma.masked_all((2, 3), dtype="M8[ns]") + # 2-D input + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + assert isna(frame).values.all() + + # cast type + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + assert frame.values.dtype == np.int64 + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = 1 + mat2[1, 2] = 2 + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"].view("i8")[1] + assert 2 == frame["C"].view("i8")[2] + + # masked bool promoted to object + mat = ma.masked_all((2, 3), dtype=bool) + # 2-D input + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + assert np.all(~np.asarray(frame == frame)) + + # cast type + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object) + assert frame.values.dtype == object + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = True + mat2[1, 2] = False + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert frame["A"][1] is True + assert frame["C"][2] is False + + def test_constructor_maskedarray_hardened(self): + # Check numpy masked arrays with hard masks -- from GH24574 + mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, + columns=["A", "B"], + index=[1, 2], + dtype=float, + ) + tm.assert_frame_equal(result, expected) + # Check case where mask is hard but no data are masked + mat_hard = ma.ones((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [1.0, 1.0], "B": [1.0, 1.0]}, + columns=["A", "B"], + index=[1, 2], + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + def test_constructor_maskedrecarray_dtype(self): + # Ensure constructor honors dtype + data = np.ma.array( + np.ma.zeros(5, dtype=[("date", "0 + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + floatname: np.array([1.0] * 10, dtype=floatname), + intname: np.array([1] * 10, dtype=intname), + }, + index=np.arange(10), + ) + result = df.dtypes + expected = Series( + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("object")] + + [np.dtype("float64")] + + [np.dtype(intname)], + index=["a", "b", "c", floatname, intname], + ) + tm.assert_series_equal(result, expected) + + # GH 2809 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in 
ind] + datetime_s = Series(datetimes) + assert datetime_s.dtype == "M8[ns]" + + # GH 2810 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + dates = [ts.date() for ts in ind] + df = DataFrame(datetimes, columns=["datetimes"]) + df["dates"] = dates + result = df.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], + index=["datetimes", "dates"], + ) + tm.assert_series_equal(result, expected) + + # GH 7594 + # don't coerce tz-aware + import pytz + + tz = pytz.timezone("US/Eastern") + dt = tz.localize(datetime(2012, 1, 1)) + + df = DataFrame({"End Date": dt}, index=[0]) + assert df.iat[0, 0] == dt + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) + + df = DataFrame([{"End Date": dt}]) + assert df.iat[0, 0] == dt + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) + + # tz-aware (UTC and other tz's) + # GH 8411 + dr = date_range("20130101", periods=3) + df = DataFrame({"value": dr}) + assert df.iat[0, 0].tz is None + dr = date_range("20130101", periods=3, tz="UTC") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "US/Eastern" + + # GH 7822 + # preserver an index with a tz on dict construction + i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") + + expected = DataFrame({"a": i.to_series().reset_index(drop=True)}) + df = DataFrame() + df["a"] = i + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": i}) + tm.assert_frame_equal(df, expected) + + # multiples + i_no_tz = date_range("1/1/2011", periods=5, freq="10s") + df = DataFrame({"a": i, "b": i_no_tz}) + expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "arr", + [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + [[np.datetime64("NaT")], [None]], + [[np.datetime64("NaT")], [pd.NaT]], + [[None], [np.datetime64("NaT")]], + [[None], [pd.NaT]], + [[pd.NaT], [np.datetime64("NaT")]], + [[pd.NaT], [None]], + ], + ) + def test_constructor_datetimes_with_nulls(self, arr): + # gh-15869, GH#11220 + result = DataFrame(arr).dtypes + expected = Series([np.dtype("datetime64[ns]")]) + tm.assert_series_equal(result, expected) + + def test_constructor_for_list_with_dtypes(self): + # test list of lists/ndarrays + df = DataFrame([np.arange(5) for x in range(5)]) + result = df.dtypes + expected = Series([np.dtype("int64")] * 5) + tm.assert_series_equal(result, expected) + + df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) + result = df.dtypes + expected = Series([np.dtype("int64")] * 5) + tm.assert_series_equal(result, expected) + + # overflow issue? 
(we always expecte int64 upcasting here) + df = DataFrame({"a": [2 ** 31, 2 ** 31 + 1]}) + assert df.dtypes.iloc[0] == np.dtype("int64") + + # GH #2751 (construction with no index specified), make sure we cast to + # platform values + df = DataFrame([1, 2]) + assert df.dtypes.iloc[0] == np.dtype("int64") + + df = DataFrame([1.0, 2.0]) + assert df.dtypes.iloc[0] == np.dtype("float64") + + df = DataFrame({"a": [1, 2]}) + assert df.dtypes.iloc[0] == np.dtype("int64") + + df = DataFrame({"a": [1.0, 2.0]}) + assert df.dtypes.iloc[0] == np.dtype("float64") + + df = DataFrame({"a": 1}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("int64") + + df = DataFrame({"a": 1.0}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("float64") + + # with object list + df = DataFrame( + { + "a": [1, 2, 4, 7], + "b": [1.2, 2.3, 5.1, 6.3], + "c": list("abcd"), + "d": [datetime(2000, 1, 1) for i in range(4)], + "e": [1.0, 2, 4.0, 7], + } + ) + result = df.dtypes + expected = Series( + [ + np.dtype("int64"), + np.dtype("float64"), + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("float64"), + ], + index=list("abcde"), + ) + tm.assert_series_equal(result, expected) + + def test_constructor_frame_copy(self, float_frame): + cop = DataFrame(float_frame, copy=True) + cop["A"] = 5 + assert (cop["A"] == 5).all() + assert not (float_frame["A"] == 5).all() + + def test_constructor_ndarray_copy(self, float_frame): + df = DataFrame(float_frame.values) + + float_frame.values[5] = 5 + assert (df.values[5] == 5).all() + + df = DataFrame(float_frame.values, copy=True) + float_frame.values[6] = 6 + assert not (df.values[6] == 6).all() + + def test_constructor_series_copy(self, float_frame): + series = float_frame._series + + df = DataFrame({"A": series["A"]}) + df["A"][:] = 5 + + assert not (series["A"] == 5).all() + + def test_constructor_with_nas(self): + # GH 5016 + # na's in indices + + def check(df): + for i in range(len(df.columns)): + df.iloc[:, i] + + indexer = np.arange(len(df.columns))[isna(df.columns)] + + # No NaN found -> error + if len(indexer) == 0: + msg = ( + "cannot do label indexing on" + r" " + r" with these indexers \[nan\] of " + ) + with pytest.raises(TypeError, match=msg): + df.loc[:, np.nan] + # single nan should result in Series + elif len(indexer) == 1: + tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) + # multiple nans should result in DataFrame + else: + tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan]) + + df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) + check(df) + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]) + check(df) + + df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) + check(df) + + df = DataFrame( + [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan] + ) + check(df) + + # GH 21428 (non-unique columns) + df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]) + check(df) + + def test_constructor_lists_to_object_dtype(self): + # from #1074 + d = DataFrame({"a": [np.nan, False]}) + assert d["a"].dtype == np.object_ + assert not d["a"][1] + + def test_constructor_categorical(self): + + # GH8626 + + # dict creation + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) + + # to_frame + s = Series(list("abc"), dtype="category") + result = s.to_frame() + expected = Series(list("abc"), dtype="category", name=0) + 
tm.assert_series_equal(result[0], expected) + result = s.to_frame(name="foo") + expected = Series(list("abc"), dtype="category", name="foo") + tm.assert_series_equal(result["foo"], expected) + + # list-like creation + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) + tm.assert_series_equal(df[0], expected) + + # ndim != 1 + df = DataFrame([Categorical(list("abc"))]) + expected = DataFrame({0: Series(list("abc"), dtype="category")}) + tm.assert_frame_equal(df, expected) + + df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) + expected = DataFrame( + { + 0: Series(list("abc"), dtype="category"), + 1: Series(list("abd"), dtype="category"), + }, + columns=[0, 1], + ) + tm.assert_frame_equal(df, expected) + + # mixed + df = DataFrame([Categorical(list("abc")), list("def")]) + expected = DataFrame( + {0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1] + ) + tm.assert_frame_equal(df, expected) + + # invalid (shape) + msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" + with pytest.raises(ValueError, match=msg): + DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) + + # ndim > 1 + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + Categorical(np.array([list("abcd")])) + + def test_constructor_categorical_series(self): + + items = [1, 2, 3, 1] + exp = Series(items).astype("category") + res = Series(items, dtype="category") + tm.assert_series_equal(res, exp) + + items = ["a", "b", "c", "a"] + exp = Series(items).astype("category") + res = Series(items, dtype="category") + tm.assert_series_equal(res, exp) + + # insert into frame with different index + # GH 8076 + index = date_range("20000101", periods=3) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) + expected.index = index + + expected = DataFrame({"x": expected}) + df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index) + tm.assert_frame_equal(df, expected) + + def test_from_records_to_records(self): + # from numpy documentation + arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] + + # TODO(wesm): unused + frame = DataFrame.from_records(arr) # noqa + + index = pd.Index(np.arange(len(arr))[::-1]) + indexed_frame = DataFrame.from_records(arr, index=index) + tm.assert_index_equal(indexed_frame.index, index) + + # without names, it should go to last ditch + arr2 = np.zeros((2, 3)) + tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) + + # wrong length + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + with pytest.raises(ValueError, match=msg): + DataFrame.from_records(arr, index=index[:-1]) + + indexed_frame = DataFrame.from_records(arr, index="f1") + + # what to do? 
+ records = indexed_frame.to_records() + assert len(records.dtype.names) == 3 + + records = indexed_frame.to_records(index=False) + assert len(records.dtype.names) == 2 + assert "index" not in records.dtype.names + + def test_from_records_nones(self): + tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] + + df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"]) + assert np.isnan(df["c"][0]) + + def test_from_records_iterator(self): + arr = np.array( + [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)], + dtype=[ + ("x", np.float64), + ("u", np.float32), + ("y", np.int64), + ("z", np.int32), + ], + ) + df = DataFrame.from_records(iter(arr), nrows=2) + xp = DataFrame( + { + "x": np.array([1.0, 3.0], dtype=np.float64), + "u": np.array([1.0, 3.0], dtype=np.float32), + "y": np.array([2, 4], dtype=np.int64), + "z": np.array([2, 4], dtype=np.int32), + } + ) + tm.assert_frame_equal(df.reindex_like(xp), xp) + + # no dtypes specified here, so just compare with the default + arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)] + df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2) + tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False) + + def test_from_records_tuples_generator(self): + def tuple_generator(length): + for i in range(length): + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + yield (i, letters[i % len(letters)], i / length) + + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in tuple_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = tuple_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + tm.assert_frame_equal(result, expected) + + def test_from_records_lists_generator(self): + def list_generator(length): + for i in range(length): + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + yield [i, letters[i % len(letters)], i / length] + + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in list_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = list_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + tm.assert_frame_equal(result, expected) + + def test_from_records_columns_not_modified(self): + tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] + + columns = ["a", "b", "c"] + original_columns = list(columns) + + df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa + + assert columns == original_columns + + def test_from_records_decimal(self): + from decimal import Decimal + + tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)] + + df = DataFrame.from_records(tuples, columns=["a"]) + assert df["a"].dtype == object + + df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True) + assert df["a"].dtype == np.float64 + assert np.isnan(df["a"].values[-1]) + + def test_from_records_duplicates(self): + result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) + + expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) + + tm.assert_frame_equal(result, expected) + + def test_from_records_set_index_name(self): + def create_dict(order_id): + return { + "order_id": order_id, + "quantity": np.random.randint(1, 10), + "price": np.random.randint(1, 10), + } + + documents = 
[create_dict(i) for i in range(10)] + # demo missing data + documents.append({"order_id": 10, "quantity": 5}) + + result = DataFrame.from_records(documents, index="order_id") + assert result.index.name == "order_id" + + # MultiIndex + result = DataFrame.from_records(documents, index=["order_id", "quantity"]) + assert result.index.names == ("order_id", "quantity") + + def test_from_records_misc_brokenness(self): + # #2179 + + data = {1: ["foo"], 2: ["bar"]} + + result = DataFrame.from_records(data, columns=["a", "b"]) + exp = DataFrame(data, columns=["a", "b"]) + tm.assert_frame_equal(result, exp) + + # overlap in index/index_names + + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + + result = DataFrame.from_records(data, index=["a", "b", "c"]) + exp = DataFrame(data, index=["a", "b", "c"]) + tm.assert_frame_equal(result, exp) + + # GH 2623 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) + result = df2_obj.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"] + ) + tm.assert_series_equal(result, expected) + + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 1]) + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) + result = df2_obj.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"] + ) + tm.assert_series_equal(result, expected) + + def test_from_records_empty(self): + # 3562 + result = DataFrame.from_records([], columns=["a", "b", "c"]) + expected = DataFrame(columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + result = DataFrame.from_records([], columns=["a", "b", "b"]) + expected = DataFrame(columns=["a", "b", "b"]) + tm.assert_frame_equal(result, expected) + + def test_from_records_empty_with_nonempty_fields_gh3682(self): + a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(a, index="id") + tm.assert_index_equal(df.index, Index([1], name="id")) + assert df.index.name == "id" + tm.assert_index_equal(df.columns, Index(["value"])) + + b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(b, index="id") + tm.assert_index_equal(df.index, Index([], name="id")) + assert df.index.name == "id" + + def test_from_records_with_datetimes(self): + + # this may fail on certain platforms because of a numpy issue + # related GH6140 + if not is_platform_little_endian(): + pytest.skip("known failure of test on non-little endian") + + # construction with a null in a recarray + # GH 6140 + expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) + + arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] + dtypes = [("EXPIRY", " 0 + result = float_string_frame.dtypes + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) + tm.assert_series_equal(result, expected) + + # compat, GH 8722 + with option_context("use_inf_as_na", True): + df = DataFrame([[1]]) + result = df.dtypes + tm.assert_series_equal(result, Series({0: np.dtype("int64")})) + + def test_astype_float(self, float_frame): + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + casted = float_frame.astype(np.int32) + expected = DataFrame( + 
float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + float_frame["foo"] = "5" + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + def test_astype_mixed_float(self, mixed_float_frame): + # mixed casting + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") + _check_cast(casted, "float32") + + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") + _check_cast(casted, "float16") + + def test_astype_mixed_type(self, mixed_type_frame): + # mixed casting + mn = mixed_type_frame._get_numeric_data().copy() + mn["little_float"] = np.array(12345.0, dtype="float16") + mn["big_float"] = np.array(123456789101112.0, dtype="float64") + + casted = mn.astype("float64") + _check_cast(casted, "float64") + + casted = mn.astype("int64") + _check_cast(casted, "int64") + + casted = mn.reindex(columns=["little_float"]).astype("float16") + _check_cast(casted, "float16") + + casted = mn.astype("float32") + _check_cast(casted, "float32") + + casted = mn.astype("int32") + _check_cast(casted, "int32") + + # to object + casted = mn.astype("O") + _check_cast(casted, "object") + + def test_astype_with_exclude_string(self, float_frame): + df = float_frame.copy() + expected = float_frame.astype(int) + df["string"] = "foo" + casted = df.astype(int, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + df = float_frame.copy() + expected = float_frame.astype(np.int32) + df["string"] = "foo" + casted = df.astype(np.int32, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + def test_astype_with_view_float(self, float_frame): + + # this is the only real reason to do it this way + tf = np.round(float_frame).astype(np.int32) + casted = tf.astype(np.float32, copy=False) + + # TODO(wesm): verification? + tf = float_frame.astype(np.float64) + casted = tf.astype(np.int64, copy=False) # noqa + + def test_astype_with_view_mixed_float(self, mixed_float_frame): + + tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) + + casted = tf.astype(np.int64) + casted = tf.astype(np.float32) # noqa + + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("val", [np.nan, np.inf]) + def test_astype_cast_nan_inf_int(self, val, dtype): + # see gh-14265 + # + # Check NaN and inf --> raise error when converting to int. 
+ msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" + df = DataFrame([val]) + + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + + def test_astype_str(self): + # see gh-9757 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) + c = Series([Timedelta(x, unit="d") for x in range(5)]) + d = Series(range(5)) + e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + + df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) + + # Datetime-like + result = df.astype(str) + + expected = DataFrame( + { + "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), + "b": list(map(str, map(Timestamp, b._values))), + "c": list( + map( + str, + map(lambda x: Timedelta(x)._repr_base(format="all"), c._values), + ) + ), + "d": list(map(str, d._values)), + "e": list(map(str, e._values)), + } + ) + + tm.assert_frame_equal(result, expected) + + def test_astype_str_float(self): + # see gh-11302 + result = DataFrame([np.NaN]).astype(str) + expected = DataFrame(["nan"]) + + tm.assert_frame_equal(result, expected) + result = DataFrame([1.12345678901234567890]).astype(str) + + # < 1.14 truncates + # >= 1.14 preserves the full repr + val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype_class", [dict, Series]) + def test_astype_dict_like(self, dtype_class): + # GH7271 & GH16717 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(["1.0", "2", "3.14", "4", "5.4"]) + df = DataFrame({"a": a, "b": b, "c": c, "d": d}) + original = df.copy(deep=True) + + # change type of a subset of columns + dt1 = dtype_class({"b": "str", "d": "float32"}) + result = df.astype(dt1) + expected = DataFrame( + { + "a": a, + "b": Series(["0", "1", "2", "3", "4"]), + "c": c, + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) + result = df.astype(dt2) + expected = DataFrame( + { + "a": a, + "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), + "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + # change all columns + dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) + tm.assert_frame_equal(df.astype(dt3), df.astype(str)) + tm.assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + dt4 = dtype_class({"b": str, 2: str}) + dt5 = dtype_class({"e": str}) + msg = "Only a column name can be used for the key in a dtype mappings argument" + with pytest.raises(KeyError, match=msg): + df.astype(dt4) + with pytest.raises(KeyError, match=msg): + df.astype(dt5) + tm.assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + dt6 = dtype_class({col: df[col].dtype for col in df.columns}) + equiv = df.astype(dt6) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + # GH 16717 + # if dtypes provided is empty, the resulting DataFrame + # should be the same as the original DataFrame + dt7 = dtype_class({}) if 
dtype_class is dict else dtype_class({}, dtype=object) + equiv = df.astype(dt7) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + def test_astype_duplicate_col(self): + a1 = Series([1, 2, 3, 4, 5], name="a") + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") + a2 = Series([0, 1, 2, 3, 4], name="a") + df = concat([a1, b, a2], axis=1) + + result = df.astype(str) + a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") + expected = concat([a1_str, b_str, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + result = df.astype({"a": "str"}) + expected = concat([a1_str, b, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list("abcdef")), + CategoricalDtype(categories=list("edba"), ordered=False), + CategoricalDtype(categories=list("edcb"), ordered=True), + ], + ids=repr, + ) + def test_astype_categorical(self, dtype): + # GH 18099 + d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) + def test_astype_categoricaldtype_class_raises(self, cls): + df = DataFrame({"A": ["a", "a", "b", "c"]}) + xpr = "Expected an instance of {}".format(cls.__name__) + with pytest.raises(TypeError, match=xpr): + df.astype({"A": cls}) + + with pytest.raises(TypeError, match=xpr): + df["A"].astype(cls) + + def test_singlerow_slice_categoricaldtype_gives_series(self): + # GH29521 + df = pd.DataFrame({"x": pd.Categorical("a b c d e".split())}) + result = df.iloc[0] + raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) + expected = pd.Series(raw_cat, index=["x"], name=0, dtype="category") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes(self, dtype): + # GH 22578 + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + + expected1 = pd.DataFrame( + { + "a": integer_array([1, 3, 5], dtype=dtype), + "b": integer_array([2, 4, 6], dtype=dtype), + } + ) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) + + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + df["b"] = df["b"].astype(dtype) + expected2 = pd.DataFrame( + {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)} + ) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes_1d(self, dtype): + # GH 22578 + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) + + expected1 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) + df["a"] = df["a"].astype(dtype) + expected2 = 
pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["category", "Int64"]) + def test_astype_extension_dtypes_duplicate_col(self, dtype): + # GH 24704 + a1 = Series([0, np.nan, 4], name="a") + a2 = Series([np.nan, 3, 5], name="a") + df = concat([a1, a2], axis=1) + + result = df.astype(dtype) + expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) + def test_df_where_with_category(self, kwargs): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, True], [False, True, True]]) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.A.where(mask[:, 0], **kwargs) + expected = Series(pd.Categorical([0, np.nan], categories=[0, 3]), name="A") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] + ) + def test_astype_column_metadata(self, dtype): + # GH 19920 + columns = pd.UInt64Index([100, 200, 300], name="foo") + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + df = df.astype(dtype) + tm.assert_index_equal(df.columns, columns) + + def test_df_where_change_dtype(self): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_from_datetimelike_to_objectt(self, dtype, unit): + # tests astype to object dtype + # gh-19223 / gh-12425 + dtype = "{}[{}]".format(dtype, unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(object) + assert (result.dtypes == object).all() + + if dtype.startswith("M8"): + assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit) + else: + assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit) + + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units from numeric origination + # gh-19223 / gh-12425 + dtype = "{}[{}]".format(dtype, unit) + arr = np.array([[1, 2, 3]], dtype=arr_dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetime_unit(self, unit): + # 
tests all units from datetime origination + # gh-19223 + dtype = "M8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns"]) + def test_astype_to_timedelta_unit_ns(self, unit): + # preserver the timedelta conversion + # gh-19223 + dtype = "m8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) + def test_astype_to_timedelta_unit(self, unit): + # coerce to float + # gh-19223 + dtype = "m8[{}]".format(unit) + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(df.values.astype(dtype).astype(float)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_incorrect_datetimelike(self, unit): + # trying to astype a m to a M, or vice-versa + # gh-19224 + dtype = "M8[{}]".format(unit) + other = "m8[{}]".format(unit) + + df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) + msg = ( + r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" + r" \[timedelta64\[{}\]\]" + ).format(unit) + with pytest.raises(TypeError, match=msg): + df.astype(other) + + msg = ( + r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" + r" \[datetime64\[{}\]\]" + ).format(unit) + df = DataFrame(np.array([[1, 2, 3]], dtype=other)) + with pytest.raises(TypeError, match=msg): + df.astype(dtype) + + def test_timedeltas(self): + df = DataFrame( + dict( + A=Series(date_range("2012-1-1", periods=3, freq="D")), + B=Series([timedelta(days=i) for i in range(3)]), + ) + ) + result = df.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) + tm.assert_series_equal(result, expected) + + df["C"] = df["A"] + df["B"] + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) + tm.assert_series_equal(result, expected) + + # mixed int types + df["D"] = 1 + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) + tm.assert_series_equal(result, expected) + + def test_arg_for_errors_in_astype(self): + # issue #14878 + + df = DataFrame([1, 2, 3]) + + with pytest.raises(ValueError): + df.astype(np.float64, errors=True) + + df.astype(np.int8, errors="ignore") + + def test_arg_for_errors_in_astype_dictlist(self): + # GH-25905 + df = pd.DataFrame( + [ + {"a": "1", "b": "16.5%", "c": "test"}, + {"a": "2.2", "b": "15.3", "c": "another_test"}, + ] + ) + expected = pd.DataFrame( + [ + {"a": 1.0, "b": "16.5%", "c": "test"}, + {"a": 2.2, "b": "15.3", "c": "another_test"}, + ] + ) + type_dict = {"a": "float64", "b": "float64", "c": "object"} + + result = df.astype(dtype=type_dict, errors="ignore") + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) + def 
test_constructor_list_str(self, input_vals, string_dtype): + # GH 16605 + # Ensure that data elements are converted to strings when + # dtype is str, 'str', or 'U' + + result = DataFrame({"A": input_vals}, dtype=string_dtype) + expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) + tm.assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + } + ), + True, + ), + # multi-extension + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])} + ), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])} + ), + False, + ), + ], + ) + def test_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected + + def test_asarray_homogenous(self): + df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) + result = np.asarray(df) + # may change from object in the future + expected = np.array([[1, 1], [2, 2]], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + def test_str_to_small_float_conversion_type(self): + # GH 20388 + np.random.seed(13) + col_data = [str(np.random.random() * 1e-12) for _ in range(5)] + result = pd.DataFrame(col_data, columns=["A"]) + expected = pd.DataFrame(col_data, columns=["A"], dtype=object) + tm.assert_frame_equal(result, expected) + # change the dtype of the elements from object to float one by one + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = pd.DataFrame(col_data, columns=["A"], dtype=float) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] + ) + def test_convert_dtypes(self, convert_integer, expected): + # Specific types are tested in tests/series/test_dtypes.py + # Just check that it works for DataFrame here + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + } + ) + result = df.convert_dtypes(True, True, convert_integer, False) + expected = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=expected), + "b": pd.Series(["x", "y", "z"], dtype="string"), + } + ) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameDatetimeWithTZ: + def test_interleave(self, timezone_frame): + + # interleave with object + result = timezone_frame.assign(D="foo").values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ["foo", "foo", "foo"], + ], + 
dtype=object, + ).T + tm.assert_numpy_array_equal(result, expected) + + # interleave with only datetime64[ns] + result = timezone_frame.values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + tm.assert_numpy_array_equal(result, expected) + + def test_astype(self, timezone_frame): + # astype + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + expected = DataFrame( + expected, + index=timezone_frame.index, + columns=timezone_frame.columns, + dtype=object, + ) + result = timezone_frame.astype(object) + tm.assert_frame_equal(result, expected) + + result = timezone_frame.astype("datetime64[ns]") + expected = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": ( + date_range("20130101", periods=3, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ), + "C": ( + date_range("20130101", periods=3, tz="CET") + .tz_convert("UTC") + .tz_localize(None) + ), + } + ) + expected.iloc[1, 1] = pd.NaT + expected.iloc[1, 2] = pd.NaT + tm.assert_frame_equal(result, expected) + + def test_astype_str(self, timezone_frame): + # str formatting + result = timezone_frame.astype(str) + expected = DataFrame( + [ + [ + "2013-01-01", + "2013-01-01 00:00:00-05:00", + "2013-01-01 00:00:00+01:00", + ], + ["2013-01-02", "NaT", "NaT"], + [ + "2013-01-03", + "2013-01-03 00:00:00-05:00", + "2013-01-03 00:00:00+01:00", + ], + ], + columns=timezone_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + with option_context("display.max_columns", 20): + result = str(timezone_frame) + assert ( + "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" + ) in result + assert ( + "1 2013-01-02 NaT NaT" + ) in result + assert ( + "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" + ) in result diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_join.py b/venv/Lib/site-packages/pandas/tests/frame/test_join.py new file mode 100644 index 0000000..c6e28f3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_join.py @@ -0,0 +1,218 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, period_range +import pandas._testing as tm + + +@pytest.fixture +def frame_with_period_index(): + return DataFrame( + data=np.arange(20).reshape(4, 5), + columns=list("abcde"), + index=period_range(start="2000", freq="A", periods=4), + ) + + +@pytest.fixture +def left(): + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def right(): + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) + + +@pytest.mark.parametrize( + "how, sort, expected", + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, 
index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]), + ), + ( + "right", + True, + DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], +) +def test_join(left, right, how, sort, expected): + + result = left.join(right, how=how, sort=sort) + tm.assert_frame_equal(result, expected) + + +def test_join_index(float_frame): + # left / right + + f = float_frame.loc[float_frame.index[:10], ["A", "B"]] + f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1] + + joined = f.join(f2) + tm.assert_index_equal(f.index, joined.index) + expected_columns = Index(["A", "B", "C", "D"]) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how="left") + tm.assert_index_equal(joined.index, f.index) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how="right") + tm.assert_index_equal(joined.index, f2.index) + tm.assert_index_equal(joined.columns, expected_columns) + + # inner + + joined = f.join(f2, how="inner") + tm.assert_index_equal(joined.index, f.index[5:10]) + tm.assert_index_equal(joined.columns, expected_columns) + + # outer + + joined = f.join(f2, how="outer") + tm.assert_index_equal(joined.index, float_frame.index.sort_values()) + tm.assert_index_equal(joined.columns, expected_columns) + + with pytest.raises(ValueError, match="join method"): + f.join(f2, how="foo") + + # corner case - overlapping columns + msg = "columns overlap but no suffix" + for how in ("outer", "left", "inner"): + with pytest.raises(ValueError, match=msg): + float_frame.join(float_frame, how=how) + + +def test_join_index_more(float_frame): + af = float_frame.loc[:, ["A", "B"]] + bf = float_frame.loc[::2, ["C", "D"]] + + expected = af.copy() + expected["C"] = float_frame["C"][::2] + expected["D"] = float_frame["D"][::2] + + result = af.join(bf) + tm.assert_frame_equal(result, expected) + + result = af.join(bf, how="right") + tm.assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how="right") + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + +def test_join_index_series(float_frame): + df = float_frame.copy() + s = df.pop(float_frame.columns[-1]) + joined = df.join(s) + + # TODO should this check_names ? 
+ tm.assert_frame_equal(joined, float_frame, check_names=False) + + s.name = None + with pytest.raises(ValueError, match="must have a name"): + df.join(s) + + +def test_join_overlap(float_frame): + df1 = float_frame.loc[:, ["A", "B", "C"]] + df2 = float_frame.loc[:, ["B", "C", "D"]] + + joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2") + df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1") + df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2") + + no_overlap = float_frame.loc[:, ["A", "D"]] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) + + +def test_join_period_index(frame_with_period_index): + other = frame_with_period_index.rename(columns=lambda x: "{key}{key}".format(key=x)) + + joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1) + + joined_cols = frame_with_period_index.columns.append(other.columns) + + joined = frame_with_period_index.join(other) + expected = DataFrame( + data=joined_values, columns=joined_cols, index=frame_with_period_index.index + ) + + tm.assert_frame_equal(joined, expected) + + +def test_join_left_sequence_non_unique_index(): + # https://github.com/pandas-dev/pandas/issues/19607 + df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4]) + + joined = df1.join([df2, df3], how="left") + + expected = DataFrame( + { + "a": [0, 10, 10, 20], + "b": [np.nan, 300, 300, 200], + "c": [np.nan, 400, 500, np.nan], + }, + index=[1, 2, 2, 3], + ) + + tm.assert_frame_equal(joined, expected) + + +@pytest.mark.parametrize("sort_kw", [True, False]) +def test_suppress_future_warning_with_sort_kw(sort_kw): + a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) + + b = DataFrame({"col2": [4, 5]}, index=["b", "a"]) + + c = DataFrame({"col3": [7, 8]}, index=["a", "b"]) + + expected = DataFrame( + { + "col1": {"a": 2.0, "b": float("nan"), "c": 1.0}, + "col2": {"a": 5.0, "b": 4.0, "c": float("nan")}, + "col3": {"a": 7.0, "b": 8.0, "c": float("nan")}, + } + ) + if sort_kw is False: + expected = expected.reindex(index=["c", "a", "b"]) + + with tm.assert_produces_warning(None, check_stacklevel=False): + result = a.join([b, c], how="outer", sort=sort_kw) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_missing.py b/venv/Lib/site-packages/pandas/tests/frame/test_missing.py new file mode 100644 index 0000000..2e6759c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_missing.py @@ -0,0 +1,985 @@ +import datetime + +import dateutil +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm +from pandas.tests.frame.common import _check_mixed_float + + +class TestDataFrameMissingData: + def test_dropEmptyRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.randn(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + original = Series(mat, index=float_frame.index, name="foo") + expected = original.dropna() + inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna(how="all") + # check that original was preserved + tm.assert_series_equal(frame["foo"], original) + inplace_frame1.dropna(how="all", inplace=True) + tm.assert_series_equal(smaller_frame["foo"], 
expected) + tm.assert_series_equal(inplace_frame1["foo"], expected) + + smaller_frame = frame.dropna(how="all", subset=["foo"]) + inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + tm.assert_series_equal(smaller_frame["foo"], expected) + tm.assert_series_equal(inplace_frame2["foo"], expected) + + def test_dropIncompleteRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.randn(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + frame["bar"] = 5 + original = Series(mat, index=float_frame.index, name="foo") + inp_frame1, inp_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna() + tm.assert_series_equal(frame["foo"], original) + inp_frame1.dropna(inplace=True) + + exp = Series(mat[5:], index=float_frame.index[5:], name="foo") + tm.assert_series_equal(smaller_frame["foo"], exp) + tm.assert_series_equal(inp_frame1["foo"], exp) + + samesize_frame = frame.dropna(subset=["bar"]) + tm.assert_series_equal(frame["foo"], original) + assert (frame["bar"] == 5).all() + inp_frame2.dropna(subset=["bar"], inplace=True) + tm.assert_index_equal(samesize_frame.index, float_frame.index) + tm.assert_index_equal(inp_frame2.index, float_frame.index) + + def test_dropna(self): + df = DataFrame(np.random.randn(6, 4)) + df[2][:2] = np.nan + + dropped = df.dropna(axis=1) + expected = df.loc[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=0) + expected = df.loc[list(range(2, 6))] + inp = df.copy() + inp.dropna(axis=0, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + + # threshold + dropped = df.dropna(axis=1, thresh=5) + expected = df.loc[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, thresh=5, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=0, thresh=4) + expected = df.loc[range(2, 6)] + inp = df.copy() + inp.dropna(axis=0, thresh=4, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=1, thresh=4) + tm.assert_frame_equal(dropped, df) + + dropped = df.dropna(axis=1, thresh=3) + tm.assert_frame_equal(dropped, df) + + # subset + dropped = df.dropna(axis=0, subset=[0, 1, 3]) + inp = df.copy() + inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) + tm.assert_frame_equal(dropped, df) + tm.assert_frame_equal(inp, df) + + # all + dropped = df.dropna(axis=1, how="all") + tm.assert_frame_equal(dropped, df) + + df[2] = np.nan + dropped = df.dropna(axis=1, how="all") + expected = df.loc[:, [0, 1, 3]] + tm.assert_frame_equal(dropped, expected) + + # bad input + msg = "No axis named 3 for object type " + with pytest.raises(ValueError, match=msg): + df.dropna(axis=3) + + def test_drop_and_dropna_caching(self): + # tst that cacher updates + original = Series([1, 2, np.nan], name="A") + expected = Series([1, 2], dtype=original.dtype, name="A") + df = pd.DataFrame({"A": original.values.copy()}) + df2 = df.copy() + df["A"].dropna() + tm.assert_series_equal(df["A"], original) + df["A"].dropna(inplace=True) + tm.assert_series_equal(df["A"], expected) + df2["A"].drop([1]) + tm.assert_series_equal(df2["A"], original) + df2["A"].drop([1], inplace=True) + tm.assert_series_equal(df2["A"], original.drop([1])) + + def test_dropna_corner(self, float_frame): + # bad input + msg = "invalid how option: foo" + with pytest.raises(ValueError, 
match=msg): + float_frame.dropna(how="foo") + msg = "must specify how or thresh" + with pytest.raises(TypeError, match=msg): + float_frame.dropna(how=None) + # non-existent column - 8303 + with pytest.raises(KeyError, match=r"^\['X'\]$"): + float_frame.dropna(subset=["A", "X"]) + + def test_dropna_multiple_axes(self): + df = DataFrame( + [ + [1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9], + ] + ) + + # GH20987 + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=[0, 1]) + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=(0, 1)) + + inp = df.copy() + with pytest.raises(TypeError, match="supplying multiple axes"): + inp.dropna(how="all", axis=(0, 1), inplace=True) + + def test_dropna_tz_aware_datetime(self): + # GH13407 + df = DataFrame() + dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) + dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) + df["Time"] = [dt1] + result = df.dropna(axis=0) + expected = DataFrame({"Time": [dt1]}) + tm.assert_frame_equal(result, expected) + + # Ex2 + df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) + result = df.dropna(axis=0) + expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) + tm.assert_frame_equal(result, expected) + + def test_dropna_categorical_interval_index(self): + # GH 25087 + ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) + ci = pd.CategoricalIndex(ii) + df = pd.DataFrame({"A": list("abc")}, index=ci) + + expected = df + result = df.dropna() + tm.assert_frame_equal(result, expected) + + def test_fillna_datetime(self, datetime_frame): + tf = datetime_frame + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan + + zero_filled = datetime_frame.fillna(0) + assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() + + padded = datetime_frame.fillna(method="pad") + assert np.isnan(padded.loc[padded.index[:5], "A"]).all() + assert ( + padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] + ).all() + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna() + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna(5, method="ffill") + + def test_fillna_mixed_type(self, float_string_frame): + + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + # TODO: make stronger assertion here, GH 25640 + mf.fillna(value=0) + mf.fillna(method="pad") + + def test_fillna_mixed_float(self, mixed_float_frame): + + # mixed numeric (but no float16) + mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan + result = mf.fillna(value=0) + _check_mixed_float(result, dtype=dict(C=None)) + + result = mf.fillna(method="pad") + _check_mixed_float(result, dtype=dict(C=None)) + + def test_fillna_empty(self): + # empty frame (GH #2778) + df = DataFrame(columns=["x"]) + for m in ["pad", "backfill"]: + df.x.fillna(method=m, inplace=True) + df.x.fillna(method=m) + + def test_fillna_different_dtype(self): + # with different dtype (GH#3386) + df = DataFrame( + [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] + ) + + result = df.fillna({2: "foo"}) + expected = DataFrame( + [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] + ) + tm.assert_frame_equal(result, expected) + + df.fillna({2: "foo"}, 
inplace=True) + tm.assert_frame_equal(df, expected) + + def test_fillna_limit_and_value(self): + # limit and value + df = DataFrame(np.random.randn(10, 3)) + df.iloc[2:7, 0] = np.nan + df.iloc[3:5, 2] = np.nan + + expected = df.copy() + expected.iloc[2, 0] = 999 + expected.iloc[3, 2] = 999 + result = df.fillna(999, limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_datelike(self): + # with datelike + # GH#6344 + df = DataFrame( + { + "Date": [pd.NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), pd.NaT], + } + ) + + expected = df.copy() + expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) + result = df.fillna(value={"Date": df["Date2"]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_tzaware(self): + # with timezone + # GH#15855 + df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + tm.assert_frame_equal(df.fillna(method="pad"), exp) + + df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + tm.assert_frame_equal(df.fillna(method="bfill"), exp) + + def test_fillna_tzaware_different_column(self): + # with timezone in another column + # GH#15522 + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1, 2, np.nan, np.nan], + } + ) + result = df.fillna(method="pad") + expected = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1.0, 2.0, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + def test_na_actions_categorical(self): + + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + vals = ["a", "b", np.nan, "d"] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + vals2 = ["a", "b", "b", "d"] + df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) + cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) + vals3 = ["a", "b", np.nan] + df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) + cat4 = Categorical([1, 2], categories=[1, 2, 3]) + vals4 = ["a", "b"] + df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) + + # fillna + res = df.fillna(value={"cats": 3, "vals": "b"}) + tm.assert_frame_equal(res, df_exp_fill) + + with pytest.raises(ValueError, match=("fill value must be in categories")): + df.fillna(value={"cats": 4, "vals": "c"}) + + res = df.fillna(method="pad") + tm.assert_frame_equal(res, df_exp_fill) + + # dropna + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) + df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) + df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_fillna_categorical_nan(self): + # GH 14021 + # np.nan should always be a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + with tm.assert_produces_warning(RuntimeWarning): + res = df.fillna(df.median()) + v_exp = 
[np.nan, np.nan, np.nan] + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT] + ) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.PeriodIndex( + ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M" + ) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT]) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + def test_fillna_downcast(self): + # GH 15277 + # infer int64 from float64 + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna(0, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna({"a": 0}, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_dtype_conversion(self): + # make sure that fillna on an empty frame works + df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + result = df.dtypes + expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + result = df.fillna(1) + expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + tm.assert_frame_equal(result, expected) + + # empty block + df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + # equiv of replace + df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) + for v in ["", 1, np.nan, 1.0]: + expected = df.replace(np.nan, v) + result = df.fillna(v) + tm.assert_frame_equal(result, expected) + + def test_fillna_datetime_columns(self): + # GH 7095 + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"], + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=pd.date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + def test_ffill(self, datetime_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + + tm.assert_frame_equal( + datetime_frame.ffill(), datetime_frame.fillna(method="ffill") + ) + + def test_bfill(self, datetime_frame): + datetime_frame["A"][:5] = 
np.nan + datetime_frame["A"][-5:] = np.nan + + tm.assert_frame_equal( + datetime_frame.bfill(), datetime_frame.fillna(method="bfill") + ) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method="pad", limit=5) + + expected = df[:2].reindex(index).fillna(method="pad") + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method="backfill", limit=5) + + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = df[:2].reindex(index).fillna(method="pad") + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method="backfill", limit=5) + + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_fillna_skip_certain_blocks(self): + # don't try to fill boolean, int blocks + + df = DataFrame(np.random.randn(10, 4).astype(int)) + + # it works! + df.fillna(np.nan) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_positive_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be greater than 0" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=-5) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_integer_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be an integer" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=0.5) + + def test_fillna_inplace(self): + df = DataFrame(np.random.randn(10, 4)) + df[1][:4] = np.nan + df[3][-4:] = np.nan + + expected = df.fillna(value=0) + assert expected is not df + + df.fillna(value=0, inplace=True) + tm.assert_frame_equal(df, expected) + + expected = df.fillna(value={0: 0}, inplace=True) + assert expected is None + + df[1][:4] = np.nan + df[3][-4:] = np.nan + expected = df.fillna(method="ffill") + assert expected is not df + + df.fillna(method="ffill", inplace=True) + tm.assert_frame_equal(df, expected) + + def test_fillna_dict_series(self): + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + + result = df.fillna({"a": 0, "b": 5}) + + expected = df.copy() + expected["a"] = expected["a"].fillna(0) + expected["b"] = expected["b"].fillna(5) + tm.assert_frame_equal(result, expected) + + # it works + result = df.fillna({"a": 0, "b": 5, "d": 7}) + + # Series treated same as dict + result = df.fillna(df.max()) + expected = df.fillna(df.max().to_dict()) + tm.assert_frame_equal(result, expected) + + # disable this for now + with pytest.raises(NotImplementedError, match="column by column"): + df.fillna(df.max(1), axis=1) + + def test_fillna_dataframe(self): + # GH 8377 + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + # df2 may have different index and columns + df2 = DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) + + result = 
df.fillna(df2) + + # only those columns and indices which are shared get filled + expected = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, 40], + "b": [1, 2, 3, np.nan, 90], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + tm.assert_frame_equal(result, expected) + + def test_fillna_columns(self): + df = DataFrame(np.random.randn(10, 10)) + df.values[:, ::2] = np.nan + + result = df.fillna(method="ffill", axis=1) + expected = df.T.fillna(method="pad").T + tm.assert_frame_equal(result, expected) + + df.insert(6, "foo", 5) + result = df.fillna(method="ffill", axis=1) + expected = df.astype(float).fillna(method="ffill", axis=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_invalid_method(self, float_frame): + with pytest.raises(ValueError, match="ffil"): + float_frame.fillna(method="ffil") + + def test_fillna_invalid_value(self, float_frame): + # list + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' + with pytest.raises(TypeError, match=msg.format("list")): + float_frame.fillna([1, 2]) + # tuple + with pytest.raises(TypeError, match=msg.format("tuple")): + float_frame.fillna((1, 2)) + # frame with series + msg = ( + '"value" parameter must be a scalar, dict or Series, but you' + ' passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + float_frame.iloc[:, 0].fillna(float_frame) + + def test_fillna_col_reordering(self): + cols = ["COL." + str(i) for i in range(5, 0, -1)] + data = np.random.rand(20, 5) + df = DataFrame(index=range(20), columns=cols, data=data) + filled = df.fillna(method="ffill") + assert df.columns.tolist() == filled.columns.tolist() + + def test_fill_corner(self, float_frame, float_string_frame): + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + + filled = float_string_frame.fillna(value=0) + assert (filled.loc[filled.index[5:20], "foo"] == 0).all() + del float_string_frame["foo"] + + empty_float = float_frame.reindex(columns=[]) + + # TODO(wesm): unused? 
+ result = empty_float.fillna(value=0) # noqa + + def test_fill_value_when_combine_const(self): + # GH12723 + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = DataFrame({"foo": dat}, index=range(6)) + + exp = df.fillna(0).add(2) + res = df.add(2, fill_value=0) + tm.assert_frame_equal(res, exp) + + +class TestDataFrameInterpolate: + def test_interp_basic(self): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + expected = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0], + "B": [1.0, 4.0, 9.0, 9.0], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + result = df.interpolate() + tm.assert_frame_equal(result, expected) + + result = df.set_index("C").interpolate() + expected = df.set_index("C") + expected.loc[3, "A"] = 3 + expected.loc[5, "B"] = 9 + tm.assert_frame_equal(result, expected) + + def test_interp_bad_method(self): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + with pytest.raises(ValueError): + df.interpolate(method="not_a_method") + + def test_interp_combo(self): + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + result = df["A"].interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], name="A") + tm.assert_series_equal(result, expected) + + result = df["A"].interpolate(downcast="infer") + expected = Series([1, 2, 3, 4], name="A") + tm.assert_series_equal(result, expected) + + def test_interp_nan_idx(self): + df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) + df = df.set_index("A") + with pytest.raises(NotImplementedError): + df.interpolate(method="values") + + @td.skip_if_no_scipy + def test_interp_various(self): + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") + expected = df.copy() + result = df.interpolate(method="polynomial", order=1) + + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923076 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="cubic") + # GH #15662. 
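+        # Descriptive note: 'cubic', 'nearest', 'quadratic', 'slinear' and 'zero'
+        # all dispatch to scipy.interpolate, which is why this test carries the
+        # @td.skip_if_no_scipy decorator.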
+ expected.A.loc[3] = 2.81547781 + expected.A.loc[13] = 5.52964175 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="nearest") + expected.A.loc[3] = 2 + expected.A.loc[13] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method="quadratic") + expected.A.loc[3] = 2.82150771 + expected.A.loc[13] = 6.12648668 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="slinear") + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923077 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="zero") + expected.A.loc[3] = 2.0 + expected.A.loc[13] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + @td.skip_if_no_scipy + def test_interp_alt_scipy(self): + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + result = df.interpolate(method="barycentric") + expected = df.copy() + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="barycentric", downcast="infer") + tm.assert_frame_equal(result, expected.astype(np.int64)) + + result = df.interpolate(method="krogh") + expectedk = df.copy() + expectedk["A"] = expected["A"] + tm.assert_frame_equal(result, expectedk) + + result = df.interpolate(method="pchip") + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6.0 + + tm.assert_frame_equal(result, expected) + + def test_interp_rowwise(self): + df = DataFrame( + { + 0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4], + } + ) + result = df.interpolate(axis=1) + expected = df.copy() + expected.loc[3, 1] = 5 + expected.loc[0, 2] = 3 + expected.loc[1, 3] = 3 + expected[4] = expected[4].astype(np.float64) + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=1, method="values") + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=0) + expected = df.interpolate() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis_name, axis_number", + [ + pytest.param("rows", 0, id="rows_0"), + pytest.param("index", 0, id="index_0"), + pytest.param("columns", 1, id="columns_1"), + ], + ) + def test_interp_axis_names(self, axis_name, axis_number): + # GH 29132: test axis names + data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]} + + df = DataFrame(data, dtype=np.float64) + result = df.interpolate(axis=axis_name, method="linear") + expected = df.interpolate(axis=axis_number, method="linear") + tm.assert_frame_equal(result, expected) + + def test_rowwise_alt(self): + df = DataFrame( + { + 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1], + } + ) + df.interpolate(axis=0) + + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) + def test_interp_leading_nans(self, check_scipy): + df = DataFrame( + {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]} + ) + result = df.interpolate() + expected = df.copy() + expected["B"].loc[3] = -3.75 + tm.assert_frame_equal(result, expected) + + if check_scipy: + result = df.interpolate(method="polynomial", order=1) + tm.assert_frame_equal(result, expected) + + def test_interp_raise_on_only_mixed(self): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": ["a", "b", "c", "d"], + "C": [np.nan, 2, 5, 7], + "D": [np.nan, np.nan, 9, 9], + "E": [1, 2, 3, 4], + } + ) + with pytest.raises(TypeError): + 
df.interpolate(axis=1) + + def test_interp_raise_on_all_object_dtype(self): + # GH 22985 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + with pytest.raises(TypeError, match=msg): + df.interpolate() + + def test_interp_inplace(self): + df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) + result = df.copy() + result["a"].interpolate(inplace=True) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result["a"].interpolate(inplace=True, downcast="infer") + tm.assert_frame_equal(result, expected.astype("int64")) + + def test_interp_inplace_row(self): + # GH 10395 + result = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} + ) + expected = result.interpolate(method="linear", axis=1, inplace=False) + result.interpolate(method="linear", axis=1, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 2, 3, 4], + "C": [1.0, 2.0, np.nan, 4.0], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) + expected = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="float64"), + "B": np.array([1, 2, 3, 4], dtype="int64"), + "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"), + "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), + } + ) + + result = df.interpolate(downcast=None) + tm.assert_frame_equal(result, expected) + + # all good + result = df[["B", "D"]].interpolate(downcast=None) + tm.assert_frame_equal(result, df[["B", "D"]]) + + @pytest.mark.parametrize("axis", [0, 1]) + def test_interp_time_inplace_axis(self, axis): + # GH 9687 + periods = 5 + idx = pd.date_range(start="2014-01-01", periods=periods) + data = np.random.rand(periods, periods) + data[data < 0.5] = np.nan + expected = pd.DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + expected.interpolate(axis=0, method="time", inplace=True) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_mutate_columns.py b/venv/Lib/site-packages/pandas/tests/frame/test_mutate_columns.py new file mode 100644 index 0000000..8bc2aa2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_mutate_columns.py @@ -0,0 +1,262 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm + +# Column add, remove, delete. 
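+# Minimal sketch of the behaviours exercised below (assumes a recent pandas):
+#   df = DataFrame({"A": [1, 2, 3]})
+#   df.assign(B=lambda x: x.A * 2)   # returns a new frame; df itself is left unchanged
+#   df.insert(0, "C", [7, 8, 9])     # in-place insert at position 0; an existing name raises ValueError
+#   df.pop("A")                      # removes the column in place and returns it as a Series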
+ + +class TestDataFrameMutateColumns: + def test_assign(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + original = df.copy() + result = df.assign(C=df.B / df.A) + expected = df.copy() + expected["C"] = [4, 2.5, 2] + tm.assert_frame_equal(result, expected) + + # lambda syntax + result = df.assign(C=lambda x: x.B / x.A) + tm.assert_frame_equal(result, expected) + + # original is unmodified + tm.assert_frame_equal(df, original) + + # Non-Series array-like + result = df.assign(C=[4, 2.5, 2]) + tm.assert_frame_equal(result, expected) + # original is unmodified + tm.assert_frame_equal(df, original) + + result = df.assign(B=df.B / df.A) + expected = expected.drop("B", axis=1).rename(columns={"C": "B"}) + tm.assert_frame_equal(result, expected) + + # overwrite + result = df.assign(A=df.A + df.B) + expected = df.copy() + expected["A"] = [5, 7, 9] + tm.assert_frame_equal(result, expected) + + # lambda + result = df.assign(A=lambda x: x.A + x.B) + tm.assert_frame_equal(result, expected) + + def test_assign_multiple(self): + df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"]) + result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) + expected = DataFrame( + [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE") + ) + tm.assert_frame_equal(result, expected) + + def test_assign_order(self): + # GH 9818 + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + result = df.assign(D=df.A + df.B, C=df.A - df.B) + + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) + tm.assert_frame_equal(result, expected) + result = df.assign(C=df.A - df.B, D=df.A + df.B) + + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) + + tm.assert_frame_equal(result, expected) + + def test_assign_bad(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + # non-keyword argument + with pytest.raises(TypeError): + df.assign(lambda x: x.A) + with pytest.raises(AttributeError): + df.assign(C=df.A, D=df.A + df.C) + + def test_assign_dependent(self): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) + tm.assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) + tm.assert_frame_equal(result, expected) + + def test_insert_error_msmgs(self): + + # GH 7432 + df = DataFrame( + {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]} + ).set_index("foo") + s = DataFrame( + {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]} + ).set_index("foo") + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df["newcol"] = s + + # GH 4107, more descriptive error message + df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]) + + msg = "incompatible index of inserted column with frame index" + with pytest.raises(TypeError, match=msg): + df["gr"] = df.groupby(["b", "c"]).count() + + def test_insert_benchmark(self): + # from the vb_suite/frame_methods/frame_insert_columns + N = 10 + K = 5 + df = DataFrame(index=range(N)) + new_col = np.random.randn(N) + for i in range(K): + df[i] = new_col + expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) + tm.assert_frame_equal(df, expected) + + def test_insert(self): + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) + + 
df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) + + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) + + # diff dtype + + # new item + df["x"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 5 + [np.dtype("float32")], + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + # replacing current (in different block) + df["a"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + df["y"] = df["a"].astype("int32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], + index=["foo", "c", "bar", "b", "a", "x", "y"], + ) + tm.assert_series_equal(result, expected) + + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) + msg = "cannot insert c, already exists" + with pytest.raises(ValueError, match=msg): + df.insert(1, "c", df["b"]) + + df.columns.name = "some_name" + # preserve columns name field + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" + + # GH 13522 + df = DataFrame(index=["A", "B", "C"]) + df["X"] = df.index + df["X"] = ["x", "y", "z"] + exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + tm.assert_frame_equal(df, exp) + + def test_delitem(self, float_frame): + del float_frame["A"] + assert "A" not in float_frame + + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + df = DataFrame(np.random.randn(4, 4), columns=midx) + assert len(df.columns) == 4 + assert ("A",) in df.columns + assert "A" in df.columns + + result = df["A"] + assert isinstance(result, DataFrame) + del df["A"] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ("A",) not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df[("A",)] + + # behavior of dropped/deleted MultiIndex levels changed from + # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' + # levels which are dropped/deleted + assert "A" not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df["A"] + + def test_pop(self, float_frame): + float_frame.columns.name = "baz" + + float_frame.pop("A") + assert "A" not in float_frame + + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" + + # gh-10912: inplace ops cause caching issue + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") + b += 1 + + # original frame + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) + tm.assert_frame_equal(a, expected) + + # result + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 + tm.assert_series_equal(b, expected) + + def test_pop_non_unique_cols(self): + df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) + df.columns = ["a", "b", "a"] + + res = df.pop("a") + assert type(res) == DataFrame + assert len(res) == 2 + assert len(df.columns) == 1 + assert "b" in df.columns + assert "a" not in df.columns + assert len(df.index) == 2 + + def 
test_insert_column_bug_4032(self): + + # GH4032, inserting a column and renaming causing errors + df = DataFrame({"b": [1.1, 2.2]}) + df = df.rename(columns={}) + df.insert(0, "a", [1, 2]) + + result = df.rename(columns={}) + str(result) + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + df.insert(0, "c", [1.3, 2.3]) + + result = df.rename(columns={}) + str(result) + + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_nonunique_indexes.py b/venv/Lib/site-packages/pandas/tests/frame/test_nonunique_indexes.py new file mode 100644 index 0000000..32ead40 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_nonunique_indexes.py @@ -0,0 +1,526 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series, date_range +import pandas._testing as tm + + +class TestDataFrameNonuniqueIndexes: + def test_column_dups_operations(self): + def check(result, expected=None): + if expected is not None: + tm.assert_frame_equal(result, expected) + result.dtypes + str(result) + + # assignment + # GH 3687 + arr = np.random.randn(3, 2) + idx = list(range(2)) + df = DataFrame(arr, columns=["A", "A"]) + df.columns = idx + expected = DataFrame(arr, columns=idx) + check(df, expected) + + idx = date_range("20130101", periods=4, freq="Q-NOV") + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"] + ) + df.columns = idx + expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) + check(df, expected) + + # insert + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], + columns=["foo", "bar", "foo", "hello"], + ) + df["string"] = "bah" + expected = DataFrame( + [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]], + columns=["foo", "bar", "foo", "hello", "string"], + ) + check(df, expected) + with pytest.raises(ValueError, match="Length of value"): + df.insert(0, "AnotherColumn", range(len(df.index) - 1)) + + # insert same dtype + df["foo2"] = 3 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) + check(df, expected) + + # set (non-dup) + df["foo2"] = 4 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) + check(df, expected) + df["foo2"] = 3 + + # delete (non dup) + del df["bar"] + expected = DataFrame( + [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]], + columns=["foo", "foo", "hello", "string", "foo2"], + ) + check(df, expected) + + # try to delete again (its not consolidated) + del df["hello"] + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", "foo2"], + ) + check(df, expected) + + # consolidate + df = df._consolidate() + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", "foo2"], + ) + check(df, expected) + + # insert + df.insert(2, "new_col", 5.0) + expected = DataFrame( + [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]], + columns=["foo", "foo", "new_col", "string", "foo2"], + ) + check(df, expected) + + # insert a dup + with pytest.raises(ValueError, match="cannot insert"): + df.insert(2, "new_col", 4.0) + 
+ df.insert(2, "new_col", 4.0, allow_duplicates=True) + expected = DataFrame( + [ + [1, 1, 4.0, 5.0, "bah", 3], + [1, 2, 4.0, 5.0, "bah", 3], + [2, 3, 4.0, 5.0, "bah", 3], + ], + columns=["foo", "foo", "new_col", "new_col", "string", "foo2"], + ) + check(df, expected) + + # delete (dup) + del df["foo"] + expected = DataFrame( + [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]], + columns=["new_col", "new_col", "string", "foo2"], + ) + tm.assert_frame_equal(df, expected) + + # dup across dtypes + df = DataFrame( + [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]], + columns=["foo", "bar", "foo", "hello"], + ) + check(df) + + df["foo2"] = 7.0 + expected = DataFrame( + [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) + check(df, expected) + + result = df["foo"] + expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"]) + check(result, expected) + + # multiple replacements + df["foo"] = "string" + expected = DataFrame( + [ + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) + check(df, expected) + + del df["foo"] + expected = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"] + ) + check(df, expected) + + # values + df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"]) + result = df.values + expected = np.array([[1, 2.5], [3, 4.5]]) + assert (result == expected).all().all() + + # rename, GH 4403 + df4 = DataFrame( + {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]}, + index=MultiIndex.from_tuples( + [(600809, 20130331)], names=["STK_ID", "RPT_Date"] + ), + ) + + df5 = DataFrame( + { + "RPT_Date": [20120930, 20121231, 20130331], + "STK_ID": [600809] * 3, + "STK_Name": ["饡驦", "饡驦", "饡驦"], + "TClose": [38.05, 41.66, 30.01], + }, + index=MultiIndex.from_tuples( + [(600809, 20120930), (600809, 20121231), (600809, 20130331)], + names=["STK_ID", "RPT_Date"], + ), + ) + + k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True) + result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"}) + str(result) + result.dtypes + + expected = DataFrame( + [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]], + columns=[ + "RT", + "TClose", + "TExg", + "RPT_Date", + "STK_ID", + "STK_Name", + "QT_Close", + ], + ).set_index(["STK_ID", "RPT_Date"], drop=False) + tm.assert_frame_equal(result, expected) + + # reindex is invalid! 
+ df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df.reindex(columns=["bar"]) + with pytest.raises(ValueError, match=msg): + df.reindex(columns=["bar", "foo"]) + + # drop + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + result = df.drop(["a"], axis=1) + expected = DataFrame([[1], [1], [1]], columns=["bar"]) + check(result, expected) + result = df.drop("a", axis=1) + check(result, expected) + + # describe + df = DataFrame( + [[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=["bar", "a", "a"], + dtype="float64", + ) + result = df.describe() + s = df.iloc[:, 0].describe() + expected = pd.concat([s, s, s], keys=df.columns, axis=1) + check(result, expected) + + # check column dups with index equal and not equal to df's index + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "A"], + ) + for index in [df.index, pd.Index(list("edcba"))]: + this_df = df.copy() + expected_ser = pd.Series(index.values, index=this_df.index) + expected_df = DataFrame( + {"A": expected_ser, "B": this_df["B"], "A": expected_ser}, + columns=["A", "B", "A"], + ) + this_df["A"] = index + check(this_df, expected_df) + + # operations + for op in ["__add__", "__mul__", "__sub__", "__truediv__"]: + df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) + expected = getattr(df, op)(df) + expected.columns = ["A", "A"] + df.columns = ["A", "A"] + result = getattr(df, op)(df) + check(result, expected) + + # multiple assignments that change dtypes + # the location indexer is a slice + # GH 6120 + df = DataFrame(np.random.randn(5, 2), columns=["that", "that"]) + expected = DataFrame(1.0, index=range(5), columns=["that", "that"]) + + df["that"] = 1.0 + check(df, expected) + + df = DataFrame(np.random.rand(5, 2), columns=["that", "that"]) + expected = DataFrame(1, index=range(5), columns=["that", "that"]) + + df["that"] = 1 + check(df, expected) + + def test_column_dups2(self): + + # drop buggy GH 6240 + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) + + expected = df.take([0, 1, 1], axis=1) + df2 = df.take([2, 0, 1, 2, 1], axis=1) + result = df2.drop("C", axis=1) + tm.assert_frame_equal(result, expected) + + # dropna + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) + df.iloc[2, [0, 1, 2]] = np.nan + df.iloc[0, 0] = np.nan + df.iloc[1, 1] = np.nan + df.iloc[:, 3] = np.nan + expected = df.dropna(subset=["A", "B", "C"], how="all") + expected.columns = ["A", "A", "B", "C"] + + df.columns = ["A", "A", "B", "C"] + + result = df.dropna(subset=["A", "C"], how="all") + tm.assert_frame_equal(result, expected) + + def test_column_dups_indexing(self): + def check(result, expected=None): + if expected is not None: + tm.assert_frame_equal(result, expected) + result.dtypes + str(result) + + # boolean indexing + # GH 4879 + dups = ["A", "A", "C", "D"] + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) + expected = df[df.C > 6] + expected.columns = dups + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") + result = df[df.C > 6] + check(result, expected) + + # where + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) + 
expected = df[df > 6] + expected.columns = dups + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") + result = df[df > 6] + check(result, expected) + + # boolean with the duplicate raises + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df[df.A > 6] + + # dup aligning operations should work + # GH 5185 + df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3]) + df2 = DataFrame([1, 2, 3], index=[1, 2, 3]) + expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3]) + result = df1.sub(df2) + tm.assert_frame_equal(result, expected) + + # equality + df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"]) + df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"]) + + # not-comparing like-labelled + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + df1 == df2 + + df1r = df1.reindex_like(df2) + result = df1r == df2 + expected = DataFrame( + [[False, True], [True, False], [False, False], [True, False]], + columns=["A", "A"], + ) + tm.assert_frame_equal(result, expected) + + # mixed column selection + # GH 5639 + dfbool = DataFrame( + { + "one": Series([True, True, False], index=["a", "b", "c"]), + "two": Series([False, False, True, False], index=["a", "b", "c", "d"]), + "three": Series([False, True, True, True], index=["a", "b", "c", "d"]), + } + ) + expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1) + result = dfbool[["one", "three", "one"]] + check(result, expected) + + # multi-axis dups + # GH 6121 + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]].copy() + expected = z.loc[["a", "c", "a"]] + + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]] + result = z.loc[["a", "c", "a"]] + check(result, expected) + + def test_column_dups_indexing2(self): + + # GH 8363 + # datetime ops with a non-unique index + df = DataFrame( + {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")}, + index=[2, 2, 3, 3, 4], + ) + result = df.B - df.A + expected = Series(1, index=[2, 2, 3, 3, 4]) + tm.assert_series_equal(result, expected) + + df = DataFrame( + { + "A": date_range("20130101", periods=5), + "B": date_range("20130101 09:00:00", periods=5), + }, + index=[2, 2, 3, 3, 4], + ) + result = df.B - df.A + expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4]) + tm.assert_series_equal(result, expected) + + def test_columns_with_dups(self): + # GH 3468 related + + # basic + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["a", "a.1"] + str(df) + expected = DataFrame([[1, 2]], columns=["a", "a.1"]) + tm.assert_frame_equal(df, expected) + + df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"]) + df.columns = ["b", "a", "a.1"] + str(df) + expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"]) + tm.assert_frame_equal(df, expected) + + # with a dup index + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["b", "b"] + str(df) + expected = DataFrame([[1, 2]], columns=["b", "b"]) + tm.assert_frame_equal(df, expected) + + # multi-dtype + df = DataFrame( + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], + columns=["a", "a", "b", "b", "d", "c", "c"], + ) + df.columns = list("ABCDEFG") + str(df) + expected = 
DataFrame( + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG") + ) + tm.assert_frame_equal(df, expected) + + df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"]) + df.columns = ["a", "a.1", "a.2", "a.3"] + str(df) + expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) + tm.assert_frame_equal(df, expected) + + # dups across blocks + df_float = DataFrame(np.random.randn(10, 3), dtype="float64") + df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) + df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) + df_dt = DataFrame( + pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns + ) + df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) + + assert len(df._data._blknos) == len(df.columns) + assert len(df._data._blklocs) == len(df.columns) + + # testing iloc + for i in range(len(df.columns)): + df.iloc[:, i] + + # dup columns across dtype GH 2079/2194 + vals = [[1, -1, 2.0], [2, -2, 3.0]] + rs = DataFrame(vals, columns=["A", "A", "B"]) + xp = DataFrame(vals) + xp.columns = ["A", "A", "B"] + tm.assert_frame_equal(rs, xp) + + def test_values_duplicates(self): + df = DataFrame( + [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"] + ) + + result = df.values + expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object) + + tm.assert_numpy_array_equal(result, expected) + + def test_set_value_by_index(self): + # See gh-12344 + df = DataFrame(np.arange(9).reshape(3, 3).T) + df.columns = list("AAA") + expected = df.iloc[:, 2] + + df.iloc[:, 0] = 3 + tm.assert_series_equal(df.iloc[:, 2], expected) + + df = DataFrame(np.arange(9).reshape(3, 3).T) + df.columns = [2, float(2), str(2)] + expected = df.iloc[:, 1] + + df.iloc[:, 0] = 3 + tm.assert_series_equal(df.iloc[:, 1], expected) + + def test_insert_with_columns_dups(self): + # GH 14291 + df = pd.DataFrame() + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = pd.DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) + tm.assert_frame_equal(df, exp) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_operators.py b/venv/Lib/site-packages/pandas/tests/frame/test_operators.py new file mode 100644 index 0000000..c727cb3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_operators.py @@ -0,0 +1,890 @@ +from decimal import Decimal +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm +import pandas.core.common as com +from pandas.tests.frame.common import _check_mixed_float + + +class TestDataFrameUnaryOperators: + # __pos__, __neg__, __inv__ + + @pytest.mark.parametrize( + "df,expected", + [ + (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})), + (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})), + ( + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}), + ), + ], + ) + def test_neg_numeric(self, df, expected): + tm.assert_frame_equal(-df, expected) + tm.assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df, expected", + [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + ([Decimal("1.0"), 
Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]), + ], + ) + def test_neg_object(self, df, expected): + # GH#21380 + df = pd.DataFrame({"a": df}) + expected = pd.DataFrame({"a": expected}) + tm.assert_frame_equal(-df, expected) + tm.assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": ["a", "b"]}), + pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), + ], + ) + def test_neg_raises(self, df): + with pytest.raises(TypeError): + (-df) + with pytest.raises(TypeError): + (-df["a"]) + + def test_invert(self, float_frame): + df = float_frame + + tm.assert_frame_equal(-(df < 0), ~(df < 0)) + + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [-1, 1]}), + pd.DataFrame({"a": [False, True]}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + ], + ) + def test_pos_numeric(self, df): + # GH#16073 + tm.assert_frame_equal(+df, df) + tm.assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", + [ + # numpy changing behavior in the future + pytest.param( + pd.DataFrame({"a": ["a", "b"]}), + marks=[pytest.mark.filterwarnings("ignore")], + ), + pd.DataFrame({"a": np.array([-1, 2], dtype=object)}), + pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}), + ], + ) + def test_pos_object(self, df): + # GH#21380 + tm.assert_frame_equal(+df, df) + tm.assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] + ) + def test_pos_raises(self, df): + with pytest.raises(TypeError): + (+df) + with pytest.raises(TypeError): + (+df["a"]) + + +class TestDataFrameLogicalOperators: + # &, |, ^ + + def test_logical_ops_empty_frame(self): + # GH#5808 + # empty frames, non-mixed dtype + df = DataFrame(index=[1]) + + result = df & df + tm.assert_frame_equal(result, df) + + result = df | df + tm.assert_frame_equal(result, df) + + df2 = DataFrame(index=[1, 2]) + result = df & df2 + tm.assert_frame_equal(result, df2) + + dfa = DataFrame(index=[1], columns=["A"]) + + result = dfa & dfa + expected = DataFrame(False, index=[1], columns=["A"]) + tm.assert_frame_equal(result, expected) + + def test_logical_ops_bool_frame(self): + # GH#5808 + df1a_bool = DataFrame(True, index=[1], columns=["A"]) + + result = df1a_bool & df1a_bool + tm.assert_frame_equal(result, df1a_bool) + + result = df1a_bool | df1a_bool + tm.assert_frame_equal(result, df1a_bool) + + def test_logical_ops_int_frame(self): + # GH#5808 + df1a_int = DataFrame(1, index=[1], columns=["A"]) + df1a_bool = DataFrame(True, index=[1], columns=["A"]) + + result = df1a_int | df1a_bool + tm.assert_frame_equal(result, df1a_bool) + + # Check that this matches Series behavior + res_ser = df1a_int["A"] | df1a_bool["A"] + tm.assert_series_equal(res_ser, df1a_bool["A"]) + + def test_logical_ops_invalid(self): + # GH#5808 + + df1 = DataFrame(1.0, index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) + with pytest.raises(TypeError): + df1 | df2 + + df1 = DataFrame("foo", index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) + with pytest.raises(TypeError): + df1 | df2 + + def test_logical_operators(self): + def _check_bin_op(op): + result = op(df1, df2) + expected = DataFrame( + op(df1.values, df2.values), index=df1.index, columns=df1.columns + ) + assert result.values.dtype == np.bool_ + tm.assert_frame_equal(result, expected) + + def _check_unary_op(op): + result = op(df1) + expected = DataFrame(op(df1.values), index=df1.index, 
columns=df1.columns) + assert result.values.dtype == np.bool_ + tm.assert_frame_equal(result, expected) + + df1 = { + "a": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "b": {"a": False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": False, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "e": {"a": True, "b": False, "c": False, "d": True, "e": True}, + } + + df2 = { + "a": {"a": True, "b": False, "c": True, "d": False, "e": False}, + "b": {"a": False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": True, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": False, "b": False, "c": False, "d": True, "e": False}, + "e": {"a": False, "b": False, "c": False, "d": False, "e": True}, + } + + df1 = DataFrame(df1) + df2 = DataFrame(df2) + + _check_bin_op(operator.and_) + _check_bin_op(operator.or_) + _check_bin_op(operator.xor) + + _check_unary_op(operator.inv) # TODO: belongs elsewhere + + def test_logical_with_nas(self): + d = DataFrame({"a": [np.nan, False], "b": [True, True]}) + + # GH4947 + # bool comparisons should return bool + result = d["a"] | d["b"] + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + + # GH4604, automatic casting here + result = d["a"].fillna(False) | d["b"] + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + result = d["a"].fillna(False, downcast=False) | d["b"] + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(DataFrame(left), DataFrame(right)) + expected = DataFrame(expected) + + tm.assert_frame_equal(result, expected) + + +class TestDataFrameOperators: + @pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] + ) + def test_operators_none_as_na(self, op): + df = DataFrame( + {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object + ) + + # since filling converts dtypes from object, changed expected to be + # object + filled = df.fillna(np.nan) + result = op(df, 3) + expected = op(filled, 3).astype(object) + expected[com.isna(expected)] = None + tm.assert_frame_equal(result, expected) + + result = op(df, df) + expected = op(filled, filled).astype(object) + expected[com.isna(expected)] = None + tm.assert_frame_equal(result, expected) + + result = op(df, df.fillna(7)) + tm.assert_frame_equal(result, expected) + + result = op(df.fillna(7), df) + tm.assert_frame_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) + # TODO: not sure what's correct here. 
+ @pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") + def test_logical_typeerror_with_non_valid(self, op, res, float_frame): + # we are comparing floats vs a string + result = getattr(float_frame, op)("foo") + assert bool(result.all().all()) is res + + def test_binary_ops_align(self): + + # test aligning binary ops + + # GH 6681 + index = MultiIndex.from_product( + [list("abc"), ["one", "two", "three"], [1, 2, 3]], + names=["first", "second", "third"], + ) + + df = DataFrame( + np.arange(27 * 3).reshape(27, 3), + index=index, + columns=["value1", "value2", "value3"], + ).sort_index() + + idx = pd.IndexSlice + for op in ["add", "sub", "mul", "div", "truediv"]: + opa = getattr(operator, op, None) + if opa is None: + continue + + x = Series([1.0, 10.0, 100.0], [1, 2, 3]) + result = getattr(df, op)(x, level="third", axis=0) + + expected = pd.concat( + [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()] + ).sort_index() + tm.assert_frame_equal(result, expected) + + x = Series([1.0, 10.0], ["two", "three"]) + result = getattr(df, op)(x, level="second", axis=0) + + expected = ( + pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()]) + .reindex_like(df) + .sort_index() + ) + tm.assert_frame_equal(result, expected) + + # GH9463 (alignment level of dataframe with series) + + midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) + df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) + s = pd.Series({"a": 1, "b": 2}) + + df2 = df.copy() + df2.columns.names = ["lvl0", "lvl1"] + s2 = s.copy() + s2.index.name = "lvl1" + + # different cases of integer/string level names: + res1 = df.mul(s, axis=1, level=1) + res2 = df.mul(s2, axis=1, level=1) + res3 = df2.mul(s, axis=1, level=1) + res4 = df2.mul(s2, axis=1, level=1) + res5 = df2.mul(s, axis=1, level="lvl1") + res6 = df2.mul(s2, axis=1, level="lvl1") + + exp = DataFrame( + np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx + ) + + for res in [res1, res2]: + tm.assert_frame_equal(res, exp) + + exp.columns.names = ["lvl0", "lvl1"] + for res in [res3, res4, res5, res6]: + tm.assert_frame_equal(res, exp) + + def test_dti_tz_convert_to_utc(self): + base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] + + df1 = DataFrame({"A": [1, 2]}, index=idx1) + df2 = DataFrame({"A": [1, 1]}, index=idx2) + exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base) + tm.assert_frame_equal(df1 + df2, exp) + + def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): + frame_copy = float_frame.reindex(float_frame.index[::2]) + + del frame_copy["D"] + frame_copy["C"][:5] = np.nan + + added = float_frame + frame_copy + + indexer = added["A"].dropna().index + exp = (float_frame["A"] * 2).copy() + + tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer]) + + exp.loc[~exp.index.isin(indexer)] = np.nan + tm.assert_series_equal(added["A"], exp.loc[added["A"].index]) + + assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all() + + # assert(False) + + assert np.isnan(added["D"]).all() + + self_added = float_frame + float_frame + tm.assert_index_equal(self_added.index, float_frame.index) + + added_rev = frame_copy + float_frame + assert np.isnan(added["D"]).all() + assert np.isnan(added_rev["D"]).all() + + # corner cases + + # empty + plus_empty = float_frame + DataFrame() + assert np.isnan(plus_empty.values).all() + + empty_plus = DataFrame() + float_frame + assert 
np.isnan(empty_plus.values).all() + + empty_empty = DataFrame() + DataFrame() + assert empty_empty.empty + + # out of order + reverse = float_frame.reindex(columns=float_frame.columns[::-1]) + + tm.assert_frame_equal(reverse + float_frame, float_frame * 2) + + # mix vs float64, upcast + added = float_frame + mixed_float_frame + _check_mixed_float(added, dtype="float64") + added = mixed_float_frame + float_frame + _check_mixed_float(added, dtype="float64") + + # mix vs mix + added = mixed_float_frame + mixed_float_frame + _check_mixed_float(added, dtype=dict(C=None)) + + # with int + added = float_frame + mixed_int_frame + _check_mixed_float(added, dtype="float64") + + def test_combine_series( + self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): + + # Series + series = float_frame.xs(float_frame.index[0]) + + added = float_frame + series + + for key, s in added.items(): + tm.assert_series_equal(s, float_frame[key] + series[key]) + + larger_series = series.to_dict() + larger_series["E"] = 1 + larger_series = Series(larger_series) + larger_added = float_frame + larger_series + + for key, s in float_frame.items(): + tm.assert_series_equal(larger_added[key], s + series[key]) + assert "E" in larger_added + assert np.isnan(larger_added["E"]).all() + + # no upcast needed + added = mixed_float_frame + series + _check_mixed_float(added) + + # vs mix (upcast) as needed + added = mixed_float_frame + series.astype("float32") + _check_mixed_float(added, dtype=dict(C=None)) + added = mixed_float_frame + series.astype("float16") + _check_mixed_float(added, dtype=dict(C=None)) + + # FIXME: don't leave commented-out + # these raise with numexpr.....as we are adding an int64 to an + # uint64....weird vs int + + # added = mixed_int_frame + (100*series).astype('int64') + # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = + # 'int64', D = 'int64')) + # added = mixed_int_frame + (100*series).astype('int32') + # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = + # 'int32', D = 'int64')) + + # TimeSeries + ts = datetime_frame["A"] + + # 10890 + # we no longer allow auto timeseries broadcasting + # and require explicit broadcasting + added = datetime_frame.add(ts, axis="index") + + for key, col in datetime_frame.items(): + result = col + ts + tm.assert_series_equal(added[key], result, check_names=False) + assert added[key].name == key + if col.name == ts.name: + assert result.name == "A" + else: + assert result.name is None + + smaller_frame = datetime_frame[:-5] + smaller_added = smaller_frame.add(ts, axis="index") + + tm.assert_index_equal(smaller_added.index, datetime_frame.index) + + smaller_ts = ts[:-5] + smaller_added2 = datetime_frame.add(smaller_ts, axis="index") + tm.assert_frame_equal(smaller_added, smaller_added2) + + # length 0, result is all-nan + result = datetime_frame.add(ts[:0], axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) + tm.assert_frame_equal(result, expected) + + # Frame is all-nan + result = datetime_frame[:0].add(ts, axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) + tm.assert_frame_equal(result, expected) + + # empty but with non-empty index + frame = datetime_frame[:1].reindex(columns=[]) + result = frame.mul(ts, axis="index") + assert len(result) == len(ts) + + def test_combineFunc(self, float_frame, mixed_float_frame): + result = float_frame * 2 + tm.assert_numpy_array_equal(result.values, 
float_frame.values * 2) + + # vs mix + result = mixed_float_frame * 2 + for c, s in result.items(): + tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2) + _check_mixed_float(result, dtype=dict(C=None)) + + result = DataFrame() * 2 + assert result.index.equals(DataFrame().index) + assert len(result.columns) == 0 + + def test_comparisons(self, simple_frame, float_frame): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + + row = simple_frame.xs("a") + ndim_5 = np.ones(df1.shape + (1, 1, 1)) + + def test_comp(func): + result = func(df1, df2) + tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) + + with pytest.raises(ValueError, match="dim must be <= 2"): + func(df1, ndim_5) + + result2 = func(simple_frame, row) + tm.assert_numpy_array_equal( + result2.values, func(simple_frame.values, row.values) + ) + + result3 = func(float_frame, 0) + tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0)) + + msg = "Can only compare identically-labeled DataFrame" + with pytest.raises(ValueError, match=msg): + func(simple_frame, simple_frame[:2]) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): + # GH 11565 + df = DataFrame( + {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} + ) + + f = getattr(operator, compare_operators_no_eq_ne) + with pytest.raises(TypeError): + f(df, 0) + + def test_comparison_protected_from_errstate(self): + missing_df = tm.makeDataFrame() + missing_df.iloc[0]["A"] = np.nan + with np.errstate(invalid="ignore"): + expected = missing_df.values < 0 + with np.errstate(invalid="raise"): + result = (missing_df < 0).values + tm.assert_numpy_array_equal(result, expected) + + def test_boolean_comparison(self): + + # GH 4576 + # boolean comparisons with a tuple/list give unexpected results + df = DataFrame(np.arange(6).reshape((3, 2))) + b = np.array([2, 2]) + b_r = np.atleast_2d([2, 2]) + b_c = b_r.T + lst = [2, 2, 2] + tup = tuple(lst) + + # gt + expected = DataFrame([[False, False], [False, True], [True, True]]) + result = df > b + tm.assert_frame_equal(result, expected) + + result = df.values > b + tm.assert_numpy_array_equal(result, expected.values) + + msg1d = "Unable to coerce to Series, length must be 2: given 3" + msg2d = "Unable to coerce to DataFrame, shape must be" + msg2db = "operands could not be broadcast together with shapes" + with pytest.raises(ValueError, match=msg1d): + # wrong shape + df > lst + + with pytest.raises(ValueError, match=msg1d): + # wrong shape + result = df > tup + + # broadcasts like ndarray (GH#23000) + result = df > b_r + tm.assert_frame_equal(result, expected) + + result = df.values > b_r + tm.assert_numpy_array_equal(result, expected.values) + + with pytest.raises(ValueError, match=msg2d): + df > b_c + + with pytest.raises(ValueError, match=msg2db): + df.values > b_c + + # == + expected = DataFrame([[False, False], [True, False], [False, False]]) + result = df == b + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError, match=msg1d): + result = df == lst + + with pytest.raises(ValueError, match=msg1d): + result = df == tup + + # broadcasts like ndarray (GH#23000) + result = df == b_r + tm.assert_frame_equal(result, expected) + + result = df.values == b_r + tm.assert_numpy_array_equal(result, expected.values) + + with pytest.raises(ValueError, match=msg2d): + df 
== b_c + + assert df.values.shape != b_c.shape + + # with alignment + df = DataFrame( + np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") + ) + expected.index = df.index + expected.columns = df.columns + + with pytest.raises(ValueError, match=msg1d): + result = df == lst + + with pytest.raises(ValueError, match=msg1d): + result = df == tup + + def test_combine_generic(self, float_frame): + df1 = float_frame + df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]] + + combined = df1.combine(df2, np.add) + combined2 = df2.combine(df1, np.add) + assert combined["D"].isna().all() + assert combined2["D"].isna().all() + + chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]] + chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]] + + exp = ( + float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk) + * 2 + ) + tm.assert_frame_equal(chunk, exp) + tm.assert_frame_equal(chunk2, exp) + + def test_inplace_ops_alignment(self): + + # inplace ops / ops alignment + # GH 8511 + + columns = list("abcdefg") + X_orig = DataFrame( + np.arange(10 * len(columns)).reshape(-1, len(columns)), + columns=columns, + index=range(10), + ) + Z = 100 * X_orig.iloc[:, 1:-1].copy() + block1 = list("bedcf") + subs = list("bcdef") + + # add + X = X_orig.copy() + result1 = (X[block1] + Z).reindex(columns=subs) + + X[block1] += Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] + Z[block1]).reindex(columns=subs) + + X[block1] += Z[block1] + result4 = X.reindex(columns=subs) + + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, result3) + tm.assert_frame_equal(result1, result4) + + # sub + X = X_orig.copy() + result1 = (X[block1] - Z).reindex(columns=subs) + + X[block1] -= Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] - Z[block1]).reindex(columns=subs) + + X[block1] -= Z[block1] + result4 = X.reindex(columns=subs) + + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, result3) + tm.assert_frame_equal(result1, result4) + + def test_inplace_ops_identity(self): + + # GH 5104 + # make sure that we are actually changing the object + s_orig = Series([1, 2, 3]) + df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5)) + + # no dtype change + s = s_orig.copy() + s2 = s + s += 1 + tm.assert_series_equal(s, s2) + tm.assert_series_equal(s_orig + 1, s) + assert s is s2 + assert s._data is s2._data + + df = df_orig.copy() + df2 = df + df += 1 + tm.assert_frame_equal(df, df2) + tm.assert_frame_equal(df_orig + 1, df) + assert df is df2 + assert df._data is df2._data + + # dtype change + s = s_orig.copy() + s2 = s + s += 1.5 + tm.assert_series_equal(s, s2) + tm.assert_series_equal(s_orig + 1.5, s) + + df = df_orig.copy() + df2 = df + df += 1.5 + tm.assert_frame_equal(df, df2) + tm.assert_frame_equal(df_orig + 1.5, df) + assert df is df2 + assert df._data is df2._data + + # mixed dtype + arr = np.random.randint(0, 10, size=5) + df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) + df = df_orig.copy() + df2 = df + df["A"] += 1 + expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, expected) + assert df._data is df2._data + + df = df_orig.copy() + df2 = df + df["A"] += 1.5 + expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, expected) + assert df._data is df2._data + + @pytest.mark.parametrize( + "op", + [ + "add", + 
"and", + "div", + "floordiv", + "mod", + "mul", + "or", + "pow", + "sub", + "truediv", + "xor", + ], + ) + def test_inplace_ops_identity2(self, op): + + if op == "div": + return + + df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) + + operand = 2 + if op in ("and", "or", "xor"): + # cannot use floats for boolean ops + df["a"] = [True, False, True] + + df_copy = df.copy() + iop = "__i{}__".format(op) + op = "__{}__".format(op) + + # no id change and value is correct + getattr(df, iop)(operand) + expected = getattr(df_copy, op)(operand) + tm.assert_frame_equal(df, expected) + expected = id(df) + assert id(df) == expected + + def test_alignment_non_pandas(self): + index = ["A", "B", "C"] + columns = ["X", "Y", "Z"] + df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) + + align = pd.core.ops._align_method_FRAME + for val in [ + [1, 2, 3], + (1, 2, 3), + np.array([1, 2, 3], dtype=np.int64), + range(1, 4), + ]: + + tm.assert_series_equal( + align(df, val, "index"), Series([1, 2, 3], index=df.index) + ) + tm.assert_series_equal( + align(df, val, "columns"), Series([1, 2, 3], index=df.columns) + ) + + # length mismatch + msg = "Unable to coerce to Series, length must be 3: given 2" + for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: + + with pytest.raises(ValueError, match=msg): + align(df, val, "index") + + with pytest.raises(ValueError, match=msg): + align(df, val, "columns") + + val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal( + align(df, val, "index"), DataFrame(val, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal( + align(df, val, "columns"), + DataFrame(val, index=df.index, columns=df.columns), + ) + + # shape mismatch + msg = "Unable to coerce to DataFrame, shape must be" + val = np.array([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(ValueError, match=msg): + align(df, val, "index") + + with pytest.raises(ValueError, match=msg): + align(df, val, "columns") + + val = np.zeros((3, 3, 3)) + with pytest.raises(ValueError): + align(df, val, "index") + with pytest.raises(ValueError): + align(df, val, "columns") + + def test_no_warning(self, all_arithmetic_operators): + df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, None]}) + b = df["B"] + with tm.assert_produces_warning(None): + getattr(df, all_arithmetic_operators)(b, 0) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_period.py b/venv/Lib/site-packages/pandas/tests/frame/test_period.py new file mode 100644 index 0000000..a6b2b33 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_period.py @@ -0,0 +1,156 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Timedelta, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestPeriodIndex: + def test_as_frame_columns(self): + rng = period_range("1/1/2000", periods=5) + df = DataFrame(np.random.randn(10, 5), columns=rng) + + ts = df[rng[0]] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + # GH # 1211 + repr(df) + + ts = df["1/1/2000"] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + def test_frame_setitem(self): + rng = period_range("1/1/2000", periods=5, name="index") + df = DataFrame(np.random.randn(5, 3), index=rng) + + df["Index"] = rng + rs = Index(df["Index"]) + tm.assert_index_equal(rs, rng, check_names=False) + assert rs.name == "Index" + assert rng.name == "index" 
+ + rs = df.reset_index().set_index("index") + assert isinstance(rs.index, PeriodIndex) + tm.assert_index_equal(rs.index, rng) + + def test_frame_to_time_stamp(self): + K = 5 + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + df = DataFrame(np.random.randn(len(index), K), index=index) + df["mix"] = "a" + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end") + tm.assert_index_equal(result.index, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start") + tm.assert_index_equal(result.index, exp_index) + + def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) + + delta = timedelta(hours=23) + result = df.to_timestamp("H", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp("T", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + result = df.to_timestamp("S", "end") + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + # columns + df = df.T + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end", axis=1) + tm.assert_index_equal(result.columns, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start", axis=1) + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23) + result = df.to_timestamp("H", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp("T", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + result = df.to_timestamp("S", "end", axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + # invalid axis + with pytest.raises(ValueError, match="axis"): + df.to_timestamp(axis=2) + + result1 = df.to_timestamp("5t", axis=1) + result2 = df.to_timestamp("t", axis=1) + expected = pd.date_range("2001-01-01", "2009-01-01", freq="AS") + assert isinstance(result1.columns, DatetimeIndex) + assert isinstance(result2.columns, DatetimeIndex) + tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) + tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) + # PeriodIndex.to_timestamp always use 'infer' + assert result1.columns.freqstr == "AS-JAN" + assert result2.columns.freqstr == "AS-JAN" + + def test_frame_index_to_string(self): + index = PeriodIndex(["2011-1", "2011-2", "2011-3"], 
freq="M") + frame = DataFrame(np.random.randn(3, 4), index=index) + + # it works! + frame.to_string() + + def test_align_frame(self): + rng = period_range("1/1/2000", "1/1/2010", freq="A") + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected.values[1::2] = np.nan + tm.assert_frame_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_query_eval.py b/venv/Lib/site-packages/pandas/tests/frame/test_query_eval.py new file mode 100644 index 0000000..703e059 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_query_eval.py @@ -0,0 +1,1178 @@ +from io import StringIO +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm +from pandas.core.computation.check import _NUMEXPR_INSTALLED + +PARSERS = "python", "pandas" +ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) + + +@pytest.fixture(params=PARSERS, ids=lambda x: x) +def parser(request): + return request.param + + +@pytest.fixture(params=ENGINES, ids=lambda x: x) +def engine(request): + return request.param + + +def skip_if_no_pandas_parser(parser): + if parser != "pandas": + pytest.skip(f"cannot evaluate with parser {repr(parser)}") + + +class TestCompat: + def setup_method(self, method): + self.df = DataFrame({"A": [1, 2, 3]}) + self.expected1 = self.df[self.df.A > 0] + self.expected2 = self.df.A + 1 + + def test_query_default(self): + + # GH 12749 + # this should always work, whether _NUMEXPR_INSTALLED or not + df = self.df + result = df.query("A>0") + tm.assert_frame_equal(result, self.expected1) + result = df.eval("A+1") + tm.assert_series_equal(result, self.expected2, check_names=False) + + def test_query_None(self): + + df = self.df + result = df.query("A>0", engine=None) + tm.assert_frame_equal(result, self.expected1) + result = df.eval("A+1", engine=None) + tm.assert_series_equal(result, self.expected2, check_names=False) + + def test_query_python(self): + + df = self.df + result = df.query("A>0", engine="python") + tm.assert_frame_equal(result, self.expected1) + result = df.eval("A+1", engine="python") + tm.assert_series_equal(result, self.expected2, check_names=False) + + def test_query_numexpr(self): + + df = self.df + if _NUMEXPR_INSTALLED: + result = df.query("A>0", engine="numexpr") + tm.assert_frame_equal(result, self.expected1) + result = df.eval("A+1", engine="numexpr") + tm.assert_series_equal(result, self.expected2, check_names=False) + else: + with pytest.raises(ImportError): + df.query("A>0", engine="numexpr") + with pytest.raises(ImportError): + df.eval("A+1", engine="numexpr") + + +class TestDataFrameEval: + def test_ops(self): + + # tst ops and reversed ops in evaluation + # GH7198 + + # smaller hits python, larger hits numexpr + for n in [4, 4000]: + + df = DataFrame(1, index=range(n), columns=list("abcd")) + df.iloc[0] = 2 + m = df.mean() + + for op_str, op, rop in [ + ("+", "__add__", "__radd__"), + ("-", "__sub__", "__rsub__"), + ("*", "__mul__", "__rmul__"), + ("/", "__truediv__", "__rtruediv__"), + ]: + + base = DataFrame( # noqa + np.tile(m.values, n).reshape(n, -1), columns=list("abcd") + ) + + expected = eval("base{op}df".format(op=op_str)) + + # ops as strings + result = eval("m{op}df".format(op=op_str)) + tm.assert_frame_equal(result, expected) + + 
# these are commutative + if op in ["+", "*"]: + result = getattr(df, op)(m) + tm.assert_frame_equal(result, expected) + + # these are not + elif op in ["-", "/"]: + result = getattr(df, rop)(m) + tm.assert_frame_equal(result, expected) + + # GH7192: Note we need a large number of rows to ensure this + # goes through the numexpr path + df = DataFrame(dict(A=np.random.randn(25000))) + df.iloc[0:5] = np.nan + expected = 1 - np.isnan(df.iloc[0:25]) + result = (1 - np.isnan(df)).iloc[0:25] + tm.assert_frame_equal(result, expected) + + def test_query_non_str(self): + # GH 11485 + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]}) + + msg = "expr must be a string to be evaluated" + with pytest.raises(ValueError, match=msg): + df.query(lambda x: x.B == "b") + + with pytest.raises(ValueError, match=msg): + df.query(111) + + def test_query_empty_string(self): + # GH 13139 + df = pd.DataFrame({"A": [1, 2, 3]}) + + msg = "expr cannot be an empty string" + with pytest.raises(ValueError, match=msg): + df.query("") + + def test_eval_resolvers_as_list(self): + # GH 14095 + df = DataFrame(np.random.randn(10, 2), columns=list("ab")) + dict1 = {"a": 1} + dict2 = {"b": 2} + assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + + +class TestDataFrameQueryWithMultiIndex: + def test_query_with_named_multiindex(self, parser, engine): + skip_if_no_pandas_parser(parser) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) + index = MultiIndex.from_arrays([a, b], names=["color", "food"]) + df = DataFrame(np.random.randn(10, 2), index=index) + ind = Series( + df.index.get_level_values("color").values, index=index, name="color" + ) + + # equality + res1 = df.query('color == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == color', parser=parser, engine=engine) + exp = df[ind == "red"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('color != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != color', parser=parser, engine=engine) + exp = df[ind != "red"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('color == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == color', parser=parser, engine=engine) + exp = df[ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('color != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != color', parser=parser, engine=engine) + exp = df[~ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in color', parser=parser, engine=engine) + res2 = df.query('"red" in color', parser=parser, engine=engine) + exp = df[ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in color', parser=parser, engine=engine) + res2 = df.query('"red" not in color', parser=parser, engine=engine) + exp = df[~ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + def test_query_with_unnamed_multiindex(self, parser, engine): + skip_if_no_pandas_parser(parser) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) + index = MultiIndex.from_arrays([a, b]) + df = 
DataFrame(np.random.randn(10, 2), index=index) + ind = Series(df.index.get_level_values(0).values, index=index) + + res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) + exp = df[ind == "red"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) + exp = df[ind != "red"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(["red"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # ## LEVEL 1 + ind = Series(df.index.get_level_values(1).values, index=index) + res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) + exp = df[ind == "eggs"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine) + exp = df[ind != "eggs"] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(["eggs"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(["eggs"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(["eggs"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(["eggs"])] + tm.assert_frame_equal(res1, exp) + tm.assert_frame_equal(res2, exp) + + def test_query_with_partially_named_multiindex(self, parser, engine): + skip_if_no_pandas_parser(parser) + a = np.random.choice(["red", "green"], size=10) + b = np.arange(10) + index = MultiIndex.from_arrays([a, b]) + index.names = [None, "rating"] + df = 
DataFrame(np.random.randn(10, 2), index=index) + res = df.query("rating == 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) + exp = df[ind == 1] + tm.assert_frame_equal(res, exp) + + res = df.query("rating != 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) + exp = df[ind != 1] + tm.assert_frame_equal(res, exp) + + res = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind == "red"] + tm.assert_frame_equal(res, exp) + + res = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind != "red"] + tm.assert_frame_equal(res, exp) + + def test_query_multiindex_get_index_resolvers(self): + df = tm.makeCustomDataframe( + 10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"] + ) + resolvers = df._get_index_resolvers() + + def to_series(mi, level): + level_values = mi.get_level_values(level) + s = level_values.to_series() + s.index = mi + return s + + col_series = df.columns.to_series() + expected = { + "index": df.index, + "columns": col_series, + "spam": to_series(df.index, "spam"), + "eggs": to_series(df.index, "eggs"), + "C0": col_series, + } + for k, v in resolvers.items(): + if isinstance(v, Index): + assert v.is_(expected[k]) + elif isinstance(v, Series): + tm.assert_series_equal(v, expected[k]) + else: + raise AssertionError("object must be a Series or Index") + + +@td.skip_if_no_ne +class TestDataFrameQueryNumExprPandas: + @classmethod + def setup_class(cls): + cls.engine = "numexpr" + cls.parser = "pandas" + + @classmethod + def teardown_class(cls): + del cls.engine, cls.parser + + def test_date_query_with_attribute_access(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + df = DataFrame(np.random.randn(5, 3)) + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(5, 3)) + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", 
periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.iloc[0, 0] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT_duplicates(self): + engine, parser = self.engine, self.parser + n = 10 + d = {} + d["dates1"] = date_range("1/1/2012", periods=n) + d["dates3"] = date_range("1/1/2014", periods=n) + df = DataFrame(d) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_query_with_non_date(self): + engine, parser = self.engine, self.parser + + n = 10 + df = DataFrame( + {"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)} + ) + + result = df.query("dates == nondate", parser=parser, engine=engine) + assert len(result) == 0 + + result = df.query("dates != nondate", parser=parser, engine=engine) + tm.assert_frame_equal(result, df) + + for op in ["<", ">", "<=", ">="]: + with pytest.raises(TypeError): + df.query( + "dates {op} nondate".format(op=op), parser=parser, engine=engine + ) + + def test_query_syntax_error(self): + engine, parser = self.engine, self.parser + df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)}) + with pytest.raises(SyntaxError): + df.query("i - +", engine=engine, parser=parser) + + def test_query_scope(self): + from pandas.core.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(20, 2), columns=list("ab")) + + a, b = 1, 2 # noqa + res = df.query("a > b", engine=engine, parser=parser) + expected = df[df.a > df.b] + tm.assert_frame_equal(res, expected) + + res = df.query("@a > b", engine=engine, parser=parser) + expected = df[a > df.b] + tm.assert_frame_equal(res, expected) + + # no local variable c + with pytest.raises( + UndefinedVariableError, match="local variable 'c' is not defined" + ): + df.query("@a > b > @c", engine=engine, parser=parser) + + # no column named 'c' + with pytest.raises(UndefinedVariableError, match="name 'c' is not defined"): + df.query("@a > b > c", engine=engine, parser=parser) + + def test_query_doesnt_pickup_local(self): + from pandas.core.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) + + # we don't pick up the local 'sin' + with pytest.raises(UndefinedVariableError, match="name 'sin' is not defined"): + df.query("sin > 5", engine=engine, parser=parser) + + def test_query_builtin(self): + from pandas.core.computation.engines import NumExprClobberingError + + engine, parser = self.engine, self.parser + + n = m = 10 + df = 
DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) + + df.index.name = "sin" + msg = "Variables in expression.+" + with pytest.raises(NumExprClobberingError, match=msg): + df.query("sin > 5", engine=engine, parser=parser) + + def test_query(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + + tm.assert_frame_equal( + df.query("a < b", engine=engine, parser=parser), df[df.a < df.b] + ) + tm.assert_frame_equal( + df.query("a + b > b * c", engine=engine, parser=parser), + df[df.a + df.b > df.b * df.c], + ) + + def test_query_index_with_name(self): + engine, parser = self.engine, self.parser + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=Index(range(10), name="blob"), + columns=["a", "b", "c"], + ) + res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser) + expec = df[(df.index < 5) & (df.a < df.b)] + tm.assert_frame_equal(res, expec) + + res = df.query("blob < b", engine=engine, parser=parser) + expec = df[df.index < df.b] + + tm.assert_frame_equal(res, expec) + + def test_query_index_without_name(self): + engine, parser = self.engine, self.parser + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=range(10), + columns=["a", "b", "c"], + ) + + # "index" should refer to the index + res = df.query("index < b", engine=engine, parser=parser) + expec = df[df.index < df.b] + tm.assert_frame_equal(res, expec) + + # test against a scalar + res = df.query("index < 5", engine=engine, parser=parser) + expec = df[df.index < 5] + tm.assert_frame_equal(res, expec) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df > 0) & (df2 > 0)] + + result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + result = pd.eval( + "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser + ) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + tm.assert_frame_equal(result, expected) + + result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser) + expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) + tm.assert_frame_equal(result, expected) + + def test_nested_raises_on_local_self_reference(self): + from pandas.core.computation.ops import UndefinedVariableError + + df = DataFrame(np.random.randn(5, 3)) + + # can't reference ourself b/c we're a local so @ is necessary + with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"): + df.query("df > 0", engine=self.engine, parser=self.parser) + + def test_local_syntax(self): + skip_if_no_pandas_parser(self.parser) + + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(100, 10), columns=list("abcdefghij")) + b = 1 + expect = df[df.a < b] + result = df.query("a < @b", engine=engine, parser=parser) + tm.assert_frame_equal(result, expect) + + expect = df[df.a < df.b] + result = df.query("a < b", engine=engine, parser=parser) + tm.assert_frame_equal(result, expect) + + def test_chained_cmp_and_in(self): + skip_if_no_pandas_parser(self.parser) + engine, parser = self.engine, self.parser + cols = list("abc") + df = DataFrame(np.random.randn(100, len(cols)), columns=cols) + res = df.query( + "a < b < c and a not in b not in c", 
engine=engine, parser=parser + ) + ind = ( + (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + ) # noqa + expec = df[ind] + tm.assert_frame_equal(res, expec) + + def test_local_variable_with_in(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + a = Series(np.random.randint(3, size=15), name="a") + b = Series(np.random.randint(10, size=15), name="b") + df = DataFrame({"a": a, "b": b}) + + expected = df.loc[(df.b - 1).isin(a)] + result = df.query("b - 1 in a", engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + b = Series(np.random.randint(10, size=15), name="b") + expected = df.loc[(b - 1).isin(a)] + result = df.query("@b - 1 in a", engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + def test_at_inside_string(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + c = 1 # noqa + df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]}) + result = df.query('a == "@c"', engine=engine, parser=parser) + expected = df[df.a == "@c"] + tm.assert_frame_equal(result, expected) + + def test_query_undefined_local(self): + from pandas.core.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.rand(10, 2), columns=list("ab")) + with pytest.raises( + UndefinedVariableError, match="local variable 'c' is not defined" + ): + df.query("a == @c", engine=engine, parser=parser) + + def test_index_resolvers_come_after_columns_with_the_same_name(self): + n = 1 # noqa + a = np.r_[20:101:20] + + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + df.index.name = "index" + result = df.query("index > 5", engine=self.engine, parser=self.parser) + expected = df[df["index"] > 5] + tm.assert_frame_equal(result, expected) + + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + result = df.query("ilevel_0 > 5", engine=self.engine, parser=self.parser) + expected = df.loc[df.index[df.index > 5]] + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": a, "b": np.random.randn(a.size)}) + df.index.name = "a" + result = df.query("a > 5", engine=self.engine, parser=self.parser) + expected = df[df.a > 5] + tm.assert_frame_equal(result, expected) + + result = df.query("index > 5", engine=self.engine, parser=self.parser) + expected = df.loc[df.index[df.index > 5]] + tm.assert_frame_equal(result, expected) + + def test_inf(self): + n = 10 + df = DataFrame({"a": np.random.rand(n), "b": np.random.rand(n)}) + df.loc[::2, 0] = np.inf + ops = "==", "!=" + d = dict(zip(ops, (operator.eq, operator.ne))) + for op, f in d.items(): + q = "a {op} inf".format(op=op) + expected = df[f(df.a, np.inf)] + result = df.query(q, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no_ne +class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "numexpr" + cls.parser = "python" + + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(5, 3)) + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) 
+ + def test_date_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.iloc[0, 0] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT_duplicates(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(np.random.randn(n, 3)) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + with pytest.raises(NotImplementedError): + df.query("index < 20130101 < dates3", engine=engine, parser=parser) + + def test_nested_scope(self): + from pandas.core.computation.ops import UndefinedVariableError + + engine = self.engine + parser = self.parser + # smoke test + x = 1 # noqa + result = pd.eval("x + 1", engine=engine, parser=parser) + assert result == 2 + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + + # don't have the pandas parser + with pytest.raises(SyntaxError): + df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) + + with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"): + df.query("(df>0) & (df2>0)", engine=engine, parser=parser) + + expected = df[(df > 0) & (df2 > 0)] + result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + result = pd.eval( + "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser + ) + tm.assert_frame_equal(expected, result) + + +class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = "python" + cls.parser = "pandas" + + def test_query_builtin(self): + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) + + df.index.name = "sin" + expected = df[df.index > 5] + result = df.query("sin > 5", engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + +class 
TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): + @classmethod + def setup_class(cls): + super().setup_class() + cls.engine = cls.parser = "python" + + def test_query_builtin(self): + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) + + df.index.name = "sin" + expected = df[df.index > 5] + result = df.query("sin > 5", engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + +class TestDataFrameQueryStrings: + def test_str_query_method(self, parser, engine): + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings == "a"] + + if parser != "pandas": + col = "strings" + lst = '"a"' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = "==", "!=" + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) + msg = r"'(Not)?In' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + df.query( + ex, + engine=engine, + parser=parser, + local_dict={"strings": df.strings}, + ) + else: + res = df.query('"a" == strings', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + res = df.query('strings == "a"', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + tm.assert_frame_equal(res, df[df.strings.isin(["a"])]) + + expect = df[df.strings != "a"] + res = df.query('strings != "a"', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + res = df.query('"a" != strings', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + tm.assert_frame_equal(res, df[~df.strings.isin(["a"])]) + + def test_str_list_query_method(self, parser, engine): + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings.isin(["a", "b"])] + + if parser != "pandas": + col = "strings" + lst = '["a", "b"]' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = "==", "!=" + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) + with pytest.raises(NotImplementedError): + df.query(ex, engine=engine, parser=parser) + else: + res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + res = df.query('["a", "b"] == strings', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + expect = df[~df.strings.isin(["a", "b"])] + + res = df.query('strings != ["a", "b"]', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + res = df.query('["a", "b"] != strings', engine=engine, parser=parser) + tm.assert_frame_equal(res, expect) + + def test_query_with_string_columns(self, parser, engine): + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + if parser == "pandas": + res = df.query("a in b", parser=parser, engine=engine) + expec = df[df.a.isin(df.b)] + tm.assert_frame_equal(res, expec) + + res = df.query("a in b and c < d", parser=parser, engine=engine) + expec = df[df.a.isin(df.b) & (df.c < df.d)] + tm.assert_frame_equal(res, expec) + else: + with pytest.raises(NotImplementedError): + df.query("a in b", parser=parser, engine=engine) + + with pytest.raises(NotImplementedError): + df.query("a in b and c < d", parser=parser, engine=engine) + + def 
test_object_array_eq_ne(self, parser, engine): + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + res = df.query("a == b", parser=parser, engine=engine) + exp = df[df.a == df.b] + tm.assert_frame_equal(res, exp) + + res = df.query("a != b", parser=parser, engine=engine) + exp = df[df.a != df.b] + tm.assert_frame_equal(res, exp) + + def test_query_with_nested_strings(self, parser, engine): + skip_if_no_pandas_parser(parser) + raw = """id event timestamp + 1 "page 1 load" 1/1/2014 0:00:01 + 1 "page 1 exit" 1/1/2014 0:00:31 + 2 "page 2 load" 1/1/2014 0:01:01 + 2 "page 2 exit" 1/1/2014 0:01:31 + 3 "page 3 load" 1/1/2014 0:02:01 + 3 "page 3 exit" 1/1/2014 0:02:31 + 4 "page 1 load" 2/1/2014 1:00:01 + 4 "page 1 exit" 2/1/2014 1:00:31 + 5 "page 2 load" 2/1/2014 1:01:01 + 5 "page 2 exit" 2/1/2014 1:01:31 + 6 "page 3 load" 2/1/2014 1:02:01 + 6 "page 3 exit" 2/1/2014 1:02:31 + """ + df = pd.read_csv( + StringIO(raw), sep=r"\s{2,}", engine="python", parse_dates=["timestamp"] + ) + expected = df[df.event == '"page 1 load"'] + res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine) + tm.assert_frame_equal(expected, res) + + def test_query_with_nested_special_character(self, parser, engine): + skip_if_no_pandas_parser(parser) + df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]}) + res = df.query('a == "test & test"', parser=parser, engine=engine) + expec = df[df.a == "test & test"] + tm.assert_frame_equal(res, expec) + + def test_query_lex_compare_strings(self, parser, engine): + + a = Series(np.random.choice(list("abcde"), 20)) + b = Series(np.arange(a.size)) + df = DataFrame({"X": a, "Y": b}) + + ops = {"<": operator.lt, ">": operator.gt, "<=": operator.le, ">=": operator.ge} + + for op, func in ops.items(): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) + expected = df[func(df.X, "d")] + tm.assert_frame_equal(res, expected) + + def test_query_single_element_booleans(self, parser, engine): + columns = "bid", "bidsize", "ask", "asksize" + data = np.random.randint(2, size=(1, len(columns))).astype(bool) + df = DataFrame(data, columns=columns) + res = df.query("bid & ask", engine=engine, parser=parser) + expected = df[df.bid & df.ask] + tm.assert_frame_equal(res, expected) + + def test_query_string_scalar_variable(self, parser, engine): + skip_if_no_pandas_parser(parser) + df = pd.DataFrame( + { + "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"], + "Price": [109.70, 109.72, 183.30, 183.35], + } + ) + e = df[df.Symbol == "BUD US"] + symb = "BUD US" # noqa + r = df.query("Symbol == @symb", parser=parser, engine=engine) + tm.assert_frame_equal(e, r) + + +class TestDataFrameEvalWithFrame: + def setup_method(self, method): + self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc")) + + def teardown_method(self, method): + del self.frame + + def test_simple_expr(self, parser, engine): + res = self.frame.eval("a + b", engine=engine, parser=parser) + expect = self.frame.a + self.frame.b + tm.assert_series_equal(res, expect) + + def test_bool_arith_expr(self, parser, engine): + res = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser) + expect = self.frame.a[self.frame.a < 1] + self.frame.b + tm.assert_series_equal(res, expect) + + @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) + def test_invalid_type_for_operator_raises(self, parser, engine, op): + df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) + msg = r"unsupported operand 
type\(s\) for .+: '.+' and '.+'"
+
+        with pytest.raises(TypeError, match=msg):
+            df.eval("a {0} b".format(op), engine=engine, parser=parser)
+
+
+class TestDataFrameQueryBacktickQuoting:
+    @pytest.fixture(scope="class")
+    def df(self):
+        """
+        Yields a dataframe with strings that may or may not need escaping
+        by backticks. The last two columns cannot be escaped by backticks
+        and should raise a ValueError.
+        """
+        yield DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B B": [3, 2, 1],
+                "C C": [4, 5, 6],
+                "C  C": [7, 4, 3],
+                "C_C": [8, 9, 10],
+                "D_D D": [11, 1, 101],
+                "E.E": [6, 3, 5],
+                "F-F": [8, 1, 10],
+                "1e1": [2, 4, 8],
+                "def": [10, 11, 2],
+                "A (x)": [4, 1, 3],
+                "B(x)": [1, 1, 5],
+                "B (x)": [2, 7, 4],
+                " &^ :!€$?(} > <++*'' ": [2, 5, 6],
+                "": [10, 11, 1],
+                " A": [4, 7, 9],
+                " ": [1, 2, 1],
+                "it's": [6, 3, 1],
+                "that's": [9, 1, 8],
+                "☺": [8, 7, 6],
+                "foo#bar": [2, 4, 5],
+                1: [5, 7, 9],
+            }
+        )
+
+    def test_single_backtick_variable_query(self, df):
+        res = df.query("1 < `B B`")
+        expect = df[1 < df["B B"]]
+        tm.assert_frame_equal(res, expect)
+
+    def test_two_backtick_variables_query(self, df):
+        res = df.query("1 < `B B` and 4 < `C C`")
+        expect = df[(1 < df["B B"]) & (4 < df["C C"])]
+        tm.assert_frame_equal(res, expect)
+
+    def test_single_backtick_variable_expr(self, df):
+        res = df.eval("A + `B B`")
+        expect = df["A"] + df["B B"]
+        tm.assert_series_equal(res, expect)
+
+    def test_two_backtick_variables_expr(self, df):
+        res = df.eval("`B B` + `C C`")
+        expect = df["B B"] + df["C C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_already_underscore_variable(self, df):
+        res = df.eval("`C_C` + A")
+        expect = df["C_C"] + df["A"]
+        tm.assert_series_equal(res, expect)
+
+    def test_same_name_but_underscores(self, df):
+        res = df.eval("C_C + `C C`")
+        expect = df["C_C"] + df["C C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_mixed_underscores_and_spaces(self, df):
+        res = df.eval("A + `D_D D`")
+        expect = df["A"] + df["D_D D"]
+        tm.assert_series_equal(res, expect)
+
+    def test_backtick_quote_name_with_no_spaces(self, df):
+        res = df.eval("A + `C_C`")
+        expect = df["A"] + df["C_C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_special_characters(self, df):
+        res = df.eval("`E.E` + `F-F` - A")
+        expect = df["E.E"] + df["F-F"] - df["A"]
+        tm.assert_series_equal(res, expect)
+
+    def test_start_with_digit(self, df):
+        res = df.eval("A + `1e1`")
+        expect = df["A"] + df["1e1"]
+        tm.assert_series_equal(res, expect)
+
+    def test_keyword(self, df):
+        res = df.eval("A + `def`")
+        expect = df["A"] + df["def"]
+        tm.assert_series_equal(res, expect)
+
+    def test_unneeded_quoting(self, df):
+        res = df.query("`A` > 2")
+        expect = df[df["A"] > 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_parenthesis(self, df):
+        res = df.query("`A (x)` > 2")
+        expect = df[df["A (x)"] > 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_empty_string(self, df):
+        res = df.query("`` > 5")
+        expect = df[df[""] > 5]
+        tm.assert_frame_equal(res, expect)
+
+    def test_multiple_spaces(self, df):
+        res = df.query("`C  C` > 5")
+        expect = df[df["C  C"] > 5]
+        tm.assert_frame_equal(res, expect)
+
+    def test_start_with_spaces(self, df):
+        res = df.eval("` A` + ` `")
+        expect = df[" A"] + df[" "]
+        tm.assert_series_equal(res, expect)
+
+    def test_lots_of_operators_string(self, df):
+        res = df.query("` &^ :!€$?(} > <++*'' ` > 4")
+        expect = df[df[" &^ :!€$?(} > <++*'' "] > 4]
+        tm.assert_frame_equal(res, expect)
+
+    def test_failing_quote(self, df):
+        with pytest.raises(SyntaxError):
+            df.query("`it's` > 
`that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4") diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_repr_info.py b/venv/Lib/site-packages/pandas/tests/frame/test_repr_info.py new file mode 100644 index 0000000..05bdec4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_repr_info.py @@ -0,0 +1,579 @@ +from datetime import datetime, timedelta +from io import StringIO +import re +import sys +import textwrap +import warnings + +import numpy as np +import pytest + +from pandas.compat import PYPY + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Series, + date_range, + option_context, + period_range, +) +import pandas._testing as tm + +import pandas.io.formats.format as fmt + +# Segregated collection of methods that require the BlockManager internal data +# structure + + +class TestDataFrameReprInfoEtc: + def test_repr_empty(self): + # empty + repr(DataFrame()) + + # empty with index + frame = DataFrame(index=np.arange(1000)) + repr(frame) + + def test_repr_mixed(self, float_string_frame): + buf = StringIO() + + # mixed + repr(float_string_frame) + float_string_frame.info(verbose=False, buf=buf) + + @pytest.mark.slow + def test_repr_mixed_big(self): + # big mixed + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200) + ) + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan + + repr(biggie) + + def test_repr(self, float_frame): + buf = StringIO() + + # small one + repr(float_frame) + float_frame.info(verbose=False, buf=buf) + + # even smaller + float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) + + # exhausting cases in DataFrame.info + + # columns but no index + no_index = DataFrame(columns=[0, 1, 3]) + repr(no_index) + + # no columns or index + DataFrame().info(buf=buf) + + df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) + assert "\t" not in repr(df) + assert "\r" not in repr(df) + assert "a\n" not in repr(df) + + def test_repr_dimensions(self): + df = DataFrame([[1, 2], [3, 4]]) + with option_context("display.show_dimensions", True): + assert "2 rows x 2 columns" in repr(df) + + with option_context("display.show_dimensions", False): + assert "2 rows x 2 columns" not in repr(df) + + with option_context("display.show_dimensions", "truncate"): + assert "2 rows x 2 columns" not in repr(df) + + @pytest.mark.slow + def test_repr_big(self): + # big one + biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) + repr(biggie) + + def test_repr_unsortable(self, float_frame): + # columns are not sortable + + warn_filters = warnings.filters + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") + + unsortable = DataFrame( + { + "foo": [1] * 50, + datetime.today(): [1] * 50, + "bar": ["bar"] * 50, + datetime.today() + timedelta(1): ["bar"] * 50, + }, + index=np.arange(50), + ) + repr(unsortable) + + fmt.set_option("display.precision", 3, "display.column_space", 10) + repr(float_frame) + + fmt.set_option("display.max_rows", 10, "display.max_columns", 2) + repr(float_frame) + + fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) + repr(float_frame) + + tm.reset_display_options() + + warnings.filters = warn_filters + + def test_repr_unicode(self): + uval = 
"\u03c3\u03c3\u03c3\u03c3" + + # TODO(wesm): is this supposed to be used? + bval = uval.encode("utf-8") # noqa + + df = DataFrame({"A": [uval, uval]}) + + result = repr(df) + ex_top = " A" + assert result.split("\n")[0].rstrip() == ex_top + + df = DataFrame({"A": [uval, uval]}) + result = repr(df) + assert result.split("\n")[0].rstrip() == ex_top + + def test_unicode_string_with_unicode(self): + df = DataFrame({"A": ["\u05d0"]}) + str(df) + + def test_str_to_bytes_raises(self): + # GH 26447 + df = DataFrame({"A": ["abc"]}) + msg = "^'str' object cannot be interpreted as an integer$" + with pytest.raises(TypeError, match=msg): + bytes(df) + + def test_very_wide_info_repr(self): + df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20)) + repr(df) + + def test_repr_column_name_unicode_truncation_bug(self): + # #1906 + df = DataFrame( + { + "Id": [7117434], + "StringCol": ( + "Is it possible to modify drop plot code" + " so that the output graph is displayed " + "in iphone simulator, Is it possible to " + "modify drop plot code so that the " + "output graph is \xe2\x80\xa8displayed " + "in iphone simulator.Now we are adding " + "the CSV file externally. I want to Call" + " the File through the code.." + ), + } + ) + + with option_context("display.max_columns", 20): + assert "StringCol" in repr(df) + + def test_latex_repr(self): + result = r"""\begin{tabular}{llll} +\toprule +{} & 0 & 1 & 2 \\ +\midrule +0 & $\alpha$ & b & c \\ +1 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + with option_context("display.latex.escape", False, "display.latex.repr", True): + df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) + assert result == df._repr_latex_() + + # GH 12182 + assert df._repr_latex_() is None + + def test_info(self, float_frame, datetime_frame): + io = StringIO() + float_frame.info(buf=io) + datetime_frame.info(buf=io) + + frame = DataFrame(np.random.randn(5, 3)) + + frame.info() + frame.info(verbose=False) + + def test_info_verbose(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + index = i - start + line_nr = " {} ".format(index) + assert line.startswith(line_nr) + + def test_info_memory(self): + # https://github.com/pandas-dev/pandas/issues/21056 + df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + bytes = float(df.memory_usage().sum()) + + expected = textwrap.dedent( + """\ + + RangeIndex: 2 entries, 0 to 1 + Data columns (total 1 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 + dtypes: int64(1) + memory usage: {} bytes + """.format( + bytes + ) + ) + + assert result == expected + + def test_info_wide(self): + from pandas import set_option, reset_option + + io = StringIO() + df = DataFrame(np.random.randn(5, 101)) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + rs = io.getvalue() + assert len(rs.splitlines()) > 100 + xp = rs + + set_option("display.max_info_columns", 101) + io = StringIO() + df.info(buf=io) + assert rs == xp + reset_option("display.max_info_columns") + + def test_info_duplicate_columns(self): + io = StringIO() + + # it works! 
+ frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) + frame.info(buf=io) + + def test_info_duplicate_columns_shows_correct_dtypes(self): + # GH11761 + io = StringIO() + + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) + frame.info(buf=io) + io.seek(0) + lines = io.readlines() + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] + + def test_info_shows_column_dtypes(self): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res + for i, dtype in enumerate(dtypes): + name = " {i:d} {i:d} {n:d} non-null {dtype}".format( + i=i, n=n, dtype=dtype + ) + assert name in res + + def test_info_max_cols(self): + df = DataFrame(np.random.randn(10, 5)) + for len_, verbose in [(5, None), (5, False), (12, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, verbose in [(12, None), (5, False), (12, True)]: + + # max_cols not exceeded + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, max_cols in [(12, 5), (5, 4)]: + # setting truncates + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + # setting wouldn't truncate + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + def test_info_memory_usage(self): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert "memory usage: " in res[-1] + + # do not display memory usage case + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + assert "memory usage: " not in res[-1] + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # memory usage is a lower bound, so print it as XYZ+ MB + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # excluded column with object dtype, so estimate is accurate + assert not re.match(r"memory usage: [^+]+\+", res[-1]) + + # Test a DataFrame with duplicate columns + dtypes = ["int64", "int64", "int64", "float64"] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = dtypes + + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index.info(buf=buf, memory_usage=True) + res = 
buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage="deep") + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + + # Ensure df size is as expected + # (cols * rows * bytes) + index size + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 + df.index.nbytes + assert df_size == exp_size + + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) + 1 # index=True; default + assert size_df == np.size(df.memory_usage()) + + # assert deep works only on object + assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() + + # test for validity + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes + df = DataFrame( + data=1, + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") + def test_info_memory_usage_deep_not_pypy(self): + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = pd.DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") + def test_info_memory_usage_deep_pypy(self): + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = pd.DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") + def test_usage_via_getsizeof(self): + df = DataFrame( + data=1, + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], + ) + mem = df.memory_usage(deep=True).sum() + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = mem - sys.getsizeof(df) + assert abs(diff) < 100 + + def test_info_memory_usage_qualified(self): + + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), range(3)]), + ) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]), + ) + df.info(buf=buf) + assert "+" in buf.getvalue() + + def test_info_memory_usage_bug_on_multiindex(self): + # GH 14308 + # memory usage introspection should not materialize .values + + from string import ascii_uppercase as uppercase + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(uppercase) + index = pd.MultiIndex.from_product( + [list(uppercase), pd.date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame({"value": np.random.randn(N * M)}, 
index=index) + + unstacked = df.unstack("id") + assert df.values.nbytes == unstacked.values.nbytes + assert memory_usage(df) > memory_usage(unstacked) + + # high upper bound + assert memory_usage(unstacked) - memory_usage(df) < 2000 + + def test_info_categorical(self): + # GH14298 + idx = pd.CategoricalIndex(["a", "b"]) + df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) + + def test_info_categorical_column(self): + + # make sure it works + n = 2500 + df = DataFrame({"int64": np.random.randint(100, size=n)}) + df["category"] = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") + df.isna() + buf = StringIO() + df.info(buf=buf) + + df2 = df[df["category"] == "d"] + buf = StringIO() + df2.info(buf=buf) + + def test_repr_categorical_dates_periods(self): + # normal DataFrame + dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + p = period_range("2011-01", freq="M", periods=5) + df = DataFrame({"dt": dt, "p": p}) + exp = """ dt p +0 2011-01-01 09:00:00-05:00 2011-01 +1 2011-01-01 10:00:00-05:00 2011-02 +2 2011-01-01 11:00:00-05:00 2011-03 +3 2011-01-01 12:00:00-05:00 2011-04 +4 2011-01-01 13:00:00-05:00 2011-05""" + + assert repr(df) == exp + + df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) + assert repr(df2) == exp + + @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) + @pytest.mark.parametrize( + "box, expected", + [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], + ) + def test_repr_np_nat_with_object(self, arg, box, expected): + # GH 25445 + result = repr(box([arg("NaT")], dtype=object)) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_reshape.py b/venv/Lib/site-packages/pandas/tests/frame/test_reshape.py new file mode 100644 index 0000000..56a0c8c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_reshape.py @@ -0,0 +1,1161 @@ +from datetime import datetime +import itertools + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range +import pandas._testing as tm + + +class TestDataFrameReshape: + def test_pivot(self): + data = { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + + frame = DataFrame(data) + pivoted = frame.pivot(index="index", columns="columns", values="values") + + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) + + expected.index.name, expected.columns.name = "index", "columns" + tm.assert_frame_equal(pivoted, expected) + + # name tracking + assert pivoted.index.name == "index" + assert pivoted.columns.name == "columns" + + # don't specify values + pivoted = frame.pivot(index="index", columns="columns") + assert pivoted.index.name == "index" + assert pivoted.columns.names == (None, "columns") + + def test_pivot_duplicates(self): + data = DataFrame( + { + "a": ["bar", "bar", "foo", "foo", "foo"], + "b": ["one", "two", "one", "one", "two"], + "c": [1.0, 2.0, 3.0, 3.0, 4.0], + } + ) + with pytest.raises(ValueError, match="duplicate entries"): + data.pivot("a", "b", "c") + + def test_pivot_empty(self): + df = DataFrame(columns=["a", "b", "c"]) + result = df.pivot("a", "b", "c") + expected = DataFrame() + tm.assert_frame_equal(result, expected, check_names=False) + + def test_pivot_integer_bug(self): + df = 
DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) + + result = df.pivot(index=1, columns=0, values=2) + repr(result) + tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) + + def test_pivot_index_none(self): + # gh-3962 + data = { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + + frame = DataFrame(data).set_index("index") + result = frame.pivot(columns="columns", values="values") + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) + + expected.index.name, expected.columns.name = "index", "columns" + tm.assert_frame_equal(result, expected) + + # omit values + result = frame.pivot(columns="columns") + + expected.columns = pd.MultiIndex.from_tuples( + [("values", "One"), ("values", "Two")], names=[None, "columns"] + ) + expected.index.name = "index" + tm.assert_frame_equal(result, expected, check_names=False) + assert result.index.name == "index" + assert result.columns.names == (None, "columns") + expected.columns = expected.columns.droplevel(0) + result = frame.pivot(columns="columns", values="values") + + expected.columns.name = "columns" + tm.assert_frame_equal(result, expected) + + def test_stack_unstack(self, float_frame): + df = float_frame.copy() + df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) + + stacked = df.stack() + stacked_df = DataFrame({"foo": stacked, "bar": stacked}) + + unstacked = stacked.unstack() + unstacked_df = stacked_df.unstack() + + tm.assert_frame_equal(unstacked, df) + tm.assert_frame_equal(unstacked_df["bar"], df) + + unstacked_cols = stacked.unstack(0) + unstacked_cols_df = stacked_df.unstack(0) + tm.assert_frame_equal(unstacked_cols.T, df) + tm.assert_frame_equal(unstacked_cols_df["bar"].T, df) + + def test_stack_mixed_level(self): + # GH 18310 + levels = [range(3), [3, "a", "b"], [1, 2]] + + # flat columns: + df = DataFrame(1, index=levels[0], columns=levels[1]) + result = df.stack() + expected = Series(1, index=MultiIndex.from_product(levels[:2])) + tm.assert_series_equal(result, expected) + + # MultiIndex columns: + df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) + result = df.stack(1) + expected = DataFrame( + 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] + ) + tm.assert_frame_equal(result, expected) + + # as above, but used labels in level are actually of homogeneous type + result = df[["a", "b"]].stack(1) + expected = expected[["a", "b"]] + tm.assert_frame_equal(result, expected) + + def test_unstack_fill(self): + + # GH #9746: fill_value keyword argument for Series + # and DataFrame unstack + + # From a series + data = Series([1, 2, 4, 5], dtype=np.int16) + data.index = MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + result = data.unstack(fill_value=-1) + expected = DataFrame( + {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16 + ) + tm.assert_frame_equal(result, expected) + + # From a series with incorrect data type for fill_value + result = data.unstack(fill_value=0.5) + expected = DataFrame( + {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float + ) + tm.assert_frame_equal(result, expected) + + # GH #13971: fill_value when unstacking multiple levels: + df = DataFrame( + {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]} + ).set_index(["x", "y", "z"]) + unstacked = df.unstack(["x", "y"], 
fill_value=0) + key = ("w", "b", "j") + expected = unstacked[key] + result = pd.Series([0, 0, 2], index=unstacked.index, name=key) + tm.assert_series_equal(result, expected) + + stacked = unstacked.stack(["x", "y"]) + stacked.index = stacked.index.reorder_levels(df.index.names) + # Workaround for GH #17886 (unnecessarily casts to float): + stacked = stacked.astype(np.int64) + result = stacked.loc[df.index] + tm.assert_frame_equal(result, df) + + # From a series + s = df["w"] + result = s.unstack(["x", "y"], fill_value=0) + expected = unstacked["w"] + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame(self): + + # From a dataframe + rows = [[1, 2], [3, 4], [5, 6], [7, 8]] + df = DataFrame(rows, columns=list("AB"), dtype=np.int32) + df.index = MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + result = df.unstack(fill_value=-1) + + rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] + expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) + expected.columns = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) + tm.assert_frame_equal(result, expected) + + # From a mixed type dataframe + df["A"] = df["A"].astype(np.int16) + df["B"] = df["B"].astype(np.float64) + + result = df.unstack(fill_value=-1) + expected["A"] = expected["A"].astype(np.int16) + expected["B"] = expected["B"].astype(np.float64) + tm.assert_frame_equal(result, expected) + + # From a dataframe with incorrect data type for fill_value + result = df.unstack(fill_value=0.5) + + rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] + expected = DataFrame(rows, index=list("xyz"), dtype=np.float) + expected.columns = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_datetime(self): + + # Test unstacking with date times + dv = pd.date_range("2012-01-01", periods=4).values + data = Series(dv) + data.index = MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + result = data.unstack() + expected = DataFrame( + {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, + index=["x", "y", "z"], + ) + tm.assert_frame_equal(result, expected) + + result = data.unstack(fill_value=dv[0]) + expected = DataFrame( + {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, + index=["x", "y", "z"], + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_timedelta(self): + + # Test unstacking with time deltas + td = [Timedelta(days=i) for i in range(4)] + data = Series(td) + data.index = MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + result = data.unstack() + expected = DataFrame( + {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, + index=["x", "y", "z"], + ) + tm.assert_frame_equal(result, expected) + + result = data.unstack(fill_value=td[1]) + expected = DataFrame( + {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, + index=["x", "y", "z"], + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_period(self): + + # Test unstacking with period + periods = [ + Period("2012-01"), + Period("2012-02"), + Period("2012-03"), + Period("2012-04"), + ] + data = Series(periods) + data.index = MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + result = data.unstack() + expected = DataFrame( + {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]}, + index=["x", "y", "z"], + ) + 
tm.assert_frame_equal(result, expected) + + result = data.unstack(fill_value=periods[1]) + expected = DataFrame( + { + "a": [periods[0], periods[1], periods[3]], + "b": [periods[1], periods[2], periods[1]], + }, + index=["x", "y", "z"], + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_categorical(self): + + # Test unstacking with categorical + data = pd.Series(["a", "b", "c", "a"], dtype="category") + data.index = pd.MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + # By default missing values will be NaN + result = data.unstack() + expected = DataFrame( + { + "a": pd.Categorical(list("axa"), categories=list("abc")), + "b": pd.Categorical(list("bcx"), categories=list("abc")), + }, + index=list("xyz"), + ) + tm.assert_frame_equal(result, expected) + + # Fill with non-category results in a TypeError + msg = r"'fill_value' \('d'\) is not in" + with pytest.raises(TypeError, match=msg): + data.unstack(fill_value="d") + + # Fill with category value replaces missing values as expected + result = data.unstack(fill_value="c") + expected = DataFrame( + { + "a": pd.Categorical(list("aca"), categories=list("abc")), + "b": pd.Categorical(list("bcc"), categories=list("abc")), + }, + index=list("xyz"), + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_preserve_dtypes(self): + # Checks fix for #11847 + df = pd.DataFrame( + dict( + state=["IL", "MI", "NC"], + index=["a", "b", "c"], + some_categories=pd.Series(["a", "b", "c"]).astype("category"), + A=np.random.rand(3), + B=1, + C="foo", + D=pd.Timestamp("20010102"), + E=pd.Series([1.0, 50.0, 100.0]).astype("float32"), + F=pd.Series([3.0, 4.0, 5.0]).astype("float64"), + G=False, + H=pd.Series([1, 200, 923442], dtype="int8"), + ) + ) + + def unstack_and_compare(df, column_name): + unstacked1 = df.unstack([column_name]) + unstacked2 = df.unstack(column_name) + tm.assert_frame_equal(unstacked1, unstacked2) + + df1 = df.set_index(["state", "index"]) + unstack_and_compare(df1, "index") + + df1 = df.set_index(["state", "some_categories"]) + unstack_and_compare(df1, "some_categories") + + df1 = df.set_index(["F", "C"]) + unstack_and_compare(df1, "F") + + df1 = df.set_index(["G", "B", "state"]) + unstack_and_compare(df1, "B") + + df1 = df.set_index(["E", "A"]) + unstack_and_compare(df1, "E") + + df1 = df.set_index(["state", "index"]) + s = df1["A"] + unstack_and_compare(s, "index") + + def test_stack_ints(self): + columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) + df = DataFrame(np.random.randn(30, 27), columns=columns) + + tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) + tm.assert_frame_equal( + df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1) + ) + + df_named = df.copy() + df_named.columns.set_names(range(3), inplace=True) + + tm.assert_frame_equal( + df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) + ) + + def test_stack_mixed_levels(self): + columns = MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], + ) + df = DataFrame(np.random.randn(4, 4), columns=columns) + + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + + # GH #8584: Need to check that stacking works when a number + # is passed that is both a level name and in the range of + # the level numbers + df2 = df.copy() + df2.columns.names = 
["exp", "animal", 1] + tm.assert_frame_equal( + df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False + ) + tm.assert_frame_equal( + df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False + ) + + # When mixed types are passed and the ints are not level + # names, raise + msg = ( + "level should contain all level names or all level numbers, not" + " a mixture of the two" + ) + with pytest.raises(ValueError, match=msg): + df2.stack(level=["animal", 0]) + + # GH #8584: Having 0 in the level names could raise a + # strange error about lexsort depth + df3 = df.copy() + df3.columns.names = ["exp", "animal", 0] + tm.assert_frame_equal( + df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False + ) + + def test_stack_int_level_names(self): + columns = MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], + ) + df = DataFrame(np.random.randn(4, 4), columns=columns) + + exp_animal_stacked = df.stack(level=["exp", "animal"]) + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) + + df2 = df.copy() + df2.columns.names = [0, 1, 2] + tm.assert_frame_equal( + df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False + ) + tm.assert_frame_equal( + df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False + ) + tm.assert_frame_equal( + df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False + ) + + # Out-of-order int column names + df3 = df.copy() + df3.columns.names = [2, 0, 1] + tm.assert_frame_equal( + df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False + ) + tm.assert_frame_equal( + df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False + ) + tm.assert_frame_equal( + df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False + ) + + def test_unstack_bool(self): + df = DataFrame( + [False, False], + index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), + columns=["col"], + ) + rs = df.unstack() + xp = DataFrame( + np.array([[False, np.nan], [np.nan, False]], dtype=object), + index=["a", "b"], + columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), + ) + tm.assert_frame_equal(rs, xp) + + def test_unstack_level_binding(self): + # GH9856 + mi = pd.MultiIndex( + levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], + names=["first", "second", "third"], + ) + s = pd.Series(0, index=mi) + result = s.unstack([1, 2]).stack(0) + + expected_mi = pd.MultiIndex( + levels=[["foo", "bar"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["first", "second"], + ) + + expected = pd.DataFrame( + np.array( + [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + ), + index=expected_mi, + columns=pd.Index(["a", "b"], name="third"), + ) + + tm.assert_frame_equal(result, expected) + + def test_unstack_to_series(self, float_frame): + # check reversibility + data = float_frame.unstack() + + assert isinstance(data, Series) + undo = data.unstack().T + tm.assert_frame_equal(undo, float_frame) + + # check NA handling + data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + data.index = Index(["a", "b", "c"]) + result = data.unstack() + + midx = MultiIndex( + levels=[["x", "y"], ["a", "b", "c"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + ) + expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) + + tm.assert_series_equal(result, expected) + + # 
check composability of unstack + old_data = data.copy() + for _ in range(4): + data = data.unstack() + tm.assert_frame_equal(old_data, data) + + def test_unstack_dtypes(self): + + # GH 2929 + rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] + + df = DataFrame(rows, columns=list("ABCD")) + result = df.dtypes + expected = Series([np.dtype("int64")] * 4, index=list("ABCD")) + tm.assert_series_equal(result, expected) + + # single dtype + df2 = df.set_index(["A", "B"]) + df3 = df2.unstack("B") + result = df3.dtypes + expected = Series( + [np.dtype("int64")] * 4, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) + tm.assert_series_equal(result, expected) + + # mixed + df2 = df.set_index(["A", "B"]) + df2["C"] = 3.0 + df3 = df2.unstack("B") + result = df3.dtypes + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) + tm.assert_series_equal(result, expected) + df2["D"] = "foo" + df3 = df2.unstack("B") + result = df3.dtypes + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) + tm.assert_series_equal(result, expected) + + # GH7405 + for c, d in ( + (np.zeros(5), np.zeros(5)), + (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")), + ): + + df = DataFrame( + { + "A": ["a"] * 5, + "C": c, + "D": d, + "B": pd.date_range("2012-01-01", periods=5), + } + ) + + right = df.iloc[:3].copy(deep=True) + + df = df.set_index(["A", "B"]) + df["D"] = df["D"].astype("int64") + + left = df.iloc[:3].unstack(0) + right = right.set_index(["A", "B"]).unstack(0) + right[("D", "a")] = right[("D", "a")].astype("int64") + + assert left.shape == (3, 2) + tm.assert_frame_equal(left, right) + + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack("c1") + + with pytest.raises(ValueError): + df.T.stack("c1") + + def test_unstack_unused_levels(self): + # GH 17845: unused codes in index make unstack() cast int to float + idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1] + df = pd.DataFrame([[1, 0]] * 3, index=idx) + + result = df.unstack() + exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) + tm.assert_frame_equal(result, expected) + assert (result.columns.levels[1] == idx.levels[1]).all() + + # Unused items on both levels + levels = [[0, 1, 7], [0, 1, 2, 3]] + codes = [[0, 0, 1, 1], [0, 2, 0, 2]] + idx = pd.MultiIndex(levels, codes) + block = np.arange(4).reshape(2, 2) + df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) + result = df.unstack() + expected = pd.DataFrame( + np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx + ) + tm.assert_frame_equal(result, expected) + assert (result.columns.levels[1] == idx.levels[1]).all() + + # With mixed dtype and NaN + levels = [["a", 2, "c"], [1, 3, 5, 7]] + codes = [[0, -1, 1, 1], [0, 2, -1, 2]] + idx = pd.MultiIndex(levels, codes) + data = np.arange(8) + df = pd.DataFrame(data.reshape(4, 2), index=idx) + + cases = ( + (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]), + (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]), + ) + for level, idces, 
col_level, idx_level in cases: + result = df.unstack(level=level) + exp_data = np.zeros(18) * np.nan + exp_data[idces] = data + cols = pd.MultiIndex.from_product([[0, 1], col_level]) + expected = pd.DataFrame( + exp_data.reshape(3, 6), index=idx_level, columns=cols + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("cols", [["A", "C"], slice(None)]) + def test_unstack_unused_level(self, cols): + # GH 18562 : unused codes on the unstacked level + df = pd.DataFrame( + [[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"] + ) + + ind = df.set_index(["A", "B", "C"], drop=False) + selection = ind.loc[(slice(None), slice(None), "I"), cols] + result = selection.unstack() + + expected = ind.iloc[[0]][cols] + expected.columns = MultiIndex.from_product( + [expected.columns, ["I"]], names=[None, "C"] + ) + expected.index = expected.index.droplevel("C") + tm.assert_frame_equal(result, expected) + + def test_unstack_nan_index(self): # GH7466 + cast = lambda val: "{0:1}".format("" if val != val else val) + + def verify(df): + mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] + rows, cols = df.notna().values.nonzero() + for i, j in zip(rows, cols): + left = sorted(df.iloc[i, j].split(".")) + right = mk_list(df.index[i]) + mk_list(df.columns[j]) + right = sorted(map(cast, right)) + assert left == right + + df = DataFrame( + { + "jim": ["a", "b", np.nan, "d"], + "joe": ["w", "x", "y", "z"], + "jolie": ["a.w", "b.x", " .y", "d.z"], + } + ) + + left = df.set_index(["jim", "joe"]).unstack()["jolie"] + right = df.set_index(["joe", "jim"]).unstack()["jolie"].T + tm.assert_frame_equal(left, right) + + for idx in itertools.permutations(df.columns[:2]): + mi = df.set_index(list(idx)) + for lev in range(2): + udf = mi.unstack(level=lev) + assert udf.notna().values.sum() == len(df) + verify(udf["jolie"]) + + df = DataFrame( + { + "1st": ["d"] * 3 + + [np.nan] * 5 + + ["a"] * 2 + + ["c"] * 3 + + ["e"] * 2 + + ["b"] * 5, + "2nd": ["y"] * 2 + + ["w"] * 3 + + [np.nan] * 3 + + ["z"] * 4 + + [np.nan] * 3 + + ["x"] * 3 + + [np.nan] * 2, + "3rd": [ + 67, + 39, + 53, + 72, + 57, + 80, + 31, + 18, + 11, + 30, + 59, + 50, + 62, + 59, + 76, + 52, + 14, + 53, + 60, + 51, + ], + } + ) + + df["4th"], df["5th"] = ( + df.apply(lambda r: ".".join(map(cast, r)), axis=1), + df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), + ) + + for idx in itertools.permutations(["1st", "2nd", "3rd"]): + mi = df.set_index(list(idx)) + for lev in range(3): + udf = mi.unstack(level=lev) + assert udf.notna().values.sum() == 2 * len(df) + for col in ["4th", "5th"]: + verify(udf[col]) + + # GH7403 + df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) + df.iloc[3, 1] = np.NaN + left = df.set_index(["A", "B"]).unstack(0) + + vals = [ + [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], + ] + vals = list(map(list, zip(*vals))) + idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + + right = DataFrame(vals, columns=cols, index=idx) + tm.assert_frame_equal(left, right) + + df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) + df.iloc[2, 1] = np.NaN + left = df.set_index(["A", "B"]).unstack(0) + + vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") + right = 
DataFrame(vals, columns=cols, index=idx) + tm.assert_frame_equal(left, right) + + df = pd.DataFrame( + {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)} + ) + df.iloc[3, 1] = np.NaN + left = df.set_index(["A", "B"]).unstack(0) + + vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") + right = DataFrame(vals, columns=cols, index=idx) + tm.assert_frame_equal(left, right) + + # GH7401 + df = pd.DataFrame( + { + "A": list("aaaaabbbbb"), + "B": (date_range("2012-01-01", periods=5).tolist() * 2), + "C": np.arange(10), + } + ) + + df.iloc[3, 1] = np.NaN + left = df.set_index(["A", "B"]).unstack() + + vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) + idx = Index(["a", "b"], name="A") + cols = MultiIndex( + levels=[["C"], date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + + right = DataFrame(vals, columns=cols, index=idx) + tm.assert_frame_equal(left, right) + + # GH4862 + vals = [ + ["Hg", np.nan, np.nan, 680585148], + ["U", 0.0, np.nan, 680585148], + ["Pb", 7.07e-06, np.nan, 680585148], + ["Sn", 2.3614e-05, 0.0133, 680607017], + ["Ag", 0.0, 0.0133, 680607017], + ["Hg", -0.00015, 0.0133, 680607017], + ] + df = DataFrame( + vals, + columns=["agent", "change", "dosage", "s_id"], + index=[17263, 17264, 17265, 17266, 17267, 17268], + ) + + left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() + + vals = [ + [np.nan, np.nan, 7.07e-06, np.nan, 0.0], + [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], + ] + + idx = MultiIndex( + levels=[[680585148, 680607017], [0.0133]], + codes=[[0, 1], [-1, 0]], + names=["s_id", "dosage"], + ) + + cols = MultiIndex( + levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], + codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], + names=[None, "agent"], + ) + + right = DataFrame(vals, columns=cols, index=idx) + tm.assert_frame_equal(left, right) + + left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) + tm.assert_frame_equal(left.unstack(), right) + + # GH9497 - multiple unstack with nulls + df = DataFrame( + { + "1st": [1, 2, 1, 2, 1, 2], + "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), + "jim": 100 + np.arange(6), + "joe": (np.random.randn(6) * 10).round(2), + } + ) + + df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") + df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan + df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan + + left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) + assert left.notna().values.sum() == 2 * len(df) + + for col in ["jim", "joe"]: + for _, r in df.iterrows(): + key = r["1st"], (col, r["2nd"], r["3rd"]) + assert r[col] == left.loc[key] + + def test_stack_datetime_column_multiIndex(self): + # GH 8039 + t = datetime(2014, 1, 1) + df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) + result = df.stack() + + eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + ecols = MultiIndex.from_tuples([(t, "A")]) + expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) + tm.assert_frame_equal(result, expected) + + def test_stack_partial_multiIndex(self): + # GH 8844 + def _test_stack_with_multiindex(multiindex): + df = DataFrame( + np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), + columns=multiindex, + ) + for level in (-1, 0, 1, [0, 1], [1, 0]): + result = df.stack(level=level, dropna=False) + + if isinstance(level, int): + # Stacking a single 
level should not make any all-NaN rows, + # so df.stack(level=level, dropna=False) should be the same + # as df.stack(level=level, dropna=True). + expected = df.stack(level=level, dropna=True) + if isinstance(expected, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + + df.columns = MultiIndex.from_tuples( + df.columns.to_numpy(), names=df.columns.names + ) + expected = df.stack(level=level, dropna=False) + if isinstance(expected, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + + full_multiindex = MultiIndex.from_tuples( + [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], + names=["Upper", "Lower"], + ) + for multiindex_columns in ( + [0, 1, 2, 3, 4], + [0, 1, 2, 3], + [0, 1, 2, 4], + [0, 1, 2], + [1, 2, 3], + [2, 3, 4], + [0, 1], + [0, 2], + [0, 3], + [0], + [2], + [4], + ): + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) + if len(multiindex_columns) > 1: + multiindex_columns.reverse() + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) + + df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) + result = df.stack(dropna=False) + expected = DataFrame( + [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], + index=MultiIndex( + levels=[[0, 1], ["u", "x", "y", "z"]], + codes=[[0, 0, 1, 1], [1, 3, 1, 3]], + names=[None, "Lower"], + ), + columns=Index(["B", "C"], name="Upper"), + dtype=df.dtypes[0], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) + def test_stack_preserve_categorical_dtype(self, ordered, labels): + # GH13854 + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + df = DataFrame([[10, 11, 12]], columns=cidx) + result = df.stack() + + # `MultiIndex.from_product` preserves categorical dtype - + # it's tested elsewhere. 
+ midx = pd.MultiIndex.from_product([df.index, cidx]) + expected = Series([10, 11, 12], index=midx) + + tm.assert_series_equal(result, expected) + + def test_stack_preserve_categorical_dtype_values(self): + # GH-23077 + cat = pd.Categorical(["a", "a", "b", "c"]) + df = pd.DataFrame({"A": cat, "B": cat}) + result = df.stack() + index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + expected = pd.Series( + pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "index, columns", + [ + ([0, 0, 1, 1], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ([0, 0, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ([0, 1, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ], + ) + def test_stack_multi_columns_non_unique_index(self, index, columns): + # GH-28301 + df = pd.DataFrame(index=index, columns=columns).fillna(1) + stacked = df.stack() + new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy()) + expected = pd.DataFrame( + stacked.to_numpy(), index=new_index, columns=stacked.columns + ) + tm.assert_frame_equal(stacked, expected) + stacked_codes = np.asarray(stacked.index.codes) + expected_codes = np.asarray(new_index.codes) + tm.assert_numpy_array_equal(stacked_codes, expected_codes) + + @pytest.mark.parametrize("level", [0, 1]) + def test_unstack_mixed_extension_types(self, level): + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"] + ) + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([0, 1, None]), + "B": pd.Categorical(["a", "a", "b"]), + }, + index=index, + ) + + result = df.unstack(level=level) + expected = df.astype(object).unstack(level=level) + + expected_dtypes = pd.Series( + [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns + ) + tm.assert_series_equal(result.dtypes, expected_dtypes) + tm.assert_frame_equal(result.astype(object), expected) + + @pytest.mark.parametrize("level", [0, "baz"]) + def test_unstack_swaplevel_sortlevel(self, level): + # GH 20994 + mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) + df.columns.name = "foo" + + expected = pd.DataFrame( + [[3, 1, 2, 0]], + columns=pd.MultiIndex.from_tuples( + [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"] + ), + ) + expected.index.name = "bar" + + result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) + tm.assert_frame_equal(result, expected) + + +def test_unstack_fill_frame_object(): + # GH12815 Test unstacking with object. 
+ data = pd.Series(["a", "b", "c", "a"], dtype="object") + data.index = pd.MultiIndex.from_tuples( + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) + + # By default missing values will be NaN + result = data.unstack() + expected = pd.DataFrame( + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + ) + tm.assert_frame_equal(result, expected) + + # Fill with any value replaces missing values as expected + result = data.unstack(fill_value="d") + expected = pd.DataFrame( + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_timezone_aware_values(): + # GH 18338 + df = pd.DataFrame( + { + "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")], + "a": ["a"], + "b": ["b"], + "c": ["c"], + }, + columns=["timestamp", "a", "b", "c"], + ) + result = df.set_index(["a", "b"]).unstack() + expected = pd.DataFrame( + [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]], + index=pd.Index(["a"], name="a"), + columns=pd.MultiIndex( + levels=[["timestamp", "c"], ["b"]], + codes=[[0, 1], [0, 0]], + names=[None, "b"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_stack_timezone_aware_values(): + # GH 19420 + ts = pd.date_range( + freq="D", start="20180101", end="20180103", tz="America/New_York" + ) + df = pd.DataFrame({"A": ts}, index=["a", "b", "c"]) + result = df.stack() + expected = pd.Series( + ts, + index=pd.MultiIndex( + levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]] + ), + ) + tm.assert_series_equal(result, expected) + + +def test_unstacking_multi_index_df(): + # see gh-30740 + df = DataFrame( + { + "name": ["Alice", "Bob"], + "score": [9.5, 8], + "employed": [False, True], + "kids": [0, 0], + "gender": ["female", "male"], + } + ) + df = df.set_index(["name", "employed", "kids", "gender"]) + df = df.unstack(["gender"], fill_value=0) + expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0) + result = df.unstack(["employed", "kids"], fill_value=0) + expected = DataFrame( + [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]], + index=Index(["Alice", "Bob"], name="name"), + columns=MultiIndex.from_tuples( + [ + ("score", "female", False, 0), + ("score", "female", True, 0), + ("score", "male", False, 0), + ("score", "male", True, 0), + ], + names=[None, "gender", "employed", "kids"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_sort_values_level_as_str.py b/venv/Lib/site-packages/pandas/tests/frame/test_sort_values_level_as_str.py new file mode 100644 index 0000000..40526ab --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_sort_values_level_as_str.py @@ -0,0 +1,92 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture +def df_none(): + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) + + +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) +def df_idx(request, df_none): + levels = request.param + return df_none.set_index(levels) + + +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + 
["inner", "outer"], # two index levels and column + ] +) +def sort_names(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + return request.param + + +def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending): + + # GH 14353 + + # Get index levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on columns and the setting index + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) + + # Compute result sorting on mix on columns and index levels + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) + + tm.assert_frame_equal(result, expected) + + +def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending): + + # GH 14353 + + # Get levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on axis=0, setting index levels, and then + # transposing. For some cases this will result in a frame with + # multiple column levels + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) + + # Compute result by transposing and sorting on axis=1. + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + + if len(levels) > 1: + # Accessing multi-level columns that are not lexsorted raises a + # performance warning + with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_subclass.py b/venv/Lib/site-packages/pandas/tests/frame/test_subclass.py new file mode 100644 index 0000000..4a436d7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_subclass.py @@ -0,0 +1,559 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm + + +class TestDataFrameSubclassing: + def test_frame_subclassing_and_slicing(self): + # Subclass frame and ensure it returns the right class on slicing it + # In reference to PR 9632 + + class CustomSeries(Series): + @property + def _constructor(self): + return CustomSeries + + def custom_series_function(self): + return "OK" + + class CustomDataFrame(DataFrame): + """ + Subclasses pandas DF, fills DF with simulation results, adds some + custom plotting functions. + """ + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + + @property + def _constructor(self): + return CustomDataFrame + + _constructor_sliced = CustomSeries + + def custom_frame_function(self): + return "OK" + + data = {"col1": range(10), "col2": range(10)} + cdf = CustomDataFrame(data) + + # Did we get back our own DF class? + assert isinstance(cdf, CustomDataFrame) + + # Do we get back our own Series class after selecting a column? + cdf_series = cdf.col1 + assert isinstance(cdf_series, CustomSeries) + assert cdf_series.custom_series_function() == "OK" + + # Do we get back our own DF class after slicing row-wise? 
+ cdf_rows = cdf[1:5] + assert isinstance(cdf_rows, CustomDataFrame) + assert cdf_rows.custom_frame_function() == "OK" + + # Make sure sliced part of multi-index frame is custom class + mcol = pd.MultiIndex.from_tuples([("A", "A"), ("A", "B")]) + cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) + assert isinstance(cdf_multi["A"], CustomDataFrame) + + mcol = pd.MultiIndex.from_tuples([("A", ""), ("B", "")]) + cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) + assert isinstance(cdf_multi2["A"], CustomSeries) + + def test_dataframe_metadata(self): + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + df.testattr = "XXX" + + assert df.testattr == "XXX" + assert df[["X"]].testattr == "XXX" + assert df.loc[["a", "b"], :].testattr == "XXX" + assert df.iloc[[0, 1], :].testattr == "XXX" + + # see gh-9776 + assert df.iloc[0:1, :].testattr == "XXX" + + # see gh-10553 + unpickled = tm.round_trip_pickle(df) + tm.assert_frame_equal(df, unpickled) + assert df._metadata == unpickled._metadata + assert df.testattr == unpickled.testattr + + def test_indexing_sliced(self): + # GH 11559 + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"] + ) + res = df.loc[:, "X"] + exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + res = df.iloc[:, 1] + exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + res = df.loc[:, "Z"] + exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + res = df.loc["a", :] + exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + res = df.iloc[1, :] + exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + res = df.loc["c", :] + exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c") + tm.assert_series_equal(res, exp) + assert isinstance(res, tm.SubclassedSeries) + + def test_subclass_attr_err_propagation(self): + # GH 11808 + class A(DataFrame): + @property + def bar(self): + return self.i_dont_exist + + with pytest.raises(AttributeError, match=".*i_dont_exist.*"): + A().bar + + def test_subclass_align(self): + # GH 12983 + df1 = tm.SubclassedDataFrame( + {"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE") + ) + df2 = tm.SubclassedDataFrame( + {"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD") + ) + + res1, res2 = df1.align(df2, axis=0) + exp1 = tm.SubclassedDataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = tm.SubclassedDataFrame( + {"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]}, + index=list("ABCDE"), + ) + assert isinstance(res1, tm.SubclassedDataFrame) + tm.assert_frame_equal(res1, exp1) + assert isinstance(res2, tm.SubclassedDataFrame) + tm.assert_frame_equal(res2, exp2) + + res1, res2 = df1.a.align(df2.c) + assert isinstance(res1, tm.SubclassedSeries) + tm.assert_series_equal(res1, exp1.a) + assert isinstance(res2, tm.SubclassedSeries) + tm.assert_series_equal(res2, exp2.c) + + def test_subclass_align_combinations(self): + # GH 12983 + df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, 
index=list("ACE")) + s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x") + + # frame + series + res1, res2 = df.align(s, axis=0) + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + # name is lost when + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") + + assert isinstance(res1, tm.SubclassedDataFrame) + tm.assert_frame_equal(res1, exp1) + assert isinstance(res2, tm.SubclassedSeries) + tm.assert_series_equal(res2, exp2) + + # series + frame + res1, res2 = s.align(df) + assert isinstance(res1, tm.SubclassedSeries) + tm.assert_series_equal(res1, exp2) + assert isinstance(res2, tm.SubclassedDataFrame) + tm.assert_frame_equal(res2, exp1) + + def test_subclass_iterrows(self): + # GH 13977 + df = tm.SubclassedDataFrame({"a": [1]}) + for i, row in df.iterrows(): + assert isinstance(row, tm.SubclassedSeries) + tm.assert_series_equal(row, df.loc[i]) + + def test_subclass_stack(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) + + res = df.stack() + exp = tm.SubclassedSeries( + [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] + ) + + tm.assert_series_equal(res, exp) + + def test_subclass_stack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), + columns=MultiIndex.from_tuples( + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) + + res = df.stack() + tm.assert_frame_equal(res, exp) + + res = df.stack("yyy") + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame( + [ + [10, 11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") + tm.assert_frame_equal(res, exp) + + def test_subclass_stack_multi_mixed(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), + columns=MultiIndex.from_tuples( + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12.0], + [11, 13.0], + [20, 22.0], + [21, 23.0], + [30, 32.0], + [31, 33.0], + [40, 42.0], + [41, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) + + res = df.stack() + tm.assert_frame_equal(res, exp) + + res = df.stack("yyy") + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame( + [ + [10.0, 11.0], + [12.0, 13.0], + [20.0, 21.0], + [22.0, 23.0], + [30.0, 31.0], + [32.0, 33.0], + [40.0, 41.0], + [42.0, 43.0], + ], + index=MultiIndex.from_tuples( + 
list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") + tm.assert_frame_equal(res, exp) + + def test_subclass_unstack(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) + + res = df.unstack() + exp = tm.SubclassedSeries( + [1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")] + ) + + tm.assert_series_equal(res, exp) + + def test_subclass_unstack_multi(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], + index=MultiIndex.from_tuples( + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), + columns=MultiIndex.from_tuples( + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + + res = df.unstack("ccc") + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame( + [[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) + + res = df.unstack("aaa") + tm.assert_frame_equal(res, exp) + + def test_subclass_unstack_multi_mixed(self): + # GH 15564 + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), + columns=MultiIndex.from_tuples( + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], + [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0], + ], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) + + res = df.unstack() + tm.assert_frame_equal(res, exp) + + res = df.unstack("ccc") + tm.assert_frame_equal(res, exp) + + exp = tm.SubclassedDataFrame( + [ + [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0], + [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0], + ], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) + + res = df.unstack("aaa") + tm.assert_frame_equal(res, exp) + + def test_subclass_pivot(self): + # GH 15564 + df = tm.SubclassedDataFrame( + { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + ) + + pivoted = df.pivot(index="index", columns="columns", values="values") + + expected = tm.SubclassedDataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) + + expected.index.name, expected.columns.name = "index", "columns" + + tm.assert_frame_equal(pivoted, expected) + + def test_subclassed_melt(self): + # GH 15564 + cheese = tm.SubclassedDataFrame( + { + "first": ["John", "Mary"], + 
"last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) + + melted = pd.melt(cheese, id_vars=["first", "last"]) + + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) + + tm.assert_frame_equal(melted, expected) + + def test_subclassed_wide_to_long(self): + # GH 9762 + + np.random.seed(123) + x = np.random.randn(3) + df = tm.SubclassedDataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) + + df["id"] = df.index + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } + expected = tm.SubclassedDataFrame(exp_data) + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") + + tm.assert_frame_equal(long_frame, expected) + + def test_subclassed_apply(self): + # GH 19822 + + def check_row_subclass(row): + assert isinstance(row, tm.SubclassedSeries) + + def strech(row): + if row["variable"] == "height": + row["value"] += 0.5 + return row + + df = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) + + df.apply(lambda x: check_row_subclass(x)) + df.apply(lambda x: check_row_subclass(x), axis=1) + + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 6.0], + ["Mary", "Bo", "height", 6.5], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) + + result = df.apply(lambda x: strech(x), axis=1) + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) + + result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1) + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + assert not isinstance(result, tm.SubclassedDataFrame) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_timeseries.py b/venv/Lib/site-packages/pandas/tests/frame/test_timeseries.py new file mode 100644 index 0000000..e89f4ee --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_timeseries.py @@ -0,0 +1,549 @@ +from datetime import datetime, time +from itertools import product + +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +@pytest.fixture(params=product([True, False], [True, False])) +def close_open_fixture(request): + return request.param + + +class 
TestDataFrameTimeSeriesMethods: + def test_frame_ctor_datetime64_column(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + dates = np.asarray(rng) + + df = DataFrame({"A": np.random.randn(len(rng)), "B": dates}) + assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]")) + + def test_frame_append_datetime64_column(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + df = DataFrame(index=np.arange(len(rng))) + + df["A"] = rng + assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]")) + + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) + # it works! + repr(df) + + def test_frame_append_datetime64_col_other_units(self): + n = 100 + + units = ["h", "m", "s", "ms", "D", "M", "Y"] + + ns_dtype = np.dtype("M8[ns]") + + for unit in units: + dtype = np.dtype("M8[{unit}]".format(unit=unit)) + vals = np.arange(n, dtype=np.int64).view(dtype) + + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + ex_vals = to_datetime(vals.astype("O")).values + + assert df[unit].dtype == ns_dtype + assert (df[unit].values == ex_vals).all() + + # Test insertion into existing datetime64 column + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) + + for unit in units: + dtype = np.dtype("M8[{unit}]".format(unit=unit)) + vals = np.arange(n, dtype=np.int64).view(dtype) + + tmp = df.copy() + + tmp["dates"] = vals + ex_vals = to_datetime(vals.astype("O")).values + + assert (tmp["dates"].values == ex_vals).all() + + def test_asfreq(self, datetime_frame): + offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) + rule_monthly = datetime_frame.asfreq("BM") + + tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"]) + + filled = rule_monthly.asfreq("B", method="pad") # noqa + # TODO: actually check that this worked. + + # don't forget! 
+ filled_dep = rule_monthly.asfreq("B", method="pad") # noqa + + # test does not blow up on length-0 DataFrame + zero_length = datetime_frame.reindex([]) + result = zero_length.asfreq("BM") + assert result is not zero_length + + def test_asfreq_datetimeindex(self): + df = DataFrame( + {"A": [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)], + ) + df = df.asfreq("B") + assert isinstance(df.index, DatetimeIndex) + + ts = df["A"].asfreq("B") + assert isinstance(ts.index, DatetimeIndex) + + def test_asfreq_fillvalue(self): + # test for fill value during upsampling, related to issue 3715 + + # setup + rng = pd.date_range("1/1/2016", periods=10, freq="2S") + ts = pd.Series(np.arange(len(rng)), index=rng) + df = pd.DataFrame({"one": ts}) + + # insert pre-existing missing value + df.loc["2016-01-01 00:00:08", "one"] = None + + actual_df = df.asfreq(freq="1S", fill_value=9.0) + expected_df = df.asfreq(freq="1S").fillna(9.0) + expected_df.loc["2016-01-01 00:00:08", "one"] = None + tm.assert_frame_equal(expected_df, actual_df) + + expected_series = ts.asfreq(freq="1S").fillna(9.0) + actual_series = ts.asfreq(freq="1S", fill_value=9.0) + tm.assert_series_equal(expected_series, actual_series) + + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid( + self, float_frame, data, idx, expected_first, expected_last + ): + N = len(float_frame.index) + mat = np.random.randn(N) + mat[:5] = np.nan + mat[-5:] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + index = frame.first_valid_index() + + assert index == frame.index[5] + + index = frame.last_valid_index() + assert index == frame.index[-6] + + # GH12800 + empty = DataFrame() + assert empty.last_valid_index() is None + assert empty.first_valid_index() is None + + # GH17400: no valid entries + frame[:] = np.nan + assert frame.last_valid_index() is None + assert frame.first_valid_index() is None + + # GH20499: its preserves freq with holes + frame.index = date_range("20110101", periods=N, freq="B") + frame.iloc[1] = 1 + frame.iloc[-2] = 1 + assert frame.first_valid_index() == frame.index[1] + assert frame.last_valid_index() == frame.index[-2] + assert frame.first_valid_index().freq == frame.index.freq + assert frame.last_valid_index().freq == frame.index.freq + + # GH 21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_first_valid_index_all_nan(self, klass): + # GH#9752 Series/DataFrame should both return None, not raise + obj = klass([np.nan]) + + assert obj.first_valid_index() is None + assert obj.iloc[:0].first_valid_index() is None + + def test_first_subset(self): + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.first("10d") + assert len(result) == 20 + + ts = tm.makeTimeDataFrame(freq="D") + result = ts.first("10d") + assert len(result) == 10 + + result = ts.first("3M") + expected = ts[:"3/31/2000"] + tm.assert_frame_equal(result, expected) + + result = ts.first("21D") + expected = ts[:21] + tm.assert_frame_equal(result, expected) + + result = ts[:0].first("3M") + tm.assert_frame_equal(result, ts[:0]) + + def 
test_first_raises(self): + # GH20725 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.first("1D") + + def test_last_subset(self): + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.last("10d") + assert len(result) == 20 + + ts = tm.makeTimeDataFrame(nper=30, freq="D") + result = ts.last("10d") + assert len(result) == 10 + + result = ts.last("21D") + expected = ts["2000-01-10":] + tm.assert_frame_equal(result, expected) + + result = ts.last("21D") + expected = ts[-21:] + tm.assert_frame_equal(result, expected) + + result = ts[:0].last("3M") + tm.assert_frame_equal(result, ts[:0]) + + def test_last_raises(self): + # GH20725 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.last("1D") + + def test_at_time(self): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + rs = ts.at_time(rng[1]) + assert (rs.index.hour == rng[1].hour).all() + assert (rs.index.minute == rng[1].minute).all() + assert (rs.index.second == rng[1].second).all() + + result = ts.at_time("9:30") + expected = ts.at_time(time(9, 30)) + tm.assert_frame_equal(result, expected) + + result = ts.loc[time(9, 30)] + expected = ts.loc[(rng.hour == 9) & (rng.minute == 30)] + + tm.assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range("1/1/2000", "1/31/2000") + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts.at_time(time(0, 0)) + tm.assert_frame_equal(result, ts) + + # time doesn't exist + rng = date_range("1/1/2012", freq="23Min", periods=384) + ts = DataFrame(np.random.randn(len(rng), 2), rng) + rs = ts.at_time("16:00") + assert len(rs) == 0 + + @pytest.mark.parametrize( + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + ) + def test_at_time_errors(self, hour): + # GH 24043 + dti = pd.date_range("2018", periods=3, freq="H") + df = pd.DataFrame(list(range(len(dti))), index=dti) + if getattr(hour, "tzinfo", None) is None: + result = df.at_time(hour) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="Index must be timezone"): + df.at_time(hour) + + def test_at_time_tz(self): + # GH 24043 + dti = pd.date_range("2018", periods=3, freq="H", tz="US/Pacific") + df = pd.DataFrame(list(range(len(dti))), index=dti) + result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + + def test_at_time_raises(self): + # GH20725 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.at_time("00:00") + + @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) + def test_at_time_axis(self, axis): + # issue 8839 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), len(rng))) + ts.index, ts.columns = rng, rng + + indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)] + + if axis in ["index", 0]: + expected = ts.loc[indices, :] + elif axis in ["columns", 1]: + expected = ts.loc[:, indices] + + result = ts.at_time("9:30", axis=axis) + tm.assert_frame_equal(result, expected) + + def test_between_time(self, close_open_fixture): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(0, 0) + etime = time(1, 0) + inc_start, inc_end = close_open_fixture + + filtered = 
ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inc_start: + assert t >= stime + else: + assert t > stime + + if inc_end: + assert t <= etime + else: + assert t < etime + + result = ts.between_time("00:00", "01:00") + expected = ts.between_time(stime, etime) + tm.assert_frame_equal(result, expected) + + # across midnight + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inc_start: + assert (t >= stime) or (t <= etime) + else: + assert (t > stime) or (t <= etime) + + if inc_end: + assert (t <= etime) or (t >= stime) + else: + assert (t < etime) or (t >= stime) + + def test_between_time_raises(self): + # GH20725 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.between_time(start_time="00:00", end_time="12:00") + + def test_between_time_axis(self, axis): + # issue 8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + ts = DataFrame(np.random.randn(len(rng), len(rng))) + stime, etime = ("08:00:00", "09:00:00") + exp_len = 7 + + if axis in ["index", 0]: + ts.index = rng + assert len(ts.between_time(stime, etime)) == exp_len + assert len(ts.between_time(stime, etime, axis=0)) == exp_len + + if axis in ["columns", 1]: + ts.columns = rng + selected = ts.between_time(stime, etime, axis=1).columns + assert len(selected) == exp_len + + def test_between_time_axis_raises(self, axis): + # issue 8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + mask = np.arange(0, len(rng)) + rand_data = np.random.randn(len(rng), len(rng)) + ts = DataFrame(rand_data, index=rng, columns=rng) + stime, etime = ("08:00:00", "09:00:00") + + msg = "Index must be DatetimeIndex" + if axis in ["columns", 1]: + ts.index = mask + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime) + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=0) + + if axis in ["index", 0]: + ts.columns = mask + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=1) + + def test_operation_on_NaT(self): + # Both NaT and Timestamp are in DataFrame. + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) + + res = df.min() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + # GH12941, only NaTs are in DataFrame. 
+ df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) + + res = df.min() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + def test_datetime_assignment_with_NaT_and_diff_time_units(self): + # GH 7492 + data_ns = np.array([1, "nat"], dtype="datetime64[ns]") + result = pd.Series(data_ns).to_frame() + result["new"] = data_ns + expected = pd.DataFrame( + {0: [1, None], "new": [1, None]}, dtype="datetime64[ns]" + ) + tm.assert_frame_equal(result, expected) + # OutOfBoundsDatetime error shouldn't occur + data_s = np.array([1, "nat"], dtype="datetime64[s]") + result["new"] = data_s + expected = pd.DataFrame( + {0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]" + ) + tm.assert_frame_equal(result, expected) + + def test_frame_to_period(self): + K = 5 + + dr = date_range("1/1/2000", "1/1/2001") + pr = period_range("1/1/2000", "1/1/2001") + df = DataFrame(np.random.randn(len(dr), K), index=dr) + df["mix"] = "a" + + pts = df.to_period() + exp = df.copy() + exp.index = pr + tm.assert_frame_equal(pts, exp) + + pts = df.to_period("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = pr + tm.assert_frame_equal(pts, exp) + + pts = df.to_period("M", axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) + + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + df.to_period(axis=2) + + @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"]) + def test_tz_convert_and_localize(self, fn): + l0 = date_range("20140701", periods=5, freq="D") + l1 = date_range("20140701", periods=5, freq="D") + + int_idx = Index(range(5)) + + if fn == "tz_convert": + l0 = l0.tz_localize("UTC") + l1 = l1.tz_localize("UTC") + + for idx in [l0, l1]: + + l0_expected = getattr(idx, fn)("US/Pacific") + l1_expected = getattr(idx, fn)("US/Pacific") + + df1 = DataFrame(np.ones(5), index=l0) + df1 = getattr(df1, fn)("US/Pacific") + tm.assert_index_equal(df1.index, l0_expected) + + # MultiIndex + # GH7846 + df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) + + df3 = getattr(df2, fn)("US/Pacific", level=0) + assert not df3.index.levels[0].equals(l0) + tm.assert_index_equal(df3.index.levels[0], l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1) + assert not df3.index.levels[1].equals(l1_expected) + + df3 = getattr(df2, fn)("US/Pacific", level=1) + tm.assert_index_equal(df3.index.levels[0], l0) + assert not df3.index.levels[0].equals(l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1_expected) + assert not df3.index.levels[1].equals(l1) + + df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + + # TODO: untested + df5 = getattr(df4, fn)("US/Pacific", level=1) # noqa + + tm.assert_index_equal(df3.index.levels[0], l0) + assert not df3.index.levels[0].equals(l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1_expected) + assert not df3.index.levels[1].equals(l1) + + # Bad Inputs + + # Not DatetimeIndex / PeriodIndex + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(index=int_idx) + df = getattr(df, fn)("US/Pacific") + + # Not DatetimeIndex / PeriodIndex + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + df = getattr(df, fn)("US/Pacific", level=0) + + # Invalid level + with pytest.raises(ValueError, match="not valid"): + df = 
DataFrame(index=l0) + df = getattr(df, fn)("US/Pacific", level=1) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_timezones.py b/venv/Lib/site-packages/pandas/tests/frame/test_timezones.py new file mode 100644 index 0000000..b60f205 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_timezones.py @@ -0,0 +1,215 @@ +""" +Tests for DataFrame timezone-related methods +""" +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range + + +class TestDataFrameTimezones: + def test_frame_values_with_tz(self): + tz = "US/Central" + df = DataFrame({"A": date_range("2000", periods=4, tz=tz)}) + result = df.values + expected = np.array( + [ + [pd.Timestamp("2000-01-01", tz=tz)], + [pd.Timestamp("2000-01-02", tz=tz)], + [pd.Timestamp("2000-01-03", tz=tz)], + [pd.Timestamp("2000-01-04", tz=tz)], + ] + ) + tm.assert_numpy_array_equal(result, expected) + + # two columns, homogenous + + df = df.assign(B=df.A) + result = df.values + expected = np.concatenate([expected, expected], axis=1) + tm.assert_numpy_array_equal(result, expected) + + # three columns, heterogeneous + est = "US/Eastern" + df = df.assign(C=df.A.dt.tz_convert(est)) + + new = np.array( + [ + [pd.Timestamp("2000-01-01T01:00:00", tz=est)], + [pd.Timestamp("2000-01-02T01:00:00", tz=est)], + [pd.Timestamp("2000-01-03T01:00:00", tz=est)], + [pd.Timestamp("2000-01-04T01:00:00", tz=est)], + ] + ) + expected = np.concatenate([expected, new], axis=1) + result = df.values + tm.assert_numpy_array_equal(result, expected) + + def test_frame_from_records_utc(self): + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index="begin_time") + + def test_frame_tz_localize(self): + rng = date_range("1/1/2011", periods=100, freq="H") + + df = DataFrame({"a": 1}, index=rng) + result = df.tz_localize("utc") + expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) + assert result.index.tz.zone == "UTC" + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_localize("utc", axis=1) + assert result.columns.tz.zone == "UTC" + tm.assert_frame_equal(result, expected.T) + + def test_frame_tz_convert(self): + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + + df = DataFrame({"a": 1}, index=rng) + result = df.tz_convert("Europe/Berlin") + expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + assert result.index.tz.zone == "Europe/Berlin" + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_convert("Europe/Berlin", axis=1) + assert result.columns.tz.zone == "Europe/Berlin" + tm.assert_frame_equal(result, expected.T) + + def test_frame_join_tzaware(self): + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") + ex_index = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, ex_index) + assert result.index.tz.zone == "US/Central" + + def test_frame_add_tz_mismatch_converts_to_utc(self): + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + df = 
DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"]) + + df_moscow = df.tz_convert("Europe/Moscow") + result = df + df_moscow + assert result.index.tz is pytz.utc + + result = df_moscow + df + assert result.index.tz is pytz.utc + + def test_frame_align_aware(self): + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") + df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) + df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) + new1, new2 = df1.align(df2) + assert df1.index.tz == new1.index.tz + assert df2.index.tz == new2.index.tz + + # different timezones convert to UTC + + # frame with frame + df1_central = df1.tz_convert("US/Central") + new1, new2 = df1.align(df1_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + # frame with Series + new1, new2 = df1.align(df1_central[0], axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + df1[0].align(df1_central, axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_frame_no_datetime64_dtype(self, tz): + # after GH#7822 + # these retain the timezones on dict construction + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") + dr_tz = dr.tz_localize(tz) + df = DataFrame({"A": "foo", "B": dr_tz}, index=dr) + tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo) + assert df["B"].dtype == tz_expected + + # GH#2810 (with timezones) + datetimes_naive = [ts.to_pydatetime() for ts in dr] + datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] + df = DataFrame({"dr": dr}) + df["dr_tz"] = dr_tz + df["datetimes_naive"] = datetimes_naive + df["datetimes_with_tz"] = datetimes_with_tz + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + ], + index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_frame_reset_index(self, tz): + dr = date_range("2012-06-02", periods=10, tz=tz) + df = DataFrame(np.random.randn(len(dr)), dr) + roundtripped = df.reset_index().set_index("index") + xp = df.index.tz + rs = roundtripped.index.tz + assert xp == rs + + @pytest.mark.parametrize("tz", [None, "America/New_York"]) + def test_boolean_compare_transpose_tzindex_with_dst(self, tz): + # GH 19970 + idx = date_range("20161101", "20161130", freq="4H", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) + result = df.T == df.T + expected = DataFrame(True, index=list("ab"), columns=idx) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) + def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): + # GH 6326 + result = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + tm.assert_frame_equal(result, expected) + + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): + # GH 25843 + tz = tz_aware_fixture + result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") + expected = 
DataFrame({"d": [pd.Timestamp("2019")]}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_to_csv.py b/venv/Lib/site-packages/pandas/tests/frame/test_to_csv.py new file mode 100644 index 0000000..aeff929 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_to_csv.py @@ -0,0 +1,1358 @@ +import csv +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import ParserError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + read_csv, + to_datetime, +) +import pandas._testing as tm +import pandas.core.common as com + +from pandas.io.common import get_handle + +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] + + +class TestDataFrameToCSV: + def read_csv(self, path, **kwargs): + params = dict(index_col=0, parse_dates=True) + params.update(**kwargs) + + return pd.read_csv(path, **params) + + def test_to_csv_from_csv1(self, float_frame, datetime_frame): + + with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: + float_frame["A"][:5] = np.nan + + float_frame.to_csv(path) + float_frame.to_csv(path, columns=["A", "B"]) + float_frame.to_csv(path, header=False) + float_frame.to_csv(path, index=False) + + # test roundtrip + datetime_frame.to_csv(path) + recons = self.read_csv(path) + tm.assert_frame_equal(datetime_frame, recons) + + datetime_frame.to_csv(path, index_label="index") + recons = self.read_csv(path, index_col=None) + + assert len(recons.columns) == len(datetime_frame.columns) + 1 + + # no index + datetime_frame.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None) + tm.assert_almost_equal(datetime_frame.values, recons.values) + + # corner case + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3)), + "s2": Series(range(2), index=np.arange(2)), + } + ) + dm.to_csv(path) + + recons = self.read_csv(path) + tm.assert_frame_equal(dm, recons) + + def test_to_csv_from_csv2(self, float_frame): + + with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path: + + # duplicate index + df = DataFrame( + np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"] + ) + df.to_csv(path) + result = self.read_csv(path) + tm.assert_frame_equal(result, df) + + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) + + df.to_csv(path) + result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) + tm.assert_frame_equal(result, df, check_names=False) + + # column aliases + col_aliases = Index(["AA", "X", "Y", "Z"]) + float_frame.to_csv(path, header=col_aliases) + + rs = self.read_csv(path) + xp = float_frame.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + msg = "Writing 4 cols but got 2 aliases" + with pytest.raises(ValueError, match=msg): + float_frame.to_csv(path, header=["AA", "X"]) + + def test_to_csv_from_csv3(self): + + with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path: + df1 = DataFrame(np.random.randn(3, 1)) + df2 = DataFrame(np.random.randn(3, 1)) + + df1.to_csv(path) + df2.to_csv(path, mode="a", header=False) + xp = pd.concat([df1, df2]) + rs = pd.read_csv(path, index_col=0) + rs.columns = [int(label) for label in rs.columns] + xp.columns = [int(label) for label in xp.columns] + tm.assert_frame_equal(xp, rs) + + def test_to_csv_from_csv4(self): 
+ + with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path: + # GH 10833 (TimedeltaIndex formatting) + dt = pd.Timedelta(seconds=1) + df = pd.DataFrame( + {"dt_data": [i * dt for i in range(3)]}, + index=pd.Index([i * dt for i in range(3)], name="dt_index"), + ) + df.to_csv(path) + + result = pd.read_csv(path, index_col="dt_index") + result.index = pd.to_timedelta(result.index) + # TODO: remove renaming when GH 10875 is solved + result.index = result.index.rename("dt_index") + result["dt_data"] = pd.to_timedelta(result["dt_data"]) + + tm.assert_frame_equal(df, result, check_index_type=True) + + def test_to_csv_from_csv5(self, timezone_frame): + + # tz, 8260 + with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path: + + timezone_frame.to_csv(path) + result = pd.read_csv(path, index_col=0, parse_dates=["A"]) + + converter = ( + lambda c: to_datetime(result[c]) + .dt.tz_convert("UTC") + .dt.tz_convert(timezone_frame[c].dt.tz) + ) + result["B"] = converter("B") + result["C"] = converter("C") + tm.assert_frame_equal(result, timezone_frame) + + def test_to_csv_cols_reordering(self): + # GH3454 + import pandas as pd + + chunksize = 5 + N = int(chunksize * 2.5) + + df = tm.makeCustomDataframe(N, 3) + cs = df.columns + cols = [cs[2], cs[0]] + + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = pd.read_csv(path, index_col=0) + + tm.assert_frame_equal(df[cols], rs_c, check_names=False) + + def test_to_csv_new_dupe_cols(self): + import pandas as pd + + def _check_df(df, cols=None): + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = pd.read_csv(path, index_col=0) + + # we wrote them in a different order + # so compare them in that order + if cols is not None: + + if df.columns.is_unique: + rs_c.columns = cols + else: + indexer, missing = df.columns.get_indexer_non_unique(cols) + rs_c.columns = df.columns.take(indexer) + + for c in cols: + obj_df = df[c] + obj_rs = rs_c[c] + if isinstance(obj_df, Series): + tm.assert_series_equal(obj_df, obj_rs) + else: + tm.assert_frame_equal(obj_df, obj_rs, check_names=False) + + # wrote in the same order + else: + rs_c.columns = df.columns + tm.assert_frame_equal(df, rs_c, check_names=False) + + chunksize = 5 + N = int(chunksize * 2.5) + + # dupe cols + df = tm.makeCustomDataframe(N, 3) + df.columns = ["a", "a", "b"] + _check_df(df, None) + + # dupe cols with selection + cols = ["b", "a"] + _check_df(df, cols) + + @pytest.mark.slow + def test_to_csv_dtnat(self): + # GH3437 + from pandas import NaT + + def make_dtnat_arr(n, nnat=None): + if nnat is None: + nnat = int(n * 0.1) # 10% + s = list(date_range("2000", freq="5min", periods=n)) + if nnat: + for i in np.random.randint(0, len(s), nnat): + s[i] = NaT + i = np.random.randint(100) + s[-i] = NaT + s[i] = NaT + return s + + chunksize = 1000 + # N=35000 + s1 = make_dtnat_arr(chunksize + 5) + s2 = make_dtnat_arr(chunksize + 5, 0) + + # s3=make_dtnjat_arr(chunksize+5,0) + with tm.ensure_clean("1.csv") as pth: + df = DataFrame(dict(a=s1, b=s2)) + df.to_csv(pth, chunksize=chunksize) + + recons = self.read_csv(pth)._convert(datetime=True, coerce=True) + tm.assert_frame_equal( + df, recons, check_names=False, check_less_precise=True + ) + + @pytest.mark.slow + def test_to_csv_moar(self): + def _do_test( + df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False + ): + + kwargs = dict(parse_dates=False) + if cnlvl: + if rnlvl is not None: + kwargs["index_col"] = list(range(rnlvl)) + kwargs["header"] = list(range(cnlvl)) 
+ + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + else: + kwargs["header"] = 0 + + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + + def _to_uni(x): + if not isinstance(x, str): + return x.decode("utf8") + return x + + if dupe_col: + # read_Csv disambiguates the columns by + # labeling them dupe.1,dupe.2, etc'. monkey patch columns + recons.columns = df.columns + if rnlvl and not cnlvl: + delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)] + ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) + recons.index = ix + recons = recons.iloc[:, rnlvl - 1 :] + + type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") + if r_dtype: + if r_dtype == "u": # unicode + r_dtype = "O" + recons.index = np.array( + [_to_uni(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [_to_uni(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "dt": # unicode + r_dtype = "O" + recons.index = np.array( + [Timestamp(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [Timestamp(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "p": + r_dtype = "O" + idx_list = to_datetime(recons.index) + recons.index = np.array( + [Timestamp(label) for label in idx_list], dtype=r_dtype + ) + df.index = np.array( + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype + ) + else: + r_dtype = type_map.get(r_dtype) + recons.index = np.array(recons.index, dtype=r_dtype) + df.index = np.array(df.index, dtype=r_dtype) + if c_dtype: + if c_dtype == "u": + c_dtype = "O" + recons.columns = np.array( + [_to_uni(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [_to_uni(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "dt": + c_dtype = "O" + recons.columns = np.array( + [Timestamp(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [Timestamp(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "p": + c_dtype = "O" + col_list = to_datetime(recons.columns) + recons.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + col_list = df.columns.to_timestamp() + df.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + else: + c_dtype = type_map.get(c_dtype) + recons.columns = np.array(recons.columns, dtype=c_dtype) + df.columns = np.array(df.columns, dtype=c_dtype) + + tm.assert_frame_equal( + df, recons, check_names=False, check_less_precise=True + ) + + N = 100 + chunksize = 1000 + + for ncols in [4]: + base = int((chunksize // ncols or 1) or 1) + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test( + tm.makeCustomDataframe( + nrows, ncols, r_idx_type="dt", c_idx_type="s" + ), + "dt", + "s", + ) + + for ncols in [4]: + base = int((chunksize // ncols or 1) or 1) + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test( + tm.makeCustomDataframe( + nrows, ncols, r_idx_type="dt", c_idx_type="s" + ), + "dt", + "s", + ) + pass + + for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]: + for ncols in [1, 2, 3, 4]: + base = 
int((chunksize // ncols or 1) or 1) + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test( + tm.makeCustomDataframe( + nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ), + r_idx_type, + c_idx_type, + ) + + for ncols in [1, 2, 3, 4]: + base = int((chunksize // ncols or 1) or 1) + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(tm.makeCustomDataframe(nrows, ncols)) + + for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: + df = tm.makeCustomDataframe(nrows, 3) + cols = list(df.columns) + cols[:2] = ["dupe", "dupe"] + cols[-2:] = ["dupe", "dupe"] + ix = list(df.index) + ix[:2] = ["rdupe", "rdupe"] + ix[-2:] = ["rdupe", "rdupe"] + df.index = ix + df.columns = cols + _do_test(df, dupe_col=True) + + _do_test(DataFrame(index=np.arange(10))) + _do_test( + tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2 + ) + for ncols in [2, 3, 4]: + base = int(chunksize // ncols) + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2) + _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2) + _do_test( + tm.makeCustomDataframe( + nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2 + ), + rnlvl=2, + cnlvl=2, + ) + + def test_to_csv_from_csv_w_some_infs(self, float_frame): + + # test roundtrip with inf, -inf, nan, as full columns and mix + float_frame["G"] = np.nan + f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5] + float_frame["H"] = float_frame.index.map(f) + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + # TODO to_csv drops column name + tm.assert_frame_equal(float_frame, recons, check_names=False) + tm.assert_frame_equal( + np.isinf(float_frame), np.isinf(recons), check_names=False + ) + + def test_to_csv_from_csv_w_all_infs(self, float_frame): + + # test roundtrip with inf, -inf, nan, as full columns and mix + float_frame["E"] = np.inf + float_frame["F"] = -np.inf + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + # TODO to_csv drops column name + tm.assert_frame_equal(float_frame, recons, check_names=False) + tm.assert_frame_equal( + np.isinf(float_frame), np.isinf(recons), check_names=False + ) + + def test_to_csv_no_index(self): + # GH 3624, after appending columns, to_csv fails + with tm.ensure_clean("__tmp_to_csv_no_index__") as path: + df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]}) + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + df["c3"] = Series([7, 8, 9], dtype="int64") + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + + def test_to_csv_with_mix_columns(self): + # gh-11637: incorrect output when a mix of integer and string column + # names passed as columns parameter in to_csv + + df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]}) + df["test"] = "txt" + assert df.to_csv() == df.to_csv(columns=[0, 1, "test"]) + + def test_to_csv_headers(self): + # GH6186, the presence or absence of `index` incorrectly + # causes to_csv to have different header semantics. 
+ from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"]) + with tm.ensure_clean("__tmp_to_csv_headers__") as path: + from_df.to_csv(path, header=["X", "Y"]) + recons = self.read_csv(path) + + tm.assert_frame_equal(to_df, recons) + + from_df.to_csv(path, index=False, header=["X", "Y"]) + recons = self.read_csv(path) + + recons.reset_index(inplace=True) + tm.assert_frame_equal(to_df, recons) + + def test_to_csv_multiindex(self, float_frame, datetime_frame): + + frame = float_frame + old_index = frame.index + arrays = np.arange(len(old_index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + + frame.to_csv(path, header=False) + frame.to_csv(path, columns=["A", "B"]) + + # round trip + frame.to_csv(path) + + df = self.read_csv(path, index_col=[0, 1], parse_dates=False) + + # TODO to_csv drops column name + tm.assert_frame_equal(frame, df, check_names=False) + assert frame.index.names == df.index.names + + # needed if setUp becomes a class method + float_frame.index = old_index + + # try multiindex with dates + tsframe = datetime_frame + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label=["time", "foo"]) + recons = self.read_csv(path, index_col=[0, 1]) + + # TODO to_csv drops column name + tm.assert_frame_equal(tsframe, recons, check_names=False) + + # do not load index + tsframe.to_csv(path) + recons = self.read_csv(path, index_col=None) + assert len(recons.columns) == len(tsframe.columns) + 2 + + # no index + tsframe.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None) + tm.assert_almost_equal(recons.values, datetime_frame.values) + + # needed if setUp becomes class method + datetime_frame.index = old_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # GH3571, GH1651, GH3141 + + def _make_frame(names=None): + if names is True: + names = ["first", "second"] + return DataFrame( + np.random.randint(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names + ), + dtype="int64", + ) + + # column & index are multi-index + df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) + tm.assert_frame_equal(df, result) + + # column is mi + df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) + tm.assert_frame_equal(df, result) + + # dup column names? 
+ df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) + tm.assert_frame_equal(df, result) + + # writing with no index + df = _make_frame() + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + tm.assert_frame_equal(df, result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + assert com.all_none(*result.columns.names) + result.columns.names = df.columns.names + tm.assert_frame_equal(df, result) + + # whatsnew example + df = _make_frame() + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + df = _make_frame(True) + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + # invalid options + df = _make_frame(True) + df.to_csv(path) + + for i in [6, 7]: + msg = "len of {i}, but only 5 lines in file".format(i=i) + with pytest.raises(ParserError, match=msg): + read_csv(path, header=list(range(i)), index_col=0) + + # write with cols + msg = "cannot specify cols with a MultiIndex" + with pytest.raises(TypeError, match=msg): + df.to_csv(path, columns=["foo", "bar"]) + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # empty + tsframe[:0].to_csv(path) + recons = self.read_csv(path) + + exp = tsframe[:0] + exp.index = [] + + tm.assert_index_equal(recons.columns, exp.columns) + assert len(recons) == 0 + + def test_to_csv_interval_index(self): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype(str) + + tm.assert_frame_equal(result, expected) + + def test_to_csv_float32_nanrep(self): + df = DataFrame(np.random.randn(1, 4).astype(np.float32)) + df[1] = np.nan + + with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: + df.to_csv(path, na_rep=999) + + with open(path) as f: + lines = f.readlines() + assert lines[1].split(",")[2] == "999" + + def test_to_csv_withcommas(self): + + # Commas inside fields should be correctly escaped when saving as CSV. 
+ df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) + + with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path: + df.to_csv(path) + df2 = self.read_csv(path) + tm.assert_frame_equal(df2, df) + + def test_to_csv_mixed(self): + def create_cols(name): + return ["{name}{i:03d}".format(name=name, i=i) for i in range(5)] + + df_float = DataFrame( + np.random.randn(100, 5), dtype="float64", columns=create_cols("float") + ) + df_int = DataFrame( + np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + ) + df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) + df_object = DataFrame( + "foo", index=df_float.index, columns=create_cols("object") + ) + df_dt = DataFrame( + Timestamp("20010101"), index=df_float.index, columns=create_cols("date") + ) + + # add in some nans + df_float.loc[30:50, 1:3] = np.nan + + # ## this is a bug in read_csv right now #### + # df_dt.loc[30:50,1:3] = np.nan + + df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) + + # dtype + dtypes = dict() + for n, dtype in [ + ("float", np.float64), + ("int", np.int64), + ("bool", np.bool), + ("object", np.object), + ]: + for c in create_cols(n): + dtypes[c] = dtype + + with tm.ensure_clean() as filename: + df.to_csv(filename) + rs = read_csv( + filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") + ) + tm.assert_frame_equal(rs, df) + + def test_to_csv_dups_cols(self): + + df = DataFrame( + np.random.randn(1000, 30), + columns=list(range(15)) + list(range(15)), + dtype="float64", + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename) # single dtype, fine + result = read_csv(filename, index_col=0) + result.columns = df.columns + tm.assert_frame_equal(result, df) + + df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") + df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") + df_bool = DataFrame(True, index=df_float.index, columns=range(3)) + df_object = DataFrame("foo", index=df_float.index, columns=range(3)) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) + df = pd.concat( + [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True + ) + + cols = [] + for i in range(5): + cols.extend([0, 1, 2]) + df.columns = cols + + with tm.ensure_clean() as filename: + df.to_csv(filename) + result = read_csv(filename, index_col=0) + + # date cols + for i in ["0.4", "1.4", "2.4"]: + result[i] = to_datetime(result[i]) + + result.columns = df.columns + tm.assert_frame_equal(result, df) + + # GH3457 + + N = 10 + df = tm.makeCustomDataframe(N, 3) + df.columns = ["a", "a", "b"] + + with tm.ensure_clean() as filename: + df.to_csv(filename) + + # read_csv will rename the dups columns + result = read_csv(filename, index_col=0) + result = result.rename(columns={"a.1": "a"}) + tm.assert_frame_equal(result, df) + + def test_to_csv_chunking(self): + + aa = DataFrame({"A": range(100000)}) + aa["B"] = aa.A + 1.0 + aa["C"] = aa.A + 2.0 + aa["D"] = aa.A + 3.0 + + for chunksize in [10000, 50000, 100000]: + with tm.ensure_clean() as filename: + aa.to_csv(filename, chunksize=chunksize) + rs = read_csv(filename, index_col=0) + tm.assert_frame_equal(rs, aa) + + @pytest.mark.slow + def test_to_csv_wide_frame_formatting(self): + # Issue #8621 + df = DataFrame(np.random.randn(1, 100010), columns=None, index=None) + with tm.ensure_clean() as filename: + df.to_csv(filename, header=False, index=False) + rs = read_csv(filename, header=None) + tm.assert_frame_equal(rs, df) + + def test_to_csv_bug(self): + f1 = 
StringIO("a,1.0\nb,2.0") + df = self.read_csv(f1, header=None) + newdf = DataFrame({"t": df[df.columns[0]]}) + + with tm.ensure_clean() as path: + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + # don't check_names as t != 1 + tm.assert_frame_equal(recons, newdf, check_names=False) + + def test_to_csv_unicode(self): + + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + with tm.ensure_clean() as path: + + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_unicode_index_col(self): + buf = StringIO("") + df = DataFrame( + [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], + columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], + index=["\u05d0", "\u05d1"], + ) + + df.to_csv(buf, encoding="UTF-8") + buf.seek(0) + + df2 = read_csv(buf, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_stringio(self, float_frame): + buf = StringIO() + float_frame.to_csv(buf) + buf.seek(0) + recons = read_csv(buf, index_col=0) + # TODO to_csv drops column name + tm.assert_frame_equal(recons, float_frame, check_names=False) + + def test_to_csv_float_format(self): + + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + + with tm.ensure_clean() as filename: + + df.to_csv(filename, float_format="%.2f") + + rs = read_csv(filename, index_col=0) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(rs, xp) + + def test_to_csv_unicodewriter_quoting(self): + df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8") + + result = buf.getvalue() + expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_quote_none(self): + # GH4328 + df = DataFrame({"A": ["hello", '{"hello"}']}) + for encoding in (None, "utf-8"): + buf = StringIO() + df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) + + result = buf.getvalue() + expected_rows = ["A", "hello", '{"hello"}'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_index_no_leading_comma(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + buf = StringIO() + df.to_csv(buf, index_label=False) + + expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert buf.getvalue() == expected + + def test_to_csv_line_terminators(self): + # see gh-20353 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + with tm.ensure_clean() as path: + # case 1: CRLF as line terminator + df.to_csv(path, line_terminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 2: LF as line terminator + df.to_csv(path, line_terminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 3: The default line 
terminator(=os.linesep)(gh-21406) + df.to_csv(path) + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: + assert f.read() == expected + + def test_to_csv_from_csv_categorical(self): + + # CSV with categoricals should result in the same output + # as when one would add a "normal" Series/DataFrame. + s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + res = StringIO() + + s.to_csv(res, header=False) + exp = StringIO() + + s2.to_csv(exp, header=False) + assert res.getvalue() == exp.getvalue() + + df = DataFrame({"s": s}) + df2 = DataFrame({"s": s2}) + + res = StringIO() + df.to_csv(res) + + exp = StringIO() + df2.to_csv(exp) + + assert res.getvalue() == exp.getvalue() + + def test_to_csv_path_is_none(self, float_frame): + # GH 8215 + # Make sure we return string for consistency with + # Series.to_csv() + csv_str = float_frame.to_csv(path_or_buf=None) + assert isinstance(csv_str, str) + recons = pd.read_csv(StringIO(csv_str), index_col=0) + tm.assert_frame_equal(float_frame, recons) + + @pytest.mark.parametrize( + "df,encoding", + [ + ( + DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ), + None, + ), + # GH 21241, 21118 + (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"), + (DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"), + ( + DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]), + "cp737", + ), + ], + ) + def test_to_csv_compression(self, df, encoding, compression): + + with tm.ensure_clean() as filename: + + df.to_csv(filename, compression=compression, encoding=encoding) + # test the round trip - to_csv -> read_csv + result = read_csv( + filename, compression=compression, index_col=0, encoding=encoding + ) + tm.assert_frame_equal(df, result) + + # test the round trip using file handle - to_csv -> read_csv + f, _handles = get_handle( + filename, "w", compression=compression, encoding=encoding + ) + with f: + df.to_csv(f, encoding=encoding) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) + tm.assert_frame_equal(df, result) + + # explicitly make sure file is compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + for col in df.columns: + assert col in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) + + def test_to_csv_date_format(self, datetime_frame): + with tm.ensure_clean("__tmp_to_csv_date_format__") as path: + dt_index = datetime_frame.index + datetime_frame = DataFrame( + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") + + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + + datetime_frame_int = datetime_frame.applymap( + lambda x: int(x.strftime("%Y%m%d")) + ) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + + tm.assert_frame_equal(test, datetime_frame_int) + + datetime_frame.to_csv(path, date_format="%Y-%m-%d") + + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = 
datetime_frame.applymap( + lambda x: x.strftime("%Y-%m-%d") + ) + datetime_frame_str.index = datetime_frame_str.index.map( + lambda x: x.strftime("%Y-%m-%d") + ) + + tm.assert_frame_equal(test, datetime_frame_str) + + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + + test = read_csv(path, index_col=0) + + datetime_frame_columns = datetime_frame_columns.applymap( + lambda x: int(x.strftime("%Y%m%d")) + ) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) + + tm.assert_frame_equal(test, datetime_frame_columns) + + # test NaTs + nat_index = to_datetime( + ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") + + test = read_csv(path, parse_dates=[0, 1], index_col=0) + + tm.assert_frame_equal(test, nat_frame) + + def test_to_csv_with_dst_transitions(self): + + with tm.ensure_clean("csv_date_format_with_dst") as path: + # make sure we are not failing on transitions + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10s")]: + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) + df.to_csv(path, index=True) + # we have to reconvert the index as we + # don't parse the tz's + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/London" + ) + tm.assert_frame_equal(result, df) + + # GH11619 + idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + df = DataFrame({"values": 1, "idx": idx}, index=idx) + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_csv(path, index=True) + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/Paris" + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) + tm.assert_frame_equal(result, df) + + # assert working + df.astype(str) + + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_pickle(path) + result = pd.read_pickle(path) + tm.assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame( + { + "c_bool": [True, False], + "c_float": [1.0, 3.2], + "c_int": [42, np.nan], + "c_string": ["a", "b,c"], + } + ) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv() + assert result == expected + + result = df.to_csv(quoting=None) + assert result == expected + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + assert result == expected + + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '"0","True","1.0","42.0","a"', + '"1","False","3.2","","b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_ALL) + assert result == expected + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + 
'0,True,1.0,42.0,"a"', + '1,False,3.2,"","b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + assert result == expected + + msg = "need to escape, but no escapechar set" + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE) + + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,b!,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!") + assert result == expected + + expected_rows = [ + ",c_bool,c_ffloat,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,bf,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f") + assert result == expected + + # see gh-3503: quoting Windows line terminators + # presents with encoding? + text_rows = ["a,b,c", '1,"test \r\n",3'] + text = tm.convert_rows_list_to_csv_str(text_rows) + df = pd.read_csv(StringIO(text)) + + buf = StringIO() + df.to_csv(buf, encoding="utf-8", index=False) + assert buf.getvalue() == text + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = df.set_index(["a", "b"]) + + expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(quoting=csv.QUOTE_ALL) == expected + + def test_period_index_date_overflow(self): + # see gh-15982 + + dates = ["1990-01-01", "2000-01-01", "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = pd.DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + date_format = "%m-%d-%Y" + result = df.to_csv(date_format=date_format) + + expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + # Overflow with pd.NaT + dates = ["1990-01-01", pd.NaT, "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = pd.DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_single_level_multi_index(self): + # see gh-26303 + index = pd.Index([(1,), (2,), (3,)]) + df = pd.DataFrame([[1, 2, 3]], columns=index) + df = df.reindex(columns=[(1,), (3,)]) + expected = ",1,3\n0,1,3\n" + result = df.to_csv(line_terminator="\n") + tm.assert_almost_equal(result, expected) + + def test_gz_lineend(self): + # GH 25311 + df = pd.DataFrame({"a": [1, 2]}) + expected_rows = ["a", "1", "2"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("__test_gz_lineend.csv.gz") as path: + 
df.to_csv(path, index=False) + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") + + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/frame/test_validate.py b/venv/Lib/site-packages/pandas/tests/frame/test_validate.py new file mode 100644 index 0000000..c727032 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/frame/test_validate.py @@ -0,0 +1,41 @@ +import pytest + +from pandas.core.frame import DataFrame + + +@pytest.fixture +def dataframe(): + return DataFrame({"a": [1, 2], "b": [3, 4]}) + + +class TestDataFrameValidate: + """Tests for error handling related to data types of method arguments.""" + + @pytest.mark.parametrize( + "func", + [ + "query", + "eval", + "set_index", + "reset_index", + "dropna", + "drop_duplicates", + "sort_values", + ], + ) + @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) + def test_validate_bool_args(self, dataframe, func, inplace): + msg = 'For argument "inplace" expected type bool' + kwargs = dict(inplace=inplace) + + if func == "query": + kwargs["expr"] = "a > b" + elif func == "eval": + kwargs["expr"] = "a + b" + elif func == "set_index": + kwargs["keys"] = ["a"] + elif func == "sort_values": + kwargs["by"] = ["a"] + + with pytest.raises(ValueError, match=msg): + getattr(dataframe, func)(**kwargs) diff --git a/venv/Lib/site-packages/pandas/tests/generic/__init__.py b/venv/Lib/site-packages/pandas/tests/generic/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/generic/test_frame.py b/venv/Lib/site-packages/pandas/tests/generic/test_frame.py new file mode 100644 index 0000000..7fe22e7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/generic/test_frame.py @@ -0,0 +1,282 @@ +from copy import deepcopy +from distutils.version import LooseVersion +from operator import methodcaller + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series, date_range +import pandas._testing as tm + +from .test_generic import Generic + +try: + import xarray + + _XARRAY_INSTALLED = True +except ImportError: + _XARRAY_INSTALLED = False + + +class TestDataFrame(Generic): + _typ = DataFrame + _comparator = lambda self, x, y: tm.assert_frame_equal(x, y) + + def test_rename_mi(self): + df = DataFrame( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) + df.rename(str.lower) + + def test_set_axis_name(self): + df = pd.DataFrame([[1, 2], [3, 4]]) + funcs = ["_set_axis_name", "rename_axis"] + for func in funcs: + result = methodcaller(func, "foo")(df) + assert df.index.name is None + assert result.index.name == "foo" + + result = methodcaller(func, "cols", axis=1)(df) + assert df.columns.name is None + assert result.columns.name == "cols" + + def test_set_axis_name_mi(self): + df = DataFrame( + np.empty((3, 3)), + index=MultiIndex.from_tuples([("A", x) for x in list("aBc")]), + columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]), + ) + + level_names = ["L1", "L2"] + funcs = ["_set_axis_name", "rename_axis"] + for func in funcs: + result = methodcaller(func, level_names)(df) + assert result.index.names == level_names + assert result.columns.names == [None, None] + + result = methodcaller(func, level_names, axis=1)(df) + assert result.columns.names == ["L1", "L2"] + assert result.index.names == [None, None] + + def test_nonzero_single_element(self): + + # allow single item via bool method + df = 
DataFrame([[True]]) + assert df.bool() + + df = DataFrame([[False]]) + assert not df.bool() + + df = DataFrame([[False, False]]) + with pytest.raises(ValueError): + df.bool() + with pytest.raises(ValueError): + bool(df) + + def test_get_numeric_data_preserve_dtype(self): + + # get the numeric data + o = DataFrame({"A": [1, "2", 3.0]}) + result = o._get_numeric_data() + expected = DataFrame(index=[0, 1, 2], dtype=object) + self._compare(result, expected) + + def test_metadata_propagation_indiv(self): + + # groupby + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + result = df.groupby("A").sum() + self.check_metadata(df, result) + + # resample + df = DataFrame( + np.random.randn(1000, 2), + index=date_range("20130101", periods=1000, freq="s"), + ) + result = df.resample("1T") + self.check_metadata(df, result) + + # merging with override + # GH 6923 + _metadata = DataFrame._metadata + _finalize = DataFrame.__finalize__ + + np.random.seed(10) + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["a", "b"]) + df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["c", "d"]) + DataFrame._metadata = ["filename"] + df1.filename = "fname1.csv" + df2.filename = "fname2.csv" + + def finalize(self, other, method=None, **kwargs): + + for name in self._metadata: + if method == "merge": + left, right = other.left, other.right + value = getattr(left, name, "") + "|" + getattr(right, name, "") + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, "")) + + return self + + DataFrame.__finalize__ = finalize + result = df1.merge(df2, left_on=["a"], right_on=["c"], how="inner") + assert result.filename == "fname1.csv|fname2.csv" + + # concat + # GH 6927 + DataFrame._metadata = ["filename"] + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list("ab")) + df1.filename = "foo" + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == "concat": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, None)) + + return self + + DataFrame.__finalize__ = finalize + + result = pd.concat([df1, df1]) + assert result.filename == "foo+foo" + + # reset + DataFrame._metadata = _metadata + DataFrame.__finalize__ = _finalize + + def test_set_attribute(self): + # Test for consistent setattr behavior when an attribute and a column + # have the same name (Issue #8994) + df = DataFrame({"x": [1, 2, 3]}) + + df.y = 2 + df["y"] = [2, 4, 6] + df.y = 5 + + assert df.y == 5 + tm.assert_series_equal(df["y"], Series([2, 4, 6], name="y")) + + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) + @pytest.mark.parametrize( + "index", + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "CategoricalIndex", + "TimedeltaIndex", + ], + ) + def test_to_xarray_index_types(self, index): + from xarray import Dataset + + index = getattr(tm, f"make{index}") + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": 
pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index = index(3) + df.index.name = "foo" + df.columns.name = "bar" + result = df.to_xarray() + assert result.dims["foo"] == 3 + assert len(result.coords) == 1 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, Dataset) + + # idempotency + # categoricals are not preserved + # datetimes w/tz are preserved + # column names are lost + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal( + result.to_dataframe(), + expected, + check_index_type=False, + check_categorical=False, + ) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" + result = df[0:0].to_xarray() + assert result.dims["foo"] == 0 + assert isinstance(result, Dataset) + + # available in 0.7.1 + # MultiIndex + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) + result = df.to_xarray() + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 + assert len(result.coords) == 2 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, Dataset) + + result = result.to_dataframe() + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_deepcopy_empty(self): + # This test covers empty frame copying with non-empty column sets + # as reported in issue GH15370 + empty_frame = DataFrame(data=[], index=[], columns=["A"]) + empty_frame_copy = deepcopy(empty_frame) + + self._compare(empty_frame_copy, empty_frame) diff --git a/venv/Lib/site-packages/pandas/tests/generic/test_generic.py b/venv/Lib/site-packages/pandas/tests/generic/test_generic.py new file mode 100644 index 0000000..a684171 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/generic/test_generic.py @@ -0,0 +1,976 @@ +from copy import copy, deepcopy + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series, date_range +import pandas._testing as tm + +# ---------------------------------------------------------------------- +# Generic types test cases + + +class Generic: + @property + def _ndim(self): + return self._typ._AXIS_LEN + + def _axes(self): + """ return the axes for my object typ """ + return self._typ._AXIS_ORDERS + + def _construct(self, shape, value=None, dtype=None, **kwargs): + """ construct an object for the given shape + if value is specified use that if its a scalar + if value is an array, repeat it as needed """ + + if isinstance(shape, int): + shape = tuple([shape] * self._ndim) + if value is not None: + if is_scalar(value): + if value == "empty": + arr = None + dtype = np.float64 + + # remove the info axis + kwargs.pop(self._typ._info_axis_name, None) + else: + arr = np.empty(shape, dtype=dtype) + arr.fill(value) + else: + fshape = np.prod(shape) + arr = value.ravel() + 
new_shape = fshape / arr.shape[0] + if fshape % arr.shape[0] != 0: + raise Exception("invalid value passed in _construct") + + arr = np.repeat(arr, new_shape).reshape(shape) + else: + arr = np.random.randn(*shape) + return self._typ(arr, dtype=dtype, **kwargs) + + def _compare(self, result, expected): + self._comparator(result, expected) + + def test_rename(self): + + # single axis + idx = list("ABCD") + # relabeling values passed into self.rename + args = [ + str.lower, + {x: x.lower() for x in idx}, + Series({x: x.lower() for x in idx}), + ] + + for axis in self._axes(): + kwargs = {axis: idx} + obj = self._construct(4, **kwargs) + + for arg in args: + # rename a single axis + result = obj.rename(**{axis: arg}) + expected = obj.copy() + setattr(expected, axis, list("abcd")) + self._compare(result, expected) + + # multiple axes at once + + def test_get_numeric_data(self): + + n = 4 + kwargs = {self._typ._AXIS_NAMES[i]: list(range(n)) for i in range(self._ndim)} + + # get the numeric data + o = self._construct(n, **kwargs) + result = o._get_numeric_data() + self._compare(result, o) + + # non-inclusion + result = o._get_bool_data() + expected = self._construct(n, value="empty", **kwargs) + self._compare(result, expected) + + # get the bool data + arr = np.array([True, True, False, True]) + o = self._construct(n, value=arr, **kwargs) + result = o._get_numeric_data() + self._compare(result, o) + + # _get_numeric_data is includes _get_bool_data, so can't test for + # non-inclusion + + def test_get_default(self): + + # GH 7725 + d0 = "a", "b", "c", "d" + d1 = np.arange(4, dtype="int64") + others = "e", 10 + + for data, index in ((d0, d1), (d1, d0)): + s = Series(data, index=index) + for i, d in zip(index, data): + assert s.get(i) == d + assert s.get(i, d) == d + assert s.get(i, "z") == d + for other in others: + assert s.get(other, "z") == "z" + assert s.get(other, other) == other + + def test_nonzero(self): + + # GH 4633 + # look at the boolean/nonzero behavior for objects + obj = self._construct(shape=4) + msg = f"The truth value of a {self._typ.__name__} is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + obj = self._construct(shape=4, value=1) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + obj = self._construct(shape=4, value=np.nan) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) + + # empty + obj = self._construct(shape=0) + with pytest.raises(ValueError, match=msg): + bool(obj) + + # invalid behaviors + + obj1 = self._construct(shape=4, value=1) + obj2 = self._construct(shape=4, value=1) + + with pytest.raises(ValueError, match=msg): + if obj1: + pass + + with pytest.raises(ValueError, match=msg): + obj1 and obj2 + with pytest.raises(ValueError, match=msg): + obj1 or obj2 + with pytest.raises(ValueError, match=msg): + not obj1 + + def test_downcast(self): + # test close downcasting + + o = self._construct(shape=4, value=9, dtype=np.int64) + result = o.copy() + result._data = o._data.downcast(dtypes="infer") + self._compare(result, o) + + o = self._construct(shape=4, value=9.0) + expected = o.astype(np.int64) + result = o.copy() + result._data = o._data.downcast(dtypes="infer") 
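The `test_get_default` case above (GH 7725) pins down the fallback behaviour of `Series.get`; a minimal standalone sketch of what it asserts, with arbitrary sample values:

```python
import pandas as pd

s = pd.Series([10, 20, 30], index=["a", "b", "c"])

s.get("a")       # 10   -> an existing label returns the stored value
s.get("z")       # None -> a missing label with no default returns None
s.get("z", -1)   # -1   -> an explicit default is returned instead
```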
+ self._compare(result, expected) + + o = self._construct(shape=4, value=9.5) + result = o.copy() + result._data = o._data.downcast(dtypes="infer") + self._compare(result, o) + + # are close + o = self._construct(shape=4, value=9.000000000005) + result = o.copy() + result._data = o._data.downcast(dtypes="infer") + expected = o.astype(np.int64) + self._compare(result, expected) + + def test_constructor_compound_dtypes(self): + # see gh-5191 + # Compound dtypes should raise NotImplementedError. + + def f(dtype): + return self._construct(shape=3, value=1, dtype=dtype) + + msg = "compound dtypes are not implemented" + f"in the {self._typ.__name__} constructor" + + with pytest.raises(NotImplementedError, match=msg): + f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) + + # these work (though results may be unexpected) + f("int64") + f("float64") + f("M8[ns]") + + def check_metadata(self, x, y=None): + for m in x._metadata: + v = getattr(x, m, None) + if y is None: + assert v is None + else: + assert v == getattr(y, m, None) + + def test_metadata_propagation(self): + # check that the metadata matches up on the resulting ops + + o = self._construct(shape=3) + o.name = "foo" + o2 = self._construct(shape=3) + o2.name = "bar" + + # ---------- + # preserving + # ---------- + + # simple ops with scalars + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: + result = getattr(o, op)(1) + self.check_metadata(o, result) + + # ops with like + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: + result = getattr(o, op)(o) + self.check_metadata(o, result) + + # simple boolean + for op in ["__eq__", "__le__", "__ge__"]: + v1 = getattr(o, op)(o) + self.check_metadata(o, v1) + self.check_metadata(o, v1 & v1) + self.check_metadata(o, v1 | v1) + + # combine_first + result = o.combine_first(o2) + self.check_metadata(o, result) + + # --------------------------- + # non-preserving (by default) + # --------------------------- + + # add non-like + result = o + o2 + self.check_metadata(result) + + # simple boolean + for op in ["__eq__", "__le__", "__ge__"]: + + # this is a name matching op + v1 = getattr(o, op)(o) + v2 = getattr(o, op)(o2) + self.check_metadata(v2) + self.check_metadata(v1 & v2) + self.check_metadata(v1 | v2) + + def test_head_tail(self): + # GH5370 + + o = self._construct(shape=10) + + # check all index types + for index in [ + tm.makeFloatIndex, + tm.makeIntIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: + axis = o._get_axis_name(0) + setattr(o, axis, index(len(getattr(o, axis)))) + + o.head() + + self._compare(o.head(), o.iloc[:5]) + self._compare(o.tail(), o.iloc[-5:]) + + # 0-len + self._compare(o.head(0), o.iloc[0:0]) + self._compare(o.tail(0), o.iloc[0:0]) + + # bounded + self._compare(o.head(len(o) + 1), o) + self._compare(o.tail(len(o) + 1), o) + + # neg index + self._compare(o.head(-3), o.head(7)) + self._compare(o.tail(-3), o.tail(7)) + + def test_sample(self): + # Fixes issue: 2419 + + o = self._construct(shape=10) + + ### + # Check behavior of random_state argument + ### + + # Check for stability when receives seed or random state -- run 10 + # times. 
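The sampling block that starts here leans on `random_state` making `sample` reproducible; a small self-contained illustration of that contract (the seed and frame contents are arbitrary):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": range(10)})

# The same integer seed always selects the same rows.
pd.testing.assert_frame_equal(
    df.sample(n=4, random_state=42),
    df.sample(n=4, random_state=42),
)

# A NumPy RandomState object is accepted as well.
pd.testing.assert_frame_equal(
    df.sample(frac=0.3, random_state=np.random.RandomState(0)),
    df.sample(frac=0.3, random_state=np.random.RandomState(0)),
)
```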
+ for test in range(10): + seed = np.random.randint(0, 100) + self._compare( + o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) + ) + + self._compare( + o.sample(frac=0.7, random_state=seed), + o.sample(frac=0.7, random_state=seed), + ) + + self._compare( + o.sample(n=4, random_state=np.random.RandomState(test)), + o.sample(n=4, random_state=np.random.RandomState(test)), + ) + + self._compare( + o.sample(frac=0.7, random_state=np.random.RandomState(test)), + o.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) + + self._compare( + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + ) + + os1, os2 = [], [] + for _ in range(2): + np.random.seed(test) + os1.append(o.sample(n=4)) + os2.append(o.sample(frac=0.7)) + self._compare(*os1) + self._compare(*os2) + + # Check for error when random_state argument invalid. + with pytest.raises(ValueError): + o.sample(random_state="astring!") + + ### + # Check behavior of `frac` and `N` + ### + + # Giving both frac and N throws error + with pytest.raises(ValueError): + o.sample(n=3, frac=0.3) + + # Check that raises right error for negative lengths + with pytest.raises(ValueError): + o.sample(n=-3) + with pytest.raises(ValueError): + o.sample(frac=-0.3) + + # Make sure float values of `n` give error + with pytest.raises(ValueError): + o.sample(n=3.2) + + # Check lengths are right + assert len(o.sample(n=4) == 4) + assert len(o.sample(frac=0.34) == 3) + assert len(o.sample(frac=0.36) == 4) + + ### + # Check weights + ### + + # Weight length must be right + with pytest.raises(ValueError): + o.sample(n=3, weights=[0, 1]) + + with pytest.raises(ValueError): + bad_weights = [0.5] * 11 + o.sample(n=3, weights=bad_weights) + + with pytest.raises(ValueError): + bad_weight_series = Series([0, 0, 0.2]) + o.sample(n=4, weights=bad_weight_series) + + # Check won't accept negative weights + with pytest.raises(ValueError): + bad_weights = [-0.1] * 10 + o.sample(n=3, weights=bad_weights) + + # Check inf and -inf throw errors: + with pytest.raises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + o.sample(n=3, weights=weights_with_inf) + + with pytest.raises(ValueError): + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + o.sample(n=3, weights=weights_with_ninf) + + # All zeros raises errors + zero_weights = [0] * 10 + with pytest.raises(ValueError): + o.sample(n=3, weights=zero_weights) + + # All missing weights + nan_weights = [np.nan] * 10 + with pytest.raises(ValueError): + o.sample(n=3, weights=nan_weights) + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan] * 10 + weights_with_nan[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) + + # Check None are also replaced by zeros. + weights_with_None = [None] * 10 + weights_with_None[5] = 0.5 + self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + + def test_sample_upsampling_without_replacement(self): + # GH27451 + + df = pd.DataFrame({"A": list("abc")}) + msg = ( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." 
+ ) + with pytest.raises(ValueError, match=msg): + df.sample(frac=2, replace=False) + + def test_sample_is_copy(self): + # GH-27357, GH-30784: ensure the result of sample is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + df2 = df.sample(3) + + with tm.assert_produces_warning(None): + df2["d"] = 1 + + def test_size_compat(self): + # GH8846 + # size property should be defined + + o = self._construct(shape=10) + assert o.size == np.prod(o.shape) + assert o.size == 10 ** len(o.axes) + + def test_split_compat(self): + # xref GH8846 + o = self._construct(shape=10) + assert len(np.array_split(o, 5)) == 5 + assert len(np.array_split(o, 2)) == 2 + + def test_unexpected_keyword(self): # GH8597 + df = DataFrame(np.random.randn(5, 2), columns=["jim", "joe"]) + ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) + ts = df["joe"].copy() + ts[2] = np.nan + + with pytest.raises(TypeError, match="unexpected keyword"): + df.drop("joe", axis=1, in_place=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + df.reindex([1, 0], inplace=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + ca.fillna(0, inplace=True) + + with pytest.raises(TypeError, match="unexpected keyword"): + ts.fillna(0, in_place=True) + + # See gh-12301 + def test_stat_unexpected_keyword(self): + obj = self._construct(5) + starwars = "Star Wars" + errmsg = "unexpected keyword" + + with pytest.raises(TypeError, match=errmsg): + obj.max(epic=starwars) # stat_function + with pytest.raises(TypeError, match=errmsg): + obj.var(epic=starwars) # stat_function_ddof + with pytest.raises(TypeError, match=errmsg): + obj.sum(epic=starwars) # cum_function + with pytest.raises(TypeError, match=errmsg): + obj.any(epic=starwars) # logical_function + + def test_api_compat(self): + + # GH 12021 + # compat for __name__, __qualname__ + + obj = self._construct(5) + for func in ["sum", "cumsum", "any", "var"]: + f = getattr(obj, func) + assert f.__name__ == func + assert f.__qualname__.endswith(func) + + def test_stat_non_defaults_args(self): + obj = self._construct(5) + out = np.array([0]) + errmsg = "the 'out' parameter is not supported" + + with pytest.raises(ValueError, match=errmsg): + obj.max(out=out) # stat_function + with pytest.raises(ValueError, match=errmsg): + obj.var(out=out) # stat_function_ddof + with pytest.raises(ValueError, match=errmsg): + obj.sum(out=out) # cum_function + with pytest.raises(ValueError, match=errmsg): + obj.any(out=out) # logical_function + + def test_truncate_out_of_bounds(self): + # GH11382 + + # small + shape = [int(2e3)] + ([1] * (self._ndim - 1)) + small = self._construct(shape, dtype="int8", value=1) + self._compare(small.truncate(), small) + self._compare(small.truncate(before=0, after=3e3), small) + self._compare(small.truncate(before=-1, after=2e3), small) + + # big + shape = [int(2e6)] + ([1] * (self._ndim - 1)) + big = self._construct(shape, dtype="int8", value=1) + self._compare(big.truncate(), big) + self._compare(big.truncate(before=0, after=3e6), big) + self._compare(big.truncate(before=-1, after=2e6), big) + + def test_validate_bool_args(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + invalid_values = [1, "True", [1, 2, 3], 5.0] + + for value in invalid_values: + with pytest.raises(ValueError): + super(DataFrame, df).rename_axis( + mapper={"a": "x", "b": "y"}, axis=1, inplace=value + ) + + with pytest.raises(ValueError): + super(DataFrame, df).drop("a", 
axis=1, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).sort_index(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df)._consolidate(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).fillna(value=0, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).interpolate(inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df)._where(cond=df.a > 2, inplace=value) + + with pytest.raises(ValueError): + super(DataFrame, df).mask(cond=df.a > 2, inplace=value) + + def test_copy_and_deepcopy(self): + # GH 15444 + for shape in [0, 1, 2]: + obj = self._construct(shape) + for func in [ + copy, + deepcopy, + lambda x: x.copy(deep=False), + lambda x: x.copy(deep=True), + ]: + obj_copy = func(obj) + assert obj_copy is not obj + self._compare(obj_copy, obj) + + @pytest.mark.parametrize( + "periods,fill_method,limit,exp", + [ + (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), + (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), + (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), + (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), + (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]), + (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]), + (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + ], + ) + def test_pct_change(self, periods, fill_method, limit, exp): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + obj = self._typ(vals) + func = getattr(obj, "pct_change") + res = func(periods=periods, fill_method=fill_method, limit=limit) + if type(obj) is DataFrame: + tm.assert_frame_equal(res, DataFrame(exp)) + else: + tm.assert_series_equal(res, Series(exp)) + + +class TestNDFrame: + # tests that don't fit elsewhere + + def test_sample(sel): + # Fixes issue: 2419 + # additional specific object based tests + + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + df = pd.DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) + sample1 = df.sample(n=1, weights="easyweights") + tm.assert_frame_equal(sample1, df.iloc[5:6]) + + # Ensure proper error if string given as weight for Series or + # DataFrame with axis = 1. + s = Series(range(10)) + with pytest.raises(ValueError): + s.sample(n=3, weights="weight_column") + + with pytest.raises(ValueError): + df.sample(n=1, weights="weight_column", axis=1) + + # Check weighting key error + with pytest.raises( + KeyError, match="'String passed to weights not a valid column'" + ): + df.sample(n=3, weights="not_a_real_column_name") + + # Check that re-normalizes weights that don't sum to one. 
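The weight handling exercised below (column-name weights, re-normalization of weights that do not sum to one) can be seen in isolation with a tiny frame; the column names here are illustrative only:

```python
import pandas as pd

df = pd.DataFrame({"col1": range(10, 20),
                   "w": [0] * 5 + [1] + [0] * 4})

# A column name can serve as the weight vector: rows with zero weight
# are never drawn, so this always picks the row at position 5.
picked = df.sample(n=1, weights="w")
assert picked.index[0] == 5

# Explicit per-row weights are re-normalized if they do not sum to one.
df.sample(n=2, weights=[0.2] * 10)
```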
+ weights_less_than_1 = [0] * 10 + weights_less_than_1[0] = 0.5 + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + + ### + # Test axis argument + ### + + # Test axis argument + df = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) + second_column_weight = [0, 1] + tm.assert_frame_equal( + df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] + ) + + # Different axis arg types + tm.assert_frame_equal( + df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] + ) + + weight = [0] * 10 + weight[5] = 0.5 + tm.assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) + tm.assert_frame_equal( + df.sample(n=1, axis="index", weights=weight), df.iloc[5:6] + ) + + # Check out of range axis values + with pytest.raises(ValueError): + df.sample(n=1, axis=2) + + with pytest.raises(ValueError): + df.sample(n=1, axis="not_a_name") + + with pytest.raises(ValueError): + s = pd.Series(range(10)) + s.sample(n=1, axis=1) + + # Test weight length compared to correct axis + with pytest.raises(ValueError): + df.sample(n=1, axis=1, weights=[0.5] * 10) + + # Check weights with axis = 1 + easy_weight_list = [0] * 3 + easy_weight_list[2] = 1 + + df = pd.DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) + sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) + tm.assert_frame_equal(sample1, df[["colString"]]) + + # Test default axes + tm.assert_frame_equal( + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) + ) + + # Test that function aligns weights with frame + df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) + s = Series([1, 0, 0], index=[3, 5, 9]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + s2 = Series([0.001, 0, 10000], index=[3, 5, 10]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2)) + + # Weights have empty values to be filed with zeros + s3 = Series([0.01, 0], index=[3, 5]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3)) + + # No overlap in weight and sampled DataFrame indices + s4 = Series([1, 0], index=[1, 2]) + with pytest.raises(ValueError): + df.sample(1, weights=s4) + + def test_squeeze(self): + # noop + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: + tm.assert_series_equal(s.squeeze(), s) + for df in [tm.makeTimeDataFrame()]: + tm.assert_frame_equal(df.squeeze(), df) + + # squeezing + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(df.squeeze(), df["A"]) + + # don't fail with 0 length dimensions GH11229 & GH8999 + empty_series = Series([], name="five", dtype=np.float64) + empty_frame = DataFrame([empty_series]) + tm.assert_series_equal(empty_series, empty_series.squeeze()) + tm.assert_series_equal(empty_series, empty_frame.squeeze()) + + # axis argument + df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] + assert df.shape == (1, 1) + tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) + tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) + assert df.squeeze() == df.iloc[0, 0] + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + df.squeeze(axis=2) + msg = "No axis named x for object type " + with pytest.raises(ValueError, match=msg): + df.squeeze(axis="x") + + 
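The `squeeze` assertions above reduce to a few simple rules; a minimal sketch using an arbitrary one-column frame:

```python
import pandas as pd

df = pd.DataFrame({"A": [1.5, 2.5, 3.5]})

single_col = df.squeeze()              # one column -> returns the Series "A"
scalar = df.iloc[:1, :].squeeze()      # 1x1 frame  -> returns the scalar 1.5
by_axis = df.squeeze(axis="columns")   # the axis can also be named explicitly

assert scalar == 1.5
```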
df = tm.makeTimeDataFrame(3) + tm.assert_frame_equal(df.squeeze(axis=0), df) + + def test_numpy_squeeze(self): + s = tm.makeFloatSeries() + tm.assert_series_equal(np.squeeze(s), s) + + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(np.squeeze(df), df["A"]) + + def test_transpose(self): + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: + # calls implementation in pandas/core/base.py + tm.assert_series_equal(s.transpose(), s) + for df in [tm.makeTimeDataFrame()]: + tm.assert_frame_equal(df.transpose().transpose(), df) + + def test_numpy_transpose(self): + msg = "the 'axes' parameter is not supported" + + s = tm.makeFloatSeries() + tm.assert_series_equal(np.transpose(s), s) + + with pytest.raises(ValueError, match=msg): + np.transpose(s, axes=1) + + df = tm.makeTimeDataFrame() + tm.assert_frame_equal(np.transpose(np.transpose(df)), df) + + with pytest.raises(ValueError, match=msg): + np.transpose(df, axes=1) + + def test_take(self): + indices = [1, 5, -2, 6, 3, -1] + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: + out = s.take(indices) + expected = Series( + data=s.values.take(indices), index=s.index.take(indices), dtype=s.dtype + ) + tm.assert_series_equal(out, expected) + for df in [tm.makeTimeDataFrame()]: + out = df.take(indices) + expected = DataFrame( + data=df.values.take(indices, axis=0), + index=df.index.take(indices), + columns=df.columns, + ) + tm.assert_frame_equal(out, expected) + + def test_take_invalid_kwargs(self): + indices = [-3, 2, 0, 1] + s = tm.makeFloatSeries() + df = tm.makeTimeDataFrame() + + for obj in (s, df): + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + obj.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + obj.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + obj.take(indices, mode="clip") + + @pytest.mark.parametrize("is_copy", [True, False]) + def test_depr_take_kwarg_is_copy(self, is_copy): + # GH 27357 + df = DataFrame({"A": [1, 2, 3]}) + msg = ( + "is_copy is deprecated and will be removed in a future version. " + "'take' always returns a copy, so there is no need to specify this." + ) + with tm.assert_produces_warning(FutureWarning) as w: + df.take([0, 1], is_copy=is_copy) + + assert w[0].message.args[0] == msg + + s = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + s.take([0, 1], is_copy=is_copy) + + def test_equals(self): + s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) + s2 = s1.copy() + assert s1.equals(s2) + + s1[1] = 99 + assert not s1.equals(s2) + + # NaNs compare as equal + s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) + s2 = s1.copy() + assert s1.equals(s2) + + s2[0] = 9.9 + assert not s1.equals(s2) + + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) + s1 = Series([1, 2, np.nan], index=idx) + s2 = s1.copy() + assert s1.equals(s2) + + # Add object dtype column with nans + index = np.random.random(10) + df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) + df1["text"] = "the sky is so blue. 
we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + df1["bool"] = np.arange(10) % 3 == 0 + df1.loc[::2] = np.nan + df2 = df1.copy() + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) + assert df1.equals(df2) + assert not df1.equals(object) + + # different dtype + different = df1.copy() + different["floats"] = different["floats"].astype("float32") + assert not df1.equals(different) + + # different index + different_index = -index + different = df2.set_index(different_index) + assert not df1.equals(different) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + assert not df1.equals(different) + + # DatetimeIndex + index = pd.date_range("2000-1-1", periods=10, freq="T") + df1 = df1.set_index(index) + df2 = df1.copy() + assert df1.equals(df2) + + # MultiIndex + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) + assert df3.equals(df2) + + df2 = df1.set_index(["floats"], append=True) + assert not df3.equals(df2) + + # NaN in index + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) + assert df3.equals(df2) + + # GH 8437 + a = pd.Series([False, np.nan]) + b = pd.Series([False, np.nan]) + c = pd.Series(index=range(2), dtype=object) + d = c.copy() + e = c.copy() + f = c.copy() + c[:-1] = d[:-1] = e[0] = f[0] = False + assert a.equals(a) + assert a.equals(b) + assert a.equals(c) + assert a.equals(d) + assert a.equals(e) + assert e.equals(f) + + def test_pipe(self): + df = DataFrame({"A": [1, 2, 3]}) + f = lambda x, y: x ** y + result = df.pipe(f, 2) + expected = DataFrame({"A": [1, 4, 9]}) + tm.assert_frame_equal(result, expected) + + result = df.A.pipe(f, 2) + tm.assert_series_equal(result, expected.A) + + def test_pipe_tuple(self): + df = DataFrame({"A": [1, 2, 3]}) + f = lambda x, y: y + result = df.pipe((f, "y"), 0) + tm.assert_frame_equal(result, df) + + result = df.A.pipe((f, "y"), 0) + tm.assert_series_equal(result, df.A) + + def test_pipe_tuple_error(self): + df = DataFrame({"A": [1, 2, 3]}) + f = lambda x, y: y + with pytest.raises(ValueError): + df.pipe((f, "y"), x=1, y=0) + + with pytest.raises(ValueError): + df.A.pipe((f, "y"), x=1, y=0) + + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_axis_classmethods(self, box): + obj = box(dtype=object) + values = ( + list(box._AXIS_NAMES.keys()) + + list(box._AXIS_NUMBERS.keys()) + + list(box._AXIS_ALIASES.keys()) + ) + for v in values: + assert obj._get_axis_number(v) == box._get_axis_number(v) + assert obj._get_axis_name(v) == box._get_axis_name(v) + assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) diff --git a/venv/Lib/site-packages/pandas/tests/generic/test_label_or_level_utils.py b/venv/Lib/site-packages/pandas/tests/generic/test_label_or_level_utils.py new file mode 100644 index 0000000..d3566f1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/generic/test_label_or_level_utils.py @@ -0,0 +1,339 @@ +import pytest + +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd + + +# Fixtures +# ======== +@pytest.fixture +def df(): + """DataFrame with columns 'L1', 'L2', and 'L3' """ + return pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13], "L3": ["A", 
"B", "C"]}) + + +@pytest.fixture(params=[[], ["L1"], ["L1", "L2"], ["L1", "L2", "L3"]]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture +def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ + df = df.set_index(["L1", "L2"]) + + df["L1"] = df["L3"] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = df.set_index(["L1"]) + df = pd.concat([df, df["L2"]], axis=1) + + return df + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names if name is not None] + return expected_labels, expected_levels + + +def assert_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def assert_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +def test_is_level_or_label_reference_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_levels = df_levels.T + + # Perform checks + assert_level_reference(df_levels, expected_levels, axis=axis) + assert_label_reference(df_levels, expected_labels, axis=axis) + + +def test_is_level_reference_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + assert_label_reference(df_ambig, ["L1"], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + assert_level_reference(df_ambig, ["L2"], axis=axis) + + # df has a column named L3 and it not an level reference + assert_label_reference(df_ambig, ["L3"], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_level_reference(s, ["L1"], axis=0) + assert not s._is_level_reference("L2") + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_level_reference(s, ["L1", "L2"], axis=0) + assert not s._is_level_reference("L3") + + +def test_is_level_reference_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._is_level_reference("L1", axis=1) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + +# DataFrame +# --------- +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_ambig = df_ambig.T + + if axis in {0, "index"}: + msg = "'L1' is both an index level and a column label" + else: + msg = "'L1' is both a column level and an index label" + + # df_ambig has both an on-axis level and off-axis label named L1 + # 
Therefore, L1 is ambiguous. + with pytest.raises(ValueError, match=msg): + df_ambig._check_label_or_level_ambiguity("L1", axis=axis) + + # df_ambig has an on-axis level named L2,, and it is not ambiguous. + df_ambig._check_label_or_level_ambiguity("L2", axis=axis) + + # df_ambig has an off-axis label named L3, and it is not ambiguous + assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index("L1").L2 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + s._check_label_or_level_ambiguity("L1", axis=0) + s._check_label_or_level_ambiguity("L2", axis=0) + s._check_label_or_level_ambiguity("L3", axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._check_label_or_level_ambiguity("L1", axis=1) + + +# Test _get_label_or_level_values +# =============================== +def assert_label_values(frame, labels, axis): + for label in labels: + if axis in {0, "index"}: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def assert_level_values(frame, levels, axis): + for level in levels: + if axis in {0, "index"}: + expected = frame.index.get_level_values(level=level)._values + else: + expected = frame.columns.get_level_values(level=level)._values + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +def test_get_label_or_level_values_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_levels = df_levels.T + + # Perform checks + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) + + +def test_get_label_or_level_values_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_ambig = df_ambig.T + + # df has an on-axis level named L2, and it is not ambiguous. + assert_level_values(df_ambig, ["L2"], axis=axis) + + # df has an off-axis label named L3, and it is not ambiguous. 
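These `_is_label_reference` / `_get_label_or_level_values` helpers are private, but the resolution rules they pin down surface in public APIs such as `groupby`, which accepts either a column label or an index level name. A small sketch using a hand-built frame shaped like the fixtures above:

```python
import pandas as pd

df = pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13],
                   "L3": ["A", "B", "C"]}).set_index("L1")

df.groupby("L1").sum()   # "L1" resolves to the index level
df.groupby("L3").sum()   # "L3" resolves to the column label
```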
+ assert_label_values(df_ambig, ["L3"], axis=axis) + + +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_duplabels = df_duplabels.T + + # df has unambiguous level 'L1' + assert_level_values(df_duplabels, ["L1"], axis=axis) + + # df has unique label 'L3' + assert_label_values(df_duplabels, ["L3"], axis=axis) + + # df has duplicate labels 'L2' + if axis in {0, "index"}: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with pytest.raises(ValueError, match=expected_msg): + assert_label_values(df_duplabels, ["L2"], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_level_values(s, ["L1"], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_level_values(s, ["L1", "L2"], axis=0) + + +def test_get_label_or_level_values_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + + with pytest.raises(ValueError, match="No axis named 1"): + s._get_label_or_level_values("L1", axis=1) + + +# Test _drop_labels_or_levels +# =========================== +def assert_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis in {0, "index"}: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def assert_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis in {0, "index"}: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + +# DataFrame +# --------- +def test_drop_labels_or_levels_df(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis in {1, "columns"}: + df_levels = df_levels.T + + # Perform checks + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, expected_levels, axis=axis) + + with pytest.raises(ValueError, match="not valid labels or levels"): + df_levels._drop_labels_or_levels("L4", axis=axis) + + +# Series +# ------ +def test_drop_labels_or_levels_series(df): + + # Make series with L1 as index + s = df.set_index("L1").L2 + assert_levels_dropped(s, ["L1"], axis=0) + + with pytest.raises(ValueError, match="not valid labels or levels"): + s._drop_labels_or_levels("L4", axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(["L1", "L2"]).L3 + assert_levels_dropped(s, ["L1", "L2"], axis=0) + + with pytest.raises(ValueError, match="not valid labels or levels"): + s._drop_labels_or_levels("L4", axis=0) diff --git a/venv/Lib/site-packages/pandas/tests/generic/test_series.py b/venv/Lib/site-packages/pandas/tests/generic/test_series.py new file mode 100644 index 0000000..8ad8355 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/generic/test_series.py @@ -0,0 +1,263 @@ +from distutils.version import LooseVersion +from operator import methodcaller + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import MultiIndex, Series, date_range +import 
pandas._testing as tm + +from .test_generic import Generic + +try: + import xarray + + _XARRAY_INSTALLED = True +except ImportError: + _XARRAY_INSTALLED = False + + +class TestSeries(Generic): + _typ = Series + _comparator = lambda self, x, y: tm.assert_series_equal(x, y) + + def setup_method(self): + self.ts = tm.makeTimeSeries() # Was at top level in test_series + self.ts.name = "ts" + + self.series = tm.makeStringSeries() + self.series.name = "series" + + def test_rename_mi(self): + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) + s.rename(str.lower) + + def test_set_axis_name(self): + s = Series([1, 2, 3], index=["a", "b", "c"]) + funcs = ["rename_axis", "_set_axis_name"] + name = "foo" + for func in funcs: + result = methodcaller(func, name)(s) + assert s.index.name is None + assert result.index.name == name + + def test_set_axis_name_mi(self): + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples( + [("A", x) for x in ["a", "B", "c"]], names=["l1", "l2"] + ), + ) + funcs = ["rename_axis", "_set_axis_name"] + for func in funcs: + result = methodcaller(func, ["L1", "L2"])(s) + assert s.index.name is None + assert s.index.names == ["l1", "l2"] + assert result.index.name is None + assert result.index.names, ["L1", "L2"] + + def test_set_axis_name_raises(self): + s = pd.Series([1]) + with pytest.raises(ValueError): + s._set_axis_name(name="a", axis=1) + + def test_get_numeric_data_preserve_dtype(self): + + # get the numeric data + o = Series([1, 2, 3]) + result = o._get_numeric_data() + self._compare(result, o) + + o = Series([1, "2", 3.0]) + result = o._get_numeric_data() + expected = Series([], dtype=object, index=pd.Index([], dtype=object)) + self._compare(result, expected) + + o = Series([True, False, True]) + result = o._get_numeric_data() + self._compare(result, o) + + o = Series([True, False, True]) + result = o._get_bool_data() + self._compare(result, o) + + o = Series(date_range("20130101", periods=3)) + result = o._get_numeric_data() + expected = Series([], dtype="M8[ns]", index=pd.Index([], dtype=object)) + self._compare(result, expected) + + def test_nonzero_single_element(self): + + # allow single item via bool method + s = Series([True]) + assert s.bool() + + s = Series([False]) + assert not s.bool() + + msg = "The truth value of a Series is ambiguous" + # single item nan to raise + for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: + with pytest.raises(ValueError, match=msg): + bool(s) + + msg = "bool cannot act on a non-boolean single element Series" + for s in [Series([np.nan]), Series([pd.NaT])]: + with pytest.raises(ValueError, match=msg): + s.bool() + + # multiple bool are still an error + msg = "The truth value of a Series is ambiguous" + for s in [Series([True, True]), Series([False, False])]: + with pytest.raises(ValueError, match=msg): + bool(s) + with pytest.raises(ValueError, match=msg): + s.bool() + + # single non-bool are an error + for s in [Series([1]), Series([0]), Series(["a"]), Series([0.0])]: + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(s) + msg = "bool cannot act on a non-boolean single element Series" + with pytest.raises(ValueError, match=msg): + s.bool() + + def test_metadata_propagation_indiv(self): + # check that the metadata matches up on the resulting ops + + o = Series(range(3), range(3)) + o.name = "foo" + o2 = Series(range(3), range(3)) + o2.name = "bar" + + result = o.T + self.check_metadata(o, 
result) + + # resample + ts = Series( + np.random.rand(1000), + index=date_range("20130101", periods=1000, freq="s"), + name="foo", + ) + result = ts.resample("1T").mean() + self.check_metadata(ts, result) + + result = ts.resample("1T").min() + self.check_metadata(ts, result) + + result = ts.resample("1T").apply(lambda x: x.sum()) + self.check_metadata(ts, result) + + _metadata = Series._metadata + _finalize = Series.__finalize__ + Series._metadata = ["name", "filename"] + o.filename = "foo" + o2.filename = "bar" + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == "concat" and name == "filename": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, None)) + + return self + + Series.__finalize__ = finalize + + result = pd.concat([o, o2]) + assert result.filename == "foo+bar" + assert result.name is None + + # reset + Series._metadata = _metadata + Series.__finalize__ = _finalize + + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) + @pytest.mark.parametrize( + "index", + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "TimedeltaIndex", + "CategoricalIndex", + ], + ) + def test_to_xarray_index_types(self, index): + from xarray import DataArray + + index = getattr(tm, f"make{index}") + s = Series(range(6), index=index(6)) + s.index.name = "foo" + result = s.to_xarray() + repr(result) + assert len(result) == 6 + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + # idempotency + tm.assert_series_equal( + result.to_series(), s, check_index_type=False, check_categorical=True + ) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import DataArray + + s = Series([], dtype=object) + s.index.name = "foo" + result = s.to_xarray() + assert len(result) == 0 + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + s = Series(range(6)) + s.index.name = "foo" + s.index = pd.MultiIndex.from_product( + [["a", "b"], range(3)], names=["one", "two"] + ) + result = s.to_xarray() + assert len(result) == 2 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, DataArray) + tm.assert_series_equal(result.to_series(), s) + + @pytest.mark.parametrize( + "s", + [ + Series([np.arange(5)]), + pd.date_range("1/1/2011", periods=24, freq="H"), + pd.Series(range(5), index=pd.date_range("2017", periods=5)), + ], + ) + @pytest.mark.parametrize("shift_size", [0, 1, 2]) + def test_shift_always_copy(self, s, shift_size): + # GH22397 + assert s.shift(shift_size) is not s + + @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) + def test_datetime_shift_always_copy(self, move_by_freq): + # GH22397 + s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) + assert s.shift(freq=move_by_freq) is not s diff --git a/venv/Lib/site-packages/pandas/tests/groupby/__init__.py b/venv/Lib/site-packages/pandas/tests/groupby/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/groupby/aggregate/__init__.py 
b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py new file mode 100644 index 0000000..723aec1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_aggregate.py @@ -0,0 +1,885 @@ +""" +test .agg behavior / note that .apply is tested generally in test_groupby.py +""" +import functools + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas +from pandas.core.groupby.grouper import Grouping + + +def test_agg_regression1(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_must_agg(df): + grouped = df.groupby("A")["C"] + + msg = "Must produce aggregated value" + with pytest.raises(Exception, match=msg): + grouped.agg(lambda x: x.describe()) + with pytest.raises(Exception, match=msg): + grouped.agg(lambda x: x.index[:2]) + + +def test_agg_ser_multi_key(df): + # TODO(wesm): unused + ser = df.C # noqa + + f = lambda x: x.sum() + results = df.C.groupby([df.A, df.B]).aggregate(f) + expected = df.groupby(["A", "B"]).sum()["C"] + tm.assert_series_equal(results, expected) + + +def test_groupby_aggregation_mixed_dtype(): + + # GH 6212 + expected = DataFrame( + { + "v1": [5, 5, 7, np.nan, 3, 3, 4, 1], + "v2": [55, 55, 77, np.nan, 33, 33, 44, 11], + }, + index=MultiIndex.from_tuples( + [ + (1, 95), + (1, 99), + (2, 95), + (2, 99), + ("big", "damp"), + ("blue", "dry"), + ("red", "red"), + ("red", "wet"), + ], + names=["by1", "by2"], + ), + ) + + df = DataFrame( + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) + + g = df.groupby(["by1", "by2"]) + result = g[["v1", "v2"]].mean() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_level_column(): + # GH 29772 + lst = [ + [True, True, True, False], + [True, False, np.nan, False], + [True, True, np.nan, False], + [True, True, np.nan, False], + ] + df = pd.DataFrame( + data=lst, + columns=pd.MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), + ) + + result = df.groupby(level=1, axis=1).sum() + expected = pd.DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + + tm.assert_frame_equal(result, expected) + + +def test_agg_apply_corner(ts, tsframe): + # nothing to group, all NA + grouped = ts.groupby(ts * np.nan) + assert ts.dtype == np.float64 + + # groupby float64 values results in Float64Index + exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) + tm.assert_series_equal(grouped.sum(), exp) + tm.assert_series_equal(grouped.agg(np.sum), exp) + tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + + # DataFrame + grouped = tsframe.groupby(tsframe["A"] * np.nan) + exp_df = DataFrame( + columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64) + ) + tm.assert_frame_equal(grouped.sum(), exp_df, 
check_names=False) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) + + +def test_agg_grouping_is_list_tuple(ts): + df = tm.makeTimeDataFrame() + + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_agg_python_multiindex(mframe): + grouped = mframe.groupby(["A", "B"]) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]] +) +def test_aggregate_str_func(tsframe, groupbyfunc): + grouped = tsframe.groupby(groupbyfunc) + + # single series + result = grouped["A"].agg("std") + expected = grouped["A"].std() + tm.assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate("var") + expected = grouped.var() + tm.assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"}) + expected = DataFrame( + { + "A": grouped["A"].var(), + "B": grouped["B"].std(), + "C": grouped["C"].mean(), + "D": grouped["D"].sem(), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_aggregate_item_by_item(df): + grouped = df.groupby("A") + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (df.A == "foo").sum() + bar = (df.A == "bar").sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + exp = pd.Series( + np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo" + ) + tm.assert_series_equal(result.xs("foo"), exp) + + exp = pd.Series( + np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar" + ) + tm.assert_almost_equal(result.xs("bar"), exp) + + def aggfun(ser): + return ser.size + + result = DataFrame().groupby(df.A).agg(aggfun) + assert isinstance(result, DataFrame) + assert len(result) == 0 + + +def test_wrap_agg_out(three_group): + grouped = three_group.groupby(["A", "B"]) + + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() + + result = grouped.aggregate(func) + exp_grouped = three_group.loc[:, three_group.columns != "C"] + expected = exp_grouped.groupby(["A", "B"]).aggregate(func) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_maintain_order(df): + # GH #610 + funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)] + result = df.groupby("A")["C"].agg(funcs) + exp_cols = Index(["mean", "max", "min"]) + + tm.assert_index_equal(result.columns, exp_cols) + + +def test_multiple_functions_tuples_and_non_tuples(df): + # #1359 + funcs = [("foo", "mean"), "std"] + ex_funcs = [("foo", "mean"), ("std", "std")] + + result = df.groupby("A")["C"].agg(funcs) + expected = df.groupby("A")["C"].agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").agg(funcs) + expected = df.groupby("A").agg(ex_funcs) + tm.assert_frame_equal(result, expected) + + +def test_more_flexible_frame_multi_function(df): + grouped = df.groupby("A") + + exmean = 
grouped.agg({"C": np.mean, "D": np.mean}) + exstd = grouped.agg({"C": np.std, "D": np.std}) + + expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) + + d = {"C": [np.mean, np.std], "D": [np.mean, np.std]} + result = grouped.aggregate(d) + + tm.assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) + expected = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) + tm.assert_frame_equal(result, expected) + + def foo(x): + return np.mean(x) + + def bar(x): + return np.std(x, ddof=1) + + # this uses column selection & renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + d = dict([["C", np.mean], ["D", dict([["foo", np.mean], ["bar", np.std]])]]) + grouped.aggregate(d) + + # But without renaming, these functions are OK + d = {"C": [np.mean], "D": [foo, bar]} + grouped.aggregate(d) + + +def test_multi_function_flexible_mix(df): + # GH #1268 + grouped = df.groupby("A") + + # Expected + d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}} + # this uses column selection & renaming + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + # Test 1 + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} + # this uses column selection & renaming + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + # Test 2 + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} + # this uses column selection & renaming + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) + + +def test_groupby_agg_coercing_bools(): + # issue 14873 + dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) + gp = dat.groupby("a") + + index = Index([1, 2], name="a") + + result = gp["b"].aggregate(lambda x: (x != 0).all()) + expected = Series([False, True], index=index, name="b") + tm.assert_series_equal(result, expected) + + result = gp["c"].aggregate(lambda x: x.isnull().all()) + expected = Series([True, False], index=index, name="c") + tm.assert_series_equal(result, expected) + + +def test_order_aggregate_multiple_funcs(): + # GH 25692 + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) + + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] + + expected = pd.Index(["sum", "max", "mean", "ohlc", "min"]) + + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) +def test_uint64_type_handling(dtype, how): + # GH 26310 + df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]}) + expected = df.groupby("y").agg({"x": how}) + df.x = df.x.astype(dtype) + result = df.groupby("y").agg({"x": how}) + result.x = result.x.astype(np.int64) + tm.assert_frame_equal(result, expected, check_exact=True) + + +def test_func_duplicates_raises(): + # GH28426 + msg = "Function names" + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg(["min", "min"]) + + +@pytest.mark.parametrize( + "index", + [ + pd.CategoricalIndex(list("abc")), + pd.interval_range(0, 3), + pd.period_range("2020", periods=3, freq="D"), + pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), + ], +) +def test_agg_index_has_complex_internals(index): + # GH 31223 + 
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) + result = df.groupby("group").agg({"value": Series.nunique}) + expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group") + tm.assert_frame_equal(result, expected) + + +class TestNamedAggregationSeries: + def test_series_named_agg(self): + df = pd.Series([1, 2, 3, 4]) + gr = df.groupby([0, 0, 1, 1]) + result = gr.agg(a="sum", b="min") + expected = pd.DataFrame( + {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1] + ) + tm.assert_frame_equal(result, expected) + + result = gr.agg(b="min", a="sum") + expected = expected[["b", "a"]] + tm.assert_frame_equal(result, expected) + + def test_no_args_raises(self): + gr = pd.Series([1, 2]).groupby([0, 1]) + with pytest.raises(TypeError, match="Must provide"): + gr.agg() + + # but we do allow this + result = gr.agg([]) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_series_named_agg_duplicates_no_raises(self): + # GH28426 + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + grouped = gr.agg(a="sum", b="sum") + expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]}) + tm.assert_frame_equal(expected, grouped) + + def test_mangled(self): + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + result = gr.agg(a=lambda x: 0, b=lambda x: 1) + expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]}) + tm.assert_frame_equal(result, expected) + + +class TestNamedAggregationDataFrame: + def test_agg_relabel(self): + df = pd.DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max")) + expected = pd.DataFrame( + {"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(["a", "b"], name="group"), + columns=["a_max", "b_max"], + ) + tm.assert_frame_equal(result, expected) + + # order invariance + p98 = functools.partial(np.percentile, q=98) + result = df.groupby("group").agg( + b_min=("B", "min"), + a_min=("A", min), + a_mean=("A", np.mean), + a_max=("A", "max"), + b_max=("B", "max"), + a_98=("A", p98), + ) + expected = pd.DataFrame( + { + "b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98], + }, + index=pd.Index(["a", "b"], name="group"), + columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_non_identifier(self): + df = pd.DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + + result = df.groupby("group").agg(**{"my col": ("A", "max")}) + expected = pd.DataFrame( + {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group") + ) + tm.assert_frame_equal(result, expected) + + def test_duplicate_no_raises(self): + # GH 28426, if use same input function on same column, + # no error should raise + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + + grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) + expected = pd.DataFrame( + {"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(grouped, expected) + + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + quant50.__name__ = "quant50" + quant70.__name__ = "quant70" + + test = pd.DataFrame( + {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} + ) + + grouped = test.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + expected = pd.DataFrame( + {"quantile_50": [1.5, 4.0], 
"quantile_70": [1.7, 4.4]}, + index=pd.Index(["a", "b"], name="col1"), + ) + tm.assert_frame_equal(grouped, expected) + + def test_agg_relabel_with_level(self): + df = pd.DataFrame( + {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]), + ) + result = df.groupby(level=0).agg( + aa=("A", "max"), bb=("A", "min"), cc=("B", "mean") + ) + expected = pd.DataFrame( + {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_other_raises(self): + df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = "Must provide" + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=("B", "max"), b=(1, 2, 3)) + + def test_missing_raises(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + with pytest.raises(KeyError, match="Column 'C' does not exist"): + df.groupby("A").agg(c=("C", "sum")) + + def test_agg_namedtuple(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.groupby("A").agg( + b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count") + ) + expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) + tm.assert_frame_equal(result, expected) + + def test_mangled(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) + result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1)) + expected = pd.DataFrame( + {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", + [ + ( + (("y", "A"), "max"), + (("y", "A"), np.min), + (("y", "B"), "mean"), + [1, 3], + [0, 2], + [5.5, 7.5], + ), + ( + (("y", "A"), lambda x: max(x)), + (("y", "A"), lambda x: 1), + (("y", "B"), "mean"), + [1, 3], + [1, 1], + [5.5, 7.5], + ), + ( + pd.NamedAgg(("y", "A"), "max"), + pd.NamedAgg(("y", "B"), np.mean), + pd.NamedAgg(("y", "A"), lambda x: 1), + [1, 3], + [5.5, 7.5], + [1, 1], + ), + ], +) +def test_agg_relabel_multiindex_column( + agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3 +): + # GH 29422, add tests for multiindex column cases + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + idx = pd.Index(["a", "b"], name=("x", "group")) + + result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")) + expected = DataFrame({"a_max": [1, 3]}, index=idx) + tm.assert_frame_equal(result, expected) + + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) + expected = DataFrame( + {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_relabel_multiindex_raises_not_exist(): + # GH 29422, add test for raises senario when aggregate column does not exist + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + with pytest.raises(KeyError, match="does not exist"): + df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) + + +def test_agg_relabel_multiindex_duplicates(): + # GH29422, add test for raises senario when getting duplicates + # 
GH28426, after this change, duplicates should also work if the relabelling is + # different + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + result = df.groupby(("x", "group")).agg( + a=(("y", "A"), "min"), b=(("y", "A"), "min") + ) + idx = pd.Index(["a", "b"], name=("x", "group")) + expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) + tm.assert_frame_equal(result, expected) + + +def myfunc(s): + return np.percentile(s, q=0.90) + + +@pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc]) +def test_lambda_named_agg(func): + # see gh-28467 + animals = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + result = animals.groupby("kind").agg( + mean_height=("height", "mean"), perc90=("height", func) + ) + expected = DataFrame( + [[9.3, 9.1036], [20.0, 6.252]], + columns=["mean_height", "perc90"], + index=Index(["cat", "dog"], name="kind"), + ) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_mixed_types(): + # GH 16916 + df = pd.DataFrame( + data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") + ) + df["grouping"] = ["group 1", "group 1", 2] + result = df.groupby("grouping").aggregate(lambda x: x.tolist()) + expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] + expected = pd.DataFrame( + expected_data, + index=Index([2, "group 1"], dtype="object", name="grouping"), + columns=Index(["X", "Y", "Z"], dtype="object"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_aggregate_udf_na_extension_type(): + # https://github.com/pandas-dev/pandas/pull/31359 + # This is currently failing to cast back to Int64Dtype. + # The presence of the NA causes two problems + # 1. NA is not an instance of Int64Dtype.type (numpy.int64) + # 2. The presence of an NA forces object type, so the non-NA values is + # a Python int rather than a NumPy int64. Python ints aren't + # instances of numpy.int64. + def aggfunc(x): + if all(x > 2): + return 1 + else: + return pd.NA + + df = pd.DataFrame({"A": pd.array([1, 2, 3])}) + result = df.groupby([1, 1, 2]).agg(aggfunc) + expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) + tm.assert_frame_equal(result, expected) + + +class TestLambdaMangling: + def test_maybe_mangle_lambdas_passthrough(self): + assert _maybe_mangle_lambdas("mean") == "mean" + assert _maybe_mangle_lambdas(lambda x: x).__name__ == "" + # don't mangel single lambda. 
+ assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>" + + def test_maybe_mangle_lambdas_listlike(self): + aggfuncs = [lambda x: 1, lambda x: 2] + result = _maybe_mangle_lambdas(aggfuncs) + assert result[0].__name__ == "<lambda_0>" + assert result[1].__name__ == "<lambda_1>" + assert aggfuncs[0](None) == result[0](None) + assert aggfuncs[1](None) == result[1](None) + + def test_maybe_mangle_lambdas(self): + func = {"A": [lambda x: 0, lambda x: 1]} + result = _maybe_mangle_lambdas(func) + assert result["A"][0].__name__ == "<lambda_0>" + assert result["A"][1].__name__ == "<lambda_1>" + + def test_maybe_mangle_lambdas_args(self): + func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]} + result = _maybe_mangle_lambdas(func) + assert result["A"][0].__name__ == "<lambda_0>" + assert result["A"][1].__name__ == "<lambda_1>" + + assert func["A"][0](0, 1) == (0, 1, 1) + assert func["A"][0](0, 1, 2) == (0, 1, 2) + assert func["A"][0](0, 2, b=3) == (0, 2, 3) + + def test_maybe_mangle_lambdas_named(self): + func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}} + result = _maybe_mangle_lambdas(func) + assert result == func + + def test_basic(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) + + expected = pd.DataFrame( + {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]}, + index=pd.Index([0, 1], name="A"), + ) + tm.assert_frame_equal(result, expected) + + def test_mangle_series_groupby(self): + gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) + result = gr.agg([lambda x: 0, lambda x: 1]) + expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.") + def test_with_kwargs(self): + f1 = lambda x, y, b=1: x.sum() + y + b + f2 = lambda x, y, b=2: x.sum() + y * b + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) + expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]}) + tm.assert_frame_equal(result, expected) + + result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) + expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]}) + tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + columns = ["height_sqr_min", "height_max", "weight_max"] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NamedAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + 
"weight_min", + ] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], + ) + def test_make_unique(self, order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique(order) + + assert result == expected_reorder diff --git a/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_cython.py b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_cython.py new file mode 100644 index 0000000..5ddda26 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_cython.py @@ -0,0 +1,238 @@ +""" +test cython .agg behavior +""" + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range +import pandas._testing as tm +from pandas.core.groupby.groupby import DataError + + +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + pytest.param( + "median", + # ignore mean of empty slice + # and all-NaN + marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")], + ), + "prod", + "min", + "max", + ], +) +def test_cythonized_aggers(op_name): + data = { + "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], + "B": ["A", "B"] * 6, + "C": np.random.randn(12), + } + df = DataFrame(data) + df.loc[2:10:2, "C"] = np.nan + + op = lambda x: getattr(x, op_name)() + + # single column + grouped = df.drop(["B"], axis=1).groupby("A") + exp = {cat: op(group["C"]) for cat, group in grouped} + exp = DataFrame({"C": exp}) + exp.index.name = "A" + result = op(grouped) + tm.assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(["A", "B"]) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group["C"]) + exp = DataFrame(expd).T.stack(dropna=False) + exp.index.names = ["A", "B"] + exp.name = "C" + + result = op(grouped)["C"] + if op_name in ["sum", "prod"]: + tm.assert_series_equal(result, exp) + + +def 
test_cython_agg_boolean(): + frame = DataFrame( + { + "a": np.random.randint(0, 5, 50), + "b": np.random.randint(0, 2, 50).astype("bool"), + } + ) + result = frame.groupby("a")["b"].mean() + expected = frame.groupby("a")["b"].agg(np.mean) + + tm.assert_series_equal(result, expected) + + +def test_cython_agg_nothing_to_agg(): + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) + msg = "No numeric types to aggregate" + + with pytest.raises(DataError, match=msg): + frame.groupby("a")["b"].mean() + + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) + with pytest.raises(DataError, match=msg): + frame[["b"]].groupby(frame["a"]).mean() + + +def test_cython_agg_nothing_to_agg_with_dates(): + frame = DataFrame( + { + "a": np.random.randint(0, 5, 50), + "b": ["foo", "bar"] * 25, + "dates": pd.date_range("now", periods=50, freq="T"), + } + ) + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): + frame.groupby("b").dates.mean() + + +def test_cython_agg_frame_columns(): + # #2113 + df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) + + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + + +def test_cython_agg_return_dict(): + # GH 16741 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + + ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) + expected = Series( + [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], + index=Index(["bar", "foo"], name="A"), + name="B", + ) + tm.assert_series_equal(ts, expected) + + +def test_cython_fail_agg(): + dr = bdate_range("1/1/2000", periods=50) + ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + tm.assert_series_equal(summed, expected) + + +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", np.median), + ("var", np.var), + ("add", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ], +) +def test__cython_agg_general(op, targop): + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), + ("var", lambda x: np.var(x, ddof=1)), + ("min", np.min), + ("max", np.max), + ], +) +def test_cython_agg_empty_buckets(op, targop, observed): + df = pd.DataFrame([11, 12, 13]) + grps = range(0, 55, 5) + + # calling _cython_agg_general directly, instead of via the user API + # which sets different values for min_count, so do that here. 
+ g = df.groupby(pd.cut(df[0], grps), observed=observed) + result = g._cython_agg_general(op) + + g = df.groupby(pd.cut(df[0], grps), observed=observed) + expected = g.agg(lambda x: targop(x)) + tm.assert_frame_equal(result, expected) + + +def test_cython_agg_empty_buckets_nanops(observed): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = pd.DataFrame([11, 12, 13], columns=["a"]) + grps = range(0, 25, 5) + # add / sum + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "add" + ) + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) + if observed: + expected = expected[expected.a != 0] + + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "prod" + ) + expected = pd.DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) + if observed: + expected = expected[expected.a != 1] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", ["first", "last", "max", "min"]) +@pytest.mark.parametrize( + "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] +) +def test_cython_with_timestamp_and_nat(op, data): + # https://github.com/pandas-dev/pandas/issues/19526 + df = DataFrame({"a": [0, 1], "b": [data, NaT]}) + index = Index([0, 1], name="a") + + # We will group by a and test the cython aggregations + expected = DataFrame({"b": [data, NaT]}, index=index) + + result = df.groupby("a").aggregate(op) + tm.assert_frame_equal(expected, result) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_other.py b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_other.py new file mode 100644 index 0000000..52ee3e6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/aggregate/test_other.py @@ -0,0 +1,644 @@ +""" +test all other .agg behavior +""" + +import datetime as dt +from functools import partial + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + PeriodIndex, + Series, + date_range, + period_range, +) +import pandas._testing as tm +from pandas.core.base import SpecificationError + +from pandas.io.formats.printing import pprint_thing + + +def test_agg_api(): + # GH 6337 + # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # different api for agg when passed custom function with mixed frame + + df = DataFrame( + { + "data1": np.random.randn(5), + "data2": np.random.randn(5), + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + } + ) + grouped = df.groupby("key1") + + def peak_to_peak(arr): + return arr.max() - arr.min() + + expected = grouped.agg([peak_to_peak]) + expected.columns = ["data1", "data2"] + result = grouped.agg(peak_to_peak) + tm.assert_frame_equal(result, expected) + + +def test_agg_datetimes_mixed(): + data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]] + + df1 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + data = [ + [ + row[0], + (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None), + row[2], + ] + for row in data + ] + + df2 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x 
in data], + } + ) + + df1["weights"] = df1["value"] / df1["value"].sum() + gb1 = df1.groupby("date").aggregate(np.sum) + + df2["weights"] = df1["value"] / df1["value"].sum() + gb2 = df2.groupby("date").aggregate(np.sum) + + assert len(gb1) == len(gb2) + + +def test_agg_period_index(): + prng = period_range("2012-1-1", freq="M", periods=3) + df = DataFrame(np.random.randn(3, 2), index=prng) + rs = df.groupby(level=0).sum() + assert isinstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start="1999-01", periods=5, freq="M") + s1 = Series(np.random.rand(len(index)), index=index) + s2 = Series(np.random.rand(len(index)), index=index) + df = DataFrame.from_dict({"s1": s1, "s2": s2}) + grouped = df.groupby(df.index.month) + list(grouped) + + +def test_agg_dict_parameter_cast_result_dtypes(): + # GH 12821 + + df = DataFrame( + { + "class": ["A", "A", "B", "B", "C", "C", "D", "D"], + "time": date_range("1/1/2011", periods=8, freq="H"), + } + ) + df.loc[[0, 1, 2, 5], "time"] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index("class") + grouped = df.groupby("class") + tm.assert_frame_equal(grouped.first(), exp) + tm.assert_frame_equal(grouped.agg("first"), exp) + tm.assert_frame_equal(grouped.agg({"time": "first"}), exp) + tm.assert_series_equal(grouped.time.first(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("first"), exp["time"]) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index("class") + grouped = df.groupby("class") + tm.assert_frame_equal(grouped.last(), exp) + tm.assert_frame_equal(grouped.agg("last"), exp) + tm.assert_frame_equal(grouped.agg({"time": "last"}), exp) + tm.assert_series_equal(grouped.time.last(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) + + # count + exp = pd.Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") + tm.assert_series_equal(grouped.time.agg(len), exp) + tm.assert_series_equal(grouped.time.size(), exp) + + exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") + tm.assert_series_equal(grouped.time.count(), exp) + + +def test_agg_cast_results_dtypes(): + # similar to GH12821 + # xref #11444 + u = [dt.datetime(2015, x + 1, 1) for x in range(12)] + v = list("aaabbbbbbccd") + df = pd.DataFrame({"X": v, "Y": u}) + + result = df.groupby("X")["Y"].agg(len) + expected = df.groupby("X")["Y"].count() + tm.assert_series_equal(result, expected) + + +def test_aggregate_float64_no_int64(): + # see gh-11199 + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) + + expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a"]].mean() + tm.assert_frame_equal(result, expected) + + expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) + expected.index.name = "b" + + result = df.groupby("b")[["a", "c"]].mean() + tm.assert_frame_equal(result, expected) + + +def test_aggregate_api_consistency(): + # GH 9052 + # make sure that the aggregates via dict + # are consistent + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + grouped = df.groupby(["A", "B"]) + c_mean = grouped["C"].mean() + c_sum = grouped["C"].sum() + d_mean = grouped["D"].mean() + d_sum = grouped["D"].sum() + + result = grouped["D"].agg(["sum", "mean"]) + expected = 
pd.concat([d_sum, d_mean], axis=1) + expected.columns = ["sum", "mean"] + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg([np.sum, np.mean]) + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped[["D", "C"]].agg([np.sum, np.mean]) + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({"C": "mean", "D": "sum"}) + expected = pd.concat([d_sum, c_mean], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]}) + expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) + + +def test_agg_dict_renaming_deprecation(): + # 15931 + df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg( + {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} + ) + + with pytest.raises(SpecificationError, match=msg): + df.groupby("A")[["B", "C"]].agg({"ma": "max"}) + + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").B.agg({"foo": "count"}) + + +def test_agg_compat(): + # GH 12334 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": ["sum", "std"]}) + + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": "sum", "D": "std"}) + + +def test_agg_nested_dicts(): + # API change for disallowing these types of nested dicts + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) + + with pytest.raises(SpecificationError, match=msg): + g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) + + # same name as the original column + # GH9052 + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"result1": np.sum, "result2": np.mean}) + + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"D": np.sum, "result2": np.mean}) + + +def test_agg_item_by_item_raise_typeerror(): + df = DataFrame(np.random.randint(10, size=(20, 10))) + + def raiseException(df): + pprint_thing("----------------------------------------") + pprint_thing(df.to_string()) + raise TypeError("test") + + with pytest.raises(TypeError, match="test"): + df.groupby(0).agg(raiseException) + + +def test_series_agg_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result 
= grouped.agg(np.sum) + expected = grouped.sum() + tm.assert_series_equal(result, expected) + + +def test_series_agg_multi_pure_python(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + def bad(x): + assert len(x.values.base) > 0 + return "foo" + + result = data.groupby(["A", "B"]).agg(bad) + expected = data.groupby(["A", "B"]).agg(lambda x: "foo") + tm.assert_frame_equal(result, expected) + + +def test_agg_consistency(): + # agg with ([]) and () not consistent + # GH 6715 + def P1(a): + return np.percentile(a.dropna(), q=1) + + df = DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [10, 25, 26, 31], + "date": [ + dt.date(2013, 2, 10), + dt.date(2013, 2, 10), + dt.date(2013, 2, 11), + dt.date(2013, 2, 11), + ], + } + ) + + g = df.groupby("date") + + expected = g.agg([P1]) + expected.columns = expected.columns.levels[0] + + result = g.agg(P1) + tm.assert_frame_equal(result, expected) + + +def test_agg_callables(): + # GH 7929 + df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) + + class fn_class: + def __call__(self, x): + return sum(x) + + equiv_callables = [ + sum, + np.sum, + lambda x: sum(x), + lambda x: x.sum(), + partial(sum), + fn_class(), + ] + + expected = df.groupby("foo").agg(sum) + for ecall in equiv_callables: + result = df.groupby("foo").agg(ecall) + tm.assert_frame_equal(result, expected) + + +def test_agg_over_numpy_arrays(): + # GH 3788 + df = pd.DataFrame( + [ + [1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])], + ], + columns=["category", "arraydata"], + ) + result = df.groupby("category").agg(sum) + + expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] + expected_index = pd.Index([1, 2], name="category") + expected_column = ["arraydata"] + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) + + tm.assert_frame_equal(result, expected) + + +def test_agg_tzaware_non_datetime_result(): + # discussed in GH#29589, fixed in GH#29641, operating on tzaware values + # with function that is not dtype-preserving + dti = pd.date_range("2012-01-01", periods=4, tz="UTC") + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti}) + gb = df.groupby("a") + + # Case that _does_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0]) + expected = pd.Series(dti[::2], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # Cases that do _not_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0].year) + expected = pd.Series([2012, 2012], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) + expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + +def test_agg_timezone_round_trip(): + # GH 15426 + ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") + df = pd.DataFrame( + {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]} + ) + + result1 = df.groupby("a")["b"].agg(np.min).iloc[0] + result2 = df.groupby("a")["b"].agg(lambda x: 
np.min(x)).iloc[0] + result3 = df.groupby("a")["b"].min().iloc[0] + + assert result1 == ts + assert result2 == ts + assert result3 == ts + + dates = [ + pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) + ] + df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) + grouped = df.groupby("A") + + ts = df["B"].iloc[0] + assert ts == grouped.nth(0)["B"].iloc[0] + assert ts == grouped.head(1)["B"].iloc[0] + assert ts == grouped.first()["B"].iloc[0] + + # GH#27110 applying iloc should return a DataFrame + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + + ts = df["B"].iloc[2] + assert ts == grouped.last()["B"].iloc[0] + + # GH#27110 applying iloc should return a DataFrame + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + + +def test_sum_uint64_overflow(): + # see gh-14758 + # Convert to uint64 and don't overflow + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) + df = df + 9223372036854775807 + + index = pd.Index( + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 + ) + expected = pd.DataFrame( + {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, + index=index, + ) + + expected.index.name = 0 + result = df.groupby(0).sum() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + ( + lambda x: tuple(x), + pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), + ), + ( + lambda x: list(x), + pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), + ), + ], +) +def test_agg_structs_dataframe(structure, expected): + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) + + result = df.groupby(["A", "B"]).aggregate(structure) + expected.index.names = ["A", "B"] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + ], +) +def test_agg_structs_series(structure, expected): + # Issue #18079 + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) + + result = df.groupby("A")["C"].aggregate(structure) + expected.index.name = "A" + tm.assert_series_equal(result, expected) + + +def test_agg_category_nansum(observed): + categories = ["a", "b", "c"] + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} + ) + result = df.groupby("A", observed=observed).B.agg(np.nansum) + expected = pd.Series( + [3, 3, 0], + index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), + name="B", + ) + if observed: + expected = expected[expected != 0] + tm.assert_series_equal(result, expected) + + +def test_agg_list_like_func(): + # GH 18473 + df = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]} + ) + grouped = df.groupby("A", as_index=False, sort=False) + result = grouped.agg({"B": lambda x: list(x)}) + expected = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_with_timezone(): 
+ # GH 23683 + df = pd.DataFrame( + { + "tag": [1, 1], + "date": [ + pd.Timestamp("2018-01-01", tz="UTC"), + pd.Timestamp("2018-01-02", tz="UTC"), + ], + } + ) + result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) + expected = pd.DataFrame( + [pd.Timestamp("2018-01-01", tz="UTC")], + index=pd.Index([1], name="tag"), + columns=["date"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "err_cls", + [ + NotImplementedError, + RuntimeError, + KeyError, + IndexError, + OSError, + ValueError, + ArithmeticError, + AttributeError, + ], +) +def test_groupby_agg_err_catching(err_cls): + # make sure we suppress anything other than TypeError or AssertionError + # in _python_agg_general + + # Use a non-standard EA to make sure we don't go down ndarray paths + from pandas.tests.extension.decimal.array import DecimalArray, make_data, to_decimal + + data = make_data()[:5] + df = pd.DataFrame( + {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} + ) + + expected = pd.Series(to_decimal([data[0], data[3]])) + + def weird_func(x): + # weird function that raise something other than TypeError or IndexError + # in _python_agg_general + if len(x) == 0: + raise err_cls + return x.iloc[0] + + result = df["decimals"].groupby(df["id1"]).agg(weird_func) + tm.assert_series_equal(result, expected, check_names=False) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/conftest.py b/venv/Lib/site-packages/pandas/tests/groupby/conftest.py new file mode 100644 index 0000000..8901af7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/conftest.py @@ -0,0 +1,124 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex +import pandas._testing as tm +from pandas.core.groupby.base import reduction_kernels, transformation_kernels + + +@pytest.fixture +def mframe(): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + +@pytest.fixture +def df(): + return DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + + +@pytest.fixture +def ts(): + return tm.makeTimeSeries() + + +@pytest.fixture +def tsd(): + return tm.getTimeSeriesData() + + +@pytest.fixture +def tsframe(tsd): + return DataFrame(tsd) + + +@pytest.fixture +def df_mixed_floats(): + return DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.array(np.random.randn(8), dtype="float32"), + } + ) + + +@pytest.fixture +def three_group(): + return DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + +@pytest.fixture(params=sorted(reduction_kernels)) +def reduction_func(request): + """yields the string names of all groupby reduction functions, one at a time. 
+ """ + return request.param + + +@pytest.fixture(params=transformation_kernels) +def transformation_func(request): + """yields the string names of all groupby transformation functions.""" + return request.param + + +@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) +def groupby_func(request): + """yields both aggregation and transformation functions.""" + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_apply.py b/venv/Lib/site-packages/pandas/tests/groupby/test_apply.py new file mode 100644 index 0000000..9c3a832 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_apply.py @@ -0,0 +1,787 @@ +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, bdate_range +import pandas._testing as tm + + +def test_apply_issues(): + # GH 5788 + + s = """2011.05.16,00:00,1.40893 +2011.05.16,01:00,1.40760 +2011.05.16,02:00,1.40750 +2011.05.16,03:00,1.40649 +2011.05.17,02:00,1.40893 +2011.05.17,03:00,1.40760 +2011.05.17,04:00,1.40750 +2011.05.17,05:00,1.40649 +2011.05.18,02:00,1.40893 +2011.05.18,03:00,1.40760 +2011.05.18,04:00,1.40750 +2011.05.18,05:00,1.40649""" + + df = pd.read_csv( + StringIO(s), + header=None, + names=["date", "time", "value"], + parse_dates=[["date", "time"]], + ) + df = df.set_index("date_time") + + expected = df.groupby(df.index.date).idxmax() + result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) + tm.assert_frame_equal(result, expected) + + # GH 5789 + # don't auto coerce dates + df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) + exp_idx = pd.Index( + ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ) + expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) + tm.assert_series_equal(result, expected) + + +def test_apply_trivial(): + # GH 20066 + # trivial apply: ignore input and return a constant dataframe. + df = pd.DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + result = df.groupby([str(x) for x in df.dtypes], axis=1).apply( + lambda x: df.iloc[1:] + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail( + reason="GH#20066; function passed into apply " + "returns a DataFrame with the same index " + "as the one to create GroupBy object." +) +def test_apply_trivial_fail(): + # GH 20066 + # trivial apply fails if the constant dataframe has the same index + # with the one used to create GroupBy object. 
+    df = pd.DataFrame(
+        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
+        columns=["key", "data"],
+    )
+    expected = pd.concat([df, df], axis=1, keys=["float64", "object"])
+    result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(lambda x: df)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_fast_apply():
+    # make sure that fast apply is correctly called
+    # rather than raising any kind of error
+    # otherwise the python path will be called
+    # which slows things down
+    N = 1000
+    labels = np.random.randint(0, 2000, size=N)
+    labels2 = np.random.randint(0, 3, size=N)
+    df = DataFrame(
+        {
+            "key": labels,
+            "key2": labels2,
+            "value1": np.random.randn(N),
+            "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
+        }
+    )
+
+    def f(g):
+        return 1
+
+    g = df.groupby(["key", "key2"])
+
+    grouper = g.grouper
+
+    splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
+    group_keys = grouper._get_group_keys()
+
+    values, mutated = splitter.fast_apply(f, group_keys)
+
+    assert not mutated
+
+
+@pytest.mark.parametrize(
+    "df, group_names",
+    [
+        (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
+        (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
+        (DataFrame({"a": [1]}), [1]),
+        (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
+        (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
+        (
+            DataFrame(
+                {
+                    "a": list("aaabbbcccc"),
+                    "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
+                    "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
+                }
+            ),
+            ["a", "b", "c"],
+        ),
+        (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
+    ],
+    ids=[
+        "GH2936",
+        "GH7739 & GH10519",
+        "GH10519",
+        "GH2656",
+        "GH12155",
+        "GH20084",
+        "GH21417",
+    ],
+)
+def test_group_apply_once_per_group(df, group_names):
+    # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
+
+    # This test should ensure that a function is only evaluated
+    # once per group.
Previously the function has been evaluated twice + # on the first group to check if the Cython index slider is safe to use + # This test ensures that the side effect (append to list) is only triggered + # once per group + + names = [] + # cannot parameterize over the functions since they need external + # `names` to detect side effects + + def f_copy(group): + # this takes the fast apply path + names.append(group.name) + return group.copy() + + def f_nocopy(group): + # this takes the slow apply path + names.append(group.name) + return group + + def f_scalar(group): + # GH7739, GH2656 + names.append(group.name) + return 0 + + def f_none(group): + # GH10519, GH12155, GH21417 + names.append(group.name) + return None + + def f_constant_df(group): + # GH2936, GH20084 + names.append(group.name) + return DataFrame({"a": [1], "b": [1]}) + + for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: + del names[:] + + df.groupby("a").apply(func) + assert names == group_names + + +def test_apply_with_mixed_dtype(): + # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 + df = DataFrame( + { + "foo1": np.random.randn(6), + "foo2": ["one", "two", "two", "three", "one", "two"], + } + ) + result = df.apply(lambda x: x, axis=1).dtypes + expected = df.dtypes + tm.assert_series_equal(result, expected) + + # GH 3610 incorrect dtype conversion with as_index=False + df = DataFrame({"c1": [1, 2, 6, 6, 8]}) + df["c2"] = df.c1 / 2.0 + result1 = df.groupby("c2").mean().reset_index().c2 + result2 = df.groupby("c2", as_index=False).mean().c2 + tm.assert_series_equal(result1, result2) + + +def test_groupby_as_index_apply(df): + # GH #4648 and #3417 + df = DataFrame( + { + "item_id": ["b", "b", "a", "c", "a", "b"], + "user_id": [1, 2, 1, 1, 3, 1], + "time": range(6), + } + ) + + g_as = df.groupby("user_id", as_index=True) + g_not_as = df.groupby("user_id", as_index=False) + + res_as = g_as.head(2).index + res_not_as = g_not_as.head(2).index + exp = Index([0, 1, 2, 4]) + tm.assert_index_equal(res_as, exp) + tm.assert_index_equal(res_not_as, exp) + + res_as_apply = g_as.apply(lambda x: x.head(2)).index + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + + # apply doesn't maintain the original ordering + # changed in GH5610 as the as_index=False returns a MI here + exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) + + tm.assert_index_equal(res_as_apply, exp_as_apply) + tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + + ind = Index(list("abcde")) + df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) + res = df.groupby(0, as_index=False).apply(lambda x: x).index + tm.assert_index_equal(res, ind) + + +def test_apply_concat_preserve_names(three_group): + grouped = three_group.groupby(["A", "B"]) + + def desc(group): + result = group.describe() + result.index.name = "stat" + return result + + def desc2(group): + result = group.describe() + result.index.name = "stat" + result = result[: len(group)] + # weirdo + return result + + def desc3(group): + result = group.describe() + + # names are different + result.index.name = f"stat_{len(group):d}" + + result = result[: len(group)] + # weirdo + return result + + result = grouped.apply(desc) + assert result.index.names == ("A", "B", "stat") + + result2 = grouped.apply(desc2) + assert result2.index.names == ("A", "B", "stat") + + result3 = grouped.apply(desc3) + assert result3.index.names == ("A", 
"B", None) + + +def test_apply_series_to_frame(): + def f(piece): + with np.errstate(invalid="ignore"): + logged = np.log(piece) + return DataFrame( + {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} + ) + + dr = bdate_range("1/1/2000", periods=100) + ts = Series(np.random.randn(100), index=dr) + + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(f) + + assert isinstance(result, DataFrame) + tm.assert_index_equal(result.index, ts.index) + + +def test_apply_series_yield_constant(df): + result = df.groupby(["A", "B"])["C"].apply(len) + assert result.index.names[:2] == ("A", "B") + + +def test_apply_frame_yield_constant(df): + # GH13568 + result = df.groupby(["A", "B"]).apply(len) + assert isinstance(result, Series) + assert result.name is None + + result = df.groupby(["A", "B"])[["C", "D"]].apply(len) + assert isinstance(result, Series) + assert result.name is None + + +def test_apply_frame_to_series(df): + grouped = df.groupby(["A", "B"]) + result = grouped.apply(len) + expected = grouped.count()["C"] + tm.assert_index_equal(result.index, expected.index) + tm.assert_numpy_array_equal(result.values, expected.values) + + +def test_apply_frame_concat_series(): + def trans(group): + return group.groupby("B")["C"].sum().sort_values()[:2] + + def trans2(group): + grouped = group.groupby(df.reindex(group.index)["B"]) + return grouped.sum().sort_values()[:2] + + df = DataFrame( + { + "A": np.random.randint(0, 5, 1000), + "B": np.random.randint(0, 5, 1000), + "C": np.random.randn(1000), + } + ) + + result = df.groupby("A").apply(trans) + exp = df.groupby("A")["C"].apply(trans2) + tm.assert_series_equal(result, exp, check_names=False) + assert result.name == "C" + + +def test_apply_transform(ts): + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + tm.assert_series_equal(result, expected) + + +def test_apply_multikey_corner(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + + def f(group): + return group.sort_values("A")[-5:] + + result = grouped.apply(f) + for key, group in grouped: + tm.assert_frame_equal(result.loc[key], f(group)) + + +def test_apply_chunk_view(): + # Low level tinkering could be unsafe, make sure not + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) + + result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) + expected = df.take([0, 1, 3, 4, 6, 7]) + tm.assert_frame_equal(result, expected) + + +def test_apply_no_name_column_conflict(): + df = DataFrame( + { + "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + "value": range(9, -1, -1), + } + ) + + # it works! 
#2605 + grouped = df.groupby(["name", "name2"]) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) + + +def test_apply_typecast_fail(): + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + } + ) + + def f(group): + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby("d").apply(f) + + expected = df.copy() + expected["v2"] = np.tile([0.0, 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_multiindex_fail(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) + + def f(group): + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby("d").apply(f) + + expected = df.copy() + expected["v2"] = np.tile([0.0, 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_corner(tsframe): + result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = tsframe * 2 + tm.assert_frame_equal(result, expected) + + +def test_apply_without_copy(): + # GH 5545 + # returning a non-copy in an applied function fails + + data = DataFrame( + { + "id_field": [100, 100, 200, 300], + "category": ["a", "b", "c", "c"], + "value": [1, 2, 3, 4], + } + ) + + def filt1(x): + if x.shape[0] == 1: + return x.copy() + else: + return x[x.category == "c"] + + def filt2(x): + if x.shape[0] == 1: + return x + else: + return x[x.category == "c"] + + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) + tm.assert_frame_equal(result, expected) + + +def test_apply_corner_cases(): + # #535, can't use sliding iterator + + N = 1000 + labels = np.random.randint(0, 100, size=N) + df = DataFrame( + { + "key": labels, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) + + grouped = df.groupby("key") + + def f(g): + g["value3"] = g["value1"] * 2 + return g + + result = grouped.apply(f) + assert "value3" in result + + +def test_apply_numeric_coercion_when_datetime(): + # In the past, group-by/apply operations have been over-eager + # in converting dtypes to numeric, in the presence of datetime + # columns. Various GH issues were filed, the reproductions + # for which are here. 
+ + # GH 15670 + df = pd.DataFrame( + {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} + ) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + df.Date = pd.to_datetime(df.Date) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(result["Str"], expected["Str"]) + + # GH 15421 + df = pd.DataFrame( + {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} + ) + + def get_B(g): + return g.iloc[0][["B"]] + + result = df.groupby("A").apply(get_B)["B"] + expected = df.B + expected.index = df.A + tm.assert_series_equal(result, expected) + + # GH 14423 + def predictions(tool): + out = pd.Series(index=["p1", "p2", "useTime"], dtype=object) + if "step1" in list(tool.State): + out["p1"] = str(tool[tool.State == "step1"].Machine.values[0]) + if "step2" in list(tool.State): + out["p2"] = str(tool[tool.State == "step2"].Machine.values[0]) + out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0]) + return out + + df1 = pd.DataFrame( + { + "Key": ["B", "B", "A", "A"], + "State": ["step1", "step2", "step1", "step2"], + "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"], + "Machine": ["23", "36L", "36R", "36R"], + } + ) + df2 = df1.copy() + df2.oTime = pd.to_datetime(df2.oTime) + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 + tm.assert_series_equal(expected, result) + + +def test_apply_aggregating_timedelta_and_datetime(): + # Regression test for GH 15562 + # The following groupby caused ValueErrors and IndexErrors pre 0.20.0 + + df = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ) + df["time_delta_zero"] = df.datetime - df.datetime + result = df.groupby("clientid").apply( + lambda ddf: pd.Series( + dict(clientid_age=ddf.time_delta_zero.min(), date=ddf.datetime.min()) + ) + ) + expected = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "clientid_age": [np.timedelta64(0, "D")] * 3, + "date": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ).set_index("clientid") + + tm.assert_frame_equal(result, expected) + + +def test_time_field_bug(): + # Test a fix for the following error related to GH issue 11324 When + # non-key fields in a group-by dataframe contained time-based fields + # that were not returned by the apply function, an exception would be + # raised. 
+
+    df = pd.DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]})
+
+    def func_with_no_date(batch):
+        return pd.Series({"c": 2})
+
+    def func_with_date(batch):
+        return pd.Series({"b": datetime(2015, 1, 1), "c": 2})
+
+    dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date)
+    dfg_no_conversion_expected = pd.DataFrame({"c": 2}, index=[1])
+    dfg_no_conversion_expected.index.name = "a"
+
+    dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
+    dfg_conversion_expected = pd.DataFrame(
+        {"b": datetime(2015, 1, 1), "c": 2}, index=[1]
+    )
+    dfg_conversion_expected.index.name = "a"
+
+    tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
+    tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
+
+
+def test_gb_apply_list_of_unequal_len_arrays():
+
+    # GH1738
+    df = DataFrame(
+        {
+            "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"],
+            "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"],
+            "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
+            "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
+        }
+    )
+    df = df.set_index(["group1", "group2"])
+    df_grouped = df.groupby(level=["group1", "group2"], sort=True)
+
+    def noddy(value, weight):
+        out = np.array(value * weight).repeat(3)
+        return out
+
+    # the kernel function returns arrays of unequal length
+    # pandas sniffs the first one, sees it's an array and not
+    # a list, and assumes the rest are of equal length
+    # and so tries a vstack
+
+    # don't die
+    df_grouped.apply(lambda x: noddy(x.value, x.weight))
+
+
+def test_groupby_apply_all_none():
+    # Tests to make sure no errors if apply function returns all None
+    # values. Issue 9684.
+    test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})
+
+    def test_func(x):
+        pass
+
+    result = test_df.groupby("groups").apply(test_func)
+    expected = DataFrame()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_apply_none_first():
+    # GH 12824. Tests if apply returns None first.
+ test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) + test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) + + def test_func(x): + if x.shape[0] < 2: + return None + return x.iloc[[0, -1]] + + result1 = test_df1.groupby("groups").apply(test_func) + result2 = test_df2.groupby("groups").apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) + expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) + expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) + + +def test_groupby_apply_return_empty_chunk(): + # GH 22221: apply filter which returns some empty groups + df = pd.DataFrame(dict(value=[0, 1], group=["filled", "empty"])) + groups = df.groupby("group") + result = groups.apply(lambda group: group[group.value != 1]["value"]) + expected = pd.Series( + [0], + name="value", + index=MultiIndex.from_product( + [["empty", "filled"], [0]], names=["group", None] + ).drop("empty"), + ) + tm.assert_series_equal(result, expected) + + +def test_apply_with_mixed_types(): + # gh-20949 + df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) + g = df.groupby("A") + + result = g.transform(lambda x: x / x.sum()) + expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) + tm.assert_frame_equal(result, expected) + + result = g.apply(lambda x: x / x.sum()) + tm.assert_frame_equal(result, expected) + + +def test_func_returns_object(): + # GH 28652 + df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) + result = df.groupby("a").apply(lambda g: g.index) + expected = Series( + [pd.Int64Index([1]), pd.Int64Index([2])], index=pd.Int64Index([1, 2], name="a") + ) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "group_column_dtlike", + [datetime.today(), datetime.today().date(), datetime.today().time()], +) +def test_apply_datetime_issue(group_column_dtlike): + # GH-28247 + # groupby-apply throws an error if one of the columns in the DataFrame + # is a datetime object and the column labels are different from + # standard int values in range(len(num_columns)) + + df = pd.DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) + result = df.groupby("a").apply(lambda x: pd.Series(["spam"], index=[42])) + + expected = pd.DataFrame( + ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] + ) + tm.assert_frame_equal(result, expected) + + +def test_apply_series_return_dataframe_groups(): + # GH 10078 + tdf = DataFrame( + { + "day": { + 0: pd.Timestamp("2015-02-24 00:00:00"), + 1: pd.Timestamp("2015-02-24 00:00:00"), + 2: pd.Timestamp("2015-02-24 00:00:00"), + 3: pd.Timestamp("2015-02-24 00:00:00"), + 4: pd.Timestamp("2015-02-24 00:00:00"), + }, + "userAgent": { + 0: "some UA string", + 1: "some UA string", + 2: "some UA string", + 3: "another UA string", + 4: "some UA string", + }, + "userId": { + 0: "17661101", + 1: "17661101", + 2: "17661101", + 3: "17661101", + 4: "17661101", + }, + } + ) + + def most_common_values(df): + return Series({c: s.value_counts().index[0] for c, s in df.iteritems()}) + + result = tdf.groupby("day").apply(most_common_values)["userId"] + expected = pd.Series( + ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("category", [False, True]) +def 
test_apply_multi_level_name(category): + # https://github.com/pandas-dev/pandas/issues/31068 + b = [1, 2] * 5 + if category: + b = pd.Categorical(b, categories=[1, 2, 3]) + df = pd.DataFrame( + {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} + ).set_index(["A", "B"]) + result = df.groupby("B").apply(lambda x: x.sum()) + expected = pd.DataFrame( + {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") + ) + tm.assert_frame_equal(result, expected) + assert df.index.names == ["A", "B"] + + +@pytest.mark.parametrize( + "index", + [ + pd.CategoricalIndex(list("abc")), + pd.interval_range(0, 3), + pd.period_range("2020", periods=3, freq="D"), + pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), + ], +) +def test_apply_index_has_complex_internals(index): + # GH 31248 + df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) + result = df.groupby("group").apply(lambda x: x) + tm.assert_frame_equal(result, df) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_bin_groupby.py b/venv/Lib/site-packages/pandas/tests/groupby/test_bin_groupby.py new file mode 100644 index 0000000..ad71f73 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_bin_groupby.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from pandas._libs import groupby, lib, reduction as libreduction + +from pandas.core.dtypes.common import ensure_int64 + +from pandas import Index, Series, isna +import pandas._testing as tm + + +def test_series_grouper(): + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + tm.assert_almost_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + tm.assert_almost_equal(counts, exp_counts) + + +def test_series_grouper_requires_nonempty_raises(): + # GH#29500 + obj = Series(np.random.randn(10)) + dummy = obj[:0] + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): + libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) + + +def test_series_bin_grouper(): + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + bins = np.array([3, 6]) + + grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + tm.assert_almost_equal(result, expected) + + exp_counts = np.array([3, 3, 4], dtype=np.int64) + tm.assert_almost_equal(counts, exp_counts) + + +@pytest.mark.parametrize( + "binner,closed,expected", + [ + ( + np.array([0, 3, 6, 9], dtype=np.int64), + "left", + np.array([2, 5, 6], dtype=np.int64), + ), + ( + np.array([0, 3, 6, 9], dtype=np.int64), + "right", + np.array([3, 6, 6], dtype=np.int64), + ), + (np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)), + ( + np.array([0, 3, 6], dtype=np.int64), + "right", + np.array([3, 6], dtype=np.int64), + ), + ], +) +def test_generate_bins(binner, closed, expected): + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + result = lib.generate_bins_dt64(values, binner, closed=closed) + tm.assert_numpy_array_equal(result, expected) + + +def test_group_ohlc(): + def _check(dtype): + obj = np.array(np.random.randn(20), dtype=dtype) + + bins = np.array([6, 12, 20]) + 
out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) + + func = getattr(groupby, f"group_ohlc_{dtype}") + func(out, counts, obj[:, None], labels) + + def _ohlc(group): + if isna(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) + + tm.assert_almost_equal(out, expected) + tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) + + obj[:6] = np.nan + func(out, counts, obj[:, None], labels) + expected[0] = np.nan + tm.assert_almost_equal(out, expected) + + _check("float32") + _check("float64") + + +class TestMoments: + pass + + +class TestReducer: + def test_int_index(self): + arr = np.random.randn(100, 4) + + msg = "Must pass either dummy and labels, or neither" + # we must pass either both labels and dummy, or neither + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction( + arr, np.sum, axis=1, labels=Index(np.arange(100)) + ) + + dummy = Series(0.0, index=np.arange(100)) + result = libreduction.compute_reduction( + arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) + ) + expected = arr.sum(0) + tm.assert_almost_equal(result, expected) + + dummy = Series(0.0, index=np.arange(4)) + result = libreduction.compute_reduction( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) + expected = arr.sum(1) + tm.assert_almost_equal(result, expected) + + result = libreduction.compute_reduction( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) + tm.assert_almost_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_categorical.py b/venv/Lib/site-packages/pandas/tests/groupby/test_categorical.py new file mode 100644 index 0000000..1c2de8c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_categorical.py @@ -0,0 +1,1378 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.compat import PY37 + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + qcut, +) +import pandas._testing as tm + + +def cartesian_product_for_groupers(result, args, names): + """ Reindex to a cartesian production for the groupers, + preserving the nature (Categorical) of each grouper """ + + def f(a): + if isinstance(a, (CategoricalIndex, Categorical)): + categories = a.categories + a = Categorical.from_codes( + np.arange(len(categories)), categories=categories, ordered=a.ordered + ) + return a + + index = MultiIndex.from_product(map(f, args), names=names) + return result.reindex(index).sort_index() + + +def test_apply_use_categorical_name(df): + cats = qcut(df.C, 4) + + def get_stats(group): + return { + "min": group.min(), + "max": group.max(), + "count": group.count(), + "mean": group.mean(), + } + + result = df.groupby(cats, observed=False).D.apply(get_stats) + assert result.index.names[0] == "C" + + +def test_basic(): + + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True) + expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index) + result = 
data.groupby("b", observed=False).mean() + tm.assert_frame_equal(result, expected) + + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + + # single grouper + gb = df.groupby("A", observed=False) + exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) + expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # GH 8623 + x = DataFrame( + [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) + + g = x.groupby(["person_id"], observed=False) + result = g.transform(lambda x: x) + tm.assert_frame_equal(result, x[["person_name"]]) + + result = x.drop_duplicates("person_name") + expected = x.iloc[[0, 1]] + tm.assert_frame_equal(result, expected) + + def f(x): + return x.drop_duplicates("person_name").iloc[0] + + result = g.apply(f) + expected = x.iloc[[0, 1]].copy() + expected.index = Index([1, 2], name="person_id") + expected["person_name"] = expected["person_name"].astype("object") + tm.assert_frame_equal(result, expected) + + # GH 9921 + # Monotonic + df = DataFrame({"a": [5, 15, 25]}) + c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) + + result = df.a.groupby(c, observed=False).transform(sum) + tm.assert_series_equal(result, df["a"]) + + tm.assert_series_equal( + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]] + ) + + # Filter + tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) + tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) + + # Non-monotonic + df = DataFrame({"a": [5, 15, 25, -5]}) + c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) + + result = df.a.groupby(c, observed=False).transform(sum) + tm.assert_series_equal(result, df["a"]) + + tm.assert_series_equal( + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] + ) + + # GH 9603 + df = DataFrame({"a": [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) + result = df.groupby(c, observed=False).apply(len) + + exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) + expected = Series([1, 0, 0, 0], index=exp_index) + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # more basic + levels = ["foo", "bar", "baz", "qux"] + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats, observed=False).mean() + + expected = data.groupby(np.asarray(cats), observed=False).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) + expected = expected.reindex(exp_idx) + + tm.assert_frame_equal(result, expected) + + grouped = data.groupby(cats, observed=False) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + 
exp_cats = Categorical( + ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"] + ) + expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() + tm.assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + + +def test_level_get_group(observed): + # GH15155 + df = DataFrame( + data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + g = df.groupby(level=["Index1"], observed=observed) + + # expected should equal test.loc[["a"]] + # GH15166 + expected = DataFrame( + data=np.arange(2, 12, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(5)], + codes=[[0] * 5, range(5)], + names=["Index1", "Index2"], + ), + ) + result = g.get_group("a") + + tm.assert_frame_equal(result, expected) + + +# GH#21636 flaky on py37; may be related to older numpy, see discussion +# https://github.com/MacPython/pandas-wheels/pull/64 +@pytest.mark.xfail(PY37, reason="Flaky, GH-27902", strict=False) +@pytest.mark.parametrize("ordered", [True, False]) +def test_apply(ordered): + # GH 10138 + + dense = Categorical(list("abc"), ordered=ordered) + + # 'b' is in the categories but not in the list + missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({"missing": missing, "dense": dense, "values": values}) + grouped = df.groupby(["missing", "dense"], observed=True) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) + expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) + + # GH#21636 tracking down the xfail, in some builds np.mean(df.loc[[0]]) + # is coming back as Series([0., 1., 0.], index=["missing", "dense", "values"]) + # when we expect Series(0., index=["values"]) + result = grouped.apply(lambda x: np.mean(x)) + tm.assert_frame_equal(result, expected) + + # we coerce back to ints + expected = expected.astype("int") + result = grouped.mean() + tm.assert_frame_equal(result, expected) + + result = grouped.agg(np.mean) + tm.assert_frame_equal(result, expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) + expected = Series(1, index=idx) + result = grouped.apply(lambda x: 1) + tm.assert_series_equal(result, expected) + + +def test_observed(observed): + # multiple groupers, don't re-expand the output space + # of the grouper + # gh-14942 (implement) + # gh-10132 (back-compat) + # gh-8138 (back-compat) + # gh-8869 + + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df["C"] = ["foo", "bar"] * 2 + + # multiple groupers with a non-cat + gb = df.groupby(["A", "B", "C"], observed=observed) + exp_index = MultiIndex.from_arrays( + [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"] + ) + expected = DataFrame({"values": Series([1, 2, 3, 4], 
index=exp_index)}).sort_index() + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers( + expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + gb = df.groupby(["A", "B"], observed=observed) + exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + + tm.assert_frame_equal(result, expected) + + # https://github.com/pandas-dev/pandas/issues/8138 + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } + df = DataFrame(d) + + # Grouping on a single column + groups_single_key = df.groupby("cat", observed=observed) + result = groups_single_key.mean() + + exp_index = CategoricalIndex( + list("ab"), name="cat", categories=list("abc"), ordered=True + ) + expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index) + if not observed: + index = CategoricalIndex( + list("abc"), name="cat", categories=list("abc"), ordered=True + ) + expected = expected.reindex(index) + + tm.assert_frame_equal(result, expected) + + # Grouping on two columns + groups_double_key = df.groupby(["cat", "ints"], observed=observed) + result = groups_double_key.agg("mean") + expected = DataFrame( + { + "val": [10, 30, 20, 40], + "cat": Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 2, 1, 2], + } + ).set_index(["cat", "ints"]) + if not observed: + expected = cartesian_product_for_groupers( + expected, [df.cat.values, [1, 2]], ["cat", "ints"] + ) + + tm.assert_frame_equal(result, expected) + + # GH 10132 + for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = df[(df.cat == c) & (df.ints == i)] + tm.assert_frame_equal(result, expected) + + # gh-8869 + # with as_index + d = { + "foo": [10, 8, 4, 8, 4, 1, 1], + "bar": [10, 20, 30, 40, 50, 60, 70], + "baz": ["d", "c", "e", "a", "a", "d", "c"], + } + df = DataFrame(d) + cat = pd.cut(df["foo"], np.linspace(0, 10, 3)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=False, observed=observed) + result = groups.agg("mean") + + groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed) + expected = groups2.agg("mean").reset_index() + tm.assert_frame_equal(result, expected) + + +def test_observed_codes_remap(observed): + d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]} + df = DataFrame(d) + values = pd.cut(df["C1"], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = df.groupby([values, "C2"], observed=observed) + + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) + expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx) + if not observed: + expected = cartesian_product_for_groupers( + expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] + ) + + result = groups_double_key.agg("mean") + tm.assert_frame_equal(result, expected) + + +def test_observed_perf(): + # we create a cartesian product, so this is + # non-performant if we don't use observed values + # gh-14942 + df = DataFrame( + { + "cat": np.random.randint(0, 255, size=30000), + "int_id": np.random.randint(0, 255, size=30000), + "other_id": np.random.randint(0, 10000, size=30000), + "foo": 0, + } + ) + df["cat"] = 
df.cat.astype(str).astype("category") + + grouped = df.groupby(["cat", "int_id", "other_id"], observed=True) + result = grouped.count() + assert result.index.levels[0].nunique() == df.cat.nunique() + assert result.index.levels[1].nunique() == df.int_id.nunique() + assert result.index.levels[2].nunique() == df.other_id.nunique() + + +def test_observed_groups(observed): + # gh-20583 + # test that we have the appropriate groups + + cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"]) + df = DataFrame({"cat": cat, "vals": [1, 2, 3]}) + g = df.groupby("cat", observed=observed) + + result = g.groups + if observed: + expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")} + else: + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "c": Index([1], dtype="int64"), + } + + tm.assert_dict_equal(result, expected) + + +def test_observed_groups_with_nan(observed): + # GH 24740 + df = DataFrame( + { + "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]), + "vals": [1, 2, 3], + } + ) + g = df.groupby("cat", observed=observed) + result = g.groups + if observed: + expected = {"a": Index([0, 2], dtype="int64")} + else: + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "d": Index([], dtype="int64"), + } + tm.assert_dict_equal(result, expected) + + +def test_observed_nth(): + # GH 26385 + cat = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"]) + ser = pd.Series([1, 2, 3]) + df = pd.DataFrame({"cat": cat, "ser": ser}) + + result = df.groupby("cat", observed=False)["ser"].nth(0) + + index = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + expected = pd.Series([1, np.nan, np.nan], index=index, name="ser") + expected.index.name = "cat" + + tm.assert_series_equal(result, expected) + + +def test_dataframe_categorical_with_nan(observed): + # GH 21151 + s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) + s2 = Series([1, 2, 3, 4]) + df = DataFrame({"s1": s1, "s2": s2}) + result = df.groupby("s1", observed=observed).first().reset_index() + if observed: + expected = DataFrame( + {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]} + ) + else: + expected = DataFrame( + { + "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "s2": [2, np.nan, np.nan], + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("observed", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): + # GH 25871: Fix groupby sorting on ordered Categoricals + # GH 25167: Groupby with observed=True doesn't sort + + # Build a dataframe with cat having one unobserved category ('missing'), + # and a Series with identical values + label = Categorical( + ["d", "a", "b", "a", "d", "b"], + categories=["a", "b", "missing", "d"], + ordered=ordered, + ) + val = Series(["d", "a", "b", "a", "d", "b"]) + df = DataFrame({"label": label, "val": val}) + + # aggregate on the Categorical + result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first") + + # If ordering works, we expect index labels equal to aggregation results, + # except for 'observed=False': label 'missing' has aggregation None + label = Series(result.index.array, dtype="object") + aggr = Series(result.array) + if not observed: + aggr[aggr.isna()] = "missing" + if not all(label == aggr): + msg = ( + f"Labels and aggregation 
results not consistently sorted\n"
+            + f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
+            + f"Result:\n{result}"
+        )
+        assert False, msg
+
+
+def test_datetime():
+    # GH9049: ensure backward compatibility
+    levels = pd.date_range("2014-01-01", periods=4)
+    codes = np.random.randint(0, 4, size=100)
+
+    cats = Categorical.from_codes(codes, levels, ordered=True)
+
+    data = DataFrame(np.random.randn(100, 4))
+    result = data.groupby(cats, observed=False).mean()
+
+    expected = data.groupby(np.asarray(cats), observed=False).mean()
+    expected = expected.reindex(levels)
+    expected.index = CategoricalIndex(
+        expected.index, categories=expected.index, ordered=True
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+    grouped = data.groupby(cats, observed=False)
+    desc_result = grouped.describe()
+
+    idx = cats.codes.argsort()
+    ord_labels = cats.take(idx)
+    ord_data = data.take(idx)
+    expected = ord_data.groupby(ord_labels, observed=False).describe()
+    tm.assert_frame_equal(desc_result, expected)
+    tm.assert_index_equal(desc_result.index, expected.index)
+    tm.assert_index_equal(
+        desc_result.index.get_level_values(0), expected.index.get_level_values(0)
+    )
+
+    # GH 10460
+    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
+    exp = CategoricalIndex(expc)
+    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
+    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
+    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
+
+
+def test_categorical_index():
+
+    s = np.random.RandomState(12345)
+    levels = ["foo", "bar", "baz", "qux"]
+    codes = s.randint(0, 4, size=20)
+    cats = Categorical.from_codes(codes, levels, ordered=True)
+    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
+    df["cats"] = cats
+
+    # with a cat index
+    result = df.set_index("cats").groupby(level=0, observed=False).sum()
+    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
+    expected.index = CategoricalIndex(
+        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # with a cat column, should produce a cat index
+    result = df.groupby("cats", observed=False).sum()
+    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
+    expected.index = CategoricalIndex(
+        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_describe_categorical_columns():
+    # GH 11558
+    cats = CategoricalIndex(
+        ["qux", "foo", "baz", "bar"],
+        categories=["foo", "bar", "baz", "qux"],
+        ordered=True,
+    )
+    df = DataFrame(np.random.randn(20, 4), columns=cats)
+    result = df.groupby([1, 2, 3, 4] * 5).describe()
+
+    tm.assert_index_equal(result.stack().columns, cats)
+    tm.assert_categorical_equal(result.stack().columns.values, cats.values)
+
+
+def test_unstack_categorical():
+    # GH11558 (example is taken from the original issue)
+    df = DataFrame(
+        {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
+    )
+    df["medium"] = df["medium"].astype("category")
+
+    gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
+    result = gcat.describe()
+
+    exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
+    tm.assert_index_equal(result.columns, exp_columns)
+    tm.assert_categorical_equal(result.columns.values, exp_columns.values)
+
+    result = gcat["A"] + gcat["B"]
+    expected = Series([6, 4], index=Index(["X",
"Y"], name="artist")) + tm.assert_series_equal(result, expected) + + +def test_bins_unequal_len(): + # GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + bins = pd.cut(series.dropna().values, 4) + + # len(bins) != len(series) here + with pytest.raises(ValueError): + series.groupby(bins).mean() + + +def test_as_index(): + # GH13204 + df = DataFrame( + { + "cat": Categorical([1, 2, 2], [1, 2, 3]), + "A": [10, 11, 11], + "B": [101, 102, 103], + } + ) + result = df.groupby(["cat", "A"], as_index=False, observed=True).sum() + expected = DataFrame( + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, "A"] + result = df.groupby(["cat", f], as_index=False, observed=True).sum() + expected = DataFrame( + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 22], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(["a", "b", "b"], name="cat") + result = df.groupby(["cat", s], as_index=False, observed=True).sum() + tm.assert_frame_equal(result, expected) + + # is original index dropped? + group_columns = ["cat", "A"] + expected = DataFrame( + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) + + for name in [None, "X", "B"]: + df.index = Index(list("abc"), name=name) + result = df.groupby(group_columns, as_index=False, observed=True).sum() + + tm.assert_frame_equal(result, expected) + + +def test_preserve_categories(): + # GH-13179 + categories = list("abc") + + # ordered=True + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)}) + index = CategoricalIndex(categories, categories, ordered=True, name="A") + tm.assert_index_equal( + df.groupby("A", sort=True, observed=False).first().index, index + ) + tm.assert_index_equal( + df.groupby("A", sort=False, observed=False).first().index, index + ) + + # ordered=False + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) + sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") + nosort_index = CategoricalIndex(list("bac"), list("bac"), ordered=False, name="A") + tm.assert_index_equal( + df.groupby("A", sort=True, observed=False).first().index, sort_index + ) + tm.assert_index_equal( + df.groupby("A", sort=False, observed=False).first().index, nosort_index + ) + + +def test_preserve_categorical_dtype(): + # GH13743, GH13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + # single grouper + exp_full = DataFrame( + { + "A": [2.0, 1.0, np.nan], + "B": [25.0, 20.0, np.nan], + "C1": Categorical(list("bac"), categories=list("bac"), ordered=False), + "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), + } + ) + for col in ["C1", "C2"]: + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + 
+@pytest.mark.parametrize( + "func, values", + [ + ("first", ["second", "first"]), + ("last", ["fourth", "third"]), + ("min", ["fourth", "first"]), + ("max", ["second", "third"]), + ], +) +def test_preserve_on_ordered_ops(func, values): + # gh-18502 + # preserve the categoricals on ops + c = pd.Categorical(["first", "second", "third", "fourth"], ordered=True) + df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) + g = df.groupby("payload") + result = getattr(g, func)() + expected = pd.DataFrame( + {"payload": [-2, -1], "col": pd.Series(values, dtype=c.dtype)} + ).set_index("payload") + tm.assert_frame_equal(result, expected) + + +def test_categorical_no_compress(): + data = Series(np.random.randn(9)) + + codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean() + + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) + tm.assert_series_equal(result, exp) + + codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) + cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) + + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) + tm.assert_series_equal(result, exp) + + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + result = data.groupby("b", observed=False).mean() + result = result["a"].values + exp = np.array([1, 2, 4, np.nan]) + tm.assert_numpy_array_equal(result, exp) + + +def test_groupby_empty_with_category(): + # GH-9614 + # test fix for when group by on None resulted in + # coercion of dtype categorical -> float + df = pd.DataFrame( + {"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])} + ) + result = df.groupby("A").first()["B"] + expected = pd.Series( + pd.Categorical([], categories=["test", "train"]), + index=pd.Series([], dtype="object", name="A"), + name="B", + ) + tm.assert_series_equal(result, expected) + + +def test_sort(): + + # https://stackoverflow.com/questions/23814368/sorting-pandas- + # categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + + res = df.groupby(["value_group"], observed=False)["value_group"].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + + +def test_sort2(): + # dataframe groupby sort was being ignored # GH 8868 + df = DataFrame( + [ + ["(7.5, 10]", 10, 10], + ["(7.5, 10]", 8, 20], + ["(2.5, 5]", 5, 30], + ["(5, 7.5]", 6, 40], + ["(2.5, 5]", 4, 50], + ["(0, 2.5]", 1, 60], + ["(5, 7.5]", 7, 70], + ], + columns=["range", "foo", "bar"], + ) + df["range"] = Categorical(df["range"], ordered=True) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 
5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + col = "range" + result_sort = df.groupby(col, sort=True, observed=False).first() + tm.assert_frame_equal(result_sort, expected_sort) + + # when categories is ordered, group is ordered by category's order + expected_sort = result_sort + result_sort = df.groupby(col, sort=False, observed=False).first() + tm.assert_frame_equal(result_sort, expected_sort) + + df["range"] = Categorical(df["range"], ordered=False) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range" + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + index = CategoricalIndex( + ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + categories=["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + name="range", + ) + expected_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"] + ) + + col = "range" + + # this is an unordered categorical, but we allow this #### + result_sort = df.groupby(col, sort=True, observed=False).first() + tm.assert_frame_equal(result_sort, expected_sort) + + result_nosort = df.groupby(col, sort=False, observed=False).first() + tm.assert_frame_equal(result_nosort, expected_nosort) + + +def test_sort_datetimelike(): + # GH10505 + + # use same data as test_groupby_sort_categorical, which category is + # corresponding to datetime.month + df = DataFrame( + { + "dt": [ + datetime(2011, 7, 1), + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 2, 1), + datetime(2011, 1, 1), + datetime(2011, 5, 1), + ], + "foo": [10, 8, 5, 6, 4, 1, 7], + "bar": [10, 20, 30, 40, 50, 60, 70], + }, + columns=["dt", "foo", "bar"], + ) + + # ordered=True + df["dt"] = Categorical(df["dt"], ordered=True) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] + result_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt", ordered=True) + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex( + index, categories=index, name="dt", ordered=True + ) + + col = "dt" + tm.assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first() + ) + + # when categories is ordered, group is ordered by category's order + tm.assert_frame_equal( + result_sort, df.groupby(col, sort=False, observed=False).first() + ) + + # ordered = False + df["dt"] = Categorical(df["dt"], ordered=False) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] + result_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt") + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex(index, categories=index, name="dt") + + col = "dt" + tm.assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first() + ) + 
tm.assert_frame_equal( + result_nosort, df.groupby(col, sort=False, observed=False).first() + ) + + +def test_empty_sum(): + # https://github.com/pandas-dev/pandas/issues/18678 + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") + + # 0 by default + result = df.groupby("A", observed=False).B.sum() + expected = Series([3, 1, 0], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A", observed=False).B.sum(min_count=0) + expected = Series([3, 1, 0], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A", observed=False).B.sum(min_count=1) + expected = Series([3, 1, np.nan], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + # min_count>1 + result = df.groupby("A", observed=False).B.sum(min_count=2) + expected = Series([3, np.nan, np.nan], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + +def test_empty_prod(): + # https://github.com/pandas-dev/pandas/issues/18678 + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) + + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") + + # 1 by default + result = df.groupby("A", observed=False).B.prod() + expected = Series([2, 1, 1], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A", observed=False).B.prod(min_count=0) + expected = Series([2, 1, 1], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A", observed=False).B.prod(min_count=1) + expected = Series([2, 1, np.nan], expected_idx, name="B") + tm.assert_series_equal(result, expected) + + +def test_groupby_multiindex_categorical_datetime(): + # https://github.com/pandas-dev/pandas/issues/21390 + + df = DataFrame( + { + "key1": Categorical(list("abcbabcba")), + "key2": Categorical( + list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3 + ), + "values": np.arange(9), + } + ) + result = df.groupby(["key1", "key2"]).mean() + + idx = MultiIndex.from_product( + [ + Categorical(["a", "b", "c"]), + Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)), + ], + names=["key1", "key2"], + ) + expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "as_index, expected", + [ + ( + True, + Series( + index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"] + ), + data=[1, 2, 3], + name="x", + ), + ), + ( + False, + DataFrame( + { + "a": Series([1, 1, 2], dtype="category"), + "b": [1, 2, 2], + "x": [1, 2, 3], + } + ), + ), + ], +) +def test_groupby_agg_observed_true_single_column(as_index, expected): + # GH-23970 + df = DataFrame( + {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]} + ) + + result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT]) +def test_shift(fill_value): + ct = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + expected = Categorical( + [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + res = ct.shift(1, fill_value=fill_value) + tm.assert_equal(res, expected) + + +@pytest.fixture +def 
df_cat(df): + """ + DataFrame with multiple categorical columns and a column of integers. + Shortened so as not to contain all possible combinations of categories. + Useful for testing `observed` kwarg functionality on GroupBy objects. + + Parameters + ---------- + df: DataFrame + Non-categorical, longer DataFrame from another fixture, used to derive + this one + + Returns + ------- + df_cat: DataFrame + """ + df_cat = df.copy()[:4] # leave out some groups + df_cat["A"] = df_cat["A"].astype("category") + df_cat["B"] = df_cat["B"].astype("category") + df_cat["C"] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(["D"], axis=1) + return df_cat + + +@pytest.mark.parametrize( + "operation, kwargs", [("agg", dict(dtype="category")), ("apply", dict())] +) +def test_seriesgroupby_observed_true(df_cat, operation, kwargs): + # GH 24880 + index = MultiIndex.from_frame( + DataFrame( + {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, + **kwargs, + ) + ) + expected = Series(data=[1, 3, 2, 4], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=True)["C"] + result = getattr(grouped, operation)(sum) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("operation", ["agg", "apply"]) +@pytest.mark.parametrize("observed", [False, None]) +def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): + # GH 24880 + index, _ = MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + ], + names=["A", "B"], + ).sortlevel() + + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] + result = getattr(grouped, operation)(sum) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "observed, index, data", + [ + ( + True, + MultiIndex.from_tuples( + [ + ("foo", "one", "min"), + ("foo", "one", "max"), + ("foo", "two", "min"), + ("foo", "two", "max"), + ("bar", "one", "min"), + ("bar", "one", "max"), + ("bar", "three", "min"), + ("bar", "three", "max"), + ], + names=["A", "B", None], + ), + [1, 1, 3, 3, 2, 2, 4, 4], + ), + ( + False, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ( + None, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ], +) +def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): + # GH 24880 + expected = Series(data=data, index=index, name="C") + result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply( + lambda x: {"min": x.min(), "max": x.max()} + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_categorical_series_dataframe_consistent(df_cat): + # GH 20416 + expected = df_cat.groupby(["A", "B"])["C"].mean() + result = df_cat.groupby(["A", "B"]).mean()["C"] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) +def test_groupby_categorical_axis_1(code): + # GH 13420 + df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) + cat = pd.Categorical.from_codes(code, 
categories=list("abc")) + result = df.groupby(cat, axis=1).mean() + expected = df.T.groupby(cat, axis=0).mean().T + tm.assert_frame_equal(result, expected) + + +def test_groupby_cat_preserves_structure(observed, ordered_fixture): + # GH 28787 + df = DataFrame( + {"Name": Categorical(["Bob", "Greg"], ordered=ordered_fixture), "Item": [1, 2]}, + columns=["Name", "Item"], + ) + expected = df.copy() + + result = ( + df.groupby("Name", observed=observed) + .agg(pd.DataFrame.sum, skipna=True) + .reset_index() + ) + + tm.assert_frame_equal(result, expected) + + +def test_get_nonexistent_category(): + # Accessing a Category that is not in the dataframe + df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) + with pytest.raises(KeyError, match="'vau'"): + df.groupby("var").apply( + lambda rows: pd.DataFrame( + {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} + ) + ) + + +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): + # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) + args = {"nth": [0]}.get(reduction_func, []) + + expected_length = 4 if observed else 16 + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + + assert len(result) == expected_length + + +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer) + + +def test_series_groupby_categorical_aggregation_getitem(): + # GH 8870 + d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} + df = pd.DataFrame(d) + cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=True, sort=True) + result = groups["foo"].agg("mean") + expected = groups.agg("mean")["foo"] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_values", + [(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])], +) +def 
test_groupby_agg_categorical_columns(func, expected_values): + # 31256 + df = pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "groups": [0, 1, 1, 2, 2], + "value": pd.Categorical([0, 0, 0, 0, 1]), + } + ).set_index("id") + result = df.groupby("groups").agg(func) + + expected = pd.DataFrame( + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_non_numeric(): + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])} + ) + expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2]) + + result = df.groupby([1, 2, 1]).agg(pd.Series.nunique) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 2, 1]).nunique() + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_counting.py b/venv/Lib/site-packages/pandas/tests/groupby/test_counting.py new file mode 100644 index 0000000..b4239d7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_counting.py @@ -0,0 +1,222 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp +import pandas._testing as tm + + +class TestCounting: + def test_cumcount(self): + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"]) + g = df.groupby("A") + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + tm.assert_series_equal(expected, g.cumcount()) + tm.assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = Series(dtype=object).groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype="int64") + + tm.assert_series_equal(e, ge.cumcount()) + tm.assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 + ) + g = df.groupby("A") + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + tm.assert_series_equal(expected, g.cumcount()) + tm.assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi) + g = df.groupby("A") + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + tm.assert_series_equal(expected, g.cumcount()) + tm.assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 + ) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + tm.assert_series_equal(expected, g.cumcount()) + tm.assert_series_equal(expected, sg.cumcount()) + + def test_ngroup(self): + df = DataFrame({"A": list("aaaba")}) + g = df.groupby("A") + sg = g.A + + expected = Series([0, 0, 0, 1, 0]) + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_distinct(self): + df = DataFrame({"A": list("abcde")}) + g = df.groupby("A") + sg = g.A + + expected = Series(range(5), dtype="int64") + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_one_group(self): + df = DataFrame({"A": [0] * 5}) + g = df.groupby("A") + sg = g.A + + expected = Series([0] * 5) + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, 
sg.ngroup()) + + def test_ngroup_empty(self): + ge = DataFrame().groupby(level=0) + se = Series(dtype=object).groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype="int64") + + tm.assert_series_equal(e, ge.ngroup()) + tm.assert_series_equal(e, se.ngroup()) + + def test_ngroup_series_matches_frame(self): + df = DataFrame({"A": list("aaaba")}) + s = Series(list("aaaba")) + + tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup()) + + def test_ngroup_dupe_index(self): + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby("A") + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame({"A": list("aaaba")}, index=mi) + g = df.groupby("A") + sg = g.A + expected = Series([0, 0, 0, 1, 0], index=mi) + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_groupby_not_col(self): + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + tm.assert_series_equal(expected, g.ngroup()) + tm.assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_descending(self): + df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"]) + g = df.groupby(["A"]) + + ascending = Series([0, 0, 1, 0, 1]) + descending = Series([1, 1, 0, 1, 0]) + + tm.assert_series_equal(descending, (g.ngroups - 1) - ascending) + tm.assert_series_equal(ascending, g.ngroup(ascending=True)) + tm.assert_series_equal(descending, g.ngroup(ascending=False)) + + def test_ngroup_matches_cumcount(self): + # verify one manually-worked out case works + df = DataFrame( + [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]], + columns=["A", "X"], + ) + g = df.groupby(["A", "X"]) + g_ngroup = g.ngroup() + g_cumcount = g.cumcount() + expected_ngroup = Series([0, 1, 2, 0, 3]) + expected_cumcount = Series([0, 0, 0, 1, 0]) + + tm.assert_series_equal(g_ngroup, expected_ngroup) + tm.assert_series_equal(g_cumcount, expected_cumcount) + + def test_ngroup_cumcount_pair(self): + # brute force comparison for all small series + for p in product(range(3), repeat=4): + df = DataFrame({"a": p}) + g = df.groupby(["a"]) + + order = sorted(set(p)) + ngroupd = [order.index(val) for val in p] + cumcounted = [p[:i].count(val) for i, val in enumerate(p)] + + tm.assert_series_equal(g.ngroup(), Series(ngroupd)) + tm.assert_series_equal(g.cumcount(), Series(cumcounted)) + + def test_ngroup_respects_groupby_order(self): + np.random.seed(0) + df = DataFrame({"a": np.random.choice(list("abcdef"), 100)}) + for sort_flag in (False, True): + g = df.groupby(["a"], sort=sort_flag) + df["group_id"] = -1 + df["group_index"] = -1 + + for i, (_, group) in enumerate(g): + df.loc[group.index, "group_id"] = i + for j, ind in enumerate(group.index): + df.loc[ind, "group_index"] = j + + tm.assert_series_equal(Series(df["group_id"].values), g.ngroup()) + tm.assert_series_equal(Series(df["group_index"].values), g.cumcount()) + + @pytest.mark.parametrize( + "datetimelike", + [ + [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)], + [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)], + [Timedelta(x, unit="h") for x in range(1, 4)], + [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], + ], + ) + def 
test_count_with_datetimelike(self, datetimelike): + # test for #13393, where DataframeGroupBy.count() fails + # when counting a datetimelike column. + + df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike}) + res = df.groupby("x").count() + expected = DataFrame({"y": [2, 1]}, index=["a", "b"]) + expected.index.name = "x" + tm.assert_frame_equal(expected, res) + + def test_count_with_only_nans_in_first_group(self): + # GH21956 + df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]}) + result = df.groupby(["A", "B"]).C.count() + mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) + expected = Series([], index=mi, dtype=np.int64, name="C") + tm.assert_series_equal(result, expected, check_index_type=False) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_filters.py b/venv/Lib/site-packages/pandas/tests/groupby/test_filters.py new file mode 100644 index 0000000..c16ad81 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_filters.py @@ -0,0 +1,597 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +def test_filter_series(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index), + ) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index), + ) + + +def test_filter_single_column_df(): + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. 
+ tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index), + ) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index), + ) + + +def test_filter_multi_column_df(): + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) + grouper = df["A"].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) + tm.assert_frame_equal( + grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected + ) + + +def test_filter_mixed_df(): + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) + + +def test_filter_out_all_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) + + +def test_filter_out_no_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + tm.assert_series_equal(filtered, s) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x["A"].mean() > 0) + tm.assert_frame_equal(filtered, df) + + +def test_filter_out_all_groups_in_df(): + # GH12768 + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) + expected = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) + tm.assert_frame_equal(expected, res) + + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) + expected = pd.DataFrame({"a": [], "b": []}, dtype="int64") + tm.assert_frame_equal(expected, res) + + +def test_filter_condition_raises(): + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + + s = pd.Series([-1, 0, 1, 2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + msg = "the filter must return a boolean result" + with pytest.raises(TypeError, match=msg): + grouped.filter(raise_if_sum_is_zero) + + +def test_filter_with_axis_in_groupby(): + # issue 11041 + index = pd.MultiIndex.from_product([range(10), [0, 1]]) + data = pd.DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") + result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10) + expected = data.iloc[:, 12:20] + tm.assert_frame_equal(result, expected) + + +def test_filter_bad_shapes(): + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby("B") + g_s = s.groupby(s) + + f = lambda x: x + msg = "filter function returned a DataFrame, but expected a scalar bool" + with pytest.raises(TypeError, match=msg): + g_df.filter(f) + msg = "the filter must return a boolean result" + with pytest.raises(TypeError, match=msg): + g_s.filter(f) + + 
f = lambda x: x == 1 + msg = "filter function returned a DataFrame, but expected a scalar bool" + with pytest.raises(TypeError, match=msg): + g_df.filter(f) + msg = "the filter must return a boolean result" + with pytest.raises(TypeError, match=msg): + g_s.filter(f) + + f = lambda x: np.outer(x, x) + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(TypeError, match=msg): + g_df.filter(f) + msg = "the filter must return a boolean result" + with pytest.raises(TypeError, match=msg): + g_s.filter(f) + + +def test_filter_nan_is_false(): + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby(df["B"]) + g_s = s.groupby(s) + + f = lambda x: np.nan + tm.assert_frame_equal(g_df.filter(f), df.loc[[]]) + tm.assert_series_equal(g_s.filter(f), s[[]]) + + +def test_filter_against_workaround(): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0, 100, 1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + + old_way = s[grouped.transform(f).astype("bool")] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Series of floats + s = 100 * Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype("bool")] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame( + { + "ints": Series(np.random.randint(0, 100, N)), + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) + + # Group by ints; filter on floats. + grouped = df.groupby("ints") + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] + new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + tm.assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby("letters") + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + +def test_filter_using_len(): + # BUG GH4447 + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + grouped = df.groupby("B") + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame( + {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, + index=np.arange(2, 6), + ) + tm.assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.loc[[]] + tm.assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. 
+ s = df["B"] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4 * ["b"], index=np.arange(2, 6), name="B") + tm.assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + tm.assert_series_equal(actual, expected) + + +def test_filter_maintains_ordering(): + # Simple case: index is sequential. #4621 + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + ) + s = df["pid"] + grouped = df.groupby("tag") + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df["tag"]) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df["pid"] + grouped = df.groupby("tag") + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df["tag"]) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df["pid"] + grouped = df.groupby("tag") + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df["tag"]) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + +def test_filter_multiple_timestamp(): + # GH 10114 + df = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": ["foo", "bar", "foo", "bar", "bar"], + "C": Timestamp("20130101"), + } + ) + + grouped = df.groupby(["B", "C"]) + + result = grouped["A"].filter(lambda x: True) + tm.assert_series_equal(df["A"], result) + + result = grouped["A"].transform(len) + expected = Series([2, 3, 2, 3, 3], name="A") + tm.assert_series_equal(result, expected) + + result = grouped.filter(lambda x: True) + tm.assert_frame_equal(df, result) + + result = grouped.transform("sum") + expected = DataFrame({"A": [2, 8, 2, 8, 8]}) + tm.assert_frame_equal(result, expected) + + result = grouped.transform(len) + expected = DataFrame({"A": [2, 3, 2, 3, 3]}) + tm.assert_frame_equal(result, expected) + + +def test_filter_and_transform_with_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") + # ^ made manually because this can get confusing! 
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_multiple_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_float_index(): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") + # ^ made manually because this can get confusing! 
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_timestamp_index(): + # GH4620 + t0 = Timestamp("2013-09-30 00:05:00") + t1 = Timestamp("2013-10-30 00:05:00") + t2 = Timestamp("2013-11-30 00:05:00") + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_string_index(): + # GH4620 + index = list("bbbcbbab") + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_has_access_to_grouped_cols(): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + # previously didn't have access to col A #???? 
+ filt = g.filter(lambda x: x["A"].sum() == 2) + tm.assert_frame_equal(filt, df.iloc[[0, 1]]) + + +def test_filter_enforces_scalarness(): + df = pd.DataFrame( + [ + ["best", "a", "x"], + ["worst", "b", "y"], + ["best", "c", "x"], + ["best", "d", "y"], + ["worst", "d", "y"], + ["worst", "d", "y"], + ["best", "d", "z"], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("c").filter(lambda g: g["a"] == "best") + + +def test_filter_non_bool_raises(): + df = pd.DataFrame( + [ + ["best", "a", 1], + ["worst", "b", 1], + ["best", "c", 1], + ["best", "d", 1], + ["worst", "d", 1], + ["worst", "d", 1], + ["best", "d", 1], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("a").filter(lambda g: g.c.mean()) + + +def test_filter_dropna_with_empty_groups(): + # GH 10780 + data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) + groupped = data.groupby(level=0) + result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) + expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) + tm.assert_series_equal(result_false, expected_false) + + result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) + expected_true = pd.Series(index=pd.Index([], dtype=int), dtype=np.float64) + tm.assert_series_equal(result_true, expected_true) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_function.py b/venv/Lib/site-packages/pandas/tests/groupby/test_function.py new file mode 100644 index 0000000..97cf1af --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_function.py @@ -0,0 +1,1557 @@ +import builtins +import datetime as dt +from io import StringIO +from itertools import product +from string import ascii_lowercase + +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm +import pandas.core.nanops as nanops +from pandas.util import _test_decorators as td + + +@pytest.mark.parametrize("agg_func", ["any", "all"]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) +def test_groupby_bool_aggs(agg_func, skipna, vals): + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == "any": + exp = False + + exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key")) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) + tm.assert_frame_equal(result, exp_df) + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) + + result = aa.groupby("nn").max() + assert "ss" in result + + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result + + result = aa.groupby("nn").min() + assert "ss" in result + + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result + + +def test_intercept_builtin_sum(): + s = 
Series([1.0, 2.0, np.nan, 3.0]) + grouped = s.groupby([0, 1, 2, 2]) + + result = grouped.agg(builtins.sum) + result2 = grouped.apply(builtins.sum) + expected = grouped.sum() + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + +# @pytest.mark.parametrize("f", [max, min, sum]) +# def test_builtins_apply(f): + + +@pytest.mark.parametrize("f", [max, min, sum]) +@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key +def test_builtins_apply(keys, f): + # see gh-8155 + df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) + df["jolie"] = np.random.randn(1000) + + fname = f.__name__ + result = df.groupby(keys).apply(f) + ngroups = len(df.drop_duplicates(subset=keys)) + + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" + assert result.shape == (ngroups, 3), assert_msg + + tm.assert_frame_equal( + result, # numpy's equivalent function + df.groupby(keys).apply(getattr(np, fname)), + ) + + if f != sum: + expected = df.groupby(keys).agg(fname).reset_index() + expected.set_index(keys, inplace=True, drop=False) + tm.assert_frame_equal(result, expected, check_dtype=False) + + tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) + + +def test_arg_passthru(): + # make sure that we are passing thru kwargs + # to our agg functions + + # GH3668 + # GH5724 + df = pd.DataFrame( + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + expected_columns_numeric = Index(["int", "float", "category_int"]) + + # mean / median + expected = pd.DataFrame( + { + "category_int": [7.5, 9], + "float": [4.5, 6.0], + "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], + "int": [1.5, 3], + "datetime": [ + pd.Timestamp("2013-01-01 12:00:00"), + pd.Timestamp("2013-01-03 00:00:00"), + ], + "datetimetz": [ + pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), + pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), + ], + }, + index=Index([1, 2], name="group"), + columns=["int", "float", "category_int", "datetime", "datetimetz", "timedelta"], + ) + + for attr in ["mean", "median"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_frame_equal(result.reindex_like(expected), expected) + + # TODO: min, max *should* handle + # categorical (ordered) dtype + expected_columns = Index( + [ + "int", + "float", + "string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["min", "max"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index( + [ + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["first", "last"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, 
expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(["int", "float", "string", "category_int", "timedelta"]) + for attr in ["sum"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(["int", "float", "category_int"]) + for attr in ["prod", "cumprod"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + # like min, max, but don't include strings + expected_columns = Index( + ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] + ) + for attr in ["cummin", "cummax"]: + f = getattr(df.groupby("group"), attr) + result = f() + # GH 15561: numeric_only=False set by default like min/max + tm.assert_index_equal(result.columns, expected_columns) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + expected_columns = Index(["int", "float", "category_int", "timedelta"]) + for attr in ["cumsum"]: + f = getattr(df.groupby("group"), attr) + result = f() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + result = f(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + +def test_non_cython_api(): + + # GH5610 + # non-cython calls should not include the grouper + + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"] + ) + g = df.groupby("A") + gni = df.groupby("A", as_index=False) + + # mad + expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" + result = g.mad() + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1]) + result = gni.mad() + tm.assert_frame_equal(result, expected) + + # describe + expected_index = pd.Index([1, 3], name="A") + expected_col = pd.MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = pd.DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) + result = g.describe() + tm.assert_frame_equal(result, expected) + + expected = pd.concat( + [ + df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T, + ] + ) + expected.index = pd.Index([0, 1]) + result = gni.describe() + tm.assert_frame_equal(result, expected) + + # any + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" + result = g.any() + tm.assert_frame_equal(result, expected) + + # idxmax + expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" + result = g.idxmax() + tm.assert_frame_equal(result, expected) + + +def test_cython_api2(): + + # this takes the fast apply path + + # cumsum (GH5614) + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) + result = df.groupby("A").cumsum() + tm.assert_frame_equal(result, expected) + + # GH 5755 - cumsum is a 
transformer and should ignore as_index + result = df.groupby("A", as_index=False).cumsum() + tm.assert_frame_equal(result, expected) + + # GH 13994 + result = df.groupby("A").cumsum(axis=1) + expected = df.cumsum(axis=1) + tm.assert_frame_equal(result, expected) + result = df.groupby("A").cumprod(axis=1) + expected = df.cumprod(axis=1) + tm.assert_frame_equal(result, expected) + + +def test_cython_median(): + df = DataFrame(np.random.randn(1000)) + df.values[::2] = np.nan + + labels = np.random.randint(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + exp = df.groupby(labels).agg(nanops.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.randn(1000, 5)) + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(observed): + df = pd.DataFrame(np.random.randint(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] +) +@pytest.mark.parametrize( + "method,data", + [ + ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}), + ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), + ], +) +def test_groupby_non_arithmetic_agg_types(dtype, method, data): + # GH9311, GH6620 + df = pd.DataFrame( + [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] + ) + + df["b"] = df.b.astype(dtype) + + if "args" not in data: + data["args"] = [] + + if "out_type" in data: + out_type = data["out_type"] + else: + out_type = dtype + + exp = data["df"] + df_out = pd.DataFrame(exp) + + df_out["b"] = df_out.b.astype(out_type) + df_out.set_index("a", inplace=True) + + grpd = df.groupby("a") + t = getattr(grpd, method)(*data["args"]) + tm.assert_frame_equal(t, df_out) + + +@pytest.mark.parametrize( + "i", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) +def test_groupby_non_arithmetic_agg_int_like_precision(i): + # see gh-6620, gh-9311 + df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) + + grp_exp = { + "first": {"expected": i[0]}, + "last": {"expected": i[1]}, + "min": {"expected": i[0]}, + "max": {"expected": i[1]}, + "nth": {"expected": i[1], "args": [1]}, + "count": {"expected": 2}, + } + + for method, data in grp_exp.items(): + if "args" not in data: + data["args"] = [] + + grouped = df.groupby("a") + res = getattr(grouped, method)(*data["args"]) + + assert res.iloc[0].b == data["expected"] + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) +def test_idxmin_idxmax_returns_int_types(func, values): + # GH 25444 + df = pd.DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] = pd.to_datetime(df["c_date"]) + + 
result = getattr(df.groupby("name"), func)() + + expected = pd.DataFrame(values, index=Index(["A", "B"], name="name")) + + tm.assert_frame_equal(result, expected) + + +def test_fill_consistency(): + + # GH9221 + # pass thru keyword arguments to the generated wrapper + # are set if the passed kw is None (only) + df = DataFrame( + index=pd.MultiIndex.from_product( + [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] + ), + columns=Index(["1", "2"], name="id"), + ) + df["1"] = [ + np.nan, + 1, + np.nan, + np.nan, + 11, + np.nan, + np.nan, + 2, + np.nan, + np.nan, + 22, + np.nan, + ] + df["2"] = [ + np.nan, + 3, + np.nan, + np.nan, + 33, + np.nan, + np.nan, + 4, + np.nan, + np.nan, + 44, + np.nan, + ] + + expected = df.groupby(level=0, axis=0).fillna(method="ffill") + result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T + tm.assert_frame_equal(result, expected) + + +def test_groupby_cumprod(): + # GH 4095 + df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) + + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + df = pd.DataFrame({"key": ["b"] * 100, "value": 2}) + actual = df.groupby("key")["value"].cumprod() + # if overflows, groupby product casts to float + # while numpy passes back invalid values + df["value"] = df["value"].astype(float) + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + "op,targop", + [ + ("mean", np.mean), + ("median", np.median), + ("std", np.std), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ("count", np.size), + pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy), + ], +) +def test_ops_general(op, targop): + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + df = pd.read_csv(StringIO(raw), parse_dates=[0]) + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r["File"].isna().any() + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + tm.assert_series_equal(gb.nlargest(3, keep="last"), e) + + +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.RandomState(123456789) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, names=["first", "second"]) + s = Series(npr.randn(20), index=idx) + + result = s.groupby("first").nlargest(1) + 
+ exp_idx = MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "two"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 2.2129019979039612, + 1.8417114045748335, + 0.858963679564603, + 1.3759151378258088, + 0.9430284594687134, + 0.5296914208183142, + 0.8318045593815487, + -0.8476703342910327, + 0.3804446884133735, + -0.8028845810770998, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, check_less_precise=True) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"]) +def test_numpy_compat(func): + # see gh-12811 + df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") + + msg = "numpy operations are not valid with groupby" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(foo=1) + + +def test_cummin_cummax(): + # GH 15048 + num_types = [np.int32, np.int64, np.float32, np.float64] + num_mins = [ + np.iinfo(np.int32).min, + np.iinfo(np.int64).min, + np.finfo(np.float32).min, + np.finfo(np.float64).min, + ] + num_max = [ + np.iinfo(np.int32).max, + np.iinfo(np.int64).max, + np.finfo(np.float32).max, + np.finfo(np.float64).max, + ] + base_df = pd.DataFrame( + {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} + ) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + for dtype, min_val, max_val in zip(num_types, num_mins, num_max): + df = base_df.astype(dtype) + + # cummin + expected = pd.DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummin w/ min value for dtype + df.loc[[2, 6], "B"] = min_val + expected.loc[[2, 3, 6, 7], "B"] = min_val + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # cummax + expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummax w/ max value for dtype + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in some values + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + 
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in entire column + base_df["B"] = np.nan + expected = pd.DataFrame({"B": [np.nan] * 8}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(expected, result) + result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(expected, result) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(expected, result) + result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(expected, result) + + # GH 15561 + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) + expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") + for method in ["cummax", "cummin"]: + result = getattr(df.groupby("a")["b"], method)() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) + result = df.groupby("a").b.cummax() + expected = pd.Series([2, 1, 2], name="b") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) + result = df.groupby("a").b.cummin() + expected = pd.Series([1, 2, 1], name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + df = pd.DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. 
+ expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + + df = pd.DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + +# describe +# -------------------------------- + + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level="first") + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack() + tm.assert_series_equal(result, expected) + + +def test_series_index_name(df): + grouped = df.loc[:, ["C"]].groupby(df["A"]) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == "A" + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = pd.DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # GH 4792 + 
prices = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 24990, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 25499, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = pd.DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = pd.DataFrame( + data, + index=pd.Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +# nunique +# -------------------------------- + + +@pytest.mark.parametrize("n", 10 ** np.arange(2, 6)) +@pytest.mark.parametrize("m", [10, 100, 1000]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +def test_series_groupby_nunique(n, m, sort, dropna): + def check_nunique(df, keys, as_index=True): + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr["julie"].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr["julie"].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + tm.assert_series_equal(left, right, check_names=False) + + days = date_range("2015-08-23", periods=10) + + frame = DataFrame( + { + "jim": np.random.choice(list(ascii_lowercase), n), + "joe": np.random.choice(days, n), + "julie": np.random.randint(0, m, n), + } + ) + + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + + frame.loc[1::17, "jim"] = None + frame.loc[3::37, "joe"] = None + frame.loc[7::19, "julie"] = None + frame.loc[8::19, "julie"] = None + frame.loc[9::19, "julie"] = None + + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + check_nunique(frame, ["jim"], as_index=False) + check_nunique(frame, ["jim", "joe"], as_index=False) + + +def test_nunique(): + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list("abc") + expected.index.name = "A" + result = df.groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = pd.DataFrame( + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], + ) + + result = data.groupby(["id", "amount"])["name"].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = pd.Series([1] * 5, name="name", index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = pd.Series(name="name", dtype=object) + result = data.groupby(level=0).nunique() + expected = pd.Series(name="name", 
dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = pd.DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = pd.DataFrame([2], columns=test.columns) + tm.assert_frame_equal(result, expected) + + +# count +# -------------------------------- + + +def test_groupby_timedelta_cython_count(): + df = DataFrame( + {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")} + ) + expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt") + result = df.groupby("g").delt.count() + tm.assert_series_equal(expected, result) + + +def test_count(): + n = 1 << 15 + dr = date_range("2015-08-30", periods=n // 10, freq="T") + + df = DataFrame( + { + "1st": np.random.choice(list(ascii_lowercase), n), + "2nd": np.random.randint(0, 5, n), + "3rd": np.random.randn(n).round(3), + "4th": np.random.randint(-10, 10, n), + "5th": np.random.choice(dr, n), + "6th": np.random.randn(n).round(3), + "7th": np.random.randn(n).round(3), + "8th": np.random.choice(dr, n) - np.random.choice(dr, 1), + "9th": np.random.choice(list(ascii_lowercase), n), + } + ) + + for col in df.columns.drop(["1st", "2nd", "4th"]): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df["9th"] = df["9th"].astype("category") + + for key in ["1st", "2nd", ["1st", "2nd"]]: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + tm.assert_frame_equal(left, right) + + +def test_count_non_nulls(): + # GH#5610 + # count counts non-nulls + df = pd.DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], + columns=["A", "B", "C"], + ) + + count_as = df.groupby("A").count() + count_not_as = df.groupby("A", as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) + expected.index.name = "A" + tm.assert_frame_equal(count_not_as, expected.reset_index()) + tm.assert_frame_equal(count_as, expected) + + count_B = df.groupby("A")["B"].count() + 
tm.assert_series_equal(count_B, expected["B"]) + + +def test_count_object(): + df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") + tm.assert_series_equal(result, expected) + + +def test_count_cross_type(): + # GH8169 + vals = np.hstack( + (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) + ) + + df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) + df[df == 2] = np.nan + expected = df.groupby(["c", "d"]).count() + + for t in ["float32", "object"]: + df["a"] = df["a"].astype(t) + df["b"] = df["b"].astype(t) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) + + +def test_lower_int_prec_count(): + df = DataFrame( + { + "a": np.array([0, 1, 2, 100], np.int8), + "b": np.array([1, 2, 3, 6], np.uint32), + "c": np.array([4, 5, 6, 8], np.int16), + "grp": list("ab" * 2), + } + ) + result = df.groupby("grp").count() + expected = DataFrame( + {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp") + ) + tm.assert_frame_equal(result, expected) + + +def test_count_uses_size_on_exception(): + class RaisingObjectException(Exception): + pass + + class RaisingObject: + def __init__(self, msg="I will raise inside Cython"): + super().__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) + result = df.groupby("grp").count() + expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp")) + tm.assert_frame_equal(result, expected) + + +# size +# -------------------------------- + + +def test_size(df): + grouped = df.groupby(["A", "B"]) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby("A") + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby("B") + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc")) + for sort, key in product((False, True), ("a", "b", ["a", "b"])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + # GH11699 + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) + + +def test_size_groupby_all_null(): + # GH23050 + # Assert no 'Value Error : Length of passed values is 2, index implies 0' + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(result, expected) + + +# quantile +# -------------------------------- +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # 
Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + list(pd.date_range("1/1/18", freq="D", periods=5)), + list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q): + if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip( + "Unclear numpy expectation for nearest result with equidistant data" + ) + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame( + {"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals} + ) + + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + result = df.groupby("key").quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) + expected = pd.DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) + expected = pd.DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = pd.DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + 
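+# A minimal usage sketch (kept out of pytest collection) of the array-like ``q``
+# behaviour exercised by the quantile tests above: ``groupby().quantile([q1, q2])``
+# returns a frame whose index gains the requested quantiles as the innermost level.
+# The frame and column values below are illustrative assumptions, not fixtures.
+def _quantile_array_usage_sketch():
+    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 3.0, 5.0, 7.0]})
+    out = df.groupby("key").quantile([0.25, 0.75])
+    # out.index is a MultiIndex: ("a", 0.25), ("a", 0.75), ("b", 0.25), ("b", 0.75)
+    return out
+
+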
+@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = pd.DataFrame( + np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) + ) + + idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = pd.DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = pd.DataFrame( + [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] + ) + + with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + df.groupby("key").quantile() + + +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1.0, np.nan, 1.0]) + df = pd.DataFrame(dict(key=data, val=range(3))) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +def test_quantile_missing_group_values_correct_results(): + # GH 28662 + data = np.array([1.0, np.nan, 3.0, np.nan]) + df = pd.DataFrame(dict(key=data, val=range(4))) + + result = df.groupby("key").quantile() + expected = pd.DataFrame( + [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + ) + tm.assert_frame_equal(result, expected) + + +# pipe +# -------------------------------- + + +def test_pipe(): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": random_state.randn(8), + "C": random_state.randn(8), + } + ) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby("A").pipe(f).pipe(square) + + index = Index(["bar", "foo"], dtype="object", name="A") + expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) + + tm.assert_series_equal(expected, result) + + +def test_pipe_args(): + # Test passing args to the pipe method of DataFrameGroupBy. 
+ # Issue #17871 + + df = pd.DataFrame( + { + "group": ["A", "A", "B", "B", "C"], + "x": [1.0, 2.0, 3.0, 2.0, 5.0], + "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], + } + ) + + def f(dfgb, arg1): + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( + dfgb.grouper + ) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) + + # Assert the results here + index = pd.Index(["A", "B", "C"], name="group") + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) + + tm.assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + tm.assert_series_equal(result, expected) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_groupby.py b/venv/Lib/site-packages/pandas/tests/groupby/test_groupby.py new file mode 100644 index 0000000..7e37481 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_groupby.py @@ -0,0 +1,2032 @@ +from datetime import datetime +from decimal import Decimal +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +import pandas._testing as tm +from pandas.core.base import SpecificationError +import pandas.core.common as com + + +def test_repr(): + # GH18203 + result = repr(pd.Grouper(key="A", level="B")) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected + + +@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) +def test_basic(dtype): + + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + for k, v in grouped: + assert len(v) == 3 + + agged = grouped.aggregate(np.mean) + assert agged[1] == 1 + + tm.assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + tm.assert_series_equal(agged, grouped.mean()) + tm.assert_series_equal(grouped.agg(np.sum), grouped.sum()) + + expected = grouped.apply(lambda x: x * x.sum()) + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + tm.assert_series_equal(transformed, expected) + + value_grouped = data.groupby(data) + tm.assert_series_equal( + value_grouped.aggregate(np.mean), agged, check_index_type=False + ) + + # complex agg + agged = grouped.aggregate([np.mean, np.std]) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate({"one": np.mean, "two": np.std}) + + group_constants = {0: 10, 1: 20, 2: 30} + agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) + assert agged[1] == 21 + + # corner cases + msg = "Must produce aggregated value" + # exception raised is type Exception + with pytest.raises(Exception, match=msg): + grouped.aggregate(lambda x: x * 2) + + +def test_groupby_nonobject_dtype(mframe, 
df_mixed_floats): + key = mframe.index.codes[0] + grouped = mframe.groupby(key) + result = grouped.sum() + + expected = mframe.groupby(key.astype("O")).sum() + tm.assert_frame_equal(result, expected) + + # GH 3911, mixed frame non-conversion + df = df_mixed_floats.copy() + df["value"] = range(len(df)) + + def max_value(group): + return group.loc[group["value"].idxmax()] + + applied = df.groupby("A").apply(max_value) + result = applied.dtypes + expected = Series( + [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], + index=["A", "B", "C", "D", "value"], + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_return_type(): + + # GH2893, return a reduced type + df1 = DataFrame( + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12}, + ] + ) + + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df1.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) + + df2 = DataFrame( + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12}, + ] + ) + + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df2.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) + + # GH3596, return a consistent type (regression in 0.11 from 0.10.1) + df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"]) + result = df.groupby("X", squeeze=False).count() + assert isinstance(result, DataFrame) + + +def test_inconsistent_return_type(): + # GH5592 + # inconsistent return type + df = DataFrame( + dict( + A=["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], + B=Series(np.arange(7), dtype="int64"), + C=date_range("20130101", periods=7), + ) + ) + + def f(grp): + return grp.iloc[0] + + expected = df.groupby("A").first()[["B"]] + result = df.groupby("A").apply(f)[["B"]] + tm.assert_frame_equal(result, expected) + + def f(grp): + if grp.name == "Tiger": + return None + return grp.iloc[0] + + result = df.groupby("A").apply(f)[["B"]] + e = expected.copy() + e.loc["Tiger"] = np.nan + tm.assert_frame_equal(result, e) + + def f(grp): + if grp.name == "Pony": + return None + return grp.iloc[0] + + result = df.groupby("A").apply(f)[["B"]] + e = expected.copy() + e.loc["Pony"] = np.nan + tm.assert_frame_equal(result, e) + + # 5592 revisited, with datetimes + def f(grp): + if grp.name == "Pony": + return None + return grp.iloc[0] + + result = df.groupby("A").apply(f)[["C"]] + e = df.groupby("A").first()[["C"]] + e.loc["Pony"] = pd.NaT + tm.assert_frame_equal(result, e) + + # scalar outputs + def f(grp): + if grp.name == "Pony": + return None + return grp.iloc[0].loc["C"] + + result = df.groupby("A").apply(f) + e = df.groupby("A").first()["C"].copy() + e.loc["Pony"] = np.nan + e.name = None + tm.assert_series_equal(result, e) + + +def test_pass_args_kwargs(ts, tsframe): + def f(x, q=None, axis=0): + return np.percentile(x, q, axis=axis) + + g = lambda x: np.percentile(x, 80, axis=0) + + # Series + ts_grouped = ts.groupby(lambda x: x.month) + agg_result = ts_grouped.agg(np.percentile, 80, axis=0) + apply_result = ts_grouped.apply(np.percentile, 80, axis=0) + trans_result = ts_grouped.transform(np.percentile, 80, axis=0) + + agg_expected = ts_grouped.quantile(0.8) + trans_expected = ts_grouped.transform(g) + + tm.assert_series_equal(apply_result, agg_expected) + tm.assert_series_equal(agg_result, agg_expected) + tm.assert_series_equal(trans_result, trans_expected) + + agg_result = 
ts_grouped.agg(f, q=80) + apply_result = ts_grouped.apply(f, q=80) + trans_result = ts_grouped.transform(f, q=80) + tm.assert_series_equal(agg_result, agg_expected) + tm.assert_series_equal(apply_result, agg_expected) + tm.assert_series_equal(trans_result, trans_expected) + + # DataFrame + df_grouped = tsframe.groupby(lambda x: x.month) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) + tm.assert_frame_equal(apply_result, expected, check_names=False) + tm.assert_frame_equal(agg_result, expected) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + tm.assert_frame_equal(agg_result, expected) + tm.assert_frame_equal(apply_result, expected, check_names=False) + + +def test_len(): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) + assert len(grouped) == len(df) + + grouped = df.groupby([lambda x: x.year, lambda x: x.month]) + expected = len({(x.year, x.month) for x in df.index}) + assert len(grouped) == expected + + # issue 11016 + df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) + assert len(df.groupby(("a"))) == 0 + assert len(df.groupby(("b"))) == 3 + assert len(df.groupby(["a", "b"])) == 3 + + +def test_basic_regression(): + # regression + result = Series([1.0 * x for x in list(range(1, 10)) * 10]) + + data = np.random.random(1100) * 10.0 + groupings = Series(data) + + grouped = result.groupby(groupings) + grouped.mean() + + +@pytest.mark.parametrize( + "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"] +) +def test_with_na_groups(dtype): + index = Index(np.arange(10)) + values = Series(np.ones(10), index, dtype=dtype) + labels = Series( + [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"], + index=index, + ) + + # this SHOULD be an int + grouped = values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=["bar", "foo"]) + + tm.assert_series_equal(agged, expected, check_dtype=False) + + # assert issubclass(agged.dtype.type, np.integer) + + # explicitly return a float from my function + def f(x): + return float(len(x)) + + agged = grouped.agg(f) + expected = Series([4, 2], index=["bar", "foo"]) + + tm.assert_series_equal(agged, expected, check_dtype=False) + assert issubclass(agged.dtype.type, np.dtype(dtype).type) + + +def test_indices_concatenation_order(): + + # GH 2808 + + def f1(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"]) + res = DataFrame(columns=["a"], index=multiindex) + return res + else: + y = y.set_index(["b", "c"]) + return y + + def f2(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + return DataFrame() + else: + y = y.set_index(["b", "c"]) + return y + + def f3(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex( + levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"] + ) + res = DataFrame(columns=["a", "b"], index=multiindex) + return res + else: + return y + + df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)}) + + df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) + + # correct result + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) + tm.assert_frame_equal(result1, result2) + + # should fail (not the same number of levels) + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, 
match=msg): + df.groupby("a").apply(f2) + with pytest.raises(AssertionError, match=msg): + df2.groupby("a").apply(f2) + + # should fail (incorrect shape) + with pytest.raises(AssertionError, match=msg): + df.groupby("a").apply(f3) + with pytest.raises(AssertionError, match=msg): + df2.groupby("a").apply(f3) + + +def test_attr_wrapper(ts): + grouped = ts.groupby(lambda x: x.weekday()) + + result = grouped.std() + expected = grouped.agg(lambda x: np.std(x, ddof=1)) + tm.assert_series_equal(result, expected) + + # this is pretty cool + result = grouped.describe() + expected = {name: gp.describe() for name, gp in grouped} + expected = DataFrame(expected).T + tm.assert_frame_equal(result, expected) + + # get attribute + result = grouped.dtype + expected = grouped.agg(lambda x: x.dtype) + + # make sure raises error + msg = "'SeriesGroupBy' object has no attribute 'foo'" + with pytest.raises(AttributeError, match=msg): + getattr(grouped, "foo") + + +def test_frame_groupby(tsframe): + grouped = tsframe.groupby(lambda x: x.weekday()) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == 5 + assert len(aggregated.columns) == 4 + + # by string + tscopy = tsframe.copy() + tscopy["weekday"] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby("weekday").aggregate(np.mean) + tm.assert_frame_equal(stragged, aggregated, check_names=False) + + # transform + grouped = tsframe.head(30).groupby(lambda x: x.weekday()) + transformed = grouped.transform(lambda x: x - x.mean()) + assert len(transformed) == 30 + assert len(transformed.columns) == 4 + + # transform propagate + transformed = grouped.transform(lambda x: x.mean()) + for name, group in grouped: + mean = group.mean() + for idx in group.index: + tm.assert_series_equal(transformed.xs(idx), mean, check_names=False) + + # iterate + for weekday, group in grouped: + assert group.index[0].weekday() == weekday + + # groups / group_indices + groups = grouped.groups + indices = grouped.indices + + for k, v in groups.items(): + samething = tsframe.index.take(indices[k]) + assert (samething == v).all() + + +def test_frame_groupby_columns(tsframe): + mapping = {"A": 0, "B": 0, "C": 1, "D": 1} + grouped = tsframe.groupby(mapping, axis=1) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == len(tsframe) + assert len(aggregated.columns) == 2 + + # transform + tf = lambda x: x - x.mean() + groupedT = tsframe.T.groupby(mapping, axis=0) + tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) + + # iterate + for k, v in grouped: + assert len(v.columns) == 2 + + +def test_frame_set_name_single(df): + grouped = df.groupby("A") + + result = grouped.mean() + assert result.index.name == "A" + + result = df.groupby("A", as_index=False).mean() + assert result.index.name != "A" + + result = grouped.agg(np.mean) + assert result.index.name == "A" + + result = grouped.agg({"C": np.mean, "D": np.std}) + assert result.index.name == "A" + + result = grouped["C"].mean() + assert result.index.name == "A" + result = grouped["C"].agg(np.mean) + assert result.index.name == "A" + result = grouped["C"].agg([np.mean, np.std]) + assert result.index.name == "A" + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"foo": np.mean, "bar": np.std}) + + +def test_multi_func(df): + col1 = df["A"] + col2 = df["B"] + + grouped = df.groupby([col1.get, col2.get]) + agged = grouped.mean() + expected = df.groupby(["A", "B"]).mean() + + # TODO groupby 
get drops names + tm.assert_frame_equal( + agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False + ) + + # some "groups" with no data + df = DataFrame( + { + "v1": np.random.randn(6), + "v2": np.random.randn(6), + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + }, + index=["one", "two", "three", "four", "five", "six"], + ) + # only verify that it works for now + grouped = df.groupby(["k1", "k2"]) + grouped.agg(np.sum) + + +def test_multi_key_multiple_functions(df): + grouped = df.groupby(["A", "B"])["C"] + + agged = grouped.agg([np.mean, np.std]) + expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)}) + tm.assert_frame_equal(agged, expected) + + +def test_frame_multi_key_function_list(): + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + grouped = data.groupby(["A", "B"]) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) + assert isinstance(agged.index, MultiIndex) + assert isinstance(expected.index, MultiIndex) + tm.assert_frame_equal(agged, expected) + + +@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()]) +def test_groupby_multiple_columns(df, op): + data = df + grouped = data.groupby(["A", "B"]) + + result1 = op(grouped) + + keys = [] + values = [] + for n1, gp1 in data.groupby("A"): + for n2, gp2 in gp1.groupby("B"): + keys.append((n1, n2)) + values.append(op(gp2.loc[:, ["C", "D"]])) + + mi = MultiIndex.from_tuples(keys, names=["A", "B"]) + expected = pd.concat(values, axis=1).T + expected.index = mi + + # a little bit crude + for col in ["C", "D"]: + result_col = op(grouped[col]) + pivoted = result1[col] + exp = expected[col] + tm.assert_series_equal(result_col, exp) + tm.assert_series_equal(pivoted, exp) + + # test single series works the same + result = data["C"].groupby([data["A"], data["B"]]).mean() + expected = data.groupby(["A", "B"]).mean()["C"] + + tm.assert_series_equal(result, expected) + + +def test_as_index_select_column(): + # GH 5764 + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + result = df.groupby("A", as_index=False)["B"].get_group(1) + expected = pd.Series([2, 4], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + expected = pd.Series( + [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_as_index_agg(df): + grouped = df.groupby("A", as_index=False) + + # single-key + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + result2 = grouped.agg({"C": np.mean, "D": np.sum}) + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] + tm.assert_frame_equal(result2, expected2) + + grouped = df.groupby("A", as_index=True) + + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + 
grouped["C"].agg({"Q": np.sum}) + + # multi-key + + grouped = df.groupby(["A", "B"], as_index=False) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + result2 = grouped.agg({"C": np.mean, "D": np.sum}) + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] + tm.assert_frame_equal(result2, expected2) + + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) + result3 = grouped["C"].agg({"Q": np.sum}) + tm.assert_frame_equal(result3, expected3) + + # GH7115 & GH8112 & GH8582 + df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"]) + ts = Series(np.random.randint(5, 10, 50), name="jim") + + gr = df.groupby(ts) + gr.nth(0) # invokes set_selection_from_grouper internally + tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) + + for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: + gr = df.groupby(ts, as_index=False) + left = getattr(gr, attr)() + + gr = df.groupby(ts.values, as_index=True) + right = getattr(gr, attr)().reset_index(drop=True) + + tm.assert_frame_equal(left, right) + + +def test_as_index_series_return_frame(df): + grouped = df.groupby("A", as_index=False) + grouped2 = df.groupby(["A", "B"], as_index=False) + + result = grouped["C"].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ["A", "C"]] + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + result2 = grouped2["C"].agg(np.sum) + expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]] + assert isinstance(result2, DataFrame) + tm.assert_frame_equal(result2, expected2) + + result = grouped["C"].sum() + expected = grouped.sum().loc[:, ["A", "C"]] + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + result2 = grouped2["C"].sum() + expected2 = grouped2.sum().loc[:, ["A", "B", "C"]] + assert isinstance(result2, DataFrame) + tm.assert_frame_equal(result2, expected2) + + +def test_as_index_series_column_slice_raises(df): + # GH15072 + grouped = df.groupby("A", as_index=False) + msg = r"Column\(s\) C already selected" + + with pytest.raises(IndexError, match=msg): + grouped["C"].__getitem__("D") + + +def test_groupby_as_index_cython(df): + data = df + + # single-key + grouped = data.groupby("A", as_index=False) + result = grouped.mean() + expected = data.groupby(["A"]).mean() + expected.insert(0, "A", expected.index) + expected.index = np.arange(len(expected)) + tm.assert_frame_equal(result, expected) + + # multi-key + grouped = data.groupby(["A", "B"], as_index=False) + result = grouped.mean() + expected = data.groupby(["A", "B"]).mean() + + arrays = list(zip(*expected.index.values)) + expected.insert(0, "A", arrays[0]) + expected.insert(1, "B", arrays[1]) + expected.index = np.arange(len(expected)) + tm.assert_frame_equal(result, expected) + + +def test_groupby_as_index_series_scalar(df): + grouped = df.groupby(["A", "B"], as_index=False) + + # GH #421 + + result = grouped["C"].agg(len) + expected = grouped.agg(len).loc[:, ["A", "B", "C"]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_as_index_corner(df, ts): + msg = "as_index=False only valid with DataFrame" + with pytest.raises(TypeError, match=msg): + ts.groupby(lambda x: x.weekday(), as_index=False) + + msg = "as_index=False only valid for axis=0" + with pytest.raises(ValueError, match=msg): + df.groupby(lambda x: x.lower(), as_index=False, axis=1) + + +def test_groupby_multiple_key(df): + df = tm.makeTimeDataFrame() + grouped = 
df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) + agged = grouped.sum() + tm.assert_almost_equal(df.values, agged.values) + + grouped = df.T.groupby( + [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1 + ) + + agged = grouped.agg(lambda x: x.sum()) + tm.assert_index_equal(agged.index, df.columns) + tm.assert_almost_equal(df.T.values, agged.values) + + agged = grouped.agg(lambda x: x.sum()) + tm.assert_almost_equal(df.T.values, agged.values) + + +def test_groupby_multi_corner(df): + # test that having an all-NA column doesn't mess you up + df = df.copy() + df["bad"] = np.nan + agged = df.groupby(["A", "B"]).mean() + + expected = df.groupby(["A", "B"]).mean() + expected["bad"] = np.nan + + tm.assert_frame_equal(agged, expected) + + +def test_omit_nuisance(df): + grouped = df.groupby("A") + + result = grouped.mean() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() + tm.assert_frame_equal(result, expected) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + tm.assert_frame_equal(agged, exp) + + df = df.loc[:, ["A", "C", "D"]] + df["E"] = datetime.now() + grouped = df.groupby("A") + result = grouped.agg(np.sum) + expected = grouped.sum() + tm.assert_frame_equal(result, expected) + + # won't work with axis = 1 + grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) + msg = "reduction operation 'sum' not allowed for this dtype" + with pytest.raises(TypeError, match=msg): + grouped.agg(lambda x: x.sum(0, numeric_only=False)) + + +def test_omit_nuisance_python_multiple(three_group): + grouped = three_group.groupby(["A", "B"]) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + tm.assert_frame_equal(agged, exp) + + +def test_empty_groups_corner(mframe): + # handle empty groups + df = DataFrame( + { + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + "k3": ["foo", "bar"] * 3, + "v1": np.random.randn(6), + "v2": np.random.randn(6), + } + ) + + grouped = df.groupby(["k1", "k2"]) + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped = mframe[3:5].groupby(level=0) + agged = grouped.apply(lambda x: x.mean()) + agged_A = grouped["A"].apply(np.mean) + tm.assert_series_equal(agged["A"], agged_A) + assert agged.index.name == "first" + + +def test_nonsense_func(): + df = DataFrame([0]) + msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" + with pytest.raises(TypeError, match=msg): + df.groupby(lambda x: x + "foo") + + +def test_wrap_aggregated_output_multindex(mframe): + df = mframe.T + df["baz", "two"] = "peekaboo" + + keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] + agged = df.groupby(keys).agg(np.mean) + assert isinstance(agged.columns, MultiIndex) + + def aggfun(ser): + if ser.name == ("foo", "one"): + raise TypeError + else: + return ser.sum() + + agged2 = df.groupby(keys).aggregate(aggfun) + assert len(agged2.columns) + 1 == len(df.columns) + + +def test_groupby_level_apply(mframe): + + result = mframe.groupby(level=0).count() + assert result.index.name == "first" + result = mframe.groupby(level=1).count() + assert result.index.name == "second" + + result = mframe["A"].groupby(level=0).count() + assert result.index.name == "first" + + +def test_groupby_level_mapper(mframe): + deleveled = mframe.reset_index() + + mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} + mapper1 = {"one": 0, "two": 0, "three": 1} + + result0 = mframe.groupby(mapper0, level=0).sum() + result1 = mframe.groupby(mapper1, level=1).sum() + + 
mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]]) + expected0 = mframe.groupby(mapped_level0).sum() + expected1 = mframe.groupby(mapped_level1).sum() + expected0.index.name, expected1.index.name = "first", "second" + + tm.assert_frame_equal(result0, expected0) + tm.assert_frame_equal(result1, expected1) + + +def test_groupby_level_nonmulti(): + # GH 1313, GH 13901 + s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) + + result = s.groupby(level=0).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=[0]).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=-1).sum() + tm.assert_series_equal(result, expected) + result = s.groupby(level=[-1]).sum() + tm.assert_series_equal(result, expected) + + msg = "level > 0 or level < -1 only valid with MultiIndex" + with pytest.raises(ValueError, match=msg): + s.groupby(level=1) + with pytest.raises(ValueError, match=msg): + s.groupby(level=-2) + msg = "No group keys passed!" + with pytest.raises(ValueError, match=msg): + s.groupby(level=[]) + msg = "multiple levels only valid with MultiIndex" + with pytest.raises(ValueError, match=msg): + s.groupby(level=[0, 0]) + with pytest.raises(ValueError, match=msg): + s.groupby(level=[0, 1]) + msg = "level > 0 or level < -1 only valid with MultiIndex" + with pytest.raises(ValueError, match=msg): + s.groupby(level=[1]) + + +def test_groupby_complex(): + # GH 12902 + a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) + expected = Series((1 + 2j, 5 + 10j)) + + result = a.groupby(level=0).sum() + tm.assert_series_equal(result, expected) + + result = a.sum(level=0) + tm.assert_series_equal(result, expected) + + +def test_mutate_groups(): + + # GH3380 + + df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "cat3": [f"g{x}" for x in range(1, 15)], + "val": np.random.randint(100, size=14), + } + ) + + def f_copy(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + def f_no_copy(x): + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + grpby_copy = df.groupby("cat1").apply(f_copy) + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + tm.assert_series_equal(grpby_copy, grpby_no_copy) + + +def test_no_mutate_but_looks_like(): + + # GH 8467 + # first show's mutation indicator + # second does not, but should yield the same results + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) + + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + tm.assert_series_equal(result1, result2) + + +def test_groupby_series_indexed_differently(): + s1 = Series( + [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], + index=Index(["a", "b", "c", "d", "e", "f", "g"]), + ) + s2 = Series( + [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"]) + ) + + grouped = s1.groupby(s2) + agged = grouped.mean() + exp = s1.groupby(s2.reindex(s1.index).get).mean() + tm.assert_series_equal(agged, exp) + + +def test_groupby_with_hier_columns(): + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + 
) + index = MultiIndex.from_tuples(tuples) + columns = MultiIndex.from_tuples( + [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")] + ) + df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) + + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).mean() + tm.assert_index_equal(result.index, df.index) + + result = df.groupby(level=0).agg(np.mean) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0).apply(lambda x: x.mean()) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + tm.assert_index_equal(result.columns, Index(["A", "B"])) + tm.assert_index_equal(result.index, df.index) + + # add a nuisance column + sorted_columns, _ = columns.sortlevel(0) + df["A", "foo"] = "bar" + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, df.columns[:-1]) + + +def test_grouping_ndarray(df): + grouped = df.groupby(df["A"].values) + + result = grouped.sum() + expected = df.groupby("A").sum() + tm.assert_frame_equal( + result, expected, check_names=False + ) # Note: no names when grouping by value + + +def test_groupby_wrong_multi_labels(): + data = """index,foo,bar,baz,spam,data +0,foo1,bar1,baz1,spam2,20 +1,foo1,bar2,baz1,spam3,30 +2,foo2,bar2,baz1,spam2,40 +3,foo1,bar1,baz2,spam1,50 +4,foo3,bar1,baz2,spam1,60""" + + data = read_csv(StringIO(data), index_col=0) + + grouped = data.groupby(["foo", "bar", "baz", "spam"]) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + +def test_groupby_series_with_name(df): + result = df.groupby(df["A"]).mean() + result2 = df.groupby(df["A"], as_index=False).mean() + assert result.index.name == "A" + assert "A" in result2 + + result = df.groupby([df["A"], df["B"]]).mean() + result2 = df.groupby([df["A"], df["B"]], as_index=False).mean() + assert result.index.names == ("A", "B") + assert "A" in result2 + assert "B" in result2 + + +def test_seriesgroupby_name_attr(df): + # GH 6265 + result = df.groupby("A")["C"] + assert result.count().name == "C" + assert result.mean().name == "C" + + testFunc = lambda x: np.sum(x) * 2 + assert result.agg(testFunc).name == "C" + + +def test_consistency_name(): + # GH 12363 + + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + expected = df.groupby(["A"]).B.count() + result = df.B.groupby(df.A).count() + tm.assert_series_equal(result, expected) + + +def test_groupby_name_propagation(df): + # GH 6124 + def summarize(df, name=None): + return Series({"count": 1, "mean": 2, "omissions": 3}, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. 
+ return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + + metrics = df.groupby("A").apply(summarize) + assert metrics.columns.name is None + metrics = df.groupby("A").apply(summarize, "metrics") + assert metrics.columns.name == "metrics" + metrics = df.groupby("A").apply(summarize_random_name) + assert metrics.columns.name is None + + +def test_groupby_nonstring_columns(): + df = DataFrame([np.arange(10) for x in range(10)]) + grouped = df.groupby(0) + result = grouped.mean() + expected = df.groupby(df[0]).mean() + tm.assert_frame_equal(result, expected) + + +def test_groupby_mixed_type_columns(): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=["A", "B", 0]) + expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A")) + + result = df.groupby("A").first() + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").sum() + tm.assert_frame_equal(result, expected) + + +# TODO: Ensure warning isn't emitted in the first place +@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning") +def test_cython_grouper_series_bug_noncontig(): + arr = np.empty((100, 100)) + arr.fill(np.nan) + obj = Series(arr[:, 0]) + inds = np.tile(range(10), 10) + + result = obj.groupby(inds).agg(Series.median) + assert result.isna().all() + + +def test_series_grouper_noncontig_index(): + index = Index(tm.rands_array(10, 100)) + + values = Series(np.random.randn(50), index=index[::2]) + labels = np.random.randint(0, 5, 50) + + # it works! + grouped = values.groupby(labels) + + # accessing the index elements causes segfault + f = lambda x: len(set(map(id, x.index))) + grouped.agg(f) + + +def test_convert_objects_leave_decimal_alone(): + + s = Series(range(5)) + labels = np.array(["a", "b", "c", "d", "e"], dtype="O") + + def convert_fast(x): + return Decimal(str(x.mean())) + + def convert_force_pure(x): + # base will be length 0 + assert len(x.values.base) > 0 + return Decimal(str(x.mean())) + + grouped = s.groupby(labels) + + result = grouped.agg(convert_fast) + assert result.dtype == np.object_ + assert isinstance(result[0], Decimal) + + result = grouped.agg(convert_force_pure) + assert result.dtype == np.object_ + assert isinstance(result[0], Decimal) + + +def test_groupby_dtype_inference_empty(): + # GH 6733 + df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")}) + assert df["x"].dtype == np.float64 + + result = df.groupby("x").first() + exp_index = Index([], name="x", dtype=np.float64) + expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")}) + tm.assert_frame_equal(result, expected, by_blocks=True) + + +def test_groupby_list_infer_array_like(df): + result = df.groupby(list(df["A"])).mean() + expected = df.groupby(df["A"]).mean() + tm.assert_frame_equal(result, expected, check_names=False) + + with pytest.raises(KeyError, match=r"^'foo'$"): + df.groupby(list(df["A"][:-1])) + + # pathological case of ambiguity + df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)}) + + result = df.groupby(["foo", "bar"]).mean() + expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]] + + +def test_groupby_keys_same_size_as_index(): + # GH 11185 + freq = "s" + index = pd.date_range( + start=pd.Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq + ) + df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) + result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean() + expected = df.set_index([df.index, "metric"]) + + tm.assert_frame_equal(result, 
expected) + + +def test_groupby_one_row(): + # GH 11741 + msg = r"^'Z'$" + df1 = pd.DataFrame(np.random.randn(1, 4), columns=list("ABCD")) + with pytest.raises(KeyError, match=msg): + df1.groupby("Z") + df2 = pd.DataFrame(np.random.randn(2, 4), columns=list("ABCD")) + with pytest.raises(KeyError, match=msg): + df2.groupby("Z") + + +def test_groupby_nat_exclude(): + # GH 6992 + df = pd.DataFrame( + { + "values": np.random.randn(8), + "dt": [ + np.nan, + pd.Timestamp("2013-01-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-01-01"), + ], + "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"], + } + ) + grouped = df.groupby("dt") + + expected = [pd.Index([1, 7]), pd.Index([3, 5])] + keys = sorted(grouped.groups.keys()) + assert len(keys) == 2 + for k, e in zip(keys, expected): + # grouped.groups keys are np.datetime64 with system tz + # not to be affected by tz, only compare values + tm.assert_index_equal(grouped.groups[k], e) + + # confirm obj is not filtered + tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + assert grouped.ngroups == 2 + + expected = { + Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.int64), + Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.int64), + } + + for k in grouped.indices: + tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) + + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) + + with pytest.raises(KeyError, match=r"^NaT$"): + grouped.get_group(pd.NaT) + + nan_df = DataFrame( + {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} + ) + assert nan_df["nan"].dtype == "float64" + assert nan_df["nat"].dtype == "datetime64[ns]" + + for key in ["nan", "nat"]: + grouped = nan_df.groupby(key) + assert grouped.groups == {} + assert grouped.ngroups == 0 + assert grouped.indices == {} + with pytest.raises(KeyError, match=r"^nan$"): + grouped.get_group(np.nan) + with pytest.raises(KeyError, match=r"^NaT$"): + grouped.get_group(pd.NaT) + + +def test_groupby_2d_malformed(): + d = DataFrame(index=range(2)) + d["group"] = ["g1", "g2"] + d["zeros"] = [0, 0] + d["ones"] = [1, 1] + d["label"] = ["l1", "l2"] + tmp = d.groupby(["group"]).mean() + res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_numpy_array_equal(tmp.values, res_values) + + +def test_int32_overflow(): + B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000))) + A = np.arange(25000) + df = DataFrame({"A": A, "B": B, "C": A, "D": B, "E": np.random.randn(25000)}) + + left = df.groupby(["A", "B", "C", "D"]).sum() + right = df.groupby(["D", "C", "B", "A"]).sum() + assert len(left) == len(right) + + +def test_groupby_sort_multi(): + df = DataFrame( + { + "a": ["foo", "bar", "baz"], + "b": [3, 2, 1], + "c": [0, 1, 2], + "d": np.random.randn(3), + } + ) + + tups = [tuple(row) for row in df[["a", "b", "c"]].values] + tups = com.asarray_tuplesafe(tups) + result = df.groupby(["a", "b", "c"], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) + + tups = [tuple(row) for row in df[["c", "a", "b"]].values] + tups = com.asarray_tuplesafe(tups) + result = df.groupby(["c", "a", "b"], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups) + + tups = [tuple(x) for x in df[["b", "c", "a"]].values] + tups = com.asarray_tuplesafe(tups) + 
result = df.groupby(["b", "c", "a"], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) + + df = DataFrame( + {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)} + ) + grouped = df.groupby(["a", "b"])["d"] + result = grouped.sum() + + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = [tuple(row) for row in df[keys].values] + tups = com.asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in expected.items(): + assert result[k] == v + + _check_groupby(df, result, ["a", "b"], "d") + + +def test_dont_clobber_name_column(): + df = DataFrame( + {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} + ) + + result = df.groupby("key").apply(lambda x: x) + tm.assert_frame_equal(result, df) + + +def test_skip_group_keys(): + + tsf = tm.makeTimeDataFrame() + + grouped = tsf.groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values(by="A")[:3]) + + pieces = [group.sort_values(by="A")[:3] for key, group in grouped] + + expected = pd.concat(pieces) + tm.assert_frame_equal(result, expected) + + grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values()[:3]) + + pieces = [group.sort_values()[:3] for key, group in grouped] + + expected = pd.concat(pieces) + tm.assert_series_equal(result, expected) + + +def test_no_nonsense_name(float_frame): + # GH #995 + s = float_frame["C"].copy() + s.name = None + + result = s.groupby(float_frame["A"]).agg(np.sum) + assert result.name is None + + +def test_multifunc_sum_bug(): + # GH #1065 + x = DataFrame(np.arange(9).reshape(3, 3)) + x["test"] = 0 + x["fl"] = [1.3, 1.5, 1.6] + + grouped = x.groupby("test") + result = grouped.agg({"fl": "sum", 2: "size"}) + assert result["fl"].dtype == np.float64 + + +def test_handle_dict_return_value(df): + def f(group): + return {"max": group.max(), "min": group.min()} + + def g(group): + return Series({"max": group.max(), "min": group.min()}) + + result = df.groupby("A")["C"].apply(f) + expected = df.groupby("A")["C"].apply(g) + + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) +def test_set_group_name(df, grouper): + def f(group): + assert group.name is not None + return group + + def freduce(group): + assert group.name is not None + return group.sum() + + def foo(x): + return freduce(x) + + grouped = df.groupby(grouper) + + # make sure all these work + grouped.apply(f) + grouped.aggregate(freduce) + grouped.aggregate({"C": freduce, "D": freduce}) + grouped.transform(f) + + grouped["C"].apply(f) + grouped["C"].aggregate(freduce) + grouped["C"].aggregate([freduce, foo]) + grouped["C"].transform(f) + + +def test_group_name_available_in_inference_pass(): + # gh-15062 + df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) + + names = [] + + def f(group): + names.append(group.name) + return group.copy() + + df.groupby("a", sort=False, group_keys=False).apply(f) + + expected_names = [0, 1, 2] + assert names == expected_names + + +def test_no_dummy_key_names(df): + # see gh-1291 + result = df.groupby(df["A"].values).sum() + assert result.index.name is None + + result = df.groupby([df["A"].values, df["B"].values]).sum() + assert result.index.names == (None, None) + + +def test_groupby_sort_multiindex_series(): + # series multiindex groupby sort argument was not being passed through + # _compress_group_index + # GH 9444 + index = 
MultiIndex( + levels=[[1, 2], [1, 2]], + codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=["a", "b"], + ) + mseries = Series([0, 1, 2, 3, 4, 5], index=index) + index = MultiIndex( + levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"] + ) + mseries_result = Series([0, 2, 4], index=index) + + result = mseries.groupby(level=["a", "b"], sort=False).first() + tm.assert_series_equal(result, mseries_result) + result = mseries.groupby(level=["a", "b"], sort=True).first() + tm.assert_series_equal(result, mseries_result.sort_index()) + + +def test_groupby_reindex_inside_function(): + + periods = 1000 + ind = date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind) + + def agg_before(hour, func, fix=False): + """ + Run an aggregate func on the subset of data. + """ + + def _func(data): + d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna() + if fix: + data[data.index[0]] + if len(d) == 0: + return None + return func(d) + + return _func + + def afunc(data): + d = data.select(lambda x: x.hour < 11).dropna() + return np.max(d) + + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + closure_bad = grouped.agg({"high": agg_before(11, np.max)}) + closure_good = grouped.agg({"high": agg_before(11, np.max, True)}) + + tm.assert_frame_equal(closure_bad, closure_good) + + +def test_groupby_multiindex_missing_pair(): + # GH9049 + df = DataFrame( + { + "group1": ["a", "a", "a", "b"], + "group2": ["c", "c", "d", "c"], + "value": [1, 1, 1, 5], + } + ) + df = df.set_index(["group1", "group2"]) + df_grouped = df.groupby(level=["group1", "group2"], sort=True) + + res = df_grouped.agg("sum") + idx = MultiIndex.from_tuples( + [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"] + ) + exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"]) + + tm.assert_frame_equal(res, exp) + + +def test_groupby_multiindex_not_lexsorted(): + # GH 11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns.is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index="a", columns=["b", "c"], values="d" + ) + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns.is_lexsorted() + + # compare the results + tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) + + expected = lexsorted_df.groupby("a").mean() + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.groupby("a").mean() + tm.assert_frame_equal(expected, result) + + # a transforming function should work regardless of sort + # GH 14776 + df = DataFrame( + {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]} + ).set_index(["x", "y"]) + assert not df.index.is_lexsorted() + + for level in [0, 1, [0, 1]]: + for sort in [False, True]: + result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates) + expected = df + tm.assert_frame_equal(expected, result) + + result = ( + df.sort_index() + .groupby(level=level, sort=sort) + .apply(DataFrame.drop_duplicates) + ) + expected = df.sort_index() + tm.assert_frame_equal(expected, result) + + +def test_index_label_overlaps_location(): + # checking we don't have any 
label/location confusion in the + # the wake of GH5375 + df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1]) + g = df.groupby(list("ababb")) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + tm.assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list("ababb")) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + tm.assert_series_equal(actual, expected) + + # ... and again, with a generic Index of floats + df.index = df.index.astype(float) + g = df.groupby(list("ababb")) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + tm.assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list("ababb")) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + tm.assert_series_equal(actual, expected) + + +def test_transform_doesnt_clobber_ints(): + # GH 7972 + n = 6 + x = np.arange(n) + df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x}) + df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x}) + + gb = df.groupby("a") + result = gb.transform("mean") + + gb2 = df2.groupby("a") + expected = gb2.transform("mean") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "sort_column", + ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]], +) +@pytest.mark.parametrize( + "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]] +) +def test_groupby_preserves_sort(sort_column, group_column): + # Test to ensure that groupby always preserves sort order of original + # object. Issue #8588 and #9651 + + df = DataFrame( + { + "int_groups": [3, 1, 0, 1, 0, 3, 3, 3], + "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"], + "ints": [8, 7, 4, 5, 2, 9, 1, 1], + "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], + "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"], + } + ) + + # Try sorting on different types and with different group types + + df = df.sort_values(by=sort_column) + g = df.groupby(group_column) + + def test_sort(x): + tm.assert_frame_equal(x, x.sort_values(by=sort_column)) + + g.apply(test_sort) + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partially missing. 
+ df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0)[["Z"]] + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = pd.Timestamp.utcnow() + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_pivot_table_values_key_error(): + # This test is designed to replicate the error in issue #14938 + df = pd.DataFrame( + { + "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(), + "thename": range(0, 20), + } + ) + + df["year"] = df.set_index("eventDate").index.year + df["month"] = df.set_index("eventDate").index.month + + with pytest.raises(KeyError, match="'badname'"): + df.reset_index().pivot_table( + index="year", columns="month", values="badname", aggfunc="count" + ) + + +def test_empty_dataframe_groupby(): + # GH8093 + df = DataFrame(columns=["A", "B", "C"]) + + result = df.groupby("A").sum() + expected = DataFrame(columns=["B", "C"], dtype=np.float64) + expected.index.name = "A" + + tm.assert_frame_equal(result, expected) + + +def test_tuple_as_grouping(): + # https://github.com/pandas-dev/pandas/issues/18314 + df = pd.DataFrame( + { + ("a", "b"): [1, 1, 1, 1], + "a": [2, 2, 2, 2], + "b": [2, 2, 2, 2], + "c": [1, 1, 1, 1], + } + ) + + with pytest.raises(KeyError): + df[["a", "b", "c"]].groupby(("a", "b")) + + result = df.groupby(("a", "b"))["c"].sum() + expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b"))) + tm.assert_series_equal(result, expected) + + +def test_tuple_correct_keyerror(): + # https://github.com/pandas-dev/pandas/issues/18798 + df = pd.DataFrame( + 1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) + ) + with pytest.raises(KeyError, match=r"^\(7, 8\)$"): + df.groupby((7, 8)).mean() + + +def test_groupby_agg_ohlc_non_first(): + # GH 21716 + df = pd.DataFrame( + [[1], [1]], + columns=["foo"], + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + expected = pd.DataFrame( + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], + columns=pd.MultiIndex.from_tuples( + ( + ("foo", "sum", "foo"), + ("foo", "ohlc", "open"), + ("foo", "ohlc", "high"), + ("foo", "ohlc", "low"), + ("foo", "ohlc", "close"), + ) + ), + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + result = df.groupby(pd.Grouper(freq="D")).agg(["sum", "ohlc"]) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_multiindex_nat(): + # GH 9236 + values = [ + (pd.NaT, "a"), + (datetime(2012, 1, 2), "a"), + (datetime(2012, 1, 2), "b"), + (datetime(2012, 1, 3), "a"), + ] + mi = pd.MultiIndex.from_tuples(values, names=["date", None]) + ser = pd.Series([3, 2, 2.5, 4], index=mi) + + result = 
ser.groupby(level=1).mean() + expected = pd.Series([3.0, 2.5], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +def test_groupby_empty_list_raises(): + # GH 5289 + values = zip(range(10), range(10)) + df = DataFrame(values, columns=["apple", "b"]) + msg = "Grouper and axis must be same length" + with pytest.raises(ValueError, match=msg): + df.groupby([[]]) + + +def test_groupby_multiindex_series_keys_len_equal_group_axis(): + # GH 25704 + index_array = [["x", "x"], ["a", "b"], ["k", "k"]] + index_names = ["first", "second", "third"] + ri = pd.MultiIndex.from_arrays(index_array, names=index_names) + s = pd.Series(data=[1, 2], index=ri) + result = s.groupby(["first", "third"]).sum() + + index_array = [["x"], ["k"]] + index_names = ["first", "third"] + ei = pd.MultiIndex.from_arrays(index_array, names=index_names) + expected = pd.Series([3], index=ei) + + tm.assert_series_equal(result, expected) + + +def test_groupby_groups_in_BaseGrouper(): + # GH 26326 + # Test if DataFrame grouped with a pandas.Grouper has correct groups + mi = pd.MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"]) + df = pd.DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi) + result = df.groupby([pd.Grouper(level="alpha"), "beta"]) + expected = df.groupby(["alpha", "beta"]) + assert result.groups == expected.groups + + result = df.groupby(["beta", pd.Grouper(level="alpha")]) + expected = df.groupby(["beta", "alpha"]) + assert result.groups == expected.groups + + +@pytest.mark.parametrize("group_name", ["x", ["x"]]) +def test_groupby_axis_1(group_name): + # GH 27614 + df = pd.DataFrame( + np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20] + ) + df.index.name = "y" + df.columns.name = "x" + + results = df.groupby(group_name, axis=1).sum() + expected = df.T.groupby(group_name).sum().T + tm.assert_frame_equal(results, expected) + + # test on MI column + iterables = [["bar", "baz", "foo"], ["one", "two"]] + mi = pd.MultiIndex.from_product(iterables=iterables, names=["x", "x1"]) + df = pd.DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi) + results = df.groupby(group_name, axis=1).sum() + expected = df.T.groupby(group_name).sum().T + tm.assert_frame_equal(results, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ( + "shift", + { + "time": [ + None, + None, + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + ] + }, + ), + ( + "bfill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ( + "ffill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ], +) +def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): + # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill + tz = tz_naive_fixture + data = { + "id": ["A", "B", "A", "B", "A", "B"], + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ], + } + df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz)) + + grouped = df.groupby("id") + result = getattr(grouped, op)() + expected = 
DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) + tm.assert_frame_equal(result, expected) + + +def test_groupby_only_none_group(): + # see GH21624 + # this was crashing with "ValueError: Length of passed values is 1, index implies 0" + df = pd.DataFrame({"g": [None], "x": 1}) + actual = df.groupby("g")["x"].transform("sum") + expected = pd.Series([np.nan], name="x") + + tm.assert_series_equal(actual, expected) + + +def test_groupby_duplicate_index(): + # GH#29189 the groupby call here used to raise + ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + gb = ser.groupby(level=0) + + result = gb.mean() + expected = pd.Series([2, 5.5, 8], index=[2.0, 4.0, 5.0]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = pd.DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] +) +def test_dup_labels_output_shape(groupby_func, idx): + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip("Not applicable") + + df = pd.DataFrame([[1, 1]], columns=idx) + grp_by = df.groupby([0]) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(df) + elif groupby_func == "tshift": + df.index = [pd.Timestamp("today")] + args.extend([1, "D"]) + + result = getattr(grp_by, groupby_func)(*args) + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, idx) + + +def test_groupby_crash_on_nunique(axis): + # Fix following 30253 + df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) + + axis_number = df._get_axis_number(axis) + if not axis_number: + df = df.T + + result = df.groupby(axis=axis_number, level=0).nunique() + + expected = pd.DataFrame({"A": [1, 2], "D": [1, 1]}) + if not axis_number: + expected = expected.T + + tm.assert_frame_equal(result, expected) + + +def test_groupby_list_level(): + # GH 9790 + expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + result = expected.groupby(level=[0]).mean() + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_grouping.py b/venv/Lib/site-packages/pandas/tests/groupby/test_grouping.py new file mode 100644 index 0000000..70ba21d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_grouping.py @@ -0,0 +1,953 @@ +""" test where we are determining what we are grouping, or getting groups """ + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.core.groupby.grouper import Grouping + +# selection +# -------------------------------- + + +class TestSelection: + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=["A", "B"]) + g = df.groupby("A") + with pytest.raises(KeyError, match="\"Columns not found: 'C'\""): + g[["C"]] + + with pytest.raises(KeyError, match="^[^A]+$"): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! 
+ g[["A", "C"]] + + def test_groupby_duplicated_column_errormsg(self): + # GH7511 + df = DataFrame( + columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)] + ) + + msg = "Grouper for 'A' not 1-dimensional" + with pytest.raises(ValueError, match=msg): + df.groupby("A") + with pytest.raises(ValueError, match=msg): + df.groupby(["A", "B"]) + + grouped = df.groupby("B") + c = grouped.count() + assert c.columns.nlevels == 1 + assert c.columns.size == 3 + + def test_column_select_via_attr(self, df): + result = df.groupby("A").C.sum() + expected = df.groupby("A")["C"].sum() + tm.assert_series_equal(result, expected) + + df["mean"] = 1.5 + result = df.groupby("A").mean() + expected = df.groupby("A").agg(np.mean) + tm.assert_frame_equal(result, expected) + + def test_getitem_list_of_columns(self): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) + + result = df.groupby("A")[["C", "D"]].mean() + result2 = df.groupby("A")[df.columns[2:4]].mean() + + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_getitem_numeric_column_names(self): + # GH #13731 + df = DataFrame( + { + 0: list("abcd") * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8), + } + ) + result = df.groupby(0)[df.columns[1:3]].mean() + result2 = df.groupby(0)[[2, 4]].mean() + + expected = df.loc[:, [0, 2, 4]].groupby(0).mean() + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby(0)[2, 4].mean() + + def test_getitem_single_list_of_columns(self, df): + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby("A")["C", "D"].mean() + + def test_getitem_single_column(self): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) + + result = df.groupby("A")["C"].mean() + + as_frame = df.loc[:, ["A", "C"]].groupby("A").mean() + as_series = as_frame.iloc[:, 0] + expected = as_series + + tm.assert_series_equal(result, expected) + + +# grouping +# -------------------------------- + + +class TestGrouping: + def test_grouper_index_types(self): + # related GH5375 + # groupby misbehaving when using a Floatlike index + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB")) + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: + + df.index = index(len(df)) + df.groupby(list("abcde")).apply(lambda x: x) + + df.index = list(reversed(df.index.tolist())) + df.groupby(list("abcde")).apply(lambda x: x) + + def test_grouper_multilevel_freq(self): + + # GH 7885 + # with level and freq specified in a pd.Grouper + from datetime import date, timedelta + + d0 = date.today() - timedelta(days=14) + dates = date_range(d0, date.today()) + date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"]) + df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) + + # Check string level + expected = ( + 
df.reset_index() + .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")]) + .sum() + ) + # reset index changes columns dtype to object + expected.columns = pd.Index([0], dtype="int64") + + result = df.groupby( + [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")] + ).sum() + tm.assert_frame_equal(result, expected) + + # Check integer level + result = df.groupby( + [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")] + ).sum() + tm.assert_frame_equal(result, expected) + + def test_grouper_creation_bug(self): + + # GH 8795 + df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]}) + g = df.groupby("A") + expected = g.sum() + + g = df.groupby(pd.Grouper(key="A")) + result = g.sum() + tm.assert_frame_equal(result, expected) + + result = g.apply(lambda x: x.sum()) + tm.assert_frame_equal(result, expected) + + g = df.groupby(pd.Grouper(key="A", axis=0)) + result = g.sum() + tm.assert_frame_equal(result, expected) + + # GH14334 + # pd.Grouper(key=...) may be passed in a list + df = DataFrame( + {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]} + ) + # Group by single column + expected = df.groupby("A").sum() + g = df.groupby([pd.Grouper(key="A")]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + # Group by two columns + # using a combination of strings and Grouper objects + expected = df.groupby(["A", "B"]).sum() + + # Group with two Grouper objects + g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + # Group with a string and a Grouper object + g = df.groupby(["A", pd.Grouper(key="B")]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + # Group with a Grouper object and a string + g = df.groupby([pd.Grouper(key="A"), "B"]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + # GH8866 + s = Series( + np.arange(8, dtype="int64"), + index=pd.MultiIndex.from_product( + [list("ab"), range(2), date_range("20130101", periods=2)], + names=["one", "two", "three"], + ), + ) + result = s.groupby(pd.Grouper(level="three", freq="M")).sum() + expected = Series( + [28], index=Index([Timestamp("2013-01-31")], freq="M", name="three") + ) + tm.assert_series_equal(result, expected) + + # just specifying a level breaks + result = s.groupby(pd.Grouper(level="one")).sum() + expected = s.groupby(level="one").sum() + tm.assert_series_equal(result, expected) + + def test_grouper_column_and_index(self): + # GH 14327 + + # Grouping a multi-index frame by a column and an index level should + # be equivalent to resetting the index and grouping by two columns + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)] + ) + idx.names = ["outer", "inner"] + df_multi = pd.DataFrame( + {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, + index=idx, + ) + result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_multi.reset_index().groupby(["B", "inner"]).mean() + tm.assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_multi.reset_index().groupby(["inner", "B"]).mean() + tm.assert_frame_equal(result, expected) + + # Grouping a single-index frame by a column and the index should + # be equivalent to resetting the index and grouping by two columns + df_single = df_multi.reset_index("outer") + result = df_single.groupby(["B", 
pd.Grouper(level="inner")]).mean() + expected = df_single.reset_index().groupby(["B", "inner"]).mean() + tm.assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_single.reset_index().groupby(["inner", "B"]).mean() + tm.assert_frame_equal(result, expected) + + def test_groupby_levels_and_columns(self): + # GH9344, GH9049 + idx_names = ["x", "y"] + idx = pd.MultiIndex.from_tuples( + [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names + ) + df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) + + by_levels = df.groupby(level=idx_names).mean() + # reset_index changes columns dtype to object + by_columns = df.reset_index().groupby(idx_names).mean() + + tm.assert_frame_equal(by_levels, by_columns, check_column_type=False) + + by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) + tm.assert_frame_equal(by_levels, by_columns) + + def test_groupby_categorical_index_and_columns(self, observed): + # GH18432, adapted for GH25871 + columns = ["A", "B", "A", "B"] + categories = ["B", "A"] + data = np.array( + [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int + ) + cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0, observed=observed).sum() + expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int) + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=True + ) + expected = DataFrame(data=expected_data, columns=expected_columns) + tm.assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0, observed=observed).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + tm.assert_frame_equal(result, expected) + + def test_grouper_getting_correct_binner(self): + + # GH 10063 + # using a non-time-based grouper and a time-based grouper + # and specifying levels + df = DataFrame( + {"A": 1}, + index=pd.MultiIndex.from_product( + [list("ab"), date_range("20130101", periods=80)], names=["one", "two"] + ), + ) + result = df.groupby( + [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")] + ).sum() + expected = DataFrame( + {"A": [31, 28, 21, 31, 28, 21]}, + index=MultiIndex.from_product( + [list("ab"), date_range("20130101", freq="M", periods=3)], + names=["one", "two"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_grouper_iter(self, df): + assert sorted(df.groupby("A").grouper) == ["bar", "foo"] + + def test_empty_groups(self, df): + # see gh-1048 + with pytest.raises(ValueError, match="No group keys passed!"): + df.groupby([]) + + def test_groupby_grouper(self, df): + grouped = df.groupby("A") + + result = df.groupby(grouped.grouper).mean() + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_groupby_dict_mapping(self): + # GH #679 + from pandas import Series + + s = Series({"T1": 5}) + result = s.groupby({"T1": "T2"}).agg(sum) + expected = s.groupby(["T2"]).agg(sum) + tm.assert_series_equal(result, expected) + + s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")) + mapping = {"a": 0, "b": 0, "c": 1, "d": 1} + + result = s.groupby(mapping).mean() + result2 = s.groupby(mapping).agg(np.mean) + expected = s.groupby([0, 0, 1, 1]).mean() + expected2 = s.groupby([0, 0, 1, 1]).mean() + tm.assert_series_equal(result, expected) + 
tm.assert_series_equal(result, result2) + tm.assert_series_equal(result, expected2) + + def test_groupby_grouper_f_sanity_checked(self): + dates = date_range("01-Jan-2013", periods=12, freq="MS") + ts = Series(np.random.randn(12), index=dates) + + # GH3035 + # index.map is used to apply grouper to the index + # if it fails on the elements, map tries it on the entire index as + # a sequence. That can yield invalid results that cause trouble + # down the line. + # the surprise comes from using key[0:6] rather then str(key)[0:6] + # when the elements are Timestamp. + # the result is Index[0:6], very confusing. + + msg = r"Grouper result violates len\(labels\) == len\(data\)" + with pytest.raises(AssertionError, match=msg): + ts.groupby(lambda key: key[0:6]) + + def test_grouping_error_on_multidim_input(self, df): + msg = "Grouper for '' not 1-dimensional" + with pytest.raises(ValueError, match=msg): + Grouping(df.index, df[["A", "A"]]) + + def test_multiindex_passthru(self): + + # GH 7997 + # regression from 0.14.1 + df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df.columns = pd.MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)]) + + result = df.groupby(axis=1, level=[0, 1]).first() + tm.assert_frame_equal(result, df) + + def test_multiindex_negative_level(self, mframe): + # GH 13901 + result = mframe.groupby(level=-1).sum() + expected = mframe.groupby(level="second").sum() + tm.assert_frame_equal(result, expected) + + result = mframe.groupby(level=-2).sum() + expected = mframe.groupby(level="first").sum() + tm.assert_frame_equal(result, expected) + + result = mframe.groupby(level=[-2, -1]).sum() + expected = mframe + tm.assert_frame_equal(result, expected) + + result = mframe.groupby(level=[-1, "first"]).sum() + expected = mframe.groupby(level=["second", "first"]).sum() + tm.assert_frame_equal(result, expected) + + def test_multifunc_select_col_integer_cols(self, df): + df.columns = np.arange(len(df.columns)) + + # it works! 
+ df.groupby(1, as_index=False)[2].agg({"Q": np.mean}) + + def test_multiindex_columns_empty_level(self): + lst = [["count", "values"], ["to filter", ""]] + midx = MultiIndex.from_tuples(lst) + + df = DataFrame([[1, "A"]], columns=midx) + + grouped = df.groupby("to filter").groups + assert grouped["A"] == [0] + + grouped = df.groupby([("to filter", "")]).groups + assert grouped["A"] == [0] + + df = DataFrame([[1, "A"], [2, "B"]], columns=midx) + + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups + assert result == expected + + df = DataFrame([[1, "A"], [2, "A"]], columns=midx) + + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups + tm.assert_dict_equal(result, expected) + + def test_groupby_multiindex_tuple(self): + # GH 17979 + df = pd.DataFrame( + [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], + columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), + ) + expected = df.groupby([("b", 1)]).groups + result = df.groupby(("b", 1)).groups + tm.assert_dict_equal(expected, result) + + df2 = pd.DataFrame( + df.values, + columns=pd.MultiIndex.from_arrays( + [["a", "b", "b", "c"], ["d", "d", "e", "e"]] + ), + ) + expected = df2.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups + tm.assert_dict_equal(expected, result) + + df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) + expected = df3.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups + tm.assert_dict_equal(expected, result) + + @pytest.mark.parametrize("sort", [True, False]) + def test_groupby_level(self, sort, mframe, df): + # GH 17537 + frame = mframe + deleveled = frame.reset_index() + + result0 = frame.groupby(level=0, sort=sort).sum() + result1 = frame.groupby(level=1, sort=sort).sum() + + expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum() + expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum() + + expected0.index.name = "first" + expected1.index.name = "second" + + assert result0.index.name == "first" + assert result1.index.name == "second" + + tm.assert_frame_equal(result0, expected0) + tm.assert_frame_equal(result1, expected1) + assert result0.index.name == frame.index.names[0] + assert result1.index.name == frame.index.names[1] + + # groupby level name + result0 = frame.groupby(level="first", sort=sort).sum() + result1 = frame.groupby(level="second", sort=sort).sum() + tm.assert_frame_equal(result0, expected0) + tm.assert_frame_equal(result1, expected1) + + # axis=1 + + result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() + result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() + tm.assert_frame_equal(result0, expected0.T) + tm.assert_frame_equal(result1, expected1.T) + + # raise exception for non-MultiIndex + msg = "level > 0 or level < -1 only valid with MultiIndex" + with pytest.raises(ValueError, match=msg): + df.groupby(level=1) + + def test_groupby_level_index_names(self, axis): + # GH4014 this used to raise ValueError since 'exp'>1 (in py2) + df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( + "exp" + ) + if axis in (1, "columns"): + df = df.T + df.groupby(level="exp", axis=axis) + msg = f"level name foo is not the name of the {df._get_axis_name(axis)}" + with pytest.raises(ValueError, match=msg): + df.groupby(level="foo", axis=axis) + + @pytest.mark.parametrize("sort", [True, False]) + def test_groupby_level_with_nas(self, sort): + # GH 17537 + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 
3]], + codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) + + # factorizing doesn't confuse things + s = Series(np.arange(8.0), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6.0, 22.0], index=[0, 1]) + tm.assert_series_equal(result, expected) + + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) + + # factorizing doesn't confuse things + s = Series(np.arange(8.0), index=index) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6.0, 18.0], index=[0.0, 1.0]) + tm.assert_series_equal(result, expected) + + def test_groupby_args(self, mframe): + # PR8618 and issue 8015 + frame = mframe + + msg = "You have to supply one of 'by' and 'level'" + with pytest.raises(TypeError, match=msg): + frame.groupby() + + msg = "You have to supply one of 'by' and 'level'" + with pytest.raises(TypeError, match=msg): + frame.groupby(by=None, level=None) + + @pytest.mark.parametrize( + "sort,labels", + [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], + ], + ) + def test_level_preserve_order(self, sort, labels, mframe): + # GH 17537 + grouped = mframe.groupby(level=0, sort=sort) + exp_labels = np.array(labels, np.intp) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + + def test_grouping_labels(self, mframe): + grouped = mframe.groupby(mframe.index.get_level_values(0)) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + + def test_list_grouper_with_nat(self): + # GH 14715 + df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")}) + df.iloc[-1] = pd.NaT + grouper = pd.Grouper(key="date", freq="AS") + + # Grouper in a list grouping + result = df.groupby([grouper]) + expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))} + tm.assert_dict_equal(result.groups, expected) + + # Test case without a list + result = df.groupby(grouper) + expected = {pd.Timestamp("2011-01-01"): 365} + tm.assert_dict_equal(result.groups, expected) + + @pytest.mark.parametrize( + "func,expected", + [ + ( + "transform", + pd.Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), + ), + ( + "agg", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), + ( + "apply", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), + ], + ) + def test_evaluate_with_empty_groups(self, func, expected): + # 26208 + # test transform'ing empty groups + # (not testing other agg fns, because they return + # different index objects. 
+ df = pd.DataFrame({1: [], 2: []}) + g = df.groupby(1) + result = getattr(g[2], func)(lambda x: x) + tm.assert_series_equal(result, expected) + + def test_groupby_empty(self): + # https://github.com/pandas-dev/pandas/issues/27190 + s = pd.Series([], name="name", dtype="float64") + gr = s.groupby([]) + + result = gr.mean() + tm.assert_series_equal(result, s) + + # check group properties + assert len(gr.grouper.groupings) == 1 + tm.assert_numpy_array_equal( + gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64")) + ) + + tm.assert_numpy_array_equal( + gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")) + ) + + assert gr.grouper.group_info[2] == 0 + + # check name + assert s.groupby(s).grouper.names == ["name"] + + def test_groupby_level_index_value_all_na(self): + # issue 20519 + df = DataFrame( + [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"] + ).set_index(["A", "B"]) + result = df.groupby(level=["A", "B"]).sum() + expected = DataFrame( + data=[], + index=MultiIndex( + levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + codes=[[], []], + names=["A", "B"], + ), + columns=["C"], + dtype="int64", + ) + tm.assert_frame_equal(result, expected) + + +# get_group +# -------------------------------- + + +class TestGetGroup: + def test_get_group(self): + # GH 5267 + # be datelike friendly + df = DataFrame( + { + "DATE": pd.to_datetime( + [ + "10-Oct-2013", + "10-Oct-2013", + "10-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + ] + ), + "label": ["foo", "foo", "bar", "foo", "foo", "bar"], + "VAL": [1, 2, 3, 4, 5, 6], + } + ) + + g = df.groupby("DATE") + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group(Timestamp(key).to_pydatetime()) + result3 = g.get_group(str(Timestamp(key))) + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, result3) + + g = df.groupby(["DATE", "label"]) + + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) + result3 = g.get_group((str(Timestamp(key[0])), key[1])) + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, result3) + + # must pass a same-length tuple with multiple keys + msg = "must supply a tuple to get_group with multiple grouping keys" + with pytest.raises(ValueError, match=msg): + g.get_group("foo") + with pytest.raises(ValueError, match=msg): + g.get_group(("foo")) + msg = ( + "must supply a same-length tuple to get_group with multiple" + " grouping keys" + ) + with pytest.raises(ValueError, match=msg): + g.get_group(("foo", "bar", "baz")) + + def test_get_group_empty_bins(self, observed): + + d = pd.DataFrame([3, 1, 7, 6]) + bins = [0, 5, 10, 15] + g = d.groupby(pd.cut(d[0], bins), observed=observed) + + # TODO: should prob allow a str of Interval work as well + # IOW '(0, 5]' + result = g.get_group(pd.Interval(0, 5)) + expected = DataFrame([3, 1], index=[0, 1]) + tm.assert_frame_equal(result, expected) + + msg = r"Interval\(10, 15, closed='right'\)" + with pytest.raises(KeyError, match=msg): + g.get_group(pd.Interval(10, 15)) + + def test_get_group_grouped_by_tuple(self): + # GH 8121 + df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T + gr = df.groupby("ids") + expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2]) + result = gr.get_group((1,)) + tm.assert_frame_equal(result, expected) + + dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"]) + df = DataFrame({"ids": [(x,) for x in dt]}) + gr = df.groupby("ids") + 
result = gr.get_group(("2010-01-01",)) + expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2]) + tm.assert_frame_equal(result, expected) + + def test_groupby_with_empty(self): + index = pd.DatetimeIndex(()) + data = () + series = pd.Series(data, index, dtype=object) + grouper = pd.Grouper(freq="D") + grouped = series.groupby(grouper) + assert next(iter(grouped), None) is None + + def test_groupby_with_single_column(self): + df = pd.DataFrame({"a": list("abssbab")}) + tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) + # GH 13530 + exp = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a")) + tm.assert_frame_equal(df.groupby("a").count(), exp) + tm.assert_frame_equal(df.groupby("a").sum(), exp) + tm.assert_frame_equal(df.groupby("a").nth(1), exp) + + def test_gb_key_len_equal_axis_len(self): + # GH16843 + # test ensures that index and column keys are recognized correctly + # when number of keys equals axis length of groupby + df = pd.DataFrame( + [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]], + columns=["first", "second", "third", "one"], + ) + df = df.set_index(["first", "second"]) + df = df.groupby(["first", "second", "third"]).size() + assert df.loc[("foo", "bar", "B")] == 2 + assert df.loc[("foo", "baz", "C")] == 1 + + +# groups & iteration +# -------------------------------- + + +class TestIteration: + def test_groups(self, df): + grouped = df.groupby(["A"]) + groups = grouped.groups + assert groups is grouped.groups # caching works + + for k, v in grouped.groups.items(): + assert (df.loc[v]["A"] == k).all() + + grouped = df.groupby(["A", "B"]) + groups = grouped.groups + assert groups is grouped.groups # caching works + + for k, v in grouped.groups.items(): + assert (df.loc[v]["A"] == k[0]).all() + assert (df.loc[v]["B"] == k[1]).all() + + def test_grouping_is_iterable(self, tsframe): + # this code path isn't used anywhere else + # not sure it's useful + grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) + + # test it works + for g in grouped.grouper.groupings[0]: + pass + + def test_multi_iter(self): + s = Series(np.arange(6)) + k1 = np.array(["a", "a", "a", "b", "b", "b"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) + + grouped = s.groupby([k1, k2]) + + iterated = list(grouped) + expected = [ + ("a", "1", s[[0, 2]]), + ("a", "2", s[[1]]), + ("b", "1", s[[4]]), + ("b", "2", s[[3, 5]]), + ] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + tm.assert_series_equal(three, e3) + + def test_multi_iter_frame(self, three_group): + k1 = np.array(["b", "b", "b", "a", "a", "a"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) + df = DataFrame( + {"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2}, + index=["one", "two", "three", "four", "five", "six"], + ) + + grouped = df.groupby(["k1", "k2"]) + + # things get sorted! 
+ iterated = list(grouped) + idx = df.index + expected = [ + ("a", "1", df.loc[idx[[4]]]), + ("a", "2", df.loc[idx[[3, 5]]]), + ("b", "1", df.loc[idx[[0, 2]]]), + ("b", "2", df.loc[idx[[1]]]), + ] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + assert e1 == one + assert e2 == two + tm.assert_frame_equal(three, e3) + + # don't iterate through groups with no data + df["k1"] = np.array(["b", "b", "b", "a", "a", "a"]) + df["k2"] = np.array(["1", "1", "1", "2", "2", "2"]) + grouped = df.groupby(["k1", "k2"]) + groups = {key: gp for key, gp in grouped} + assert len(groups) == 2 + + # axis = 1 + three_levels = three_group.groupby(["A", "B", "C"]).mean() + grouped = three_levels.T.groupby(axis=1, level=(1, 2)) + for key, group in grouped: + pass + + def test_dictify(self, df): + dict(iter(df.groupby("A"))) + dict(iter(df.groupby(["A", "B"]))) + dict(iter(df["C"].groupby(df["A"]))) + dict(iter(df["C"].groupby([df["A"], df["B"]]))) + dict(iter(df.groupby("A")["C"])) + dict(iter(df.groupby(["A", "B"])["C"])) + + def test_groupby_with_small_elem(self): + # GH 8542 + # length=2 + df = pd.DataFrame( + {"event": ["start", "start"], "change": [1234, 5678]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups + + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) + assert len(grouped.groups) == 2 + assert grouped.ngroups == 2 + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups + + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) + tm.assert_frame_equal(res, df.iloc[[0, 2], :]) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) + tm.assert_frame_equal(res, df.iloc[[1], :]) + + # length=3 + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) + assert len(grouped.groups) == 3 + assert grouped.ngroups == 3 + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups + assert (pd.Timestamp("2014-08-31"), "start") in grouped.groups + + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) + tm.assert_frame_equal(res, df.iloc[[0], :]) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) + tm.assert_frame_equal(res, df.iloc[[1], :]) + res = grouped.get_group((pd.Timestamp("2014-08-31"), "start")) + tm.assert_frame_equal(res, df.iloc[[2], :]) + + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[("A", "a")]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + assert result == expected diff --git 
a/venv/Lib/site-packages/pandas/tests/groupby/test_index_as_string.py b/venv/Lib/site-packages/pandas/tests/groupby/test_index_as_string.py new file mode 100644 index 0000000..971a447 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_index_as_string.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture(params=[["inner"], ["inner", "outer"]]) +def frame(request): + levels = request.param + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture() +def series(): + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + s = df.set_index(["outer", "inner", "B"])["A"] + + return s + + +@pytest.mark.parametrize( + "key_strs,groupers", + [ + ("inner", pd.Grouper(level="inner")), # Index name + (["inner"], [pd.Grouper(level="inner")]), # List of index name + (["B", "inner"], ["B", pd.Grouper(level="inner")]), # Column and index + (["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column + ], +) +def test_grouper_index_level_as_string(frame, key_strs, groupers): + result = frame.groupby(key_strs).mean() + expected = frame.groupby(groupers).mean() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "levels", + [ + "inner", + "outer", + "B", + ["inner"], + ["outer"], + ["B"], + ["inner", "outer"], + ["outer", "inner"], + ["inner", "outer", "B"], + ["B", "outer", "inner"], + ], +) +def test_grouper_index_level_as_string_series(series, levels): + + # Compute expected result + if isinstance(levels, list): + groupers = [pd.Grouper(level=lv) for lv in levels] + else: + groupers = pd.Grouper(level=levels) + + expected = series.groupby(groupers).mean() + + # Compute and check result + result = series.groupby(levels).mean() + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_nth.py b/venv/Lib/site-packages/pandas/tests/groupby/test_nth.py new file mode 100644 index 0000000..0f850f2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_nth.py @@ -0,0 +1,532 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna +import pandas._testing as tm + + +def test_first_last_nth(df): + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + nth = grouped.nth(0) + tm.assert_frame_equal(nth, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + tm.assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = df.loc[[2, 3], ["B", "C", "D"]].copy() + expected.index = Index(["foo", "bar"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(nth, expected) + + # it works! 
+ grouped["B"].first() + grouped["B"].last() + grouped["B"].nth(0) + + df.loc[df["A"] == "foo", "B"] = np.nan + assert isna(grouped["B"].first()["foo"]) + assert isna(grouped["B"].last()["foo"]) + assert isna(grouped["B"].nth(0)["foo"]) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.first() + expected = df.iloc[[1, 2]].set_index("A") + tm.assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]].set_index("A") + result = g.nth(0, dropna="any") + tm.assert_frame_equal(result, expected) + + +def test_first_last_nth_dtypes(df_mixed_floats): + + df = df_mixed_floats.copy() + df["E"] = True + df["F"] = 1 + + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(nth, expected) + + # GH 2763, first/last shifting dtypes + idx = list(range(10)) + idx.append(9) + s = Series(data=range(11), index=idx, name="IntCol") + assert s.dtype == "int64" + f = s.groupby(level=0).first() + assert f.dtype == "int64" + + +def test_first_strings_timestamps(): + # GH 11244 + test = pd.DataFrame( + { + pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], + pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + +def test_nth(): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + + tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A")) + tm.assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A")) + tm.assert_frame_equal(g.nth(2), df.loc[[]].set_index("A")) + tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A")) + tm.assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A")) + tm.assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A")) + tm.assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]]) + tm.assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]]) + tm.assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A")) + + exp = df.set_index("A") + tm.assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]]) + + exp["B"] = np.nan + tm.assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]]) + + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame( + { + "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, + "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, + "two": { + 0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997, + }, + "one": { + 0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 
3: -0.78543499999999999, + 4: 0.70422799999999997, + }, + } + ).set_index(["color", "food"]) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + tm.assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + + # GH 7559 + # from the vbench + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64") + s = df[1] + g = df[0] + expected = s.groupby(g).first() + expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(expected2, expected, check_names=False) + assert expected.name == 1 + assert expected2.name == 1 + + # validate first + v = s[g == 1].iloc[0] + assert expected.iloc[0] == v + assert expected2.iloc[0] == v + + # this is NOT the same as .first (as sorted is default!) + # as it keeps the order in the series (and not the group order) + # related GH 7287 + expected = s.groupby(g, sort=False).first() + result = s.groupby(g, sort=False).nth(0, dropna="all") + tm.assert_series_equal(result, expected) + + with pytest.raises(ValueError, match="For a DataFrame groupby"): + s.groupby(g, sort=False).nth(0, dropna=True) + + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.B.nth(0, dropna="all") + expected = g.B.first() + tm.assert_series_equal(result, expected) + + # test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) + g = df.groupby("A") + + tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A")) + tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A")) + tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A")) + tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A")) + tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + tm.assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A")) + tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A")) + + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = DataFrame(1, index=business_dates, columns=["a", "b"]) + # get the first, fourth and last two business days for each month + key = [df.index.year, df.index.month] + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime( + [ + "2014/4/1", + "2014/4/4", + "2014/4/29", + "2014/4/30", + "2014/5/1", + "2014/5/6", + "2014/5/29", + "2014/5/30", + "2014/6/2", + "2014/6/5", + "2014/6/27", + "2014/6/30", + ] + ) + expected = DataFrame(1, columns=["a", "b"], index=expected_dates) + tm.assert_frame_equal(result, expected) + + +def test_nth_multi_index(three_group): + # PR 9090, related to issue 8979 + # test nth on MultiIndex, should match .first() + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = grouped.first() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, expected_first, expected_last", + [ + ( + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + ), + ( + { + "id": ["A", "B", "A"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 
14:00:00", tz="US/Central"), + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + ], + "foo": [1, 2, 3], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [1, 2], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [3, 2], + }, + ), + ], +) +def test_first_last_tz(data, expected_first, expected_last): + # GH15884 + # Test that the timezone is retained when calling first + # or last on groupby with as_index=False + + df = DataFrame(data) + + result = df.groupby("id", as_index=False).first() + expected = DataFrame(expected_first) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].first() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + result = df.groupby("id", as_index=False).last() + expected = DataFrame(expected_last) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].last() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + +@pytest.mark.parametrize( + "method, ts, alpha", + [ + ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], + ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], + ], +) +def test_first_last_tz_multi_column(method, ts, alpha): + # GH 21603 + category_string = pd.Series(list("abc")).astype("category") + df = pd.DataFrame( + { + "group": [1, 1, 2], + "category_string": category_string, + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + result = getattr(df.groupby("group"), method)() + expected = pd.DataFrame( + { + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), + "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], + }, + index=pd.Index([1, 2], name="group"), + ) + tm.assert_frame_equal(result, expected) + + +def test_nth_multi_index_as_expected(): + # PR 9090, related to issue 8979 + # test nth on MultiIndex + three_group = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = DataFrame( + {"C": ["dull", "dull", "dull", "dull"]}, + index=MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_head_tail(): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + g_as = df.groupby("A", as_index=True) + g_not_as = df.groupby("A", as_index=False) + + # as_index= False, much easier + tm.assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + tm.assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) + + empty_not_as = DataFrame( + columns=df.columns, index=pd.Index([], dtype=df.index.dtype) + ) + empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype) + tm.assert_frame_equal(empty_not_as, g_not_as.head(0)) + tm.assert_frame_equal(empty_not_as, g_not_as.tail(0)) + 
tm.assert_frame_equal(empty_not_as, g_not_as.head(-1)) + tm.assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + tm.assert_frame_equal(df, g_not_as.head(7)) # contains all + tm.assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + tm.assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + tm.assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_as["B"] = empty_not_as["B"].astype(df.B.dtype) + tm.assert_frame_equal(empty_as, g_as.head(0)) + tm.assert_frame_equal(empty_as, g_as.tail(0)) + tm.assert_frame_equal(empty_as, g_as.head(-1)) + tm.assert_frame_equal(empty_as, g_as.tail(-1)) + + tm.assert_frame_equal(df_as, g_as.head(7)) # contains all + tm.assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + tm.assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) + tm.assert_frame_equal(g_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + tm.assert_frame_equal(g_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + tm.assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) + + tm.assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) + tm.assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + tm.assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + tm.assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) + + +def test_group_selection_cache(): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + expected = df.iloc[[0, 2]].set_index("A") + + g = df.groupby("A") + result1 = g.head(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.tail(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.head(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.tail(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + +def test_nth_empty(): + # GH 16064 + df = DataFrame(index=[0], columns=["a", "b", "c"]) + result = df.groupby("a").nth(10) + expected = DataFrame(index=Index([], name="a"), columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["a", "b"]).nth(10) + expected = DataFrame( + index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"] + ) + tm.assert_frame_equal(result, expected) + + +def test_nth_column_order(): + # GH 20760 + # Check that nth preserves column order + df = DataFrame( + [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], + columns=["A", "C", "B"], + ) + result = df.groupby("A").nth(0) + expected = DataFrame( + [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").nth(-1, dropna="any") + expected = DataFrame( + [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [None, "any", "all"]) +def test_nth_nan_in_grouper(dropna): + # GH 26011 + df = DataFrame( + [[np.nan, 0, 1], ["abc", 2, 3], [np.nan, 4, 5], ["def", 6, 7], [np.nan, 8, 9]], + columns=list("abc"), + 
) + result = df.groupby("a").nth(0, dropna=dropna) + expected = pd.DataFrame( + [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_rank.py b/venv/Lib/site-packages/pandas/tests/groupby/test_rank.py new file mode 100644 index 0000000..3461bf6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_rank.py @@ -0,0 +1,445 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat +import pandas._testing as tm +from pandas.core.base import DataError + + +def test_rank_apply(): + lev1 = tm.rands_array(10, 100) + lev2 = tm.rands_array(10, 130) + lab1 = np.random.randint(0, 100, size=500) + lab2 = np.random.randint(0, 130, size=500) + + df = DataFrame( + { + "value": np.random.randn(500), + "key1": lev1.take(lab1), + "key2": lev2.take(lab2), + } + ) + + result = df.groupby(["key1", "key2"]).value.rank() + + expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + result = df.groupby(["key1", "key2"]).value.rank(pct=True) + + expected = [ + piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) + ] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, 8, 2, 6], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,pct,exp", + [ + ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), + ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), + ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), + ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), + ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), + ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), + ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), + ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), + ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), + ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), + ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), + ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), + ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), + ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), + ], +) +def test_rank_args(grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] +) +@pytest.mark.parametrize( + 
"ties_method,ascending,na_option,exp", + [ + ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), + ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), + ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), + ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), + ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), + ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), + ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), + ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), + ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), + ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), + ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), + ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), + ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), + ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), + ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), + ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), + ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), + ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), + ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), + ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), + ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), + ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), + ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), + ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), + ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), + ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), + ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), + ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), + ], +) +def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option + ) + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + np.nan, + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + np.nan, + np.nan, + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,pct,exp", + [ + ( + "average", + True, + "keep", + False, + [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], + ), + ( + "average", + True, + "keep", + True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + False, + [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + True, + [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], + ), + ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), + ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ( + "min", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ("min", False, "keep", 
True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), + ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ( + "max", + False, + "keep", + False, + [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), + ( + "first", + True, + "keep", + False, + [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], + ), + ( + "first", + True, + "keep", + True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + False, + [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + True, + [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + False, + [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + True, + [ + 1.0 / 3.0, + 1.0 / 3.0, + np.nan, + 3.0 / 3.0, + 1.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ( + "dense", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + False, + "keep", + True, + [ + 3.0 / 3.0, + 3.0 / 3.0, + np.nan, + 1.0 / 3.0, + 3.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), + ( + "average", + True, + "bottom", + True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + ), + ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), + ( + "average", + False, + "bottom", + True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + ), + ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), + ( + "min", + True, + "bottom", + True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], + ), + ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), + ( + "min", + False, + "bottom", + True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + ), + ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), + ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), + ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), + ( + "max", + False, + "bottom", + True, + [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], + ), + ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), + ( + "first", + True, + "bottom", + True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], + ), + ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), + ( + "first", + False, + "bottom", + True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], + ), + ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), + ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), + ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), + ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), + ], +) +def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize( + "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 
0.6, 0.6, 0.6, 0.6])] +) +def test_rank_resets_each_group(pct, exp): + df = DataFrame( + {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} + ) + result = df.groupby("key").rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +def test_rank_avg_even_vals(): + df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) + result = df.groupby("key").rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] +) +def test_rank_object_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + + with pytest.raises(DataError, match="No numeric types to aggregate"): + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + +@pytest.mark.parametrize("na_option", [True, "bad", 1]) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", + [ + ["bar", "bar", "foo", "bar", "baz"], + ["bar", np.nan, "foo", np.nan, "baz"], + [1, np.nan, 2, np.nan, 3], + ], +) +def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_key,input_value,output_value", + [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), + ], +) +def test_rank_zero_div(input_key, input_value, output_value): + # GH 23666 + df = DataFrame({"A": input_key, "B": input_value}) + + result = df.groupby("A").rank(method="dense", pct=True) + expected = DataFrame({"B": output_value}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_timegrouper.py b/venv/Lib/site-packages/pandas/tests/groupby/test_timegrouper.py new file mode 100644 index 0000000..6b8bd9e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_timegrouper.py @@ -0,0 +1,757 @@ +""" test with the TimeGrouper / grouping with datetimes """ + +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +import pandas._testing as tm +from pandas.core.groupby.grouper import 
Grouper +from pandas.core.groupby.ops import BinGrouper + + +class TestGroupBy: + def test_groupby_with_timegrouper(self): + # GH 4161 + # TimeGrouper requires a sorted index + # also verifies that the resultant index has the correct name + df_original = DataFrame( + { + "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + + # GH 6908 change target column's order + df_reordered = df_original.sort_values(by="Quantity") + + for df in [df_original, df_reordered]: + df = df.set_index(["Date"]) + + expected = DataFrame( + {"Quantity": 0}, + index=date_range( + "20130901", "20131205", freq="5D", name="Date", closed="left" + ), + ) + expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") + + result1 = df.resample("5D").sum() + tm.assert_frame_equal(result1, expected) + + df_sorted = df.sort_index() + result2 = df_sorted.groupby(pd.Grouper(freq="5D")).sum() + tm.assert_frame_equal(result2, expected) + + result3 = df.groupby(pd.Grouper(freq="5D")).sum() + tm.assert_frame_equal(result3, expected) + + @pytest.mark.parametrize("should_sort", [True, False]) + def test_groupby_with_timegrouper_methods(self, should_sort): + # GH 3881 + # make sure API of timegrouper conforms + + df = pd.DataFrame( + { + "Branch": "A A A A A B".split(), + "Buyer": "Carl Mark Carl Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 8, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ) + + if should_sort: + df = df.sort_values(by="Quantity", ascending=False) + + df = df.set_index("Date", drop=False) + g = df.groupby(pd.Grouper(freq="6M")) + assert g.group_keys + + assert isinstance(g.grouper, BinGrouper) + groups = g.groups + assert isinstance(groups, dict) + assert len(groups) == 3 + + def test_timegrouper_with_reg_groups(self): + + # GH 3794 + # allow combination of timegrouper/reg groups + + df_original = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) + + for df in [df_original, df_sorted]: + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="A"), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "Buyer": "Carl Mark Carl Joe".split(), + "Quantity": [1, 3, 9, 18], + "Date": [ + datetime(2013, 1, 1, 0, 0), + datetime(2013, 1, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + result = df.groupby([pd.Grouper(freq="6MS"), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + + df_original = DataFrame( + { + 
"Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 10, 1, 13, 0), + datetime(2013, 10, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 10, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) + for df in [df_original, df_sorted]: + + expected = DataFrame( + { + "Buyer": "Carl Joe Mark Carl Joe".split(), + "Quantity": [6, 8, 3, 4, 10], + "Date": [ + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 2, 0, 0), + datetime(2013, 10, 2, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="1D"), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq="1M"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + tm.assert_frame_equal(result, expected) + + # passing the name + df = df.reset_index() + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match="'The grouper name foo is not found'"): + df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum() + + # passing the level + df = df.set_index("Date") + result = df.groupby([pd.Grouper(freq="1M", level="Date"), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum() + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum() + + # multi names + df = df.copy() + df["Date"] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + tm.assert_frame_equal(result, expected) + + # error as we have both a level and a name! 
+ with pytest.raises(ValueError): + df.groupby( + [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + ).sum() + + # single groupers + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 10, 31, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M")).sum() + tm.assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq="1M")]).sum() + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 11, 30, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M", key="Date")).sum() + tm.assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq="1M", key="Date")]).sum() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) + def test_timegrouper_with_reg_groups_freq(self, freq): + # GH 6764 multiple grouping with/without sort + df = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + 364, + 280, + 259, + 201, + 623, + 90, + 312, + 359, + 301, + 359, + 801, + ], + "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12], + } + ).set_index("date") + + expected = ( + df.groupby("user_id")["whole_cost"] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(["date", "user_id"]) + .sort_index() + .astype("int64") + ) + expected.name = "whole_cost" + + result1 = ( + df.sort_index() + .groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"] + .sum() + ) + tm.assert_series_equal(result1, expected) + + result2 = df.groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"].sum() + tm.assert_series_equal(result2, expected) + + def test_timegrouper_get_group(self): + # GH 6914 + + df_original = DataFrame( + { + "Buyer": "Carl Joe Joe Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + df_reordered = df_original.sort_values(by="Quantity") + + # single grouping + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] + dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] + + for df in [df_original, df_reordered]: + grouped = df.groupby(pd.Grouper(freq="M", key="Date")) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + tm.assert_frame_equal(result, expected) + + # multiple grouping + expected_list = [ + df_original.iloc[[1]], + df_original.iloc[[3]], + df_original.iloc[[4]], + ] + g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] + + for df in [df_original, df_reordered]: + grouped = df.groupby(["Buyer", pd.Grouper(freq="M", key="Date")]) + for (b, t), expected in zip(g_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group((b, dt)) + tm.assert_frame_equal(result, expected) + + # with index + df_original = df_original.set_index("Date") + df_reordered = df_original.sort_values(by="Quantity") + + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] + + for df in [df_original, df_reordered]: + grouped = 
df.groupby(pd.Grouper(freq="M")) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + tm.assert_frame_equal(result, expected) + + def test_timegrouper_apply_return_type_series(self): + # Using `apply` with the `TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + # Issue #11742 + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) + df_dt = df.copy() + df_dt["date"] = pd.to_datetime(df_dt["date"]) + + def sumfunc_series(x): + return pd.Series([x["value"].sum()], ("sum",)) + + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series) + tm.assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + def test_timegrouper_apply_return_type_value(self): + # Using `apply` with the `TimeGrouper` should give the + # same return type as an `apply` with a `Grouper`. + # Issue #11742 + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) + df_dt = df.copy() + df_dt["date"] = pd.to_datetime(df_dt["date"]) + + def sumfunc_value(x): + return x.value.sum() + + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + tm.assert_series_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) + + def test_groupby_groups_datetimeindex(self): + # GH#1430 + periods = 1000 + ind = pd.date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame( + {"high": np.arange(periods), "low": np.arange(periods)}, index=ind + ) + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + + # it works! 
+ groups = grouped.groups + assert isinstance(list(groups.keys())[0], datetime) + + # GH#11442 + index = pd.date_range("2015/01/01", periods=5, name="date") + df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) + result = df.groupby(level="date").groups + dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"] + expected = { + pd.Timestamp(date): pd.DatetimeIndex([date], name="date") for date in dates + } + tm.assert_dict_equal(result, expected) + + grouped = df.groupby(level="date") + for date in dates: + result = grouped.get_group(date) + data = [[df.loc[date, "A"], df.loc[date, "B"]]] + expected_index = pd.DatetimeIndex([date], name="date") + expected = pd.DataFrame(data, columns=list("AB"), index=expected_index) + tm.assert_frame_equal(result, expected) + + def test_groupby_groups_datetimeindex_tz(self): + # GH 3950 + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": dates, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + + exp_idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Pacific", + name="datetime", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) + + result = df.groupby(["datetime", "label"]).sum() + tm.assert_frame_equal(result, expected) + + # by level + didx = pd.DatetimeIndex(dates, tz="Asia/Tokyo") + df = DataFrame( + {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) + + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="Asia/Tokyo", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) + + result = df.groupby(level=0).sum() + tm.assert_frame_equal(result, expected) + + def test_frame_datetime64_handling_groupby(self): + # it works! 
+ df = DataFrame( + [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))], + columns=["a", "date"], + ) + result = df.groupby("a").first() + assert result["date"][3] == Timestamp("2012-07-03") + + def test_groupby_multi_timezone(self): + + # combining multiple / different timezones yields UTC + + data = """0,2000-01-28 16:47:00,America/Chicago +1,2000-01-29 16:48:00,America/Chicago +2,2000-01-30 16:49:00,America/Los_Angeles +3,2000-01-31 16:50:00,America/Chicago +4,2000-01-01 16:50:00,America/New_York""" + + df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) + result = df.groupby("tz").date.apply( + lambda x: pd.to_datetime(x).dt.tz_localize(x.name) + ) + + expected = Series( + [ + Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"), + Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"), + Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"), + Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"), + Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"), + ], + name="date", + dtype=object, + ) + tm.assert_series_equal(result, expected) + + tz = "America/Chicago" + res_values = df.groupby("tz").date.get_group(tz) + result = pd.to_datetime(res_values).dt.tz_localize(tz) + exp_values = Series( + ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"], + index=[0, 1, 3], + name="date", + ) + expected = pd.to_datetime(exp_values).dt.tz_localize(tz) + tm.assert_series_equal(result, expected) + + def test_groupby_groups_periods(self): + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "period": [pd.Period(d, freq="H") for d in dates], + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + + exp_idx1 = pd.PeriodIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + freq="H", + name="period", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) + + result = df.groupby(["period", "label"]).sum() + tm.assert_frame_equal(result, expected) + + # by level + didx = pd.PeriodIndex(dates, freq="H") + df = DataFrame( + {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) + + exp_idx = pd.PeriodIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + freq="H", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) + + result = df.groupby(level=0).sum() + tm.assert_frame_equal(result, expected) + + def test_groupby_first_datetime64(self): + df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) + df[1] = df[1].view("M8[ns]") + + assert issubclass(df[1].dtype.type, np.datetime64) + + result = df.groupby(level=0).first() + got_dt = result[1].dtype + assert issubclass(got_dt.type, np.datetime64) + + result = df[1].groupby(level=0).first() + got_dt = result.dtype + assert issubclass(got_dt.type, np.datetime64) + + def test_groupby_max_datetime64(self): + # GH 5869 + # datetimelike dtype conversion from int + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + 
expected = df.groupby("A")["A"].apply(lambda x: x.max()) + result = df.groupby("A")["A"].max() + tm.assert_series_equal(result, expected) + + def test_groupby_datetime64_32_bit(self): + # GH 6410 / numpy 4328 + # 32-bit under 1.9-dev indexing issue + + df = DataFrame({"A": range(2), "B": [pd.Timestamp("2000-01-1")] * 2}) + result = df.groupby("A")["B"].transform(min) + expected = Series([pd.Timestamp("2000-01-1")] * 2, name="B") + tm.assert_series_equal(result, expected) + + def test_groupby_with_timezone_selection(self): + # GH 11616 + # Test that column selection returns output in correct timezone. + np.random.seed(42) + df = pd.DataFrame( + { + "factor": np.random.randint(0, 3, size=60), + "time": pd.date_range( + "01/01/2000 00:00", periods=60, freq="s", tz="UTC" + ), + } + ) + df1 = df.groupby("factor").max()["time"] + df2 = df.groupby("factor")["time"].max() + tm.assert_series_equal(df1, df2) + + def test_timezone_info(self): + # see gh-11682: Timezone info lost when broadcasting + # scalar datetime to DataFrame + + df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) + assert df["b"][0].tzinfo == pytz.utc + df = pd.DataFrame({"a": [1, 2, 3]}) + df["b"] = datetime.now(pytz.utc) + assert df["b"][0].tzinfo == pytz.utc + + def test_datetime_count(self): + df = DataFrame( + {"a": [1, 2, 3] * 2, "dates": pd.date_range("now", periods=6, freq="T")} + ) + result = df.groupby("a").dates.count() + expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") + tm.assert_series_equal(result, expected) + + def test_first_last_max_min_on_time_data(self): + # GH 10295 + # Verify that NaT is not in the result of max, min, first and last on + # Dataframe with datetime or timedelta values. + from datetime import timedelta as td + + df_test = DataFrame( + { + "dt": [ + np.nan, + "2015-07-24 10:10", + "2015-07-25 11:11", + "2015-07-23 12:12", + np.nan, + ], + "td": [np.nan, td(days=1), td(days=2), td(days=3), np.nan], + } + ) + df_test.dt = pd.to_datetime(df_test.dt) + df_test["group"] = "A" + df_ref = df_test[df_test.dt.notna()] + + grouped_test = df_test.groupby("group") + grouped_ref = df_ref.groupby("group") + + tm.assert_frame_equal(grouped_ref.max(), grouped_test.max()) + tm.assert_frame_equal(grouped_ref.min(), grouped_test.min()) + tm.assert_frame_equal(grouped_ref.first(), grouped_test.first()) + tm.assert_frame_equal(grouped_ref.last(), grouped_test.last()) + + def test_nunique_with_timegrouper_and_nat(self): + # GH 17575 + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + pd.NaT, + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ) + + grouper = pd.Grouper(key="time", freq="h") + result = test.groupby(grouper)["data"].nunique() + expected = test[test.time.notnull()].groupby(grouper)["data"].nunique() + tm.assert_series_equal(result, expected) + + def test_scalar_call_versus_list_call(self): + # Issue: 17530 + data_frame = { + "location": ["shanghai", "beijing", "shanghai"], + "time": pd.Series( + ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], + dtype="datetime64[ns]", + ), + "value": [1, 2, 3], + } + data_frame = pd.DataFrame(data_frame).set_index("time") + grouper = pd.Grouper(freq="D") + + grouped = data_frame.groupby(grouper) + result = grouped.count() + grouped = data_frame.groupby([grouper]) + expected = grouped.count() + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_transform.py 
b/venv/Lib/site-packages/pandas/tests/groupby/test_transform.py new file mode 100644 index 0000000..6c05c40 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_transform.py @@ -0,0 +1,1170 @@ +""" test with the .transform """ +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs import groupby + +from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + concat, + date_range, +) +import pandas._testing as tm +from pandas.core.groupby.groupby import DataError + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def test_transform(): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + + # GH 8046 + # make sure that we preserve the input order + + df = DataFrame( + np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1] + ) + key = [0, 0, 1] + expected = ( + df.sort_index() + .groupby(key) + .transform(lambda x: x - x.mean()) + .groupby(key) + .mean() + ) + result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean() + tm.assert_frame_equal(result, expected) + + def demean(arr): + return arr - arr.mean() + + people = DataFrame( + np.random.randn(5, 5), + columns=["a", "b", "c", "d", "e"], + index=["Joe", "Steve", "Wes", "Jim", "Travis"], + ) + key = ["one", "two", "one", "two", "one"] + result = people.groupby(key).transform(demean).groupby(key).mean() + expected = people.groupby(key).apply(demean).groupby(key).mean() + tm.assert_frame_equal(result, expected) + + # GH 8430 + df = tm.makeTimeDataFrame() + g = df.groupby(pd.Grouper(freq="M")) + g.transform(lambda x: x - 1) + + # GH 9700 + df = DataFrame({"a": range(5, 10), "b": range(5)}) + result = df.groupby("a").transform(max) + expected = DataFrame({"b": range(5)}) + tm.assert_frame_equal(result, expected) + + +def test_transform_fast(): + + df = DataFrame({"id": np.arange(100000) / 3, "val": np.random.randn(100000)}) + + grp = df.groupby("id")["val"] + + values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) + expected = pd.Series(values, index=df.index, name="val") + + result = grp.transform(np.mean) + tm.assert_series_equal(result, expected) + + result = grp.transform("mean") + tm.assert_series_equal(result, expected) + + # GH 12737 + df = pd.DataFrame( + { + "grouping": [0, 1, 1, 3], + "f": [1.1, 2.1, 3.1, 4.5], + "d": pd.date_range("2014-1-1", "2014-1-4"), + "i": [1, 2, 3, 4], + }, + columns=["grouping", "f", "i", "d"], + ) + result = df.groupby("grouping").transform("first") + + dates = [ + pd.Timestamp("2014-1-1"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.DataFrame( + {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, + columns=["f", "i", "d"], + ) + tm.assert_frame_equal(result, expected) + + # selection + result = df.groupby("grouping")[["f", "i"]].transform("first") + expected = expected[["f", "i"]] + tm.assert_frame_equal(result, expected) + + # dup columns + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) + result = df.groupby("g").transform("first") + expected = df.drop("g", axis=1) + tm.assert_frame_equal(result, expected) + + +def 
test_transform_broadcast(tsframe, ts): + grouped = ts.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + + tm.assert_index_equal(result.index, ts.index) + for _, gp in grouped: + assert_fp_equal(result.reindex(gp.index), gp.mean()) + + grouped = tsframe.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + for _, gp in grouped: + agged = gp.mean() + res = result.reindex(gp.index) + for col in tsframe: + assert_fp_equal(res[col], agged[col]) + + # group columns + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = grouped.transform(np.mean) + tm.assert_index_equal(result.index, tsframe.index) + tm.assert_index_equal(result.columns, tsframe.columns) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + assert_fp_equal(res.xs(idx), agged[idx]) + + +def test_transform_axis(tsframe): + + # make sure that we are setting the axes + # correctly when on axis=0 or 1 + # in the presence of a non-monotonic indexer + # GH12713 + + base = tsframe.iloc[0:5] + r = len(base.index) + c = len(base.columns) + tso = DataFrame( + np.random.randn(r, c), index=base.index, columns=base.columns, dtype="float64" + ) + # monotonic + ts = tso + grouped = ts.groupby(lambda x: x.weekday()) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: x - x.mean()) + tm.assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + tm.assert_frame_equal(result, expected) + + # non-monotonic + ts = tso.iloc[[1, 0] + list(range(2, len(base)))] + grouped = ts.groupby(lambda x: x.weekday()) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: x - x.mean()) + tm.assert_frame_equal(result, expected) + + ts = ts.T + grouped = ts.groupby(lambda x: x.weekday(), axis=1) + result = ts - grouped.transform("mean") + expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) + tm.assert_frame_equal(result, expected) + + +def test_transform_dtype(): + # GH 9807 + # Check transform dtype output is preserved + df = DataFrame([[1, 3], [2, 3]]) + result = df.groupby(1).transform("mean") + expected = DataFrame([[1.5], [1.5]]) + tm.assert_frame_equal(result, expected) + + +def test_transform_bug(): + # GH 5712 + # transforming on a datetime column + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name="B") + tm.assert_series_equal(result, expected) + + +def test_transform_numeric_to_boolean(): + # GH 16875 + # inconsistency in transforming boolean values + expected = pd.Series([True, True], name="A") + + df = pd.DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) + tm.assert_series_equal(result, expected) + + +def test_transform_datetime_to_timedelta(): + # GH 15429 + # transforming a datetime to timedelta + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + expected = pd.Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A") + + # this does date math without changing result type in transform + base_time = df["A"][0] + result = ( + 
df.groupby("A")["A"].transform(lambda x: x.max() - x.min() + base_time) + - base_time + ) + tm.assert_series_equal(result, expected) + + # this does date math and causes the transform to return timedelta + result = df.groupby("A")["A"].transform(lambda x: x.max() - x.min()) + tm.assert_series_equal(result, expected) + + +def test_transform_datetime_to_numeric(): + # GH 10972 + # convert dt to float + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean() + ) + + expected = Series([-0.5, 0.5], name="b") + tm.assert_series_equal(result, expected) + + # convert dt to int + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min() + ) + + expected = Series([0, 1], name="b") + tm.assert_series_equal(result, expected) + + +def test_transform_casting(): + # 13046 + data = """ + idx A ID3 DATETIME + 0 B-028 b76cd912ff "2014-10-08 13:43:27" + 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" + 2 B-076 1a682034f8 "2014-10-08 14:29:01" + 3 B-023 b76cd912ff "2014-10-08 18:39:34" + 4 B-023 f88g8d7sds "2014-10-08 18:40:18" + 5 B-033 b76cd912ff "2014-10-08 18:44:30" + 6 B-032 b76cd912ff "2014-10-08 18:46:00" + 7 B-037 b76cd912ff "2014-10-08 18:52:15" + 8 B-046 db959faf02 "2014-10-08 18:59:59" + 9 B-053 b76cd912ff "2014-10-08 19:17:48" + 10 B-065 b76cd912ff "2014-10-08 19:21:38" + """ + df = pd.read_csv( + StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"] + ) + + result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) + assert is_timedelta64_dtype(result.dtype) + + result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff()) + assert is_timedelta64_dtype(result.DATETIME.dtype) + + +def test_transform_multiple(ts): + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + grouped.transform(lambda x: x * 2) + grouped.transform(np.mean) + + +def test_dispatch_transform(tsframe): + df = tsframe[::5].reindex(tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + filled = grouped.fillna(method="pad") + fillit = lambda x: x.fillna(method="pad") + expected = df.groupby(lambda x: x.month).transform(fillit) + tm.assert_frame_equal(filled, expected) + + +def test_transform_select_columns(df): + f = lambda x: x.mean() + result = df.groupby("A")[["C", "D"]].transform(f) + + selection = df[["C", "D"]] + expected = selection.groupby(df["A"]).transform(f) + + tm.assert_frame_equal(result, expected) + + +def test_transform_exclude_nuisance(df): + + # this also tests orderings in transform between + # series/frame to make sure it's consistent + expected = {} + grouped = df.groupby("A") + expected["C"] = grouped["C"].transform(np.mean) + expected["D"] = grouped["D"].transform(np.mean) + expected = DataFrame(expected) + result = df.groupby("A").transform(np.mean) + + tm.assert_frame_equal(result, expected) + + +def test_transform_function_aliases(df): + result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) + tm.assert_frame_equal(result, expected) + + result = df.groupby("A")["C"].transform("mean") + expected = df.groupby("A")["C"].transform(np.mean) + tm.assert_series_equal(result, expected) + + +def test_series_fast_transform_date(): + # GH 13191 + df = pd.DataFrame( + {"grouping": [np.nan, 1, 1, 3], "d": pd.date_range("2014-1-1", "2014-1-4")} + ) + result = df.groupby("grouping")["d"].transform("first") + dates = 
[ + pd.NaT, + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.Series(dates, name="d") + tm.assert_series_equal(result, expected) + + +def test_transform_length(): + # GH 9697 + df = pd.DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) + expected = pd.Series([3.0] * 4) + + def nsum(x): + return np.nansum(x) + + results = [ + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), + ] + for result in results: + tm.assert_series_equal(result, expected, check_names=False) + + +def test_transform_coercion(): + + # 14457 + # when we are transforming be sure to not coerce + # via assignment + df = pd.DataFrame(dict(A=["a", "a"], B=[0, 1])) + g = df.groupby("A") + + expected = g.transform(np.mean) + result = g.transform(lambda x: np.mean(x)) + tm.assert_frame_equal(result, expected) + + +def test_groupby_transform_with_int(): + + # GH 3740, make sure that we might upcast on item-by-item transform + + # floats + df = DataFrame( + dict( + A=[1, 1, 1, 2, 2, 2], + B=Series(1, dtype="float64"), + C=Series([1, 2, 3, 1, 2, 3], dtype="float64"), + D="foo", + ) + ) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + expected = DataFrame( + dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) + ) + tm.assert_frame_equal(result, expected) + + # int case + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo")) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) + tm.assert_frame_equal(result, expected) + + # int that needs float conversion + s = Series([2, 3, 4, 10, 5, -1]) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo")) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + + s1 = s.iloc[0:3] + s1 = (s1 - s1.mean()) / s1.std() + s2 = s.iloc[3:6] + s2 = (s2 - s2.mean()) / s2.std() + expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) + tm.assert_frame_equal(result, expected) + + # int downcasting + result = df.groupby("A").transform(lambda x: x * 2 / 2) + expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) + tm.assert_frame_equal(result, expected) + + +def test_groupby_transform_with_nan_group(): + # GH 9941 + df = pd.DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)["a"].transform(max) + expected = pd.Series( + [1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a" + ) + tm.assert_series_equal(result, expected) + + +def test_transform_mixed_type(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) + + def f(group): + group["g"] = group["d"] * 2 + return group[:1] + + grouped = df.groupby("c") + result = grouped.apply(f) + + assert result["d"].dtype == np.float64 + + # this is by definition a mutating operation! + with pd.option_context("mode.chained_assignment", None): + for key, group in grouped: + res = f(group) + tm.assert_frame_equal(res, result.loc[key]) + + +def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): + """ + Check a group transform that executes a cumulative function. 
+ + Parameters + ---------- + pd_op : callable + The pandas cumulative function. + np_op : callable + The analogous one in NumPy. + dtype : type + The specified dtype of the data. + """ + + is_datetimelike = False + + data = np.array([[1], [2], [3], [4]], dtype=dtype) + ans = np.zeros_like(data) + + labels = np.array([0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + pd_op(ans, data, labels, ngroups, is_datetimelike) + + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) + + +def test_cython_group_transform_cumsum(any_real_dtype): + # see gh-4095 + dtype = np.dtype(any_real_dtype).type + pd_op, np_op = groupby.group_cumsum, np.cumsum + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_cumprod(): + # see gh-4095 + dtype = np.float64 + pd_op, np_op = groupby.group_cumprod_float64, np.cumproduct + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_algos(): + # see gh-4095 + is_datetimelike = False + + # with nans + labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + + data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") + actual = np.zeros_like(data) + actual.fill(np.nan) + groupby.group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + actual = np.zeros_like(data) + actual.fill(np.nan) + groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + # timedelta + is_datetimelike = True + data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] + actual = np.zeros_like(data, dtype="int64") + groupby.group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + expected = np.array( + [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + ) + tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) + + +@pytest.mark.parametrize( + "op, args, targop", + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) +def test_cython_transform_series(op, args, targop): + # GH 4095 + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, size=1000).astype(float) + + # series + for data in [s, s_missing]: + # print(data.head()) + expected = data.groupby(labels).transform(targop) + + tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) + + +@pytest.mark.parametrize("op", ["cumprod", "cumsum"]) +@pytest.mark.parametrize("skipna", [False, True]) +@pytest.mark.parametrize( + "input, exp", + [ + # When everything is NaN + ({"key": ["b"] * 10, "value": np.nan}, pd.Series([np.nan] * 10, name="value")), + # When there is a single NaN + ( + {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8}, + { + ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ("cumprod", True): [ + 3.0, + 9.0, + 27.0, + np.nan, + 81.0, + 243.0, + 729.0, + 2187.0, + 6561.0, + 19683.0, + 3.0, + 9.0, + ], + ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ("cumsum", True): [ + 3.0, + 6.0, + 
9.0, + np.nan, + 12.0, + 15.0, + 18.0, + 21.0, + 24.0, + 27.0, + 3.0, + 6.0, + ], + }, + ), + ], +) +def test_groupby_cum_skipna(op, skipna, input, exp): + df = pd.DataFrame(input) + result = df.groupby("key")["value"].transform(op, skipna=skipna) + if isinstance(exp, dict): + expected = exp[(op, skipna)] + else: + expected = exp + expected = pd.Series(expected, name="value") + tm.assert_series_equal(expected, result) + + +@pytest.mark.parametrize( + "op, args, targop", + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) +def test_cython_transform_frame(op, args, targop): + s = Series(np.random.randn(1000)) + s_missing = s.copy() + s_missing.iloc[2:10] = np.nan + labels = np.random.randint(0, 50, size=1000).astype(float) + strings = list("qwertyuiopasdfghjklz") + strings_missing = strings[:] + strings_missing[5] = np.nan + df = DataFrame( + { + "float": s, + "float_missing": s_missing, + "int": [1, 1, 1, 1, 2] * 200, + "datetime": pd.date_range("1990-1-1", periods=1000), + "timedelta": pd.timedelta_range(1, freq="s", periods=1000), + "string": strings * 50, + "string_missing": strings_missing * 50, + }, + columns=[ + "float", + "float_missing", + "int", + "datetime", + "timedelta", + "string", + "string_missing", + ], + ) + df["cat"] = df["string"].astype("category") + + df2 = df.copy() + df2.index = pd.MultiIndex.from_product([range(100), range(10)]) + + # DataFrame - Single and MultiIndex, + # group by values, index level, columns + for df in [df, df2]: + for gb_target in [ + dict(by=labels), + dict(level=0), + dict(by="string"), + ]: # dict(by='string_missing')]: + # dict(by=['int','string'])]: + + gb = df.groupby(**gb_target) + # whitelisted methods set the selection before applying + # bit a of hack to make sure the cythonized shift + # is equivalent to pre 0.17.1 behavior + if op == "shift": + gb._set_group_selection() + + if op != "shift" and "int" not in gb_target: + # numeric apply fastpath promotes dtype so have + # to apply separately and concat + i = gb[["int"]].apply(targop) + f = gb[["float", "float_missing"]].apply(targop) + expected = pd.concat([f, i], axis=1) + else: + expected = gb.apply(targop) + + expected = expected.sort_index(axis=1) + tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) + tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) + # individual columns + for c in df: + if c not in ["float", "int", "float_missing"] and op != "shift": + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): + gb[c].transform(op) + with pytest.raises(DataError, match=msg): + getattr(gb[c], op)() + else: + expected = gb[c].apply(targop) + expected.name = c + tm.assert_series_equal(expected, gb[c].transform(op, *args)) + tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) + + +def test_transform_with_non_scalar_group(): + # GH 10165 + cols = pd.MultiIndex.from_tuples( + [ + ("syn", "A"), + ("mis", "A"), + ("non", "A"), + ("syn", "C"), + ("mis", "C"), + ("non", "C"), + ("syn", "T"), + ("mis", "T"), + ("non", "T"), + ("syn", "G"), + ("mis", "G"), + ("non", "G"), + ] + ) + df = pd.DataFrame( + np.random.randint(1, 10, (4, 12)), columns=cols, index=["A", "C", "G", "T"] + ) + + msg = "transform must return a scalar value for each group.*" + with pytest.raises(ValueError, match=msg): + df.groupby(axis=1, level=1).transform(lambda z: z.div(z.sum(axis=1), axis=0)) + + 
+@pytest.mark.parametrize( + "cols,exp,comp_func", + [ + ("a", pd.Series([1, 1, 1], name="a"), tm.assert_series_equal), + ( + ["a", "c"], + pd.DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), + tm.assert_frame_equal, + ), + ], +) +@pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) +def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): + if agg_func == "size" and isinstance(cols, list): + # https://github.com/pytest-dev/pytest/issues/6300 + # workaround to xfail fixture/param permutations + reason = "'size' transformation not supported with NDFrameGroupy" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + + # GH 19200 + df = pd.DataFrame( + {"a": pd.date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} + ) + + result = df.groupby("b")[cols].transform(agg_func) + + if agg_func == "rank": + exp = exp.astype("float") + + comp_func(result, exp) + + +@pytest.mark.parametrize("mix_groupings", [True, False]) +@pytest.mark.parametrize("as_series", [True, False]) +@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)]) +@pytest.mark.parametrize( + "fill_method,limit,exp_vals", + [ + ( + "ffill", + None, + [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"], + ), + ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]), + ( + "bfill", + None, + ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan], + ), + ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]), + ], +) +def test_group_fill_methods( + mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals +): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == "val1": + _exp_vals[index] = val1 + elif exp_val == "val2": + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ["a", "b"] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] + keys = ["a"] * len(vals) + ["b"] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({"key": keys, "val": vals}) + if as_series: + result = getattr(df.groupby("key")["val"], fill_method)(limit=limit) + exp = Series(_exp_vals, name="val") + tm.assert_series_equal(result, exp) + else: + result = getattr(df.groupby("key"), fill_method)(limit=limit) + exp = DataFrame({"val": _exp_vals}) + tm.assert_frame_equal(result, exp) + + +@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) +def test_pad_stable_sorting(fill_method): + # GH 21207 + x = [0] * 20 + y = [np.nan] * 10 + [1] * 10 + + if fill_method == "bfill": + y = y[::-1] + + df = pd.DataFrame({"x": x, "y": y}) + expected = df.drop("x", 1) + + result = getattr(df.groupby("x"), fill_method)() + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize( + "freq", + [ + None, + pytest.param( + "D", + marks=pytest.mark.xfail( + reason="GH#23918 before method uses freq in vectorized approach" + ), + ), + ], +) +@pytest.mark.parametrize("periods", [1, -1]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_pct_change(test_series, freq, periods, fill_method, limit): + # GH 21200, 21621, 30463 + vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] + keys = ["a", "b"] + key_v = np.repeat(keys, len(vals)) + df = DataFrame({"key": key_v, "vals": vals * 2}) + + df_g = df + if fill_method is not None: + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) + grp = df_g.groupby(df.key) + + expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 + + if test_series: + result = df.groupby("key")["vals"].pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + tm.assert_series_equal(result, expected) + else: + result = df.groupby("key").pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + tm.assert_frame_equal(result, expected.to_frame("vals")) + + +@pytest.mark.parametrize( + "func, expected_status", + [ + ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]), + ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]), + ], +) +def test_ffill_bfill_non_unique_multilevel(func, expected_status): + # GH 19437 + date = pd.to_datetime( + [ + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-02", + "2018-01-01", + "2018-01-02", + ] + ) + symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"] + status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan] + + df = DataFrame({"date": date, "symbol": symbol, "status": status}) + df = df.set_index(["date", "symbol"]) + result = getattr(df.groupby("symbol")["status"], func)() + + index = MultiIndex.from_tuples( + tuples=list(zip(*[date, symbol])), names=["date", "symbol"] + ) + expected = Series(expected_status, index=index, name="status") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", [np.any, np.all]) +def test_any_all_np_func(func): + # GH 20653 + df = pd.DataFrame( + [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"] + ) + + exp = pd.Series([True, np.nan, True], name="val") + + res = df.groupby("key")["val"].transform(func) + tm.assert_series_equal(res, exp) + + +def test_groupby_transform_rename(): + # https://github.com/pandas-dev/pandas/issues/23461 + def demean_rename(x): + result = x - x.mean() + + if isinstance(x, pd.Series): + return 
result + + result = result.rename(columns={c: "{c}_demeaned" for c in result.columns}) + + return result + + df = pd.DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) + expected = pd.DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) + + result = df.groupby("group").transform(demean_rename) + tm.assert_frame_equal(result, expected) + result_single = df.groupby("group").value.transform(demean_rename) + tm.assert_series_equal(result_single, expected["value"]) + + +@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) +def test_groupby_transform_timezone_column(func): + # GH 24198 + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + result = pd.DataFrame({"end_time": [ts], "id": [1]}) + result["max_end_time"] = result.groupby("id").end_time.transform(func) + expected = pd.DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), + ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]), + ], +) +def test_groupby_transform_with_datetimes(func, values): + # GH 15306 + dates = pd.date_range("1/1/2011", periods=10, freq="D") + + stocks = pd.DataFrame({"price": np.arange(10.0)}, index=dates) + stocks["week_id"] = pd.to_datetime(stocks.index).week + + result = stocks.groupby(stocks["week_id"])["price"].transform(func) + + expected = pd.Series(data=pd.to_datetime(values), index=dates, name="price") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"]) +def test_transform_absent_categories(func): + # GH 16771 + # cython transforms with more groups than rows + x_vals = [1] + x_cats = range(2) + y = [1] + df = DataFrame(dict(x=Categorical(x_vals, x_cats), y=y)) + result = getattr(df.y.groupby(df.x), func)() + expected = df.y + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"]) +@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))]) +def test_ffill_not_in_axis(func, key, val): + # GH 21521 + df = pd.DataFrame([[np.nan]]) + result = getattr(df.groupby(**{key: val}), func)() + expected = df + + tm.assert_frame_equal(result, expected) + + +def test_transform_invalid_name_raises(): + # GH#27486 + df = DataFrame(dict(a=[0, 1, 1, 2])) + g = df.groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + # method exists on the object, but is not a valid transformation/agg + assert hasattr(g, "aggregate") # make sure the method exists + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("aggregate") + + # Test SeriesGroupBy + g = df["a"].groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="not a valid function name"): + g.transform("some_arbitrary_name") + + +@pytest.mark.parametrize( + "obj", + [ + DataFrame( + dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"] + ), + Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]), + ], +) +def test_transform_agg_by_name(reduction_func, obj): + func = reduction_func + g = obj.groupby(np.repeat([0, 1], 3)) + + if func == "ngroup": # GH#27468 + pytest.xfail("TODO: g.transform('ngroup') doesn't work") + if func == "size": # GH#27469 + pytest.xfail("TODO: g.transform('size') doesn't work") + + args = {"nth": [0], "quantile": 
[0.5]}.get(func, []) + + result = g.transform(func, *args) + + # this is the *definition* of a transformation + tm.assert_index_equal(result.index, obj.index) + if hasattr(obj, "columns"): + tm.assert_index_equal(result.columns, obj.columns) + + # verify that values were broadcasted across each group + assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + + +def test_transform_lambda_with_datetimetz(): + # GH 27496 + df = DataFrame( + { + "time": [ + Timestamp("2010-07-15 03:14:45"), + Timestamp("2010-11-19 18:47:06"), + ], + "timezone": ["Etc/GMT+4", "US/Eastern"], + } + ) + result = df.groupby(["timezone"])["time"].transform( + lambda x: x.dt.tz_localize(x.name) + ) + expected = Series( + [ + Timestamp("2010-07-15 03:14:45", tz="Etc/GMT+4"), + Timestamp("2010-11-19 18:47:06", tz="US/Eastern"), + ], + name="time", + ) + tm.assert_series_equal(result, expected) + + +def test_transform_fastpath_raises(): + # GH#29631 case where fastpath defined in groupby.generic _choose_path + # raises, but slow_path does not + + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + + def func(grp): + # we want a function such that func(frame) fails but func.apply(frame) + # works + if grp.ndim == 2: + # Ensure that fast_path fails + raise NotImplementedError("Don't cross the streams") + return grp * 2 + + # Check that the fastpath raises, see _transform_general + obj = gb._obj_with_exclusions + gen = gb.grouper.get_iterator(obj, axis=gb.axis) + fast_path, slow_path = gb._define_paths(func) + _, group = next(gen) + + with pytest.raises(NotImplementedError, match="Don't cross the streams"): + fast_path(group) + + result = gb.transform(func) + + expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) + tm.assert_frame_equal(result, expected) + + +def test_transform_lambda_indexing(): + # GH 7883 + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_value_counts.py b/venv/Lib/site-packages/pandas/tests/groupby/test_value_counts.py new file mode 100644 index 0000000..c86cb45 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_value_counts.py @@ -0,0 +1,109 @@ +""" +these are systematically testing all of the args to value_counts +with different size combinations. 
This is to ensure stability of the sorting +and proper parameter handling +""" + +from itertools import product + +import numpy as np +import pytest + +from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime +import pandas._testing as tm + + +# our starting frame +def seed_df(seed_nans, n, m): + np.random.seed(1234) + days = date_range("2015-08-24", periods=10) + + frame = DataFrame( + { + "1st": np.random.choice(list("abcd"), n), + "2nd": np.random.choice(days, n), + "3rd": np.random.randint(1, m + 1, n), + } + ) + + if seed_nans: + frame.loc[1::11, "1st"] = np.nan + frame.loc[3::17, "2nd"] = np.nan + frame.loc[7::19, "3rd"] = np.nan + frame.loc[8::19, "3rd"] = np.nan + frame.loc[9::19, "3rd"] = np.nan + + return frame + + +# create input df, keys, and the bins +binned = [] +ids = [] +for seed_nans in [True, False]: + for n, m in product((100, 1000), (5, 20)): + + df = seed_df(seed_nans, n, m) + bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) + keys = "1st", "2nd", ["1st", "2nd"] + for k, b in product(keys, bins): + binned.append((df, k, b, n, m)) + ids.append(f"{k}-{n}-{m}") + + +@pytest.mark.slow +@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +@pytest.mark.parametrize("isort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_series_groupby_value_counts( + df, keys, bins, n, m, isort, normalize, sort, ascending, dropna +): + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + kwargs = dict( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna, bins=bins + ) + + gr = df.groupby(keys, sort=isort) + left = gr["3rd"].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) + + +def test_series_groupby_value_counts_with_grouper(): + # GH28479 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") + dfg = df.groupby(Grouper(freq="1D", key="Datetime")) + + # have to sort on index because of unstable sort on values xref GH9212 + result = dfg["Food"].value_counts().sort_index() + expected = dfg["Food"].apply(Series.value_counts).sort_index() + expected.index.names = result.index.names + + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/groupby/test_whitelist.py b/venv/Lib/site-packages/pandas/tests/groupby/test_whitelist.py new file mode 100644 index 0000000..8e387e9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/groupby/test_whitelist.py @@ -0,0 +1,436 @@ +""" +test methods relating to generic function evaluation +the so-called white/black lists +""" + +from string import ascii_lowercase + +import numpy as np +import pytest + +from pandas import 
DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm +from pandas.core.groupby.base import ( + groupby_other_methods, + reduction_kernels, + transformation_kernels, +) + +AGG_FUNCTIONS = [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "mad", + "std", + "var", + "sem", +] +AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"] + +df_whitelist = [ + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtypes", + "corrwith", + "corr", + "cov", + "diff", +] + + +@pytest.fixture(params=df_whitelist) +def df_whitelist_fixture(request): + return request.param + + +s_whitelist = [ + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtype", + "corr", + "cov", + "diff", + "unique", + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", +] + + +@pytest.fixture(params=s_whitelist) +def s_whitelist_fixture(request): + return request.param + + +@pytest.fixture +def mframe(): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + +@pytest.fixture +def df(): + return DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + + +@pytest.fixture +def df_letters(): + letters = np.array(list(ascii_lowercase)) + N = 10 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame( + { + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) + return df + + +@pytest.mark.parametrize("whitelist", [df_whitelist, s_whitelist]) +def test_groupby_whitelist(df_letters, whitelist): + df = df_letters + if whitelist == df_whitelist: + # dataframe + obj = df_letters + else: + obj = df_letters["floats"] + + gb = obj.groupby(df.letters) + + assert set(whitelist) == set(gb._apply_whitelist) + + +def check_whitelist(obj, df, m): + # check the obj for a particular whitelist m + + gb = obj.groupby(df.letters) + + f = getattr(type(gb), m) + + # name + try: + n = f.__name__ + except AttributeError: + return + assert n == m + + # qualname + try: + n = f.__qualname__ + except AttributeError: + return + assert n.endswith(m) + + +def test_groupby_series_whitelist(df_letters, s_whitelist_fixture): + m = s_whitelist_fixture + df = df_letters + check_whitelist(df.letters, df, m) + + +def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture): + m = df_whitelist_fixture + df = df_letters + check_whitelist(df, df, m) + + +@pytest.fixture +def raw_frame(): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + raw_frame = DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) + raw_frame.iloc[1, [1, 2]] = np.nan + raw_frame.iloc[7, [0, 1]] = np.nan + return raw_frame + + +@pytest.mark.parametrize("op", AGG_FUNCTIONS) +@pytest.mark.parametrize("level", [0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def 
test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort): + # GH6944 + # GH 17537 + # explicitly test the whitelist methods + + if axis == 0: + frame = raw_frame + else: + frame = raw_frame.T + + if op in AGG_FUNCTIONS_WITH_SKIPNA: + grouped = frame.groupby(level=level, axis=axis, sort=sort) + result = getattr(grouped, op)(skipna=skipna) + expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) + if sort: + expected = expected.sort_index(axis=axis, level=level) + tm.assert_frame_equal(result, expected) + else: + grouped = frame.groupby(level=level, axis=axis, sort=sort) + result = getattr(grouped, op)() + expected = getattr(frame, op)(level=level, axis=axis) + if sort: + expected = expected.sort_index(axis=axis, level=level) + tm.assert_frame_equal(result, expected) + + +def test_groupby_blacklist(df_letters): + df = df_letters + s = df_letters.floats + + blacklist = [ + "eval", + "query", + "abs", + "where", + "mask", + "align", + "groupby", + "clip", + "astype", + "at", + "combine", + "consolidate", + "convert_objects", + ] + to_methods = [method for method in dir(df) if method.startswith("to_")] + + blacklist.extend(to_methods) + + for bl in blacklist: + for obj in (df, s): + gb = obj.groupby(df.letters) + + # e.g., to_csv + defined_but_not_allowed = ( + f"(?:^Cannot.+{repr(bl)}.+'{type(gb).__name__}'.+try " + f"using the 'apply' method$)" + ) + + # e.g., query, eval + not_defined = ( + f"(?:^'{type(gb).__name__}' object has no attribute {repr(bl)}$)" + ) + + msg = f"{defined_but_not_allowed}|{not_defined}" + + with pytest.raises(AttributeError, match=msg): + getattr(gb, bl) + + +def test_tab_completion(mframe): + grp = mframe.groupby(level="second") + results = {v for v in dir(grp) if not v.startswith("_")} + expected = { + "A", + "B", + "C", + "agg", + "aggregate", + "apply", + "boxplot", + "filter", + "first", + "get_group", + "groups", + "hist", + "indices", + "last", + "max", + "mean", + "median", + "min", + "ngroups", + "nth", + "ohlc", + "plot", + "prod", + "size", + "std", + "sum", + "transform", + "var", + "sem", + "count", + "nunique", + "head", + "describe", + "cummax", + "quantile", + "rank", + "cumprod", + "tail", + "resample", + "cummin", + "fillna", + "cumsum", + "cumcount", + "ngroup", + "all", + "shift", + "skew", + "take", + "tshift", + "pct_change", + "any", + "mad", + "corr", + "corrwith", + "cov", + "dtypes", + "ndim", + "diff", + "idxmax", + "idxmin", + "ffill", + "bfill", + "pad", + "backfill", + "rolling", + "expanding", + "pipe", + } + assert results == expected + + +def test_groupby_function_rename(mframe): + grp = mframe.groupby(level="second") + for name in ["sum", "prod", "min", "max", "first", "last"]: + f = getattr(grp, name) + assert f.__name__ == name + + +def test_groupby_selection_with_methods(df): + # some methods which require DatetimeIndex + rng = date_range("2014", periods=len(df)) + df.index = rng + + g = df.groupby(["A"])[["C"]] + g_exp = df[["C"]].groupby(df["A"]) + # TODO check groupby with > 1 col ? + + # methods which are called as .foo() + methods = [ + "count", + "corr", + "cummax", + "cummin", + "cumprod", + "describe", + "rank", + "quantile", + "diff", + "shift", + "all", + "any", + "idxmin", + "idxmax", + "ffill", + "bfill", + "pct_change", + "tshift", + ] + + for m in methods: + res = getattr(g, m)() + exp = getattr(g_exp, m)() + + # should always be frames! 
+        tm.assert_frame_equal(res, exp)
+
+    # methods which aren't just .foo()
+    tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
+    tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
+    tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))
+
+    tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
+    tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())
+
+    tm.assert_frame_equal(
+        g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
+    )
+
+
+def test_all_methods_categorized(mframe):
+    grp = mframe.groupby(mframe.iloc[:, 0])
+    names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns)
+    new_names = set(names)
+    new_names -= reduction_kernels
+    new_names -= transformation_kernels
+    new_names -= groupby_other_methods
+
+    assert not (reduction_kernels & transformation_kernels)
+    assert not (reduction_kernels & groupby_other_methods)
+    assert not (transformation_kernels & groupby_other_methods)
+
+    # new public method?
+    if new_names:
+        msg = f"""
+There are uncategorized methods defined on the Grouper class:
+{names}.
+
+Was a new method recently added?
+
+Every public method on Grouper must appear in exactly one of the
+following three lists defined in pandas.core.groupby.base:
+- `reduction_kernels`
+- `transformation_kernels`
+- `groupby_other_methods`
+see the comments in pandas/core/groupby/base.py for guidance on
+how to fix this test.
+        """
+        raise AssertionError(msg)
+
+    # removed a public method?
+    all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
+    print(names)
+    print(all_categorized)
+    if not (names == all_categorized):
+        msg = f"""
+Some methods which are supposed to be on the Grouper class
+are missing:
+{all_categorized - names}.
+
+They're still defined in one of the lists that live in pandas/core/groupby/base.py.
+If you removed a method, you should update them +""" + raise AssertionError(msg) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/categorical/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/categorical/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_category.py b/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_category.py new file mode 100644 index 0000000..d870259 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_category.py @@ -0,0 +1,994 @@ +import numpy as np +import pytest + +import pandas._config.config as cf + +from pandas._libs import index as libindex + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, IntervalIndex +import pandas._testing as tm +from pandas.core.indexes.api import CategoricalIndex, Index + +from ..common import Base + + +class TestCategoricalIndex(Base): + _holder = CategoricalIndex + + @pytest.fixture + def indices(self, request): + return tm.makeCategoricalIndex(100) + + def create_index(self, categories=None, ordered=False): + if categories is None: + categories = list("cab") + return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) + + def test_can_hold_identifiers(self): + idx = self.create_index(categories=list("abcd")) + key = idx[0] + assert idx._can_hold_identifiers_and_holds_name(key) is True + + @pytest.mark.parametrize( + "func,op_name", + [ + (lambda idx: idx - idx, "__sub__"), + (lambda idx: idx + idx, "__add__"), + (lambda idx: idx - ["a", "b"], "__sub__"), + (lambda idx: idx + ["a", "b"], "__add__"), + (lambda idx: ["a", "b"] - idx, "__rsub__"), + (lambda idx: ["a", "b"] + idx, "__radd__"), + ], + ) + def test_disallow_addsub_ops(self, func, op_name): + # GH 10039 + # set ops (+/-) raise TypeError + idx = pd.Index(pd.Categorical(["a", "b"])) + msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + with pytest.raises(TypeError, match=msg): + func(idx) + + def test_method_delegation(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.set_categories(list("cab")) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cab")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.rename_categories(list("efg")) + tm.assert_index_equal( + result, CategoricalIndex(list("ffggef"), categories=list("efg")) + ) + + # GH18862 (let rename_categories take callables) + result = ci.rename_categories(lambda x: x.upper()) + tm.assert_index_equal( + result, CategoricalIndex(list("AABBCA"), categories=list("CAB")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.add_categories(["d"]) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cabd")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.remove_categories(["c"]) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabb") + [np.nan] + ["a"], categories=list("ab")), + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.as_unordered() + tm.assert_index_equal(result, ci) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.as_ordered() + 
tm.assert_index_equal( + result, + CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True), + ) + + # invalid + msg = "cannot use inplace with CategoricalIndex" + with pytest.raises(ValueError, match=msg): + ci.set_categories(list("cab"), inplace=True) + + def test_contains(self): + + ci = self.create_index(categories=list("cabdef")) + + assert "a" in ci + assert "z" not in ci + assert "e" not in ci + assert np.nan not in ci + + # assert codes NOT in index + assert 0 not in ci + assert 1 not in ci + + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) + assert np.nan in ci + + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + + def test_contains_list(self): + # GH#21729 + idx = pd.CategoricalIndex([1, 2, 3]) + + assert "a" not in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in idx + + def test_map(self): + ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) + result = ci.map(lambda x: x.lower()) + exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_index_equal(result, exp) + + ci = pd.CategoricalIndex( + list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" + ) + result = ci.map(lambda x: x.lower()) + exp = pd.CategoricalIndex( + list("ababc"), categories=list("bac"), ordered=False, name="XXX" + ) + tm.assert_index_equal(result, exp) + + # GH 12766: Return an index not an array + tm.assert_index_equal( + ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") + ) + + # change categories dtype + ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) + + def f(x): + return {"A": 10, "B": 20, "C": 30}.get(x) + + result = ci.map(f) + exp = pd.CategoricalIndex( + [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False + ) + tm.assert_index_equal(result, exp) + + result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) + tm.assert_index_equal(result, exp) + + result = ci.map({"A": 10, "B": 20, "C": 30}) + tm.assert_index_equal(result, exp) + + def test_map_with_categorical_series(self): + # GH 12756 + a = pd.Index([1, 2, 3, 4]) + b = pd.Series(["even", "odd", "even", "odd"], dtype="category") + c = pd.Series(["even", "odd", "even", "odd"]) + + exp = CategoricalIndex(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(b), exp) + exp = pd.Index(["odd", "even", "odd", np.nan]) + tm.assert_index_equal(a.map(c), exp) + + @pytest.mark.parametrize( + ("data", "f"), + ( + ([1, 1, np.nan], pd.isna), + ([1, 2, np.nan], pd.isna), + ([1, 1, np.nan], {1: False}), + ([1, 2, np.nan], {1: False, 2: False}), + ([1, 1, np.nan], pd.Series([False, False])), + ([1, 2, np.nan], pd.Series([False, False, False])), + ), + ) + def test_map_with_nan(self, data, f): # GH 24241 + values = pd.Categorical(data) + result = values.map(f) + if data[1] == 1: + expected = pd.Categorical([False, False, np.nan]) + tm.assert_categorical_equal(result, expected) + else: + expected = pd.Index([False, False, np.nan]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, 
np.array, pd.Series]) + def test_where(self, klass): + i = self.create_index() + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_append(self): + + ci = self.create_index() + categories = ci.categories + + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result, ci, exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result, ci, exact=True) + + # empty + result = ci.append([]) + tm.assert_index_equal(result, ci, exact=True) + + # appending with different categories or reordered is not ok + msg = "all inputs must be Index" + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.set_categories(list("abcd"))) + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.reorder_categories(list("abc"))) + + # with objects + result = ci.append(Index(["c", "a"])) + expected = CategoricalIndex(list("aabbcaca"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # invalid objects + msg = "cannot append a non-category item to a CategoricalIndex" + with pytest.raises(TypeError, match=msg): + ci.append(Index(["a", "d"])) + + # GH14298 - if base object is not categorical -> coerce to object + result = Index(["c", "a"]).append(ci) + expected = Index(list("caaabbca")) + tm.assert_index_equal(result, expected, exact=True) + + def test_append_to_another(self): + # hits Index._concat_same_dtype + fst = Index(["a", "b"]) + snd = CategoricalIndex(["d", "e"]) + result = fst.append(snd) + expected = Index(["a", "b", "d", "e"]) + tm.assert_index_equal(result, expected) + + def test_insert(self): + + ci = self.create_index() + categories = ci.categories + + # test 0th element + result = ci.insert(0, "a") + expected = CategoricalIndex(list("aaabbca"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # test Nth element that follows Python list behavior + result = ci.insert(-1, "a") + expected = CategoricalIndex(list("aabbcaa"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # test empty + result = CategoricalIndex(categories=categories).insert(0, "a") + expected = CategoricalIndex(["a"], categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + # invalid + msg = ( + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" + ) + with pytest.raises(TypeError, match=msg): + ci.insert(0, "d") + + # GH 18295 (test missing) + expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) + for na in (np.nan, pd.NaT, None): + result = CategoricalIndex(list("aabcb")).insert(1, na) + tm.assert_index_equal(result, expected) + + def test_delete(self): + + ci = self.create_index() + categories = ci.categories + + result = ci.delete(0) + expected = CategoricalIndex(list("abbca"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + result = ci.delete(-1) + expected = CategoricalIndex(list("aabbc"), categories=categories) + tm.assert_index_equal(result, expected, exact=True) + + with pytest.raises((IndexError, ValueError)): + # Either depending on NumPy version + ci.delete(10) + + def test_astype(self): + + ci = self.create_index() + result = ci.astype(object) + 
tm.assert_index_equal(result, Index(np.array(ci))) + + # this IS equal, but not the same class + assert result.equals(ci) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") + + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) + + result = ci.astype("interval") + expected = ii.take([0, 1, -1]) + tm.assert_index_equal(result, expected) + + result = IntervalIndex(result.values) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) + def test_astype_category(self, name, dtype_ordered, index_ordered): + # GH 18630 + index = self.create_index(ordered=index_ordered) + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = index.astype("category") + expected = index + tm.assert_index_equal(result, expected) + + def test_reindex_base(self): + # Determined by cat ordering. + idx = CategoricalIndex(list("cab"), categories=list("cab")) + expected = np.arange(len(idx), dtype=np.intp) + + actual = idx.get_indexer(idx) + tm.assert_numpy_array_equal(expected, actual) + + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") + + def test_reindexing(self): + np.random.seed(123456789) + + ci = self.create_index() + oidx = Index(np.array(ci)) + + for n in [1, 2, 5, len(ci)]: + finder = oidx[np.random.randint(0, len(ci), size=n)] + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + tm.assert_numpy_array_equal(expected, actual) + + # see gh-17323 + # + # Even when indexer is equal to the + # members in the index, we should + # respect duplicates instead of taking + # the fast-track path. 
+ for finder in [list("aabbca"), list("aababca")]: + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + tm.assert_numpy_array_equal(expected, actual) + + def test_reindex_dtype(self): + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(["a", "c"]) + tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(["a", "c"]) + exp = Index(["a", "a", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + + def test_reindex_duplicate_target(self): + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + def test_reindex_empty_index(self): + # See GH16770 + c = CategoricalIndex([]) + res, indexer = c.reindex(["a", "b"]) + tm.assert_index_equal(res, Index(["a", "b"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) + + @pytest.mark.parametrize( + "data, non_lexsorted_data", + [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], + ) + def test_is_monotonic(self, data, non_lexsorted_data): + c = CategoricalIndex(data) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(data, ordered=True) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(data, categories=reversed(data)) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is True + + c = CategoricalIndex(data, categories=reversed(data), ordered=True) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is True + + # test when data is neither monotonic increasing nor decreasing + reordered_data = [data[0], data[2], data[1]] + c = CategoricalIndex(reordered_data, categories=reversed(data)) + assert c.is_monotonic_increasing is False + assert c.is_monotonic_decreasing is False + + # non lexsorted categories + categories = non_lexsorted_data + + c = CategoricalIndex(categories[:2], categories=categories) + assert c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + c = CategoricalIndex(categories[1:3], categories=categories) + assert 
c.is_monotonic_increasing is True + assert c.is_monotonic_decreasing is False + + def test_has_duplicates(self): + + idx = CategoricalIndex([0, 0, 0], name="foo") + assert idx.is_unique is False + assert idx.has_duplicates is True + + def test_drop_duplicates(self): + + idx = CategoricalIndex([0, 0, 0], name="foo") + expected = CategoricalIndex([0], name="foo") + tm.assert_index_equal(idx.drop_duplicates(), expected) + tm.assert_index_equal(idx.unique(), expected) + + def test_get_indexer(self): + + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) + + for indexer in [idx2, list("abf"), Index(list("abf"))]: + r1 = idx1.get_indexer(idx2) + tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) + + msg = ( + "method='pad' and method='backfill' not implemented yet for " + "CategoricalIndex" + ) + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="pad") + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="backfill") + + msg = "method='nearest' not implemented yet for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method="nearest") + + def test_get_loc(self): + # GH 12531 + cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) + idx1 = Index(list("abcde")) + assert cidx1.get_loc("a") == idx1.get_loc("a") + assert cidx1.get_loc("e") == idx1.get_loc("e") + + for i in [cidx1, idx1]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique + cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) + idx2 = Index(list("aacded")) + + # results in bool array + res = cidx2.get_loc("d") + tm.assert_numpy_array_equal(res, idx2.get_loc("d")) + tm.assert_numpy_array_equal( + res, np.array([False, False, False, True, False, True]) + ) + # unique element results in scalar + res = cidx2.get_loc("e") + assert res == idx2.get_loc("e") + assert res == 4 + + for i in [cidx2, idx2]: + with pytest.raises(KeyError, match="'NOT-EXIST'"): + i.get_loc("NOT-EXIST") + + # non-unique, sliceable + cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) + idx3 = Index(list("aabbb")) + + # results in slice + res = cidx3.get_loc("a") + assert res == idx3.get_loc("a") + assert res == slice(0, 2, None) + + res = cidx3.get_loc("b") + assert res == idx3.get_loc("b") + assert res == slice(2, 5, None) + + for i in [cidx3, idx3]: + with pytest.raises(KeyError, match="'c'"): + i.get_loc("c") + + def test_repr_roundtrip(self): + + ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + str(ci) + tm.assert_index_equal(eval(repr(ci)), ci, exact=True) + + # formatting + str(ci) + + # long format + # this is not reprable + ci = CategoricalIndex(np.random.randint(0, 5, size=100)) + str(ci) + + def test_isin(self): + + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + tm.assert_numpy_array_equal( + ci.isin(["c"]), np.array([False, False, False, True, False, False]) + ) + tm.assert_numpy_array_equal( + ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False]) + ) + tm.assert_numpy_array_equal( + ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6) + ) + + # mismatched categorical -> coerced to ndarray so doesn't matter + result = ci.isin(ci.set_categories(list("abcdefghi"))) + expected = np.array([True] * 6) + tm.assert_numpy_array_equal(result, expected) + + result = ci.isin(ci.set_categories(list("defghi"))) + expected = np.array([False] * 5 + 
[True]) + tm.assert_numpy_array_equal(result, expected) + + def test_identical(self): + + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) + assert ci1.identical(ci1) + assert ci1.identical(ci1.copy()) + assert not ci1.identical(ci2) + + def test_ensure_copied_data(self, indices): + # gh-12309: Check the "copy" argument of each + # Index.__new__ is honored. + # + # Must be tested separately from other indexes because + # self.values is not an ndarray. + # GH#29918 Index.base has been removed + # FIXME: is this test still meaningful? + _base = lambda ar: ar if getattr(ar, "base", None) is None else ar.base + + result = CategoricalIndex(indices.values, copy=True) + tm.assert_index_equal(indices, result) + assert _base(indices.values) is not _base(result.values) + + result = CategoricalIndex(indices.values, copy=False) + assert _base(indices.values) is _base(result.values) + + def test_equals_categorical(self): + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) + + assert ci1.equals(ci1) + assert not ci1.equals(ci2) + assert ci1.equals(ci1.astype(object)) + assert ci1.astype(object).equals(ci1) + + assert (ci1 == ci1).all() + assert not (ci1 != ci1).all() + assert not (ci1 > ci1).all() + assert not (ci1 < ci1).all() + assert (ci1 <= ci1).all() + assert (ci1 >= ci1).all() + + assert not (ci1 == 1).all() + assert (ci1 == Index(["a", "b"])).all() + assert (ci1 == ci1.values).all() + + # invalid comparisons + with pytest.raises(ValueError, match="Lengths must match"): + ci1 == Index(["a", "b", "c"]) + + msg = ( + "categorical index comparisons must have the same categories " + "and ordered attributes" + "|" + "Categoricals can only be compared if 'categories' are the same. 
" + "Categories are different lengths" + "|" + "Categoricals can only be compared if 'ordered' is the same" + ) + with pytest.raises(TypeError, match=msg): + ci1 == ci2 + with pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, ordered=False) + with pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, categories=list("abc")) + + # tests + # make sure that we are testing for category inclusion properly + ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + # Same categories, but different order + # Unordered + assert ci.equals(CategoricalIndex(list("aabca"))) + # Ordered + assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) + assert ci.equals(ci.copy()) + + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + assert not ci.equals(CategoricalIndex(list("aabca"))) + assert ci.equals(ci.copy()) + + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca") + [np.nan]) + assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) + assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) + assert ci.equals(ci.copy()) + + def test_equals_categoridcal_unordered(self): + # https://github.com/pandas-dev/pandas/issues/16603 + a = pd.CategoricalIndex(["A"], categories=["A", "B"]) + b = pd.CategoricalIndex(["A"], categories=["B", "A"]) + c = pd.CategoricalIndex(["C"], categories=["B", "A"]) + assert a.equals(b) + assert not a.equals(c) + assert not b.equals(c) + + def test_frame_repr(self): + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"])) + result = repr(df) + expected = " A\na 1\nb 2\nc 3" + assert result == expected + + def test_string_categorical_index_repr(self): + # short + idx = pd.CategoricalIndex(["a", "bb", "ccc"]) + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... 
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("abcdefghijklmmo")) + expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'm', 'o'], + categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # short + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', + 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # Emable Unicode option ----------------------------------------- + with cf.option_context("display.unicode.east_asian_width", True): + + # short + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', + ... 
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', + 'さ', 'し', 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + def test_fillna_categorical(self): + # GH 11343 + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") + # fill by value in categories + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") + tm.assert_index_equal(idx.fillna(1.0), exp) + + # fill by value not in categories raises ValueError + msg = "fill value must be in categories" + with pytest.raises(ValueError, match=msg): + idx.fillna(2.0) + + def test_take_fill_value(self): + # GH 12631 + + # numeric category + idx = pd.CategoricalIndex([1, 2, 3], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # object category + idx = pd.CategoricalIndex( + list("CBA"), categories=list("ABC"), ordered=True, name="xxx" + ) + result = idx.take(np.array([1, 0, -1])) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.CategoricalIndex( + ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) + tm.assert_index_equal(result, expected) + tm.assert_categorical_equal(result.values, expected.values) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_fill_value_datetime(self): + + # datetime category + idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") + idx = pd.CategoricalIndex(idx) + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = pd.CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), 
fill_value=True) + expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) + expected = pd.CategoricalIndex(expected, categories=exp_cats) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) + expected = pd.CategoricalIndex(expected) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_invalid_kwargs(self): + idx = pd.CategoricalIndex([1, 2, 3], name="foo") + indices = [1, 0, -1] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + @pytest.mark.parametrize( + "dtype, engine_type", + [ + (np.int8, libindex.Int8Engine), + (np.int16, libindex.Int16Engine), + (np.int32, libindex.Int32Engine), + (np.int64, libindex.Int64Engine), + ], + ) + def test_engine_type(self, dtype, engine_type): + if dtype != np.int64: + # num. of uniques required to push CategoricalIndex.codes to a + # dtype (128 categories required for .codes dtype to be int16 etc.) 
+ num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype] + ci = pd.CategoricalIndex(range(num_uniques)) + else: + # having 2**32 - 2**31 categories would be very memory-intensive, + # so we cheat a bit with the dtype + ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) + ci.values._codes = ci.values._codes.astype("int64") + assert np.issubdtype(ci.codes.dtype, dtype) + assert isinstance(ci._engine, engine_type) + + @pytest.mark.parametrize( + "data, categories", + [ + (list("abcbca"), list("cab")), + (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)), + ], + ids=["string", "interval"], + ) + def test_map_str(self, data, categories, ordered_fixture): + # GH 31202 - override base class since we want to maintain categorical/ordered + index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture) + result = index.map(str) + expected = CategoricalIndex( + map(str, data), categories=map(str, categories), ordered=ordered_fixture + ) + tm.assert_index_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_constructors.py new file mode 100644 index 0000000..1df0874 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/categorical/test_constructors.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from pandas import Categorical, CategoricalDtype, CategoricalIndex, Index +import pandas._testing as tm + + +class TestCategoricalIndexConstructors: + def test_construction(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + result = Index(ci.values) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + # empty + result = CategoricalIndex(categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) + assert not result.ordered + + # passing categories + result = CategoricalIndex(list("aabbca"), categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + + c = Categorical(list("aabbca")) + result = CategoricalIndex(c) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(c, categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + ci = CategoricalIndex(c, categories=list("abcd")) + result = CategoricalIndex(ci) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, 
np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) + tm.assert_index_equal(result, expected, exact=True) + + # turn me to an Index + result = Index(np.array(ci)) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = CategoricalIndex(list("aabbca"), categories=list("abc"), ordered=False) + + result = Index(np.array(ci), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + result = Index(np.array(ci).tolist(), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + # these are generally only equal when the categories are reordered + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) + tm.assert_index_equal(result, ci, exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_construction_empty_with_bool_categories(self): + # see GH#22702 + cat = CategoricalIndex([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH#18109 + data, cats, ordered = "a a b b".split(), "c b a".split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(data, categories=cats, ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # GH#19032 + result = Index(data, dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + # error when combining categories/ordered and dtype kwargs + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, ordered=ordered, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, ordered=ordered, dtype=dtype) + + def test_create_categorical(self): + # GH#17513 The public CI constructor doesn't hit this code path with + # instances of CategoricalIndex, but we still want to test the code + ci = CategoricalIndex(["a", "b", "c"]) + # First ci is self, second ci is data. 
+ result = CategoricalIndex._create_categorical(ci, ci) + expected = Categorical(["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/common.py b/venv/Lib/site-packages/pandas/tests/indexes/common.py new file mode 100644 index 0000000..cbffb9d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/common.py @@ -0,0 +1,892 @@ +import gc +from typing import Optional, Type + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + CategoricalIndex, + DatetimeIndex, + Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + isna, +) +import pandas._testing as tm +from pandas.core.indexes.base import InvalidIndexError +from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin + + +class Base: + """ base class for index sub-class tests """ + + _holder: Optional[Type[Index]] = None + _compat_props = ["shape", "ndim", "size", "nbytes"] + + def test_pickle_compat_construction(self): + # need an object to create with + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed|" + r"__new__\(\) missing 1 required positional argument: 'data'|" + r"__new__\(\) takes at least 2 arguments \(1 given\)" + ) + with pytest.raises(TypeError, match=msg): + self._holder() + + def test_to_series(self): + # assert that we are creating a copy of the index + + idx = self.create_index() + s = idx.to_series() + assert s.values is not idx.values + assert s.index is not idx + assert s.name == idx.name + + def test_to_series_with_arguments(self): + # GH18699 + + # index kwarg + idx = self.create_index() + s = idx.to_series(index=idx) + + assert s.values is not idx.values + assert s.index is idx + assert s.name == idx.name + + # name kwarg + idx = self.create_index() + s = idx.to_series(name="__test") + + assert s.values is not idx.values + assert s.index is not idx + assert s.name != idx.name + + @pytest.mark.parametrize("name", [None, "new_name"]) + def test_to_frame(self, name): + # see GH-15230, GH-22580 + idx = self.create_index() + + if name: + idx_name = name + else: + idx_name = idx.name or 0 + + df = idx.to_frame(name=idx_name) + + assert df.index is idx + assert len(df.columns) == 1 + assert df.columns[0] == idx_name + assert df[idx_name].values is not idx.values + + df = idx.to_frame(index=False, name=idx_name) + assert df.index is not idx + + def test_shift(self): + + # GH8083 test the base class for shift + idx = self.create_index() + msg = "Not supported for type {}".format(type(idx).__name__) + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1) + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1, 2) + + def test_constructor_name_unhashable(self): + # GH#29069 check that name is hashable + # See also same-named test in tests.series.test_constructors + idx = self.create_index() + with pytest.raises(TypeError, match="Index.name must be a hashable type"): + type(idx)(idx, name=[]) + + def test_create_index_existing_name(self): + + # GH11193, when an existing index is passed, and a new name is not + # specified, the new index should inherit the previous object name + expected = self.create_index() + if not isinstance(expected, MultiIndex): + expected.name = "foo" + result = pd.Index(expected) + tm.assert_index_equal(result, expected) + + result = pd.Index(expected, 
name="bar") + expected.name = "bar" + tm.assert_index_equal(result, expected) + else: + expected.names = ["foo", "bar"] + result = pd.Index(expected) + tm.assert_index_equal( + result, + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["foo", "bar"], + ), + ) + + result = pd.Index(expected, names=["A", "B"]) + tm.assert_index_equal( + result, + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["A", "B"], + ), + ) + + def test_numeric_compat(self): + + idx = self.create_index() + with pytest.raises(TypeError, match="cannot perform __mul__"): + idx * 1 + with pytest.raises(TypeError, match="cannot perform __rmul__"): + 1 * idx + + div_err = "cannot perform __truediv__" + with pytest.raises(TypeError, match=div_err): + idx / 1 + + div_err = div_err.replace(" __", " __r") + with pytest.raises(TypeError, match=div_err): + 1 / idx + with pytest.raises(TypeError, match="cannot perform __floordiv__"): + idx // 1 + with pytest.raises(TypeError, match="cannot perform __rfloordiv__"): + 1 // idx + + def test_logical_compat(self): + idx = self.create_index() + with pytest.raises(TypeError, match="cannot perform all"): + idx.all() + with pytest.raises(TypeError, match="cannot perform any"): + idx.any() + + def test_boolean_context_compat(self): + + # boolean context compat + idx = self.create_index() + + with pytest.raises(ValueError, match="The truth value of a"): + if idx: + pass + + def test_reindex_base(self): + idx = self.create_index() + expected = np.arange(idx.size, dtype=np.intp) + + actual = idx.get_indexer(idx) + tm.assert_numpy_array_equal(expected, actual) + + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") + + def test_get_indexer_consistency(self, indices): + # See GH 16819 + if isinstance(indices, IntervalIndex): + return + + if indices.is_unique or isinstance(indices, CategoricalIndex): + indexer = indices.get_indexer(indices[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp + else: + e = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=e): + indices.get_indexer(indices[0:2]) + + indexer, _ = indices.get_indexer_non_unique(indices[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp + + def test_ndarray_compat_properties(self): + idx = self.create_index() + assert idx.T.equals(idx) + assert idx.transpose().equals(idx) + + values = idx.values + for prop in self._compat_props: + assert getattr(idx, prop) == getattr(values, prop) + + # test for validity + idx.nbytes + idx.values.nbytes + + def test_repr_roundtrip(self): + + idx = self.create_index() + tm.assert_index_equal(eval(repr(idx)), idx) + + def test_str(self): + + # test the string repr + idx = self.create_index() + idx.name = "foo" + assert "'foo'" in str(idx) + assert type(idx).__name__ in str(idx) + + def test_repr_max_seq_item_setting(self): + # GH10182 + idx = self.create_index() + idx = idx.repeat(50) + with pd.option_context("display.max_seq_items", None): + repr(idx) + assert "..." not in str(idx) + + def test_copy_name(self, indices): + # gh-12309: Check that the "name" argument + # passed at initialization is honored. 
+ if isinstance(indices, MultiIndex): + return + + first = type(indices)(indices, copy=True, name="mario") + second = type(first)(first, copy=False) + + # Even though "copy=False", we want a new object. + assert first is not second + + # Not using tm.assert_index_equal() since names differ. + assert indices.equals(first) + + assert first.name == "mario" + assert second.name == "mario" + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + + if not isinstance(indices, CategoricalIndex): + # See gh-13365 + s3 = s1 * s2 + assert s3.index.name == "mario" + + def test_ensure_copied_data(self, indices): + # Check the "copy" argument of each Index.__new__ is honoured + # GH12309 + init_kwargs = {} + if isinstance(indices, PeriodIndex): + # Needs "freq" specification: + init_kwargs["freq"] = indices.freq + elif isinstance(indices, (RangeIndex, MultiIndex, CategoricalIndex)): + # RangeIndex cannot be initialized from data + # MultiIndex and CategoricalIndex are tested separately + return + + index_type = type(indices) + result = index_type(indices.values, copy=True, **init_kwargs) + tm.assert_index_equal(indices, result) + tm.assert_numpy_array_equal( + indices._ndarray_values, result._ndarray_values, check_same="copy" + ) + + if isinstance(indices, PeriodIndex): + # .values an object array of Period, thus copied + result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) + tm.assert_numpy_array_equal( + indices._ndarray_values, result._ndarray_values, check_same="same" + ) + elif isinstance(indices, IntervalIndex): + # checked in test_interval.py + pass + else: + result = index_type(indices.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal( + indices.values, result.values, check_same="same" + ) + tm.assert_numpy_array_equal( + indices._ndarray_values, result._ndarray_values, check_same="same" + ) + + def test_memory_usage(self, indices): + indices._engine.clear_mapping() + result = indices.memory_usage() + if indices.empty: + # we report 0 for no-length + assert result == 0 + return + + # non-zero length + indices.get_loc(indices[0]) + result2 = indices.memory_usage() + result3 = indices.memory_usage(deep=True) + + # RangeIndex, IntervalIndex + # don't have engines + if not isinstance(indices, (RangeIndex, IntervalIndex)): + assert result2 > result + + if indices.inferred_type == "object": + assert result3 > result2 + + def test_argsort(self, request, indices): + # separately tested + if isinstance(indices, CategoricalIndex): + return + + result = indices.argsort() + expected = np.array(indices).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self, indices): + result = np.argsort(indices) + expected = indices.argsort() + tm.assert_numpy_array_equal(result, expected) + + # these are the only two types that perform + # pandas compatibility input validation - the + # rest already perform separate (or no) such + # validation via their 'values' attribute as + # defined in pandas.core.indexes/base.py - they + # cannot be changed at the moment due to + # backwards compatibility concerns + if isinstance(type(indices), (CategoricalIndex, RangeIndex)): + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(indices, axis=1) + + msg = "the 'kind' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(indices, kind="mergesort") + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + 
np.argsort(indices, order=("a", "b")) + + def test_take(self, indices): + indexer = [4, 3, 0, 2] + if len(indices) < 5: + # not enough elements; ignore + return + + result = indices.take(indexer) + expected = indices[indexer] + assert result.equals(expected) + + if not isinstance(indices, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # GH 10791 + with pytest.raises(AttributeError): + indices.freq + + def test_take_invalid_kwargs(self): + idx = self.create_index() + indices = [1, 2] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + def test_repeat(self): + rep = 2 + i = self.create_index() + expected = pd.Index(i.values.repeat(rep), name=i.name) + tm.assert_index_equal(i.repeat(rep), expected) + + i = self.create_index() + rep = np.arange(len(i)) + expected = pd.Index(i.values.repeat(rep), name=i.name) + tm.assert_index_equal(i.repeat(rep), expected) + + def test_numpy_repeat(self): + rep = 2 + i = self.create_index() + expected = i.repeat(rep) + tm.assert_index_equal(np.repeat(i, rep), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(i, rep, axis=0) + + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) + def test_where(self, klass): + i = self.create_index() + + cond = [True] * len(i) + result = i.where(klass(cond)) + expected = i + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * len(i[1:]) + expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("case", [0.5, "xxx"]) + @pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] + ) + def test_set_ops_error_cases(self, case, method, indices): + # non-iterable input + msg = "Input must be Index or array-like" + with pytest.raises(TypeError, match=msg): + getattr(indices, method)(case) + + def test_intersection_base(self, indices): + if isinstance(indices, CategoricalIndex): + return + + first = indices[:5] + second = indices[:3] + intersect = first.intersection(second) + assert tm.equalContents(intersect, second) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + assert tm.equalContents(result, second) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.intersection([1, 2, 3]) + + def test_union_base(self, indices): + first = indices[3:] + second = indices[:5] + everything = indices + union = first.union(second) + assert tm.equalContents(union, everything) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if not isinstance(indices, CategoricalIndex): + result = first.union(case) + assert tm.equalContents(result, everything) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.union([1, 2, 3]) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_base(self, sort, indices): + if 
isinstance(indices, CategoricalIndex): + return + + first = indices[2:] + second = indices[:4] + answer = indices[4:] + result = first.difference(second, sort) + assert tm.equalContents(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + if isinstance(indices, (DatetimeIndex, TimedeltaIndex)): + assert type(result) == type(answer) + tm.assert_numpy_array_equal( + result.sort_values().asi8, answer.sort_values().asi8 + ) + else: + result = first.difference(case, sort) + assert tm.equalContents(result, answer) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.difference([1, 2, 3], sort) + + def test_symmetric_difference(self, indices): + if isinstance(indices, CategoricalIndex): + return + + first = indices[1:] + second = indices[:-1] + answer = indices[[0, -1]] + result = first.symmetric_difference(second) + assert tm.equalContents(result, answer) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.symmetric_difference(case) + assert tm.equalContents(result, answer) + + if isinstance(indices, MultiIndex): + msg = "other must be a MultiIndex or a list of tuples" + with pytest.raises(TypeError, match=msg): + first.symmetric_difference([1, 2, 3]) + + def test_insert_base(self, indices): + result = indices[1:4] + + if not len(indices): + return + + # test 0th element + assert indices[0:4].equals(result.insert(0, indices[0])) + + def test_delete_base(self, indices): + if not len(indices): + return + + if isinstance(indices, RangeIndex): + # tested in class + return + + expected = indices[1:] + result = indices.delete(0) + assert result.equals(expected) + assert result.name == expected.name + + expected = indices[:-1] + result = indices.delete(-1) + assert result.equals(expected) + assert result.name == expected.name + + with pytest.raises((IndexError, ValueError)): + # either depending on numpy version + indices.delete(len(indices)) + + def test_equals(self, indices): + if isinstance(indices, IntervalIndex): + # IntervalIndex tested separately + return + + assert indices.equals(indices) + assert indices.equals(indices.copy()) + assert indices.equals(indices.astype(object)) + + assert not indices.equals(list(indices)) + assert not indices.equals(np.array(indices)) + + # Cannot pass in non-int64 dtype to RangeIndex + if not isinstance(indices, RangeIndex): + same_values = Index(indices, dtype=object) + assert indices.equals(same_values) + assert same_values.equals(indices) + + if indices.nlevels == 1: + # do not test MultiIndex + assert not indices.equals(Series(indices)) + + def test_equals_op(self): + # GH9947, GH10637 + index_a = self.create_index() + if isinstance(index_a, PeriodIndex): + pytest.skip("Skip check for PeriodIndex") + + n = len(index_a) + index_b = index_a[0:-1] + index_c = index_a[0:-1].append(index_a[-2:-1]) + index_d = index_a[0:1] + + msg = "Lengths must match|could not be broadcast" + with pytest.raises(ValueError, match=msg): + index_a == index_b + expected1 = np.array([True] * n) + expected2 = np.array([True] * (n - 1) + [False]) + tm.assert_numpy_array_equal(index_a == index_a, expected1) + tm.assert_numpy_array_equal(index_a == index_c, expected2) + + # test comparisons with numpy arrays + array_a = np.array(index_a) + array_b = np.array(index_a[0:-1]) + array_c = np.array(index_a[0:-1].append(index_a[-2:-1])) + array_d = 
np.array(index_a[0:1]) + with pytest.raises(ValueError, match=msg): + index_a == array_b + tm.assert_numpy_array_equal(index_a == array_a, expected1) + tm.assert_numpy_array_equal(index_a == array_c, expected2) + + # test comparisons with Series + series_a = Series(array_a) + series_b = Series(array_b) + series_c = Series(array_c) + series_d = Series(array_d) + with pytest.raises(ValueError, match=msg): + index_a == series_b + + tm.assert_numpy_array_equal(index_a == series_a, expected1) + tm.assert_numpy_array_equal(index_a == series_c, expected2) + + # cases where length is 1 for one of them + with pytest.raises(ValueError, match="Lengths must match"): + index_a == index_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == array_d + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + series_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + series_a == array_d + + # comparing with a scalar should broadcast; note that we are excluding + # MultiIndex because in this case each item in the index is a tuple of + # length 2, and therefore is considered an array of length 2 in the + # comparison instead of a scalar + if not isinstance(index_a, MultiIndex): + expected3 = np.array([False] * (len(index_a) - 2) + [True, False]) + # assuming the 2nd to last item is unique in the data + item = index_a[-2] + tm.assert_numpy_array_equal(index_a == item, expected3) + tm.assert_series_equal(series_a == item, Series(expected3)) + + def test_hasnans_isnans(self, indices): + # GH 11343, added tests for hasnans / isnans + if isinstance(indices, MultiIndex): + return + + # cases in indices doesn't include NaN + idx = indices.copy(deep=True) + expected = np.array([False] * len(idx), dtype=bool) + tm.assert_numpy_array_equal(idx._isnan, expected) + assert idx.hasnans is False + + idx = indices.copy(deep=True) + values = np.asarray(idx.values) + + if len(indices) == 0: + return + elif isinstance(indices, DatetimeIndexOpsMixin): + values[1] = iNaT + elif isinstance(indices, (Int64Index, UInt64Index)): + return + else: + values[1] = np.nan + + if isinstance(indices, PeriodIndex): + idx = type(indices)(values, freq=indices.freq) + else: + idx = type(indices)(values) + + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + tm.assert_numpy_array_equal(idx._isnan, expected) + assert idx.hasnans is True + + def test_fillna(self, indices): + # GH 11343 + if len(indices) == 0: + pass + elif isinstance(indices, MultiIndex): + idx = indices.copy(deep=True) + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.fillna(idx[0]) + else: + idx = indices.copy(deep=True) + result = idx.fillna(idx[0]) + tm.assert_index_equal(result, idx) + assert result is not idx + + msg = "'value' must be a scalar, passed: " + with pytest.raises(TypeError, match=msg): + idx.fillna([idx[0]]) + + idx = indices.copy(deep=True) + values = np.asarray(idx.values) + + if isinstance(indices, DatetimeIndexOpsMixin): + values[1] = iNaT + elif isinstance(indices, (Int64Index, UInt64Index)): + return + else: + values[1] = np.nan + + if isinstance(indices, PeriodIndex): + idx = type(indices)(values, freq=indices.freq) + else: + idx = type(indices)(values) + + expected = np.array([False] * len(idx), dtype=bool) + expected[1] = True + tm.assert_numpy_array_equal(idx._isnan, expected) + assert 
idx.hasnans is True + + def test_nulls(self, indices): + # this is really a smoke test for the methods + # as these are adequately tested for function elsewhere + if len(indices) == 0: + tm.assert_numpy_array_equal(indices.isna(), np.array([], dtype=bool)) + elif isinstance(indices, MultiIndex): + idx = indices.copy() + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.isna() + elif not indices.hasnans: + tm.assert_numpy_array_equal( + indices.isna(), np.zeros(len(indices), dtype=bool) + ) + tm.assert_numpy_array_equal( + indices.notna(), np.ones(len(indices), dtype=bool) + ) + else: + result = isna(indices) + tm.assert_numpy_array_equal(indices.isna(), result) + tm.assert_numpy_array_equal(indices.notna(), ~result) + + def test_empty(self): + # GH 15270 + index = self.create_index() + assert not index.empty + assert index[:0].empty + + def test_join_self_unique(self, join_type): + index = self.create_index() + if index.is_unique: + joined = index.join(index, how=join_type) + assert (index == joined).all() + + def test_map(self): + # callable + index = self.create_index() + + # we don't infer UInt64 + if isinstance(index, pd.UInt64Index): + expected = index.astype("int64") + else: + expected = index + + result = index.map(lambda x: x) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index), + ], + ) + def test_map_dictlike(self, mapper): + + index = self.create_index() + if isinstance(index, (pd.CategoricalIndex, pd.IntervalIndex)): + pytest.skip("skipping tests for {}".format(type(index))) + + identity = mapper(index.values, index) + + # we don't infer to UInt64 for a dict + if isinstance(index, pd.UInt64Index) and isinstance(identity, dict): + expected = index.astype("int64") + else: + expected = index + + result = index.map(identity) + tm.assert_index_equal(result, expected) + + # empty mappable + expected = pd.Index([np.nan] * len(index)) + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) + + def test_map_str(self): + # GH 31202 + index = self.create_index() + result = index.map(str) + expected = Index([str(x) for x in index], dtype=object) + tm.assert_index_equal(result, expected) + + def test_putmask_with_wrong_mask(self): + # GH18368 + index = self.create_index() + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) + 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) - 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask("foo", 1) + + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("ordered", [True, False]) + def test_astype_category(self, copy, name, ordered): + # GH 18630 + index = self.create_index() + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.values, name=name, ordered=ordered) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.values, name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + result = 
index.astype("category", copy=copy) + expected = CategoricalIndex(index.values, name=name) + tm.assert_index_equal(result, expected) + + def test_is_unique(self): + # initialize a unique index + index = self.create_index().drop_duplicates() + assert index.is_unique is True + + # empty index should be unique + index_empty = index[:0] + assert index_empty.is_unique is True + + # test basic dupes + index_dup = index.insert(0, index[0]) + assert index_dup.is_unique is False + + # single NA should be unique + index_na = index.insert(0, np.nan) + assert index_na.is_unique is True + + # multiple NA should not be unique + index_na_dup = index_na.insert(0, np.nan) + assert index_na_dup.is_unique is False + + def test_engine_reference_cycle(self): + # GH27585 + index = self.create_index() + nrefs_pre = len(gc.get_referrers(index)) + index._engine + assert len(gc.get_referrers(index)) == nrefs_pre + + def test_getitem_2d_deprecated(self): + # GH#30588 + idx = self.create_index() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + res = idx[:, None] + + assert isinstance(res, np.ndarray), type(res) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/conftest.py b/venv/Lib/site-packages/pandas/tests/indexes/conftest.py new file mode 100644 index 0000000..e3e7ff4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/conftest.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.indexes.api import Index, MultiIndex + +indices_dict = { + "unicode": tm.makeUnicodeIndex(100), + "string": tm.makeStringIndex(100), + "datetime": tm.makeDateIndex(100), + "period": tm.makePeriodIndex(100), + "timedelta": tm.makeTimedeltaIndex(100), + "int": tm.makeIntIndex(100), + "uint": tm.makeUIntIndex(100), + "range": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "bool": Index([True, False]), + "categorical": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "empty": Index([]), + "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "repeats": Index([0, 0, 1, 1, 2, 2]), +} + + +@pytest.fixture(params=indices_dict.keys()) +def indices(request): + # copy to avoid mutation, e.g. 
setting .name + return indices_dict[request.param].copy() + + +@pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) +def one(request): + # zero-dim integer array behaves like an integer + return request.param + + +zeros = [ + box([0] * 5, dtype=dtype) + for box in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) +zeros.extend([0, 0.0]) + + +@pytest.fixture(params=zeros) +def zero(request): + # For testing division by (or of) zero for Index with length 5, this + # gives several scalar-zeros and length-5 vector-zeros + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimelike.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimelike.py new file mode 100644 index 0000000..3c72d34 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimelike.py @@ -0,0 +1,97 @@ +""" generic datetimelike tests """ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +from .common import Base + + +class DatetimeLike(Base): + def test_argmax_axis_invalid(self): + # GH#23081 + rng = self.create_index() + with pytest.raises(ValueError): + rng.argmax(axis=1) + with pytest.raises(ValueError): + rng.argmin(axis=2) + with pytest.raises(ValueError): + rng.min(axis=-2) + with pytest.raises(ValueError): + rng.max(axis=-3) + + def test_can_hold_identifiers(self): + idx = self.create_index() + key = idx[0] + assert idx._can_hold_identifiers_and_holds_name(key) is False + + def test_shift_identity(self): + + idx = self.create_index() + tm.assert_index_equal(idx, idx.shift(0)) + + def test_str(self): + + # test the string repr + idx = self.create_index() + idx.name = "foo" + assert not "length={}".format(len(idx)) in str(idx) + assert "'foo'" in str(idx) + assert type(idx).__name__ in str(idx) + + if hasattr(idx, "tz"): + if idx.tz is not None: + assert idx.tz in str(idx) + if hasattr(idx, "freq"): + assert "freq='{idx.freqstr}'".format(idx=idx) in str(idx) + + def test_view(self): + i = self.create_index() + + i_view = i.view("i8") + result = self._holder(i) + tm.assert_index_equal(result, i) + + i_view = i.view(self._holder) + result = self._holder(i) + tm.assert_index_equal(result, i_view) + + def test_map_callable(self): + index = self.create_index() + expected = index + index.freq + result = index.map(lambda x: x + x.freq) + tm.assert_index_equal(result, expected) + + # map to NaT + result = index.map(lambda x: pd.NaT if x == index[0] else x) + expected = pd.Index([pd.NaT] + index[1:].tolist()) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index, dtype=object), + ], + ) + def test_map_dictlike(self, mapper): + index = self.create_index() + expected = index + index.freq + + # don't compare the freqs + if isinstance(expected, pd.DatetimeIndex): + expected._data.freq = None + + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) + + expected = pd.Index([pd.NaT] + index[1:].tolist()) + result = index.map(mapper(expected, index)) + tm.assert_index_equal(result, expected) + + # empty map; these map to np.nan because we cannot know + # to re-infer things + expected = pd.Index([np.nan] * len(index)) + result = index.map(mapper([], [])) + tm.assert_index_equal(result, expected) diff --git 
a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_astype.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_astype.py new file mode 100644 index 0000000..6139726 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_astype.py @@ -0,0 +1,378 @@ +from datetime import datetime + +import dateutil +from dateutil.tz import tzlocal +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Int64Index, + NaT, + Period, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndex: + def test_astype(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timestamp("2016-05-16")] + [NaT] * 3, dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index( + [1463356800000000000] + [-9223372036854775808] * 3, dtype=np.int64 + ) + tm.assert_index_equal(result, expected) + + rng = date_range("1/1/2000", periods=10) + result = rng.astype("i8") + tm.assert_index_equal(result, Index(rng.asi8)) + tm.assert_numpy_array_equal(result.values, rng.asi8) + + def test_astype_uint(self): + arr = date_range("2000", periods=2) + expected = pd.UInt64Index( + np.array([946684800000000000, 946771200000000000], dtype="uint64") + ) + + tm.assert_index_equal(arr.astype("uint64"), expected) + tm.assert_index_equal(arr.astype("uint32"), expected) + + def test_astype_with_tz(self): + + # with tz + rng = date_range("1/1/2000", periods=10, tz="US/Eastern") + result = rng.astype("datetime64[ns]") + expected = ( + date_range("1/1/2000", periods=10, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ) + tm.assert_index_equal(result, expected) + + # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex + result = pd.Series(pd.date_range("2012-01-01", periods=3)).astype(str) + expected = pd.Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(pd.date_range("2012-01-01", periods=3, tz="US/Eastern")).astype( + str + ) + expected = Series( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + dtype=object, + ) + tm.assert_series_equal(result, expected) + + # GH 18951: tz-aware to tz-aware + idx = date_range("20170101", periods=4, tz="US/Pacific") + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101 03:00:00", periods=4, tz="US/Eastern") + tm.assert_index_equal(result, expected) + + # GH 18951: tz-naive to tz-aware + idx = date_range("20170101", periods=4) + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101", periods=4, tz="US/Eastern") + tm.assert_index_equal(result, expected) + + def test_astype_str_compat(self): + # GH 13149, GH 13209 + # verify that we are returning NaT as a string (and not unicode) + + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + result = idx.astype(str) + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + tm.assert_index_equal(result, expected) + + def test_astype_str(self): + # test astype string - #10442 + result = date_range("2012-01-01", periods=4, name="test_name").astype(str) + expected = Index( + 
["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], + name="test_name", + dtype=object, + ) + tm.assert_index_equal(result, expected) + + # test astype string with tz and name + result = date_range( + "2012-01-01", periods=3, name="test_name", tz="US/Eastern" + ).astype(str) + expected = Index( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + name="test_name", + dtype=object, + ) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and name + result = date_range("1/1/2011", periods=3, freq="H", name="test_name").astype( + str + ) + expected = Index( + ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], + name="test_name", + dtype=object, + ) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and timezone + result = date_range( + "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + ).astype(str) + expected = Index( + ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], + dtype=object, + name="test_name", + ) + tm.assert_index_equal(result, expected) + + def test_astype_datetime64(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + + result = idx.astype("datetime64[ns]") + tm.assert_index_equal(result, idx) + assert result is not idx + + result = idx.astype("datetime64[ns]", copy=False) + tm.assert_index_equal(result, idx) + assert result is idx + + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST") + result = idx_tz.astype("datetime64[ns]") + expected = DatetimeIndex( + ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], dtype="datetime64[ns]" + ) + tm.assert_index_equal(result, expected) + + def test_astype_object(self): + rng = date_range("1/1/2000", periods=20) + + casted = rng.astype("O") + exp_values = list(rng) + + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + assert casted.tolist() == exp_values + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_astype_object_tz(self, tz): + idx = pd.date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + expected_list = [ + Timestamp("2013-01-31", tz=tz), + Timestamp("2013-02-28", tz=tz), + Timestamp("2013-03-31", tz=tz), + Timestamp("2013-04-30", tz=tz), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = DatetimeIndex( + [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, datetime(2013, 1, 4)], + name="idx", + ) + expected_list = [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + pd.NaT, + Timestamp("2013-01-04"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + @pytest.mark.parametrize( + "dtype", + [float, "timedelta64", "timedelta64[ns]", "datetime64", "datetime64[D]"], + ) + def test_astype_raises(self, dtype): + # GH 13149, GH 13209 + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + msg = "Cannot cast DatetimeArray to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_index_convert_to_datetime_array(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert 
x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="US/Eastern") + rng_utc = date_range("20090415", "20090519", tz="utc") + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_explicit_pytz(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + rng_utc = date_range("20090415", "20090519", tz=pytz.utc) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_dateutil(self): + def _check_rng(rng): + converted = rng.to_pydatetime() + assert isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + assert isinstance(x, datetime) + assert x == stamp.to_pydatetime() + assert x.tzinfo == stamp.tzinfo + + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + rng_utc = date_range("20090415", "20090519", tz=dateutil.tz.tzutc()) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + @pytest.mark.parametrize( + "tz, dtype", + [["US/Pacific", "datetime64[ns, US/Pacific]"], [None, "datetime64[ns]"]], + ) + def test_integer_index_astype_datetime(self, tz, dtype): + # GH 20997, 20964, 24559 + val = [pd.Timestamp("2018-01-01", tz=tz).value] + result = pd.Index(val).astype(dtype) + expected = pd.DatetimeIndex(["2018-01-01"], tz=tz) + tm.assert_index_equal(result, expected) + + +class TestToPeriod: + def setup_method(self, method): + data = [ + Timestamp("2007-01-01 10:11:12.123456Z"), + Timestamp("2007-01-01 10:11:13.789123Z"), + ] + self.index = DatetimeIndex(data) + + def test_to_period_millisecond(self): + index = self.index + + with tm.assert_produces_warning(UserWarning): + # warning that timezone info will be lost + period = index.to_period(freq="L") + assert 2 == len(period) + assert period[0] == Period("2007-01-01 10:11:12.123Z", "L") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "L") + + def test_to_period_microsecond(self): + index = self.index + + with tm.assert_produces_warning(UserWarning): + # warning that timezone info will be lost + period = index.to_period(freq="U") + assert 2 == len(period) + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "U") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "U") + + @pytest.mark.parametrize( + "tz", + ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + ) + def test_to_period_tz(self, tz): + ts = date_range("1/1/2000", "2/1/2000", tz=tz) + + with tm.assert_produces_warning(UserWarning): + # GH#21333 warning that timezone info will be lost + result = ts.to_period()[0] + expected = ts[0].to_period() + + assert result == expected + + expected = date_range("1/1/2000", "2/1/2000").to_period() + + with tm.assert_produces_warning(UserWarning): + # GH#21333 warning that timezone info will be lost + result = ts.to_period() + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", ["Etc/GMT-1", "Etc/GMT+1"]) + def test_to_period_tz_utc_offset_consistency(self, tz): + # GH 22905 + ts = pd.date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") + with 
tm.assert_produces_warning(UserWarning): + result = ts.to_period()[0] + expected = ts[0].to_period() + assert result == expected + + def test_to_period_nofreq(self): + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) + with pytest.raises(ValueError): + idx.to_period() + + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer") + assert idx.freqstr == "D" + expected = pd.PeriodIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="D") + tm.assert_index_equal(idx.to_period(), expected) + + # GH 7606 + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) + assert idx.freqstr is None + tm.assert_index_equal(idx.to_period(), expected) + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_astype_category(self, tz): + obj = pd.date_range("2000", periods=2, tz=tz) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_astype_array_fallback(self, tz): + obj = pd.date_range("2000", periods=2, tz=tz) + result = obj.astype(bool) + expected = pd.Index(np.array([True, True])) + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_constructors.py new file mode 100644 index 0000000..ffe51dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_constructors.py @@ -0,0 +1,952 @@ +from datetime import datetime, timedelta +from functools import partial +from operator import attrgetter + +import dateutil +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import OutOfBoundsDatetime, conversion + +import pandas as pd +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets, to_datetime +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray, period_array + + +class TestDatetimeIndex: + @pytest.mark.parametrize("dt_cls", [DatetimeIndex, DatetimeArray._from_sequence]) + def test_freq_validation_with_nat(self, dt_cls): + # GH#11587 make sure we get a useful error message when generate_range + # raises + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + dt_cls([pd.NaT, pd.Timestamp("2011-01-01")], freq="D") + with pytest.raises(ValueError, match=msg): + dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D") + + # TODO: better place for tests shared by DTI/TDI? + @pytest.mark.parametrize( + "index", + [ + pd.date_range("2016-01-01", periods=5, tz="US/Pacific"), + pd.timedelta_range("1 Day", periods=5), + ], + ) + def test_shallow_copy_inherits_array_freq(self, index): + # If we pass a DTA/TDA to shallow_copy and dont specify a freq, + # we should inherit the array's freq, not our own. 
+ array = index._data + + arr = array[[0, 3, 2, 4, 1]] + assert arr.freq is None + + result = index._shallow_copy(arr) + assert result.freq is None + + def test_categorical_preserves_tz(self): + # GH#18664 retain tz when going DTI-->Categorical-->DTI + # TODO: parametrize over DatetimeIndex/DatetimeArray + # once CategoricalIndex(DTA) works + + dti = pd.DatetimeIndex( + [pd.NaT, "2015-01-01", "1999-04-06 15:14:13", "2015-01-01"], tz="US/Eastern" + ) + + ci = pd.CategoricalIndex(dti) + carr = pd.Categorical(dti) + cser = pd.Series(ci) + + for obj in [ci, carr, cser]: + result = pd.DatetimeIndex(obj) + tm.assert_index_equal(result, dti) + + def test_dti_with_period_data_raises(self): + # GH#23675 + data = pd.PeriodIndex(["2016Q1", "2016Q2"], freq="Q") + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + DatetimeIndex(data) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + to_datetime(data) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + DatetimeIndex(period_array(data)) + + with pytest.raises(TypeError, match="PeriodDtype data is invalid"): + to_datetime(period_array(data)) + + def test_dti_with_timedelta64_data_raises(self): + # GH#23675 deprecated, enforrced in GH#29794 + data = np.array([0], dtype="m8[ns]") + msg = r"timedelta64\[ns\] cannot be converted to datetime64" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(data) + + with pytest.raises(TypeError, match=msg): + to_datetime(data) + + with pytest.raises(TypeError, match=msg): + DatetimeIndex(pd.TimedeltaIndex(data)) + + with pytest.raises(TypeError, match=msg): + to_datetime(pd.TimedeltaIndex(data)) + + def test_construction_caching(self): + + df = pd.DataFrame( + { + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + assert df.dttz.dtype.tz.zone == "US/Eastern" + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) + def test_construction_with_alt(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + result = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(i, result) + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) + def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): + tz = tz_aware_fixture + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) + kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} + + if "tz" in kwargs: + result = DatetimeIndex(i.asi8, tz="UTC").tz_convert(kwargs["tz"]) + + expected = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(result, expected) + + # localize into the provided tz + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz="UTC") + expected = i.tz_localize(None).tz_localize("UTC") + tm.assert_index_equal(i2, expected) + + # incompat tz/dtype + msg = "cannot supply both a tz and a dtype with a tz" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") + + def test_construction_index_with_mixed_timezones(self): + # gh-11488: no tz results in DatetimeIndex + result = Index([Timestamp("2011-01-01"), 
Timestamp("2011-01-02")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # same tz results in DatetimeIndex + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # same tz results in DatetimeIndex (DST) + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # Different tz results in Index(dtype=object) + result = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + # length = 1 + result = Index([Timestamp("2011-01-01")], name="idx") + exp = DatetimeIndex([Timestamp("2011-01-01")], name="idx") + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # length = 1 with tz + result = Index([Timestamp("2011-01-01 10:00", tz="Asia/Tokyo")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00")], tz="Asia/Tokyo", name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + def test_construction_index_with_mixed_timezones_with_NaT(self): + # see gh-11488 + result = Index( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + exp = DatetimeIndex( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # Same tz results in DatetimeIndex + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00"), + ], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # same tz results 
in DatetimeIndex (DST) + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), pd.NaT, Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + # different tz results in Index(dtype=object) + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert not isinstance(result, DatetimeIndex) + + # all NaT + result = Index([pd.NaT, pd.NaT], name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], name="idx") + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is None + + # all NaT with tz + result = Index([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") + + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + assert result.tz is not None + assert result.tz == exp.tz + + def test_construction_dti_with_mixed_timezones(self): + # GH 11488 (not changed, added explicit tests) + + # no tz results in DatetimeIndex + result = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # same tz results in DatetimeIndex + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # same tz results in DatetimeIndex (DST) + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) + tm.assert_index_equal(result, exp, exact=True) + assert isinstance(result, DatetimeIndex) + + # tz mismatch affecting to tz-aware raises TypeError/ValueError + + with pytest.raises(ValueError): + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + + msg = "cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00"), + 
Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) + + with pytest.raises(ValueError): + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="US/Eastern", + name="idx", + ) + + with pytest.raises(ValueError, match=msg): + # passing tz should results in DatetimeIndex, then mismatch raises + # TypeError + Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) + + def test_construction_base_constructor(self): + arr = [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp("2011-01-03")] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + + def test_construction_outofbounds(self): + # GH 13663 + dates = [ + datetime(3000, 1, 1), + datetime(4000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + ] + exp = Index(dates, dtype=object) + # coerces to object + tm.assert_index_equal(Index(dates), exp) + + with pytest.raises(OutOfBoundsDatetime): + # can't create DatetimeIndex + DatetimeIndex(dates) + + def test_construction_with_ndarray(self): + # GH 5152 + dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] + data = DatetimeIndex(dates, freq=pd.offsets.BDay()).values + result = DatetimeIndex(data, freq=pd.offsets.BDay()) + expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") + tm.assert_index_equal(result, expected) + + def test_integer_values_and_tz_interpreted_as_utc(self): + # GH-24559 + val = np.datetime64("2000-01-01 00:00:00", "ns") + values = np.array([val.view("i8")]) + + result = DatetimeIndex(values).tz_localize("US/Central") + + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") + tm.assert_index_equal(result, expected) + + # but UTC is *not* deprecated. 
+ with tm.assert_produces_warning(None): + result = DatetimeIndex(values, tz="UTC") + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") + + def test_constructor_coverage(self): + rng = date_range("1/1/2000", periods=10.5) + exp = date_range("1/1/2000", periods=10) + tm.assert_index_equal(rng, exp) + + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + date_range(start="1/1/2000", periods="foo", freq="D") + + with pytest.raises(TypeError): + DatetimeIndex("1/1/2000") + + # generator expression + gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) + result = DatetimeIndex(gen) + expected = DatetimeIndex( + [datetime(2000, 1, 1) + timedelta(i) for i in range(10)] + ) + tm.assert_index_equal(result, expected) + + # NumPy string array + strings = np.array(["2000-01-01", "2000-01-02", "2000-01-03"]) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype("O")) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # string with NaT + strings = np.array(["2000-01-01", "2000-01-02", "NaT"]) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype("O")) + tm.assert_index_equal(result, expected) + + from_ints = DatetimeIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # non-conforming + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") + + msg = ( + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" + ) + with pytest.raises(ValueError, match=msg): + date_range(start="2011-01-01", freq="b") + with pytest.raises(ValueError, match=msg): + date_range(end="2011-01-01", freq="B") + with pytest.raises(ValueError, match=msg): + date_range(periods=10, freq="D") + + @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) + def test_constructor_datetime64_tzformat(self, freq): + # see GH#6572: ISO 8601 format results in pytz.FixedOffset + idx = date_range( + "2013-01-01T00:00:00-05:00", "2016-01-01T23:59:59-05:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range( + "2013-01-01T00:00:00+09:00", "2016-01-01T23:59:59+09:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + # Non ISO 8601 format results in dateutil.tz.tzoffset + idx = date_range("2013/1/1 0:00:00-5:00", "2016/1/1 23:59:59-5:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) + tm.assert_numpy_array_equal(idx.asi8, 
expected_i8.asi8) + + idx = date_range("2013/1/1 0:00:00+9:00", "2016/1/1 23:59:59+09:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + def test_constructor_dtype(self): + + # passing a dtype with a tz should localize + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + expected = DatetimeIndex(["2013-01-01", "2013-01-02"]).tz_localize("US/Eastern") + tm.assert_index_equal(idx, expected) + + idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern") + tm.assert_index_equal(idx, expected) + + # if we already have a tz and its not the same, then raise + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + + msg = ( + "cannot supply both a tz and a timezone-naive dtype" + r" \(i\.e\. datetime64\[ns\]\)" + ) + with pytest.raises(ValueError, match=msg): + DatetimeIndex(idx, dtype="datetime64[ns]") + + # this is effectively trying to convert tz's + msg = "data is already tz-aware US/Eastern, unable to set specified tz: CET" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(idx, dtype="datetime64[ns, CET]") + msg = "cannot supply both a tz and a dtype with a tz" + with pytest.raises(ValueError, match=msg): + DatetimeIndex(idx, tz="CET", dtype="datetime64[ns, US/Eastern]") + + result = DatetimeIndex(idx, dtype="datetime64[ns, US/Eastern]") + tm.assert_index_equal(idx, result) + + @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) + def test_constructor_invalid_dtype_raises(self, dtype): + # GH 23986 + with pytest.raises(ValueError): + DatetimeIndex([1, 2], dtype=dtype) + + def test_constructor_name(self): + idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") + assert idx.name == "TEST" + + def test_000constructor_resolution(self): + # 2252 + t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) + idx = DatetimeIndex([t1]) + + assert idx.nanosecond[0] == t1.nanosecond + + def test_disallow_setting_tz(self): + # GH 3746 + dti = DatetimeIndex(["2010"], tz="UTC") + with pytest.raises(AttributeError): + dti.tz = pytz.timezone("US/Pacific") + + @pytest.mark.parametrize( + "tz", + [ + None, + "America/Los_Angeles", + pytz.timezone("America/Los_Angeles"), + Timestamp("2000", tz="America/Los_Angeles").tz, + ], + ) + def test_constructor_start_end_with_tz(self, tz): + # GH 18595 + start = Timestamp("2013-01-01 06:00:00", tz="America/Los_Angeles") + end = Timestamp("2013-01-02 06:00:00", tz="America/Los_Angeles") + result = date_range(freq="D", start=start, end=end, tz=tz) + expected = DatetimeIndex( + ["2013-01-01 06:00:00", "2013-01-02 06:00:00"], tz="America/Los_Angeles" + ) + tm.assert_index_equal(result, expected) + # Especially assert that the timezone is consistent for pytz + assert pytz.timezone("America/Los_Angeles") is result.tz + + @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) + def test_constructor_with_non_normalized_pytz(self, tz): + # GH 18595 + non_norm_tz = Timestamp("2010", tz=tz).tz + result = DatetimeIndex(["2010"], tz=non_norm_tz) + assert pytz.timezone(tz) is result.tz + + def test_constructor_timestamp_near_dst(self): + # GH 20854 + ts = [ + Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), + Timestamp("2016-10-30 
03:00:00+0200", tz="Europe/Helsinki"), + ] + result = DatetimeIndex(ts) + expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) + @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) + @pytest.mark.parametrize( + "tz, dtype", + [("US/Pacific", "datetime64[ns, US/Pacific]"), (None, "datetime64[ns]")], + ) + def test_constructor_with_int_tz(self, klass, box, tz, dtype): + # GH 20997, 20964 + ts = Timestamp("2018-01-01", tz=tz) + result = klass(box([ts.value]), dtype=dtype) + expected = klass([ts]) + assert result == expected + + def test_construction_int_rountrip(self, tz_naive_fixture): + # GH 12619, GH#24559 + tz = tz_naive_fixture + + result = 1293858000000000000 + expected = DatetimeIndex([result], tz=tz).asi8[0] + assert result == expected + + def test_construction_from_replaced_timestamps_with_dst(self): + # GH 18785 + index = pd.date_range( + pd.Timestamp(2000, 1, 1), + pd.Timestamp(2005, 1, 1), + freq="MS", + tz="Australia/Melbourne", + ) + test = pd.DataFrame({"data": range(len(index))}, index=index) + test = test.resample("Y").mean() + result = pd.DatetimeIndex([x.replace(month=6, day=1) for x in test.index]) + expected = pd.DatetimeIndex( + [ + "2000-06-01 00:00:00", + "2001-06-01 00:00:00", + "2002-06-01 00:00:00", + "2003-06-01 00:00:00", + "2004-06-01 00:00:00", + "2005-06-01 00:00:00", + ], + tz="Australia/Melbourne", + ) + tm.assert_index_equal(result, expected) + + def test_construction_with_tz_and_tz_aware_dti(self): + # GH 23579 + dti = date_range("2016-01-01", periods=3, tz="US/Central") + with pytest.raises(TypeError): + DatetimeIndex(dti, tz="Asia/Tokyo") + + def test_construction_with_nat_and_tzlocal(self): + tz = dateutil.tz.tzlocal() + result = DatetimeIndex(["2018", "NaT"], tz=tz) + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) + tm.assert_index_equal(result, expected) + + def test_constructor_no_precision_raises(self): + # GH-24753, GH-24739 + + msg = "with no precision is not allowed" + with pytest.raises(ValueError, match=msg): + pd.DatetimeIndex(["2000"], dtype="datetime64") + + with pytest.raises(ValueError, match=msg): + pd.Index(["2000"], dtype="datetime64") + + def test_constructor_wrong_precision_raises(self): + with pytest.raises(ValueError): + pd.DatetimeIndex(["2000"], dtype="datetime64[us]") + + def test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self): + # GH 27011 + result = Index(np.array([Timestamp("2019", tz="UTC"), np.nan], dtype=object)) + expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) + tm.assert_index_equal(result, expected) + + +class TestTimeSeries: + def test_dti_constructor_preserve_dti_freq(self): + rng = date_range("1/1/2000", "1/2/2000", freq="5min") + + rng2 = DatetimeIndex(rng) + assert rng.freq == rng2.freq + + def test_dti_constructor_years_only(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH 6961 + rng1 = date_range("2014", "2015", freq="M", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) + + rng2 = date_range("2014", "2015", freq="MS", tz=tz) + expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) + + rng3 = date_range("2014", "2020", freq="A", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) + + rng4 = date_range("2014", "2020", freq="AS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) + + for rng, expected in [ + 
(rng1, expected1), + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + tm.assert_index_equal(rng, expected) + + def test_dti_constructor_small_int(self, any_int_dtype): + # see gh-13721 + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.00000000", + "1970-01-01 00:00:00.00000001", + "1970-01-01 00:00:00.00000002", + ] + ) + + arr = np.array([0, 10, 20], dtype=any_int_dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) + + def test_ctor_str_intraday(self): + rng = DatetimeIndex(["1-1-2000 00:00:01"]) + assert rng[0].second == 1 + + def test_is_(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + assert dti.is_(dti) + assert dti.is_(dti.view()) + assert not dti.is_(dti.copy()) + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") + idx = Index(arr) + + assert (idx.values == conversion.ensure_datetime64ns(arr)).all() + + def test_constructor_int64_nocopy(self): + # GH#1624 + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr) + + arr[50:100] = -1 + assert (index.asi8[50:100] == -1).all() + + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr, copy=True) + + arr[50:100] = -1 + assert (index.asi8[50:100] != -1).all() + + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "BH", "T", "S", "L", "U", "H", "N", "C"] + ) + def test_from_freq_recreate_from_data(self, freq): + org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) + idx = DatetimeIndex(org, freq=freq) + tm.assert_index_equal(idx, org) + + org = date_range( + start="2001/02/01 09:00", freq=freq, tz="US/Pacific", periods=1 + ) + idx = DatetimeIndex(org, freq=freq, tz="US/Pacific") + tm.assert_index_equal(idx, org) + + def test_datetimeindex_constructor_misc(self): + arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"] + msg = r"(\(')?Unknown string format(:', 'Jn 3, 2005'\))?" 
+ with pytest.raises(ValueError, match=msg): + DatetimeIndex(arr) + + arr = ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005, 1, 1), "1/2/2005", "1/3/2005", "2005-01-04"] + idx2 = DatetimeIndex(arr) + + arr = [Timestamp(datetime(2005, 1, 1)), "1/2/2005", "1/3/2005", "2005-01-04"] + idx3 = DatetimeIndex(arr) + + arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") + idx4 = DatetimeIndex(arr) + + arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) + idx5 = DatetimeIndex(arr) + + arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) + idx6 = DatetimeIndex(arr) + + idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx8 = DatetimeIndex( + ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True + ) + tm.assert_index_equal(idx7, idx8) + + for other in [idx2, idx3, idx4, idx5, idx6]: + assert (idx1.values == other.values).all() + + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + idx = date_range(start=sdate, freq="1B", periods=20) + assert len(idx) == 20 + assert idx[0] == sdate + 0 * offsets.BDay() + assert idx.freq == "B" + + idx = date_range(end=edate, freq=("D", 5), periods=20) + assert len(idx) == 20 + assert idx[-1] == edate + assert idx.freq == "5D" + + idx1 = date_range(start=sdate, end=edate, freq="W-SUN") + idx2 = date_range(start=sdate, end=edate, freq=offsets.Week(weekday=6)) + assert len(idx1) == len(idx2) + assert idx1.freq == idx2.freq + + idx1 = date_range(start=sdate, end=edate, freq="QS") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1) + ) + assert len(idx1) == len(idx2) + assert idx1.freq == idx2.freq + + idx1 = date_range(start=sdate, end=edate, freq="BQ") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12) + ) + assert len(idx1) == len(idx2) + assert idx1.freq == idx2.freq diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_date_range.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_date_range.py new file mode 100644 index 0000000..4d0beec --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_date_range.py @@ -0,0 +1,963 @@ +""" +test date_range, bdate_range construction from the convenience range functions +""" + +from datetime import datetime, time, timedelta + +import numpy as np +import pytest +import pytz +from pytz import timezone + +from pandas.errors import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DatetimeIndex, Timestamp, bdate_range, date_range, offsets +import pandas._testing as tm + +from pandas.tseries.offsets import ( + BDay, + CDay, + DateOffset, + MonthEnd, + generate_range, + prefix_mapping, +) + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestTimestampEquivDateRange: + # Older tests in TestTimeSeries constructed their `stamp` objects + # using `date_range` instead of the `Timestamp` constructor. + # TestTimestampEquivDateRange checks that these are equivalent in the + # pertinent cases. 
+ + def test_date_range_timestamp_equiv(self): + rng = date_range("20090415", "20090519", tz="US/Eastern") + stamp = rng[0] + + ts = Timestamp("20090415", tz="US/Eastern", freq="D") + assert ts == stamp + + def test_date_range_timestamp_equiv_dateutil(self): + rng = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + stamp = rng[0] + + ts = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") + assert ts == stamp + + def test_date_range_timestamp_equiv_explicit_pytz(self): + rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + stamp = rng[0] + + ts = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") + assert ts == stamp + + @td.skip_if_windows_python_3 + def test_date_range_timestamp_equiv_explicit_dateutil(self): + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + + rng = date_range("20090415", "20090519", tz=gettz("US/Eastern")) + stamp = rng[0] + + ts = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") + assert ts == stamp + + def test_date_range_timestamp_equiv_from_datetime_instance(self): + datetime_instance = datetime(2014, 3, 4) + # build a timestamp with a frequency, since then it supports + # addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq="D")[0] + + ts = Timestamp(datetime_instance, freq="D") + assert ts == timestamp_instance + + def test_date_range_timestamp_equiv_preserve_frequency(self): + timestamp_instance = date_range("2014-03-05", periods=1, freq="D")[0] + ts = Timestamp("2014-03-05", freq="D") + + assert timestamp_instance == ts + + +class TestDateRanges: + def test_date_range_nat(self): + # GH#11587 + msg = "Neither `start` nor `end` can be NaT" + with pytest.raises(ValueError, match=msg): + date_range(start="2016-01-01", end=pd.NaT, freq="D") + with pytest.raises(ValueError, match=msg): + date_range(start=pd.NaT, end="2016-01-01", freq="D") + + def test_date_range_multiplication_overflow(self): + # GH#24255 + # check that overflows in calculating `addend = periods * stride` + # are caught + with tm.assert_produces_warning(None): + # we should _not_ be seeing a overflow RuntimeWarning + dti = date_range(start="1677-09-22", periods=213503, freq="D") + + assert dti[0] == Timestamp("1677-09-22") + assert len(dti) == 213503 + + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range("1969-05-04", periods=200000000, freq="30000D") + + def test_date_range_unsigned_overflow_handling(self): + # GH#24255 + # case where `addend = periods * stride` overflows int64 bounds + # but not uint64 bounds + dti = date_range(start="1677-09-22", end="2262-04-11", freq="D") + + dti2 = date_range(start=dti[0], periods=len(dti), freq="D") + assert dti2.equals(dti) + + dti3 = date_range(end=dti[-1], periods=len(dti), freq="D") + assert dti3.equals(dti) + + def test_date_range_int64_overflow_non_recoverable(self): + # GH#24255 + # case with start later than 1970-01-01, overflow int64 but not uint64 + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start="1970-02-01", periods=106752 * 24, freq="H") + + # case with end before 1970-01-01, overflow int64 but not uint64 + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end="1969-11-14", periods=106752 * 24, freq="H") + + def test_date_range_int64_overflow_stride_endpoint_different_signs(self): + # cases where stride * periods overflow int64 and stride/endpoint + # have different signs + start = 
Timestamp("2262-02-23") + end = Timestamp("1969-11-14") + + expected = date_range(start=start, end=end, freq="-1H") + assert expected[0] == start + assert expected[-1] == end + + dti = date_range(end=end, periods=len(expected), freq="-1H") + tm.assert_index_equal(dti, expected) + + start2 = Timestamp("1970-02-01") + end2 = Timestamp("1677-10-22") + + expected2 = date_range(start=start2, end=end2, freq="-1H") + assert expected2[0] == start2 + assert expected2[-1] == end2 + + dti2 = date_range(start=start2, periods=len(expected2), freq="-1H") + tm.assert_index_equal(dti2, expected2) + + def test_date_range_out_of_bounds(self): + # GH#14187 + with pytest.raises(OutOfBoundsDatetime): + date_range("2016-01-01", periods=100000, freq="D") + with pytest.raises(OutOfBoundsDatetime): + date_range(end="1763-10-12", periods=100000, freq="D") + + def test_date_range_gen_error(self): + rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") + assert len(rng) == 4 + + @pytest.mark.parametrize("freq", ["AS", "YS"]) + def test_begin_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], + freq=freq, + ) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize("freq", ["A", "Y"]) + def test_end_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq + ) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize("freq", ["BA", "BY"]) + def test_business_end_year_alias(self, freq): + # see gh-9313 + rng = date_range("1/1/2013", "7/1/2017", freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_negative_freq(self): + # GH 11018 + rng = date_range("2011-12-31", freq="-2A", periods=3) + exp = pd.DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2A" + + rng = date_range("2011-01-31", freq="-2M", periods=3) + exp = pd.DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2M" + + def test_date_range_bms_bug(self): + # #1645 + rng = date_range("1/1/2000", periods=10, freq="BMS") + + ex_first = Timestamp("2000-01-03") + assert rng[0] == ex_first + + def test_date_range_normalize(self): + snap = datetime.today() + n = 50 + + rng = date_range(snap, periods=n, normalize=False, freq="2D") + + offset = timedelta(2) + values = DatetimeIndex([snap + i * offset for i in range(n)]) + + tm.assert_index_equal(rng, values) + + rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") + the_time = time(8, 15) + for val in rng: + assert val.time() == the_time + + def test_date_range_fy5252(self): + dr = date_range( + start="2013-01-01", + periods=2, + freq=offsets.FY5253(startingMonth=1, weekday=3, variation="nearest"), + ) + assert dr[0] == Timestamp("2013-01-31") + assert dr[1] == Timestamp("2014-01-30") + + def test_date_range_ambiguous_arguments(self): + # #2538 + start = datetime(2011, 1, 1, 5, 3, 40) + end = datetime(2011, 1, 1, 8, 9, 40) + + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + with pytest.raises(ValueError, match=msg): + date_range(start, end, periods=10, freq="s") + + def 
test_date_range_convenience_periods(self): + # GH 20808 + result = date_range("2018-04-24", "2018-04-27", periods=3) + expected = DatetimeIndex( + ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + freq=None, + ) + + tm.assert_index_equal(result, expected) + + # Test if spacing remains linear if tz changes to dst in range + result = date_range( + "2018-04-01 01:00:00", + "2018-04-01 04:00:00", + tz="Australia/Sydney", + periods=3, + ) + expected = DatetimeIndex( + [ + Timestamp("2018-04-01 01:00:00+1100", tz="Australia/Sydney"), + Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), + Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), + ] + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start,end,result_tz", + [ + ["20180101", "20180103", "US/Eastern"], + [datetime(2018, 1, 1), datetime(2018, 1, 3), "US/Eastern"], + [Timestamp("20180101"), Timestamp("20180103"), "US/Eastern"], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + "US/Eastern", + ], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + None, + ], + ], + ) + def test_date_range_linspacing_tz(self, start, end, result_tz): + # GH 20983 + result = date_range(start, end, periods=3, tz=result_tz) + expected = date_range("20180101", periods=3, freq="D", tz="US/Eastern") + tm.assert_index_equal(result, expected) + + def test_date_range_businesshour(self): + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="BH") + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") + tm.assert_index_equal(idx, rng) + + def test_range_misspecified(self): + # GH #1095 + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + + with pytest.raises(ValueError, match=msg): + date_range(start="1/1/2000") + + with pytest.raises(ValueError, match=msg): + date_range(end="1/1/2000") + + with pytest.raises(ValueError, match=msg): + date_range(periods=10) + + with pytest.raises(ValueError, match=msg): + date_range(start="1/1/2000", freq="H") + + with pytest.raises(ValueError, match=msg): + date_range(end="1/1/2000", freq="H") + + with pytest.raises(ValueError, match=msg): + date_range(periods=10, freq="H") + + with pytest.raises(ValueError, match=msg): + date_range() + + def test_compat_replace(self): + # https://github.com/statsmodels/statsmodels/issues/3349 + # replace should take ints/longs for compat + result = date_range( + 
Timestamp("1960-04-01 00:00:00", freq="QS-JAN"), periods=76, freq="QS-JAN" + ) + assert len(result) == 76 + + def test_catch_infinite_loop(self): + offset = offsets.DateOffset(minute=5) + # blow up, don't loop forever + msg = "Offset did not increment date" + with pytest.raises(ValueError, match=msg): + date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) + + @pytest.mark.parametrize("periods", (1, 2)) + def test_wom_len(self, periods): + # https://github.com/pandas-dev/pandas/issues/20517 + res = date_range(start="20110101", periods=periods, freq="WOM-1MON") + assert len(res) == periods + + def test_construct_over_dst(self): + # GH 20854 + pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=True + ) + pst_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=False + ) + expect_data = [ + Timestamp("2010-11-07 00:00:00", tz="US/Pacific"), + pre_dst, + pst_dst, + ] + expected = DatetimeIndex(expect_data) + result = date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") + tm.assert_index_equal(result, expected) + + def test_construct_with_different_start_end_string_format(self): + # GH 12064 + result = date_range( + "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + ) + expected = DatetimeIndex( + [ + Timestamp("2013-01-01 00:00:00+09:00"), + Timestamp("2013-01-01 01:00:00+09:00"), + Timestamp("2013-01-01 02:00:00+09:00"), + ] + ) + tm.assert_index_equal(result, expected) + + def test_error_with_zero_monthends(self): + msg = r"Offset <0 \* MonthEnds> did not increment date" + with pytest.raises(ValueError, match=msg): + date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) + + def test_range_bug(self): + # GH #770 + offset = DateOffset(months=3) + result = date_range("2011-1-1", "2012-1-31", freq=offset) + + start = datetime(2011, 1, 1) + expected = DatetimeIndex([start + i * offset for i in range(5)]) + tm.assert_index_equal(result, expected) + + def test_range_tz_pytz(self): + # see gh-2906 + tz = timezone("US/Eastern") + start = tz.localize(datetime(2011, 1, 1)) + end = tz.localize(datetime(2011, 1, 3)) + + dr = date_range(start=start, periods=3) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + dr = date_range(end=end, periods=3) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + dr = date_range(start=start, end=end) + assert dr.tz.zone == tz.zone + assert dr[0] == start + assert dr[2] == end + + @pytest.mark.parametrize( + "start, end", + [ + [ + Timestamp(datetime(2014, 3, 6), tz="US/Eastern"), + Timestamp(datetime(2014, 3, 12), tz="US/Eastern"), + ], + [ + Timestamp(datetime(2013, 11, 1), tz="US/Eastern"), + Timestamp(datetime(2013, 11, 6), tz="US/Eastern"), + ], + ], + ) + def test_range_tz_dst_straddle_pytz(self, start, end): + dr = date_range(start, end, freq="D") + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + dr = date_range(start, end, freq="D", tz="US/Eastern") + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + dr = date_range( + start.replace(tzinfo=None), + end.replace(tzinfo=None), + freq="D", + tz="US/Eastern", + ) + assert dr[0] == start + assert dr[-1] == end + assert np.all(dr.hour == 0) + + def test_range_tz_dateutil(self): + # see gh-2906 + + # Use maybe_get_tz to fix filename in tz under dateutil. 
+ from pandas._libs.tslibs.timezones import maybe_get_tz + + tz = lambda x: maybe_get_tz("dateutil/" + x) + + start = datetime(2011, 1, 1, tzinfo=tz("US/Eastern")) + end = datetime(2011, 1, 3, tzinfo=tz("US/Eastern")) + + dr = date_range(start=start, periods=3) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + dr = date_range(end=end, periods=3) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + dr = date_range(start=start, end=end) + assert dr.tz == tz("US/Eastern") + assert dr[0] == start + assert dr[2] == end + + @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + def test_range_closed(self, freq): + begin = datetime(2011, 1, 1) + end = datetime(2014, 1, 1) + + closed = date_range(begin, end, closed=None, freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + expected_left = left + expected_right = right + + if end == closed[-1]: + expected_left = closed[:-1] + if begin == closed[0]: + expected_right = closed[1:] + + tm.assert_index_equal(expected_left, left) + tm.assert_index_equal(expected_right, right) + + def test_range_closed_with_tz_aware_start_end(self): + # GH12409, GH12684 + begin = Timestamp("2011/1/1", tz="US/Eastern") + end = Timestamp("2014/1/1", tz="US/Eastern") + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + expected_left = left + expected_right = right + + if end == closed[-1]: + expected_left = closed[:-1] + if begin == closed[0]: + expected_right = closed[1:] + + tm.assert_index_equal(expected_left, left) + tm.assert_index_equal(expected_right, right) + + begin = Timestamp("2011/1/1") + end = Timestamp("2014/1/1") + begintz = Timestamp("2011/1/1", tz="US/Eastern") + endtz = Timestamp("2014/1/1", tz="US/Eastern") + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq, tz="US/Eastern") + left = date_range(begin, end, closed="left", freq=freq, tz="US/Eastern") + right = date_range(begin, end, closed="right", freq=freq, tz="US/Eastern") + expected_left = left + expected_right = right + + if endtz == closed[-1]: + expected_left = closed[:-1] + if begintz == closed[0]: + expected_right = closed[1:] + + tm.assert_index_equal(expected_left, left) + tm.assert_index_equal(expected_right, right) + + @pytest.mark.parametrize("closed", ["right", "left", None]) + def test_range_closed_boundary(self, closed): + # GH#11804 + right_boundary = date_range( + "2015-09-12", "2015-12-01", freq="QS-MAR", closed=closed + ) + left_boundary = date_range( + "2015-09-01", "2015-09-12", freq="QS-MAR", closed=closed + ) + both_boundary = date_range( + "2015-09-01", "2015-12-01", freq="QS-MAR", closed=closed + ) + expected_right = expected_left = expected_both = both_boundary + + if closed == "right": + expected_left = both_boundary[1:] + if closed == "left": + expected_right = both_boundary[:-1] + if closed is None: + expected_right = both_boundary[1:] + expected_left = both_boundary[:-1] + + tm.assert_index_equal(right_boundary, expected_right) + tm.assert_index_equal(left_boundary, expected_left) + tm.assert_index_equal(both_boundary, expected_both) + + def test_years_only(self): + # GH 6961 + dr = date_range("2014", "2015", freq="M") + assert dr[0] == datetime(2014, 1, 31) + assert dr[-1] == 
datetime(2014, 12, 31) + + def test_freq_divides_end_in_nanos(self): + # GH 10885 + result_1 = date_range("2005-01-12 10:00", "2005-01-12 16:00", freq="345min") + result_2 = date_range("2005-01-13 10:00", "2005-01-13 16:00", freq="345min") + expected_1 = DatetimeIndex( + ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) + expected_2 = DatetimeIndex( + ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) + tm.assert_index_equal(result_1, expected_1) + tm.assert_index_equal(result_2, expected_2) + + def test_cached_range_bug(self): + rng = date_range("2010-09-01 05:00:00", periods=50, freq=DateOffset(hours=6)) + assert len(rng) == 50 + assert rng[0] == datetime(2010, 9, 1, 5) + + def test_timezone_comparaison_bug(self): + # smoke test + start = Timestamp("20130220 10:00", tz="US/Eastern") + result = date_range(start, periods=2, tz="US/Eastern") + assert len(result) == 2 + + def test_timezone_comparaison_assert(self): + start = Timestamp("20130220 10:00", tz="US/Eastern") + msg = "Inferred time zone not equal to passed time zone" + with pytest.raises(AssertionError, match=msg): + date_range(start, periods=2, tz="Europe/Berlin") + + def test_negative_non_tick_frequency_descending_dates(self, tz_aware_fixture): + # GH 23270 + tz = tz_aware_fixture + result = pd.date_range(start="2011-06-01", end="2011-01-01", freq="-1MS", tz=tz) + expected = pd.date_range( + end="2011-06-01", start="2011-01-01", freq="1MS", tz=tz + )[::-1] + tm.assert_index_equal(result, expected) + + +class TestGenRangeGeneration: + def test_generate(self): + rng1 = list(generate_range(START, END, offset=BDay())) + rng2 = list(generate_range(START, END, offset="B")) + assert rng1 == rng2 + + def test_generate_cday(self): + rng1 = list(generate_range(START, END, offset=CDay())) + rng2 = list(generate_range(START, END, offset="C")) + assert rng1 == rng2 + + def test_1(self): + rng = list(generate_range(start=datetime(2009, 3, 25), periods=2)) + expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)] + assert rng == expected + + def test_2(self): + rng = list(generate_range(start=datetime(2008, 1, 1), end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), datetime(2008, 1, 2), datetime(2008, 1, 3)] + assert rng == expected + + def test_3(self): + rng = list(generate_range(start=datetime(2008, 1, 5), end=datetime(2008, 1, 6))) + expected = [] + assert rng == expected + + def test_precision_finer_than_offset(self): + # GH#9907 + result1 = pd.date_range( + start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="Q" + ) + result2 = pd.date_range( + start="2015-04-15 00:00:03", end="2015-06-22 00:00:04", freq="W" + ) + expected1_list = [ + "2015-06-30 00:00:03", + "2015-09-30 00:00:03", + "2015-12-31 00:00:03", + "2016-03-31 00:00:03", + ] + expected2_list = [ + "2015-04-19 00:00:03", + "2015-04-26 00:00:03", + "2015-05-03 00:00:03", + "2015-05-10 00:00:03", + "2015-05-17 00:00:03", + "2015-05-24 00:00:03", + "2015-05-31 00:00:03", + "2015-06-07 00:00:03", + "2015-06-14 00:00:03", + "2015-06-21 00:00:03", + ] + expected1 = DatetimeIndex( + expected1_list, dtype="datetime64[ns]", freq="Q-DEC", tz=None + ) + expected2 = DatetimeIndex( + expected2_list, dtype="datetime64[ns]", freq="W-SUN", tz=None + ) + tm.assert_index_equal(result1, expected1) + tm.assert_index_equal(result2, expected2) + + dt1, dt2 = "2017-01-01", "2017-01-01" + tz1, tz2 = "US/Eastern", "Europe/London" + + @pytest.mark.parametrize( + "start,end", + [ 
+            (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)),
+            (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)),
+            (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)),
+            (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)),
+        ],
+    )
+    def test_mismatching_tz_raises_err(self, start, end):
+        # issue 18488
+        with pytest.raises(TypeError):
+            pd.date_range(start, end)
+        with pytest.raises(TypeError):
+            pd.date_range(start, end, freq=BDay())
+
+
+class TestBusinessDateRange:
+    def test_constructor(self):
+        bdate_range(START, END, freq=BDay())
+        bdate_range(START, periods=20, freq=BDay())
+        bdate_range(end=START, periods=20, freq=BDay())
+
+        msg = "periods must be a number, got B"
+        with pytest.raises(TypeError, match=msg):
+            date_range("2011-1-1", "2012-1-1", "B")
+
+        with pytest.raises(TypeError, match=msg):
+            bdate_range("2011-1-1", "2012-1-1", "B")
+
+        msg = "freq must be specified for bdate_range; use date_range instead"
+        with pytest.raises(TypeError, match=msg):
+            bdate_range(START, END, periods=10, freq=None)
+
+    def test_naive_aware_conflicts(self):
+        naive = bdate_range(START, END, freq=BDay(), tz=None)
+        aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong")
+
+        msg = "tz-naive.*tz-aware"
+        with pytest.raises(TypeError, match=msg):
+            naive.join(aware)
+
+        with pytest.raises(TypeError, match=msg):
+            aware.join(naive)
+
+    def test_misc(self):
+        end = datetime(2009, 5, 13)
+        dr = bdate_range(end=end, periods=20)
+        firstDate = end - 19 * BDay()
+
+        assert len(dr) == 20
+        assert dr[0] == firstDate
+        assert dr[-1] == end
+
+    def test_date_parse_failure(self):
+        badly_formed_date = "2007/100/1"
+
+        with pytest.raises(ValueError):
+            Timestamp(badly_formed_date)
+
+        with pytest.raises(ValueError):
+            bdate_range(start=badly_formed_date, periods=10)
+
+        with pytest.raises(ValueError):
+            bdate_range(end=badly_formed_date, periods=10)
+
+        with pytest.raises(ValueError):
+            bdate_range(badly_formed_date, badly_formed_date)
+
+    def test_daterange_bug_456(self):
+        # GH #456
+        rng1 = bdate_range("12/5/2011", "12/5/2011")
+        rng2 = bdate_range("12/2/2011", "12/5/2011")
+        rng2._data.freq = BDay()  # TODO: shouldn't this already be set?
+
+        result = rng1.union(rng2)
+        assert isinstance(result, DatetimeIndex)
+
+    @pytest.mark.parametrize("closed", ["left", "right"])
+    def test_bdays_and_open_boundaries(self, closed):
+        # GH 6673
+        start = "2018-07-21"  # Saturday
+        end = "2018-07-29"  # Sunday
+        result = pd.date_range(start, end, freq="B", closed=closed)
+
+        bday_start = "2018-07-23"  # Monday
+        bday_end = "2018-07-27"  # Friday
+        expected = pd.date_range(bday_start, bday_end, freq="D")
+        tm.assert_index_equal(result, expected)
+
+    def test_bday_near_overflow(self):
+        # GH#24252 avoid doing unnecessary addition that _would_ overflow
+        start = pd.Timestamp.max.floor("D").to_pydatetime()
+        rng = pd.date_range(start, end=None, periods=1, freq="B")
+        expected = pd.DatetimeIndex([start], freq="B")
+        tm.assert_index_equal(rng, expected)
+
+    def test_bday_overflow_error(self):
+        # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError
+        start = pd.Timestamp.max.floor("D").to_pydatetime()
+        with pytest.raises(OutOfBoundsDatetime):
+            pd.date_range(start, periods=2, freq="B")
+
+
+class TestCustomDateRange:
+    def test_constructor(self):
+        bdate_range(START, END, freq=CDay())
+        bdate_range(START, periods=20, freq=CDay())
+        bdate_range(end=START, periods=20, freq=CDay())
+
+        msg = "periods must be a number, got C"
+        with pytest.raises(TypeError, match=msg):
+            date_range("2011-1-1", "2012-1-1", "C")
+
+        with pytest.raises(TypeError, match=msg):
+            bdate_range("2011-1-1", "2012-1-1", "C")
+
+    def test_misc(self):
+        end = datetime(2009, 5, 13)
+        dr = bdate_range(end=end, periods=20, freq="C")
+        firstDate = end - 19 * CDay()
+
+        assert len(dr) == 20
+        assert dr[0] == firstDate
+        assert dr[-1] == end
+
+    def test_daterange_bug_456(self):
+        # GH #456
+        rng1 = bdate_range("12/5/2011", "12/5/2011", freq="C")
+        rng2 = bdate_range("12/2/2011", "12/5/2011", freq="C")
+        rng2._data.freq = CDay()  # TODO: shouldn't this already be set?
+
+        result = rng1.union(rng2)
+        assert isinstance(result, DatetimeIndex)
+
+    def test_cdaterange(self):
+        result = bdate_range("2013-05-01", periods=3, freq="C")
+        expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-03"])
+        tm.assert_index_equal(result, expected)
+
+    def test_cdaterange_weekmask(self):
+        result = bdate_range(
+            "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu"
+        )
+        expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-05"])
+        tm.assert_index_equal(result, expected)
+
+        # raise with non-custom freq
+        msg = (
+            "a custom frequency string is required when holidays or "
+            "weekmask are passed, got frequency B"
+        )
+        with pytest.raises(ValueError, match=msg):
+            bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu")
+
+    def test_cdaterange_holidays(self):
+        result = bdate_range("2013-05-01", periods=3, freq="C", holidays=["2013-05-01"])
+        expected = DatetimeIndex(["2013-05-02", "2013-05-03", "2013-05-06"])
+        tm.assert_index_equal(result, expected)
+
+        # raise with non-custom freq
+        msg = (
+            "a custom frequency string is required when holidays or "
+            "weekmask are passed, got frequency B"
+        )
+        with pytest.raises(ValueError, match=msg):
+            bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"])
+
+    def test_cdaterange_weekmask_and_holidays(self):
+        result = bdate_range(
+            "2013-05-01",
+            periods=3,
+            freq="C",
+            weekmask="Sun Mon Tue Wed Thu",
+            holidays=["2013-05-01"],
+        )
+        expected = DatetimeIndex(["2013-05-02", "2013-05-05", "2013-05-06"])
+        tm.assert_index_equal(result, expected)
+
+        # raise with non-custom freq
+        msg = (
+            "a custom frequency string is required when holidays or "
+            "weekmask are passed, got frequency B"
+        )
+        with pytest.raises(ValueError, match=msg):
+            bdate_range(
+                "2013-05-01",
+                periods=3,
+                weekmask="Sun Mon Tue Wed Thu",
+                holidays=["2013-05-01"],
+            )
+
+    @pytest.mark.parametrize(
+        "freq", [freq for freq in prefix_mapping if freq.startswith("C")]
+    )
+    def test_all_custom_freq(self, freq):
+        # should not raise
+        bdate_range(
+            START, END, freq=freq, weekmask="Mon Wed Fri", holidays=["2009-03-14"]
+        )
+
+        bad_freq = freq + "FOO"
+        msg = "invalid custom frequency string: {freq}"
+        with pytest.raises(ValueError, match=msg.format(freq=bad_freq)):
+            bdate_range(START, END, freq=bad_freq)
+
+    @pytest.mark.parametrize(
+        "start_end",
+        [
+            ("2018-01-01T00:00:01.000Z", "2018-01-03T00:00:01.000Z"),
+            ("2018-01-01T00:00:00.010Z", "2018-01-03T00:00:00.010Z"),
+            ("2001-01-01T00:00:00.010Z", "2001-01-03T00:00:00.010Z"),
+        ],
+    )
+    def test_range_with_millisecond_resolution(self, start_end):
+        # https://github.com/pandas-dev/pandas/issues/24110
+        start, end = start_end
+        result = pd.date_range(start=start, end=end, periods=2, closed="left")
+        expected = DatetimeIndex([start])
+        tm.assert_index_equal(result, expected)
+
+
+def test_date_range_with_custom_holidays():
+    # GH 30593
+    freq = pd.offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"])
+    result = pd.date_range(start="2020-11-25 15:00", periods=4, freq=freq)
+    expected = pd.DatetimeIndex(
+        [
+            "2020-11-25 15:00:00",
+            "2020-11-25 16:00:00",
+            "2020-11-27 15:00:00",
+            "2020-11-27 16:00:00",
+        ],
+        freq=freq,
+    )
+    tm.assert_index_equal(result, expected)
diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_datetime.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_datetime.py
new file mode 100644
index 0000000..ca18d6f
--- /dev/null
+++ 
b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_datetime.py @@ -0,0 +1,445 @@ +from datetime import date + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets +import pandas._testing as tm + +randn = np.random.randn + + +class TestDatetimeIndex: + def test_roundtrip_pickle_with_tz(self): + + # GH 8367 + # round-trip of timezone + index = date_range("20130101", periods=3, tz="US/Eastern", name="foo") + unpickled = tm.round_trip_pickle(index) + tm.assert_index_equal(index, unpickled) + + def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): + # GH7774 + index = date_range("20130101", periods=3, tz="US/Eastern") + assert str(index.reindex([])[0].tz) == "US/Eastern" + assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" + + def test_time_loc(self): # GH8667 + from datetime import time + from pandas._libs.index import _SIZE_CUTOFF + + ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) + key = time(15, 11, 30) + start = key.hour * 3600 + key.minute * 60 + key.second + step = 24 * 3600 + + for n in ns: + idx = pd.date_range("2014-11-26", periods=n, freq="S") + ts = pd.Series(np.random.randn(n), index=idx) + i = np.arange(start, n, step) + + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, check_dtype=False) + tm.assert_series_equal(ts[key], ts.iloc[i]) + + left, right = ts.copy(), ts.copy() + left[key] *= -10 + right.iloc[i] *= -10 + tm.assert_series_equal(left, right) + + def test_time_overflow_for_32bit_machines(self): + # GH8943. On some machines NumPy defaults to np.int32 (for example, + # 32-bit Linux machines). In the function _generate_regular_range + # found in tseries/index.py, `periods` gets multiplied by `strides` + # (which has value 1e9) and since the max value for np.int32 is ~2e9, + # and since those machines won't promote np.int32 to np.int64, we get + # overflow. 
+ periods = np.int_(1000) + + idx1 = pd.date_range(start="2000", periods=periods, freq="S") + assert len(idx1) == periods + + idx2 = pd.date_range(end="2000", periods=periods, freq="S") + assert len(idx2) == periods + + def test_nat(self): + assert DatetimeIndex([np.nan])[0] is pd.NaT + + def test_week_of_month_frequency(self): + # GH 5348: "ValueError: Could not evaluate WOM-1SUN" shouldn't raise + d1 = date(2002, 9, 1) + d2 = date(2013, 10, 27) + d3 = date(2012, 9, 30) + idx1 = DatetimeIndex([d1, d2]) + idx2 = DatetimeIndex([d3]) + result_append = idx1.append(idx2) + expected = DatetimeIndex([d1, d2, d3]) + tm.assert_index_equal(result_append, expected) + result_union = idx1.union(idx2) + expected = DatetimeIndex([d1, d3, d2]) + tm.assert_index_equal(result_union, expected) + + # GH 5115 + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT") + dates = ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"] + expected = DatetimeIndex(dates, freq="WOM-1SAT") + tm.assert_index_equal(result, expected) + + def test_hash_error(self): + index = date_range("20010101", periods=10) + with pytest.raises( + TypeError, match=f"unhashable type: '{type(index).__name__}'" + ): + hash(index) + + def test_stringified_slice_with_tz(self): + # GH#2658 + start = "2013-01-07" + idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") + df = DataFrame(np.arange(10), index=idx) + df["2013-01-14 23:44:34.437768-05:00":] # no exception here + + def test_append_join_nondatetimeindex(self): + rng = date_range("1/1/2000", periods=10) + idx = Index(["a", "b", "c", "d"]) + + result = rng.append(idx) + assert isinstance(result[0], Timestamp) + + # it works + rng.join(idx, how="outer") + + def test_map(self): + rng = date_range("1/1/2000", periods=10) + + f = lambda x: x.strftime("%Y%m%d") + result = rng.map(f) + exp = Index([f(x) for x in rng], dtype="= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_fill_value_with_timezone(self): + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + +class TestDatetimeIndex: + @pytest.mark.parametrize( + "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] + ) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_nat(self, tz, null): + # GH#16537, 
GH#18295 (test missing) + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + expected = pd.DatetimeIndex(["NaT", "2017-01-01"], tz=tz) + res = idx.insert(0, null) + tm.assert_index_equal(res, expected) + + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.timedelta64("NaT")) + + def test_insert(self): + idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" + ) + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [ + datetime(2000, 1, 4), + "inserted", + datetime(2000, 1, 1), + datetime(2000, 1, 2), + ], + name="idx", + ) + assert not isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = date_range("1/1/2000", periods=3, freq="M", name="idx") + + # preserve freq + expected_0 = DatetimeIndex( + ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq="M", + ) + expected_3 = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], + name="idx", + freq="M", + ) + + # reset freq to None + expected_1_nofreq = DatetimeIndex( + ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq=None, + ) + expected_3_nofreq = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + + cases = [ + (0, datetime(1999, 12, 31), expected_0), + (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # reset freq to None + result = idx.insert(3, datetime(2000, 1, 2)) + expected = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq is None + + # see gh-7299 + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): + idx.insert(3, pd.Timestamp("2000-01-04")) + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): + idx.insert(3, datetime(2000, 1, 4)) + with pytest.raises(ValueError): + idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) + with pytest.raises(ValueError): + idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) + + for tz in ["US/Pacific", "Asia/Singapore"]: + idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + ) + for d in [ + pd.Timestamp("2000-01-01 15:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), + ]: + + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + expected = DatetimeIndex( + [ + "2000-01-01 
09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ) + # reset freq to None + for d in [ + pd.Timestamp("2000-01-01 10:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), + ]: + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None + + def test_delete(self): + idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") + + # preserve freq + expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") + expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") + + # reset freq to None + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError)): + # either depending on numpy version + idx.delete(5) + + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ) + + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz + ) + result = idx.delete(0) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "H" + assert result.tz == expected.tz + + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + ) + result = idx.delete(-1) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "H" + assert result.tz == expected.tz + + def test_delete_slice(self): + idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") + + # preserve freq + expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") + expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") + + # reset freq to None + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + ts = pd.Series( + 1, + index=pd.date_range( + "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ), + ) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = pd.date_range( + "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 11:00", + 
"2000-01-01 13:00", + "2000-01-01 15:00", + "2000-01-01 17:00", + ], + freq=None, + name="idx", + tz=tz, + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + def test_get_value(self): + # specifically make sure we have test for np.datetime64 key + dti = pd.date_range("2016-01-01", periods=3) + + arr = np.arange(6, 8) + + key = dti[1] + + result = dti.get_value(arr, key) + assert result == 7 + + result = dti.get_value(arr, key.to_pydatetime()) + assert result == 7 + + result = dti.get_value(arr, key.to_datetime64()) + assert result == 7 + + def test_get_loc(self): + idx = pd.date_range("2000-01-01", periods=3) + + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + if method is not None: + assert ( + idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1 + ) + + assert idx.get_loc("2000-01-01", method="nearest") == 0 + assert idx.get_loc("2000-01-01T12", method="nearest") == 1 + + assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo") + with pytest.raises(KeyError, match="'2000-01-01T03'"): + idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") + with pytest.raises( + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-01", + method="nearest", + tolerance=[ + pd.Timedelta("1day").to_timedelta64(), + pd.Timedelta("1day").to_timedelta64(), + ], + ) + + assert idx.get_loc("2000", method="nearest") == slice(0, 3) + assert idx.get_loc("2000-01", method="nearest") == slice(0, 3) + + assert idx.get_loc("1999", method="nearest") == 0 + assert idx.get_loc("2001", method="nearest") == 2 + + with pytest.raises(KeyError, match="'1999'"): + idx.get_loc("1999", method="pad") + with pytest.raises(KeyError, match="'2001'"): + idx.get_loc("2001", method="backfill") + + with pytest.raises(KeyError, match="'foobar'"): + idx.get_loc("foobar") + with pytest.raises(TypeError): + idx.get_loc(slice(2)) + + idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) + assert idx.get_loc("2000-01-02", method="nearest") == 0 + assert idx.get_loc("2000-01-03", method="nearest") == 1 + assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) + + # time indexing + idx = pd.date_range("2000-01-01", periods=24, freq="H") + tm.assert_numpy_array_equal( + idx.get_loc(time(12)), np.array([12]), check_dtype=False + ) + tm.assert_numpy_array_equal( + idx.get_loc(time(12, 30)), np.array([]), check_dtype=False + ) + with pytest.raises(NotImplementedError): + idx.get_loc(time(12, 30), method="pad") + + def test_get_indexer(self): + idx = pd.date_range("2000-01-01", periods=3) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) + + target = idx[0] + pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + 
) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 hour")), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour").to_timedelta64(), + ] + tm.assert_numpy_array_equal( + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + "foo", + ] + with pytest.raises(ValueError, match="abbreviation w/o a number"): + idx.get_indexer(target, "nearest", tolerance=tol_bad) + with pytest.raises(ValueError): + idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") + + def test_reasonable_key_error(self): + # GH#1062 + index = DatetimeIndex(["1/3/2000"]) + with pytest.raises(KeyError, match="2000"): + index.get_loc("1/1/2000") + + @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) + def test_timedelta_invalid_key(self, key): + # GH#20464 + dti = pd.date_range("1970-01-01", periods=10) + with pytest.raises(TypeError): + dti.get_loc(key) + + def test_get_loc_nat(self): + # GH#20464 + index = DatetimeIndex(["1/3/2000", "NaT"]) + assert index.get_loc(pd.NaT) == 1 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_misc.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_misc.py new file mode 100644 index 0000000..340f53b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_misc.py @@ -0,0 +1,384 @@ +import calendar +from datetime import datetime +import locale +import unicodedata + +import numpy as np +import pytest + +import pandas as pd +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets +import pandas._testing as tm + + +class TestTimeSeries: + def test_pass_datetimeindex_to_index(self): + # Bugs in #1396 + rng = date_range("1/1/2000", "3/1/2000") + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pydatetime(), dtype=object) + + tm.assert_numpy_array_equal(idx.values, expected.values) + + def test_range_edges(self): + # GH#13672 + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000004"), + freq="N", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000000001", + "1970-01-01 00:00:00.000000002", + "1970-01-01 00:00:00.000000003", + "1970-01-01 00:00:00.000000004", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000004"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) + exp = DatetimeIndex([]) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) + exp = DatetimeIndex(["1970-01-01 00:00:00.000000001"]) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000001"), + end=Timestamp("1970-01-01 00:00:00.000004"), + freq="U", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000001", + "1970-01-01 00:00:00.000002", + "1970-01-01 00:00:00.000003", + "1970-01-01 00:00:00.000004", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + 
start=Timestamp("1970-01-01 00:00:00.001"), + end=Timestamp("1970-01-01 00:00:00.004"), + freq="L", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.001", + "1970-01-01 00:00:00.002", + "1970-01-01 00:00:00.003", + "1970-01-01 00:00:00.004", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:01"), + end=Timestamp("1970-01-01 00:00:04"), + freq="S", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + "1970-01-01 00:00:04", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 00:01"), + end=Timestamp("1970-01-01 00:04"), + freq="T", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:01", + "1970-01-01 00:02", + "1970-01-01 00:03", + "1970-01-01 00:04", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01 01:00"), + end=Timestamp("1970-01-01 04:00"), + freq="H", + ) + exp = DatetimeIndex( + [ + "1970-01-01 01:00", + "1970-01-01 02:00", + "1970-01-01 03:00", + "1970-01-01 04:00", + ] + ) + tm.assert_index_equal(idx, exp) + + idx = pd.date_range( + start=Timestamp("1970-01-01"), end=Timestamp("1970-01-04"), freq="D" + ) + exp = DatetimeIndex(["1970-01-01", "1970-01-02", "1970-01-03", "1970-01-04"]) + tm.assert_index_equal(idx, exp) + + +class TestDatetime64: + def test_datetimeindex_accessors(self): + dti_naive = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + # GH#13303 + dti_tz = pd.date_range( + freq="D", start=datetime(1998, 1, 1), periods=365, tz="US/Eastern" + ) + for dti in [dti_naive, dti_tz]: + + assert dti.year[0] == 1998 + assert dti.month[0] == 1 + assert dti.day[0] == 1 + assert dti.hour[0] == 0 + assert dti.minute[0] == 0 + assert dti.second[0] == 0 + assert dti.microsecond[0] == 0 + assert dti.dayofweek[0] == 3 + + assert dti.dayofyear[0] == 1 + assert dti.dayofyear[120] == 121 + + assert dti.weekofyear[0] == 1 + assert dti.weekofyear[120] == 18 + + assert dti.quarter[0] == 1 + assert dti.quarter[120] == 2 + + assert dti.days_in_month[0] == 31 + assert dti.days_in_month[90] == 30 + + assert dti.is_month_start[0] + assert not dti.is_month_start[1] + assert dti.is_month_start[31] + assert dti.is_quarter_start[0] + assert dti.is_quarter_start[90] + assert dti.is_year_start[0] + assert not dti.is_year_start[364] + assert not dti.is_month_end[0] + assert dti.is_month_end[30] + assert not dti.is_month_end[31] + assert dti.is_month_end[364] + assert not dti.is_quarter_end[0] + assert not dti.is_quarter_end[30] + assert dti.is_quarter_end[89] + assert dti.is_quarter_end[364] + assert not dti.is_year_end[0] + assert dti.is_year_end[364] + + assert len(dti.year) == 365 + assert len(dti.month) == 365 + assert len(dti.day) == 365 + assert len(dti.hour) == 365 + assert len(dti.minute) == 365 + assert len(dti.second) == 365 + assert len(dti.microsecond) == 365 + assert len(dti.dayofweek) == 365 + assert len(dti.dayofyear) == 365 + assert len(dti.weekofyear) == 365 + assert len(dti.quarter) == 365 + assert len(dti.is_month_start) == 365 + assert len(dti.is_month_end) == 365 + assert len(dti.is_quarter_start) == 365 + assert len(dti.is_quarter_end) == 365 + assert len(dti.is_year_start) == 365 + assert len(dti.is_year_end) == 365 + + dti.name = "name" + + # non boolean accessors -> return Index + for accessor in DatetimeIndex._field_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, Index) + assert res.name == "name" + + # boolean accessors -> 
return array + for accessor in DatetimeIndex._bool_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, np.ndarray) + + # test boolean indexing + res = dti[dti.is_quarter_start] + exp = dti[[0, 90, 181, 273]] + tm.assert_index_equal(res, exp) + res = dti[dti.is_leap_year] + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") + tm.assert_index_equal(res, exp) + + dti = pd.date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) + + assert sum(dti.is_quarter_start) == 0 + assert sum(dti.is_quarter_end) == 4 + assert sum(dti.is_year_start) == 0 + assert sum(dti.is_year_end) == 1 + + # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, + bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") + dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) + msg = "Custom business days is not supported by is_month_start" + with pytest.raises(ValueError, match=msg): + dti.is_month_start + + dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) + + assert dti.is_month_start[0] == 1 + + tests = [ + (Timestamp("2013-06-01", freq="M").is_month_start, 1), + (Timestamp("2013-06-01", freq="BM").is_month_start, 0), + (Timestamp("2013-06-03", freq="M").is_month_start, 0), + (Timestamp("2013-06-03", freq="BM").is_month_start, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_month_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_quarter_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_year_end, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_month_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_quarter_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_year_start, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_month_end, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_quarter_end, 0), + (Timestamp("2013-03-31", freq="QS-FEB").is_year_end, 0), + (Timestamp("2013-02-01", freq="QS-FEB").is_month_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_quarter_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_year_start, 1), + (Timestamp("2013-06-30", freq="BQ").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQ").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_quarter_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQS-APR").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQS-APR").is_quarter_end, 1), + (Timestamp("2013-03-29", freq="BQS-APR").is_year_end, 1), + (Timestamp("2013-11-01", freq="AS-NOV").is_year_start, 1), + (Timestamp("2013-10-31", freq="AS-NOV").is_year_end, 1), + (Timestamp("2012-02-01").days_in_month, 29), + (Timestamp("2013-02-01").days_in_month, 28), + ] + + for ts, value in tests: + assert ts == value + + # GH 6538: Check that DatetimeIndex and its TimeStamp elements + # return the same weekofyear accessor close to new year w/ tz + dates = ["2013/12/29", "2013/12/30", "2013/12/31"] + dates = DatetimeIndex(dates, tz="Europe/Brussels") + expected = [52, 1, 1] + assert dates.weekofyear.tolist() == expected + assert [d.weekofyear for d in dates] == expected + + # GH 12806 + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) + def 
test_datetime_name_accessors(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day-name and month_name should + # return the english attributes + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + # GH#11128 + dti = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): + name = name.capitalize() + assert dti.day_name(locale=time_locale)[day] == name + ts = Timestamp(datetime(2016, 4, day)) + assert ts.day_name(locale=time_locale) == name + dti = dti.append(DatetimeIndex([pd.NaT])) + assert np.isnan(dti.day_name(locale=time_locale)[-1]) + ts = Timestamp(pd.NaT) + assert np.isnan(ts.day_name(locale=time_locale)) + + # GH#12805 + dti = pd.date_range(freq="M", start="2012", end="2013") + result = dti.month_name(locale=time_locale) + expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes + # https://github.com/pandas-dev/pandas/issues/22342 + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + + tm.assert_index_equal(result, expected) + + for date, expected in zip(dti, expected_months): + result = date.month_name(locale=time_locale) + expected = expected.capitalize() + + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", result) + + assert result == expected + dti = dti.append(DatetimeIndex([pd.NaT])) + assert np.isnan(dti.month_name(locale=time_locale)[-1]) + + def test_nanosecond_field(self): + dti = DatetimeIndex(np.arange(10)) + + tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) + + +def test_iter_readonly(): + # GH#28055 ints_to_pydatetime with readonly array + arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) + arr.setflags(write=False) + dti = pd.to_datetime(arr) + list(dti) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_missing.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_missing.py new file mode 100644 index 0000000..3399c8e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_missing.py @@ -0,0 +1,62 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDatetimeIndex: + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_fillna_datetime64(self, tz): + # GH 11343 + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"]) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"] + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) + + # tz mismatch + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00", tz=tz), + pd.Timestamp("2011-01-01 11:00"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + # object + exp = pd.Index( + [pd.Timestamp("2011-01-01 09:00"), 
"x", pd.Timestamp("2011-01-01 11:00")], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], tz=tz + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) + + # object + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + "x", + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_ops.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_ops.py new file mode 100644 index 0000000..ecd4ace --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_ops.py @@ -0,0 +1,546 @@ +from datetime import datetime +import warnings + +import numpy as np +import pytest + +from pandas.core.dtypes.generic import ABCDateOffset + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + PeriodIndex, + Series, + Timestamp, + bdate_range, + date_range, +) +import pandas._testing as tm +from pandas.tests.base.test_ops import Ops + +from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestDatetimeIndexOps(Ops): + def setup_method(self, method): + super().setup_method(method) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) + self.is_valid_objs = [o for o in self.objs if mask(o)] + self.not_valid_objs = [o for o in self.objs if not mask(o)] + + def test_ops_properties(self): + f = lambda x: isinstance(x, DatetimeIndex) + self.check_ops_properties(DatetimeIndex._field_ops, f) + self.check_ops_properties(DatetimeIndex._object_ops, f) + self.check_ops_properties(DatetimeIndex._bool_ops, f) + + def test_ops_properties_basic(self): + + # sanity check that the behavior didn't change + # GH#7206 + for op in ["year", "day", "second", "weekday"]: + msg = f"'Series' object has no attribute '{op}'" + with pytest.raises(AttributeError, match=msg): + getattr(self.dt_series, op) + + # attribute access should still work! 
+ s = Series(dict(year=2000, month=1, day=10)) + assert s.year == 2000 + assert s.month == 1 + assert s.day == 10 + msg = "'Series' object has no attribute 'weekday'" + with pytest.raises(AttributeError, match=msg): + s.weekday + + def test_repeat_range(self, tz_naive_fixture): + tz = tz_naive_fixture + rng = date_range("1/1/2000", "1/1/2001") + + result = rng.repeat(5) + assert result.freq is None + assert len(result) == 5 * len(rng) + + index = pd.date_range("2001-01-01", periods=2, freq="D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz + ) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = pd.date_range("2001-01-01", periods=2, freq="2D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz + ) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = pd.DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) + exp = pd.DatetimeIndex( + [ + "2001-01-01", + "2001-01-01", + "2001-01-01", + "NaT", + "NaT", + "NaT", + "2003-01-01", + "2003-01-01", + "2003-01-01", + ], + tz=tz, + ) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + def test_repeat(self, tz_naive_fixture): + tz = tz_naive_fixture + reps = 2 + msg = "the 'axis' parameter is not supported" + + rng = pd.date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + ] + ) + + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + assert res.freq is None + + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) + with pytest.raises(ValueError, match=msg): + np.repeat(rng, reps, axis=1) + + def test_resolution(self, tz_naive_fixture): + tz = tz_naive_fixture + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + idx = pd.date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) + assert idx.resolution == expected + + def test_value_counts_unique(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH 7735 + idx = pd.date_range("2011-01-01 09:00", freq="H", periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + + exp_idx = pd.date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) + tm.assert_index_equal(idx.unique(), expected) + + idx = DatetimeIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + pd.NaT, + ], + tz=tz, + ) + + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) + expected = Series([3, 2], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", pd.NaT], 
tz=tz) + expected = Series([3, 2, 1], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + def test_nonunique_contains(self): + # GH 9512 + for idx in map( + DatetimeIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["2015", "2015", "2016"], + ["2015", "2015", "2014"], + ), + ): + assert idx[0] in idx + + @pytest.mark.parametrize( + "idx", + [ + DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" + ), + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="H", + name="tzidx", + tz="Asia/Tokyo", + ), + ], + ) + def test_order_with_freq(self, idx): + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + assert ordered.freq == idx.freq + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + assert ordered.freq == expected.freq + assert ordered.freq.n == -1 + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) + assert ordered.freq == idx.freq + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) + assert ordered.freq == expected.freq + assert ordered.freq.n == -1 + + @pytest.mark.parametrize( + "index_dates,expected_dates", + [ + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + [pd.NaT, "2011-01-03", "2011-01-05", "2011-01-02", pd.NaT], + [pd.NaT, pd.NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ], + ) + def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture): + tz = tz_naive_fixture + + # without freq + index = DatetimeIndex(index_dates, tz=tz, name="idx") + expected = DatetimeIndex(expected_dates, tz=tz, name="idx") + + ordered = index.sort_values() + tm.assert_index_equal(ordered, expected) + assert ordered.freq is None + + ordered = index.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + assert ordered.freq is None + + ordered, indexer = index.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq is None + + ordered, indexer = index.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq is None + + def test_drop_duplicates_metadata(self): + # GH 10115 + idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") + result = idx.drop_duplicates() + tm.assert_index_equal(idx, result) + assert idx.freq == result.freq + + idx_dup = idx.append(idx) + assert idx_dup.freq is None # freq is reset + result = idx_dup.drop_duplicates() + tm.assert_index_equal(idx, result) + assert result.freq is None + + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range("2011-01-01", "2011-01-31", 
freq="D", name="idx") + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep="last") + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep="last") + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + + @pytest.mark.parametrize( + "freq", + [ + "A", + "2A", + "-2A", + "Q", + "-1Q", + "M", + "-1M", + "D", + "3D", + "-3D", + "W", + "-1W", + "H", + "2H", + "-2H", + "T", + "2T", + "S", + "-3S", + ], + ) + def test_infer_freq(self, freq): + # GH 11018 + idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + result = pd.DatetimeIndex(idx.asi8, freq="infer") + tm.assert_index_equal(idx, result) + assert result.freq == freq + + def test_nat(self, tz_naive_fixture): + tz = tz_naive_fixture + assert pd.DatetimeIndex._na_value is pd.NaT + assert pd.DatetimeIndex([])._na_value is pd.NaT + + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert idx.hasnans is False + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) + + idx = pd.DatetimeIndex(["2011-01-01", "NaT"], tz=tz) + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans is True + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) + + def test_equals(self): + # GH 13107 + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"]) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"], tz="US/Pacific") + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific") + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) + + # check that we do not raise when comparing with OutOfBounds objects + oob = pd.Index([datetime(2500, 1, 1)] * 3, dtype=object) + assert not idx.equals(oob) + assert not idx2.equals(oob) + assert not idx3.equals(oob) + + # check that we do not raise when comparing with OutOfBounds dt64 + oob2 = oob.map(np.datetime64) + assert not idx.equals(oob2) + assert not idx2.equals(oob2) + assert not idx3.equals(oob2) + + @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_freq_setter(self, values, freq, tz): + # GH 20678 + idx = DatetimeIndex(values, tz=tz) + 
+ # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, ABCDateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_freq_setter_errors(self): + # GH 20678 + idx = DatetimeIndex(["20180101", "20180103", "20180105"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" + + +class TestBusinessDatetimeIndex: + def setup_method(self, method): + self.rng = bdate_range(START, END) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + assert comp[11] + assert not comp[9] + + def test_pickle_unpickle(self): + unpickled = tm.round_trip_pickle(self.rng) + assert unpickled.freq is not None + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + tm.assert_index_equal(cp, self.rng) + + def test_shift(self): + shifted = self.rng.shift(5) + assert shifted[0] == self.rng[5] + assert shifted.freq == self.rng.freq + + shifted = self.rng.shift(-5) + assert shifted[5] == self.rng[0] + assert shifted.freq == self.rng.freq + + shifted = self.rng.shift(0) + assert shifted[0] == self.rng[0] + assert shifted.freq == self.rng.freq + + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=BDay()) + assert shifted[0] == rng[0] + BDay() + + def test_equals(self): + assert not self.rng.equals(list(self.rng)) + + def test_identical(self): + t1 = self.rng.copy() + t2 = self.rng.copy() + assert t1.identical(t2) + + # name + t1 = t1.rename("foo") + assert t1.equals(t2) + assert not t1.identical(t2) + t2 = t2.rename("foo") + assert t1.identical(t2) + + # freq + t2v = Index(t2.values) + assert t1.equals(t2v) + assert not t1.identical(t2v) + + +class TestCustomDatetimeIndex: + def setup_method(self, method): + self.rng = bdate_range(START, END, freq="C") + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + assert comp[11] + assert not comp[9] + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + tm.assert_index_equal(cp, self.rng) + + def test_shift(self): + + shifted = self.rng.shift(5) + assert shifted[0] == self.rng[5] + assert shifted.freq == self.rng.freq + + shifted = self.rng.shift(-5) + assert shifted[5] == self.rng[0] + assert shifted.freq == self.rng.freq + + shifted = self.rng.shift(0) + assert shifted[0] == self.rng[0] + assert shifted.freq == self.rng.freq + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", pd.errors.PerformanceWarning) + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=CDay()) + assert shifted[0] == rng[0] + CDay() + + def test_shift_periods(self): + # GH#22458 : argument 'n' was deprecated in favor of 'periods' + idx = pd.date_range(start=START, end=END, periods=3) + tm.assert_index_equal(idx.shift(periods=0), idx) + tm.assert_index_equal(idx.shift(0), idx) + + def test_pickle_unpickle(self): + unpickled = tm.round_trip_pickle(self.rng) + assert unpickled.freq is not None + + def test_equals(self): + assert not self.rng.equals(list(self.rng)) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py new file mode 100644 index 0000000..e30cc44 --- 
/dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -0,0 +1,481 @@ +""" test partial slicing on Series/Frame """ + +from datetime import datetime +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.core.indexing import IndexingError + + +class TestSlicing: + def test_dti_slicing(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti2 = dti[[1, 3, 5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + assert v1 == Timestamp("2/28/2005") + assert v2 == Timestamp("4/30/2005") + assert v3 == Timestamp("6/30/2005") + + # don't carry freq through irregular slicing + assert dti2.freq is None + + def test_slice_keeps_name(self): + # GH4226 + st = pd.Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") + et = pd.Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") + dr = pd.date_range(st, et, freq="H", name="timebucket") + assert dr[1:].name == dr.name + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timestamp("2014-10-01") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10-01"::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[: Timestamp("2014-10-01") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10-01":-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC["2015-02-01":"2014-10-01":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC["2015-02-01" : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : "2014-10-01" : -1], SLC[13:8:-1] + ) + + assert_slices_equivalent(SLC["2014-10-01":"2015-02-01":-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] + + def test_slice_bounds_empty(self): + # GH#14354 + empty_idx = date_range(freq="1H", periods=0, end="2015") + + right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right", "loc") + exp = Timestamp("2015-01-02 23:59:59.999999999") + assert right == exp + + left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left", "loc") + exp = Timestamp("2015-01-02 00:00:00") + assert left == exp + + def test_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = pd.DatetimeIndex(["2017", "2017"]) + result = idx._maybe_cast_slice_bound("2017-01-01", "left", "loc") + expected = Timestamp("2017-01-01") + assert result == expected + + def test_monotone_DTI_indexing_bug(self): + # GH 19362 + # Testing accessing the first element in a monotonic descending + # partial string indexing. 
+ + df = pd.DataFrame(list(range(5))) + date_list = [ + "2018-01-02", + "2017-02-10", + "2016-03-10", + "2015-03-15", + "2014-03-16", + ] + date_index = pd.to_datetime(date_list) + df["date"] = date_index + expected = pd.DataFrame({0: list(range(5)), "date": date_index}) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame( + {"A": [1, 2, 3]}, index=pd.date_range("20170101", periods=3)[::-1] + ) + expected = pd.DataFrame({"A": 1}, index=pd.date_range("20170103", periods=1)) + tm.assert_frame_equal(df.loc["2017-01-03"], expected) + + def test_slice_year(self): + dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s["2005"] + expected = s[s.index.year == 2005] + tm.assert_series_equal(result, expected) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + result = df.loc["2005"] + expected = df[df.index.year == 2005] + tm.assert_frame_equal(result, expected) + + rng = date_range("1/1/2000", "1/1/2010") + + result = rng.get_loc("2009") + expected = slice(3288, 3653) + assert result == expected + + def test_slice_quarter(self): + dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + assert len(s["2001Q1"]) == 90 + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + assert len(df.loc["1Q01"]) == 90 + + def test_slice_month(self): + dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + assert len(s["2005-11"]) == 30 + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + assert len(df.loc["2005-11"]) == 30 + + tm.assert_series_equal(s["2005-11"], s["11-2005"]) + + def test_partial_slice(self): + rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-05":"2006-02"] + expected = s["20050501":"20060228"] + tm.assert_series_equal(result, expected) + + result = s["2005-05":] + expected = s["20050501":] + tm.assert_series_equal(result, expected) + + result = s[:"2006-02"] + expected = s[:"20060228"] + tm.assert_series_equal(result, expected) + + result = s["2005-1-1"] + assert result == s.iloc[0] + + with pytest.raises(KeyError, match=r"^'2004-12-31'$"): + s["2004-12-31"] + + def test_partial_slice_daily(self): + rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-31"] + tm.assert_series_equal(result, s.iloc[:24]) + + with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"): + s["2004-12-31 00"] + + def test_partial_slice_hourly(self): + rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[: 60 * 4]) + + result = s["2005-1-1 20"] + tm.assert_series_equal(result, s.iloc[:60]) + + assert s["2005-1-1 20:00"] == s.iloc[0] + with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"): + s["2004-12-31 00:15"] + + def test_partial_slice_minutely(self): + rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["2005-1-1 23:59"] + tm.assert_series_equal(result, s.iloc[:60]) + + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[:60]) + + assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0] + with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"): + s["2004-12-31 00:00:00"] + + def 
test_partial_slice_second_precision(self): + rng = date_range( + start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), + periods=20, + freq="US", + ) + s = Series(np.arange(20), rng) + + tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10]) + + tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:]) + + assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0] + with pytest.raises(KeyError, match="2005-1-1 00:00:00"): + s["2005-1-1 00:00:00"] + + def test_partial_slicing_dataframe(self): + # GH14856 + # Test various combinations of string slicing resolution vs. + # index resolution + # - If string resolution is less precise than index resolution, + # string is considered a slice + # - If string resolution is equal to or more precise than index + # resolution, string is considered an exact match + formats = [ + "%Y", + "%Y-%m", + "%Y-%m-%d", + "%Y-%m-%d %H", + "%Y-%m-%d %H:%M", + "%Y-%m-%d %H:%M:%S", + ] + resolutions = ["year", "month", "day", "hour", "minute", "second"] + for rnum, resolution in enumerate(resolutions[2:], 2): + # we check only 'day', 'hour', 'minute' and 'second' + unit = Timedelta("1 " + resolution) + middate = datetime(2012, 1, 1, 0, 0, 0) + index = DatetimeIndex([middate - unit, middate, middate + unit]) + values = [1, 2, 3] + df = DataFrame({"a": values}, index, dtype=np.int64) + assert df.index.resolution == resolution + + # Timestamp with the same resolution as index + # Should be exact match for Series (return scalar) + # and raise KeyError for Frame + for timestamp, expected in zip(index, values): + ts_string = timestamp.strftime(formats[rnum]) + # make ts_string as precise as index + result = df["a"][ts_string] + assert isinstance(result, np.int64) + assert result == expected + msg = fr"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df[ts_string] + + # Timestamp with resolution less precise than index + for fmt in formats[:rnum]: + for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: + ts_string = index[element].strftime(fmt) + + # Series should return slice + result = df["a"][ts_string] + expected = df["a"][theslice] + tm.assert_series_equal(result, expected) + + # Frame should return slice as well + result = df[ts_string] + expected = df[theslice] + tm.assert_frame_equal(result, expected) + + # Timestamp with resolution more precise than index + # Compatible with existing key + # Should return scalar for Series + # and raise KeyError for Frame + for fmt in formats[rnum + 1 :]: + ts_string = index[1].strftime(fmt) + result = df["a"][ts_string] + assert isinstance(result, np.int64) + assert result == 2 + msg = fr"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df[ts_string] + + # Not compatible with existing key + # Should raise KeyError + for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: + ts = index[1] + Timedelta("1 " + res) + ts_string = ts.strftime(fmt) + msg = fr"^'{ts_string}'$" + with pytest.raises(KeyError, match=msg): + df["a"][ts_string] + with pytest.raises(KeyError, match=msg): + df[ts_string] + + def test_partial_slicing_with_multiindex(self): + + # GH 4758 + # partial string indexing with a multi-index buggy + df = DataFrame( + { + "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], + "val": [1, 2, 3, 4], + }, + index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), + ) + df_multi = df.set_index(["ACCOUNT", 
"TICKER"], append=True) + + expected = DataFrame( + [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"] + ) + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")] + tm.assert_frame_equal(result, expected) + + expected = df_multi.loc[ + (pd.Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC") + ] + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] + tm.assert_series_equal(result, expected) + + # this is an IndexingError as we don't do partial string selection on + # multi-levels. + msg = "Too many indexers" + with pytest.raises(IndexingError, match=msg): + df_multi.loc[("2013-06-19", "ACCT1", "ABC")] + + # GH 4294 + # partial slice on a series mi + s = pd.DataFrame( + np.random.rand(1000, 1000), index=pd.date_range("2000-1-1", periods=1000) + ).stack() + + s2 = s[:-1].copy() + expected = s2["2000-1-4"] + result = s2[pd.Timestamp("2000-1-4")] + tm.assert_series_equal(result, expected) + + result = s[pd.Timestamp("2000-1-4")] + expected = s["2000-1-4"] + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame(s) + expected = df2.xs("2000-1-4") + result = df2.loc[pd.Timestamp("2000-1-4")] + tm.assert_frame_equal(result, expected) + + def test_partial_slice_doesnt_require_monotonicity(self): + # For historical reasons. + s = pd.Series(np.arange(10), pd.date_range("2014-01-01", periods=10)) + + nonmonotonic = s[[3, 5, 4]] + expected = nonmonotonic.iloc[:0] + timestamp = pd.Timestamp("2014-01-10") + + tm.assert_series_equal(nonmonotonic["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): + nonmonotonic[timestamp:] + + tm.assert_series_equal(nonmonotonic.loc["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): + nonmonotonic.loc[timestamp:] + + def test_loc_datetime_length_one(self): + # GH16071 + df = pd.DataFrame( + columns=["1"], + index=pd.date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"), + ) + result = df.loc[datetime(2016, 10, 1) :] + tm.assert_frame_equal(result, df) + + result = df.loc["2016-10-01T00:00:00":] + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "datetimelike", + [ + Timestamp("20130101"), + datetime(2013, 1, 1), + np.datetime64("2013-01-01T00:00", "ns"), + ], + ) + @pytest.mark.parametrize( + "op,expected", + [ + (operator.lt, [True, False, False, False]), + (operator.le, [True, True, False, False]), + (operator.eq, [False, True, False, False]), + (operator.gt, [False, False, False, True]), + ], + ) + def test_selection_by_datetimelike(self, datetimelike, op, expected): + # GH issue #17965, test for ability to compare datetime64[ns] columns + # to datetimelike + df = DataFrame( + { + "A": [ + pd.Timestamp("20120101"), + pd.Timestamp("20130101"), + np.nan, + pd.Timestamp("20130103"), + ] + } + ) + result = op(df.A, datetimelike) + expected = Series(expected, name="A") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "start", + [ + "2018-12-02 21:50:00+00:00", + pd.Timestamp("2018-12-02 21:50:00+00:00"), + pd.Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(), + ], + ) + @pytest.mark.parametrize( + "end", + [ + "2018-12-02 21:52:00+00:00", + pd.Timestamp("2018-12-02 21:52:00+00:00"), + pd.Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(), + ], + ) + def test_getitem_with_datestring_with_UTC_offset(self, start, end): + # GH 24076 + idx = pd.date_range( + start="2018-12-02 14:50:00-07:00", + end="2018-12-02 14:50:00-07:00", + freq="1min", + ) + df = pd.DataFrame(1, index=idx, 
columns=["A"]) + result = df[start:end] + expected = df.iloc[0:3, :] + tm.assert_frame_equal(result, expected) + + # GH 16785 + start = str(start) + end = str(end) + with pytest.raises(ValueError, match="Both dates must"): + df[start : end[:-4] + "1:00"] + + with pytest.raises(ValueError, match="The index must be timezone"): + df = df.tz_localize(None) + df[start:end] + + def test_slice_reduce_to_series(self): + # GH 27516 + df = pd.DataFrame( + {"A": range(24)}, index=pd.date_range("2000", periods=24, freq="M") + ) + expected = pd.Series( + range(12), index=pd.date_range("2000", periods=12, freq="M"), name="A" + ) + result = df.loc["2000", "A"] + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py new file mode 100644 index 0000000..84eee24 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -0,0 +1,338 @@ +""" +Tests for DatetimeIndex methods behaving like their Timestamp counterparts +""" +from datetime import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + +import pandas as pd +from pandas import DatetimeIndex, Timestamp, date_range +import pandas._testing as tm + +from pandas.tseries.frequencies import to_offset + + +class TestDatetimeIndexOps: + def test_dti_time(self): + rng = date_range("1/1/2000", freq="12min", periods=10) + result = pd.Index(rng).time + expected = [t.time() for t in rng] + assert (result == expected).all() + + def test_dti_date(self): + rng = date_range("1/1/2000", freq="12H", periods=10) + result = pd.Index(rng).date + expected = [t.date() for t in rng] + assert (result == expected).all() + + @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) + def test_dti_date_out_of_range(self, data): + # GH#1475 + msg = "Out of bounds nanosecond timestamp: 1400-01-01 00:00:00" + with pytest.raises(OutOfBoundsDatetime, match=msg): + DatetimeIndex(data) + + @pytest.mark.parametrize( + "field", + [ + "dayofweek", + "dayofyear", + "week", + "weekofyear", + "quarter", + "days_in_month", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + ], + ) + def test_dti_timestamp_fields(self, field): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) + expected = getattr(idx, field)[-1] + result = getattr(Timestamp(idx[-1]), field) + assert result == expected + + def test_dti_timestamp_freq_fields(self): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) + + assert idx.freq == Timestamp(idx[-1], idx.freq).freq + assert idx.freqstr == Timestamp(idx[-1], idx.freq).freqstr + + # ---------------------------------------------------------------- + # DatetimeIndex.round + + def test_round_daily(self): + dti = date_range("20130101 09:10:11", periods=5) + result = dti.round("D") + expected = date_range("20130101", periods=5) + tm.assert_index_equal(result, expected) + + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + result = dti.round("D") + expected = date_range("20130101", periods=5).tz_localize("US/Eastern") + tm.assert_index_equal(result, expected) + + result = dti.round("s") + tm.assert_index_equal(result, dti) + + @pytest.mark.parametrize( + "freq, error_msg", + [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", 
"Invalid frequency: foobar"), + ], + ) + def test_round_invalid(self, freq, error_msg): + dti = date_range("20130101 09:10:11", periods=5) + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + with pytest.raises(ValueError, match=error_msg): + dti.round(freq) + + def test_round(self, tz_naive_fixture): + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz) + elt = rng[1] + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 01:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + ] + ) + expected_elt = expected_rng[1] + + tm.assert_index_equal(rng.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt + + msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + rng.round(freq="foo") + with pytest.raises(ValueError, match=msg): + elt.round(freq="foo") + + msg = " is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + rng.round(freq="M") + with pytest.raises(ValueError, match=msg): + elt.round(freq="M") + + # GH#14440 & GH#15578 + index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz) + tm.assert_index_equal(result, expected) + + for freq in ["us", "ns"]: + tm.assert_index_equal(index, index.round(freq)) + + index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz) + tm.assert_index_equal(result, expected) + + index = DatetimeIndex(["2016-10-17 12:00:00.001501031"]) + result = index.round("10ns") + expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"]) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(False): + ts = "2016-10-17 12:00:00.001501031" + DatetimeIndex([ts]).round("1010ns") + + def test_no_rounding_occurs(self, tz_naive_fixture): + # GH 21262 + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:02:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:04:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:06:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:08:00", tz=tz, freq="2T"), + ] + ) + + tm.assert_index_equal(rng.round(freq="2T"), expected_rng) + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), + (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), + ( + ["2117-01-01 00:00:45.000000012"], + "floor", + "10ns", + ["2117-01-01 00:00:45.000000010"], + ), + ( + ["1823-01-01 00:00:01.000000012"], + "ceil", + "10ns", + ["1823-01-01 00:00:01.000000020"], + ), + (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), + (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), + (["2018-01-01 00:15:00"], "ceil", "15T", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15T", ["2018-01-01 00:15:00"]), + (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), + ( + ("NaT", "1823-01-01 00:00:01"), + "floor", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ( + ("NaT", 
"1823-01-01 00:00:01"), + "ceil", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ], + ) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = DatetimeIndex(list(test_input)) + func = getattr(dt, rounder) + result = func(freq) + expected = DatetimeIndex(list(expected)) + assert expected.equals(result) + + @pytest.mark.parametrize( + "start, index_freq, periods", + [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + ) + @pytest.mark.parametrize( + "round_freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "12H", + "1D", + ], + ) + def test_round_int64(self, start, index_freq, periods, round_freq): + dt = date_range(start=start, freq=index_freq, periods=periods) + unit = to_offset(round_freq).nanos + + # test floor + result = dt.floor(round_freq) + diff = dt.asi8 - result.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), "floor not a {} multiple".format(round_freq) + assert (0 <= diff).all() and (diff < unit).all(), "floor error" + + # test ceil + result = dt.ceil(round_freq) + diff = result.asi8 - dt.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), "ceil not a {} multiple".format(round_freq) + assert (0 <= diff).all() and (diff < unit).all(), "ceil error" + + # test round + result = dt.round(round_freq) + diff = abs(result.asi8 - dt.asi8) + mod = result.asi8 % unit + assert (mod == 0).all(), "round not a {} multiple".format(round_freq) + assert (diff <= unit // 2).all(), "round error" + if unit % 2 == 0: + assert ( + result.asi8[diff == unit // 2] % 2 == 0 + ).all(), "round half to even error" + + # ---------------------------------------------------------------- + # DatetimeIndex.normalize + + def test_normalize(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D") + tm.assert_index_equal(result, expected) + + arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( + "datetime64[ns]" + ) + rng_ns = DatetimeIndex(arr_ns) + rng_ns_normalized = rng_ns.normalize() + + arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( + "datetime64[ns]" + ) + expected = DatetimeIndex(arr_ns) + tm.assert_index_equal(rng_ns_normalized, expected) + + assert result.is_normalized + assert not rng.is_normalized + + def test_normalize_nat(self): + dti = DatetimeIndex([pd.NaT, Timestamp("2018-01-01 01:00:00")]) + result = dti.normalize() + expected = DatetimeIndex([pd.NaT, Timestamp("2018-01-01")]) + tm.assert_index_equal(result, expected) + + +class TestDateTimeIndexToJulianDate: + def test_1700(self): + dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="T") + r1 = 
pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="S") + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_setops.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_setops.py new file mode 100644 index 0000000..78188c5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_setops.py @@ -0,0 +1,596 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Int64Index, + Series, + bdate_range, + date_range, + to_datetime, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestDatetimeIndexSetOps: + tz = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Singapore", + "dateutil/US/Pacific", + ] + + # TODO: moved from test_datetimelike; dedup with version below + @pytest.mark.parametrize("sort", [None, False]) + def test_union2(self, sort): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + union = first.union(second, sort=sort) + tm.assert_index_equal(union, everything) + + @pytest.mark.parametrize("box", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_union3(self, sort, box): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + + # GH 10149 + expected = ( + first.astype("O").union(pd.Index(second.values, dtype="O")).astype("O") + ) + case = box(second.values) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", tz) + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, tz, sort): + rng1 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = pd.date_range("1/1/2000", freq="D", periods=10, tz=tz) + expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) + + rng2 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", periods=5, tz=tz) + expected2 = pd.date_range("1/1/2000", freq="D", periods=8, tz=tz) + expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) + + rng3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + expected3_notsorted = rng3 + + for rng, other, exp, exp_notsorted in [ + (rng1, other1, expected1, expected1_notsorted), + (rng2, other2, expected2, expected2_notsorted), + (rng3, other3, expected3, expected3_notsorted), + ]: + + result_union = rng.union(other, sort=sort) + tm.assert_index_equal(result_union, exp) + + result_union = other.union(rng, sort=sort) + if sort is None: + tm.assert_index_equal(result_union, exp) + else: + tm.assert_index_equal(result_union, exp_notsorted) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_coverage(self, sort): + idx = DatetimeIndex(["2000-01-03", "2000-01-01", "2000-01-02"]) + ordered = 
DatetimeIndex(idx.sort_values(), freq="infer") + result = ordered.union(idx, sort=sort) + tm.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered, sort=sort) + tm.assert_index_equal(result, ordered) + assert result.freq == ordered.freq + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1730(self, sort): + rng_a = date_range("1/1/2012", periods=4, freq="3H") + rng_b = date_range("1/1/2012", periods=4, freq="4H") + + result = rng_a.union(rng_b, sort=sort) + exp = list(rng_a) + list(rng_b[1:]) + if sort is None: + exp = DatetimeIndex(sorted(exp)) + else: + exp = DatetimeIndex(exp) + tm.assert_index_equal(result, exp) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1745(self, sort): + left = DatetimeIndex(["2012-05-11 15:19:49.695000"]) + right = DatetimeIndex( + [ + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) + + result = left.union(right, sort=sort) + exp = DatetimeIndex( + [ + "2012-05-11 15:19:49.695000", + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) + if sort is None: + exp = exp.sort_values() + tm.assert_index_equal(result, exp) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_4564(self, sort): + from pandas import DateOffset + + left = date_range("2013-01-01", "2013-02-01") + right = left + DateOffset(minutes=15) + + result = left.union(right, sort=sort) + exp = list(left) + list(right) + if sort is None: + exp = DatetimeIndex(sorted(exp)) + else: + exp = DatetimeIndex(exp) + tm.assert_index_equal(result, exp) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_freq_both_none(self, sort): + # GH11086 + expected = bdate_range("20150101", periods=10) + expected._data.freq = None + + result = expected.union(expected, sort=sort) + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_union_freq_infer(self): + # When taking the union of two DatetimeIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # TimedeltaIndex behavior. 
+ dti = pd.date_range("2016-01-01", periods=5) + left = dti[[0, 1, 3, 4]] + right = dti[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, dti) + assert result.freq == "D" + + def test_union_dataframe_index(self): + rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") + s1 = Series(np.random.randn(len(rng1)), rng1) + + rng2 = date_range("1/1/1980", "12/1/2001", freq="MS") + s2 = Series(np.random.randn(len(rng2)), rng2) + df = DataFrame({"s1": s1, "s2": s2}) + + exp = pd.date_range("1/1/1980", "1/1/2012", freq="MS") + tm.assert_index_equal(df.index, exp) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_with_DatetimeIndex(self, sort): + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = date_range(start="2012-01-03 00:00:00", periods=10, freq="D") + # Works + i1.union(i2, sort=sort) + # Fails with "AttributeError: can't set attribute" + i2.union(i1, sort=sort) + + # TODO: moved from test_datetimelike; de-duplicate with version below + def test_intersection2(self): + first = tm.makeDateIndex(10) + second = first[5:] + intersect = first.intersection(second) + assert tm.equalContents(intersect, second) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + assert tm.equalContents(result, second) + + third = Index(["a", "b", "c"]) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "tz", [None, "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, tz, sort): + # GH 4690 (with tz) + base = date_range("6/1/2000", "6/30/2000", freq="D", name="idx") + + # if target has the same name, it is preserved + rng2 = date_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = date_range("6/1/2000", "6/20/2000", freq="D", name="idx") + + # if target name is different, it will be reset + rng3 = date_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) + + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = DatetimeIndex([], name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # non-monotonic + base = DatetimeIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx" + ) + + rng2 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx" + ) + expected2 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name="idx") + + rng3 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + tz=tz, + name="other", + ) + expected3 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name=None) + + # GH 7880 + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq is None 
+ assert result.tz == expected.tz + + def test_intersection_empty(self): + # empty same freq GH2129 + rng = date_range("6/1/2000", "6/15/2000", freq="T") + result = rng[0:0].intersection(rng) + assert len(result) == 0 + + result = rng.intersection(rng[0:0]) + assert len(result) == 0 + + def test_intersection_bug_1708(self): + from pandas import DateOffset + + index_1 = date_range("1/1/2012", periods=4, freq="12H") + index_2 = index_1 + DateOffset(hours=1) + + result = index_1 & index_2 + assert len(result) == 0 + + @pytest.mark.parametrize("tz", tz) + @pytest.mark.parametrize("sort", [None, False]) + def test_difference(self, tz, sort): + rng_dates = ["1/2/2000", "1/3/2000", "1/1/2000", "1/4/2000", "1/5/2000"] + + rng1 = pd.DatetimeIndex(rng_dates, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = pd.DatetimeIndex(rng_dates, tz=tz) + + rng2 = pd.DatetimeIndex(rng_dates, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", periods=5, tz=tz) + expected2 = pd.DatetimeIndex(rng_dates[:3], tz=tz) + + rng3 = pd.DatetimeIndex(rng_dates, tz=tz) + other3 = pd.DatetimeIndex([], tz=tz) + expected3 = pd.DatetimeIndex(rng_dates, tz=tz) + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + ]: + result_diff = rng.difference(other, sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other, sort) + expected = DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_datetimeindex_diff(self, sort): + dti1 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=100) + dti2 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=98) + assert len(dti1.difference(dti2, sort)) == 2 + + @pytest.mark.parametrize("sort", [None, False]) + def test_datetimeindex_union_join_empty(self, sort): + dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") + empty = Index([]) + + result = dti.union(empty, sort=sort) + expected = dti.astype("O") + tm.assert_index_equal(result, expected) + + result = dti.join(empty) + assert isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, dti) + + def test_join_nonunique(self): + idx1 = to_datetime(["2012-11-06 16:00:11.477563", "2012-11-06 16:00:11.477563"]) + idx2 = to_datetime(["2012-11-06 15:11:09.006507", "2012-11-06 15:11:09.006507"]) + rs = idx1.join(idx2, how="outer") + assert rs.is_monotonic + + +class TestBusinessDatetimeIndex: + def setup_method(self, method): + self.rng = bdate_range(START, END) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = 
self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # order does not matter + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + else: + expected = pd.DatetimeIndex(list(right) + list(left)) + tm.assert_index_equal(right.union(left, sort=sort), expected) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_not_cacheable(self, sort): + rng = date_range("1/1/2000", periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2, sort=sort) + if sort is None: + tm.assert_index_equal(the_union, rng) + else: + expected = pd.DatetimeIndex(list(rng[10:]) + list(rng[:10])) + tm.assert_index_equal(the_union, expected) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2, sort=sort) + expected = rng[10:] + tm.assert_index_equal(the_union, expected) + + def test_intersection(self): + rng = date_range("1/1/2000", periods=50, freq=Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + expected = rng[10:25] + tm.assert_index_equal(the_int, expected) + assert isinstance(the_int, DatetimeIndex) + assert the_int.freq == rng.freq + + the_int = rng1.intersection(rng2.view(DatetimeIndex)) + tm.assert_index_equal(the_int, expected) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]) + tm.assert_index_equal(the_int, expected) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range("11/30/2011", "12/31/2011") + b = bdate_range("12/10/2011", "12/20/2011") + result = a.intersection(b) + tm.assert_index_equal(result, b) + + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_pytz(self, sort): + from pytz import timezone + + tz = timezone("US/Eastern") + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) + + early_dr.union(late_dr, sort=sort) + + @td.skip_if_windows_python_3 + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_dateutil(self, sort): + from pandas._libs.tslibs.timezones import dateutil_gettz + + tz = 
dateutil_gettz("US/Eastern") + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) + + early_dr.union(late_dr, sort=sort) + + +class TestCustomDatetimeIndex: + def setup_method(self, method): + self.rng = bdate_range(START, END, freq="C") + + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right, sort) + assert isinstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + # order does not matter + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_union = self.rng.union(rng, sort=sort) + assert isinstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how="outer") + assert isinstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=BMonthEnd()) + + the_join = self.rng.join(rng, how="outer") + assert isinstance(the_join, DatetimeIndex) + assert the_join.freq is None + + def test_intersection_bug(self): + # GH #771 + a = bdate_range("11/30/2011", "12/31/2011", freq="C") + b = bdate_range("12/10/2011", "12/20/2011", freq="C") + result = a.intersection(b) + tm.assert_index_equal(result, b) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_shift.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_shift.py new file mode 100644 index 0000000..1c87995 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_shift.py @@ -0,0 +1,117 @@ +from datetime import datetime + +import pytest +import pytz + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import DatetimeIndex, Series, date_range +import pandas._testing as tm + + +class TestDatetimeIndexShift: + + # ------------------------------------------------------------- + # DatetimeIndex.shift is used in integer addition + + def test_dti_shift_tzaware(self, tz_naive_fixture): + # GH#9903 + tz = tz_naive_fixture + idx = pd.DatetimeIndex([], name="xxx", tz=tz) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) + + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.DatetimeIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 
15:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.DatetimeIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + + def test_dti_shift_freqs(self): + # test shift for DatetimeIndex and non DatetimeIndex + # GH#8083 + drange = pd.date_range("20130101", periods=5) + result = drange.shift(1) + expected = pd.DatetimeIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) + tm.assert_index_equal(result, expected) + + result = drange.shift(-1) + expected = pd.DatetimeIndex( + ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + freq="D", + ) + tm.assert_index_equal(result, expected) + + result = drange.shift(3, freq="2D") + expected = pd.DatetimeIndex( + ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_dti_shift_int(self): + rng = date_range("1/1/2000", periods=20) + + result = rng + 5 * rng.freq + expected = rng.shift(5) + tm.assert_index_equal(result, expected) + + result = rng - 5 * rng.freq + expected = rng.shift(-5) + tm.assert_index_equal(result, expected) + + def test_dti_shift_no_freq(self): + # GH#19147 + dti = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) + with pytest.raises(NullFrequencyError): + dti.shift(2) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_shift_localized(self, tzstr): + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") + dr_tz = dr.tz_localize(tzstr) + + result = dr_tz.shift(1, "10T") + assert result.tz == dr_tz.tz + + def test_dti_shift_across_dst(self): + # GH 8616 + idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") + s = Series(index=idx[:-1], dtype=object) + result = s.shift(freq="H") + expected = Series(index=idx[1:], dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "shift, result_time", + [ + [0, "2014-11-14 00:00:00"], + [-1, "2014-11-13 23:00:00"], + [1, "2014-11-14 01:00:00"], + ], + ) + def test_dti_shift_near_midnight(self, shift, result_time): + # GH 8616 + dt = datetime(2014, 11, 14, 0) + dt_est = pytz.timezone("EST").localize(dt) + s = Series(data=[1], index=[dt_est]) + result = s.shift(shift, freq="H") + expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_timezones.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_timezones.py new file mode 100644 index 0000000..1505ac1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_timezones.py @@ -0,0 +1,1251 @@ +""" +Tests for DatetimeIndex timezone-related methods +""" +from datetime import date, datetime, time, timedelta, tzinfo + +import dateutil +from dateutil.tz import gettz, tzlocal +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import conversion, timezones +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Timestamp, + bdate_range, + date_range, + isna, + to_datetime, +) +import pandas._testing as tm + + +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name): + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return 
self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return timedelta(0) + + +fixed_off = FixedOffset(-420, "-07:00") +fixed_off_no_name = FixedOffset(-330, None) + + +class TestDatetimeIndexTimezones: + # ------------------------------------------------------------- + # DatetimeIndex.tz_convert + def test_tz_convert_nat(self): + # GH#5546 + dates = [pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) + + dates = ["2010-12-01 00:00", "2010-12-02 00:00", pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 03:00", "2010-12-02 03:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + idx = idx + pd.offsets.Hour(5) + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + idx = idx.tz_convert("US/Pacific") + expected = ["2010-12-01 05:00", "2010-12-02 05:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx + np.timedelta64(3, "h") + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 11:00", "2010-12-02 11:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_convert_compat_timestamp(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + + conv = idx[0].tz_convert(prefix + "US/Pacific") + expected = idx.tz_convert(prefix + "US/Pacific")[0] + + assert conv == expected + + def test_dti_tz_convert_hour_overflow_dst(self): + # Regression test for: + # https://github.com/pandas-dev/pandas/issues/13306 + + # sorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = [ + 
Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2009-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2009-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2008-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2008-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("freq, n", [("H", 1), ("T", 60), ("S", 3600)]) + def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See https://github.com/pandas-dev/pandas/issues/4496 for details. + idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize("UTC") + idx = idx.tz_convert("Europe/Moscow") + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + def test_dti_tz_convert_dst(self): + for freq, n in [("H", 1), ("T", 60), ("S", 3600)]: + # Start DST + idx = date_range( + "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range( + "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected)) + + # End DST + idx = date_range( + "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range( + "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected)) + + # daily + # Start DST + idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([19, 19])) + + idx = date_range( + "2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([5, 5])) + + # End DST + idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") + idx = 
idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([20, 20])) + + idx = date_range( + "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([4, 4])) + + def test_tz_convert_roundtrip(self, tz_aware_fixture): + tz = tz_aware_fixture + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") + + idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") + exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") + + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") + + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="T", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="T") + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: + converted = idx.tz_convert(tz) + reset = converted.tz_convert(None) + tm.assert_index_equal(reset, expected) + assert reset.tzinfo is None + expected = converted.tz_convert("UTC").tz_localize(None) + tm.assert_index_equal(reset, expected) + + def test_dti_tz_convert_tzlocal(self): + # GH#13583 + # tz_convert doesn't affect to internal + dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng_eastern = rng.tz_convert(tz) + + # Values are unmodified + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_convert_unsorted(self, tzstr): + dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") + dr = dr.tz_convert(tzstr) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) + + # ------------------------------------------------------------- + # DatetimeIndex.tz_localize + + def test_dti_tz_localize_nonexistent_raise_coerce(self): + # GH#13057 + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] + index = DatetimeIndex(times) + tz = "US/Eastern" + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz) + + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz, nonexistent="raise") + + result = index.tz_localize(tz=tz, nonexistent="NaT") + test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] + dti = to_datetime(test_times, utc=True) + expected = dti.tz_convert("US/Eastern") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_tz_localize_ambiguous_infer(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + 
dr.tz_localize(tz) + + # With repeated hours, we can infer the transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous="infer") + tm.assert_index_equal(dr, localized) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous="infer")) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, ambiguous="infer") + tm.assert_index_equal(localized, localized_infer) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_tz_localize_ambiguous_times(self, tz): + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) + with pytest.raises(pytz.NonExistentTimeError): + dr.tz_localize(tz) + + # after dst transition, it works + dr = date_range( + datetime(2011, 3, 13, 3, 30), periods=3, freq=pd.offsets.Hour(), tz=tz + ) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + dr.tz_localize(tz) + + # UTC is OK + dr = date_range( + datetime(2011, 3, 13), periods=48, freq=pd.offsets.Minute(30), tz=pytz.utc + ) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(tzstr) + + fromdates = DatetimeIndex(strdates, tz=tzstr) + + assert conv.tz == fromdates.tz + tm.assert_numpy_array_equal(conv.values, fromdates.values) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_localize(self, prefix): + tzstr = prefix + "US/Eastern" + dti = pd.date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="L") + dti2 = dti.tz_localize(tzstr) + + dti_utc = pd.date_range( + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="L", tz="utc" + ) + + tm.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(prefix + "US/Pacific") + tm.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = pd.date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") + with pytest.raises(pytz.AmbiguousTimeError): + dti.tz_localize(tzstr) + + dti = pd.date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") + with pytest.raises(pytz.NonExistentTimeError): + dti.tz_localize(tzstr) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_localize_utc_conversion(self, tz): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range("3/10/2012", "3/11/2012", freq="30T") + + converted = rng.tz_localize(tz) + expected_naive = rng + pd.offsets.Hour(5) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + # DST ambiguity, this should fail + rng = date_range("3/11/2012", "3/12/2012", freq="30T") + # Is this really how it should fail?? 
+ with pytest.raises(pytz.NonExistentTimeError): + rng.tz_localize(tz) + + def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): + # note: this tz tests that a tz-naive index can be localized + # and de-localized successfully, when there are no DST transitions + # in the range. + idx = date_range(start="2014-06-01", end="2014-08-30", freq="15T") + tz = tz_aware_fixture + localized = idx.tz_localize(tz) + # cant localize a tz-aware object + with pytest.raises(TypeError): + localized.tz_localize(tz) + reset = localized.tz_localize(None) + assert reset.tzinfo is None + tm.assert_index_equal(reset, idx) + + def test_dti_tz_localize_naive(self): + rng = date_range("1/1/2011", periods=100, freq="H") + + conv = rng.tz_localize("US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") + + tm.assert_index_equal(conv, exp) + + def test_dti_tz_localize_tzlocal(self): + # GH#13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start="2001-01-01", end="2001-03-01") + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_tz_localize_ambiguous_nat(self, tz): + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous="NaT") + + times = [ + "11/06/2011 00:00", + np.NaN, + np.NaN, + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di_test = DatetimeIndex(times, tz="US/Eastern") + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + tm.assert_numpy_array_equal(di_test.values, localized.values) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_tz_localize_ambiguous_flags(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + + # Pass in flags to determine right dst transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + + # Test tz_localize + di = DatetimeIndex(times) + is_dst = [1, 1, 0, 0, 0] + localized = di.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous=is_dst)) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) + tm.assert_index_equal(dr, localized) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) + tm.assert_index_equal(dr, localized) + + # Test constructor + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + + # Test duplicate times where inferring the dst fails + times += times + di = DatetimeIndex(times) + + # When the sizes are incompatible, make sure error is raised + with pytest.raises(Exception): + di.tz_localize(tz, ambiguous=is_dst) + + # When sizes are compatible and there are repeats ('infer' won't work) + is_dst = np.hstack((is_dst, is_dst)) + localized = di.tz_localize(tz, ambiguous=is_dst) + dr = dr.append(dr) + tm.assert_index_equal(dr, 
localized) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) + is_dst = np.array([1] * 10) + localized = dr.tz_localize(tz) + localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(localized, localized_is_dst) + + # TODO: belongs outside tz_localize tests? + @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) + def test_dti_construction_ambiguous_endpoint(self, tz): + # construction with an ambiguous end-point + # GH#11626 + + with pytest.raises(pytz.AmbiguousTimeError): + date_range( + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" + ) + + times = date_range( + "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" + ) + assert times[0] == Timestamp("2013-10-26 23:00", tz=tz, freq="H") + + if str(tz).startswith("dateutil"): + # fixed ambiguous behavior + # see GH#14621 + assert times[-1] == Timestamp("2013-10-27 01:00:00+0100", tz=tz, freq="H") + else: + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H") + + @pytest.mark.parametrize( + "tz, option, expected", + [ + ["US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["US/Pacific", "shift_backward", "2019-03-10 01:00"], + pytest.param( + "dateutil/US/Pacific", + "shift_backward", + "2019-03-10 01:00", + marks=pytest.mark.xfail(reason="GH 24329"), + ), + ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], + ], + ) + def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): + # construction with an nonexistent end-point + + with pytest.raises(pytz.NonExistentTimeError): + date_range( + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" + ) + + times = date_range( + "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option + ) + assert times[-1] == Timestamp(expected, tz=tz, freq="H") + + def test_dti_tz_localize_bdate_range(self): + dr = pd.bdate_range("1/1/2009", "1/1/2010") + dr_utc = pd.bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + tm.assert_index_equal(dr_utc, localized) + + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", [["NaT", pd.NaT], ["raise", None], ["foo", "invalid"]] + ) + def test_dti_tz_localize_nonexistent(self, tz, method, exp): + # GH 8917 + n = 60 + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") + if method == "raise": + with pytest.raises(pytz.NonExistentTimeError): + dti.tz_localize(tz, nonexistent=method) + elif exp == "invalid": + with pytest.raises(ValueError): + dti.tz_localize(tz, nonexistent=method) + else: + result = dti.tz_localize(tz, nonexistent=method) + expected = DatetimeIndex([exp] * n, tz=tz) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ 
+ "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): + # GH 8917 + tz = tz_type + tz + if isinstance(shift, str): + shift = "shift_" + shift + dti = DatetimeIndex([Timestamp(start_ts)]) + result = dti.tz_localize(tz, nonexistent=shift) + expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): + # GH 8917 + tz = tz_type + "Europe/Warsaw" + dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) + msg = "The provided timedelta will relocalize on a nonexistent time" + with pytest.raises(ValueError, match=msg): + dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) + + # ------------------------------------------------------------- + # DatetimeIndex.normalize + + def test_normalize_tz(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + @td.skip_if_windows + @pytest.mark.parametrize( + "timezone", + [ + "US/Pacific", + "US/Eastern", + "UTC", + "Asia/Kolkata", + "Asia/Shanghai", + "Australia/Canberra", + ], + ) + def test_normalize_tz_local(self, timezone): + # GH#13459 + with tm.set_timezone(timezone): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + # ------------------------------------------------------------ + # DatetimeIndex.__new__ + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_constructor_static_tzinfo(self, prefix): + # it works! 
+ index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") + index.hour + index[0] + + def test_dti_constructor_with_fixed_tz(self): + off = FixedOffset(420, "+07:00") + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + rng2 = date_range(start, periods=len(rng), tz=off) + tm.assert_index_equal(rng, rng2) + + rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") + assert (rng.values == rng3.values).all() + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_convert_datetime_list(self, tzstr): + dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") + dr2 = DatetimeIndex(list(dr), name="foo") + tm.assert_index_equal(dr, dr2) + assert dr.tz == dr2.tz + assert dr2.name == "foo" + + def test_dti_construction_univalent(self): + rng = date_range("03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern") + rng2 = DatetimeIndex(data=rng, tz="US/Eastern") + tm.assert_index_equal(rng, rng2) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_from_tzaware_datetime(self, tz): + d = [datetime(2012, 8, 19, tzinfo=tz)] + + index = DatetimeIndex(d) + assert timezones.tz_compare(index.tz, tz) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_constructors(self, tzstr): + """ Test different DatetimeIndex constructions with timezone + Follow-up of GH#4229 + """ + + arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] + + idx1 = to_datetime(arr).tz_localize(tzstr) + idx2 = pd.date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr) + idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + + for other in [idx2, idx3, idx4]: + tm.assert_index_equal(idx1, other) + + # ------------------------------------------------------------- + # Unsorted + + def test_join_utc_convert(self, join_type): + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + + left = rng.tz_convert("US/Eastern") + right = rng.tz_convert("Europe/Berlin") + + result = left.join(left[:-5], how=join_type) + assert isinstance(result, DatetimeIndex) + assert result.tz == left.tz + + result = left.join(right[:-5], how=join_type) + assert isinstance(result, DatetimeIndex) + assert result.tz.zone == "UTC" + + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_date_accessor(self, dtype): + # Regression test for GH#21230 + expected = np.array([date(2018, 6, 4), pd.NaT]) + + index = DatetimeIndex(["2018-06-04 10:00:00", pd.NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_time_accessor(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), pd.NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + + def test_timetz_accessor(self, tz_naive_fixture): + # GH21358 + tz = timezones.maybe_get_tz(tz_naive_fixture) + + expected = np.array([time(10, 20, 30, tzinfo=tz), pd.NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], tz=tz) + result = index.timetz + + tm.assert_numpy_array_equal(result, expected) + + def 
test_dti_drop_dont_lose_tz(self): + # GH#2621 + ind = date_range("2012-12-01", periods=10, tz="utc") + ind = ind.drop(ind[-1]) + + assert ind.tz is not None + + def test_dti_tz_conversion_freq(self, tz_naive_fixture): + # GH25241 + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") + assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq + t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="T") + assert t4.tz_convert(tz="UTC").freq == t4.freq + + def test_drop_dst_boundary(self): + # see gh-18031 + tz = "Europe/Brussels" + freq = "15min" + + start = pd.Timestamp("201710290100", tz=tz) + end = pd.Timestamp("201710290300", tz=tz) + index = pd.date_range(start=start, end=end, freq=freq) + + expected = DatetimeIndex( + [ + "201710290115", + "201710290130", + "201710290145", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290300", + ], + tz=tz, + freq=freq, + ambiguous=[ + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + ], + ) + result = index.drop(index[0]) + tm.assert_index_equal(result, expected) + + def test_date_range_localize(self): + rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") + rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + rng3 = rng3.tz_localize("US/Eastern") + + tm.assert_index_equal(rng, rng3) + + # DST transition time + val = rng[0] + exp = Timestamp("3/11/2012 03:00", tz="US/Eastern") + + assert val.hour == 3 + assert exp.hour == 3 + assert val == exp # same UTC value + tm.assert_index_equal(rng[:2], rng2) + + # Right before the DST transition + rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern") + tm.assert_index_equal(rng, rng2) + exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") + assert exp.hour == 0 + assert rng[0] == exp + exp = Timestamp("3/11/2012 01:00", tz="US/Eastern") + assert exp.hour == 1 + assert rng[1] == exp + + rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") + assert rng[2].hour == 3 + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range("1/1/2000", periods=20, tz="UTC") + eastern_range = utc_range.tz_convert("US/Eastern") + berlin_range = utc_range.tz_convert("Europe/Berlin") + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + assert a == b + assert b == c + assert a == c + + assert (utc_range == eastern_range).all() + assert (utc_range == berlin_range).all() + assert (berlin_range == eastern_range).all() + + def test_dti_intersection(self): + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + assert left.tz == rng.tz + result = left.intersection(right) + assert result.tz == left.tz + + def test_dti_equals_with_tz(self): + left = date_range("1/1/2011", periods=100, freq="H", tz="utc") + right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") + + assert not left.equals(right) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_nat(self, tzstr): + idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) + + assert isna(idx[1]) + assert idx[0].tzinfo is not None + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def 
test_dti_astype_asobject_tzinfos(self, tzstr): + # GH#1345 + + # dates around a dst transition + rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_with_timezone_repr(self, tzstr): + rng = date_range("4/13/2010", "5/6/2010") + + rng_eastern = rng.tz_localize(tzstr) + + rng_repr = repr(rng_eastern) + assert "2010-04-13 00:00:00" in rng_repr + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_take_dont_lose_meta(self, tzstr): + rng = date_range("1/1/2000", periods=20, tz=tzstr) + + result = rng.take(range(5)) + assert result.tz == rng.tz + assert result.freq == rng.freq + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_utc_box_timestamp_and_localize(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng_eastern = rng.tz_convert(tzstr) + + expected = rng[-1].astimezone(tz) + + stamp = rng_eastern[-1] + assert stamp == expected + assert stamp.tzinfo == expected.tzinfo + + # right tzinfo + rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") + rng_eastern = rng.tz_convert(tzstr) + # test not valid for dateutil timezones. + # assert 'EDT' in repr(rng_eastern[0].tzinfo) + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) + + def test_dti_to_pydatetime(self): + dt = dateutil.parser.parse("2012-06-13T01:39:00Z") + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_dti_to_pydatetime_fizedtz(self): + dates = np.array( + [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + ) + dti = DatetimeIndex(dates) + + result = dti.to_pydatetime() + tm.assert_numpy_array_equal(dates, result) + + result = dti._mpl_repr() + tm.assert_numpy_array_equal(dates, result) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) + def test_with_tz(self, tz): + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) + assert dr.tz is pytz.utc + + # DateRange with naive datetimes + dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) + + # normalized + central = dr.tz_convert(tz) + assert central.tz is tz + naive = central[0].to_pydatetime().replace(tzinfo=None) + comp = conversion.localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # compare vs a localized tz + naive = dr[0].to_pydatetime().replace(tzinfo=None) + comp = conversion.localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # datetimes with tzinfo set + dr = bdate_range( + datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + ) + with pytest.raises(Exception): + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) + + 
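+    # The next test checks that field accessors such as ``.hour`` reflect local
+    # wall time once an index is localized, both for US/Eastern and for a zone
+    # that does not observe DST (America/Atikokan).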
@pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_field_access_localize(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + assert (rng.hour == 0).all() + + # a more unusual time zone, #1946 + dr = date_range( + "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" + ) + + expected = Index(np.arange(10, dtype=np.int64)) + tm.assert_index_equal(dr.hour, expected) + + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_convert_tz_aware_datetime_datetime(self, tz): + # GH#1581 + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] + + dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates] + result = DatetimeIndex(dates_aware) + assert timezones.tz_compare(result.tz, tz) + + converted = to_datetime(dates_aware, utc=True) + ex_vals = np.array([Timestamp(x).value for x in dates_aware]) + tm.assert_numpy_array_equal(converted.asi8, ex_vals) + assert converted.tz is pytz.utc + + def test_dti_union_aware(self): + # non-overlapping + rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") + + result = rng.union(rng2) + expected = rng.astype("O").union(rng2.astype("O")) + tm.assert_index_equal(result, expected) + assert result[0].tz.zone == "US/Central" + assert result[-1].tz.zone == "US/Eastern" + + def test_dti_union_mixed(self): + # GH 21671 + rng = DatetimeIndex([pd.Timestamp("2011-01-01"), pd.NaT]) + rng2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") + result = rng.union(rng2) + expected = Index( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), + pd.Timestamp("2012-01-02", tz="Asia/Tokyo"), + ], + dtype=object, + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] + ) + @pytest.mark.usefixtures("datetime_tz_utc") + def test_iteration_preserves_nanoseconds(self, tz): + # GH 19603 + index = DatetimeIndex( + ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz + ) + for i, ts in enumerate(index): + assert ts == index[i] + + +class TestDateRange: + """Tests for date_range with timezones""" + + def test_hongkong_tz_convert(self): + # GH#1673 smoke test + dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") + + # it works! 
+ dr.hour + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_span_dst_transition(self, tzstr): + # GH#1778 + + # Standard -> Daylight Savings Time + dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") + + assert (dr.hour == 0).all() + + dr = date_range("2012-11-02", periods=10, tz=tzstr) + result = dr.hour + expected = Index([0] * 10) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_timezone_str_argument(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + result = date_range("1/1/2000", periods=10, tz=tzstr) + expected = date_range("1/1/2000", periods=10, tz=tz) + + tm.assert_index_equal(result, expected) + + def test_date_range_with_fixedoffset_noname(self): + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + idx = Index([start, end]) + assert off == idx.tz + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_with_tz(self, tzstr): + stamp = Timestamp("3/11/2012 05:00", tz=tzstr) + assert stamp.hour == 5 + + rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) + + assert stamp == rng[1] + + +class TestToDatetime: + """Tests for the to_datetime constructor with timezones""" + + def test_to_datetime_utc(self): + arr = np.array([dateutil.parser.parse("2012-06-13T01:39:00Z")], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_to_datetime_fixed_offset(self): + dates = [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + result = to_datetime(dates) + assert result.tz == fixed_off diff --git a/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_tools.py b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_tools.py new file mode 100644 index 0000000..fe65653 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/datetimes/test_tools.py @@ -0,0 +1,2315 @@ +""" test to_datetime """ + +import calendar +from collections import deque +from datetime import datetime, time +import locale + +from dateutil.parser import parse +from dateutil.tz.tz import tzoffset +import numpy as np +import pytest +import pytz + +from pandas._libs import tslib +from pandas._libs.tslibs import iNaT, parsing +from pandas.errors import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_datetime64_ns_dtype + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + NaT, + Series, + Timestamp, + date_range, + isna, + to_datetime, +) +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray +from pandas.core.tools import datetimes as tools + + +class TestTimeConversionFormats: + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format(self, cache): + values = ["1/1/2000", "1/2/2000", "1/3/2000"] + + results1 = [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")] + results2 = [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")] + for vals, expecteds in [ + (values, (Index(results1), Index(results2))), + (Series(values), (Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], 
(results1[2], results2[2])), + ]: + + for i, fmt in enumerate(["%d/%m/%Y", "%m/%d/%Y"]): + result = to_datetime(vals, format=fmt, cache=cache) + expected = expecteds[i] + + if isinstance(expected, Series): + tm.assert_series_equal(result, Series(expected)) + elif isinstance(expected, Timestamp): + assert result == expected + else: + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format_YYYYMMDD(self, cache): + s = Series([19801222, 19801222] + [19810105] * 5) + expected = Series([Timestamp(x) for x in s.apply(str)]) + + result = to_datetime(s, format="%Y%m%d", cache=cache) + tm.assert_series_equal(result, expected) + + result = to_datetime(s.apply(str), format="%Y%m%d", cache=cache) + tm.assert_series_equal(result, expected) + + # with NaT + expected = Series( + [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + ) + expected[2] = np.nan + s[2] = np.nan + + result = to_datetime(s, format="%Y%m%d", cache=cache) + tm.assert_series_equal(result, expected) + + # string with NaT + s = s.apply(str) + s[2] = "nat" + result = to_datetime(s, format="%Y%m%d", cache=cache) + tm.assert_series_equal(result, expected) + + # coercion + # GH 7930 + s = Series([20121231, 20141231, 99991231]) + result = pd.to_datetime(s, format="%Y%m%d", errors="ignore", cache=cache) + expected = Series( + [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object, + ) + tm.assert_series_equal(result, expected) + + result = pd.to_datetime(s, format="%Y%m%d", errors="coerce", cache=cache) + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input_s", + [ + # Null values with Strings + ["19801222", "20010112", None], + ["19801222", "20010112", np.nan], + ["19801222", "20010112", pd.NaT], + ["19801222", "20010112", "NaT"], + # Null values with Integers + [19801222, 20010112, None], + [19801222, 20010112, np.nan], + [19801222, 20010112, pd.NaT], + [19801222, 20010112, "NaT"], + ], + ) + def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): + # GH 30011 + # format='%Y%m%d' + # with None + expected = Series([Timestamp("19801222"), Timestamp("20010112"), pd.NaT]) + result = Series(pd.to_datetime(input_s, format="%Y%m%d")) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input_s, expected", + [ + # NaN before strings with invalid date values + [ + Series(["19801222", np.nan, "20010012", "10019999"]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN after strings with invalid date values + [ + Series(["19801222", "20010012", "10019999", np.nan]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN before integers with invalid date values + [ + Series([20190813, np.nan, 20010012, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + # NaN after integers with invalid date values + [ + Series([20190813, 20010012, np.nan, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + ], + ) + def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): + # GH 25512 + # format='%Y%m%d', errors='coerce' + result = pd.to_datetime(input_s, format="%Y%m%d", errors="coerce") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format_integer(self, cache): + # GH 10178 + s = Series([2000, 2001, 2002]) + expected = Series([Timestamp(x) for 
x in s.apply(str)]) + + result = to_datetime(s, format="%Y", cache=cache) + tm.assert_series_equal(result, expected) + + s = Series([200001, 200105, 200206]) + expected = Series([Timestamp(x[:4] + "-" + x[4:]) for x in s.apply(str)]) + + result = to_datetime(s, format="%Y%m", cache=cache) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "int_date, expected", + [ + # valid date, length == 8 + [20121030, datetime(2012, 10, 30)], + # short valid date, length == 6 + [199934, datetime(1999, 3, 4)], + # long integer date partially parsed to datetime(2012,1,1), length > 8 + [2012010101, 2012010101], + # invalid date partially parsed to datetime(2012,9,9), length == 8 + [20129930, 20129930], + # short integer date partially parsed to datetime(2012,9,9), length < 8 + [2012993, 2012993], + # short invalid date, length == 4 + [2121, 2121], + ], + ) + def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): + # GH 26583 + result = to_datetime(int_date, format="%Y%m%d", errors="ignore") + assert result == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format_microsecond(self, cache): + + # these are locale dependent + lang, _ = locale.getlocale() + month_abbr = calendar.month_abbr[4] + val = "01-{}-2011 00:00:01.978".format(month_abbr) + + format = "%d-%b-%Y %H:%M:%S.%f" + result = to_datetime(val, format=format, cache=cache) + exp = datetime.strptime(val, format) + assert result == exp + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format_time(self, cache): + data = [ + ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")], + ["01/10/2010 05:43", "%m/%d/%Y %I:%M", Timestamp("2010-01-10 05:43")], + [ + "01/10/2010 13:56:01", + "%m/%d/%Y %H:%M:%S", + Timestamp("2010-01-10 13:56:01"), + ] # , + # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', + # Timestamp('2010-01-10 20:14')], + # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', + # Timestamp('2010-01-10 07:40')], + # ['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p', + # Timestamp('2010-01-10 09:12:56')] + ] + for s, format, dt in data: + assert to_datetime(s, format=format, cache=cache) == dt + + @td.skip_if_has_locale + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_with_non_exact(self, cache): + # GH 10834 + # 8904 + # exact kw + s = Series( + ["19MAY11", "foobar19MAY11", "19MAY11:00:00:00", "19MAY11 00:00:00Z"] + ) + result = to_datetime(s, format="%d%b%y", exact=False, cache=cache) + expected = to_datetime( + s.str.extract(r"(\d+\w+\d+)", expand=False), format="%d%b%y", cache=cache + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_parse_nanoseconds_with_formula(self, cache): + + # GH8989 + # truncating the nanoseconds when a format was provided + for v in [ + "2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", + ]: + expected = pd.to_datetime(v, cache=cache) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) + assert result == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_format_weeks(self, cache): + data = [ + ["2009324", "%Y%W%w", Timestamp("2009-08-13")], + ["2013020", "%Y%U%w", Timestamp("2013-01-13")], + ] + for s, format, dt in data: + assert to_datetime(s, format=format, cache=cache) == dt + + @pytest.mark.parametrize( + "fmt,dates,expected_dates", + [ + [ + "%Y-%m-%d 
%H:%M:%S %Z", + ["2010-01-01 12:00:00 UTC"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tz="UTC")] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %Z", + [ + "2010-01-01 12:00:00 UTC", + "2010-01-01 12:00:00 GMT", + "2010-01-01 12:00:00 US/Pacific", + ], + [ + pd.Timestamp("2010-01-01 12:00:00", tz="UTC"), + pd.Timestamp("2010-01-01 12:00:00", tz="GMT"), + pd.Timestamp("2010-01-01 12:00:00", tz="US/Pacific"), + ], + ], + [ + "%Y-%m-%d %H:%M:%S%z", + ["2010-01-01 12:00:00+0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100"], + [ + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60)), + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(-60)), + ], + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], + [ + pd.Timestamp( + "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) + ), # pytz coerces to UTC + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + ], + ], + ], + ) + def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): + # GH 13486 + result = pd.to_datetime(dates, format=fmt) + expected = pd.Index(expected_dates) + tm.assert_equal(result, expected) + + with pytest.raises(ValueError): + pd.to_datetime(dates, format=fmt, utc=True) + + @pytest.mark.parametrize( + "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] + ) + def test_to_datetime_parse_timezone_malformed(self, offset): + fmt = "%Y-%m-%d %H:%M:%S %z" + date = "2010-01-01 12:00:00 " + offset + with pytest.raises(ValueError): + pd.to_datetime([date], format=fmt) + + def test_to_datetime_parse_timezone_keeps_name(self): + # GH 21697 + fmt = "%Y-%m-%d %H:%M:%S %z" + arg = pd.Index(["2010-01-01 12:00:00 Z"], name="foo") + result = pd.to_datetime(arg, format=fmt) + expected = pd.DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") + tm.assert_index_equal(result, expected) + + +class TestToDatetime: + @pytest.mark.parametrize( + "s, _format, dt", + [ + ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], + ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], + ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ], + ) + def test_to_datetime_iso_week_year_format(self, s, _format, dt): + # See GH#16607 + assert to_datetime(s, format=_format) == dt + + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 50", + "%Y %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 51", + "%G %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 Monday", + "%G %A", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 Mon", + "%G %a", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G %w", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G 
%u", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "2051", + "%G", + ], + [ + "Day of the year directive '%j' is not compatible with ISO year " + "directive '%G'. Use '%Y' instead.", + "1999 51 6 256", + "%G %V %u %j", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sunday", + "%Y %V %A", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sun", + "%Y %V %a", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %w", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %u", + ], + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "20", + "%V", + ], + ], + ) + def test_error_iso_week_year(self, msg, s, _format): + # See GH#16607 + # This test checks for errors thrown when giving the wrong format + # However, as discussed on PR#25541, overriding the locale + # causes a different error to be thrown due to the format being + # locale specific, but the test data is in english. + # Therefore, the tests only run when locale is not overwritten, + # as a sort of solution to this problem. + if locale.getlocale() != ("zh_CN", "UTF-8") and locale.getlocale() != ( + "it_IT", + "UTF-8", + ): + with pytest.raises(ValueError, match=msg): + to_datetime(s, format=_format) + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_to_datetime_dtarr(self, tz): + # DatetimeArray + dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) + arr = DatetimeArray(dti) + + result = to_datetime(arr) + assert result is arr + + result = to_datetime(arr) + assert result is arr + + def test_to_datetime_pydatetime(self): + actual = pd.to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_YYYYMMDD(self): + actual = pd.to_datetime("20080115") + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_unparseable_ignore(self): + # unparseable + s = "Month 1, 1999" + assert pd.to_datetime(s, errors="ignore") == s + + @td.skip_if_windows # `tm.set_timezone` does not work in windows + def test_to_datetime_now(self): + # See GH#18666 + with tm.set_timezone("US/Eastern"): + npnow = np.datetime64("now").astype("datetime64[ns]") + pdnow = pd.to_datetime("now") + pdnow2 = pd.to_datetime(["now"])[0] + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdnow.value - npnow.astype(np.int64)) < 1e10 + assert abs(pdnow2.value - npnow.astype(np.int64)) < 1e10 + + assert pdnow.tzinfo is None + assert pdnow2.tzinfo is None + + @td.skip_if_windows # `tm.set_timezone` does not work in windows + def test_to_datetime_today(self): + # See GH#18666 + # Test with one timezone far ahead of UTC and another far behind, so + # one of these will _almost_ always be in a different day from UTC. + # Unfortunately this test between 12 and 1 AM Samoa time + # this both of these timezones _and_ UTC will all be in the same day, + # so this test will not detect the regression introduced in #18666. 
+ with tm.set_timezone("Pacific/Auckland"): # 12-13 hours ahead of UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] + + tstoday = pd.Timestamp("today") + tstoday2 = pd.Timestamp.today() + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 + assert abs(pdtoday.value - tstoday.value) < 1e10 + assert abs(pdtoday.value - tstoday2.value) < 1e10 + + assert pdtoday.tzinfo is None + assert pdtoday2.tzinfo is None + + with tm.set_timezone("US/Samoa"): # 11 hours behind UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] + + # These should all be equal with infinite perf; this gives + # a generous margin of 10 seconds + assert abs(pdtoday.normalize().value - nptoday) < 1e10 + assert abs(pdtoday2.normalize().value - nptoday) < 1e10 + + assert pdtoday.tzinfo is None + assert pdtoday2.tzinfo is None + + def test_to_datetime_today_now_unicode_bytes(self): + to_datetime(["now"]) + to_datetime(["today"]) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_dt64s(self, cache): + in_bound_dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] + + for dt in in_bound_dts: + assert pd.to_datetime(dt, cache=cache) == Timestamp(dt) + + @pytest.mark.parametrize( + "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] + ) + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): + msg = "Out of bounds nanosecond timestamp: {}".format(dt) + with pytest.raises(OutOfBoundsDatetime, match=msg): + pd.to_datetime(dt, errors="raise") + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp(dt) + assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_array_of_dt64s(self, cache): + dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] + + # Assuming all datetimes are in bounds, to_datetime() returns + # an array that is equal to Timestamp() parsing + tm.assert_index_equal( + pd.to_datetime(dts, cache=cache), + pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]), + ) + + # A list of datetimes where the last one is out of bounds + dts_with_oob = dts + [np.datetime64("9999-01-01")] + + msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00" + with pytest.raises(OutOfBoundsDatetime, match=msg): + pd.to_datetime(dts_with_oob, errors="raise") + + tm.assert_index_equal( + pd.to_datetime(dts_with_oob, errors="coerce", cache=cache), + pd.DatetimeIndex( + [ + Timestamp(dts_with_oob[0]).asm8, + Timestamp(dts_with_oob[1]).asm8, + pd.NaT, + ] + ), + ) + + # With errors='ignore', out of bounds datetime64s + # are converted to their .item(), which depending on the version of + # numpy is either a python datetime.datetime or datetime.date + tm.assert_index_equal( + pd.to_datetime(dts_with_oob, errors="ignore", cache=cache), + pd.Index([dt.item() for dt in dts_with_oob]), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_tz(self, cache): + + # xref 8260 + # uniform returns a DatetimeIndex + arr = [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] + result = 
pd.to_datetime(arr, cache=cache) + expected = DatetimeIndex( + ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" + ) + tm.assert_index_equal(result, expected) + + # mixed tzs will raise + arr = [ + pd.Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), + ] + msg = ( + "Tz-aware datetime.datetime cannot be " + "converted to datetime64 unless utc=True" + ) + with pytest.raises(ValueError, match=msg): + pd.to_datetime(arr, cache=cache) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_different_offsets(self, cache): + # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark + # see GH-26097 for more + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" + arr = [ts_string_1] * 5 + [ts_string_2] * 5 + expected = pd.Index([parse(x) for x in arr]) + result = pd.to_datetime(arr, cache=cache) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_tz_pytz(self, cache): + # see gh-8260 + us_eastern = pytz.timezone("US/Eastern") + arr = np.array( + [ + us_eastern.localize( + datetime(year=2000, month=1, day=1, hour=3, minute=0) + ), + us_eastern.localize( + datetime(year=2000, month=6, day=1, hour=3, minute=0) + ), + ], + dtype=object, + ) + result = pd.to_datetime(arr, utc=True, cache=cache) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "init_constructor, end_constructor, test_method", + [ + (Index, DatetimeIndex, tm.assert_index_equal), + (list, DatetimeIndex, tm.assert_index_equal), + (np.array, DatetimeIndex, tm.assert_index_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_to_datetime_utc_true( + self, cache, init_constructor, end_constructor, test_method + ): + # See gh-11934 & gh-6415 + data = ["20100102 121314", "20100102 121315"] + expected_data = [ + pd.Timestamp("2010-01-02 12:13:14", tz="utc"), + pd.Timestamp("2010-01-02 12:13:15", tz="utc"), + ] + + result = pd.to_datetime( + init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache + ) + expected = end_constructor(expected_data) + test_method(result, expected) + + # Test scalar case as well + for scalar, expected in zip(data, expected_data): + result = pd.to_datetime( + scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache + ) + assert result == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_utc_true_with_series_single_value(self, cache): + # GH 15760 UTC=True with Series + ts = 1.5e18 + result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) + expected = pd.Series([pd.Timestamp(ts, tz="utc")]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): + ts = "2013-01-01 00:00:00-01:00" + expected_ts = "2013-01-01 01:00:00" + data = pd.Series([ts] * 3) + result = pd.to_datetime(data, utc=True, cache=cache) + expected = pd.Series([pd.Timestamp(expected_ts, tz="utc")] * 3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "date, dtype", + [ + ("2013-01-01 01:00:00", "datetime64[ns]"), + ("2013-01-01 01:00:00", "datetime64[ns, UTC]"), + ], + ) + def 
test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): + expected = pd.Series([pd.Timestamp("2013-01-01 01:00:00", tz="UTC")]) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, cache=cache) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + @td.skip_if_no("psycopg2") + def test_to_datetime_tz_psycopg2(self, cache): + + # xref 8260 + import psycopg2 + + # misc cases + tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None) + tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None) + arr = np.array( + [ + datetime(2000, 1, 1, 3, 0, tzinfo=tz1), + datetime(2000, 6, 1, 3, 0, tzinfo=tz2), + ], + dtype=object, + ) + + result = pd.to_datetime(arr, errors="coerce", utc=True, cache=cache) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) + tm.assert_index_equal(result, expected) + + # dtype coercion + i = pd.DatetimeIndex( + ["2000-01-01 08:00:00"], + tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None), + ) + assert is_datetime64_ns_dtype(i) + + # tz coercion + result = pd.to_datetime(i, errors="coerce", cache=cache) + tm.assert_index_equal(result, i) + + result = pd.to_datetime(i, errors="coerce", utc=True, cache=cache) + expected = pd.DatetimeIndex( + ["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]" + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_datetime_bool(self, cache): + # GH13176 + with pytest.raises(TypeError): + to_datetime(False) + assert to_datetime(False, errors="coerce", cache=cache) is NaT + assert to_datetime(False, errors="ignore", cache=cache) is False + with pytest.raises(TypeError): + to_datetime(True) + assert to_datetime(True, errors="coerce", cache=cache) is NaT + assert to_datetime(True, errors="ignore", cache=cache) is True + with pytest.raises(TypeError): + to_datetime([False, datetime.today()], cache=cache) + with pytest.raises(TypeError): + to_datetime(["20130101", True], cache=cache) + tm.assert_index_equal( + to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), + DatetimeIndex( + [to_datetime(0, cache=cache), NaT, NaT, to_datetime(0, cache=cache)] + ), + ) + + def test_datetime_invalid_datatype(self): + # GH13176 + + with pytest.raises(TypeError): + pd.to_datetime(bool) + with pytest.raises(TypeError): + pd.to_datetime(pd.to_datetime) + + @pytest.mark.parametrize("value", ["a", "00:01:99"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) + def test_datetime_invalid_scalar(self, value, format, infer): + # GH24763 + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) + assert res == value + + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) + assert res is pd.NaT + + with pytest.raises(ValueError): + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) + + @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) + def test_datetime_outofbounds_scalar(self, value, format, infer): + # GH24763 + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) + assert res == value + + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) + assert res 
is pd.NaT + + if format is not None: + with pytest.raises(ValueError): + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) + else: + with pytest.raises(OutOfBoundsDatetime): + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) + + @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) + def test_datetime_invalid_index(self, values, format, infer): + # GH24763 + res = pd.to_datetime( + values, errors="ignore", format=format, infer_datetime_format=infer + ) + tm.assert_index_equal(res, pd.Index(values)) + + res = pd.to_datetime( + values, errors="coerce", format=format, infer_datetime_format=infer + ) + tm.assert_index_equal(res, pd.DatetimeIndex([pd.NaT] * len(values))) + + with pytest.raises(ValueError): + pd.to_datetime( + values, errors="raise", format=format, infer_datetime_format=infer + ) + + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) + @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index, deque]) + def test_to_datetime_cache(self, utc, format, constructor): + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 + data = constructor(test_dates) + + result = pd.to_datetime(data, utc=utc, format=format, cache=True) + expected = pd.to_datetime(data, utc=utc, format=format, cache=False) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "listlike", + [ + (deque([pd.Timestamp("2010-06-02 09:30:00")] * 51)), + ([pd.Timestamp("2010-06-02 09:30:00")] * 51), + (tuple([pd.Timestamp("2010-06-02 09:30:00")] * 51)), + ], + ) + def test_no_slicing_errors_in_should_cache(self, listlike): + # GH 29403 + assert tools.should_cache(listlike) is True + + def test_to_datetime_from_deque(self): + # GH 29403 + result = pd.to_datetime(deque([pd.Timestamp("2010-06-02 09:30:00")] * 51)) + expected = pd.to_datetime([pd.Timestamp("2010-06-02 09:30:00")] * 51) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) + def test_to_datetime_cache_series(self, utc, format): + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 + data = pd.Series(test_dates) + result = pd.to_datetime(data, utc=utc, format=format, cache=True) + expected = pd.to_datetime(data, utc=utc, format=format, cache=False) + tm.assert_series_equal(result, expected) + + def test_to_datetime_cache_scalar(self): + date = "20130101 00:00:00" + result = pd.to_datetime(date, cache=True) + expected = pd.Timestamp("20130101 00:00:00") + assert result == expected + + @pytest.mark.parametrize( + "date, format", + [ + ("2017-20", "%Y-%W"), + ("20 Sunday", "%W %A"), + ("20 Sun", "%W %a"), + ("2017-21", "%Y-%U"), + ("20 Sunday", "%U %A"), + ("20 Sun", "%U %a"), + ], + ) + def test_week_without_day_and_calendar_year(self, date, format): + # GH16774 + + msg = "Cannot use '%W' or '%U' without day and year" + with pytest.raises(ValueError, match=msg): + pd.to_datetime(date, format=format) + + def test_to_datetime_coerce(self): + # GH 26122 + ts_strings = [ + "March 1, 2018 12:00:00+0400", + "March 1, 2018 12:00:00+0500", + "20100240", + ] + result = to_datetime(ts_strings, errors="coerce") + expected = Index( + [ + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 14400)), + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 18000)), + 
NaT, + ] + ) + tm.assert_index_equal(result, expected) + + def test_to_datetime_coerce_malformed(self): + # GH 28299 + ts_strings = ["200622-12-31", "111111-24-11"] + result = to_datetime(ts_strings, errors="coerce") + expected = Index([NaT, NaT]) + tm.assert_index_equal(result, expected) + + def test_iso_8601_strings_with_same_offset(self): + # GH 17697, 11736 + ts_str = "2015-11-18 15:30:00+05:30" + result = to_datetime(ts_str) + expected = Timestamp(ts_str) + assert result == expected + + expected = DatetimeIndex([Timestamp(ts_str)] * 2) + result = to_datetime([ts_str] * 2) + tm.assert_index_equal(result, expected) + + result = DatetimeIndex([ts_str] * 2) + tm.assert_index_equal(result, expected) + + def test_iso_8601_strings_with_different_offsets(self): + # GH 17697, 11736 + ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] + result = to_datetime(ts_strings) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + datetime(2015, 11, 18, 16, 30, tzinfo=tzoffset(None, 23400)), + NaT, + ], + dtype=object, + ) + # GH 21864 + expected = Index(expected) + tm.assert_index_equal(result, expected) + + result = to_datetime(ts_strings, utc=True) + expected = DatetimeIndex( + [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" + ) + tm.assert_index_equal(result, expected) + + def test_iso8601_strings_mixed_offsets_with_naive(self): + # GH 24992 + result = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+12:00", + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+06:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) + expected = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-27T12:00:00", + "2018-11-28T00:00:00", + "2018-11-27T18:00:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) + tm.assert_index_equal(result, expected) + + items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] + result = pd.to_datetime(items, utc=True) + expected = pd.to_datetime(list(reversed(items)), utc=True)[::-1] + tm.assert_index_equal(result, expected) + + def test_mixed_offsets_with_native_datetime_raises(self): + # GH 25978 + s = pd.Series( + [ + "nan", + pd.Timestamp("1990-01-01"), + "2015-03-14T16:15:14.123-08:00", + "2019-03-04T21:56:32.620-07:00", + None, + ] + ) + with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): + pd.to_datetime(s) + + def test_non_iso_strings_with_tz_offset(self): + result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) + expected = DatetimeIndex( + [datetime(2018, 3, 1, 12, tzinfo=pytz.FixedOffset(240))] * 2 + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "ts, expected", + [ + (Timestamp("2018-01-01"), Timestamp("2018-01-01", tz="UTC")), + ( + Timestamp("2018-01-01", tz="US/Pacific"), + Timestamp("2018-01-01 08:00", tz="UTC"), + ), + ], + ) + def test_timestamp_utc_true(self, ts, expected): + # GH 24415 + result = to_datetime(ts, utc=True) + assert result == expected + + @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) + def test_to_datetime_with_format_out_of_bounds(self, dt_str): + # GH 9107 + with pytest.raises(OutOfBoundsDatetime): + pd.to_datetime(dt_str, format="%Y%m%d") + + +class TestToDatetimeUnit: + @pytest.mark.parametrize("cache", [True, False]) + def test_unit(self, cache): + # GH 11758 + # test proper behavior with errors + + with pytest.raises(ValueError): + to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + + values = [11111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] + 
result = to_datetime(values, unit="D", errors="ignore", cache=cache) + expected = Index( + [ + 11111111, + Timestamp("1970-01-02"), + Timestamp("1970-01-02"), + NaT, + NaT, + NaT, + NaT, + NaT, + ], + dtype=object, + ) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, unit="D", errors="coerce", cache=cache) + expected = DatetimeIndex( + ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"] + ) + tm.assert_index_equal(result, expected) + + with pytest.raises(tslib.OutOfBoundsDatetime): + to_datetime(values, unit="D", errors="raise", cache=cache) + + values = [1420043460000, iNaT, NaT, np.nan, "NaT"] + + result = to_datetime(values, errors="ignore", unit="s", cache=cache) + expected = Index([1420043460000, NaT, NaT, NaT, NaT], dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, errors="coerce", unit="s", cache=cache) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) + tm.assert_index_equal(result, expected) + + with pytest.raises(tslib.OutOfBoundsDatetime): + to_datetime(values, errors="raise", unit="s", cache=cache) + + # if we have a string, then we raise a ValueError + # and NOT an OutOfBoundsDatetime + for val in ["foo", Timestamp("20130101")]: + try: + to_datetime(val, errors="raise", unit="s", cache=cache) + except tslib.OutOfBoundsDatetime: + raise AssertionError("incorrect exception raised") + except ValueError: + pass + + @pytest.mark.parametrize("cache", [True, False]) + def test_unit_consistency(self, cache): + + # consistency of conversions + expected = Timestamp("1970-05-09 14:25:11") + result = pd.to_datetime(11111111, unit="s", errors="raise", cache=cache) + assert result == expected + assert isinstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit="s", errors="coerce", cache=cache) + assert result == expected + assert isinstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit="s", errors="ignore", cache=cache) + assert result == expected + assert isinstance(result, Timestamp) + + @pytest.mark.parametrize("cache", [True, False]) + def test_unit_with_numeric(self, cache): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(["2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr1 = [1.434692e18, 1.432766e18] + arr2 = np.array(arr1).astype("int64") + for errors in ["ignore", "raise", "coerce"]: + result = pd.to_datetime(arr1, errors=errors, cache=cache) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors, cache=cache) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr = ["foo", 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex( + ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"] + ) + arr = [1.434692e18, 1.432766e18, "foo", "NaT"] + result = pd.to_datetime(arr, errors="coerce", cache=cache) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_unit_mixed(self, cache): + + # mixed integers/datetimes + expected = DatetimeIndex(["2013-01-01", "NaT", "NaT"]) + arr = [pd.Timestamp("20130101"), 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) + tm.assert_index_equal(result, expected) + + with pytest.raises(ValueError): + pd.to_datetime(arr, 
errors="raise", cache=cache) + + expected = DatetimeIndex(["NaT", "NaT", "2013-01-01"]) + arr = [1.434692e18, 1.432766e18, pd.Timestamp("20130101")] + result = pd.to_datetime(arr, errors="coerce", cache=cache) + tm.assert_index_equal(result, expected) + + with pytest.raises(ValueError): + pd.to_datetime(arr, errors="raise", cache=cache) + + @pytest.mark.parametrize("cache", [True, False]) + def test_unit_rounding(self, cache): + # GH 14156: argument will incur floating point errors but no + # premature rounding + result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache) + expected = pd.Timestamp("2015-06-19 19:55:31.877000093") + assert result == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_unit_ignore_keeps_name(self, cache): + # GH 21697 + expected = pd.Index([15e9] * 2, name="name") + result = pd.to_datetime(expected, errors="ignore", unit="s", cache=cache) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_dataframe(self, cache): + + df = DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "hour": [6, 7], + "minute": [58, 59], + "second": [10, 11], + "ms": [1, 1], + "us": [2, 2], + "ns": [3, 3], + } + ) + + result = to_datetime( + {"year": df["year"], "month": df["month"], "day": df["day"]}, cache=cache + ) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:0:00")] + ) + tm.assert_series_equal(result, expected) + + # dict-like + result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) + tm.assert_series_equal(result, expected) + + # dict but with constructable + df2 = df[["year", "month", "day"]].to_dict() + df2["month"] = 2 + result = to_datetime(df2, cache=cache) + expected2 = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + ) + tm.assert_series_equal(result, expected2) + + # unit mappings + units = [ + { + "year": "years", + "month": "months", + "day": "days", + "hour": "hours", + "minute": "minutes", + "second": "seconds", + }, + { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + }, + ] + + for d in units: + result = to_datetime(df[list(d.keys())].rename(columns=d), cache=cache) + expected = Series( + [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] + ) + tm.assert_series_equal(result, expected) + + d = { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + "ms": "ms", + "us": "us", + "ns": "ns", + } + + result = to_datetime(df.rename(columns=d), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + + # coerce back to int + result = to_datetime(df.astype(str), cache=cache) + tm.assert_series_equal(result, expected) + + # passing coerce + df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) + + msg = ( + "cannot assemble the datetimes: time data .+ does not " + r"match format '%Y%m%d' \(match\)" + ) + with pytest.raises(ValueError, match=msg): + to_datetime(df2, cache=cache) + result = to_datetime(df2, errors="coerce", cache=cache) + expected = Series([Timestamp("20150204 00:00:00"), NaT]) + tm.assert_series_equal(result, expected) + + # extra columns + msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" + with pytest.raises(ValueError, match=msg): + df2 = df.copy() + df2["foo"] = 1 + 
to_datetime(df2, cache=cache) + + # not enough + msg = ( + r"to assemble mappings requires at least that \[year, month, " + r"day\] be specified: \[.+\] is missing" + ) + for c in [ + ["year"], + ["year", "month"], + ["year", "month", "second"], + ["month", "day"], + ["year", "day", "second"], + ]: + with pytest.raises(ValueError, match=msg): + to_datetime(df[c], cache=cache) + + # duplicates + msg = "cannot assemble with duplicate keys" + df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) + df2.columns = ["year", "year", "day"] + with pytest.raises(ValueError, match=msg): + to_datetime(df2, cache=cache) + + df2 = DataFrame( + {"year": [2015, 2016], "month": [2, 20], "day": [4, 5], "hour": [4, 5]} + ) + df2.columns = ["year", "month", "day", "day"] + with pytest.raises(ValueError, match=msg): + to_datetime(df2, cache=cache) + + @pytest.mark.parametrize("cache", [True, False]) + def test_dataframe_dtypes(self, cache): + # #13451 + df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) + + # int16 + result = to_datetime(df.astype("int16"), cache=cache) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) + tm.assert_series_equal(result, expected) + + # mixed dtypes + df["month"] = df["month"].astype("int8") + df["day"] = df["day"].astype("int8") + result = to_datetime(df, cache=cache) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) + tm.assert_series_equal(result, expected) + + # float + df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) + with pytest.raises(ValueError): + to_datetime(df, cache=cache) + + def test_dataframe_utc_true(self): + # GH 23760 + df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) + result = pd.to_datetime(df, utc=True) + expected = pd.Series( + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + ).dt.tz_localize("UTC") + tm.assert_series_equal(result, expected) + + def test_to_datetime_errors_ignore_utc_true(self): + # GH 23758 + result = pd.to_datetime([1], unit="s", utc=True, errors="ignore") + expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") + tm.assert_index_equal(result, expected) + + +class TestToDatetimeMisc: + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) + + with pytest.raises(OutOfBoundsDatetime): + to_datetime(arr) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_iso8601(self, cache): + result = to_datetime(["2012-01-01 00:00:00"], cache=cache) + exp = Timestamp("2012-01-01 00:00:00") + assert result[0] == exp + + result = to_datetime(["20121001"], cache=cache) # bad iso 8601 + exp = Timestamp("2012-10-01") + assert result[0] == exp + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_default(self, cache): + rs = to_datetime("2001", cache=cache) + xp = datetime(2001, 1, 1) + assert rs == xp + + # dayfirst is essentially broken + + # to_datetime('01-13-2012', dayfirst=True) + # pytest.raises(ValueError, to_datetime('01-13-2012', + # dayfirst=True)) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_on_datetime64_series(self, cache): + # #2699 + s = Series(date_range("1/1/2000", periods=10)) + + result = to_datetime(s, cache=cache) + assert result[0] == s[0] + + @pytest.mark.parametrize("cache", [True, False]) + def 
test_to_datetime_with_space_in_series(self, cache): + # GH 6428 + s = Series(["10/18/2006", "10/18/2008", " "]) + msg = r"(\(')?String does not contain a date(:', ' '\))?" + with pytest.raises(ValueError, match=msg): + to_datetime(s, errors="raise", cache=cache) + result_coerce = to_datetime(s, errors="coerce", cache=cache) + expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + tm.assert_series_equal(result_coerce, expected_coerce) + result_ignore = to_datetime(s, errors="ignore", cache=cache) + tm.assert_series_equal(result_ignore, s) + + @td.skip_if_has_locale + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_with_apply(self, cache): + # this is only locale tested with US/None locales + # GH 5195 + # with a format and coerce a single item to_datetime fails + td = Series(["May 04", "Jun 02", "Dec 11"], index=[1, 2, 3]) + expected = pd.to_datetime(td, format="%b %y", cache=cache) + result = td.apply(pd.to_datetime, format="%b %y", cache=cache) + tm.assert_series_equal(result, expected) + + td = pd.Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) + msg = r"time data '' does not match format '%b %y' \(match\)" + with pytest.raises(ValueError, match=msg): + pd.to_datetime(td, format="%b %y", errors="raise", cache=cache) + with pytest.raises(ValueError, match=msg): + td.apply(pd.to_datetime, format="%b %y", errors="raise", cache=cache) + expected = pd.to_datetime(td, format="%b %y", errors="coerce", cache=cache) + + result = td.apply( + lambda x: pd.to_datetime(x, format="%b %y", errors="coerce", cache=cache) + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_types(self, cache): + + # empty string + result = to_datetime("", cache=cache) + assert result is NaT + + result = to_datetime(["", ""], cache=cache) + assert isna(result).all() + + # ints + result = Timestamp(0) + expected = to_datetime(0, cache=cache) + assert result == expected + + # GH 3888 (strings) + expected = to_datetime(["2012"], cache=cache)[0] + result = to_datetime("2012", cache=cache) + assert result == expected + + # array = ['2012','20120101','20120101 12:01:01'] + array = ["20120101", "20120101 12:01:01"] + expected = list(to_datetime(array, cache=cache)) + result = [Timestamp(date_str) for date_str in array] + tm.assert_almost_equal(result, expected) + + # currently fails ### + # result = Timestamp('2012') + # expected = to_datetime('2012') + # assert result == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_unprocessable_input(self, cache): + # GH 4928 + # GH 21864 + result = to_datetime([1, "1"], errors="ignore", cache=cache) + + expected = Index(np.array([1, "1"], dtype="O")) + tm.assert_equal(result, expected) + msg = "invalid string coercion to datetime" + with pytest.raises(TypeError, match=msg): + to_datetime([1, "1"], errors="raise", cache=cache) + + def test_to_datetime_other_datetime64_units(self): + # 5/25/2012 + scalar = np.int64(1337904000000000).view("M8[us]") + as_obj = scalar.astype("O") + + index = DatetimeIndex([scalar]) + assert index[0] == scalar.astype("O") + + value = Timestamp(scalar) + assert value == as_obj + + def test_to_datetime_list_of_integers(self): + rng = date_range("1/1/2000", periods=20) + rng = DatetimeIndex(rng.values) + + ints = list(rng.asi8) + + result = DatetimeIndex(ints) + + tm.assert_index_equal(rng, result) + + def test_to_datetime_overflow(self): + # gh-17637 + # we are overflowing Timedelta range here + + with 
pytest.raises(OverflowError): + date_range(start="1/1/1700", freq="B", periods=100000) + + @pytest.mark.parametrize("cache", [True, False]) + def test_string_na_nat_conversion(self, cache): + # GH #999, #858 + + strings = np.array( + ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object + ) + + expected = np.empty(4, dtype="M8[ns]") + for i, val in enumerate(strings): + if isna(val): + expected[i] = iNaT + else: + expected[i] = parse(val) + + result = tslib.array_to_datetime(strings)[0] + tm.assert_almost_equal(result, expected) + + result2 = to_datetime(strings, cache=cache) + assert isinstance(result2, DatetimeIndex) + tm.assert_numpy_array_equal(result, result2.values) + + malformed = np.array(["1/100/2000", np.nan], dtype=object) + + # GH 10636, default is now 'raise' + msg = r"Unknown string format:|day is out of range for month" + with pytest.raises(ValueError, match=msg): + to_datetime(malformed, errors="raise", cache=cache) + + result = to_datetime(malformed, errors="ignore", cache=cache) + # GH 21864 + expected = Index(malformed) + tm.assert_index_equal(result, expected) + + with pytest.raises(ValueError, match=msg): + to_datetime(malformed, errors="raise", cache=cache) + + idx = ["a", "b", "c", "d", "e"] + series = Series( + ["1/1/2000", np.nan, "1/3/2000", np.nan, "1/5/2000"], index=idx, name="foo" + ) + dseries = Series( + [ + to_datetime("1/1/2000", cache=cache), + np.nan, + to_datetime("1/3/2000", cache=cache), + np.nan, + to_datetime("1/5/2000", cache=cache), + ], + index=idx, + name="foo", + ) + + result = to_datetime(series, cache=cache) + dresult = to_datetime(dseries, cache=cache) + + expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + for i in range(5): + x = series[i] + if isna(x): + expected[i] = iNaT + else: + expected[i] = to_datetime(x, cache=cache) + + tm.assert_series_equal(result, expected, check_names=False) + assert result.name == "foo" + + tm.assert_series_equal(dresult, expected, check_names=False) + assert dresult.name == "foo" + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + @pytest.mark.parametrize("cache", [True, False]) + def test_dti_constructor_numpy_timeunits(self, cache, dtype): + # GH 9114 + base = pd.to_datetime( + ["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache + ) + + values = base.values.astype(dtype) + + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values, cache=cache), base) + + @pytest.mark.parametrize("cache", [True, False]) + def test_dayfirst(self, cache): + # GH 5917 + arr = ["10/02/2014", "11/02/2014", "12/02/2014"] + expected = DatetimeIndex( + [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] + ) + idx1 = DatetimeIndex(arr, dayfirst=True) + idx2 = DatetimeIndex(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True, cache=cache) + idx4 = to_datetime(np.array(arr), dayfirst=True, cache=cache) + idx5 = DatetimeIndex(Index(arr), dayfirst=True) + idx6 = DatetimeIndex(Series(arr), dayfirst=True) + tm.assert_index_equal(expected, idx1) + tm.assert_index_equal(expected, idx2) + tm.assert_index_equal(expected, idx3) + tm.assert_index_equal(expected, idx4) + tm.assert_index_equal(expected, idx5) + tm.assert_index_equal(expected, idx6) + + @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) + def test_to_datetime_dta_tz(self, klass): + # GH#27733 + dti = date_range("2015-04-05", 
periods=3).rename("foo") + expected = dti.tz_localize("UTC") + + obj = klass(dti) + expected = klass(expected) + + result = to_datetime(obj, utc=True) + tm.assert_equal(result, expected) + + +class TestGuessDatetimeFormat: + @td.skip_if_not_us_locale + def test_guess_datetime_format_for_array(self): + expected_format = "%Y-%m-%d %H:%M:%S.%f" + dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) + + test_arrays = [ + np.array([dt_string, dt_string, dt_string], dtype="O"), + np.array([np.nan, np.nan, dt_string], dtype="O"), + np.array([dt_string, "random_string"], dtype="O"), + ] + + for test_array in test_arrays: + assert tools._guess_datetime_format_for_array(test_array) == expected_format + + format_for_string_of_nans = tools._guess_datetime_format_for_array( + np.array([np.nan, np.nan, np.nan], dtype="O") + ) + assert format_for_string_of_nans is None + + +class TestToDatetimeInferFormat: + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_consistent_format(self, cache): + s = pd.Series(pd.date_range("20000101", periods=50, freq="H")) + + test_formats = ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] + + for test_format in test_formats: + s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) + + with_format = pd.to_datetime( + s_as_dt_strings, format=test_format, cache=cache + ) + no_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=False, cache=cache + ) + yes_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=True, cache=cache + ) + + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + tm.assert_series_equal(with_format, no_infer) + tm.assert_series_equal(no_infer, yes_infer) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): + s = pd.Series( + np.array( + ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] + ) + ) + + # When the format is inconsistent, infer_datetime_format should just + # fallback to the default parsing + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + s = pd.Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) + + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): + s = pd.Series( + np.array(["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan]) + ) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): + s = pd.Series( + np.array( + [ + np.nan, + np.nan, + "01/01/2011 00:00:00", + "01/02/2011 00:00:00", + "01/03/2011 00:00:00", + ] + ) + ) + + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_iso8601_noleading_0s(self, cache): + # GH 11871 + s = pd.Series(["2014-1-1", "2014-2-2", "2015-3-3"]) + expected = pd.Series( + [ + 
pd.Timestamp("2014-01-01"), + pd.Timestamp("2014-02-02"), + pd.Timestamp("2015-03-03"), + ] + ) + tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected) + tm.assert_series_equal( + pd.to_datetime(s, format="%Y-%m-%d", cache=cache), expected + ) + + +class TestDaysInMonth: + # tests for issue #10154 + + @pytest.mark.parametrize("cache", [True, False]) + def test_day_not_in_month_coerce(self, cache): + assert isna(to_datetime("2015-02-29", errors="coerce", cache=cache)) + assert isna( + to_datetime("2015-02-29", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-02-32", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-04-31", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_day_not_in_month_raise(self, cache): + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): + to_datetime("2015-02-29", errors="raise", cache=cache) + + msg = "time data 2015-02-29 doesn't match format specified" + with pytest.raises(ValueError, match=msg): + to_datetime("2015-02-29", errors="raise", format="%Y-%m-%d", cache=cache) + + msg = "time data 2015-02-32 doesn't match format specified" + with pytest.raises(ValueError, match=msg): + to_datetime("2015-02-32", errors="raise", format="%Y-%m-%d", cache=cache) + + msg = "time data 2015-04-31 doesn't match format specified" + with pytest.raises(ValueError, match=msg): + to_datetime("2015-04-31", errors="raise", format="%Y-%m-%d", cache=cache) + + @pytest.mark.parametrize("cache", [True, False]) + def test_day_not_in_month_ignore(self, cache): + assert to_datetime("2015-02-29", errors="ignore", cache=cache) == "2015-02-29" + assert ( + to_datetime("2015-02-29", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-02-29" + ) + assert ( + to_datetime("2015-02-32", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-02-32" + ) + assert ( + to_datetime("2015-04-31", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-04-31" + ) + + +class TestDatetimeParsingWrappers: + @pytest.mark.parametrize( + "date_str,expected", + list( + { + "2011-01-01": datetime(2011, 1, 1), + "2Q2005": datetime(2005, 4, 1), + "2Q05": datetime(2005, 4, 1), + "2005Q1": datetime(2005, 1, 1), + "05Q1": datetime(2005, 1, 1), + "2011Q3": datetime(2011, 7, 1), + "11Q3": datetime(2011, 7, 1), + "3Q2011": datetime(2011, 7, 1), + "3Q11": datetime(2011, 7, 1), + # quarterly without space + "2000Q4": datetime(2000, 10, 1), + "00Q4": datetime(2000, 10, 1), + "4Q2000": datetime(2000, 10, 1), + "4Q00": datetime(2000, 10, 1), + "2000q4": datetime(2000, 10, 1), + "2000-Q4": datetime(2000, 10, 1), + "00-Q4": datetime(2000, 10, 1), + "4Q-2000": datetime(2000, 10, 1), + "4Q-00": datetime(2000, 10, 1), + "00q4": datetime(2000, 10, 1), + "2005": datetime(2005, 1, 1), + "2005-11": datetime(2005, 11, 1), + "2005 11": datetime(2005, 11, 1), + "11-2005": datetime(2005, 11, 1), + "11 2005": datetime(2005, 11, 1), + "200511": datetime(2020, 5, 11), + "20051109": datetime(2005, 11, 9), + "20051109 10:15": datetime(2005, 11, 9, 10, 15), + "20051109 08H": datetime(2005, 11, 9, 8, 0), + "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), + "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), + "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), + "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), + "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), + "Thu Sep 25 2003": datetime(2003, 9, 25), + "Sep 25 2003": datetime(2003, 9, 
25), + "January 1 2014": datetime(2014, 1, 1), + # GHE10537 + "2014-06": datetime(2014, 6, 1), + "06-2014": datetime(2014, 6, 1), + "2014-6": datetime(2014, 6, 1), + "6-2014": datetime(2014, 6, 1), + "20010101 12": datetime(2001, 1, 1, 12), + "20010101 1234": datetime(2001, 1, 1, 12, 34), + "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), + }.items() + ), + ) + @pytest.mark.parametrize("cache", [True, False]) + def test_parsers(self, date_str, expected, cache): + + # dateutil >= 2.5.0 defaults to yearfirst=True + # https://github.com/dateutil/dateutil/issues/217 + yearfirst = True + + result1, _, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) + result2 = to_datetime(date_str, yearfirst=yearfirst) + result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) + result6 = DatetimeIndex([date_str], yearfirst=yearfirst) + # result7 is used below + result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) + result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) + + for res in [result1, result2]: + assert res == expected + for res in [result3, result4, result6, result8, result9]: + exp = DatetimeIndex([pd.Timestamp(expected)]) + tm.assert_index_equal(res, exp) + + # these really need to have yearfirst, but we don't support + if not yearfirst: + result5 = Timestamp(date_str) + assert result5 == expected + result7 = date_range(date_str, freq="S", periods=1, yearfirst=yearfirst) + assert result7 == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_na_values_with_cache( + self, cache, unique_nulls_fixture, unique_nulls_fixture2 + ): + # GH22305 + expected = Index([NaT, NaT], dtype="datetime64[ns]") + result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) + tm.assert_index_equal(result, expected) + + def test_parsers_nat(self): + # Test that each of several string-accepting methods return pd.NaT + result1, _, _ = parsing.parse_time_string("NaT") + result2 = to_datetime("NaT") + result3 = Timestamp("NaT") + result4 = DatetimeIndex(["NaT"])[0] + assert result1 is NaT + assert result2 is NaT + assert result3 is NaT + assert result4 is NaT + + @pytest.mark.parametrize("cache", [True, False]) + def test_parsers_dayfirst_yearfirst(self, cache): + # OK + # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 + # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00 + # 2.5.3 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 + + # OK + # 2.5.1 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 + # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 + # 2.5.3 10-11-12 [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00 + + # bug fix in 2.5.2 + # 2.5.1 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-11-12 00:00:00 + # 2.5.2 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00 + # 2.5.3 10-11-12 [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00 + + # OK + # 2.5.1 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 + # 2.5.2 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 + # 2.5.3 10-11-12 [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00 + + # OK + # 2.5.1 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 + # 2.5.2 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 + # 2.5.3 20/12/21 [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00 + + # OK + # 2.5.1 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00 + # 2.5.2 20/12/21 [dayfirst=0, 
yearfirst=1] -> 2020-12-21 00:00:00 + # 2.5.3 20/12/21 [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00 + + # revert of bug in 2.5.2 + # 2.5.1 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00 + # 2.5.2 20/12/21 [dayfirst=1, yearfirst=1] -> month must be in 1..12 + # 2.5.3 20/12/21 [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00 + + # OK + # 2.5.1 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 + # 2.5.2 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 + # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 + + # str : dayfirst, yearfirst, expected + cases = { + "10-11-12": [ + (False, False, datetime(2012, 10, 11)), + (True, False, datetime(2012, 11, 10)), + (False, True, datetime(2010, 11, 12)), + (True, True, datetime(2010, 12, 11)), + ], + "20/12/21": [ + (False, False, datetime(2021, 12, 20)), + (True, False, datetime(2021, 12, 20)), + (False, True, datetime(2020, 12, 21)), + (True, True, datetime(2020, 12, 21)), + ], + } + + for date_str, values in cases.items(): + for dayfirst, yearfirst, expected in values: + + # compare with dateutil result + dateutil_result = parse( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) + assert dateutil_result == expected + + result1, _, _ = parsing.parse_time_string( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) + + # we don't support dayfirst/yearfirst here: + if not dayfirst and not yearfirst: + result2 = Timestamp(date_str) + assert result2 == expected + + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) + + result4 = DatetimeIndex( + [date_str], dayfirst=dayfirst, yearfirst=yearfirst + )[0] + + assert result1 == expected + assert result3 == expected + assert result4 == expected + + @pytest.mark.parametrize("cache", [True, False]) + def test_parsers_timestring(self, cache): + # must be the same as dateutil result + cases = { + "10:15": (parse("10:15"), datetime(1, 1, 1, 10, 15)), + "9:05": (parse("9:05"), datetime(1, 1, 1, 9, 5)), + } + + for date_str, (exp_now, exp_def) in cases.items(): + result1, _, _ = parsing.parse_time_string(date_str) + result2 = to_datetime(date_str) + result3 = to_datetime([date_str]) + result4 = Timestamp(date_str) + result5 = DatetimeIndex([date_str])[0] + # parse time string return time string based on default date + # others are not, and can't be changed because it is used in + # time series plot + assert result1 == exp_def + assert result2 == exp_now + assert result3 == exp_now + assert result4 == exp_now + assert result5 == exp_now + + @td.skip_if_has_locale + def test_parsers_time(self): + # GH11818 + strings = [ + "14:15", + "1415", + "2:15pm", + "0215pm", + "14:15:00", + "141500", + "2:15:00pm", + "021500pm", + time(14, 15), + ] + expected = time(14, 15) + + for time_string in strings: + assert tools.to_time(time_string) == expected + + new_string = "14.15" + msg = r"Cannot convert arg \['14\.15'\] to a time" + with pytest.raises(ValueError, match=msg): + tools.to_time(new_string) + assert tools.to_time(new_string, format="%H.%M") == expected + + arg = ["14:15", "20:20"] + expected_arr = [time(14, 15), time(20, 20)] + assert tools.to_time(arg) == expected_arr + assert tools.to_time(arg, format="%H:%M") == expected_arr + assert tools.to_time(arg, infer_time_format=True) == expected_arr + assert tools.to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] + + res = tools.to_time(arg, format="%I:%M%p", errors="ignore") + tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) + + with 
pytest.raises(ValueError): + tools.to_time(arg, format="%I:%M%p", errors="raise") + + tm.assert_series_equal( + tools.to_time(Series(arg, name="test")), Series(expected_arr, name="test") + ) + + res = tools.to_time(np.array(arg)) + assert isinstance(res, list) + assert res == expected_arr + + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "dt_string, tz, dt_string_repr", + [ + ( + "2013-01-01 05:45+0545", + pytz.FixedOffset(345), + "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')", + ), + ( + "2013-01-01 05:30+0530", + pytz.FixedOffset(330), + "Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')", + ), + ], + ) + def test_parsers_timezone_minute_offsets_roundtrip( + self, cache, dt_string, tz, dt_string_repr + ): + # GH11708 + base = to_datetime("2013-01-01 00:00:00", cache=cache) + base = base.tz_localize("UTC").tz_convert(tz) + dt_time = to_datetime(dt_string, cache=cache) + assert base == dt_time + assert dt_string_repr == repr(dt_time) + + +@pytest.fixture(params=["D", "s", "ms", "us", "ns"]) +def units(request): + """Day and some time units. + + * D + * s + * ms + * us + * ns + """ + return request.param + + +@pytest.fixture +def epoch_1960(): + """Timestamp at 1960-01-01.""" + return Timestamp("1960-01-01") + + +@pytest.fixture +def units_from_epochs(): + return list(range(5)) + + +@pytest.fixture(params=["timestamp", "pydatetime", "datetime64", "str_1960"]) +def epochs(epoch_1960, request): + """Timestamp at 1960-01-01 in various forms. + + * pd.Timestamp + * datetime.datetime + * numpy.datetime64 + * str + """ + assert request.param in {"timestamp", "pydatetime", "datetime64", "str_1960"} + if request.param == "timestamp": + return epoch_1960 + elif request.param == "pydatetime": + return epoch_1960.to_pydatetime() + elif request.param == "datetime64": + return epoch_1960.to_datetime64() + else: + return str(epoch_1960) + + +@pytest.fixture +def julian_dates(): + return pd.date_range("2014-1-1", periods=10).to_julian_date().values + + +class TestOrigin: + def test_to_basic(self, julian_dates): + # gh-11276, gh-11745 + # for origin as julian + + result = Series(pd.to_datetime(julian_dates, unit="D", origin="julian")) + expected = Series( + pd.to_datetime(julian_dates - pd.Timestamp(0).to_julian_date(), unit="D") + ) + tm.assert_series_equal(result, expected) + + result = Series(pd.to_datetime([0, 1, 2], unit="D", origin="unix")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) + tm.assert_series_equal(result, expected) + + # default + result = Series(pd.to_datetime([0, 1, 2], unit="D")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) + tm.assert_series_equal(result, expected) + + def test_julian_round_trip(self): + result = pd.to_datetime(2456658, origin="julian", unit="D") + assert result.to_julian_date() == 2456658 + + # out-of-bounds + with pytest.raises(ValueError): + pd.to_datetime(1, origin="julian", unit="D") + + def test_invalid_unit(self, units, julian_dates): + + # checking for invalid combination of origin='julian' and unit != D + if units != "D": + with pytest.raises(ValueError): + pd.to_datetime(julian_dates, unit=units, origin="julian") + + def test_invalid_origin(self): + + # need to have a numeric specified + with pytest.raises(ValueError): + pd.to_datetime("2005-01-01", origin="1960-01-01") + + with pytest.raises(ValueError): + pd.to_datetime("2005-01-01", origin="1960-01-01", unit="D") + + def 
test_epoch(self, units, epochs, epoch_1960, units_from_epochs): + + expected = Series( + [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] + ) + + result = Series(pd.to_datetime(units_from_epochs, unit=units, origin=epochs)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "origin, exc", + [ + ("random_string", ValueError), + ("epoch", ValueError), + ("13-24-1990", ValueError), + (datetime(1, 1, 1), tslib.OutOfBoundsDatetime), + ], + ) + def test_invalid_origins(self, origin, exc, units, units_from_epochs): + + with pytest.raises(exc): + pd.to_datetime(units_from_epochs, unit=units, origin=origin) + + def test_invalid_origins_tzinfo(self): + # GH16842 + with pytest.raises(ValueError): + pd.to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + + @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) + def test_to_datetime_out_of_bounds_with_format_arg(self, format): + # see gh-23830 + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime("2417-10-27 00:00:00", format=format) + + def test_processing_order(self): + # make sure we handle out-of-bounds *before* + # constructing the dates + + result = pd.to_datetime(200 * 365, unit="D") + expected = Timestamp("2169-11-13 00:00:00") + assert result == expected + + result = pd.to_datetime(200 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2069-11-13 00:00:00") + assert result == expected + + result = pd.to_datetime(300 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2169-10-20 00:00:00") + assert result == expected + + @pytest.mark.parametrize( + "offset,utc,exp", + [ + ["Z", True, "2019-01-01T00:00:00.000Z"], + ["Z", None, "2019-01-01T00:00:00.000Z"], + ["-01:00", True, "2019-01-01T01:00:00.000Z"], + ["-01:00", None, "2019-01-01T00:00:00.000-01:00"], + ], + ) + def test_arg_tz_ns_unit(self, offset, utc, exp): + # GH 25546 + arg = "2019-01-01T00:00:00.000" + offset + result = to_datetime([arg], unit="ns", utc=utc) + expected = to_datetime([exp]) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "listlike,do_caching", + [([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)], +) +def test_should_cache(listlike, do_caching): + assert ( + tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7) + == do_caching + ) + + +@pytest.mark.parametrize( + "unique_share,check_count, err_message", + [ + (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"), + (10, 2, r"unique_share must be in next bounds: \(0; 1\)"), + ], +) +def test_should_cache_errors(unique_share, check_count, err_message): + arg = [5] * 10 + + with pytest.raises(AssertionError, match=err_message): + tools.should_cache(arg, unique_share, check_count) + + +def test_nullable_integer_to_datetime(): + # Test for #30050 + ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = ser.astype("Int64") + ser_copy = ser.copy() + + res = pd.to_datetime(ser, unit="ns") + + expected = pd.Series( + [ + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + np.datetime64("NaT"), + np.datetime64("2043-01-25 23:56:49.213693952"), + np.datetime64("NaT"), + ] + ) + tm.assert_series_equal(res, expected) + # Check that ser isn't mutated + tm.assert_series_equal(ser, ser_copy) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/__init__.py new file mode 
100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_astype.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_astype.py new file mode 100644 index 0000000..c94af6c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_astype.py @@ -0,0 +1,223 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype + +from pandas import ( + CategoricalIndex, + Index, + IntervalIndex, + NaT, + Timedelta, + Timestamp, + interval_range, +) +import pandas._testing as tm + + +class Base: + """Tests common to IntervalIndex with any subtype""" + + def test_astype_idempotent(self, index): + result = index.astype("interval") + tm.assert_index_equal(result, index) + + result = index.astype(index.dtype) + tm.assert_index_equal(result, index) + + def test_astype_object(self, index): + result = index.astype(object) + expected = Index(index.values, dtype="object") + tm.assert_index_equal(result, expected) + assert not result.equals(index) + + def test_astype_category(self, index): + result = index.astype("category") + expected = CategoricalIndex(index.values) + tm.assert_index_equal(result, expected) + + result = index.astype(CategoricalDtype()) + tm.assert_index_equal(result, expected) + + # non-default params + categories = index.dropna().unique().values[:-1] + dtype = CategoricalDtype(categories=categories, ordered=True) + result = index.astype(dtype) + expected = CategoricalIndex(index.values, categories=categories, ordered=True) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "uint64", + "float64", + "complex128", + "period[M]", + "timedelta64", + "timedelta64[ns]", + "datetime64", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) + def test_astype_cannot_cast(self, index, dtype): + msg = "Cannot cast IntervalIndex to dtype" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + def test_astype_invalid_dtype(self, index): + msg = "data type [\"']fake_dtype[\"'] not understood" + with pytest.raises(TypeError, match=msg): + index.astype("fake_dtype") + + +class TestIntSubtype(Base): + """Tests specific to IntervalIndex with integer-like subtype""" + + indexes = [ + IntervalIndex.from_breaks(np.arange(-10, 11, dtype="int64")), + IntervalIndex.from_breaks(np.arange(100, dtype="uint64"), closed="left"), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize( + "subtype", ["float64", "datetime64[ns]", "timedelta64[ns]"] + ) + def test_subtype_conversion(self, index, subtype): + dtype = IntervalDtype(subtype) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "subtype_start, subtype_end", [("int64", "uint64"), ("uint64", "int64")] + ) + def test_subtype_integer(self, subtype_start, subtype_end): + index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) + dtype = IntervalDtype(subtype_end) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype_end), + index.right.astype(subtype_end), + closed=index.closed, + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.xfail(reason="GH#15832") + def test_subtype_integer_errors(self): + # int64 -> uint64 fails with negative values + index = interval_range(-10, 10) + 
dtype = IntervalDtype("uint64") + with pytest.raises(ValueError): + index.astype(dtype) + + +class TestFloatSubtype(Base): + """Tests specific to IntervalIndex with float subtype""" + + indexes = [ + interval_range(-10.0, 10.0, closed="neither"), + IntervalIndex.from_arrays( + [-1.5, np.nan, 0.0, 0.0, 1.5], [-0.5, np.nan, 1.0, 1.0, 3.0], closed="both" + ), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) + def test_subtype_integer(self, subtype): + index = interval_range(0.0, 10.0) + dtype = IntervalDtype(subtype) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + # raises with NA + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(ValueError, match=msg): + index.insert(0, np.nan).astype(dtype) + + @pytest.mark.xfail(reason="GH#15832") + def test_subtype_integer_errors(self): + # float64 -> uint64 fails with negative values + index = interval_range(-10.0, 10.0) + dtype = IntervalDtype("uint64") + with pytest.raises(ValueError): + index.astype(dtype) + + # float64 -> integer-like fails with non-integer valued floats + index = interval_range(0.0, 10.0, freq=0.25) + dtype = IntervalDtype("int64") + with pytest.raises(ValueError): + index.astype(dtype) + + dtype = IntervalDtype("uint64") + with pytest.raises(ValueError): + index.astype(dtype) + + @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_subtype_datetimelike(self, index, subtype): + dtype = IntervalDtype(subtype) + msg = "Cannot convert .* to .*; subtypes are incompatible" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + +class TestDatetimelikeSubtype(Base): + """Tests specific to IntervalIndex with datetime-like subtype""" + + indexes = [ + interval_range(Timestamp("2018-01-01"), periods=10, closed="neither"), + interval_range(Timestamp("2018-01-01"), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01", tz="US/Eastern"), periods=10), + interval_range(Timedelta("0 days"), periods=10, closed="both"), + interval_range(Timedelta("0 days"), periods=10).insert(2, NaT), + ] + + @pytest.fixture(params=indexes) + def index(self, request): + return request.param + + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) + def test_subtype_integer(self, index, subtype): + dtype = IntervalDtype(subtype) + result = index.astype(dtype) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) + tm.assert_index_equal(result, expected) + + def test_subtype_float(self, index): + dtype = IntervalDtype("float64") + msg = "Cannot convert .* to .*; subtypes are incompatible" + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + def test_subtype_datetimelike(self): + # datetime -> timedelta raises + dtype = IntervalDtype("timedelta64[ns]") + msg = "Cannot convert .* to .*; subtypes are incompatible" + + index = interval_range(Timestamp("2018-01-01"), periods=10) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + index = interval_range(Timestamp("2018-01-01", tz="CET"), periods=10) + with pytest.raises(TypeError, match=msg): + index.astype(dtype) + + # timedelta -> datetime raises + dtype = IntervalDtype("datetime64[ns]") + index = interval_range(Timedelta("0 days"), periods=10) + with 
pytest.raises(TypeError, match=msg): + index.astype(dtype) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_base.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_base.py new file mode 100644 index 0000000..d8c2ba8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_base.py @@ -0,0 +1,88 @@ +import numpy as np +import pytest + +from pandas import IntervalIndex, Series, date_range +import pandas._testing as tm +from pandas.tests.indexes.common import Base + + +class TestBase(Base): + """ + Tests specific to the shared common index tests; unrelated tests should be placed + in test_interval.py or the specific test file (e.g. test_astype.py) + """ + + _holder = IntervalIndex + + @pytest.fixture + def indices(self): + return tm.makeIntervalIndex(10) + + def create_index(self, closed="right"): + return IntervalIndex.from_breaks(range(11), closed=closed) + + def test_equals(self, closed): + expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + assert expected.equals(expected) + assert expected.equals(expected.copy()) + + assert not expected.equals(expected.astype(object)) + assert not expected.equals(np.array(expected)) + assert not expected.equals(list(expected)) + + assert not expected.equals([1, 2]) + assert not expected.equals(np.array([1, 2])) + assert not expected.equals(date_range("20130101", periods=2)) + + expected_name1 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name="foo" + ) + expected_name2 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name="bar" + ) + assert expected.equals(expected_name1) + assert expected_name1.equals(expected_name2) + + for other_closed in {"left", "right", "both", "neither"} - {closed}: + expected_other_closed = IntervalIndex.from_breaks( + np.arange(5), closed=other_closed + ) + assert not expected.equals(expected_other_closed) + + def test_repr_max_seq_item_setting(self): + # override base test: not a valid repr as we use interval notation + pass + + def test_repr_roundtrip(self): + # override base test: not a valid repr as we use interval notation + pass + + def test_take(self, closed): + index = self.create_index(closed=closed) + + result = index.take(range(10)) + tm.assert_index_equal(result, index) + + result = index.take([0, 0, 1]) + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) + def test_where(self, closed, klass): + idx = self.create_index(closed=closed) + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + idx[:, None] diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_constructors.py new file mode 100644 index 0000000..837c124 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_constructors.py @@ -0,0 +1,423 @@ +from functools import partial 
+ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_categorical_dtype +from pandas.core.dtypes.dtypes import IntervalDtype + +from pandas import ( + Categorical, + CategoricalIndex, + Float64Index, + Index, + Int64Index, + Interval, + IntervalIndex, + date_range, + notna, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray +import pandas.core.common as com + + +@pytest.fixture(params=[None, "foo"]) +def name(request): + return request.param + + +class Base: + """ + Common tests for all variations of IntervalIndex construction. Input data + to be supplied in breaks format, then converted by the subclass method + get_kwargs_from_breaks to the expected format. + """ + + @pytest.mark.parametrize( + "breaks", + [ + [3, 14, 15, 92, 653], + np.arange(10, dtype="int64"), + Int64Index(range(-10, 11)), + Float64Index(np.arange(20, 30, 0.5)), + date_range("20180101", periods=10), + date_range("20180101", periods=10, tz="US/Eastern"), + timedelta_range("1 day", periods=10), + ], + ) + def test_constructor(self, constructor, breaks, closed, name): + result_kwargs = self.get_kwargs_from_breaks(breaks, closed) + result = constructor(closed=closed, name=name, **result_kwargs) + + assert result.closed == closed + assert result.name == name + assert result.dtype.subtype == getattr(breaks, "dtype", "int64") + tm.assert_index_equal(result.left, Index(breaks[:-1])) + tm.assert_index_equal(result.right, Index(breaks[1:])) + + @pytest.mark.parametrize( + "breaks, subtype", + [ + (Int64Index([0, 1, 2, 3, 4]), "float64"), + (Int64Index([0, 1, 2, 3, 4]), "datetime64[ns]"), + (Int64Index([0, 1, 2, 3, 4]), "timedelta64[ns]"), + (Float64Index([0, 1, 2, 3, 4]), "int64"), + (date_range("2017-01-01", periods=5), "int64"), + (timedelta_range("1 day", periods=5), "int64"), + ], + ) + def test_constructor_dtype(self, constructor, breaks, subtype): + # GH 19262: conversion via dtype parameter + expected_kwargs = self.get_kwargs_from_breaks(breaks.astype(subtype)) + expected = constructor(**expected_kwargs) + + result_kwargs = self.get_kwargs_from_breaks(breaks) + iv_dtype = IntervalDtype(subtype) + for dtype in (iv_dtype, str(iv_dtype)): + result = constructor(dtype=dtype, **result_kwargs) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("breaks", [[np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) + def test_constructor_nan(self, constructor, breaks, closed): + # GH 18421 + result_kwargs = self.get_kwargs_from_breaks(breaks) + result = constructor(closed=closed, **result_kwargs) + + expected_subtype = np.float64 + expected_values = np.array(breaks[:-1], dtype=object) + + assert result.closed == closed + assert result.dtype.subtype == expected_subtype + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + + @pytest.mark.parametrize( + "breaks", + [ + [], + np.array([], dtype="int64"), + np.array([], dtype="float64"), + np.array([], dtype="datetime64[ns]"), + np.array([], dtype="timedelta64[ns]"), + ], + ) + def test_constructor_empty(self, constructor, breaks, closed): + # GH 18421 + result_kwargs = self.get_kwargs_from_breaks(breaks) + result = constructor(closed=closed, **result_kwargs) + + expected_values = np.array([], dtype=object) + expected_subtype = getattr(breaks, "dtype", np.int64) + + assert result.empty + assert result.closed == closed + assert result.dtype.subtype == expected_subtype + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + + @pytest.mark.parametrize( + 
"breaks", + [ + tuple("0123456789"), + list("abcdefghij"), + np.array(list("abcdefghij"), dtype=object), + np.array(list("abcdefghij"), dtype=" Interval(0.5, 1.5) + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index + expected = np.array([True, True]) + tm.assert_numpy_array_equal(actual, expected) + actual = self.index <= self.index + tm.assert_numpy_array_equal(actual, expected) + actual = self.index >= self.index + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index < self.index + expected = np.array([False, False]) + tm.assert_numpy_array_equal(actual, expected) + actual = self.index > self.index + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], "left") + tm.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index.values + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index.values == self.index + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index <= self.index.values + tm.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index != self.index.values + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index > self.index.values + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index.values > self.index + tm.assert_numpy_array_equal(actual, np.array([False, False])) + + # invalid comparisons + actual = self.index == 0 + tm.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index == self.index.left + tm.assert_numpy_array_equal(actual, np.array([False, False])) + + with pytest.raises(TypeError, match="unorderable types"): + self.index > 0 + with pytest.raises(TypeError, match="unorderable types"): + self.index <= 0 + msg = r"unorderable types: Interval\(\) > int\(\)" + with pytest.raises(TypeError, match=msg): + self.index > np.arange(2) + msg = "Lengths must match to compare" + with pytest.raises(ValueError, match=msg): + self.index > np.arange(3) + + def test_missing_values(self, closed): + idx = Index( + [np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed)] + ) + idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed) + assert idx.equals(idx2) + + msg = ( + "missing values must be missing in the same location both left " + "and right sides" + ) + with pytest.raises(ValueError, match=msg): + IntervalIndex.from_arrays( + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed + ) + + tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) + + def test_sort_values(self, closed): + index = self.create_index(closed=closed) + + result = index.sort_values() + tm.assert_index_equal(result, index) + + result = index.sort_values(ascending=False) + tm.assert_index_equal(result, index[::-1]) + + # with nan + index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)]) + + result = index.sort_values() + expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) + tm.assert_index_equal(result, expected) + + result = index.sort_values(ascending=False) + expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_datetime(self, tz): + start = Timestamp("2000-01-01", tz=tz) + dates = date_range(start=start, periods=10) + index = IntervalIndex.from_breaks(dates) + + # test mid + start = Timestamp("2000-01-01T12:00", tz=tz) + expected 
= date_range(start=start, periods=9) + tm.assert_index_equal(index.mid, expected) + + # __contains__ doesn't check individual points + assert Timestamp("2000-01-01", tz=tz) not in index + assert Timestamp("2000-01-01T12", tz=tz) not in index + assert Timestamp("2000-01-02", tz=tz) not in index + iv_true = Interval( + Timestamp("2000-01-02", tz=tz), Timestamp("2000-01-03", tz=tz) + ) + iv_false = Interval( + Timestamp("1999-12-31", tz=tz), Timestamp("2000-01-01", tz=tz) + ) + assert iv_true in index + assert iv_false not in index + + # .contains does check individual points + assert not index.contains(Timestamp("2000-01-01", tz=tz)).any() + assert index.contains(Timestamp("2000-01-01T12", tz=tz)).any() + assert index.contains(Timestamp("2000-01-02", tz=tz)).any() + + # test get_indexer + start = Timestamp("1999-12-31T12:00", tz=tz) + target = date_range(start=start, periods=7, freq="12H") + actual = index.get_indexer(target) + expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") + tm.assert_numpy_array_equal(actual, expected) + + start = Timestamp("2000-01-08T18:00", tz=tz) + target = date_range(start=start, periods=7, freq="6H") + actual = index.get_indexer(target) + expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") + tm.assert_numpy_array_equal(actual, expected) + + def test_append(self, closed): + + index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) + + result = index1.append(index2) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed) + tm.assert_index_equal(result, expected) + + result = index1.append([index1, index2]) + expected = IntervalIndex.from_arrays( + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed + ) + tm.assert_index_equal(result, expected) + + msg = ( + "can only append two IntervalIndex objects that are closed " + "on the same side" + ) + for other_closed in {"left", "right", "both", "neither"} - {closed}: + index_other_closed = IntervalIndex.from_arrays( + [0, 1], [1, 2], closed=other_closed + ) + with pytest.raises(ValueError, match=msg): + index1.append(index_other_closed) + + def test_is_non_overlapping_monotonic(self, closed): + # Should be True in all cases + tpls = [(0, 1), (2, 3), (4, 5), (6, 7)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is True + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is True + + # Should be False in all cases (overlapping) + tpls = [(0, 2), (1, 3), (4, 5), (6, 7)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is False + + # Should be False in all cases (non-monotonic) + tpls = [(0, 1), (2, 3), (6, 7), (4, 5)] + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False + + idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) + assert idx.is_non_overlapping_monotonic is False + + # Should be False for closed='both', otherwise True (GH16560) + if closed == "both": + idx = IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is False + else: + idx = IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is True + + @pytest.mark.parametrize( + "start, shift, na_value", + [ + (0, 1, np.nan), + 
(Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT), + (Timedelta("0 days"), Timedelta("1 day"), pd.NaT), + ], + ) + def test_is_overlapping(self, start, shift, na_value, closed): + # GH 23309 + # see test_interval_tree.py for extensive tests; interface tests here + + # non-overlapping + tuples = [(start + n * shift, start + (n + 1) * shift) for n in (0, 2, 4)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # non-overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is False + + # overlapping + tuples = [(start + n * shift, start + (n + 2) * shift) for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # overlapping with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + assert index.is_overlapping is True + + # common endpoints + tuples = [(start + n * shift, start + (n + 1) * shift) for n in range(3)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + expected = closed == "both" + assert result is expected + + # common endpoints with NA + tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)] + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.is_overlapping + assert result is expected + + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))), + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ), + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ), + ], + ) + def test_to_tuples(self, tuples): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples() + expected = Index(com.asarray_tuplesafe(tuples)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))) + [np.nan], + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ) + + [np.nan], + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ) + + [np.nan], + ], + ) + @pytest.mark.parametrize("na_tuple", [True, False]) + def test_to_tuples_na(self, tuples, na_tuple): + # GH 18756 + idx = IntervalIndex.from_tuples(tuples) + result = idx.to_tuples(na_tuple=na_tuple) + + # check the non-NA portion + expected_notna = Index(com.asarray_tuplesafe(tuples[:-1])) + result_notna = result[:-1] + tm.assert_index_equal(result_notna, expected_notna) + + # check the NA portion + result_na = result[-1] + if na_tuple: + assert isinstance(result_na, tuple) + assert len(result_na) == 2 + assert all(isna(x) for x in result_na) + else: + assert isna(result_na) + + def test_nbytes(self): + # GH 19209 + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") + + result = IntervalIndex.from_arrays(left, right).nbytes + expected = 64 # 4 * 8 * 2 + assert result == expected + + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) + def test_set_closed(self, name, closed, new_closed): + # GH 21670 + index = interval_range(0, 5, closed=closed, name=name) + result = index.set_closed(new_closed) + expected = interval_range(0, 5, closed=new_closed, name=name) + tm.assert_index_equal(result, expected) + + 
@pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False]) + def test_set_closed_errors(self, bad_closed): + # GH 21670 + index = interval_range(0, 5) + msg = "invalid option for 'closed': {closed}".format(closed=bad_closed) + with pytest.raises(ValueError, match=msg): + index.set_closed(bad_closed) + + def test_is_all_dates(self): + # GH 23576 + year_2017 = pd.Interval( + pd.Timestamp("2017-01-01 00:00:00"), pd.Timestamp("2018-01-01 00:00:00") + ) + year_2017_index = pd.IntervalIndex([year_2017]) + assert not year_2017_index.is_all_dates + + +def test_dir(): + # GH#27571 dir(interval_index) should not raise + index = IntervalIndex.from_arrays([0, 1], [1, 2]) + result = dir(index) + assert "str" not in result diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_range.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_range.py new file mode 100644 index 0000000..2f28c33 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_range.py @@ -0,0 +1,355 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer + +from pandas import ( + DateOffset, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + interval_range, + timedelta_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import Day + + +@pytest.fixture(scope="class", params=[None, "foo"]) +def name(request): + return request.param + + +class TestIntervalRange: + @pytest.mark.parametrize("freq, periods", [(1, 100), (2.5, 40), (5, 20), (25, 4)]) + def test_constructor_numeric(self, closed, name, freq, periods): + start, end = 0, 100 + breaks = np.arange(101, step=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from start/end/periods + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + @pytest.mark.parametrize( + "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + ) + def test_constructor_timestamp(self, closed, name, freq, periods, tz): + start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) + breaks = date_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from 
start/end/periods + if not breaks.freq.is_anchored() and tz is None: + # matches expected only for non-anchored offsets and tz naive + # (anchored/DST transitions cause unequal spacing in expected) + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + ) + def test_constructor_timedelta(self, closed, name, freq, periods): + start, end = Timedelta("0 days"), Timedelta("100 days") + breaks = timedelta_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) + + # defined from start/end/freq + result = interval_range( + start=start, end=end, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from start/periods/freq + result = interval_range( + start=start, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # defined from end/periods/freq + result = interval_range( + end=end, periods=periods, freq=freq, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + # GH 20976: linspace behavior defined from start/end/periods + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start, end, freq, expected_endpoint", + [ + (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), + (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + ( + Timestamp("2018-01-01"), + Timestamp("2018-02-09"), + "MS", + Timestamp("2018-02-01"), + ), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-20", tz="US/Eastern"), + "5D12H", + Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), + ), + ], + ) + def test_early_truncation(self, start, end, freq, expected_endpoint): + # index truncates early if freq causes end to be skipped + result = interval_range(start=start, end=end, freq=freq) + result_endpoint = result.right[-1] + assert result_endpoint == expected_endpoint + + @pytest.mark.parametrize( + "start, end, freq", + [(0.5, None, None), (None, 4.5, None), (0.5, None, 1.5), (None, 6.5, 1.5)], + ) + def test_no_invalid_float_truncation(self, start, end, freq): + # GH 21161 + if freq is None: + breaks = [0.5, 1.5, 2.5, 3.5, 4.5] + else: + breaks = [0.5, 2.0, 3.5, 5.0, 6.5] + expected = IntervalIndex.from_breaks(breaks) + + result = interval_range(start=start, end=end, periods=4, freq=freq) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "start, mid, end", + [ + ( + Timestamp("2018-03-10", tz="US/Eastern"), + Timestamp("2018-03-10 23:30:00", tz="US/Eastern"), + Timestamp("2018-03-12", tz="US/Eastern"), + ), + ( + Timestamp("2018-11-03", tz="US/Eastern"), + Timestamp("2018-11-04 00:30:00", tz="US/Eastern"), + Timestamp("2018-11-05", tz="US/Eastern"), + ), + ], + ) + def test_linspace_dst_transition(self, start, mid, end): + # GH 20976: linspace behavior defined from start/end/periods + # accounts for the hour gained/lost during DST transition + result = interval_range(start=start, end=end, periods=2) + expected = IntervalIndex.from_breaks([start, mid, end]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq", [2, 2.0]) + @pytest.mark.parametrize("end", [10, 10.0]) + @pytest.mark.parametrize("start", [0, 0.0]) + def test_float_subtype(self, start, end, freq): + # Has float subtype 
if any of start/end/freq are float, even if all + # resulting endpoints can safely be upcast to integers + + # defined from start/end/freq + index = interval_range(start=start, end=end, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(start + end + freq) else "float64" + assert result == expected + + # defined from start/periods/freq + index = interval_range(start=start, periods=5, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(start + freq) else "float64" + assert result == expected + + # defined from end/periods/freq + index = interval_range(end=end, periods=5, freq=freq) + result = index.dtype.subtype + expected = "int64" if is_integer(end + freq) else "float64" + assert result == expected + + # GH 20976: linspace behavior defined from start/end/periods + index = interval_range(start=start, end=end, periods=5) + result = index.dtype.subtype + expected = "int64" if is_integer(start + end) else "float64" + assert result == expected + + def test_constructor_coverage(self): + # float value for periods + expected = interval_range(start=0, periods=10) + result = interval_range(start=0, periods=10.5) + tm.assert_index_equal(result, expected) + + # equivalent timestamp-like start/end + start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") + expected = interval_range(start=start, end=end) + + result = interval_range(start=start.to_pydatetime(), end=end.to_pydatetime()) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timestamp + equiv_freq = [ + "D", + Day(), + Timedelta(days=1), + timedelta(days=1), + DateOffset(days=1), + ] + for freq in equiv_freq: + result = interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + + # equivalent timedelta-like start/end + start, end = Timedelta(days=1), Timedelta(days=10) + expected = interval_range(start=start, end=end) + + result = interval_range(start=start.to_pytimedelta(), end=end.to_pytimedelta()) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timedelta + equiv_freq = ["D", Day(), Timedelta(days=1), timedelta(days=1)] + for freq in equiv_freq: + result = interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + + def test_errors(self): + # not enough params + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) + + with pytest.raises(ValueError, match=msg): + interval_range(start=0) + + with pytest.raises(ValueError, match=msg): + interval_range(end=5) + + with pytest.raises(ValueError, match=msg): + interval_range(periods=2) + + with pytest.raises(ValueError, match=msg): + interval_range() + + # too many params + with pytest.raises(ValueError, match=msg): + interval_range(start=0, end=5, periods=6, freq=1.5) + + # mixed units + msg = "start, end, freq need to be type compatible" + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=Timestamp("20130101"), freq=2) + + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=Timedelta("1 day"), freq=2) + + with pytest.raises(TypeError, match=msg): + interval_range(start=0, end=10, freq="D") + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timestamp("20130101"), end=10, freq="D") + + with pytest.raises(TypeError, 
match=msg): + interval_range( + start=Timestamp("20130101"), end=Timedelta("1 day"), freq="D" + ) + + with pytest.raises(TypeError, match=msg): + interval_range( + start=Timestamp("20130101"), end=Timestamp("20130110"), freq=2 + ) + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timedelta("1 day"), end=10, freq="D") + + with pytest.raises(TypeError, match=msg): + interval_range( + start=Timedelta("1 day"), end=Timestamp("20130110"), freq="D" + ) + + with pytest.raises(TypeError, match=msg): + interval_range(start=Timedelta("1 day"), end=Timedelta("10 days"), freq=2) + + # invalid periods + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + interval_range(start=0, periods="foo") + + # invalid start + msg = "start must be numeric or datetime-like, got foo" + with pytest.raises(ValueError, match=msg): + interval_range(start="foo", periods=10) + + # invalid end + msg = r"end must be numeric or datetime-like, got \(0, 1\]" + with pytest.raises(ValueError, match=msg): + interval_range(end=Interval(0, 1), periods=10) + + # invalid freq for datetime-like + msg = "freq must be numeric or convertible to DateOffset, got foo" + with pytest.raises(ValueError, match=msg): + interval_range(start=0, end=10, freq="foo") + + with pytest.raises(ValueError, match=msg): + interval_range(start=Timestamp("20130101"), periods=10, freq="foo") + + with pytest.raises(ValueError, match=msg): + interval_range(end=Timedelta("1 day"), periods=10, freq="foo") + + # mixed tz + start = Timestamp("2017-01-01", tz="US/Eastern") + end = Timestamp("2017-01-07", tz="US/Pacific") + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(TypeError, match=msg): + interval_range(start=start, end=end) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_tree.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_tree.py new file mode 100644 index 0000000..476ec1d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_interval_tree.py @@ -0,0 +1,193 @@ +from itertools import permutations + +import numpy as np +import pytest + +from pandas._libs.interval import IntervalTree + +from pandas import compat +import pandas._testing as tm + + +def skipif_32bit(param): + """ + Skip parameters in a parametrize on 32bit systems. Specifically used + here to skip leaf_size parameters related to GH 23440. + """ + marks = pytest.mark.skipif( + compat.is_platform_32bit(), reason="GH 23440: int type mismatch on 32bit" + ) + return pytest.param(param, marks=marks) + + +@pytest.fixture(scope="class", params=["int64", "float64", "uint64"]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[skipif_32bit(1), skipif_32bit(2), 10]) +def leaf_size(request): + """ + Fixture to specify IntervalTree leaf_size parameter; to be used with the + tree fixture. 
+ """ + return request.param + + +@pytest.fixture( + params=[ + np.arange(5, dtype="int64"), + np.arange(5, dtype="uint64"), + np.arange(5, dtype="float64"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), + ] +) +def tree(request, leaf_size): + left = request.param + return IntervalTree(left, left + 2, leaf_size=leaf_size) + + +class TestIntervalTree: + def test_get_indexer(self, tree): + result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) + expected = np.array([0, 4, -1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): + tree.get_indexer(np.array([3.0])) + + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype) + tree = IntervalTree(left, right) + + result = tree.get_indexer(np.array([target_value], dtype=target_dtype)) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_non_unique(self, tree): + indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) + + result = indexer[:1] + expected = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = np.sort(indexer[1:3]) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = np.sort(indexer[3:]) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = missing + expected = np.array([2], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype) + tree = IntervalTree(left, right) + target = np.array([target_value], dtype=target_dtype) + + result_indexer, result_missing = tree.get_indexer_non_unique(target) + expected_indexer = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + expected_missing = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result_missing, expected_missing) + + def test_duplicates(self, dtype): + left = np.array([0, 0, 0], dtype=dtype) + tree = IntervalTree(left, left + 1) + + with pytest.raises( + KeyError, match="'indexer does not intersect a unique set of intervals'" + ): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + result = np.sort(indexer) + expected = np.array([0, 1, 2], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + result = missing + expected = np.array([], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] + ) + def test_get_indexer_closed(self, closed, leaf_size): + x = np.arange(1000, dtype="float64") + found = x.astype("intp") + not_found = (-1 * np.ones(1000)).astype("intp") + + tree = IntervalTree(x, x + 0.5, closed=closed, leaf_size=leaf_size) + tm.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + tm.assert_numpy_array_equal(expected, 
tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) + + @pytest.mark.parametrize( + "left, right, expected", + [ + (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True), + (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True), + (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), + (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False), + (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), + ], + ) + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) + def test_is_overlapping(self, closed, order, left, right, expected): + # GH 23309 + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + assert result is expected + + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) + def test_is_overlapping_endpoints(self, closed, order): + """shared endpoints are marked as overlapping""" + # GH 23309 + left, right = np.arange(3, dtype="int64"), np.arange(1, 4) + tree = IntervalTree(left[order], right[order], closed=closed) + result = tree.is_overlapping + expected = closed == "both" + assert result is expected + + @pytest.mark.parametrize( + "left, right", + [ + (np.array([], dtype="int64"), np.array([], dtype="int64")), + (np.array([0], dtype="int64"), np.array([1], dtype="int64")), + (np.array([np.nan]), np.array([np.nan])), + (np.array([np.nan] * 3), np.array([np.nan] * 3)), + ], + ) + def test_is_overlapping_trivial(self, closed, left, right): + # GH 23309 + tree = IntervalTree(left, right, closed=closed) + assert tree.is_overlapping is False + + @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") + def test_construction_overflow(self): + # GH 25485 + left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 + tree = IntervalTree(left, right) + + # pivot should be average of left/right medians + result = tree.root.pivot + expected = (50 + np.iinfo(np.int64).max) / 2 + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexes/interval/test_setops.py b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_setops.py new file mode 100644 index 0000000..3246ac6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/interval/test_setops.py @@ -0,0 +1,187 @@ +import numpy as np +import pytest + +from pandas import Index, IntervalIndex, Timestamp, interval_range +import pandas._testing as tm + + +@pytest.fixture(scope="class", params=[None, "foo"]) +def name(request): + return request.param + + +@pytest.fixture(params=[None, False]) +def sort(request): + return request.param + + +def monotonic_index(start, end, dtype="int64", closed="right"): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) + + +def empty_index(dtype="int64", closed="right"): + return IntervalIndex(np.array([], dtype=dtype), closed=closed) + + +class TestIntervalIndex: + def test_union(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(0, 13, closed=closed) + result = index[::-1].union(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].union(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + tm.assert_index_equal(index.union(index, 
sort=sort), index) + tm.assert_index_equal(index.union(index[:1], sort=sort), index) + + # GH 19101: empty result, same dtype + index = empty_index(dtype="int64", closed=closed) + result = index.union(index, sort=sort) + tm.assert_index_equal(result, index) + + # GH 19101: empty result, different dtypes + other = empty_index(dtype="float64", closed=closed) + result = index.union(other, sort=sort) + tm.assert_index_equal(result, index) + + def test_intersection(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + other = monotonic_index(5, 13, closed=closed) + + expected = monotonic_index(5, 11, closed=closed) + result = index[::-1].intersection(other, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + result = other[::-1].intersection(index, sort=sort) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + tm.assert_index_equal(index.intersection(index, sort=sort), index) + + # GH 19101: empty result, same dtype + other = monotonic_index(300, 314, closed=closed) + expected = empty_index(dtype="int64", closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = monotonic_index(300, 314, dtype="float64", closed=closed) + result = index.intersection(other, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 26225: nested intervals + index = IntervalIndex.from_tuples([(1, 2), (1, 3), (1, 4), (0, 2)]) + other = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate element + index = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3), (3, 4)]) + other = IntervalIndex.from_tuples([(1, 2), (2, 3)]) + expected = IntervalIndex.from_tuples([(1, 2), (1, 2), (2, 3)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225 + index = IntervalIndex.from_tuples([(0, 3), (0, 2)]) + other = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + expected = IntervalIndex.from_tuples([(0, 2)]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + # GH 26225: duplicate nan element + index = IntervalIndex([np.nan, np.nan]) + other = IntervalIndex([np.nan]) + expected = IntervalIndex([np.nan]) + result = index.intersection(other) + tm.assert_index_equal(result, expected) + + def test_difference(self, closed, sort): + index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) + result = index.difference(index[:1], sort=sort) + expected = index[1:] + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, same dtype + result = index.difference(index, sort=sort) + expected = empty_index(dtype="int64", closed=closed) + tm.assert_index_equal(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) + result = index.difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self, closed, sort): + index = monotonic_index(0, 11, closed=closed) + result = index[1:].symmetric_difference(index[:-1], sort=sort) + expected = IntervalIndex([index[0], index[-1]]) + if sort is None: + tm.assert_index_equal(result, expected) + assert 
tm.equalContents(result, expected) + + # GH 19101: empty result, same dtype + result = index.symmetric_difference(index, sort=sort) + expected = empty_index(dtype="int64", closed=closed) + if sort is None: + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + # GH 19101: empty result, different dtypes + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) + result = index.symmetric_difference(other, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "op_name", ["union", "intersection", "difference", "symmetric_difference"] + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_set_incompatible_types(self, closed, op_name, sort): + index = monotonic_index(0, 11, closed=closed) + set_op = getattr(index, op_name) + + # TODO: standardize return type of non-union setops type(self vs other) + # non-IntervalIndex + if op_name == "difference": + expected = index + else: + expected = getattr(index.astype("O"), op_name)(Index([1, 2, 3])) + result = set_op(Index([1, 2, 3]), sort=sort) + tm.assert_index_equal(result, expected) + + # mixed closed + msg = ( + "can only do set operations between two IntervalIndex objects " + "that are closed on the same side" + ) + for other_closed in {"right", "left", "both", "neither"} - {closed}: + other = monotonic_index(0, 11, closed=other_closed) + with pytest.raises(ValueError, match=msg): + set_op(other, sort=sort) + + # GH 19016: incompatible dtypes + other = interval_range(Timestamp("20180101"), periods=9, closed=closed) + msg = ( + "can only do {op} between two IntervalIndex objects that have " + "compatible dtypes" + ).format(op=op_name) + with pytest.raises(TypeError, match=msg): + set_op(other, sort=sort) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/conftest.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/conftest.py new file mode 100644 index 0000000..acaea4f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/conftest.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, MultiIndex + + +@pytest.fixture +def idx(): + # a MultiIndex used to test the general functionality of the + # general functionality of this object + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + +@pytest.fixture +def idx_dup(): + # compare tests/indexes/multi/conftest.py + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + +@pytest.fixture +def index_names(): + # names that match those in the idx fixture for testing equality of + # names assigned to the idx + return ["first", "second"] + + +@pytest.fixture +def holder(): + # the MultiIndex constructor used to 
base compatibility with pickle + return MultiIndex + + +@pytest.fixture +def compat_props(): + # a MultiIndex must have these properties associated with it + return ["shape", "ndim", "size"] + + +@pytest.fixture +def narrow_multi_index(): + """ + Return a MultiIndex that is narrower than the display (<80 characters). + """ + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) + + +@pytest.fixture +def wide_multi_index(): + """ + Return a MultiIndex that is wider than the display (>80 characters). + """ + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] + return pd.MultiIndex.from_arrays(levels, names=names) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_analytics.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_analytics.py new file mode 100644 index 0000000..f04776e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_analytics.py @@ -0,0 +1,356 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p17 + +import pandas as pd +from pandas import Index, MultiIndex, date_range, period_range +import pandas._testing as tm + + +def test_shift(idx): + + # GH8083 test the base class for shift + msg = "Not supported for type MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1) + with pytest.raises(NotImplementedError, match=msg): + idx.shift(1, 2) + + +def test_groupby(idx): + groups = idx.groupby(np.array([1, 1, 1, 2, 2, 2])) + labels = idx.tolist() + exp = {1: labels[:3], 2: labels[3:]} + tm.assert_dict_equal(groups, exp) + + # GH5620 + groups = idx.groupby(idx) + exp = {key: [key] for key in idx} + tm.assert_dict_equal(groups, exp) + + +def test_truncate(): + major_axis = Index(list(range(4))) + minor_axis = Index(list(range(2))) + + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + + result = index.truncate(before=1) + assert "foo" not in result.levels[0] + assert 1 in result.levels[0] + + result = index.truncate(after=1) + assert 2 not in result.levels[0] + assert 1 in result.levels[0] + + result = index.truncate(before=1, after=2) + assert len(result.levels[0]) == 2 + + msg = "after < before" + with pytest.raises(ValueError, match=msg): + index.truncate(3, 1) + + +def test_where(): + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + i.where(True) + + +@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) +def test_where_array_like(klass): + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + cond = [False, True] + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + i.where(klass(cond)) + + +# TODO: reshape + + +def test_reorder_levels(idx): + # this blows up + with pytest.raises(IndexError, match="^Too many levels"): + idx.reorder_levels([2, 1, 0]) + + +def test_numpy_repeat(): + reps = 2 + numbers = [1, 2, 3] + names = np.array(["foo", "bar"]) + + m = MultiIndex.from_product([numbers, names], names=names) + expected = 
MultiIndex.from_product([numbers, names.repeat(reps)], names=names) + tm.assert_index_equal(np.repeat(m, reps), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(m, reps, axis=1) + + +def test_append_mixed_dtypes(): + # GH 13660 + dti = date_range("2011-01-01", freq="M", periods=3) + dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + pi = period_range("2011-01", freq="M", periods=3) + + mi = MultiIndex.from_arrays( + [[1, 2, 3], [1.1, np.nan, 3.3], ["a", "b", "c"], dti, dti_tz, pi] + ) + assert mi.nlevels == 6 + + res = mi.append(mi) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ["a", "b", "c", "a", "b", "c"], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi), + ] + ) + tm.assert_index_equal(res, exp) + + other = MultiIndex.from_arrays( + [ + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ] + ) + + res = mi.append(other) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, "x", "y", "z"], + [1.1, np.nan, 3.3, "x", "y", "z"], + ["a", "b", "c", "x", "y", "z"], + dti.append(pd.Index(["x", "y", "z"])), + dti_tz.append(pd.Index(["x", "y", "z"])), + pi.append(pd.Index(["x", "y", "z"])), + ] + ) + tm.assert_index_equal(res, exp) + + +def test_take(idx): + indexer = [4, 3, 0, 2] + result = idx.take(indexer) + expected = idx[indexer] + assert result.equals(expected) + + # TODO: Remove Commented Code + # if not isinstance(idx, + # (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # GH 10791 + msg = "'MultiIndex' object has no attribute 'freq'" + with pytest.raises(AttributeError, match=msg): + idx.freq + + +def test_take_invalid_kwargs(idx): + idx = idx + indices = [1, 2] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + +def test_take_fill_value(): + # GH 12631 + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) + + result = idx.take(np.array([1, 0, -1])) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + (np.nan, pd.NaT), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) + tm.assert_index_equal(result, expected) + + msg = "When allow_fill=True and fill_value is not None, all indices must be >= -1" + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + 
idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for( axis 0 with)? size 4" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +def test_iter(idx): + result = list(idx) + expected = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + assert result == expected + + +def test_sub(idx): + + first = idx + + # - now raises (previously was set op difference) + msg = "cannot perform __sub__ with this index type: MultiIndex" + with pytest.raises(TypeError, match=msg): + first - idx[-3:] + with pytest.raises(TypeError, match=msg): + idx[-3:] - first + with pytest.raises(TypeError, match=msg): + idx[-3:] - first.tolist() + msg = "cannot perform __rsub__ with this index type: MultiIndex" + with pytest.raises(TypeError, match=msg): + first.tolist() - idx[-3:] + + +def test_map(idx): + # callable + index = idx + + # we don't infer UInt64 + if isinstance(index, pd.UInt64Index): + expected = index.astype("int64") + else: + expected = index + + result = index.map(lambda x: x) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "mapper", + [ + lambda values, idx: {i: e for e, i in zip(values, idx)}, + lambda values, idx: pd.Series(values, idx), + ], +) +def test_map_dictlike(idx, mapper): + + if isinstance(idx, (pd.CategoricalIndex, pd.IntervalIndex)): + pytest.skip(f"skipping tests for {type(idx)}") + + identity = mapper(idx.values, idx) + + # we don't infer to UInt64 for a dict + if isinstance(idx, pd.UInt64Index) and isinstance(identity, dict): + expected = idx.astype("int64") + else: + expected = idx + + result = idx.map(identity) + tm.assert_index_equal(result, expected) + + # empty mappable + expected = pd.Index([np.nan] * len(idx)) + result = idx.map(mapper(expected, idx)) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda func: func.__name__, +) +def test_numpy_ufuncs(idx, func): + # test ufuncs of numpy. 
see: + # http://docs.scipy.org/doc/numpy/reference/ufuncs.html + + if _np_version_under1p17: + expected_exception = AttributeError + msg = f"'tuple' object has no attribute '{func.__name__}'" + else: + expected_exception = TypeError + msg = ( + "loop of ufunc does not support argument 0 of type tuple which" + f" has no callable {func.__name__} method" + ) + with pytest.raises(expected_exception, match=msg): + func(idx) + + +@pytest.mark.parametrize( + "func", + [np.isfinite, np.isinf, np.isnan, np.signbit], + ids=lambda func: func.__name__, +) +def test_numpy_type_funcs(idx, func): + msg = ( + f"ufunc '{func.__name__}' not supported for the input types, and the inputs " + "could not be safely coerced to any supported types according to " + "the casting rule ''safe''" + ) + with pytest.raises(TypeError, match=msg): + func(idx) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_astype.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_astype.py new file mode 100644 index 0000000..2990853 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_astype.py @@ -0,0 +1,30 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas._testing as tm + + +def test_astype(idx): + expected = idx.copy() + actual = idx.astype("O") + tm.assert_copy(actual.levels, expected.levels) + tm.assert_copy(actual.codes, expected.codes) + assert actual.names == list(expected.names) + + with pytest.raises(TypeError, match="^Setting.*dtype.*object"): + idx.astype(np.dtype(int)) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_astype_category(idx, ordered): + # GH 18630 + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + idx.astype(CategoricalDtype(ordered=ordered)) + + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + with pytest.raises(NotImplementedError, match=msg): + idx.astype("category") diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_compat.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_compat.py new file mode 100644 index 0000000..d92cff1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_compat.py @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +from pandas import MultiIndex +import pandas._testing as tm + + +def test_numeric_compat(idx): + with pytest.raises(TypeError, match="cannot perform __mul__"): + idx * 1 + + with pytest.raises(TypeError, match="cannot perform __rmul__"): + 1 * idx + + div_err = "cannot perform __truediv__" + with pytest.raises(TypeError, match=div_err): + idx / 1 + + div_err = div_err.replace(" __", " __r") + with pytest.raises(TypeError, match=div_err): + 1 / idx + + with pytest.raises(TypeError, match="cannot perform __floordiv__"): + idx // 1 + + with pytest.raises(TypeError, match="cannot perform __rfloordiv__"): + 1 // idx + + +@pytest.mark.parametrize("method", ["all", "any"]) +def test_logical_compat(idx, method): + msg = "cannot perform {method}".format(method=method) + + with pytest.raises(TypeError, match=msg): + getattr(idx, method)() + + +def test_boolean_context_compat(idx): + + with pytest.raises(ValueError): + bool(idx) + + +def test_boolean_context_compat2(): + + # boolean context compat + # GH7897 + i1 = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + i2 = MultiIndex.from_tuples([("A", 1), ("A", 3)]) + common = i1.intersection(i2) + + with pytest.raises(ValueError): + bool(common) + + +def 
test_inplace_mutation_resets_values(): + levels = [["a", "b", "c"], [4]] + levels2 = [[1, 2, 3], ["a"]] + codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] + + mi1 = MultiIndex(levels=levels, codes=codes) + mi2 = MultiIndex(levels=levels2, codes=codes) + vals = mi1.values.copy() + vals2 = mi2.values.copy() + + assert mi1._tuples is not None + + # Make sure level setting works + new_vals = mi1.set_levels(levels2).values + tm.assert_almost_equal(vals2, new_vals) + + # Non-inplace doesn't kill _tuples [implementation detail] + tm.assert_almost_equal(mi1._tuples, vals) + + # ...and values is still same too + tm.assert_almost_equal(mi1.values, vals) + + # Inplace should kill _tuples + mi1.set_levels(levels2, inplace=True) + tm.assert_almost_equal(mi1.values, vals2) + + # Make sure label setting works too + codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] + exp_values = np.empty((6,), dtype=object) + exp_values[:] = [(1, "a")] * 6 + + # Must be 1d array of tuples + assert exp_values.shape == (6,) + new_values = mi2.set_codes(codes2).values + + # Not inplace shouldn't change + tm.assert_almost_equal(mi2._tuples, vals2) + + # Should have correct values + tm.assert_almost_equal(exp_values, new_values) + + # ...and again setting inplace should kill _tuples, etc + mi2.set_codes(codes2, inplace=True) + tm.assert_almost_equal(mi2.values, new_values) + + +def test_ndarray_compat_properties(idx, compat_props): + assert idx.T.equals(idx) + assert idx.transpose().equals(idx) + + values = idx.values + for prop in compat_props: + assert getattr(idx, prop) == getattr(values, prop) + + # test for validity + idx.nbytes + idx.values.nbytes + + +def test_compat(indices): + assert indices.tolist() == list(indices) + + +def test_pickle_compat_construction(holder): + # this is testing for pickle compat + # need an object to create with + with pytest.raises(TypeError, match="Must pass both levels and codes"): + holder() diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_constructors.py new file mode 100644 index 0000000..2c4b3ce --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_constructors.py @@ -0,0 +1,725 @@ +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp + +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + +import pandas as pd +from pandas import Index, MultiIndex, date_range +import pandas._testing as tm + + +def test_constructor_single_level(): + result = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) + assert isinstance(result, MultiIndex) + expected = Index(["foo", "bar", "baz", "qux"], name="first") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["first"] + + +def test_constructor_no_levels(): + msg = "non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex(levels=[], codes=[]) + + msg = "Must pass both levels and codes" + with pytest.raises(TypeError, match=msg): + MultiIndex(levels=[]) + with pytest.raises(TypeError, match=msg): + MultiIndex(codes=[]) + + +def test_constructor_nonhashable_names(): + # GH 20527 + levels = [[1, 2], ["one", "two"]] + codes = [[0, 0, 1, 1], [0, 1, 0, 1]] + names = (["foo"], ["bar"]) + msg = r"MultiIndex\.name must be a hashable type" + with pytest.raises(TypeError, match=msg): + MultiIndex(levels=levels, codes=codes, names=names) + + # With .rename() + mi = MultiIndex( + levels=[[1, 2], ["one", 
"two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=("foo", "bar"), + ) + renamed = [["foor"], ["barr"]] + with pytest.raises(TypeError, match=msg): + mi.rename(names=renamed) + + # With .set_names() + with pytest.raises(TypeError, match=msg): + mi.set_names(names=renamed) + + +def test_constructor_mismatched_codes_levels(idx): + codes = [np.array([1]), np.array([2]), np.array([3])] + levels = ["a"] + + msg = "Length of levels and codes must be the same" + with pytest.raises(ValueError, match=msg): + MultiIndex(levels=levels, codes=codes) + + length_error = ( + r"On level 0, code max \(3\) >= length of level \(1\)\. " + "NOTE: this index is in an inconsistent state" + ) + label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" + + # important to check that it's looking at the right thing. + with pytest.raises(ValueError, match=length_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) + + with pytest.raises(ValueError, match=label_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 0, 0, 0], [0, 0]]) + + # external API + with pytest.raises(ValueError, match=length_error): + idx.copy().set_levels([["a"], ["b"]]) + + with pytest.raises(ValueError, match=label_error): + idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + + # test set_codes with verify_integrity=False + # the setting should not raise any value error + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], verify_integrity=False) + + # code value smaller than -1 + with pytest.raises(ValueError, match=code_value_error): + MultiIndex(levels=[["a"], ["b"]], codes=[[0, -2], [0, 0]]) + + +def test_na_levels(): + # GH26408 + # test if codes are re-assigned value -1 for levels + # with mising values (NaN, NaT, None) + result = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[-1, -1, -1, -1, 3, 4]] + ) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[-1, -1, 1, -1, 3, -1]] + ) + tm.assert_index_equal(result, expected) + + # verify set_levels and set_codes + result = MultiIndex( + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]] + ).set_levels([[np.nan, "s", pd.NaT, 128, None]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[1, 2, 2, 2, 2, 2]] + ).set_codes([[0, -1, 1, 2, 3, 4]]) + tm.assert_index_equal(result, expected) + + +def test_copy_in_constructor(): + levels = np.array(["a", "b", "c"]) + codes = np.array([1, 1, 2, 0, 0, 1, 1]) + val = codes[0] + mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], copy=True) + assert mi.codes[0][0] == val + codes[0] = 15 + assert mi.codes[0][0] == val + val = levels[0] + levels[0] = "PANDA" + assert mi.levels[0][0] == val + + +# ---------------------------------------------------------------------------- +# from_arrays +# ---------------------------------------------------------------------------- +def test_from_arrays(idx): + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] + + # list of arrays as input + result = MultiIndex.from_arrays(arrays, names=idx.names) + tm.assert_index_equal(result, idx) + + # infer correctly + result = MultiIndex.from_arrays([[pd.NaT, Timestamp("20130101")], ["a", "b"]]) 
+ assert result.levels[0].equals(Index([Timestamp("20130101")])) + assert result.levels[1].equals(Index(["a", "b"])) + + +def test_from_arrays_iterator(idx): + # GH 18434 + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] + + # iterator as input + result = MultiIndex.from_arrays(iter(arrays), names=idx.names) + tm.assert_index_equal(result, idx) + + # invalid iterator input + msg = "Input must be a list / sequence of array-likes." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_arrays(0) + + +def test_from_arrays_tuples(idx): + arrays = tuple( + tuple(np.asarray(lev).take(level_codes)) + for lev, level_codes in zip(idx.levels, idx.codes) + ) + + # tuple of tuples as input + result = MultiIndex.from_arrays(arrays, names=idx.names) + tm.assert_index_equal(result, idx) + + +def test_from_arrays_index_series_datetimetz(): + idx1 = pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3, tz="Asia/Tokyo") + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_series_timedelta(): + idx1 = pd.timedelta_range("1 days", freq="D", periods=3) + idx2 = pd.timedelta_range("2 hours", freq="H", periods=3) + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_series_period(): + idx1 = pd.period_range("2011-01-01", freq="D", periods=3) + idx2 = pd.period_range("2015-01-01", freq="H", periods=3) + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_datetimelike_mixed(): + idx1 = pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3) + idx3 = pd.timedelta_range("1 days", freq="D", periods=3) + idx4 = pd.period_range("2011-01-01", freq="D", periods=3) + + result = pd.MultiIndex.from_arrays([idx1, idx2, idx3, idx4]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + tm.assert_index_equal(result.get_level_values(2), idx3) + tm.assert_index_equal(result.get_level_values(3), idx4) + + result2 = pd.MultiIndex.from_arrays( + [pd.Series(idx1), pd.Series(idx2), pd.Series(idx3), pd.Series(idx4)] + ) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + tm.assert_index_equal(result2.get_level_values(2), idx3) + 
tm.assert_index_equal(result2.get_level_values(3), idx4) + + tm.assert_index_equal(result, result2) + + +def test_from_arrays_index_series_categorical(): + # GH13743 + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=True) + + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + result3 = pd.MultiIndex.from_arrays([idx1.values, idx2.values]) + tm.assert_index_equal(result3.get_level_values(0), idx1) + tm.assert_index_equal(result3.get_level_values(1), idx2) + + +def test_from_arrays_empty(): + # 0 levels + msg = "Must pass non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_arrays(arrays=[]) + + # 1 level + result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) + assert isinstance(result, MultiIndex) + expected = Index([], name="A") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] + + # N levels + for N in [2, 3]: + arrays = [[]] * N + names = list("ABC")[:N] + result = MultiIndex.from_arrays(arrays=arrays, names=names) + expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "invalid_sequence_of_arrays", + [ + 1, + [1], + [1, 2], + [[1], 2], + [1, [2]], + "a", + ["a"], + ["a", "b"], + [["a"], "b"], + (1,), + (1, 2), + ([1], 2), + (1, [2]), + "a", + ("a",), + ("a", "b"), + (["a"], "b"), + [(1,), 2], + [1, (2,)], + [("a",), "b"], + ((1,), 2), + (1, (2,)), + (("a",), "b"), + ], +) +def test_from_arrays_invalid_input(invalid_sequence_of_arrays): + msg = "Input must be a list / sequence of array-likes" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays) + + +@pytest.mark.parametrize( + "idx1, idx2", [([1, 2, 3], ["a", "b"]), ([], ["a", "b"]), ([1, 2, 3], [])] +) +def test_from_arrays_different_lengths(idx1, idx2): + # see gh-13599 + msg = "^all arrays must be same length$" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_arrays([idx1, idx2]) + + +def test_from_arrays_respects_none_names(): + # GH27292 + a = pd.Series([1, 2, 3], name="foo") + b = pd.Series(["a", "b", "c"], name="bar") + + result = MultiIndex.from_arrays([a, b], names=None) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b", "c"]], codes=[[0, 1, 2], [0, 1, 2]], names=None + ) + + tm.assert_index_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# from_tuples +# ---------------------------------------------------------------------------- +def test_from_tuples(): + msg = "Cannot infer number of levels from empty list" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_tuples([]) + + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) + + # input tuples + result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=["a", "b"]) + tm.assert_index_equal(result, expected) + + +def test_from_tuples_iterator(): + # GH 18434 + # input iterator for tuples + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) + + result = 
MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"]) + tm.assert_index_equal(result, expected) + + # input non-iterables + msg = "Input must be a list / sequence of tuple-likes." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_tuples(0) + + +def test_from_tuples_empty(): + # GH 16777 + result = MultiIndex.from_tuples([], names=["a", "b"]) + expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) + tm.assert_index_equal(result, expected) + + +def test_from_tuples_index_values(idx): + result = MultiIndex.from_tuples(idx) + assert (result.values == idx.values).all() + + +def test_tuples_with_name_string(): + # GH 15110 and GH 14848 + + li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + pd.Index(li, name="abc") + with pytest.raises(ValueError, match=msg): + pd.Index(li, name="a") + + +def test_from_tuples_with_tuple_label(): + # GH 15457 + expected = pd.DataFrame( + [[2, 1, 2], [4, (1, 2), 3]], columns=["a", "b", "c"] + ).set_index(["a", "b"]) + idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=("a", "b")) + result = pd.DataFrame([2, 3], columns=["c"], index=idx) + tm.assert_frame_equal(expected, result) + + +# ---------------------------------------------------------------------------- +# from_product +# ---------------------------------------------------------------------------- +def test_from_product_empty_zero_levels(): + # 0 levels + msg = "Must pass non-zero number of levels/codes" + with pytest.raises(ValueError, match=msg): + MultiIndex.from_product([]) + + +def test_from_product_empty_one_level(): + result = MultiIndex.from_product([[]], names=["A"]) + expected = pd.Index([], name="A") + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ["A"] + + +@pytest.mark.parametrize( + "first, second", [([], []), (["foo", "bar", "baz"], []), ([], ["a", "b", "c"])] +) +def test_from_product_empty_two_levels(first, second): + names = ["A", "B"] + result = MultiIndex.from_product([first, second], names=names) + expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("N", list(range(4))) +def test_from_product_empty_three_levels(N): + # GH12258 + names = ["A", "B", "C"] + lvl2 = list(range(N)) + result = MultiIndex.from_product([[], lvl2, []], names=names) + expected = MultiIndex(levels=[[], lvl2, []], codes=[[], [], []], names=names) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "invalid_input", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] +) +def test_from_product_invalid_input(invalid_input): + msg = r"Input must be a list / sequence of iterables|Input must be list-like" + with pytest.raises(TypeError, match=msg): + MultiIndex.from_product(iterables=invalid_input) + + +def test_from_product_datetimeindex(): + dt_index = date_range("2000-01-01", periods=2) + mi = pd.MultiIndex.from_product([[1, 2], dt_index]) + etalon = construct_1d_object_array_from_listlike( + [ + (1, pd.Timestamp("2000-01-01")), + (1, pd.Timestamp("2000-01-02")), + (2, pd.Timestamp("2000-01-01")), + (2, pd.Timestamp("2000-01-02")), + ] + ) + tm.assert_numpy_array_equal(mi.values, etalon) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize("f", [lambda x: x, lambda x: pd.Series(x), lambda x: x.values]) +def test_from_product_index_series_categorical(ordered, f): + # GH13743 + first = ["foo", "bar"] + + 
idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=ordered) + expected = pd.CategoricalIndex( + list("abcaab") + list("abcaab"), categories=list("bac"), ordered=ordered + ) + + result = pd.MultiIndex.from_product([first, f(idx)]) + tm.assert_index_equal(result.get_level_values(1), expected) + + +def test_from_product(): + + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + result = MultiIndex.from_product([first, second], names=names) + + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] + expected = MultiIndex.from_tuples(tuples, names=names) + + tm.assert_index_equal(result, expected) + + +def test_from_product_iterator(): + # GH 18434 + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] + expected = MultiIndex.from_tuples(tuples, names=names) + + # iterator as input + result = MultiIndex.from_product(iter([first, second]), names=names) + tm.assert_index_equal(result, expected) + + # Invalid non-iterable input + msg = "Input must be a list / sequence of iterables." + with pytest.raises(TypeError, match=msg): + MultiIndex.from_product(0) + + +@pytest.mark.parametrize( + "a, b, expected_names", + [ + ( + pd.Series([1, 2, 3], name="foo"), + pd.Series(["a", "b"], name="bar"), + ["foo", "bar"], + ), + (pd.Series([1, 2, 3], name="foo"), ["a", "b"], ["foo", None]), + ([1, 2, 3], ["a", "b"], None), + ], +) +def test_from_product_infer_names(a, b, expected_names): + # GH27292 + result = MultiIndex.from_product([a, b]) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b"]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=expected_names, + ) + tm.assert_index_equal(result, expected) + + +def test_from_product_respects_none_names(): + # GH27292 + a = pd.Series([1, 2, 3], name="foo") + b = pd.Series(["a", "b"], name="bar") + + result = MultiIndex.from_product([a, b], names=None) + expected = MultiIndex( + levels=[[1, 2, 3], ["a", "b"]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=None, + ) + tm.assert_index_equal(result, expected) + + +def test_from_product_readonly(): + # GH#15286 passing read-only array to from_product + a = np.array(range(3)) + b = ["a", "b"] + expected = MultiIndex.from_product([a, b]) + + a.setflags(write=False) + result = MultiIndex.from_product([a, b]) + tm.assert_index_equal(result, expected) + + +def test_create_index_existing_name(idx): + + # GH11193, when an existing index is passed, and a new name is not + # specified, the new index should inherit the previous object name + index = idx + index.names = ["foo", "bar"] + result = pd.Index(index) + expected = Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ) + ) + tm.assert_index_equal(result, expected) + + result = pd.Index(index, name="A") + expected = Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + name="A", + ) + tm.assert_index_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# from_frame +# 
---------------------------------------------------------------------------- +def test_from_frame(): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=["L1", "L2"] + ) + expected = pd.MultiIndex.from_tuples( + [("a", "a"), ("a", "b"), ("b", "a"), ("b", "b")], names=["L1", "L2"] + ) + result = pd.MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +@pytest.mark.parametrize( + "non_frame", + [ + pd.Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + pd.Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 27, + ], +) +def test_from_frame_error(non_frame): + # GH 22420 + with pytest.raises(TypeError, match="Input must be a DataFrame"): + pd.MultiIndex.from_frame(non_frame) + + +def test_from_frame_dtype_fidelity(): + # GH 22420 + df = pd.DataFrame( + { + "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } + ) + original_dtypes = df.dtypes.to_dict() + + expected_mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + mi = pd.MultiIndex.from_frame(df) + mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + tm.assert_index_equal(expected_mi, mi) + assert original_dtypes == mi_dtypes + + +@pytest.mark.parametrize( + "names_in,names_out", [(None, [("L1", "x"), ("L2", "y")]), (["x", "y"], ["x", "y"])] +) +def test_from_frame_valid_names(names_in, names_out): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) + mi = pd.MultiIndex.from_frame(df, names=names_in) + assert mi.names == names_out + + +@pytest.mark.parametrize( + "names,expected_error_msg", + [ + ("bad_input", "Names should be list-like for a MultiIndex"), + (["a", "b", "c"], "Length of names must match number of levels in MultiIndex"), + ], +) +def test_from_frame_invalid_names(names, expected_error_msg): + # GH 22420 + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) + with pytest.raises(ValueError, match=expected_error_msg): + pd.MultiIndex.from_frame(df, names=names) + + +def test_index_equal_empty_iterable(): + # #16844 + a = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"]) + b = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) + tm.assert_index_equal(a, b) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_contains.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_contains.py new file mode 100644 index 0000000..49aa632 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_contains.py @@ -0,0 +1,124 @@ +import numpy as np +import pytest + +from pandas.compat import PYPY + +import pandas as pd +from pandas import MultiIndex +import pandas._testing as tm + + +def test_contains_top_level(): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + assert "A" in midx + assert "A" not in midx._engine + + +def test_contains_with_nat(): + # MI with a NaT + mi = MultiIndex( + levels=[["C"], pd.date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + assert ("C", 
pd.Timestamp("2012-01-01")) in mi + for val in mi.values: + assert val in mi + + +def test_contains(idx): + assert ("foo", "two") in idx + assert ("bar", "two") not in idx + assert None not in idx + + +@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") +def test_isin_nan_pypy(): + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, True]) + ) + + +def test_isin(): + values = [("foo", 2), ("bar", 3), ("quux", 4)] + + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) + result = idx.isin(values) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + # empty, return dtype bool + idx = MultiIndex.from_arrays([[], []]) + result = idx.isin(values) + assert len(result) == 0 + assert result.dtype == np.bool_ + + +@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") +def test_isin_nan_not_pypy(): + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, False]) + ) + + +def test_isin_level_kwarg(): + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) + + vals_0 = ["foo", "bar", "quux"] + vals_1 = [2, 3, 10] + + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=0)) + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level=-2)) + + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=1)) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level=-1)) + + msg = "Too many levels: Index has only 2 levels, not 6" + with pytest.raises(IndexError, match=msg): + idx.isin(vals_0, level=5) + msg = "Too many levels: Index has only 2 levels, -5 is not a valid level number" + with pytest.raises(IndexError, match=msg): + idx.isin(vals_0, level=-5) + + with pytest.raises(KeyError, match=r"'Level 1\.0 not found'"): + idx.isin(vals_0, level=1.0) + with pytest.raises(KeyError, match=r"'Level -1\.0 not found'"): + idx.isin(vals_1, level=-1.0) + with pytest.raises(KeyError, match="'Level A not found'"): + idx.isin(vals_1, level="A") + + idx.names = ["A", "B"] + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level="A")) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level="B")) + + with pytest.raises(KeyError, match="'Level C not found'"): + idx.isin(vals_1, level="C") + + +def test_contains_with_missing_value(): + # issue 19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + +@pytest.mark.parametrize( + "labels,expected,level", + [ + ([("b", np.nan)], np.array([False, False, True]), None,), + ([np.nan, "a"], np.array([True, True, False]), 0), + (["d", np.nan], np.array([False, True, True]), 1), + ], +) +def test_isin_multi_index_with_missing_value(labels, expected, level): + # GH 19132 + midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]]) + tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_conversion.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_conversion.py new file mode 100644 index 
0000000..8956e6e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_conversion.py @@ -0,0 +1,197 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, date_range +import pandas._testing as tm + + +def test_tolist(idx): + result = idx.tolist() + exp = list(idx.values) + assert result == exp + + +def test_to_numpy(idx): + result = idx.to_numpy() + exp = idx.values + tm.assert_numpy_array_equal(result, exp) + + +def test_to_frame(): + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + + index = MultiIndex.from_tuples(tuples) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) + result = index.to_frame(index=False) + expected = DataFrame(tuples) + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + # See GH-22580 + index = MultiIndex.from_tuples(tuples) + result = index.to_frame(index=False, name=["first", "second"]) + expected = DataFrame(tuples) + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + result = index.to_frame(name=["first", "second"]) + expected.index = index + expected.columns = ["first", "second"] + tm.assert_frame_equal(result, expected) + + msg = "'name' must be a list / sequence of column names." + with pytest.raises(TypeError, match=msg): + index.to_frame(name="first") + + msg = "'name' should have same length as number of levels on index." 
+ with pytest.raises(ValueError, match=msg): + index.to_frame(name=["first"]) + + # Tests for datetime index + index = MultiIndex.from_product([range(5), pd.date_range("20130101", periods=3)]) + result = index.to_frame(index=False) + expected = DataFrame( + { + 0: np.repeat(np.arange(5, dtype="int64"), 3), + 1: np.tile(pd.date_range("20130101", periods=3), 5), + } + ) + tm.assert_frame_equal(result, expected) + + result = index.to_frame() + expected.index = index + tm.assert_frame_equal(result, expected) + + # See GH-22580 + result = index.to_frame(index=False, name=["first", "second"]) + expected = DataFrame( + { + "first": np.repeat(np.arange(5, dtype="int64"), 3), + "second": np.tile(pd.date_range("20130101", periods=3), 5), + } + ) + tm.assert_frame_equal(result, expected) + + result = index.to_frame(name=["first", "second"]) + expected.index = index + tm.assert_frame_equal(result, expected) + + +def test_to_frame_dtype_fidelity(): + # GH 22420 + mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + expected_df = pd.DataFrame( + { + "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } + ) + df = mi.to_frame(index=False) + df_dtypes = df.dtypes.to_dict() + + tm.assert_frame_equal(df, expected_df) + assert original_dtypes == df_dtypes + + +def test_to_frame_resulting_column_order(): + # GH 22420 + expected = ["z", 0, "a"] + mi = pd.MultiIndex.from_arrays( + [["a", "b", "c"], ["x", "y", "z"], ["q", "w", "e"]], names=expected + ) + result = mi.to_frame().columns.tolist() + assert result == expected + + +def test_roundtrip_pickle_with_tz(): + return # FIXME: this can't be right? + + # GH 8367 + # round-trip of timezone + index = MultiIndex.from_product( + [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")], + names=["one", "two", "three"], + ) + unpickled = tm.round_trip_pickle(index) + assert index.equal_levels(unpickled) + + +def test_pickle(indices): + return # FIXME: this can't be right? 
+ + unpickled = tm.round_trip_pickle(indices) + assert indices.equals(unpickled) + original_name, indices.name = indices.name, "foo" + unpickled = tm.round_trip_pickle(indices) + assert indices.equals(unpickled) + indices.name = original_name + + +def test_to_series(idx): + # assert that we are creating a copy of the index + + s = idx.to_series() + assert s.values is not idx.values + assert s.index is not idx + assert s.name == idx.name + + +def test_to_series_with_arguments(idx): + # GH18699 + + # index kwarg + s = idx.to_series(index=idx) + + assert s.values is not idx.values + assert s.index is idx + assert s.name == idx.name + + # name kwarg + idx = idx + s = idx.to_series(name="__test") + + assert s.values is not idx.values + assert s.index is not idx + assert s.name != idx.name + + +def test_to_flat_index(idx): + expected = pd.Index( + ( + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ), + tupleize_cols=False, + ) + result = idx.to_flat_index() + tm.assert_index_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_copy.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_copy.py new file mode 100644 index 0000000..1acc65a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_copy.py @@ -0,0 +1,88 @@ +from copy import copy, deepcopy + +import pytest + +from pandas import MultiIndex +import pandas._testing as tm + + +def assert_multiindex_copied(copy, original): + # Levels should be (at least, shallow copied) + tm.assert_copy(copy.levels, original.levels) + tm.assert_almost_equal(copy.codes, original.codes) + + # Labels doesn't matter which way copied + tm.assert_almost_equal(copy.codes, original.codes) + assert copy.codes is not original.codes + + # Names doesn't matter which way copied + assert copy.names == original.names + assert copy.names is not original.names + + # Sort order should be copied + assert copy.sortorder == original.sortorder + + +def test_copy(idx): + i_copy = idx.copy() + + assert_multiindex_copied(i_copy, idx) + + +def test_shallow_copy(idx): + i_copy = idx._shallow_copy() + + assert_multiindex_copied(i_copy, idx) + + +def test_view(idx): + i_view = idx.view() + assert_multiindex_copied(i_view, idx) + + +@pytest.mark.parametrize("func", [copy, deepcopy]) +def test_copy_and_deepcopy(func): + + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + idx_copy = func(idx) + assert idx_copy is not idx + assert idx_copy.equals(idx) + + +@pytest.mark.parametrize("deep", [True, False]) +def test_copy_method(deep): + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + idx_copy = idx.copy(deep=deep) + assert idx_copy.equals(idx) + + +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize( + "kwarg, value", + [ + ("names", ["third", "fourth"]), + ("levels", [["foo2", "bar2"], ["fizz2", "buzz2"]]), + ("codes", [[1, 0, 0, 0], [1, 1, 0, 0]]), + ], +) +def test_copy_method_kwargs(deep, kwarg, value): + # gh-12309: Check that the "name" argument as well other kwargs are honored + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + return + idx_copy = idx.copy(**{kwarg: value, "deep": deep}) + if kwarg == "names": + assert getattr(idx_copy, kwarg) == value + else: + assert [list(i) for i in 
getattr(idx_copy, kwarg)] == value diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_drop.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_drop.py new file mode 100644 index 0000000..b909025 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_drop.py @@ -0,0 +1,190 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import Index, MultiIndex +import pandas._testing as tm + + +def test_drop(idx): + dropped = idx.drop([("foo", "two"), ("qux", "one")]) + + index = MultiIndex.from_tuples([("foo", "two"), ("qux", "one")]) + dropped2 = idx.drop(index) + + expected = idx[[0, 2, 3, 5]] + tm.assert_index_equal(dropped, expected) + tm.assert_index_equal(dropped2, expected) + + dropped = idx.drop(["bar"]) + expected = idx[[0, 1, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop("foo") + expected = idx[[2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + index = MultiIndex.from_tuples([("bar", "two")]) + with pytest.raises(KeyError, match=r"^10$"): + idx.drop([("bar", "two")]) + with pytest.raises(KeyError, match=r"^10$"): + idx.drop(index) + with pytest.raises(KeyError, match=r"^'two'$"): + idx.drop(["foo", "two"]) + + # partially correct argument + mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) + with pytest.raises(KeyError, match=r"^10$"): + idx.drop(mixed_index) + + # error='ignore' + dropped = idx.drop(index, errors="ignore") + expected = idx[[0, 1, 2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop(mixed_index, errors="ignore") + expected = idx[[0, 1, 2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + dropped = idx.drop(["foo", "two"], errors="ignore") + expected = idx[[2, 3, 4, 5]] + tm.assert_index_equal(dropped, expected) + + # mixed partial / full drop + dropped = idx.drop(["foo", ("qux", "one")]) + expected = idx[[2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + # mixed partial / full drop / error='ignore' + mixed_index = ["foo", ("qux", "one"), "two"] + with pytest.raises(KeyError, match=r"^'two'$"): + idx.drop(mixed_index) + dropped = idx.drop(mixed_index, errors="ignore") + expected = idx[[2, 3, 5]] + tm.assert_index_equal(dropped, expected) + + +def test_droplevel_with_names(idx): + index = idx[idx.get_loc("foo")] + dropped = index.droplevel(0) + assert dropped.name == "second" + + index = MultiIndex( + levels=[Index(range(4)), Index(range(4)), Index(range(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + dropped = index.droplevel(0) + assert dropped.names == ("two", "three") + + dropped = index.droplevel("two") + expected = index.droplevel(1) + assert dropped.equals(expected) + + +def test_droplevel_list(): + index = MultiIndex( + levels=[Index(range(4)), Index(range(4)), Index(range(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + + dropped = index[:2].droplevel(["three", "one"]) + expected = index[:2].droplevel(2).droplevel(0) + assert dropped.equals(expected) + + dropped = index[:2].droplevel([]) + expected = index[:2] + assert dropped.equals(expected) + + msg = ( + "Cannot remove 3 levels from an index with 3 levels: " + "at least one level must be left" + ) + with pytest.raises(ValueError, match=msg): + 
index[:2].droplevel(["one", "two", "three"]) + + with pytest.raises(KeyError, match="'Level four not found'"): + index[:2].droplevel(["one", "four"]) + + +def test_drop_not_lexsorted(): + # GH 12078 + + # define the lexsorted version of the multi-index + tuples = [("a", ""), ("b1", "c1"), ("b2", "c2")] + lexsorted_mi = MultiIndex.from_tuples(tuples, names=["b", "c"]) + assert lexsorted_mi.is_lexsorted() + + # and the not-lexsorted version + df = pd.DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + df = df.pivot_table(index="a", columns=["b", "c"], values="d") + df = df.reset_index() + not_lexsorted_mi = df.columns + assert not not_lexsorted_mi.is_lexsorted() + + # compare the results + tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_duplicates.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_duplicates.py new file mode 100644 index 0000000..93e1de5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_duplicates.py @@ -0,0 +1,276 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas._libs import hashtable + +from pandas import DatetimeIndex, MultiIndex +import pandas._testing as tm + + +@pytest.mark.parametrize("names", [None, ["first", "second"]]) +def test_unique(names): + mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) + + res = mi.unique() + exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names) + tm.assert_index_equal(res, exp) + + mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names) + res = mi.unique() + exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names) + tm.assert_index_equal(res, exp) + + mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names) + res = mi.unique() + exp = 
MultiIndex.from_arrays([["a"], ["a"]], names=mi.names) + tm.assert_index_equal(res, exp) + + # GH #20568 - empty MI + mi = MultiIndex.from_arrays([[], []], names=names) + res = mi.unique() + tm.assert_index_equal(mi, res) + + +def test_unique_datetimelike(): + idx1 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"] + ) + idx2 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"], + tz="Asia/Tokyo", + ) + result = MultiIndex.from_arrays([idx1, idx2]).unique() + + eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"]) + eidx2 = DatetimeIndex( + ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo" + ) + exp = MultiIndex.from_arrays([eidx1, eidx2]) + tm.assert_index_equal(result, exp) + + +@pytest.mark.parametrize("level", [0, "first", 1, "second"]) +def test_unique_level(idx, level): + # GH #17896 - with level= argument + result = idx.unique(level=level) + expected = idx.get_level_values(level).unique() + tm.assert_index_equal(result, expected) + + # With already unique level + mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"]) + result = mi.unique(level=level) + expected = mi.get_level_values(level) + tm.assert_index_equal(result, expected) + + # With empty MI + mi = MultiIndex.from_arrays([[], []], names=["first", "second"]) + result = mi.unique(level=level) + expected = mi.get_level_values(level) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_get_unique_index(idx, dropna): + mi = idx[[0, 1, 0, 1, 1, 0, 0]] + expected = mi._shallow_copy(mi[[0, 1]]) + + result = mi._get_unique_index(dropna=dropna) + assert result.unique + tm.assert_index_equal(result, expected) + + +def test_duplicate_multiindex_codes(): + # GH 17464 + # Make sure that a MultiIndex with duplicate levels throws a ValueError + with pytest.raises(ValueError): + mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)]) + + # And that using set_levels with duplicate levels fails + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) + with pytest.raises(ValueError): + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) + + +@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) +def test_duplicate_level_names(names): + # GH18872, GH19029 + mi = MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names + + # With .rename() + mi = MultiIndex.from_product([[0, 1]] * 3) + mi = mi.rename(names) + assert mi.names == names + + # With .rename(., level=) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + +def test_duplicate_meta_data(): + # GH 10115 + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + + for idx in [ + mi, + mi.set_names([None, None]), + mi.set_names([None, "Num"]), + mi.set_names(["Upper", "Num"]), + ]: + assert idx.has_duplicates + assert idx.drop_duplicates().names == idx.names + + +def test_has_duplicates(idx, idx_dup): + # see fixtures + assert idx.is_unique is True + assert idx.has_duplicates is False + assert idx_dup.is_unique is False + assert idx_dup.has_duplicates is True + + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + assert mi.is_unique is False + assert mi.has_duplicates is True + + # single instance of NaN + mi_nan = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, 
0, 0, 1, 1], [-1, 0, 1, 0, 1]] + ) + assert mi_nan.is_unique is True + assert mi_nan.has_duplicates is False + + # multiple instances of NaN + mi_nan_dup = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]] + ) + assert mi_nan_dup.is_unique is False + assert mi_nan_dup.has_duplicates is True + + +def test_has_duplicates_from_tuples(): + # GH 9075 + t = [ + ("x", "out", "z", 5, "y", "in", "z", 169), + ("x", "out", "z", 7, "y", "in", "z", 119), + ("x", "out", "z", 9, "y", "in", "z", 135), + ("x", "out", "z", 13, "y", "in", "z", 145), + ("x", "out", "z", 14, "y", "in", "z", 158), + ("x", "out", "z", 16, "y", "in", "z", 122), + ("x", "out", "z", 17, "y", "in", "z", 160), + ("x", "out", "z", 18, "y", "in", "z", 180), + ("x", "out", "z", 20, "y", "in", "z", 143), + ("x", "out", "z", 21, "y", "in", "z", 128), + ("x", "out", "z", 22, "y", "in", "z", 129), + ("x", "out", "z", 25, "y", "in", "z", 111), + ("x", "out", "z", 28, "y", "in", "z", 114), + ("x", "out", "z", 29, "y", "in", "z", 121), + ("x", "out", "z", 31, "y", "in", "z", 126), + ("x", "out", "z", 32, "y", "in", "z", 155), + ("x", "out", "z", 33, "y", "in", "z", 123), + ("x", "out", "z", 12, "y", "in", "z", 144), + ] + + mi = MultiIndex.from_tuples(t) + assert not mi.has_duplicates + + +def test_has_duplicates_overflow(): + # handle int64 overflow if possible + def check(nlevels, with_nulls): + codes = np.tile(np.arange(500), 2) + level = np.arange(500) + + if with_nulls: # inject some null values + codes[500] = -1 # common nan value + codes = [codes.copy() for i in range(nlevels)] + for i in range(nlevels): + codes[i][500 + i - nlevels // 2] = -1 + + codes += [np.array([-1, 1]).repeat(500)] + else: + codes = [codes] * nlevels + [np.arange(2).repeat(500)] + + levels = [level] * nlevels + [[0, 1]] + + # no dups + mi = MultiIndex(levels=levels, codes=codes) + assert not mi.has_duplicates + + # with a dup + if with_nulls: + + def f(a): + return np.insert(a, 1000, a[0]) + + codes = list(map(f, codes)) + mi = MultiIndex(levels=levels, codes=codes) + else: + values = mi.values.tolist() + mi = MultiIndex.from_tuples(values + [values[0]]) + + assert mi.has_duplicates + + # no overflow + check(4, False) + check(4, True) + + # overflow possible + check(8, False) + check(8, True) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", np.array([False, False, False, True, True, False])), + ("last", np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])), + ], +) +def test_duplicated(idx_dup, keep, expected): + result = idx_dup.duplicated(keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +def test_duplicated_large(keep): + # GH 9125 + n, k = 200, 5000 + levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + codes = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, codes=codes) + + result = mi.duplicated(keep=keep) + expected = hashtable.duplicated_object(mi.values, keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +def test_duplicated2(): + # TODO: more informative test name + # GH5873 + for a in [101, 102]: + mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) + assert not mi.has_duplicates + + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool")) + + for n in range(1, 6): # 1st level shape + for m in range(1, 5): # 2nd level shape + # all possible unique combinations, including nan + codes = 
product(range(-1, n), range(-1, m)) + mi = MultiIndex( + levels=[list("abcde")[:n], list("WXYZ")[:m]], + codes=np.random.permutation(list(codes)).T, + ) + assert len(mi) == (n + 1) * (m + 1) + assert not mi.has_duplicates + + tm.assert_numpy_array_equal( + mi.duplicated(), np.zeros(len(mi), dtype="bool") + ) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_equivalence.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_equivalence.py new file mode 100644 index 0000000..063ede0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_equivalence.py @@ -0,0 +1,226 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, MultiIndex, Series +import pandas._testing as tm + + +def test_equals(idx): + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + + assert not idx.equals(list(idx)) + assert not idx.equals(np.array(idx)) + + same_values = Index(idx, dtype=object) + assert idx.equals(same_values) + assert same_values.equals(idx) + + if idx.nlevels == 1: + # do not test MultiIndex + assert not idx.equals(pd.Series(idx)) + + +def test_equals_op(idx): + # GH9947, GH10637 + index_a = idx + + n = len(index_a) + index_b = index_a[0:-1] + index_c = index_a[0:-1].append(index_a[-2:-1]) + index_d = index_a[0:1] + with pytest.raises(ValueError, match="Lengths must match"): + index_a == index_b + expected1 = np.array([True] * n) + expected2 = np.array([True] * (n - 1) + [False]) + tm.assert_numpy_array_equal(index_a == index_a, expected1) + tm.assert_numpy_array_equal(index_a == index_c, expected2) + + # test comparisons with numpy arrays + array_a = np.array(index_a) + array_b = np.array(index_a[0:-1]) + array_c = np.array(index_a[0:-1].append(index_a[-2:-1])) + array_d = np.array(index_a[0:1]) + with pytest.raises(ValueError, match="Lengths must match"): + index_a == array_b + tm.assert_numpy_array_equal(index_a == array_a, expected1) + tm.assert_numpy_array_equal(index_a == array_c, expected2) + + # test comparisons with Series + series_a = Series(array_a) + series_b = Series(array_b) + series_c = Series(array_c) + series_d = Series(array_d) + with pytest.raises(ValueError, match="Lengths must match"): + index_a == series_b + + tm.assert_numpy_array_equal(index_a == series_a, expected1) + tm.assert_numpy_array_equal(index_a == series_c, expected2) + + # cases where length is 1 for one of them + with pytest.raises(ValueError, match="Lengths must match"): + index_a == index_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + index_a == array_d + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + series_a == series_d + with pytest.raises(ValueError, match="Lengths must match"): + series_a == array_d + + # comparing with a scalar should broadcast; note that we are excluding + # MultiIndex because in this case each item in the index is a tuple of + # length 2, and therefore is considered an array of length 2 in the + # comparison instead of a scalar + if not isinstance(index_a, MultiIndex): + expected3 = np.array([False] * (len(index_a) - 2) + [True, False]) + # assuming the 2nd to last item is unique in the data + item = index_a[-2] + tm.assert_numpy_array_equal(index_a == item, expected3) + tm.assert_series_equal(series_a == item, Series(expected3)) + + +def test_equals_multi(idx): + assert idx.equals(idx) + assert not 
idx.equals(idx.values) + assert idx.equals(Index(idx.values)) + + assert idx.equal_levels(idx) + assert not idx.equals(idx[:-1]) + assert not idx.equals(idx[-1]) + + # different number of levels + index = MultiIndex( + levels=[Index(list(range(4))), Index(list(range(4))), Index(list(range(4)))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) + + index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1]) + assert not index.equals(index2) + assert not index.equal_levels(index2) + + # levels are different + major_axis = Index(list(range(4))) + minor_axis = Index(list(range(2))) + + major_codes = np.array([0, 0, 1, 2, 2, 3]) + minor_codes = np.array([0, 1, 0, 0, 1, 0]) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + assert not idx.equals(index) + assert not idx.equal_levels(index) + + # some of the labels are different + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 2, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) + assert not idx.equals(index) + + +def test_identical(idx): + mi = idx.copy() + mi2 = idx.copy() + assert mi.identical(mi2) + + mi = mi.set_names(["new1", "new2"]) + assert mi.equals(mi2) + assert not mi.identical(mi2) + + mi2 = mi2.set_names(["new1", "new2"]) + assert mi.identical(mi2) + + mi3 = Index(mi.tolist(), names=mi.names) + msg = r"Unexpected keyword arguments {'names'}" + with pytest.raises(TypeError, match=msg): + Index(mi.tolist(), names=mi.names, tupleize_cols=False) + mi4 = Index(mi.tolist(), tupleize_cols=False) + assert mi.identical(mi3) + assert not mi.identical(mi4) + assert mi.equals(mi4) + + +def test_equals_operator(idx): + # GH9785 + assert (idx == idx).all() + + +def test_equals_missing_values(): + # make sure take is not using -1 + i = pd.MultiIndex.from_tuples([(0, pd.NaT), (0, pd.Timestamp("20130101"))]) + result = i[0:1].equals(i[0]) + assert not result + result = i[1:2].equals(i[1]) + assert not result + + +def test_is_(): + mi = MultiIndex.from_tuples(zip(range(10), range(10))) + assert mi.is_(mi) + assert mi.is_(mi.view()) + assert mi.is_(mi.view().view().view().view()) + mi2 = mi.view() + # names are metadata, they don't change id + mi2.names = ["A", "B"] + assert mi2.is_(mi) + assert mi.is_(mi2) + + assert not mi.is_(mi.set_names(["C", "D"])) + mi2 = mi.view() + mi2.set_names(["E", "F"], inplace=True) + assert mi.is_(mi2) + # levels are inherent properties, they change identity + mi3 = mi2.set_levels([list(range(10)), list(range(10))]) + assert not mi3.is_(mi2) + # shouldn't change + assert mi2.is_(mi) + mi4 = mi3.view() + + # GH 17464 - Remove duplicate MultiIndex levels + mi4.set_levels([list(range(10)), list(range(10))], inplace=True) + assert not mi4.is_(mi3) + mi5 = mi.view() + mi5.set_levels(mi5.levels, inplace=True) + assert not mi5.is_(mi) + + +def test_is_all_dates(idx): + assert not idx.is_all_dates + + +def test_is_numeric(idx): + # MultiIndex is never numeric + assert not idx.is_numeric() + + +def test_multiindex_compare(): + # GH 21149 + # Ensure comparison operations for MultiIndex with nlevels == 1 + # behave consistently with those for MultiIndex with nlevels > 1 + + midx = pd.MultiIndex.from_product([[0, 1]]) + + # Equality self-test: MultiIndex object vs self + expected = pd.Series([True, True]) + result = 
pd.Series(midx == midx) + tm.assert_series_equal(result, expected) + + # Greater than comparison: MultiIndex object vs self + expected = pd.Series([False, False]) + result = pd.Series(midx > midx) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_format.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_format.py new file mode 100644 index 0000000..75f23fb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_format.py @@ -0,0 +1,197 @@ +import warnings + +import pytest + +import pandas as pd +from pandas import MultiIndex +import pandas._testing as tm + + +def test_format(idx): + idx.format() + idx[:0].format() + + +def test_format_integer_names(): + index = MultiIndex( + levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] + ) + index.format(names=True) + + +def test_format_sparse_config(idx): + warn_filters = warnings.filters + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") + # GH1538 + pd.set_option("display.multi_sparse", False) + + result = idx.format() + assert result[1] == "foo two" + + tm.reset_display_options() + + warnings.filters = warn_filters + + +def test_format_sparse_display(): + index = MultiIndex( + levels=[[0, 1], [0, 1], [0, 1], [0]], + codes=[ + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0], + ], + ) + + result = index.format() + assert result[3] == "1 0 0 0" + + +def test_repr_with_unicode_data(): + with pd.option_context("display.encoding", "UTF-8"): + d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = pd.DataFrame(d).set_index(["a", "b"]).index + assert "\\" not in repr(index) # we don't want unicode-escaped + + +def test_repr_roundtrip_raises(): + mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"]) + with pytest.raises(TypeError): + eval(repr(mi)) + + +def test_unicode_string_with_unicode(): + d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + idx = pd.DataFrame(d).set_index(["a", "b"]).index + str(idx) + + +def test_repr_max_seq_item_setting(idx): + # GH10182 + idx = idx.repeat(50) + with pd.option_context("display.max_seq_items", None): + repr(idx) + assert "..." not in str(idx) + + +class TestRepr: + def test_repr(self, idx): + result = idx[:1].__repr__() + expected = """\ +MultiIndex([('foo', 'one')], + names=['first', 'second'])""" + assert result == expected + + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ('bar', 'one'), + ('baz', 'two'), + ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'])""" + assert result == expected + + with pd.option_context("display.max_seq_items", 5): + result = idx.__repr__() + expected = """\ +MultiIndex([('foo', 'one'), + ('foo', 'two'), + ... 
+ ('qux', 'one'), + ('qux', 'two')], + names=['first', 'second'], length=6)""" + assert result == expected + + def test_rjust(self, narrow_multi_index): + mi = narrow_multi_index + result = mi[:1].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi[::500].__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:08:20'), + ('abc', 10, '2000-01-01 00:16:40'), + ('abc', 10, '2000-01-01 00:25:00')], + names=['a', 'b', 'dti'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00'), + ( 'a', 9, '2000-01-01 00:00:01'), + ( 'a', 9, '2000-01-01 00:00:02'), + ( 'a', 9, '2000-01-01 00:00:03'), + ( 'a', 9, '2000-01-01 00:00:04'), + ( 'a', 9, '2000-01-01 00:00:05'), + ( 'a', 9, '2000-01-01 00:00:06'), + ( 'a', 9, '2000-01-01 00:00:07'), + ( 'a', 9, '2000-01-01 00:00:08'), + ( 'a', 9, '2000-01-01 00:00:09'), + ... + ('abc', 10, '2000-01-01 00:33:10'), + ('abc', 10, '2000-01-01 00:33:11'), + ('abc', 10, '2000-01-01 00:33:12'), + ('abc', 10, '2000-01-01 00:33:13'), + ('abc', 10, '2000-01-01 00:33:14'), + ('abc', 10, '2000-01-01 00:33:15'), + ('abc', 10, '2000-01-01 00:33:16'), + ('abc', 10, '2000-01-01 00:33:17'), + ('abc', 10, '2000-01-01 00:33:18'), + ('abc', 10, '2000-01-01 00:33:19')], + names=['a', 'b', 'dti'], length=2000)""" + assert result == expected + + def test_tuple_width(self, wide_multi_index): + mi = wide_multi_index + result = mi[:1].__repr__() + expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" + assert result == expected + + result = mi[:10].__repr__() + expected = """\ +MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" + assert result == expected + + result = mi.__repr__() + expected = """\ +MultiIndex([( 'a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...), + ( 'a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...), + ( 'a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...), + ( 'a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...), + ( 'a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...), + ( 'a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...), + ( 'a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...), + ( 'a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...), + ( 'a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...), + ( 'a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...), + ... 
+ ('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...), + ('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...), + ('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...), + ('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...), + ('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...), + ('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...), + ('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...), + ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...), + ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...), + ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" # noqa + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_get_set.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_get_set.py new file mode 100644 index 0000000..074072a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_get_set.py @@ -0,0 +1,417 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import CategoricalIndex, Index, MultiIndex +import pandas._testing as tm + + +def assert_matching(actual, expected, check_dtype=False): + # avoid specifying internal representation + # as much as possible + assert len(actual) == len(expected) + for act, exp in zip(actual, expected): + act = np.asarray(act) + exp = np.asarray(exp) + tm.assert_numpy_array_equal(act, exp, check_dtype=check_dtype) + + +def test_get_level_number_integer(idx): + idx.names = [1, 0] + assert idx._get_level_number(1) == 0 + assert idx._get_level_number(0) == 1 + msg = "Too many levels: Index has only 2 levels, not 3" + with pytest.raises(IndexError, match=msg): + idx._get_level_number(2) + with pytest.raises(KeyError, match="Level fourth not found"): + idx._get_level_number("fourth") + + +def test_get_level_values(idx): + result = idx.get_level_values(0) + expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") + tm.assert_index_equal(result, expected) + assert result.name == "first" + + result = idx.get_level_values("first") + expected = idx.get_level_values(0) + tm.assert_index_equal(result, expected) + + # GH 10460 + index = MultiIndex( + levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], + codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], + ) + + exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) + tm.assert_index_equal(index.get_level_values(0), exp) + exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) + tm.assert_index_equal(index.get_level_values(1), exp) + + +def test_get_value_duplicates(): + index = MultiIndex( + levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) + + assert index.get_loc("D") == slice(0, 3) + with pytest.raises(KeyError, match=r"^'D'$"): + index._engine.get_value(np.array([]), "D") + + +def test_get_level_values_all_na(): + # GH 17924 when level entirely consists of nan + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = pd.Index(["a", np.nan, 1], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_int_with_na(): + # GH 17924 + arrays = [["a", "b", "b"], 
[1, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([1, np.nan, 2]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([np.nan, np.nan, 2]) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_na(): + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([np.nan, np.nan, np.nan]) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = pd.Index(["a", np.nan, 1]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = pd.DatetimeIndex([0, 1, pd.NaT]) + tm.assert_index_equal(result, expected) + + arrays = [[], []] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_set_name_methods(idx, index_names): + # so long as these are synonyms, we don't need to test set_names + assert idx.rename == idx.set_names + new_names = [name + "SUFFIX" for name in index_names] + ind = idx.set_names(new_names) + assert idx.names == index_names + assert ind.names == new_names + msg = "Length of names must match number of levels in MultiIndex" + with pytest.raises(ValueError, match=msg): + ind.set_names(new_names + new_names) + new_names2 = [name + "SUFFIX2" for name in new_names] + res = ind.set_names(new_names2, inplace=True) + assert res is None + assert ind.names == new_names2 + + # set names for specific level (# GH7792) + ind = idx.set_names(new_names[0], level=0) + assert idx.names == index_names + assert ind.names == [new_names[0], index_names[1]] + + res = ind.set_names(new_names2[0], level=0, inplace=True) + assert res is None + assert ind.names == [new_names2[0], index_names[1]] + + # set names for multiple levels + ind = idx.set_names(new_names, level=[0, 1]) + assert idx.names == index_names + assert ind.names == new_names + + res = ind.set_names(new_names2, level=[0, 1], inplace=True) + assert res is None + assert ind.names == new_names2 + + +def test_set_levels_codes_directly(idx): + # setting levels/codes directly raises AttributeError + + levels = idx.levels + new_levels = [[lev + "a" for lev in level] for level in levels] + + codes = idx.codes + major_codes, minor_codes = codes + major_codes = [(x + 1) % 3 for x in major_codes] + minor_codes = [(x + 1) % 1 for x in minor_codes] + new_codes = [major_codes, minor_codes] + + msg = "can't set attribute" + with pytest.raises(AttributeError, match=msg): + idx.levels = new_levels + with pytest.raises(AttributeError, match=msg): + idx.codes = new_codes + + +def test_set_levels(idx): + # side note - you probably wouldn't want to use levels and codes + # directly like this - but it is possible. 
+ levels = idx.levels + new_levels = [[lev + "a" for lev in level] for level in levels] + + # level changing [w/o mutation] + ind2 = idx.set_levels(new_levels) + assert_matching(ind2.levels, new_levels) + assert_matching(idx.levels, levels) + + # level changing [w/ mutation] + ind2 = idx.copy() + inplace_return = ind2.set_levels(new_levels, inplace=True) + assert inplace_return is None + assert_matching(ind2.levels, new_levels) + + # level changing specific level [w/o mutation] + ind2 = idx.set_levels(new_levels[0], level=0) + assert_matching(ind2.levels, [new_levels[0], levels[1]]) + assert_matching(idx.levels, levels) + + ind2 = idx.set_levels(new_levels[1], level=1) + assert_matching(ind2.levels, [levels[0], new_levels[1]]) + assert_matching(idx.levels, levels) + + # level changing multiple levels [w/o mutation] + ind2 = idx.set_levels(new_levels, level=[0, 1]) + assert_matching(ind2.levels, new_levels) + assert_matching(idx.levels, levels) + + # level changing specific level [w/ mutation] + ind2 = idx.copy() + inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) + assert inplace_return is None + assert_matching(ind2.levels, [new_levels[0], levels[1]]) + assert_matching(idx.levels, levels) + + ind2 = idx.copy() + inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) + assert inplace_return is None + assert_matching(ind2.levels, [levels[0], new_levels[1]]) + assert_matching(idx.levels, levels) + + # level changing multiple levels [w/ mutation] + ind2 = idx.copy() + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) + assert inplace_return is None + assert_matching(ind2.levels, new_levels) + assert_matching(idx.levels, levels) + + # illegal level changing should not change levels + # GH 13754 + original_index = idx.copy() + for inplace in [True, False]: + with pytest.raises(ValueError, match="^On"): + idx.set_levels(["c"], level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) + + with pytest.raises(ValueError, match="^On"): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + assert_matching(idx.codes, original_index.codes, check_dtype=True) + + with pytest.raises(TypeError, match="^Levels"): + idx.set_levels("c", level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) + + with pytest.raises(TypeError, match="^Codes"): + idx.set_codes(1, level=0, inplace=inplace) + assert_matching(idx.codes, original_index.codes, check_dtype=True) + + +def test_set_codes(idx): + # side note - you probably wouldn't want to use levels and codes + # directly like this - but it is possible. 
pytest.raises(TypeError): + idx.difference(other, sort=True) + + +@pytest.mark.parametrize("sort", [None, False]) +def test_union(idx, sort): + piece1 = idx[:5][::-1] + piece2 = idx[3:] + + the_union = piece1.union(piece2, sort=sort) + + if sort is None: + tm.assert_index_equal(the_union, idx.sort_values()) + + assert tm.equalContents(the_union, idx) + + # corner case, pass self or empty thing: + the_union = idx.union(idx, sort=sort) + assert the_union is idx + + the_union = idx.union(idx[:0], sort=sort) + assert the_union is idx + + # won't work in python 3 + # tuples = _index.values + # result = _index[:4] | tuples[4:] + # assert result.equals(tuples) + + # not valid for python 3 + # def test_union_with_regular_index(self): + # other = Index(['A', 'B', 'C']) + + # result = other.union(idx) + # assert ('foo', 'one') in result + # assert 'B' in result + + # result2 = _index.union(other) + # assert result.equals(result2) + + +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection(idx, sort): + piece1 = idx[:5][::-1] + piece2 = idx[3:] + + the_int = piece1.intersection(piece2, sort=sort) + + if sort is None: + tm.assert_index_equal(the_int, idx[3:5]) + assert tm.equalContents(the_int, idx[3:5]) + + # corner case, pass self + the_int = idx.intersection(idx, sort=sort) + assert the_int is idx + + # empty intersection: disjoint + empty = idx[:2].intersection(idx[2:], sort=sort) + expected = idx[:0] + assert empty.equals(expected) + + # can't do in python 3 + # tuples = _index.values + # result = _index & tuples + # assert result.equals(tuples) + + +def test_intersect_equal_sort(): + # GH-24959 + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=None), idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_intersect_equal_sort_true(): + # TODO decide on True behaviour + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + +@pytest.mark.parametrize("slice_", [slice(None), slice(0)]) +def test_union_sort_other_empty(slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + # MultiIndex does not special case empty.union(idx) + # tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_union_sort_other_empty_sort(slice_): + # TODO decide on True behaviour + # # sort=True + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + other = idx[:0] + result = idx.union(other, sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) + tm.assert_index_equal(result, expected) + + +def test_union_sort_other_incomparable(): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) + + # default, sort=None + result = idx.union(idx[:1]) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_union_sort_other_incomparable_sort(): + # TODO decide on True behaviour + # # sort=True + idx = pd.MultiIndex.from_product([[1, 
pd.Timestamp("2000")], ["a", "b"]]) + with pytest.raises(TypeError, match="Cannot compare"): + idx.union(idx[:1], sort=True) + + +@pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] +) +def test_setops_disallow_true(method): + idx1 = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + idx2 = pd.MultiIndex.from_product([["b", "c"], [1, 2]]) + + with pytest.raises(ValueError, match="The 'sort' keyword only takes"): + getattr(idx1, method)(idx2, sort=True) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/multi/test_sorting.py b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_sorting.py new file mode 100644 index 0000000..277bd79 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/multi/test_sorting.py @@ -0,0 +1,276 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning, UnsortedIndexError + +import pandas as pd +from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex +import pandas._testing as tm + + +def test_sortlevel(idx): + import random + + tuples = list(idx) + random.shuffle(tuples) + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + +def test_sortlevel_not_sort_remaining(): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + sorted_idx, _ = mi.sortlevel("A", sort_remaining=False) + assert sorted_idx.equals(mi) + + +def test_sortlevel_deterministic(): + tuples = [ + ("bar", "one"), + ("foo", "two"), + ("qux", "two"), + ("foo", "one"), + ("baz", "two"), + ("qux", "one"), + ] + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + assert sorted_idx.equals(expected) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + assert sorted_idx.equals(expected[::-1]) + + +def test_sort(indices): + with pytest.raises(TypeError): + indices.sort() + + +def test_numpy_argsort(idx): + result = np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + # these are the only two types that perform + # pandas compatibility input validation - the + # rest already perform separate (or no) such + # validation via their 'values' attribute as + # defined in pandas.core.indexes/base.py - they + # cannot be changed at the moment due to + # backwards compatibility concerns + if isinstance(type(idx), (CategoricalIndex, RangeIndex)): + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, axis=1) + + msg = "the 'kind' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, kind="mergesort") + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(idx, order=("a", 
"b")) + + +def test_unsortedindex(): + # GH 11897 + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) + + # GH 16734: not sorted, but no real slicing + result = df.loc(axis=0)["z", "a"] + expected = df.iloc[0] + tm.assert_series_equal(result, expected) + + with pytest.raises(UnsortedIndexError): + df.loc(axis=0)["z", slice("a")] + df.sort_index(inplace=True) + assert len(df.loc(axis=0)["z", :]) == 2 + + with pytest.raises(KeyError, match="'q'"): + df.loc(axis=0)["q", :] + + +def test_unsortedindex_doc_examples(): + # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + dfm = DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) + + dfm = dfm.set_index(["jim", "joe"]) + with tm.assert_produces_warning(PerformanceWarning): + dfm.loc[(1, "z")] + + with pytest.raises(UnsortedIndexError): + dfm.loc[(0, "y"):(1, "z")] + + assert not dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 1 + + # sort it + dfm = dfm.sort_index() + dfm.loc[(1, "z")] + dfm.loc[(0, "y"):(1, "z")] + + assert dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 2 + + +def test_reconstruct_sort(): + + # starts off lexsorted & monotonic + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) + assert mi.is_lexsorted() + assert mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert recons.is_lexsorted() + assert recons.is_monotonic + assert mi is recons + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + +def test_reconstruct_remove_unused(): + # xref to GH 2770 + df = DataFrame( + [["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]], + columns=["first", "second", "third"], + ) + df2 = df.set_index(["first", "second"], drop=False) + df2 = df2[df2["first"] != "deleteMe"] + + # removed levels are there + expected = MultiIndex( + levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]], + codes=[[1, 2], [1, 2]], + names=["first", "second"], + ) + result = df2.index + tm.assert_index_equal(result, expected) + + expected = MultiIndex( + levels=[["keepMe", "keepMeToo"], [2, 3]], + codes=[[0, 1], [0, 1]], + names=["first", "second"], + ) + result = df2.index.remove_unused_levels() + tm.assert_index_equal(result, expected) + + # idempotent + result2 = result.remove_unused_levels() + tm.assert_index_equal(result2, expected) + assert result2.is_(result) + + +@pytest.mark.parametrize( + "first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")] +) +def 
test_remove_unused_levels_large(first_type, second_type): + # GH16556 + + # because tests should be deterministic (and this test in particular + # checks that levels are removed, which is not the case for every + # random input): + rng = np.random.RandomState(4) # seed is arbitrary value that works + + size = 1 << 16 + df = DataFrame( + dict( + first=rng.randint(0, 1 << 13, size).astype(first_type), + second=rng.randint(0, 1 << 10, size).astype(second_type), + third=rng.rand(size), + ) + ) + df = df.groupby(["first", "second"]).sum() + df = df[df.third < 0.1] + + result = df.index.remove_unused_levels() + assert len(result.levels[0]) < len(df.index.levels[0]) + assert len(result.levels[1]) < len(df.index.levels[1]) + assert result.equals(df.index) + + expected = df.reset_index().set_index(["first", "second"]).index + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]]) +@pytest.mark.parametrize( + "level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]] +) +def test_remove_unused_nan(level0, level1): + # GH 18417 + mi = pd.MultiIndex( + levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]] + ) + + result = mi.remove_unused_levels() + tm.assert_index_equal(result, mi) + for level in 0, 1: + assert "unused" not in result.levels[level] + + +def test_argsort(idx): + result = idx.argsort() + expected = idx.values.argsort() + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/period/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_asfreq.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_asfreq.py new file mode 100644 index 0000000..88e800d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_asfreq.py @@ -0,0 +1,149 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, PeriodIndex, Series, period_range +import pandas._testing as tm + + +class TestPeriodIndex: + def test_asfreq(self): + pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") + pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") + pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") + pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") + pi7 = period_range(freq="S", start="1/1/2001", end="1/1/2001 00:00:00") + + assert pi1.asfreq("Q", "S") == pi2 + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("M", "start") == pi3 + assert pi1.asfreq("D", "StarT") == pi4 + assert pi1.asfreq("H", "beGIN") == pi5 + assert pi1.asfreq("Min", "S") == pi6 + assert pi1.asfreq("S", "S") == pi7 + + assert pi2.asfreq("A", "S") == pi1 + assert pi2.asfreq("M", "S") == pi3 + assert pi2.asfreq("D", "S") == pi4 + assert pi2.asfreq("H", "S") == pi5 + assert pi2.asfreq("Min", "S") == pi6 + assert pi2.asfreq("S", "S") == pi7 + + assert pi3.asfreq("A", "S") == pi1 + assert pi3.asfreq("Q", "S") == pi2 + assert pi3.asfreq("D", "S") == pi4 + assert pi3.asfreq("H", "S") == pi5 + assert pi3.asfreq("Min", "S") == pi6 + assert pi3.asfreq("S", "S") == pi7 + + assert pi4.asfreq("A", "S") == pi1 + assert pi4.asfreq("Q", "S") == pi2 + assert pi4.asfreq("M", "S") == pi3 + assert pi4.asfreq("H", "S") == pi5 + assert pi4.asfreq("Min", 
"S") == pi6 + assert pi4.asfreq("S", "S") == pi7 + + assert pi5.asfreq("A", "S") == pi1 + assert pi5.asfreq("Q", "S") == pi2 + assert pi5.asfreq("M", "S") == pi3 + assert pi5.asfreq("D", "S") == pi4 + assert pi5.asfreq("Min", "S") == pi6 + assert pi5.asfreq("S", "S") == pi7 + + assert pi6.asfreq("A", "S") == pi1 + assert pi6.asfreq("Q", "S") == pi2 + assert pi6.asfreq("M", "S") == pi3 + assert pi6.asfreq("D", "S") == pi4 + assert pi6.asfreq("H", "S") == pi5 + assert pi6.asfreq("S", "S") == pi7 + + assert pi7.asfreq("A", "S") == pi1 + assert pi7.asfreq("Q", "S") == pi2 + assert pi7.asfreq("M", "S") == pi3 + assert pi7.asfreq("D", "S") == pi4 + assert pi7.asfreq("H", "S") == pi5 + assert pi7.asfreq("Min", "S") == pi6 + + msg = "How must be one of S or E" + with pytest.raises(ValueError, match=msg): + pi7.asfreq("T", "foo") + result1 = pi1.asfreq("3M") + result2 = pi1.asfreq("M") + expected = period_range(freq="M", start="2001-12", end="2001-12") + tm.assert_numpy_array_equal(result1.asi8, expected.asi8) + assert result1.freqstr == "3M" + tm.assert_numpy_array_equal(result2.asi8, expected.asi8) + assert result2.freqstr == "M" + + def test_asfreq_nat(self): + idx = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-04"], freq="M") + result = idx.asfreq(freq="Q") + expected = PeriodIndex(["2011Q1", "2011Q1", "NaT", "2011Q2"], freq="Q") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq", ["D", "3D"]) + def test_asfreq_mult_pi(self, freq): + pi = PeriodIndex(["2001-01", "2001-02", "NaT", "2001-03"], freq="2M") + + result = pi.asfreq(freq) + exp = PeriodIndex(["2001-02-28", "2001-03-31", "NaT", "2001-04-30"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + result = pi.asfreq(freq, how="S") + exp = PeriodIndex(["2001-01-01", "2001-02-01", "NaT", "2001-03-01"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_asfreq_combined_pi(self): + pi = pd.PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") + for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): + result = pi.asfreq(freq, how=how) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + for freq in ["1D1H", "1H1D"]: + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_asfreq_ts(self): + index = period_range(freq="A", start="1/1/2001", end="12/31/2010") + ts = Series(np.random.randn(len(index)), index=index) + df = DataFrame(np.random.randn(len(index), 3), index=index) + + result = ts.asfreq("D", how="end") + df_result = df.asfreq("D", how="end") + exp_index = index.asfreq("D", how="end") + assert len(result) == len(ts) + tm.assert_index_equal(result.index, exp_index) + tm.assert_index_equal(df_result.index, exp_index) + + result = ts.asfreq("D", how="start") + assert len(result) == len(ts) + tm.assert_index_equal(result.index, index.asfreq("D", how="start")) + + def test_astype_asfreq(self): + pi1 = PeriodIndex(["2011-01-01", "2011-02-01", 
"2011-03-01"], freq="D") + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + tm.assert_index_equal(pi1.asfreq("M"), exp) + tm.assert_index_equal(pi1.astype("period[M]"), exp) + + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="3M") + tm.assert_index_equal(pi1.asfreq("3M"), exp) + tm.assert_index_equal(pi1.astype("period[3M]"), exp) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_astype.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_astype.py new file mode 100644 index 0000000..ec386dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_astype.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Int64Index, NaT, Period, PeriodIndex, period_range +import pandas._testing as tm + + +class TestPeriodIndexAsType: + @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) + def test_astype_raises(self, dtype): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + msg = "Cannot cast PeriodArray to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_conversion(self): + # GH#13149, GH#13209 + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + + result = idx.astype(object) + expected = Index( + [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, + dtype="object", + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(np.int64) + expected = Int64Index([16937] + [-9223372036854775808] * 3, dtype=np.int64) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + expected = Index(str(x) for x in idx) + tm.assert_index_equal(result, expected) + + idx = period_range("1990", "2009", freq="A") + result = idx.astype("i8") + tm.assert_index_equal(result, Index(idx.asi8)) + tm.assert_numpy_array_equal(result.values, idx.asi8) + + def test_astype_uint(self): + arr = period_range("2000", periods=2) + expected = pd.UInt64Index(np.array([10957, 10958], dtype="uint64")) + tm.assert_index_equal(arr.astype("uint64"), expected) + tm.assert_index_equal(arr.astype("uint32"), expected) + + def test_astype_object(self): + idx = pd.PeriodIndex([], freq="M") + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") + + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + # TODO: de-duplicate this version (from test_ops) with the one above + # (from test_period) + def test_astype_object2(self): + idx = pd.period_range(start="2013-01-01", periods=4, freq="M", name="idx") + expected_list = [ + pd.Period("2013-01-31", freq="M"), + pd.Period("2013-02-28", freq="M"), + pd.Period("2013-03-31", freq="M"), + pd.Period("2013-04-30", freq="M"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert idx.tolist() == expected_list + + 
idx = PeriodIndex( + ["2013-01-01", "2013-01-02", "NaT", "2013-01-04"], freq="D", name="idx" + ) + expected_list = [ + pd.Period("2013-01-01", freq="D"), + pd.Period("2013-01-02", freq="D"), + pd.Period("NaT", freq="D"), + pd.Period("2013-01-04", freq="D"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + for i in [0, 1, 3]: + assert result[i] == expected[i] + assert result[2] is pd.NaT + assert result.name == expected.name + + result_list = idx.tolist() + for i in [0, 1, 3]: + assert result_list[i] == expected_list[i] + assert result_list[2] is pd.NaT + + def test_astype_category(self): + obj = pd.period_range("2000", periods=2) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Period("2000-01-01", freq="D"), pd.Period("2000-01-02", freq="D")] + ) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = pd.period_range("2000", periods=2) + result = obj.astype(bool) + expected = pd.Index(np.array([True, True])) + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_constructors.py new file mode 100644 index 0000000..27ee915 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_constructors.py @@ -0,0 +1,520 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas.core.dtypes.dtypes import PeriodDtype + +import pandas as pd +from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range +import pandas._testing as tm +from pandas.core.arrays import PeriodArray + + +class TestPeriodIndex: + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period("2011-03", freq="M")] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="D")] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal( + pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object) + ) + + def test_base_constructor_with_period_dtype(self): + dtype = PeriodDtype("D") + values = ["2011-01-01", "2012-03-04", "2014-05-01"] + result = pd.Index(values, dtype=dtype) + + expected = pd.PeriodIndex(values, dtype=dtype) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "values_constructor", [list, np.array, PeriodIndex, PeriodArray._from_sequence] + ) + def test_index_object_dtype(self, values_constructor): + # Index(periods, dtype=object) is an Index (not an PeriodIndex) + periods = [ + pd.Period("2011-01", freq="M"), + pd.NaT, + pd.Period("2011-03", freq="M"), + ] + values = values_constructor(periods) + result = Index(values, dtype=object) + + assert type(result) is Index + 
tm.assert_numpy_array_equal(result.values, np.array(values)) + + def test_constructor_use_start_freq(self): + # GH #1118 + p = Period("4/2/2012", freq="B") + expected = period_range(start="4/2/2012", periods=10, freq="B") + + index = period_range(start=p, periods=10) + tm.assert_index_equal(index, expected) + + def test_constructor_field_arrays(self): + # GH #1264 + + years = np.arange(1990, 2010).repeat(4)[2:-2] + quarters = np.tile(np.arange(1, 5), 20)[2:-2] + + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") + tm.assert_index_equal(index, expected) + + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") + tm.assert_numpy_array_equal(index.asi8, index2.asi8) + + index = PeriodIndex(year=years, quarter=quarters) + tm.assert_index_equal(index, expected) + + years = [2007, 2007, 2007] + months = [1, 2] + + msg = "Mismatched Period array lengths" + with pytest.raises(ValueError, match=msg): + PeriodIndex(year=years, month=months, freq="M") + with pytest.raises(ValueError, match=msg): + PeriodIndex(year=years, month=months, freq="2M") + + years = [2007, 2007, 2007] + months = [1, 2, 3] + idx = PeriodIndex(year=years, month=months, freq="M") + exp = period_range("2007-01", periods=3, freq="M") + tm.assert_index_equal(idx, exp) + + def test_constructor_U(self): + # U was used as undefined period + with pytest.raises(ValueError, match="Invalid frequency: X"): + period_range("2007-1-1", periods=500, freq="X") + + def test_constructor_nano(self): + idx = period_range( + start=Period(ordinal=1, freq="N"), end=Period(ordinal=4, freq="N"), freq="N" + ) + exp = PeriodIndex( + [ + Period(ordinal=1, freq="N"), + Period(ordinal=2, freq="N"), + Period(ordinal=3, freq="N"), + Period(ordinal=4, freq="N"), + ], + freq="N", + ) + tm.assert_index_equal(idx, exp) + + def test_constructor_arrays_negative_year(self): + years = np.arange(1960, 2000, dtype=np.int64).repeat(4) + quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + tm.assert_index_equal(pindex.year, pd.Index(years)) + tm.assert_index_equal(pindex.quarter, pd.Index(quarters)) + + def test_constructor_invalid_quarters(self): + msg = "Quarter must be 1 <= q <= 4" + with pytest.raises(ValueError, match=msg): + PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC") + + def test_constructor_corner(self): + result = period_range("2007-01", periods=10.5, freq="M") + exp = period_range("2007-01", periods=10, freq="M") + tm.assert_index_equal(result, exp) + + def test_constructor_fromarraylike(self): + idx = period_range("2007-01", periods=20, freq="M") + + # values is an array of Period, thus can retrieve freq + tm.assert_index_equal(PeriodIndex(idx.values), idx) + tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) + + msg = "freq not specified and cannot be inferred" + with pytest.raises(ValueError, match=msg): + PeriodIndex(idx._ndarray_values) + with pytest.raises(ValueError, match=msg): + PeriodIndex(list(idx._ndarray_values)) + + msg = "'Period' object is not iterable" + with pytest.raises(TypeError, match=msg): + PeriodIndex(data=Period("2007", freq="A")) + + result = PeriodIndex(iter(idx)) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx) + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq="M") + tm.assert_index_equal(result, idx) + + result = PeriodIndex(idx, freq=offsets.MonthEnd()) + tm.assert_index_equal(result, idx) + assert 
result.freq == "M" + + result = PeriodIndex(idx, freq="2M") + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" + + result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" + + result = PeriodIndex(idx, freq="D") + exp = idx.asfreq("D", "e") + tm.assert_index_equal(result, exp) + + def test_constructor_datetime64arr(self): + vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) + vals = vals.view(np.dtype("M8[us]")) + + msg = r"Wrong dtype: datetime64\[us\]" + with pytest.raises(ValueError, match=msg): + PeriodIndex(vals, freq="D") + + @pytest.mark.parametrize("box", [None, "series", "index"]) + def test_constructor_datetime64arr_ok(self, box): + # https://github.com/pandas-dev/pandas/issues/23438 + data = pd.date_range("2017", periods=4, freq="M") + if box is None: + data = data._values + elif box == "series": + data = pd.Series(data) + + result = PeriodIndex(data, freq="D") + expected = PeriodIndex( + ["2017-01-31", "2017-02-28", "2017-03-31", "2017-04-30"], freq="D" + ) + tm.assert_index_equal(result, expected) + + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = PeriodIndex(["2013-01", "2013-03"], dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-03"], freq="M") + tm.assert_index_equal(idx, exp) + assert idx.dtype == "period[M]" + + idx = PeriodIndex(["2013-01-05", "2013-03-05"], dtype="period[3D]") + exp = PeriodIndex(["2013-01-05", "2013-03-05"], freq="3D") + tm.assert_index_equal(idx, exp) + assert idx.dtype == "period[3D]" + + # if we already have a freq and its not the same, then asfreq + # (not changed) + idx = PeriodIndex(["2013-01-01", "2013-01-02"], freq="D") + + res = PeriodIndex(idx, dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-01"], freq="M") + tm.assert_index_equal(res, exp) + assert res.dtype == "period[M]" + + res = PeriodIndex(idx, freq="M") + tm.assert_index_equal(res, exp) + assert res.dtype == "period[M]" + + msg = "specified freq and dtype are different" + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex(["2011-01"], freq="M", dtype="period[D]") + + def test_constructor_empty(self): + idx = pd.PeriodIndex([], freq="M") + assert isinstance(idx, PeriodIndex) + assert len(idx) == 0 + assert idx.freq == "M" + + with pytest.raises(ValueError, match="freq not specified"): + pd.PeriodIndex([]) + + def test_constructor_pi_nat(self): + idx = PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + np.array([Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")]) + ) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + [pd.NaT, pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["NaT", "NaT", "2011-01", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex( + np.array( + [ + pd.NaT, + pd.NaT, + Period("2011-01", freq="M"), + Period("2011-01", freq="M"), + ] + ) + ) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, "2011-01", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex([pd.NaT, pd.NaT]) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array([pd.NaT, pd.NaT])) + + with pytest.raises(ValueError, match="freq not specified"): + 
PeriodIndex(["NaT", "NaT"]) + + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array(["NaT", "NaT"])) + + def test_constructor_incompat_freq(self): + msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + np.array( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) + ) + + # first element is pd.NaT + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) + + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex( + np.array( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) + ) + + def test_constructor_mixed(self): + idx = PeriodIndex(["2011-01", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(["NaT", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["NaT", "NaT", "2011-01"], freq="M") + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([Period("2011-01-01", freq="D"), pd.NaT, "2012-01-01"]) + exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") + tm.assert_index_equal(idx, exp) + + def test_constructor_simple_new(self): + idx = period_range("2007-01", name="p", periods=2, freq="M") + result = idx._simple_new(idx, name="p", freq=idx.freq) + tm.assert_index_equal(result, idx) + + result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) + tm.assert_index_equal(result, idx) + + def test_constructor_simple_new_empty(self): + # GH13079 + idx = PeriodIndex([], freq="M", name="p") + result = idx._simple_new(idx, name="p", freq="M") + tm.assert_index_equal(result, idx) + + @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) + def test_constructor_floats(self, floats): + msg = r"PeriodIndex\._simple_new does not accept floats" + with pytest.raises(TypeError, match=msg): + pd.PeriodIndex._simple_new(floats, freq="M") + + msg = "PeriodIndex does not allow floating point in construction" + with pytest.raises(TypeError, match=msg): + pd.PeriodIndex(floats, freq="M") + + def test_constructor_nat(self): + msg = "start and end must not be NaT" + with pytest.raises(ValueError, match=msg): + period_range(start="NaT", end="2011-01-01", freq="M") + with pytest.raises(ValueError, match=msg): + period_range(start="2011-01-01", end="NaT", freq="M") + + def test_constructor_year_and_quarter(self): + year = pd.Series([2001, 2002, 2003]) + quarter = year - 2000 + idx = PeriodIndex(year=year, quarter=quarter) + strs = ["{t[0]:d}Q{t[1]:d}".format(t=t) for t in zip(quarter, year)] + lops = list(map(Period, strs)) + p = PeriodIndex(lops) + tm.assert_index_equal(p, idx) + + def test_constructor_freq_mult(self): + # GH #7811 + pidx = period_range(start="2014-01", freq="2M", periods=4) + expected = PeriodIndex(["2014-01", "2014-03", "2014-05", "2014-07"], freq="2M") + tm.assert_index_equal(pidx, expected) + + pidx = period_range(start="2014-01-02", end="2014-01-15", freq="3D") + expected = PeriodIndex( + ["2014-01-02", "2014-01-05", "2014-01-08", "2014-01-11", "2014-01-14"], + freq="3D", + ) + tm.assert_index_equal(pidx, expected) + + pidx = period_range(end="2014-01-01 17:00", freq="4H", periods=3) + expected = PeriodIndex( + ["2014-01-01 09:00", "2014-01-01 
13:00", "2014-01-01 17:00"], freq="4H" + ) + tm.assert_index_equal(pidx, expected) + + msg = "Frequency must be positive, because it represents span: -1M" + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2011-01"], freq="-1M") + + msg = "Frequency must be positive, because it represents span: 0M" + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2011-01"], freq="0M") + + msg = "Frequency must be positive, because it represents span: 0M" + with pytest.raises(ValueError, match=msg): + period_range("2011-01", periods=3, freq="0M") + + @pytest.mark.parametrize("freq", ["A", "M", "D", "T", "S"]) + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat(self, mult, freq): + freqstr = str(mult) + freq + pidx = period_range(start="2014-04-01", freq=freqstr, periods=10) + expected = date_range(start="2014-04-01", freq=freqstr, periods=10).to_period( + freqstr + ) + tm.assert_index_equal(pidx, expected) + + def test_constructor_freq_combined(self): + for freq in ["1D1H", "1H1D"]: + pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25H") + for freq in ["1D1H", "1H1D"]: + pidx = period_range(start="2016-01-01", periods=2, freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") + tm.assert_index_equal(pidx, expected) + + def test_constructor(self): + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + assert len(pi) == 9 + + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") + assert len(pi) == 4 * 9 + + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") + assert len(pi) == 12 * 9 + + pi = period_range(freq="D", start="1/1/2001", end="12/31/2009") + assert len(pi) == 365 * 9 + 2 + + pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") + assert len(pi) == 261 * 9 + + pi = period_range(freq="H", start="1/1/2001", end="12/31/2001 23:00") + assert len(pi) == 365 * 24 + + pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") + assert len(pi) == 24 * 60 + + pi = period_range(freq="S", start="1/1/2001", end="1/1/2001 23:59:59") + assert len(pi) == 24 * 60 * 60 + + start = Period("02-Apr-2005", "B") + i1 = period_range(start=start, periods=20) + assert len(i1) == 20 + assert i1.freq == start.freq + assert i1[0] == start + + end_intv = Period("2006-12-31", "W") + i1 = period_range(end=end_intv, periods=10) + assert len(i1) == 10 + assert i1.freq == end_intv.freq + assert i1[-1] == end_intv + + end_intv = Period("2006-12-31", "1w") + i2 = period_range(end=end_intv, periods=10) + assert len(i1) == len(i2) + assert (i1 == i2).all() + assert i1.freq == i2.freq + + end_intv = Period("2006-12-31", ("w", 1)) + i2 = period_range(end=end_intv, periods=10) + assert len(i1) == len(i2) + assert (i1 == i2).all() + assert i1.freq == i2.freq + + end_intv = Period("2005-05-01", "B") + i1 = period_range(start=start, end=end_intv) + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) + assert len(i2) == 2 + assert i2[0] == end_intv + + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) + assert len(i2) == 2 + assert i2[0] == end_intv + + # Mixed freq should fail + vals = [end_intv, Period("2006-12-31", "w")] + msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex(vals) + vals = np.array(vals) + with pytest.raises(IncompatibleFrequency, match=msg): + 
PeriodIndex(vals) + + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] + ) + def test_recreate_from_data(self, freq): + org = period_range(start="2001/04/01", freq=freq, periods=1) + idx = PeriodIndex(org.values, freq=freq) + tm.assert_index_equal(idx, org) + + def test_map_with_string_constructor(self): + raw = [2005, 2007, 2009] + index = PeriodIndex(raw, freq="A") + + expected = Index([str(num) for num in raw]) + res = index.map(str) + + # should return an Index + assert isinstance(res, Index) + + # preserve element types + assert all(isinstance(resi, str) for resi in res) + + # lastly, values should compare equal + tm.assert_index_equal(res, expected) + + +class TestSeriesPeriod: + def setup_method(self, method): + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) + + def test_constructor_cant_cast_period(self): + msg = "Cannot cast PeriodArray to dtype float64" + with pytest.raises(TypeError, match=msg): + Series(period_range("2000-01-01", periods=10, freq="D"), dtype=float) + + def test_constructor_cast_object(self): + s = Series(period_range("1/1/2000", periods=10), dtype=PeriodDtype("D")) + exp = Series(period_range("1/1/2000", periods=10)) + tm.assert_series_equal(s, exp) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_formats.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_formats.py new file mode 100644 index 0000000..5db373a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_formats.py @@ -0,0 +1,211 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import PeriodIndex +import pandas._testing as tm + + +def test_to_native_types(): + index = PeriodIndex(["2017-01-01", "2017-01-02", "2017-01-03"], freq="D") + + # First, with no arguments. 
+ expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype="=U10") + + result = index.to_native_types() + tm.assert_numpy_array_equal(result, expected) + + # No NaN values, so na_rep has no effect + result = index.to_native_types(na_rep="pandas") + tm.assert_numpy_array_equal(result, expected) + + # Make sure slicing works + expected = np.array(["2017-01-01", "2017-01-03"], dtype="=U10") + + result = index.to_native_types([0, 2]) + tm.assert_numpy_array_equal(result, expected) + + # Make sure date formatting works + expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype="=U10") + + result = index.to_native_types(date_format="%m-%Y-%d") + tm.assert_numpy_array_equal(result, expected) + + # NULL object handling should work + index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") + expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) + + result = index.to_native_types() + tm.assert_numpy_array_equal(result, expected) + + expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) + + result = index.to_native_types(na_rep="pandas") + tm.assert_numpy_array_equal(result, expected) + + +class TestPeriodIndexRendering: + def test_frame_repr(self): + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.date_range("2000", periods=3)) + result = repr(df) + expected = " A\n2000-01-01 1\n2000-01-02 2\n2000-01-03 3" + assert result == expected + + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) + def test_representation(self, method): + # GH#7601 + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") + idx10 = PeriodIndex(["2011-01-01", "2011-02-01"], freq="3D") + + exp1 = "PeriodIndex([], dtype='period[D]', freq='D')" + + exp2 = "PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')" + + exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', freq='D')" + + exp4 = ( + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='period[D]', freq='D')" + ) + + exp5 = ( + "PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " + "freq='A-DEC')" + ) + + exp6 = ( + "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " + "dtype='period[H]', freq='H')" + ) + + exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', freq='Q-DEC')" + + exp8 = "PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', freq='Q-DEC')" + + exp9 = ( + "PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')" + ) + + exp10 = ( + "PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')" + ) + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9, idx10], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9, exp10], + ): + result = getattr(idx, method)() + assert result == expected + + def test_representation_to_series(self): + # GH#10971 + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = 
PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") + + exp1 = """Series([], dtype: period[D])""" + + exp2 = """0 2011-01-01 +dtype: period[D]""" + + exp3 = """0 2011-01-01 +1 2011-01-02 +dtype: period[D]""" + + exp4 = """0 2011-01-01 +1 2011-01-02 +2 2011-01-03 +dtype: period[D]""" + + exp5 = """0 2011 +1 2012 +2 2013 +dtype: period[A-DEC]""" + + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT +dtype: period[H]""" + + exp7 = """0 2013Q1 +dtype: period[Q-DEC]""" + + exp8 = """0 2013Q1 +1 2013Q2 +dtype: period[Q-DEC]""" + + exp9 = """0 2013Q1 +1 2013Q2 +2 2013Q3 +dtype: period[Q-DEC]""" + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): + result = repr(pd.Series(idx)) + assert result == expected + + def test_summary(self): + # GH#9116 + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") + + exp1 = """PeriodIndex: 0 entries +Freq: D""" + + exp2 = """PeriodIndex: 1 entries, 2011-01-01 to 2011-01-01 +Freq: D""" + + exp3 = """PeriodIndex: 2 entries, 2011-01-01 to 2011-01-02 +Freq: D""" + + exp4 = """PeriodIndex: 3 entries, 2011-01-01 to 2011-01-03 +Freq: D""" + + exp5 = """PeriodIndex: 3 entries, 2011 to 2013 +Freq: A-DEC""" + + exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT +Freq: H""" + + exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1 +Freq: Q-DEC""" + + exp8 = """PeriodIndex: 2 entries, 2013Q1 to 2013Q2 +Freq: Q-DEC""" + + exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3 +Freq: Q-DEC""" + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): + result = idx._summary() + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_indexing.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_indexing.py new file mode 100644 index 0000000..8a5bb2b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_indexing.py @@ -0,0 +1,731 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas._libs.tslibs import period as libperiod + +import pandas as pd +from pandas import DatetimeIndex, Period, PeriodIndex, Series, notna, period_range +import pandas._testing as tm + + +class TestGetItem: + def test_ellipsis(self): + # GH#21282 + idx = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + + result = idx[...] 
+ assert result.equals(idx) + assert result is not idx + + def test_getitem(self): + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + + for idx in [idx1]: + result = idx[0] + assert result == pd.Period("2011-01-01", freq="D") + + result = idx[-1] + assert result == pd.Period("2011-01-31", freq="D") + + result = idx[0:5] + expected = pd.period_range("2011-01-01", "2011-01-05", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx[0:10:2] + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-07", "2011-01-09"], + freq="D", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx[-20:-5:3] + expected = pd.PeriodIndex( + ["2011-01-12", "2011-01-15", "2011-01-18", "2011-01-21", "2011-01-24"], + freq="D", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx[4::-1] + expected = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], + freq="D", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + def test_getitem_index(self): + idx = period_range("2007-01", periods=10, freq="M", name="x") + + result = idx[[1, 3, 5]] + exp = pd.PeriodIndex(["2007-02", "2007-04", "2007-06"], freq="M", name="x") + tm.assert_index_equal(result, exp) + + result = idx[[True, True, False, False, False, True, True, False, False, False]] + exp = pd.PeriodIndex( + ["2007-01", "2007-02", "2007-06", "2007-07"], freq="M", name="x" + ) + tm.assert_index_equal(result, exp) + + def test_getitem_partial(self): + rng = period_range("2007-01", periods=50, freq="M") + ts = Series(np.random.randn(len(rng)), rng) + + with pytest.raises(KeyError, match=r"^'2006'$"): + ts["2006"] + + result = ts["2008"] + assert (result.index.year == 2008).all() + + result = ts["2008":"2009"] + assert len(result) == 24 + + result = ts["2008-1":"2009-12"] + assert len(result) == 24 + + result = ts["2008Q1":"2009Q4"] + assert len(result) == 24 + + result = ts[:"2009"] + assert len(result) == 36 + + result = ts["2009":] + assert len(result) == 50 - 24 + + exp = result + result = ts[24:] + tm.assert_series_equal(exp, result) + + ts = ts[10:].append(ts[10:]) + msg = "left slice bound for non-unique label: '2008'" + with pytest.raises(KeyError, match=msg): + ts[slice("2008", "2009")] + + def test_getitem_datetime(self): + rng = period_range(start="2012-01-01", periods=10, freq="W-MON") + ts = Series(range(len(rng)), index=rng) + + dt1 = datetime(2011, 10, 2) + dt4 = datetime(2012, 4, 20) + + rs = ts[dt1:dt4] + tm.assert_series_equal(rs, ts) + + def test_getitem_nat(self): + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") + assert idx[0] == pd.Period("2011-01", freq="M") + assert idx[1] is pd.NaT + + s = pd.Series([0, 1, 2], index=idx) + assert s[pd.NaT] == 1 + + s = pd.Series(idx, index=idx) + assert s[pd.Period("2011-01", freq="M")] == pd.Period("2011-01", freq="M") + assert s[pd.NaT] is pd.NaT + + def test_getitem_list_periods(self): + # GH 7710 + rng = period_range(start="2012-01-01", periods=10, freq="D") + ts = Series(range(len(rng)), index=rng) + exp = ts.iloc[[1]] + tm.assert_series_equal(ts[[Period("2012-01-02", freq="D")]], exp) + + def test_getitem_seconds(self): + # GH#6716 + didx = 
pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with pytest.raises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s["2013/01/01 10:00"], s[3600:3660]) + tm.assert_series_equal(s["2013/01/01 9H"], s[:3600]) + for d in ["2013/01/01", "2013/01", "2013"]: + tm.assert_series_equal(s[d], s) + + def test_getitem_day(self): + # GH#6716 + # Confirm DatetimeIndex and PeriodIndex works identically + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with pytest.raises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + tm.assert_series_equal(s["2013/01"], s[0:31]) + tm.assert_series_equal(s["2013/02"], s[31:59]) + tm.assert_series_equal(s["2014"], s[365:]) + + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] + for v in invalid: + with pytest.raises(KeyError, match=v): + s[v] + + +class TestWhere: + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) + def test_where(self, klass): + i = period_range("20130101", periods=5, freq="D") + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq="D") + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_where_other(self): + i = period_range("20130101", periods=5, freq="D") + for arr in [np.nan, pd.NaT]: + result = i.where(notna(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") + result = i.where(notna(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") + result = i.where(notna(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_where_invalid_dtypes(self): + pi = period_range("20130101", periods=5, freq="D") + + i2 = pi.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + pi[2:].tolist(), freq="D") + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.asi8) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.to_timestamp("S")) + + +class TestTake: + def test_take(self): + # GH#10295 + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + + for idx in [idx1]: + result = idx.take([0]) + assert result == pd.Period("2011-01-01", freq="D") + + result = idx.take([5]) + assert result == pd.Period("2011-01-06", freq="D") + + result = idx.take([0, 1, 2]) + expected = 
pd.period_range("2011-01-01", "2011-01-03", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == "D" + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([7, 4, 1]) + expected = pd.PeriodIndex( + ["2011-01-08", "2011-01-05", "2011-01-02"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([3, 2, 5]) + expected = PeriodIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([-3, 2, 5]) + expected = PeriodIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + def test_take_misc(self): + index = period_range(start="1/1/10", end="12/31/12", freq="D", name="idx") + expected = PeriodIndex( + [ + datetime(2010, 1, 6), + datetime(2010, 1, 7), + datetime(2010, 1, 9), + datetime(2010, 1, 13), + ], + freq="D", + name="idx", + ) + + taken1 = index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, PeriodIndex) + assert taken.freq == index.freq + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH#12631 + idx = pd.PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", freq="D" + ) + result = idx.take(np.array([1, 0, -1])) + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for( axis 0 with)? 
size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestIndexing: + def test_get_loc_msg(self): + idx = period_range("2000-1-1", freq="A", periods=10) + bad_period = Period("2012", "A") + with pytest.raises(KeyError, match=r"^Period\('2012', 'A-DEC'\)$"): + idx.get_loc(bad_period) + + try: + idx.get_loc(bad_period) + except KeyError as inst: + assert inst.args[0] == bad_period + + def test_get_loc_nat(self): + didx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-03"]) + pidx = PeriodIndex(["2011-01-01", "NaT", "2011-01-03"], freq="M") + + # check DatetimeIndex compat + for idx in [didx, pidx]: + assert idx.get_loc(pd.NaT) == 1 + assert idx.get_loc(None) == 1 + assert idx.get_loc(float("nan")) == 1 + assert idx.get_loc(np.nan) == 1 + + def test_get_loc(self): + # GH 17717 + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + + # get the location of p1/p2 from + # monotonic increasing PeriodIndex with non-duplicate + idx0 = pd.PeriodIndex([p0, p1, p2]) + expected_idx1_p1 = 1 + expected_idx1_p2 = 2 + + assert idx0.get_loc(p1) == expected_idx1_p1 + assert idx0.get_loc(str(p1)) == expected_idx1_p1 + assert idx0.get_loc(p2) == expected_idx1_p2 + assert idx0.get_loc(str(p2)) == expected_idx1_p2 + + msg = "Cannot interpret 'foo' as period" + with pytest.raises(KeyError, match=msg): + idx0.get_loc("foo") + with pytest.raises(KeyError, match=r"^1\.1$"): + idx0.get_loc(1.1) + + msg = ( + r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) + with pytest.raises(TypeError, match=msg): + idx0.get_loc(idx0) + + # get the location of p1/p2 from + # monotonic increasing PeriodIndex with duplicate + idx1 = pd.PeriodIndex([p1, p1, p2]) + expected_idx1_p1 = slice(0, 2) + expected_idx1_p2 = 2 + + assert idx1.get_loc(p1) == expected_idx1_p1 + assert idx1.get_loc(str(p1)) == expected_idx1_p1 + assert idx1.get_loc(p2) == expected_idx1_p2 + assert idx1.get_loc(str(p2)) == expected_idx1_p2 + + msg = "Cannot interpret 'foo' as period" + with pytest.raises(KeyError, match=msg): + idx1.get_loc("foo") + + with pytest.raises(KeyError, match=r"^1\.1$"): + idx1.get_loc(1.1) + + msg = ( + r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) + with pytest.raises(TypeError, match=msg): + idx1.get_loc(idx1) + + # get the location of p1/p2 from + # non-monotonic increasing/decreasing PeriodIndex with duplicate + idx2 = pd.PeriodIndex([p2, p1, p2]) + expected_idx2_p1 = 1 + expected_idx2_p2 = np.array([True, False, True]) + + assert idx2.get_loc(p1) == expected_idx2_p1 + assert idx2.get_loc(str(p1)) == expected_idx2_p1 + tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2) + tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2) + + def test_is_monotonic_increasing(self): + # GH 17717 + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + + idx_inc0 = pd.PeriodIndex([p0, p1, p2]) + idx_inc1 = pd.PeriodIndex([p0, p1, p1]) + idx_dec0 = pd.PeriodIndex([p2, p1, p0]) + idx_dec1 = pd.PeriodIndex([p2, p1, p1]) + idx = pd.PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_increasing is True + assert idx_inc1.is_monotonic_increasing is True + assert idx_dec0.is_monotonic_increasing is False + assert idx_dec1.is_monotonic_increasing is False + assert idx.is_monotonic_increasing is False + + def test_is_monotonic_decreasing(self): + # GH 17717 + p0 
= pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + + idx_inc0 = pd.PeriodIndex([p0, p1, p2]) + idx_inc1 = pd.PeriodIndex([p0, p1, p1]) + idx_dec0 = pd.PeriodIndex([p2, p1, p0]) + idx_dec1 = pd.PeriodIndex([p2, p1, p1]) + idx = pd.PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_decreasing is False + assert idx_inc1.is_monotonic_decreasing is False + assert idx_dec0.is_monotonic_decreasing is True + assert idx_dec1.is_monotonic_decreasing is True + assert idx.is_monotonic_decreasing is False + + def test_contains(self): + # GH 17717 + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 = pd.Period("2017-09-04") + + ps0 = [p0, p1, p2] + idx0 = pd.PeriodIndex(ps0) + + for p in ps0: + assert p in idx0 + assert str(p) in idx0 + + assert "2017-09-01 00:00:01" in idx0 + + assert "2017-09" in idx0 + + assert p3 not in idx0 + + def test_get_value(self): + # GH 17717 + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + + idx0 = pd.PeriodIndex([p0, p1, p2]) + input0 = np.array([1, 2, 3]) + expected0 = 2 + + result0 = idx0.get_value(input0, p1) + assert result0 == expected0 + + idx1 = pd.PeriodIndex([p1, p1, p2]) + input1 = np.array([1, 2, 3]) + expected1 = np.array([1, 2]) + + result1 = idx1.get_value(input1, p1) + tm.assert_numpy_array_equal(result1, expected1) + + idx2 = pd.PeriodIndex([p1, p2, p1]) + input2 = np.array([1, 2, 3]) + expected2 = np.array([1, 3]) + + result2 = idx2.get_value(input2, p1) + tm.assert_numpy_array_equal(result2, expected2) + + def test_get_indexer(self): + # GH 17717 + p1 = pd.Period("2017-09-01") + p2 = pd.Period("2017-09-04") + p3 = pd.Period("2017-09-07") + + tp0 = pd.Period("2017-08-31") + tp1 = pd.Period("2017-09-02") + tp2 = pd.Period("2017-09-05") + tp3 = pd.Period("2017-09-09") + + idx = pd.PeriodIndex([p1, p2, p3]) + + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.PeriodIndex([tp0, tp1, tp2, tp3]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2, -1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 0, 1, 2], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 day")) + tm.assert_numpy_array_equal(res, np.array([0, 0, 1, -1], dtype=np.intp)) + + def test_get_indexer_mismatched_dtype(self): + # Check that we return all -1s and do not raise or cast incorrectly + + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + pi2 = dti.to_period("W") + + expected = np.array([-1, -1, -1], dtype=np.intp) + + result = pi.get_indexer(dti) + tm.assert_numpy_array_equal(result, expected) + + # This should work in both directions + result = dti.get_indexer(pi) + tm.assert_numpy_array_equal(result, expected) + + result = pi.get_indexer(pi2) + tm.assert_numpy_array_equal(result, expected) + + # We expect the same from get_indexer_non_unique + result = pi.get_indexer_non_unique(dti)[0] + tm.assert_numpy_array_equal(result, expected) + + result = dti.get_indexer_non_unique(pi)[0] + tm.assert_numpy_array_equal(result, expected) + + result = pi.get_indexer_non_unique(pi2)[0] + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_non_unique(self): + # GH 17717 + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 
= pd.Period("2017-09-04") + p4 = pd.Period("2017-09-05") + + idx1 = pd.PeriodIndex([p1, p2, p1]) + idx2 = pd.PeriodIndex([p2, p1, p3, p4]) + + result = idx1.get_indexer_non_unique(idx2) + expected_indexer = np.array([1, 0, 2, -1, -1], dtype=np.intp) + expected_missing = np.array([2, 3], dtype=np.int64) + + tm.assert_numpy_array_equal(result[0], expected_indexer) + tm.assert_numpy_array_equal(result[1], expected_missing) + + # TODO: This method came from test_period; de-dup with version above + def test_get_loc2(self): + idx = pd.period_range("2000-01-01", periods=3) + + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].asfreq("H", how="start"), method) == 1 + assert idx.get_loc(idx[1].to_timestamp(), method) == 1 + assert idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + idx = pd.period_range("2000-01-01", periods=5)[::2] + assert idx.get_loc("2000-01-02T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-02T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): + idx.get_loc("2000-01-10", method="nearest", tolerance="foo") + + msg = "Input has different freq=None from PeriodArray\\(freq=D\\)" + with pytest.raises(ValueError, match=msg): + idx.get_loc("2000-01-10", method="nearest", tolerance="1 hour") + with pytest.raises(KeyError, match=r"^Period\('2000-01-10', 'D'\)$"): + idx.get_loc("2000-01-10", method="nearest", tolerance="1 day") + with pytest.raises( + ValueError, match="list-like tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-10", + method="nearest", + tolerance=[ + pd.Timedelta("1 day").to_timedelta64(), + pd.Timedelta("1 day").to_timedelta64(), + ], + ) + + # TODO: This method came from test_period; de-dup with version above + def test_get_indexer2(self): + idx = pd.period_range("2000-01-01", periods=3).asfreq("H", how="start") + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.PeriodIndex( + ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="H" + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 hour"), + np.array([0, -1, 1], dtype=np.intp), + ) + + msg = "Input has different freq=None from PeriodArray\\(freq=H\\)" + with pytest.raises(ValueError, match=msg): + idx.get_indexer(target, "nearest", tolerance="1 minute") + + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 day"), + np.array([0, 1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + np.timedelta64(1, "D"), + ] + tm.assert_numpy_array_equal( + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 
hour").to_timedelta64(), + np.timedelta64(1, "M"), + ] + with pytest.raises( + libperiod.IncompatibleFrequency, match="Input has different freq=None from" + ): + idx.get_indexer(target, "nearest", tolerance=tol_bad) + + def test_indexing(self): + # GH 4390, iat incorrectly indexing + index = period_range("1/1/2001", periods=10) + s = Series(np.random.randn(10), index=index) + expected = s[index[0]] + result = s.iat[0] + assert expected == result + + def test_period_index_indexer(self): + # GH4125 + idx = pd.period_range("2002-01", "2003-12", freq="M") + df = pd.DataFrame(np.random.randn(24, 10), index=idx) + tm.assert_frame_equal(df, df.loc[idx]) + tm.assert_frame_equal(df, df.loc[list(idx)]) + tm.assert_frame_equal(df, df.loc[list(idx)]) + tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) + tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_ops.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_ops.py new file mode 100644 index 0000000..427d9ab --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_ops.py @@ -0,0 +1,347 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DatetimeIndex, Index, NaT, PeriodIndex, Series +import pandas._testing as tm +from pandas.core.arrays import PeriodArray +from pandas.tests.base.test_ops import Ops + + +class TestPeriodIndexOps(Ops): + def setup_method(self, method): + super().setup_method(method) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) + self.is_valid_objs = [o for o in self.objs if mask(o)] + self.not_valid_objs = [o for o in self.objs if not mask(o)] + + def test_ops_properties(self): + f = lambda x: isinstance(x, PeriodIndex) + self.check_ops_properties(PeriodArray._field_ops, f) + self.check_ops_properties(PeriodArray._object_ops, f) + self.check_ops_properties(PeriodArray._bool_ops, f) + + def test_resolution(self): + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + + idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) + assert idx.resolution == expected + + def test_value_counts_unique(self): + # GH 7735 + idx = pd.period_range("2011-01-01 09:00", freq="H", periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), freq="H") + + exp_idx = PeriodIndex( + [ + "2011-01-01 18:00", + "2011-01-01 17:00", + "2011-01-01 16:00", + "2011-01-01 15:00", + "2011-01-01 14:00", + "2011-01-01 13:00", + "2011-01-01 12:00", + "2011-01-01 11:00", + "2011-01-01 10:00", + "2011-01-01 09:00", + ], + freq="H", + ) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.period_range("2011-01-01 09:00", freq="H", periods=10) + tm.assert_index_equal(idx.unique(), expected) + + idx = PeriodIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + NaT, + ], + freq="H", + ) + + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") + expected = Series([3, 2], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") + expected = Series([3, 
2, 1], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + def test_drop_duplicates_metadata(self): + # GH 10115 + idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + result = idx.drop_duplicates() + tm.assert_index_equal(idx, result) + assert idx.freq == result.freq + + idx_dup = idx.append(idx) # freq will not be reset + result = idx_dup.drop_duplicates() + tm.assert_index_equal(idx, result) + assert idx.freq == result.freq + + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep="last") + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep="last") + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + + def test_order_compat(self): + def _check_freq(index, expected_index): + if isinstance(index, PeriodIndex): + assert index.freq == expected_index.freq + + pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A") + # for compatibility check + iidx = Index([2011, 2012, 2013], name="idx") + for idx in [pidx, iidx]: + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + _check_freq(ordered, idx) + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, idx[::-1]) + _check_freq(ordered, idx[::-1]) + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) + _check_freq(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, idx[::-1]) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) + _check_freq(ordered, idx[::-1]) + + pidx = PeriodIndex( + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ) + pexpected = PeriodIndex( + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ) + # for compatibility check + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx") + for idx, expected in [(pidx, pexpected), (iidx, iexpected)]: + ordered = idx.sort_values() + tm.assert_index_equal(ordered, expected) + _check_freq(ordered, idx) + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + _check_freq(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + _check_freq(ordered, idx) + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + _check_freq(ordered, idx) + + pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") + + result = pidx.sort_values() + expected = 
PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") + tm.assert_index_equal(result, expected) + assert result.freq == "D" + + result = pidx.sort_values(ascending=False) + expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D") + tm.assert_index_equal(result, expected) + assert result.freq == "D" + + def test_order(self): + for freq in ["D", "2D", "4D"]: + idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" + ) + + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + assert ordered.freq == idx.freq + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + assert ordered.freq == expected.freq + assert ordered.freq == freq + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) + assert ordered.freq == idx.freq + assert ordered.freq == freq + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) + assert ordered.freq == expected.freq + assert ordered.freq == freq + + idx1 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx1", + ) + exp1 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx1", + ) + + idx2 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx2", + ) + exp2 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx2", + ) + + idx3 = PeriodIndex( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], freq="D", name="idx3" + ) + exp3 = PeriodIndex( + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], freq="D", name="idx3" + ) + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: + ordered = idx.sort_values() + tm.assert_index_equal(ordered, expected) + assert ordered.freq == "D" + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + assert ordered.freq == "D" + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq == "D" + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq == "D" + + def test_shift(self): + # This is tested in test_arithmetic + pass + + def test_nat(self): + assert pd.PeriodIndex._na_value is NaT + assert pd.PeriodIndex([], freq="M")._na_value is NaT + + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert idx.hasnans is False + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) + + idx = pd.PeriodIndex(["2011-01-01", "NaT"], freq="D") + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans is True + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) + + 
@pytest.mark.parametrize("freq", ["D", "M"]) + def test_equals(self, freq): + # GH#13107 + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq=freq) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = pd.PeriodIndex._simple_new( + idx._values._simple_new(idx._values.asi8, freq="H") + ) + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) + + def test_freq_setter_deprecated(self): + # GH 20678 + idx = pd.period_range("2018Q1", periods=4, freq="Q") + + # no warning for getter + with tm.assert_produces_warning(None): + idx.freq + + # warning for setter + with pytest.raises(AttributeError, match="can't set attribute"): + idx.freq = pd.offsets.Day() diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_partial_slicing.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_partial_slicing.py new file mode 100644 index 0000000..9ca2dd1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_partial_slicing.py @@ -0,0 +1,135 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Period, Series, period_range +import pandas._testing as tm + + +class TestPeriodIndex: + def setup_method(self, method): + pass + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Period("2014-10") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10"::-1], SLC[9::-1]) + + assert_slices_equivalent(SLC[: Period("2014-10") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10":-1], SLC[:8:-1]) + + assert_slices_equivalent(SLC["2015-02":"2014-10":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Period("2015-02") : Period("2014-10") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent(SLC["2015-02" : Period("2014-10") : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period("2015-02") : "2014-10" : -1], SLC[13:8:-1]) + + assert_slices_equivalent(SLC["2014-10":"2015-02":-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] + + def test_slice_keep_name(self): + idx = period_range("20010101", periods=10, freq="D", name="bob") + assert idx.name == idx[1:].name + + def 
test_pindex_slice_index(self): + pi = period_range(start="1/1/10", end="12/31/12", freq="M") + s = Series(np.random.rand(len(pi)), index=pi) + res = s["2010"] + exp = s[0:12] + tm.assert_series_equal(res, exp) + res = s["2011"] + exp = s[12:24] + tm.assert_series_equal(res, exp) + + def test_range_slice_day(self): + # GH#6716 + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + with pytest.raises(TypeError): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + tm.assert_series_equal(s["2013/01/02":], s[1:]) + tm.assert_series_equal(s["2013/01/02":"2013/01/05"], s[1:5]) + tm.assert_series_equal(s["2013/02":], s[31:]) + tm.assert_series_equal(s["2014":], s[365:]) + + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] + for v in invalid: + with pytest.raises(TypeError): + idx[v:] + + def test_range_slice_seconds(self): + # GH#6716 + didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] + for v in values: + with pytest.raises(TypeError): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + tm.assert_series_equal(s["2013/01/01 09:05":"2013/01/01 09:10"], s[300:660]) + tm.assert_series_equal( + s["2013/01/01 10:00":"2013/01/01 10:05"], s[3600:3960] + ) + tm.assert_series_equal(s["2013/01/01 10H":], s[3600:]) + tm.assert_series_equal(s[:"2013/01/01 09:30"], s[:1860]) + for d in ["2013/01/01", "2013/01", "2013"]: + tm.assert_series_equal(s[d:], s) + + def test_range_slice_outofbounds(self): + # GH#5407 + didx = pd.date_range(start="2013/10/01", freq="D", periods=10) + pidx = period_range(start="2013/10/01", freq="D", periods=10) + + for idx in [didx, pidx]: + df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) + empty = DataFrame(index=type(idx)([], freq="D"), columns=["units"]) + empty["units"] = empty["units"].astype("int64") + + tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/09/30":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/01":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/02":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) + tm.assert_frame_equal(df["2013-06":"2013-09"], empty) + tm.assert_frame_equal(df["2013-11":"2013-12"], empty) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/period/test_period.py b/venv/Lib/site-packages/pandas/tests/indexes/period/test_period.py new file mode 100644 index 0000000..16fa0b0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/period/test_period.py @@ -0,0 +1,664 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + Series, + date_range, + offsets, + period_range, +) +import pandas._testing as tm + +from ..datetimelike import DatetimeLike + + +class TestPeriodIndex(DatetimeLike): + _holder = PeriodIndex + + @pytest.fixture( + params=[ + 
tm.makePeriodIndex(10), + period_range("20130101", periods=10, freq="D")[::-1], + ], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return request.param + + def create_index(self): + return period_range("20130101", periods=5, freq="D") + + def test_pickle_compat_construction(self): + pass + + @pytest.mark.parametrize("freq", ["D", "M", "A"]) + def test_pickle_round_trip(self, freq): + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + + def test_where(self): + # This is handled in test_indexing + pass + + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + pd.period_range("2000-01-01", periods=3, freq="D"), + pd.period_range("2001-01-01", periods=3, freq="2D"), + pd.PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) + def test_repeat_freqstr(self, index, use_numpy): + # GH10183 + expected = PeriodIndex([p for p in index for _ in range(3)]) + result = np.repeat(index, 3) if use_numpy else index.repeat(3) + tm.assert_index_equal(result, expected) + assert result.freqstr == index.freqstr + + def test_fillna_period(self): + # GH 11343 + idx = pd.PeriodIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"], freq="H") + + exp = pd.PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01 10:00", freq="H")), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + "x", + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + pd.Period("2011-01-01", freq="D"), + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01", freq="D")), exp) + + def test_no_millisecond_field(self): + msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex.millisecond + + msg = "'DatetimeIndex' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex([]).millisecond + + def test_hash_error(self): + index = period_range("20010101", periods=10) + msg = f"unhashable type: '{type(index).__name__}'" + with pytest.raises(TypeError, match=msg): + hash(index) + + def test_make_time_series(self): + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + series = Series(1, index=index) + assert isinstance(series, Series) + + def test_shallow_copy_empty(self): + + # GH13067 + idx = PeriodIndex([], freq="M") + result = idx._shallow_copy() + expected = idx + + tm.assert_index_equal(result, expected) + + def test_shallow_copy_i8(self): + # GH-24391 + pi = period_range("2018-01-01", periods=3, freq="2D") + result = pi._shallow_copy(pi.asi8, freq=pi.freq) + tm.assert_index_equal(result, pi) + + def test_shallow_copy_changing_freq_raises(self): + pi = period_range("2018-01-01", periods=3, freq="2D") + msg = "specified freq and dtype are different" + with pytest.raises(IncompatibleFrequency, match=msg): + pi._shallow_copy(pi, freq="H") + + def test_view_asi8(self): + idx = pd.PeriodIndex([], freq="M") + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") + + exp = np.array([492, -9223372036854775808], 
dtype=np.int64) + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") + tm.assert_numpy_array_equal(idx.view("i8"), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + def test_values(self): + idx = pd.PeriodIndex([], freq="M") + + exp = np.array([], dtype=np.object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) + + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") + + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) + + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") + + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.to_numpy(), exp) + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) + + def test_period_index_length(self): + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + assert len(pi) == 9 + + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") + assert len(pi) == 4 * 9 + + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") + assert len(pi) == 12 * 9 + + start = Period("02-Apr-2005", "B") + i1 = period_range(start=start, periods=20) + assert len(i1) == 20 + assert i1.freq == start.freq + assert i1[0] == start + + end_intv = Period("2006-12-31", "W") + i1 = period_range(end=end_intv, periods=10) + assert len(i1) == 10 + assert i1.freq == end_intv.freq + assert i1[-1] == end_intv + + end_intv = Period("2006-12-31", "1w") + i2 = period_range(end=end_intv, periods=10) + assert len(i1) == len(i2) + assert (i1 == i2).all() + assert i1.freq == i2.freq + + end_intv = Period("2006-12-31", ("w", 1)) + i2 = period_range(end=end_intv, periods=10) + assert len(i1) == len(i2) + assert (i1 == i2).all() + assert i1.freq == i2.freq + + msg = "start and end must have same freq" + with pytest.raises(ValueError, match=msg): + period_range(start=start, end=end_intv) + + end_intv = Period("2005-05-01", "B") + i1 = period_range(start=start, end=end_intv) + + msg = ( + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range(start=start) + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) + assert len(i2) == 2 + assert i2[0] == end_intv + + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) + assert len(i2) == 2 + assert i2[0] == end_intv + + # Mixed freq should fail + vals = [end_intv, Period("2006-12-31", "w")] + msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" + with pytest.raises(IncompatibleFrequency, match=msg): + PeriodIndex(vals) + vals = np.array(vals) + with pytest.raises(ValueError, match=msg): + PeriodIndex(vals) + + def test_fields(self): + # year, month, day, hour, minute + # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter + # qyear + pi = period_range(freq="A", start="1/1/2001", end="12/1/2005") + self._check_all_fields(pi) + + pi = 
period_range(freq="Q", start="1/1/2001", end="12/1/2002") + self._check_all_fields(pi) + + pi = period_range(freq="M", start="1/1/2001", end="1/1/2002") + self._check_all_fields(pi) + + pi = period_range(freq="D", start="12/1/2001", end="6/1/2001") + self._check_all_fields(pi) + + pi = period_range(freq="B", start="12/1/2001", end="6/1/2001") + self._check_all_fields(pi) + + pi = period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00") + self._check_all_fields(pi) + + pi = period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20") + self._check_all_fields(pi) + + pi = period_range( + freq="S", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" + ) + self._check_all_fields(pi) + + end_intv = Period("2006-12-31", "W") + i1 = period_range(end=end_intv, periods=10) + self._check_all_fields(i1) + + def _check_all_fields(self, periodindex): + fields = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + ] + + periods = list(periodindex) + s = pd.Series(periodindex) + + for field in fields: + field_idx = getattr(periodindex, field) + assert len(periodindex) == len(field_idx) + for x, val in zip(periods, field_idx): + assert getattr(x, field) == val + + if len(s) == 0: + continue + + field_s = getattr(s.dt, field) + assert len(periodindex) == len(field_s) + for x, val in zip(periods, field_s): + assert getattr(x, field) == val + + def test_period_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = period_range("2011/01/01", periods=6, freq="M") + idx2 = period_range("2013", periods=6, freq="A") + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.set_index(idx2) + tm.assert_index_equal(df.index, idx2) + + @pytest.mark.parametrize( + "p_values, o_values, values, expected_values", + [ + ( + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC"), "All"], + [1.0, 1.0], + [1.0, 1.0, np.nan], + ), + ( + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [Period("2019Q1", "Q-DEC"), Period("2019Q2", "Q-DEC")], + [1.0, 1.0], + [1.0, 1.0], + ), + ], + ) + def test_period_reindex_with_object( + self, p_values, o_values, values, expected_values + ): + # GH 28337 + period_index = PeriodIndex(p_values) + object_index = Index(o_values) + + s = pd.Series(values, index=period_index) + result = s.reindex(object_index) + expected = pd.Series(expected_values, index=object_index) + tm.assert_series_equal(result, expected) + + def test_factorize(self): + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + idx2 = pd.PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + arr, idx = idx2.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-03", "2014-02", "2014-01"], freq="M") + arr, idx = idx2.factorize() + 
tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + def test_is_(self): + create_index = lambda: period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = create_index() + assert index.is_(index) + assert not index.is_(create_index()) + assert index.is_(index.view()) + assert index.is_(index.view().view().view().view().view()) + assert index.view().is_(index) + ind2 = index.view() + index.name = "Apple" + assert ind2.is_(index) + assert not index.is_(index[:]) + assert not index.is_(index.asfreq("M")) + assert not index.is_(index.asfreq("A")) + + assert not index.is_(index - 2) + assert not index.is_(index - 0) + + def test_contains(self): + rng = period_range("2007-01", freq="M", periods=10) + + assert Period("2007-01", freq="M") in rng + assert not Period("2007-01", freq="D") in rng + assert not Period("2007-01", freq="2M") in rng + + def test_contains_nat(self): + # see gh-13582 + idx = period_range("2007-01", freq="M", periods=10) + assert pd.NaT not in idx + assert None not in idx + assert float("nan") not in idx + assert np.nan not in idx + + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") + assert pd.NaT in idx + assert None in idx + assert float("nan") in idx + assert np.nan in idx + + def test_periods_number_check(self): + msg = ( + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" + ) + with pytest.raises(ValueError, match=msg): + period_range("2011-1-1", "2012-1-1", "B") + + def test_start_time(self): + # GH 17157 + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="MS") + tm.assert_index_equal(index.start_time, expected_index) + + def test_end_time(self): + # GH 17157 + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + expected_index = expected_index.shift(1, freq="D").shift(-1, freq="ns") + tm.assert_index_equal(index.end_time, expected_index) + + def test_index_duplicate_periods(self): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts["2007"] + expected = ts[1:3] + tm.assert_series_equal(result, expected) + result[:] = 1 + assert (ts[1:3] == 1).all() + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts["2007"] + expected = ts[idx == "2007"] + tm.assert_series_equal(result, expected) + + def test_index_unique(self): + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") + tm.assert_index_equal(idx.unique(), expected) + assert idx.nunique() == 3 + + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN", tz="US/Eastern") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN", tz="US/Eastern") + tm.assert_index_equal(idx.unique(), expected) + assert idx.nunique() == 3 + + def test_shift(self): + # This is tested in test_arithmetic + pass + + @td.skip_if_32bit + def test_ndarray_compat_properties(self): + super().test_ndarray_compat_properties() + + def test_negative_ordinals(self): + Period(ordinal=-1000, freq="A") + Period(ordinal=0, freq="A") + + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="A") + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="A") + tm.assert_index_equal(idx1, idx2) + + def test_pindex_fieldaccessor_nat(self): + 
idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2012-03", "2012-04"], freq="D", name="name" + ) + + exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name="name") + tm.assert_index_equal(idx.year, exp) + exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name="name") + tm.assert_index_equal(idx.month, exp) + + def test_pindex_qaccess(self): + pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") + s = Series(np.random.rand(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! + assert s["05Q4"] == s[2] + + def test_pindex_multiples(self): + expected = PeriodIndex( + ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], + freq="2M", + ) + + pi = period_range(start="1/1/11", end="12/31/11", freq="2M") + tm.assert_index_equal(pi, expected) + assert pi.freq == offsets.MonthEnd(2) + assert pi.freqstr == "2M" + + pi = period_range(start="1/1/11", periods=6, freq="2M") + tm.assert_index_equal(pi, expected) + assert pi.freq == offsets.MonthEnd(2) + assert pi.freqstr == "2M" + + def test_iteration(self): + index = period_range(start="1/1/10", periods=4, freq="B") + + result = list(index) + assert isinstance(result[0], Period) + assert result[0].freq == index.freq + + def test_is_full(self): + index = PeriodIndex([2005, 2007, 2009], freq="A") + assert not index.is_full + + index = PeriodIndex([2005, 2006, 2007], freq="A") + assert index.is_full + + index = PeriodIndex([2005, 2005, 2007], freq="A") + assert not index.is_full + + index = PeriodIndex([2005, 2005, 2006], freq="A") + assert index.is_full + + index = PeriodIndex([2006, 2005, 2005], freq="A") + with pytest.raises(ValueError, match="Index is not monotonic"): + index.is_full + + assert index[:0].is_full + + def test_with_multi_index(self): + # #1705 + index = date_range("1/1/2012", periods=4, freq="12H") + index_as_arrays = [index.to_period(freq="D"), index.hour] + + s = Series([0, 1, 2, 3], index_as_arrays) + + assert isinstance(s.index.levels[0], PeriodIndex) + + assert isinstance(s.index.values[0][0], Period) + + def test_convert_array_of_periods(self): + rng = period_range("1/1/2000", periods=20, freq="D") + periods = list(rng) + + result = pd.Index(periods) + assert isinstance(result, PeriodIndex) + + def test_append_concat(self): + # #1815 + d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") + + s1 = Series(np.random.randn(10), d1) + s2 = Series(np.random.randn(10), d2) + + s1 = s1.to_period() + s2 = s2.to_period() + + # drops index + result = pd.concat([s1, s2]) + assert isinstance(result.index, PeriodIndex) + assert result.index[0] == s1.index[0] + + def test_pickle_freq(self): + # GH2891 + prng = period_range("1/1/2011", "1/1/2012", freq="M") + new_prng = tm.round_trip_pickle(prng) + assert new_prng.freq == offsets.MonthEnd() + assert new_prng.freqstr == "M" + + def test_map(self): + # test_map_dictlike generally tests + + index = PeriodIndex([2005, 2007, 2009], freq="A") + result = index.map(lambda x: x.ordinal) + exp = Index([x.ordinal for x in index]) + tm.assert_index_equal(result, exp) + + def test_join_self(self, join_type): + index = period_range("1/1/2000", periods=10) + joined = index.join(index, how=join_type) + assert index is joined + + def test_insert(self): + # GH 18295 (test missing) + expected = PeriodIndex( + ["2017Q1", pd.NaT, "2017Q2", "2017Q3", "2017Q4"], freq="Q" + ) + for na in (np.nan, pd.NaT, None): + result = period_range("2017Q1", periods=4, freq="Q").insert(1, na) + tm.assert_index_equal(result, 
be possible to convert any object that satisfies the numpy + # ndarray interface directly into an Index + class ArrayLike: + def __init__(self, array): + self.array = array + + def __array__(self, dtype=None) -> np.ndarray: + return self.array + + expected = pd.Index(array) + result = pd.Index(ArrayLike(array)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [int, "int64", "int32", "int16", "int8", "uint64", "uint32", "uint16", "uint8"], + ) + def test_constructor_int_dtype_float(self, dtype): + # GH 18400 + if is_unsigned_integer_dtype(dtype): + index_type = UInt64Index + else: + index_type = Int64Index + + expected = index_type([0, 1, 2, 3]) + result = Index([0.0, 1.0, 2.0, 3.0], dtype=dtype) + tm.assert_index_equal(result, expected) + + def test_constructor_int_dtype_nan(self): + # see gh-15187 + data = [np.nan] + expected = Float64Index(data) + result = Index(data, dtype="float") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["int64", "uint64"]) + def test_constructor_int_dtype_nan_raises(self, dtype): + # see gh-15187 + data = [np.nan] + msg = "cannot convert" + with pytest.raises(ValueError, match=msg): + Index(data, dtype=dtype) + + def test_constructor_no_pandas_array(self): + ser = pd.Series([1, 2, 3]) + result = pd.Index(ser.array) + expected = pd.Index([1, 2, 3]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "klass,dtype,na_val", + [ + (pd.Float64Index, np.float64, np.nan), + (pd.DatetimeIndex, "datetime64[ns]", pd.NaT), + ], + ) + def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): + # GH 13467 + na_list = [na_val, na_val] + expected = klass(na_list) + assert expected.dtype == dtype + + result = Index(na_list) + tm.assert_index_equal(result, expected) + + result = Index(np.array(na_list)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("pos", [0, 1]) + @pytest.mark.parametrize( + "klass,dtype,ctor", + [ + (pd.DatetimeIndex, "datetime64[ns]", np.datetime64("nat")), + (pd.TimedeltaIndex, "timedelta64[ns]", np.timedelta64("nat")), + ], + ) + def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, nulls_fixture): + expected = klass([pd.NaT, pd.NaT]) + assert expected.dtype == dtype + data = [ctor] + data.insert(pos, nulls_fixture) + + result = Index(data) + tm.assert_index_equal(result, expected) + + result = Index(np.array(data, dtype=object)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("swap_objs", [True, False]) + def test_index_ctor_nat_result(self, swap_objs): + # mixed np.datetime64/timedelta64 nat results in object + data = [np.datetime64("nat"), np.timedelta64("nat")] + if swap_objs: + data = data[::-1] + + expected = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), expected) + tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + + def test_index_ctor_infer_periodindex(self): + xp = period_range("2012-1-1", freq="M", periods=3) + rs = Index(xp) + tm.assert_index_equal(rs, xp) + assert isinstance(rs, PeriodIndex) + + @pytest.mark.parametrize( + "vals,dtype", + [ + ([1, 2, 3, 4, 5], "int"), + ([1.1, np.nan, 2.2, 3.0], "float"), + (["A", "B", "C", np.nan], "obj"), + ], + ) + def test_constructor_simple_new(self, vals, dtype): + index = Index(vals, name=dtype) + result = index._simple_new(index.values, dtype) + tm.assert_index_equal(result, index) + + def test_constructor_wrong_kwargs(self): + # GH #19348 + with pytest.raises(TypeError, match="Unexpected keyword arguments 
{'foo'}"): + Index([], foo="bar") + + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3]), + np.array([1, 2, 3], dtype=int), + # below should coerce + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) + def test_constructor_dtypes_to_int64(self, vals): + index = Index(vals, dtype=int) + assert isinstance(index, Int64Index) + + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + np.array([1, 2, 3], dtype=int), + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) + def test_constructor_dtypes_to_float64(self, vals): + index = Index(vals, dtype=float) + assert isinstance(index, Float64Index) + + @pytest.mark.parametrize("cast_index", [True, False]) + @pytest.mark.parametrize( + "vals", [[True, False, True], np.array([True, False, True], dtype=bool)] + ) + def test_constructor_dtypes_to_object(self, cast_index, vals): + if cast_index: + index = Index(vals, dtype=bool) + else: + index = Index(vals) + + assert isinstance(index, Index) + assert index.dtype == object + + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3], dtype=int), + np.array( + [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] + ), + [datetime(2011, 1, 1), datetime(2011, 1, 2)], + ], + ) + def test_constructor_dtypes_to_categorical(self, vals): + index = Index(vals, dtype="category") + assert isinstance(index, CategoricalIndex) + + @pytest.mark.parametrize("cast_index", [True, False]) + @pytest.mark.parametrize( + "vals", + [ + Index( + np.array( + [ + np_datetime64_compat("2011-01-01"), + np_datetime64_compat("2011-01-02"), + ] + ) + ), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), + ], + ) + def test_constructor_dtypes_to_datetime(self, cast_index, vals): + if cast_index: + index = Index(vals, dtype=object) + assert isinstance(index, Index) + assert index.dtype == object + else: + index = Index(vals) + assert isinstance(index, DatetimeIndex) + + @pytest.mark.parametrize("cast_index", [True, False]) + @pytest.mark.parametrize( + "vals", + [ + np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]), + [timedelta(1), timedelta(1)], + ], + ) + def test_constructor_dtypes_to_timedelta(self, cast_index, vals): + if cast_index: + index = Index(vals, dtype=object) + assert isinstance(index, Index) + assert index.dtype == object + else: + index = Index(vals) + assert isinstance(index, TimedeltaIndex) + + @pytest.mark.parametrize("attr", ["values", "asi8"]) + @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) + def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, klass): + # Test constructing with a datetimetz dtype + # .values produces numpy datetimes, so these are considered naive + # .asi8 produces integers, so these are considered epoch timestamps + # ^the above will be true in a later version. Right now we `.view` + # the i8 values as NS_DTYPE, effectively treating them as wall times. 
+ index = pd.date_range("2011-01-01", periods=5) + arg = getattr(index, attr) + index = index.tz_localize(tz_naive_fixture) + dtype = index.dtype + + if attr == "asi8": + result = pd.DatetimeIndex(arg).tz_localize(tz_naive_fixture) + else: + result = klass(arg, tz=tz_naive_fixture) + tm.assert_index_equal(result, index) + + if attr == "asi8": + result = pd.DatetimeIndex(arg).astype(dtype) + else: + result = klass(arg, dtype=dtype) + tm.assert_index_equal(result, index) + + if attr == "asi8": + result = pd.DatetimeIndex(list(arg)).tz_localize(tz_naive_fixture) + else: + result = klass(list(arg), tz=tz_naive_fixture) + tm.assert_index_equal(result, index) + + if attr == "asi8": + result = pd.DatetimeIndex(list(arg)).astype(dtype) + else: + result = klass(list(arg), dtype=dtype) + tm.assert_index_equal(result, index) + + @pytest.mark.parametrize("attr", ["values", "asi8"]) + @pytest.mark.parametrize("klass", [pd.Index, pd.TimedeltaIndex]) + def test_constructor_dtypes_timedelta(self, attr, klass): + index = pd.timedelta_range("1 days", periods=5) + dtype = index.dtype + + values = getattr(index, attr) + + result = klass(values, dtype=dtype) + tm.assert_index_equal(result, index) + + result = klass(list(values), dtype=dtype) + tm.assert_index_equal(result, index) + + @pytest.mark.parametrize("value", [[], iter([]), (_ for _ in [])]) + @pytest.mark.parametrize( + "klass", + [ + Index, + Float64Index, + Int64Index, + UInt64Index, + CategoricalIndex, + DatetimeIndex, + TimedeltaIndex, + ], + ) + def test_constructor_empty(self, value, klass): + empty = klass(value) + assert isinstance(empty, klass) + assert not len(empty) + + @pytest.mark.parametrize( + "empty,klass", + [ + (PeriodIndex([], freq="B"), PeriodIndex), + (PeriodIndex(iter([]), freq="B"), PeriodIndex), + (PeriodIndex((_ for _ in []), freq="B"), PeriodIndex), + (RangeIndex(step=1), pd.RangeIndex), + (MultiIndex(levels=[[1, 2], ["blue", "red"]], codes=[[], []]), MultiIndex), + ], + ) + def test_constructor_empty_special(self, empty, klass): + assert isinstance(empty, klass) + assert not len(empty) + + def test_constructor_overflow_int64(self): + # see gh-15832 + msg = ( + "The elements provided in the data cannot " + "all be casted to the dtype int64" + ) + with pytest.raises(OverflowError, match=msg): + Index([np.iinfo(np.uint64).max - 1], dtype="int64") + + @pytest.mark.xfail(reason="see GH#21311: Index doesn't enforce dtype argument") + def test_constructor_cast(self): + msg = "could not convert string to float" + with pytest.raises(ValueError, match=msg): + Index(["a", "b", "c"], dtype=float) + + @pytest.mark.parametrize( + "index", + [ + "datetime", + "float", + "int", + "period", + "range", + "repeats", + "timedelta", + "tuples", + "uint", + ], + indirect=True, + ) + def test_view_with_args(self, index): + index.view("i8") + + @pytest.mark.parametrize( + "index", + [ + "unicode", + "string", + pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")), + "bool", + "empty", + ], + indirect=True, + ) + def test_view_with_args_object_array_raises(self, index): + msg = "Cannot change data-type for object array" + with pytest.raises(TypeError, match=msg): + index.view("i8") + + @pytest.mark.parametrize("index", ["int", "range"], indirect=True) + def test_astype(self, index): + casted = index.astype("i8") + + # it works! 
+ casted.get_loc(5) + + # pass on name + index.name = "foobar" + casted = index.astype("i8") + assert casted.name == "foobar" + + def test_equals_object(self): + # same + assert Index(["a", "b", "c"]).equals(Index(["a", "b", "c"])) + + @pytest.mark.parametrize( + "comp", [Index(["a", "b"]), Index(["a", "b", "d"]), ["a", "b", "c"]] + ) + def test_not_equals_object(self, comp): + assert not Index(["a", "b", "c"]).equals(comp) + + def test_insert(self): + + # GH 7256 + # validate neg/pos inserts + result = Index(["b", "c", "d"]) + + # test 0th element + tm.assert_index_equal(Index(["a", "b", "c", "d"]), result.insert(0, "a")) + + # test Nth element that follows Python list behavior + tm.assert_index_equal(Index(["b", "c", "e", "d"]), result.insert(-1, "e")) + + # test loc +/- neq (0, -1) + tm.assert_index_equal(result.insert(1, "z"), result.insert(-2, "z")) + + # test empty + null_index = Index([]) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) + + def test_insert_missing(self, nulls_fixture): + # GH 22295 + # test there is no mangling of NA values + expected = Index(["a", nulls_fixture, "b", "c"]) + result = Index(list("abc")).insert(1, nulls_fixture) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "pos,expected", + [ + (0, Index(["b", "c", "d"], name="index")), + (-1, Index(["a", "b", "c"], name="index")), + ], + ) + def test_delete(self, pos, expected): + index = Index(["a", "b", "c", "d"], name="index") + result = index.delete(pos) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_delete_raises(self): + index = Index(["a", "b", "c", "d"], name="index") + msg = "index 5 is out of bounds for axis 0 with size 4" + with pytest.raises(IndexError, match=msg): + index.delete(5) + + def test_identical(self): + + # index + i1 = Index(["a", "b", "c"]) + i2 = Index(["a", "b", "c"]) + + assert i1.identical(i2) + + i1 = i1.rename("foo") + assert i1.equals(i2) + assert not i1.identical(i2) + + i2 = i2.rename("foo") + assert i1.identical(i2) + + i3 = Index([("a", "a"), ("a", "b"), ("b", "a")]) + i4 = Index([("a", "a"), ("a", "b"), ("b", "a")], tupleize_cols=False) + assert not i3.identical(i4) + + def test_is_(self): + ind = Index(range(10)) + assert ind.is_(ind) + assert ind.is_(ind.view().view().view().view()) + assert not ind.is_(Index(range(10))) + assert not ind.is_(ind.copy()) + assert not ind.is_(ind.copy(deep=False)) + assert not ind.is_(ind[:]) + assert not ind.is_(np.array(range(10))) + + # quasi-implementation dependent + assert ind.is_(ind.view()) + ind2 = ind.view() + ind2.name = "bob" + assert ind.is_(ind2) + assert ind2.is_(ind) + # doesn't matter if Indices are *actually* views of underlying data, + assert not ind.is_(Index(ind.values)) + arr = np.array(range(1, 11)) + ind1 = Index(arr, copy=False) + ind2 = Index(arr, copy=False) + assert not ind1.is_(ind2) + + @pytest.mark.parametrize("index", ["datetime"], indirect=True) + def test_asof(self, index): + d = index[0] + assert index.asof(d) == d + assert isna(index.asof(d - timedelta(1))) + + d = index[-1] + assert index.asof(d + timedelta(1)) == d + + d = index[0].to_pydatetime() + assert isinstance(index.asof(d), Timestamp) + + def test_asof_datetime_partial(self): + index = pd.date_range("2010-01-01", periods=2, freq="m") + expected = Timestamp("2010-02-28") + result = index.asof("2010-02") + assert result == expected + assert not isinstance(result, Index) + + def test_nanosecond_index_access(self): + s = 
Series([Timestamp("20130101")]).values.view("i8")[0] + r = DatetimeIndex([s + 50 + i for i in range(100)]) + x = Series(np.random.randn(100), index=r) + + first_value = x.asof(x.index[0]) + + # this does not yet work, as parsing strings is done via dateutil + # assert first_value == x['2013-01-01 00:00:00.000000050+0000'] + + expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") + assert first_value == x[Timestamp(expected_ts)] + + def test_booleanindex(self, index): + bool_index = np.ones(len(index), dtype=bool) + bool_index[5:30:2] = False + + sub_index = index[bool_index] + + for i, val in enumerate(sub_index): + assert sub_index.get_loc(val) == i + + sub_index = index[list(bool_index)] + for i, val in enumerate(sub_index): + assert sub_index.get_loc(val) == i + + def test_fancy(self): + index = self.create_index() + sl = index[[1, 2, 3]] + for i in sl: + assert i == sl[sl.get_loc(i)] + + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + def test_empty_fancy(self, index, dtype): + empty_arr = np.array([], dtype=dtype) + empty_index = type(index)([]) + + assert index[[]].identical(empty_index) + assert index[empty_arr].identical(empty_index) + + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_empty_fancy_raises(self, index): + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. + empty_farr = np.array([], dtype=np.float_) + empty_index = type(index)([]) + + assert index[[]].identical(empty_index) + # np.ndarray only accepts ndarray of int & bool dtypes, so should Index + msg = r"arrays used as indices must be of integer \(or boolean\) type" + with pytest.raises(IndexError, match=msg): + index[empty_farr] + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, index, sort): + first = index[:20] + second = index[:10] + intersect = first.intersection(second, sort=sort) + if sort is None: + tm.assert_index_equal(intersect, second.sort_values()) + assert tm.equalContents(intersect, second) + + # Corner cases + inter = first.intersection(first, sort=sort) + assert inter is first + + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name + (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names + (Index([3, 4, 5, 6, 7]), False), + ], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_name_preservation(self, index2, keeps_name, sort): + index1 = Index([1, 2, 3, 4, 5], name="index") + expected = Index([3, 4, 5]) + result = index1.intersection(index2, sort) + + if keeps_name: + expected.name = "index" + + assert result.name == expected.name + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "first_name,second_name,expected_name", + [("A", "A", "A"), ("A", "B", None), (None, "B", None)], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_name_preservation2( + self, index, first_name, second_name, expected_name, sort + ): + first = index[5:20] + second = index[:10] + first.name = first_name + second.name = second_name + intersect = first.intersection(second, sort=sort) + assert intersect.name == expected_name + + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([4, 7, 6, 5, 3], name="index"), True), + (Index([4, 7, 6, 5, 3], name="other"), False), + ], + ) + @pytest.mark.parametrize("sort", [None, False]) + def 
test_intersection_monotonic(self, index2, keeps_name, sort): + index1 = Index([5, 3, 2, 4, 1], name="index") + expected = Index([5, 3, 4]) + + if keeps_name: + expected.name = "index" + + result = index1.intersection(index2, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "index2,expected_arr", + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): + # non-monotonic non-unique + index1 = Index(["A", "B", "A", "C"]) + expected = Index(expected_arr, dtype="object") + result = index1.intersection(index2, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersect_str_dates(self, sort): + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(["aa"], dtype=object) + result = i2.intersection(i1, sort=sort) + + assert len(result) == 0 + + def test_intersect_nosort(self): + result = pd.Index(["c", "b", "a"]).intersection(["b", "a"]) + expected = pd.Index(["b", "a"]) + tm.assert_index_equal(result, expected) + + def test_intersection_equal_sort(self): + idx = pd.Index(["c", "a", "b"]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=None), idx) + + @pytest.mark.xfail(reason="Not implemented") + def test_intersection_equal_sort_true(self): + # TODO decide on True behaviour + idx = pd.Index(["c", "a", "b"]) + sorted_ = pd.Index(["a", "b", "c"]) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + @pytest.mark.parametrize("sort", [None, False]) + def test_chained_union(self, sort): + # Chained unions handles names correctly + i1 = Index([1, 2], name="i1") + i2 = Index([5, 6], name="i2") + i3 = Index([3, 4], name="i3") + union = i1.union(i2.union(i3, sort=sort), sort=sort) + expected = i1.union(i2, sort=sort).union(i3, sort=sort) + tm.assert_index_equal(union, expected) + + j1 = Index([1, 2], name="j1") + j2 = Index([], name="j2") + j3 = Index([], name="j3") + union = j1.union(j2.union(j3, sort=sort), sort=sort) + expected = j1.union(j2, sort=sort).union(j3, sort=sort) + tm.assert_index_equal(union, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, index, sort): + first = index[5:20] + second = index[:10] + everything = index[:20] + + union = first.union(second, sort=sort) + if sort is None: + tm.assert_index_equal(union, everything.sort_values()) + assert tm.equalContents(union, everything) + + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) + def test_union_sort_other_special(self, slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + @pytest.mark.xfail(reason="Not implemented") + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) + def test_union_sort_special_true(self, slice_): + # TODO decide on True behaviour + # sort=True + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + + result = idx.union(other, sort=True) + expected = pd.Index([0, 1, 2]) + 
tm.assert_index_equal(result, expected) + + def test_union_sort_other_incomparable(self): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.Index([1, pd.Timestamp("2000")]) + # default (sort=None) + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) + + tm.assert_index_equal(result, idx) + + # sort=None + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1], sort=None) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + + @pytest.mark.xfail(reason="Not implemented") + def test_union_sort_other_incomparable_true(self): + # TODO decide on True behaviour + # sort=True + idx = pd.Index([1, pd.Timestamp("2000")]) + with pytest.raises(TypeError, match=".*"): + idx.union(idx[:1], sort=True) + + @pytest.mark.parametrize("klass", [np.array, Series, list]) + @pytest.mark.parametrize("sort", [None, False]) + def test_union_from_iterables(self, index, klass, sort): + # GH 10149 + first = index[5:20] + second = index[:10] + everything = index[:20] + + case = klass(second.values) + result = first.union(case, sort=sort) + if sort is None: + tm.assert_index_equal(result, everything.sort_values()) + assert tm.equalContents(result, everything) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_identity(self, index, sort): + first = index[5:20] + + union = first.union(first, sort=sort) + # i.e. identity is not preserved when sort is True + assert (union is first) is (not sort) + + # This should no longer be the same object, since [] is not consistent, + # both objects will be recast to dtype('O') + union = first.union([], sort=sort) + assert (union is first) is (not sort) + + union = Index([]).union(first, sort=sort) + assert (union is first) is (not sort) + + @pytest.mark.parametrize("first_list", [list("ba"), list()]) + @pytest.mark.parametrize("second_list", [list("ab"), list()]) + @pytest.mark.parametrize( + "first_name, second_name, expected_name", + [("A", "B", None), (None, "B", None), ("A", None, None)], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_union_name_preservation( + self, first_list, second_list, first_name, second_name, expected_name, sort + ): + first = Index(first_list, name=first_name) + second = Index(second_list, name=second_name) + union = first.union(second, sort=sort) + + vals = set(first_list).union(second_list) + + if sort is None and len(first_list) > 0 and len(second_list) > 0: + expected = Index(sorted(vals), name=expected_name) + tm.assert_index_equal(union, expected) + else: + expected = Index(vals, name=expected_name) + assert tm.equalContents(union, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_dt_as_obj(self, sort): + # TODO: Replace with fixturesult + index = self.create_index() + date_index = pd.date_range("2019-01-01", periods=10) + first_cat = index.union(date_index) + second_cat = index.union(index) + + if date_index.dtype == np.object_: + appended = np.append(index, date_index) + else: + appended = np.append(index, date_index.astype("O")) + + assert tm.equalContents(first_cat, appended) + assert tm.equalContents(second_cat, index) + tm.assert_contains_all(index, first_cat) + tm.assert_contains_all(index, second_cat) + tm.assert_contains_all(date_index, first_cat) + + @pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] + ) + def test_setops_disallow_true(self, method): + idx1 = pd.Index(["a", "b"]) + 
idx2 = pd.Index(["b", "c"]) + + with pytest.raises(ValueError, match="The 'sort' keyword only takes"): + getattr(idx1, method)(idx2, sort=True) + + def test_map_identity_mapping(self, indices): + # GH 12766 + tm.assert_index_equal(indices, indices.map(lambda x: x)) + + def test_map_with_tuples(self): + # GH 12766 + + # Test that returning a single tuple from an Index + # returns an Index. + index = tm.makeIntIndex(3) + result = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(i,) for i in index]) + tm.assert_index_equal(result, expected) + + # Test that returning a tuple from a map of a single index + # returns a MultiIndex object. + result = index.map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(i, i == 1) for i in index]) + tm.assert_index_equal(result, expected) + + def test_map_with_tuples_mi(self): + # Test that returning a single object from a MultiIndex + # returns an Index. + first_level = ["foo", "bar", "baz"] + multi_index = MultiIndex.from_tuples(zip(first_level, [1, 2, 3])) + reduced_index = multi_index.map(lambda x: x[0]) + tm.assert_index_equal(reduced_index, Index(first_level)) + + @pytest.mark.parametrize( + "attr", ["makeDateIndex", "makePeriodIndex", "makeTimedeltaIndex"] + ) + def test_map_tseries_indices_return_index(self, attr): + index = getattr(tm, attr)(10) + expected = Index([1] * 10) + result = index.map(lambda x: 1) + tm.assert_index_equal(expected, result) + + def test_map_tseries_indices_accsr_return_index(self): + date_index = tm.makeDateIndex(24, freq="h", name="hourly") + expected = Index(range(24), name="hourly") + tm.assert_index_equal(expected, date_index.map(lambda x: x.hour)) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index), + ], + ) + def test_map_dictlike_simple(self, mapper): + # GH 12756 + expected = Index(["foo", "bar", "baz"]) + index = tm.makeIntIndex(3) + result = index.map(mapper(expected.values, index)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [ + lambda values, index: {i: e for e, i in zip(values, index)}, + lambda values, index: pd.Series(values, index), + ], + ) + def test_map_dictlike(self, indices, mapper): + # GH 12756 + if isinstance(indices, CategoricalIndex): + # Tested in test_categorical + return + elif not indices.is_unique: + # Cannot map duplicated index + return + + if indices.empty: + # to match proper result coercion for uints + expected = Index([]) + else: + expected = Index(np.arange(len(indices), 0, -1)) + + result = indices.map(mapper(expected, indices)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "mapper", + [Series(["foo", 2.0, "baz"], index=[0, 2, -1]), {0: "foo", 2: 2.0, -1: "baz"}], + ) + def test_map_with_non_function_missing_values(self, mapper): + # GH 12756 + expected = Index([2.0, np.nan, "foo"]) + result = Index([2, 1, 0]).map(mapper) + + tm.assert_index_equal(expected, result) + + def test_map_na_exclusion(self): + index = Index([1.5, np.nan, 3, np.nan, 5]) + + result = index.map(lambda x: x * 2, na_action="ignore") + expected = index * 2 + tm.assert_index_equal(result, expected) + + def test_map_defaultdict(self): + index = Index([1, 2, 3]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" + result = index.map(default_dict) + expected = Index(["stuff", "blank", "blank"]) + tm.assert_index_equal(result, expected) + + def test_append_multiple(self): + index = Index(["a", "b", "c", 
"d", "e", "f"]) + + foos = [index[:2], index[2:4], index[4:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result, index) + + # empty + result = index.append([]) + tm.assert_index_equal(result, index) + + @pytest.mark.parametrize("name,expected", [("foo", "foo"), ("bar", None)]) + def test_append_empty_preserve_name(self, name, expected): + left = Index([], name="foo") + right = Index([1, 2, 3], name=name) + + result = left.append(right) + assert result.name == expected + + @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_name_preservation(self, index, second_name, expected, sort): + first = index[5:20] + second = index[:10] + answer = index[10:20] + + first.name = "name" + second.name = second_name + result = first.difference(second, sort=sort) + + assert tm.equalContents(result, answer) + + if expected is None: + assert result.name is None + else: + assert result.name == expected + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_empty_arg(self, index, sort): + first = index[5:20] + first.name == "name" + result = first.difference([], sort) + + assert tm.equalContents(result, first) + assert result.name == first.name + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_identity(self, index, sort): + first = index[5:20] + first.name == "name" + result = first.difference(first, sort) + + assert len(result) == 0 + assert result.name == first.name + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_sort(self, index, sort): + first = index[5:20] + second = index[:10] + + result = first.difference(second, sort) + expected = index[10:20] + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_symmetric_difference(self, sort): + # smoke + index1 = Index([5, 2, 3, 4], name="index1") + index2 = Index([2, 3, 4, 1]) + result = index1.symmetric_difference(index2, sort=sort) + expected = Index([5, 1]) + assert tm.equalContents(result, expected) + assert result.name is None + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + # __xor__ syntax + expected = index1 ^ index2 + assert tm.equalContents(result, expected) + assert result.name is None + + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) + def test_difference_incomparable(self, opname): + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) + op = operator.methodcaller(opname, b) + + # sort=None, the default + result = op(a) + expected = pd.Index([3, pd.Timestamp("2000"), 2, pd.Timestamp("1999")]) + if opname == "difference": + expected = expected[:2] + tm.assert_index_equal(result, expected) + + # sort=False + op = operator.methodcaller(opname, b, sort=False) + result = op(a) + tm.assert_index_equal(result, expected) + + @pytest.mark.xfail(reason="Not implemented") + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) + def test_difference_incomparable_true(self, opname): + # TODO decide on True behaviour + # # sort=True, raises + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) + op = operator.methodcaller(opname, b, sort=True) + + with pytest.raises(TypeError, match="Cannot compare"): + op(a) + + @pytest.mark.parametrize("sort", [None, False]) + def test_symmetric_difference_mi(self, 
sort): + index1 = MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) + index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) + result = index1.symmetric_difference(index2, sort=sort) + expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)]) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert tm.equalContents(result, expected) + + @pytest.mark.parametrize( + "index2,expected", + [ + (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), + (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0])), + ], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_symmetric_difference_missing(self, index2, expected, sort): + # GH 13514 change: {nan} - {nan} == {} + # (GH 6444, sorting of nans, is no longer an issue) + index1 = Index([1, np.nan, 2, 3]) + + result = index1.symmetric_difference(index2, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_symmetric_difference_non_index(self, sort): + index1 = Index([1, 2, 3, 4], name="index1") + index2 = np.array([2, 3, 4, 5]) + expected = Index([1, 5]) + result = index1.symmetric_difference(index2, sort=sort) + assert tm.equalContents(result, expected) + assert result.name == "index1" + + result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) + assert tm.equalContents(result, expected) + assert result.name == "new_name" + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_type(self, indices, sort): + # GH 20040 + # If taking difference of a set and itself, it + # needs to preserve the type of the index + if not indices.is_unique: + return + result = indices.difference(indices, sort=sort) + expected = indices.drop(indices) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_difference(self, indices, sort): + # GH 20040 + # Test that the intersection of an index with an + # empty index produces the same index as the difference + # of an index with itself. Test for all types + if not indices.is_unique: + return + inter = indices.intersection(indices.drop(indices)) + diff = indices.difference(indices, sort=sort) + tm.assert_index_equal(inter, diff) + + @pytest.mark.parametrize( + "index, expected", + [ + ("string", False), + ("bool", False), + ("categorical", False), + ("int", True), + ("datetime", False), + ("float", True), + ], + indirect=["index"], + ) + def test_is_numeric(self, index, expected): + assert index.is_numeric() is expected + + @pytest.mark.parametrize( + "index, expected", + [ + ("string", True), + ("bool", True), + ("categorical", False), + ("int", False), + ("datetime", False), + ("float", False), + ], + indirect=["index"], + ) + def test_is_object(self, index, expected): + assert index.is_object() is expected + + @pytest.mark.parametrize( + "index, expected", + [ + ("string", False), + ("bool", False), + ("categorical", False), + ("int", False), + ("datetime", True), + ("float", False), + ], + indirect=["index"], + ) + def test_is_all_dates(self, index, expected): + assert index.is_all_dates is expected + + def test_summary(self, indices): + self._check_method_works(Index._summary, indices) + + def test_summary_bug(self): + # GH3869` + ind = Index(["{other}%s", "~:{range}:0"], name="A") + result = ind._summary() + # shouldn't be formatted accidentally. 
+ assert "~:{range}:0" in result + assert "{other}%s" in result + + def test_format(self, indices): + self._check_method_works(Index.format, indices) + + def test_format_bug(self): + # GH 14626 + # windows has different precision on datetime.datetime.now (it doesn't + # include us since the default for Timestamp shows these but Index + # formatting does not we are skipping) + now = datetime.now() + if not str(now).endswith("000"): + index = Index([now]) + formatted = index.format() + expected = [str(index[0])] + assert formatted == expected + + Index([]).format() + + @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) + def test_format_missing(self, vals, nulls_fixture): + # 2845 + vals = list(vals) # Copy for each iteration + vals.append(nulls_fixture) + index = Index(vals) + + formatted = index.format() + expected = [str(index[0]), str(index[1]), str(index[2]), "NaN"] + + assert formatted == expected + assert index[3] is nulls_fixture + + def test_format_with_name_time_info(self): + # bug I fixed 12/20/2011 + dates = date_range("2011-01-01 04:00:00", periods=10, name="something") + + formatted = dates.format(name=True) + assert formatted[0] == "something" + + def test_format_datetime_with_time(self): + t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) + + result = t.format() + expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] + assert len(result) == 2 + assert result == expected + + @pytest.mark.parametrize("op", ["any", "all"]) + def test_logical_compat(self, op): + index = self.create_index() + assert getattr(index, op)() == getattr(index.values, op)() + + def _check_method_works(self, method, index): + method(index) + + def test_get_indexer(self): + index1 = Index([1, 2, 3, 4, 5]) + index2 = Index([2, 4, 6]) + + r1 = index1.get_indexer(index2) + e1 = np.array([1, 3, -1], dtype=np.intp) + tm.assert_almost_equal(r1, e1) + + @pytest.mark.parametrize("reverse", [True, False]) + @pytest.mark.parametrize( + "expected,method", + [ + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "pad"), + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "ffill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "backfill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "bfill"), + ], + ) + def test_get_indexer_methods(self, reverse, expected, method): + index1 = Index([1, 2, 3, 4, 5]) + index2 = Index([2, 4, 6]) + + if reverse: + index1 = index1[::-1] + expected = expected[::-1] + + result = index2.get_indexer(index1, method=method) + tm.assert_almost_equal(result, expected) + + def test_get_indexer_invalid(self): + # GH10411 + index = Index(np.arange(10)) + + with pytest.raises(ValueError, match="tolerance argument"): + index.get_indexer([1, 0], tolerance=1) + + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], limit=1) + + @pytest.mark.parametrize( + "method, tolerance, indexer, expected", + [ + ("pad", None, [0, 5, 9], [0, 5, 9]), + ("backfill", None, [0, 5, 9], [0, 5, 9]), + ("nearest", None, [0, 5, 9], [0, 5, 9]), + ("pad", 0, [0, 5, 9], [0, 5, 9]), + ("backfill", 0, [0, 5, 9], [0, 5, 9]), + ("nearest", 0, [0, 5, 9], [0, 5, 9]), + ("pad", None, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", None, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", None, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 1, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", 1, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", 1, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), + ("backfill", 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), + ("nearest", 0.2, [0.2, 1.8, 8.5], [0, 2, 
-1]), + ], + ) + def test_get_indexer_nearest(self, method, tolerance, indexer, expected): + index = Index(np.arange(10)) + + actual = index.get_indexer(indexer, method=method, tolerance=tolerance) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + @pytest.mark.parametrize("listtype", [list, tuple, Series, np.array]) + @pytest.mark.parametrize( + "tolerance, expected", + list( + zip( + [[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], [0.1, 0.5, 0.5]], + [[0, 2, -1], [0, -1, -1], [-1, 2, 9]], + ) + ), + ) + def test_get_indexer_nearest_listlike_tolerance( + self, tolerance, expected, listtype + ): + index = Index(np.arange(10)) + + actual = index.get_indexer( + [0.2, 1.8, 8.5], method="nearest", tolerance=listtype(tolerance) + ) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + def test_get_indexer_nearest_error(self): + index = Index(np.arange(10)) + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], method="nearest", limit=1) + + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_indexer([1, 0], method="nearest", tolerance=[1, 2, 3]) + + @pytest.mark.parametrize( + "method,expected", + [("pad", [8, 7, 0]), ("backfill", [9, 8, 1]), ("nearest", [9, 7, 0])], + ) + def test_get_indexer_nearest_decreasing(self, method, expected): + index = Index(np.arange(10))[::-1] + + actual = index.get_indexer([0, 5, 9], method=method) + tm.assert_numpy_array_equal(actual, np.array([9, 4, 0], dtype=np.intp)) + + actual = index.get_indexer([0.2, 1.8, 8.5], method=method) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) + + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", np.array([-1, 0, 1, 1], dtype=np.intp)), + ("backfill", np.array([0, 0, 1, -1], dtype=np.intp)), + ], + ) + def test_get_indexer_strings(self, method, expected): + index = pd.Index(["b", "c"]) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self): + index = pd.Index(["b", "c"]) + + msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, Float64Index]) + def test_get_indexer_numeric_index_boolean_target(self, idx_class): + # GH 16877 + + numeric_index = idx_class(RangeIndex((4))) + result = numeric_index.get_indexer([True, False, True]) + expected = np.array([-1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_with_NA_values( + self, unique_nulls_fixture, unique_nulls_fixture2 + ): + # GH 22332 + # check pairwise, that no pair of na values + # is mangled + if unique_nulls_fixture is unique_nulls_fixture2: + return # skip it, values are not unique + arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) + index = pd.Index(arr, dtype=np.object) + result = index.get_indexer( + [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"] + ) + expected = np.array([0, 1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) + def 
test_get_loc(self, method): + index = pd.Index([0, 1, 2]) + assert index.get_loc(1, method=method) == 1 + + if method: + assert index.get_loc(1, method=method, tolerance=0) == 1 + + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) + def test_get_loc_raises_bad_label(self, method): + index = pd.Index([0, 1, 2]) + if method: + msg = "not supported between" + else: + msg = "invalid key" + + with pytest.raises(TypeError, match=msg): + index.get_loc([1, 2], method=method) + + @pytest.mark.parametrize( + "method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)] + ) + def test_get_loc_tolerance(self, method, loc): + index = pd.Index([0, 1, 2]) + assert index.get_loc(1.1, method) == loc + assert index.get_loc(1.1, method, tolerance=1) == loc + + @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) + def test_get_loc_outside_tolerance_raises(self, method): + index = pd.Index([0, 1, 2]) + with pytest.raises(KeyError, match="1.1"): + index.get_loc(1.1, method, tolerance=0.05) + + def test_get_loc_bad_tolerance_raises(self): + index = pd.Index([0, 1, 2]) + with pytest.raises(ValueError, match="must be numeric"): + index.get_loc(1.1, "nearest", tolerance="invalid") + + def test_get_loc_tolerance_no_method_raises(self): + index = pd.Index([0, 1, 2]) + with pytest.raises(ValueError, match="tolerance .* valid if"): + index.get_loc(1.1, tolerance=1) + + def test_get_loc_raises_missized_tolerance(self): + index = pd.Index([0, 1, 2]) + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_loc(1.1, "nearest", tolerance=[1, 1]) + + def test_get_loc_raises_object_nearest(self): + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="nearest") + + def test_get_loc_raises_object_tolerance(self): + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="pad", tolerance="invalid") + + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_locs(self, dtype): + index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) + n = len(index) + + assert index.slice_locs(start=2) == (2, n) + assert index.slice_locs(start=3) == (3, n) + assert index.slice_locs(3, 8) == (3, 6) + assert index.slice_locs(5, 10) == (3, n) + assert index.slice_locs(end=8) == (0, 6) + assert index.slice_locs(end=9) == (0, 7) + + # reversed + index2 = index[::-1] + assert index2.slice_locs(8, 2) == (2, 6) + assert index2.slice_locs(7, 3) == (2, 5) + + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_float_locs(self, dtype): + index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) + n = len(index) + assert index.slice_locs(5.0, 10.0) == (3, n) + assert index.slice_locs(4.5, 10.5) == (3, 8) + + index2 = index[::-1] + assert index2.slice_locs(8.5, 1.5) == (2, 6) + assert index2.slice_locs(10.5, -1) == (0, n) + + def test_slice_locs_dup(self): + index = Index(["a", "a", "b", "c", "d", "d"]) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) + + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_locs_dup_numeric(self, dtype): + index = Index(np.array([10, 12, 12, 
with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_slice_keep_name(self): + idx = self._holder([1, 2], name="asdf") + assert idx.name == idx[1:].name + + +class TestInt64Index(NumericInt): + _dtype = "int64" + _holder = Int64Index + + @pytest.fixture( + params=[range(0, 20, 2), range(19, -1, -1)], ids=["index_inc", "index_dec"] + ) + def indices(self, request): + return Int64Index(request.param) + + def create_index(self): + # return Int64Index(np.arange(5, dtype="int64")) + return Int64Index(range(0, 20, 2)) + + def test_constructor(self): + # pass list, coerce fine + index = Int64Index([-5, 0, 1, 2]) + expected = Index([-5, 0, 1, 2], dtype=np.int64) + tm.assert_index_equal(index, expected) + + # from iterable + index = Int64Index(iter([-5, 0, 1, 2])) + tm.assert_index_equal(index, expected) + + # scalar raise Exception + msg = ( + r"Int64Index\(\.\.\.\) must be called with a collection of some " + "kind, 5 was passed" + ) + with pytest.raises(TypeError, match=msg): + Int64Index(5) + + # copy + arr = index.values + new_index = Int64Index(arr, copy=True) + tm.assert_index_equal(new_index, index) + val = arr[0] + 3000 + + # this should not change index + arr[0] = val + assert new_index[0] != val + + # interpret list-like + expected = Int64Index([5, 0]) + for cls in [Index, Int64Index]: + for idx in [ + cls([5, 0], dtype="int64"), + cls(np.array([5, 0]), dtype="int64"), + cls(Series([5, 0]), dtype="int64"), + ]: + tm.assert_index_equal(idx, expected) + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = Int64Index(arr) + assert index.values.dtype == np.int64 + tm.assert_index_equal(index, Index(arr)) + + # preventing casting + arr = np.array([1, "2", 3, "4"], dtype=object) + with pytest.raises(TypeError, match="casting"): + Int64Index(arr) + + arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] + with pytest.raises(TypeError, match="casting"): + Int64Index(arr_with_floats) + + def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): + + # see gh-15832 + msg = "Trying to coerce negative values to unsigned integers" + + with pytest.raises(OverflowError, match=msg): + Index([-1], dtype=uint_dtype) + + def test_constructor_unwraps_index(self): + idx = pd.Index([1, 2]) + result = pd.Int64Index(idx) + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result._data, expected) + + def test_coerce_list(self): + # coerce things + arr = Index([1, 2, 3, 4]) + assert isinstance(arr, Int64Index) + + # but not if explicit dtype passed + arr = Index([1, 2, 3, 4], dtype=object) + assert isinstance(arr, Index) + + def test_get_indexer(self): + index = self.create_index() + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_nan(self): + # GH 7820 + result = Index([1, 2, np.nan]).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_intersection(self): + index = 
self.create_index() + other = Index([1, 2, 3, 4, 5]) + result = index.intersection(other) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index) + expected = Index( + np.sort(np.asarray(np.intersect1d(index.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + def test_join_inner(self): + index = self.create_index() + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = index.join(other, how="inner", return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Int64Index([2, 12]) + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([4, 1], dtype=np.intp) + + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True) + + res2 = index.intersection(other_mono) + tm.assert_index_equal(res, res2) + + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([1, 4], dtype=np.intp) + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_left(self): + index = self.create_index() + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = index.join(other, how="left", return_indexers=True) + eres = index + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) + + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # non-unique + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) + eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self): + index = self.create_index() + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = index.join(other, how="right", return_indexers=True) + eres = other + elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) + + assert isinstance(other, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + assert ridx is None + + # monotonic + res, lidx, ridx = index.join(other_mono, how="right", return_indexers=True) + eres = other_mono + elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) + assert isinstance(other, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + assert ridx is None + + # non-unique + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + res, lidx, ridx 
= idx.join(idx2, how="right", return_indexers=True) + eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_non_int_index(self): + index = self.create_index() + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = index.join(other, how="outer") + outer2 = other.join(index, how="outer") + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) + tm.assert_index_equal(outer, outer2) + tm.assert_index_equal(outer, expected) + + inner = index.join(other, how="inner") + inner2 = other.join(index, how="inner") + expected = Index([6, 8, 10]) + tm.assert_index_equal(inner, inner2) + tm.assert_index_equal(inner, expected) + + left = index.join(other, how="left") + tm.assert_index_equal(left, index.astype(object)) + + left2 = other.join(index, how="left") + tm.assert_index_equal(left2, other) + + right = index.join(other, how="right") + tm.assert_index_equal(right, other) + + right2 = other.join(index, how="right") + tm.assert_index_equal(right2, index.astype(object)) + + def test_join_outer(self): + index = self.create_index() + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = index.join(other, how="outer", return_indexers=True) + noidx_res = index.join(other, how="outer") + tm.assert_index_equal(res, noidx_res) + + eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], dtype=np.intp + ) + + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index.join(other_mono, how="outer", return_indexers=True) + noidx_res = index.join(other_mono, how="outer") + tm.assert_index_equal(res, noidx_res) + + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], dtype=np.intp + ) + assert isinstance(res, Int64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + +class TestUInt64Index(NumericInt): + + _dtype = "uint64" + _holder = UInt64Index + + @pytest.fixture( + params=[ + [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25], + [2 ** 63 + 25, 2 ** 63 + 20, 2 ** 63 + 15, 2 ** 63 + 10, 2 ** 63], + ], + ids=["index_inc", "index_dec"], + ) + def indices(self, request): + return UInt64Index(request.param) + + @pytest.fixture + def index_large(self): + # large values used in TestUInt64Index where no compat needed with Int64/Float64 + large = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] + return UInt64Index(large) + + def create_index(self): + # compat with shared Int64/Float64 tests; use index_large for UInt64 only tests + return UInt64Index(np.arange(5, dtype="uint64")) + + def test_constructor(self): + idx = UInt64Index([1, 2, 3]) + res = Index([1, 2, 3], dtype=np.uint64) + tm.assert_index_equal(res, idx) + + idx = UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63], dtype=np.uint64) + tm.assert_index_equal(res, idx) + + idx = 
UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63]) + tm.assert_index_equal(res, idx) + + idx = Index([-1, 2 ** 63], dtype=object) + res = Index(np.array([-1, 2 ** 63], dtype=object)) + tm.assert_index_equal(res, idx) + + # https://github.com/pandas-dev/pandas/issues/29526 + idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) + res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + tm.assert_index_equal(res, idx) + + def test_get_indexer(self, index_large): + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target) + expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_intersection(self, index_large): + other = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20]) + result = index_large.intersection(other) + expected = Index(np.sort(np.intersect1d(index_large.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index_large) + expected = Index( + np.sort(np.asarray(np.intersect1d(index_large.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + def test_join_inner(self, index_large): + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="inner", return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = UInt64Index(2 ** 63 + np.array([10, 25], dtype="uint64")) + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([5, 2], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="inner", return_indexers=True + ) + + res2 = index_large.intersection(other_mono) + tm.assert_index_equal(res, res2) + + elidx = np.array([1, 4], dtype=np.intp) + eridx = np.array([3, 5], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_left(self, index_large): + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="left", return_indexers=True) + eres = index_large + eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 3, -1, 
-1, 5], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + assert lidx is None + tm.assert_numpy_array_equal(ridx, eridx) + + # non-unique + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) + + # 1 is in idx2, so it should be x2 + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self, index_large): + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) + + # not monotonic + res, lidx, ridx = index_large.join(other, how="right", return_indexers=True) + eres = other + elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(lidx, elidx) + assert isinstance(other, UInt64Index) + tm.assert_index_equal(res, eres) + assert ridx is None + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="right", return_indexers=True + ) + eres = other_mono + elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) + + assert isinstance(other, UInt64Index) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_index_equal(res, eres) + assert ridx is None + + # non-unique + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) + + # 1 is in idx2, so it should be x2 + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) + + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + def test_join_non_int_index(self, index_large): + other = Index( + 2 ** 63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object + ) + + outer = index_large.join(other, how="outer") + outer2 = other.join(index_large, how="outer") + expected = Index( + 2 ** 63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") + ) + tm.assert_index_equal(outer, outer2) + tm.assert_index_equal(outer, expected) + + inner = index_large.join(other, how="inner") + inner2 = other.join(index_large, how="inner") + expected = Index(2 ** 63 + np.array([10, 20], dtype="uint64")) + tm.assert_index_equal(inner, inner2) + tm.assert_index_equal(inner, expected) + + left = index_large.join(other, how="left") + tm.assert_index_equal(left, index_large.astype(object)) + + left2 = other.join(index_large, how="left") + tm.assert_index_equal(left2, other) + + right = index_large.join(other, how="right") + tm.assert_index_equal(right, other) + + right2 = other.join(index_large, how="right") + tm.assert_index_equal(right2, index_large.astype(object)) + + def test_join_outer(self, index_large): + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = index_large.join(other, how="outer", 
return_indexers=True) + noidx_res = index_large.join(other, how="outer") + tm.assert_index_equal(res, noidx_res) + + eres = UInt64Index( + 2 ** 63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64") + ) + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = index_large.join( + other_mono, how="outer", return_indexers=True + ) + noidx_res = index_large.join(other_mono, how="outer") + tm.assert_index_equal(res, noidx_res) + + elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) + eridx = np.array([-1, 0, 1, 2, 3, 4, -1, -1, 5], dtype=np.intp) + + assert isinstance(res, UInt64Index) + tm.assert_index_equal(res, eres) + tm.assert_numpy_array_equal(lidx, elidx) + tm.assert_numpy_array_equal(ridx, eridx) + + +@pytest.mark.parametrize("dtype", ["int64", "uint64"]) +def test_int_float_union_dtype(dtype): + # https://github.com/pandas-dev/pandas/issues/26778 + # [u]int | float -> float + index = pd.Index([0, 2, 3], dtype=dtype) + other = pd.Float64Index([0.5, 1.5]) + expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0]) + result = index.union(other) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + +def test_range_float_union_dtype(): + # https://github.com/pandas-dev/pandas/issues/26778 + index = pd.RangeIndex(start=0, stop=3) + other = pd.Float64Index([0.5, 1.5]) + result = index.union(other) + expected = pd.Float64Index([0.0, 0.5, 1, 1.5, 2.0]) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + +def test_uint_index_does_not_convert_to_float64(): + # https://github.com/pandas-dev/pandas/issues/28279 + # https://github.com/pandas-dev/pandas/issues/28023 + series = pd.Series( + [0, 1, 2, 3, 4, 5], + index=[ + 7606741985629028552, + 17876870360202815256, + 17876870360202815256, + 13106359306506049338, + 8991270399732411471, + 8991270399732411472, + ], + ) + + result = series.loc[[7606741985629028552, 17876870360202815256]] + + expected = UInt64Index( + [7606741985629028552, 17876870360202815256, 17876870360202815256], + dtype="uint64", + ) + tm.assert_index_equal(result.index, expected) + + tm.assert_equal(result, series[:3]) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/test_numpy_compat.py b/venv/Lib/site-packages/pandas/tests/indexes/test_numpy_compat.py new file mode 100644 index 0000000..5835566 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/test_numpy_compat.py @@ -0,0 +1,130 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + Float64Index, + Index, + Int64Index, + PeriodIndex, + TimedeltaIndex, + UInt64Index, + _np_version_under1p17, + _np_version_under1p18, +) +import pandas._testing as tm +from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin + + +@pytest.mark.parametrize( + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda x: x.__name__, +) +def test_numpy_ufuncs_basic(indices, func): + # test ufuncs of numpy, see: + # 
http://docs.scipy.org/doc/numpy/reference/ufuncs.html + + idx = indices + if isinstance(idx, DatetimeIndexOpsMixin): + # raise TypeError or ValueError (PeriodIndex) + with pytest.raises(Exception): + with np.errstate(all="ignore"): + func(idx) + elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): + # coerces to float (e.g. np.sin) + with np.errstate(all="ignore"): + result = func(idx) + exp = Index(func(idx.values), name=idx.name) + + tm.assert_index_equal(result, exp) + assert isinstance(result, Float64Index) + else: + # raise AttributeError or TypeError + if len(idx) == 0: + pass + else: + with pytest.raises(Exception): + with np.errstate(all="ignore"): + func(idx) + + +@pytest.mark.parametrize( + "func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__ +) +def test_numpy_ufuncs_other(indices, func): + # test ufuncs of numpy, see: + # http://docs.scipy.org/doc/numpy/reference/ufuncs.html + + idx = indices + if isinstance(idx, (DatetimeIndex, TimedeltaIndex)): + + if not _np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: + # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64 + result = func(idx) + assert isinstance(result, np.ndarray) + + elif not _np_version_under1p17 and func in [np.isfinite]: + # ok under numpy >= 1.17 + # Results in bool array + result = func(idx) + assert isinstance(result, np.ndarray) + else: + # raise TypeError or ValueError (PeriodIndex) + with pytest.raises(Exception): + func(idx) + + elif isinstance(idx, PeriodIndex): + # raise TypeError or ValueError (PeriodIndex) + with pytest.raises(Exception): + func(idx) + + elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): + # Results in bool array + result = func(idx) + assert isinstance(result, np.ndarray) + assert not isinstance(result, Index) + else: + if len(idx) == 0: + pass + else: + with pytest.raises(Exception): + func(idx) + + +def test_elementwise_comparison_warning(): + # https://github.com/pandas-dev/pandas/issues/22698#issuecomment-458968300 + # np.array([1, 2]) == 'a' returns False, and produces a + # FutureWarning that it'll be [False, False] in the future. + # We just want to ensure that comes through. + # When NumPy dev actually enforces this change, we'll need to skip + # this test. + idx = Index([1, 2]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = idx == "a" + + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/test_setops.py b/venv/Lib/site-packages/pandas/tests/indexes/test_setops.py new file mode 100644 index 0000000..abfa413 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/test_setops.py @@ -0,0 +1,107 @@ +""" +The tests in this package are to ensure the proper resultant dtypes of +set operations. 
+""" +import itertools as it + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +import pandas._testing as tm +from pandas.api.types import pandas_dtype +from pandas.tests.indexes.conftest import indices_dict + +COMPATIBLE_INCONSISTENT_PAIRS = { + (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), + (Float64Index, Int64Index): (tm.makeFloatIndex, tm.makeIntIndex), + (Float64Index, RangeIndex): (tm.makeFloatIndex, tm.makeIntIndex), + (Float64Index, UInt64Index): (tm.makeFloatIndex, tm.makeUIntIndex), +} + + +@pytest.fixture(params=it.combinations(indices_dict, 2), ids="-".join) +def index_pair(request): + """ + Create all combinations of 2 index types. + """ + return indices_dict[request.param[0]], indices_dict[request.param[1]] + + +def test_union_same_types(indices): + # Union with a non-unique, non-monotonic index raises error + # Only needed for bool index factory + idx1 = indices.sort_values() + idx2 = indices.sort_values() + assert idx1.union(idx2).dtype == idx1.dtype + + +def test_union_different_types(index_pair): + # GH 23525 + idx1, idx2 = index_pair + type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) + if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: + pytest.xfail("This test only considers non compatible indexes.") + + if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): + pytest.xfail("This test doesn't consider multiindixes.") + + if is_dtype_equal(idx1.dtype, idx2.dtype): + pytest.xfail("This test only considers non matching dtypes.") + + # A union with a CategoricalIndex (even as dtype('O')) and a + # non-CategoricalIndex can only be made if both indices are monotonic. + # This is true before this PR as well. 
+ + # Union with a non-unique, non-monotonic index raises error + # This applies to the boolean index + idx1 = idx1.sort_values() + idx2 = idx2.sort_values() + + assert idx1.union(idx2).dtype == np.dtype("O") + assert idx2.union(idx1).dtype == np.dtype("O") + + +@pytest.mark.parametrize("idx_fact1,idx_fact2", COMPATIBLE_INCONSISTENT_PAIRS.values()) +def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): + # GH 23525 + idx1 = idx_fact1(10) + idx2 = idx_fact2(20) + + res1 = idx1.union(idx2) + res2 = idx2.union(idx1) + + assert res1.dtype in (idx1.dtype, idx2.dtype) + assert res2.dtype in (idx1.dtype, idx2.dtype) + + +@pytest.mark.parametrize( + "left, right, expected", + [ + ("int64", "int64", "int64"), + ("int64", "uint64", "object"), + ("int64", "float64", "float64"), + ("uint64", "float64", "float64"), + ("uint64", "uint64", "uint64"), + ("float64", "float64", "float64"), + ("datetime64[ns]", "int64", "object"), + ("datetime64[ns]", "uint64", "object"), + ("datetime64[ns]", "float64", "object"), + ("datetime64[ns, CET]", "int64", "object"), + ("datetime64[ns, CET]", "uint64", "object"), + ("datetime64[ns, CET]", "float64", "object"), + ("Period[D]", "int64", "object"), + ("Period[D]", "uint64", "object"), + ("Period[D]", "float64", "object"), + ], +) +def test_union_dtypes(left, right, expected): + left = pandas_dtype(left) + right = pandas_dtype(right) + a = pd.Index([], dtype=left) + b = pd.Index([], dtype=right) + result = (a | b).dtype + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/__init__.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_astype.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_astype.py new file mode 100644 index 0000000..82c9d99 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_astype.py @@ -0,0 +1,123 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Float64Index, + Index, + Int64Index, + NaT, + Timedelta, + TimedeltaIndex, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndex: + def test_astype_object(self): + idx = timedelta_range(start="1 days", periods=4, freq="D", name="idx") + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = TimedeltaIndex( + [timedelta(days=1), timedelta(days=2), NaT, timedelta(days=4)], name="idx" + ) + expected_list = [ + Timedelta("1 days"), + Timedelta("2 days"), + NaT, + Timedelta("4 days"), + ] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name="idx") + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index( + [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64 + ) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + expected = 
Index(str(x) for x in idx) + tm.assert_index_equal(result, expected) + + rng = timedelta_range("1 days", periods=10) + result = rng.astype("i8") + tm.assert_index_equal(result, Index(rng.asi8)) + tm.assert_numpy_array_equal(rng.asi8, result.values) + + def test_astype_uint(self): + arr = timedelta_range("1H", periods=2) + expected = pd.UInt64Index( + np.array([3600000000000, 90000000000000], dtype="uint64") + ) + + tm.assert_index_equal(arr.astype("uint64"), expected) + tm.assert_index_equal(arr.astype("uint32"), expected) + + def test_astype_timedelta64(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + + result = idx.astype("timedelta64") + expected = Float64Index([1e14] + [np.NaN] * 3, dtype="float64") + tm.assert_index_equal(result, expected) + + result = idx.astype("timedelta64[ns]") + tm.assert_index_equal(result, idx) + assert result is not idx + + result = idx.astype("timedelta64[ns]", copy=False) + tm.assert_index_equal(result, idx) + assert result is idx + + @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) + def test_astype_raises(self, dtype): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + msg = "Cannot cast TimedeltaArray to dtype" + with pytest.raises(TypeError, match=msg): + idx.astype(dtype) + + def test_astype_category(self): + obj = pd.timedelta_range("1H", periods=2, freq="H") + + result = obj.astype("category") + expected = pd.CategoricalIndex([pd.Timedelta("1H"), pd.Timedelta("2H")]) + tm.assert_index_equal(result, expected) + + result = obj._data.astype("category") + expected = expected.values + tm.assert_categorical_equal(result, expected) + + def test_astype_array_fallback(self): + obj = pd.timedelta_range("1H", periods=2) + result = obj.astype(bool) + expected = pd.Index(np.array([True, True])) + tm.assert_index_equal(result, expected) + + result = obj._data.astype(bool) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py new file mode 100644 index 0000000..39abbf5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_constructors.py @@ -0,0 +1,212 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import Timedelta, TimedeltaIndex, timedelta_range, to_timedelta +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestTimedeltaIndex: + def test_int64_nocopy(self): + # GH#23539 check that a copy isn't made when we pass int64 data + # and copy=False + arr = np.arange(10, dtype=np.int64) + tdi = TimedeltaIndex(arr, copy=False) + assert tdi._data._data.base is arr + + def test_infer_from_tdi(self): + # GH#23539 + # fast-path for inferring a frequency if the passed data already + # has one + tdi = pd.timedelta_range("1 second", periods=10 ** 7, freq="1s") + + result = pd.TimedeltaIndex(tdi, freq="infer") + assert result.freq == tdi.freq + + # check that inferred_freq was not called by checking that the + # value has not been cached + assert "inferred_freq" not in getattr(result, "_cache", {}) + + def test_infer_from_tdi_mismatch(self): + # GH#23539 + # fast-path for invalidating a frequency if the passed data already + # has one and it does not match the `freq` input + tdi = pd.timedelta_range("1 second", periods=100, freq="1s") + + msg = ( + "Inferred frequency .* from 
passed values does " + "not conform to passed frequency" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(tdi, freq="D") + + with pytest.raises(ValueError, match=msg): + # GH#23789 + TimedeltaArray(tdi, freq="D") + + def test_dt64_data_invalid(self): + # GH#23539 + # passing tz-aware DatetimeIndex raises, naive or ndarray[datetime64] + # raise as of GH#29794 + dti = pd.date_range("2016-01-01", periods=3) + + msg = "cannot be converted to timedelta64" + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(dti.tz_localize("Europe/Brussels")) + + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(dti) + + with pytest.raises(TypeError, match=msg): + TimedeltaIndex(np.asarray(dti)) + + def test_float64_ns_rounded(self): + # GH#23539 without specifying a unit, floats are regarded as nanos, + # and fractional portions are truncated + tdi = TimedeltaIndex([2.3, 9.7]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # integral floats are non-lossy + tdi = TimedeltaIndex([2.0, 9.0]) + expected = TimedeltaIndex([2, 9]) + tm.assert_index_equal(tdi, expected) + + # NaNs get converted to NaT + tdi = TimedeltaIndex([2.0, np.nan]) + expected = TimedeltaIndex([pd.Timedelta(nanoseconds=2), pd.NaT]) + tm.assert_index_equal(tdi, expected) + + def test_float64_unit_conversion(self): + # GH#23539 + tdi = TimedeltaIndex([1.5, 2.25], unit="D") + expected = TimedeltaIndex([Timedelta(days=1.5), Timedelta(days=2.25)]) + tm.assert_index_equal(tdi, expected) + + def test_construction_base_constructor(self): + arr = [pd.Timedelta("1 days"), pd.NaT, pd.Timedelta("3 days")] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta("1 days")] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.TimedeltaIndex(np.array(arr))) + + def test_constructor(self): + expected = TimedeltaIndex( + [ + "1 days", + "1 days 00:00:05", + "2 days", + "2 days 00:00:02", + "0 days 00:00:03", + ] + ) + result = TimedeltaIndex( + [ + "1 days", + "1 days, 00:00:05", + np.timedelta64(2, "D"), + timedelta(days=2, seconds=2), + pd.offsets.Second(3), + ] + ) + tm.assert_index_equal(result, expected) + + # unicode + result = TimedeltaIndex( + [ + "1 days", + "1 days, 00:00:05", + np.timedelta64(2, "D"), + timedelta(days=2, seconds=2), + pd.offsets.Second(3), + ] + ) + + expected = TimedeltaIndex( + ["0 days 00:00:00", "0 days 00:00:01", "0 days 00:00:02"] + ) + tm.assert_index_equal(TimedeltaIndex(range(3), unit="s"), expected) + expected = TimedeltaIndex( + ["0 days 00:00:00", "0 days 00:00:05", "0 days 00:00:09"] + ) + tm.assert_index_equal(TimedeltaIndex([0, 5, 9], unit="s"), expected) + expected = TimedeltaIndex( + ["0 days 00:00:00.400", "0 days 00:00:00.450", "0 days 00:00:01.200"] + ) + tm.assert_index_equal(TimedeltaIndex([400, 450, 1200], unit="ms"), expected) + + def test_constructor_iso(self): + # GH #21877 + expected = timedelta_range("1s", periods=9, freq="s") + durations = ["P0DT0H0M{}S".format(i) for i in range(1, 10)] + result = to_timedelta(durations) + tm.assert_index_equal(result, expected) + + def test_constructor_coverage(self): + rng = timedelta_range("1 days", periods=10.5) + exp = timedelta_range("1 days", periods=10) + tm.assert_index_equal(rng, exp) + + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + timedelta_range(start="1 days", 
periods="foo", freq="D") + + with pytest.raises(TypeError): + TimedeltaIndex("1 days") + + # generator expression + gen = (timedelta(i) for i in range(10)) + result = TimedeltaIndex(gen) + expected = TimedeltaIndex([timedelta(i) for i in range(10)]) + tm.assert_index_equal(result, expected) + + # NumPy string array + strings = np.array(["1 days", "2 days", "3 days"]) + result = TimedeltaIndex(strings) + expected = to_timedelta([1, 2, 3], unit="d") + tm.assert_index_equal(result, expected) + + from_ints = TimedeltaIndex(expected.asi8) + tm.assert_index_equal(from_ints, expected) + + # non-conforming freq + msg = ( + "Inferred frequency None from passed values does not conform to " + "passed frequency D" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaIndex(["1 days", "2 days", "4 days"], freq="D") + + msg = ( + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" + ) + with pytest.raises(ValueError, match=msg): + timedelta_range(periods=10, freq="D") + + def test_constructor_name(self): + idx = timedelta_range(start="1 days", periods=1, freq="D", name="TEST") + assert idx.name == "TEST" + + # GH10025 + idx2 = TimedeltaIndex(idx, name="something else") + assert idx2.name == "something else" + + def test_constructor_no_precision_raises(self): + # GH-24753, GH-24739 + + msg = "with no precision is not allowed" + with pytest.raises(ValueError, match=msg): + pd.TimedeltaIndex(["2000"], dtype="timedelta64") + + with pytest.raises(ValueError, match=msg): + pd.Index(["2000"], dtype="timedelta64") + + def test_constructor_wrong_precision_raises(self): + with pytest.raises(ValueError): + pd.TimedeltaIndex(["2000"], dtype="timedelta64[us]") diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_formats.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_formats.py new file mode 100644 index 0000000..1dfc5b5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_formats.py @@ -0,0 +1,90 @@ +import pytest + +import pandas as pd +from pandas import TimedeltaIndex + + +class TestTimedeltaIndexRendering: + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) + def test_representation(self, method): + idx1 = TimedeltaIndex([], freq="D") + idx2 = TimedeltaIndex(["1 days"], freq="D") + idx3 = TimedeltaIndex(["1 days", "2 days"], freq="D") + idx4 = TimedeltaIndex(["1 days", "2 days", "3 days"], freq="D") + idx5 = TimedeltaIndex(["1 days 00:00:01", "2 days", "3 days"]) + + exp1 = "TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')" + + exp2 = "TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', freq='D')" + + exp3 = "TimedeltaIndex(['1 days', '2 days'], dtype='timedelta64[ns]', freq='D')" + + exp4 = ( + "TimedeltaIndex(['1 days', '2 days', '3 days'], " + "dtype='timedelta64[ns]', freq='D')" + ) + + exp5 = ( + "TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " + "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)" + ) + + with pd.option_context("display.width", 300): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5] + ): + result = getattr(idx, method)() + assert result == expected + + def test_representation_to_series(self): + idx1 = TimedeltaIndex([], freq="D") + idx2 = TimedeltaIndex(["1 days"], freq="D") + idx3 = TimedeltaIndex(["1 days", "2 days"], freq="D") + idx4 = TimedeltaIndex(["1 days", "2 days", "3 days"], freq="D") + idx5 = TimedeltaIndex(["1 days 00:00:01", "2 days", "3 days"]) + + exp1 = """Series([], dtype: timedelta64[ns])""" 
+ + exp2 = "0 1 days\ndtype: timedelta64[ns]" + + exp3 = "0 1 days\n1 2 days\ndtype: timedelta64[ns]" + + exp4 = "0 1 days\n1 2 days\n2 3 days\ndtype: timedelta64[ns]" + + exp5 = ( + "0 1 days 00:00:01\n" + "1 2 days 00:00:00\n" + "2 3 days 00:00:00\n" + "dtype: timedelta64[ns]" + ) + + with pd.option_context("display.width", 300): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5] + ): + result = repr(pd.Series(idx)) + assert result == expected + + def test_summary(self): + # GH#9116 + idx1 = TimedeltaIndex([], freq="D") + idx2 = TimedeltaIndex(["1 days"], freq="D") + idx3 = TimedeltaIndex(["1 days", "2 days"], freq="D") + idx4 = TimedeltaIndex(["1 days", "2 days", "3 days"], freq="D") + idx5 = TimedeltaIndex(["1 days 00:00:01", "2 days", "3 days"]) + + exp1 = "TimedeltaIndex: 0 entries\nFreq: D" + + exp2 = "TimedeltaIndex: 1 entries, 1 days to 1 days\nFreq: D" + + exp3 = "TimedeltaIndex: 2 entries, 1 days to 2 days\nFreq: D" + + exp4 = "TimedeltaIndex: 3 entries, 1 days to 3 days\nFreq: D" + + exp5 = "TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days 00:00:00" + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5] + ): + result = idx._summary() + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py new file mode 100644 index 0000000..e8665ee --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_indexing.py @@ -0,0 +1,384 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Timedelta, TimedeltaIndex, notna, timedelta_range +import pandas._testing as tm + + +class TestGetItem: + def test_ellipsis(self): + # GH#21282 + idx = timedelta_range("1 day", "31 day", freq="D", name="idx") + + result = idx[...] 
+ assert result.equals(idx) + assert result is not idx + + def test_getitem(self): + idx1 = timedelta_range("1 day", "31 day", freq="D", name="idx") + + for idx in [idx1]: + result = idx[0] + assert result == Timedelta("1 day") + + result = idx[0:5] + expected = timedelta_range("1 day", "5 day", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[0:10:2] + expected = timedelta_range("1 day", "9 day", freq="2D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[-20:-5:3] + expected = timedelta_range("12 day", "24 day", freq="3D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[4::-1] + expected = TimedeltaIndex( + ["5 day", "4 day", "3 day", "2 day", "1 day"], freq="-1D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "key", + [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-02"), datetime(1970, 1, 1)], + ) + def test_timestamp_invalid_key(self, key): + # GH#20464 + tdi = pd.timedelta_range(0, periods=10) + with pytest.raises(TypeError): + tdi.get_loc(key) + + +class TestWhere: + def test_where_invalid_dtypes(self): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + + i2 = tdi.copy() + i2 = Index([pd.NaT, pd.NaT] + tdi[2:].tolist()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), i2.asi8) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), i2 + pd.Timestamp.now()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), (i2 + pd.Timestamp.now()).to_period("D")) + + +class TestTake: + def test_take(self): + # GH 10295 + idx1 = timedelta_range("1 day", "31 day", freq="D", name="idx") + + for idx in [idx1]: + result = idx.take([0]) + assert result == Timedelta("1 day") + + result = idx.take([-1]) + assert result == Timedelta("31 day") + + result = idx.take([0, 1, 2]) + expected = timedelta_range("1 day", "3 day", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = timedelta_range("1 day", "5 day", freq="2D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([7, 4, 1]) + expected = timedelta_range("8 day", "2 day", freq="-3D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([3, 2, 5]) + expected = TimedeltaIndex(["4 day", "3 day", "6 day"], name="idx") + tm.assert_index_equal(result, expected) + assert result.freq is None + + result = idx.take([-3, 2, 5]) + expected = TimedeltaIndex(["29 day", "3 day", "6 day"], name="idx") + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_take_invalid_kwargs(self): + idx = timedelta_range("1 day", "31 day", freq="D", name="idx") + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + idx.take(indices, foo=2) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, out=indices) + + msg = "the 'mode' parameter is not supported" + with pytest.raises(ValueError, match=msg): + idx.take(indices, mode="clip") + + # TODO: This method came from 
test_timedelta; de-dup with version above + def test_take2(self): + tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] + idx = timedelta_range(start="1d", end="2d", freq="H", name="idx") + expected = TimedeltaIndex(tds, freq=None, name="idx") + + taken1 = idx.take([2, 4, 10]) + taken2 = idx[[2, 4, 10]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, TimedeltaIndex) + assert taken.freq is None + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH 12631 + idx = TimedeltaIndex(["1 days", "2 days", "3 days"], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = TimedeltaIndex(["2 days", "1 days", "3 days"], name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = TimedeltaIndex(["2 days", "1 days", "NaT"], name="xxx") + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = TimedeltaIndex(["2 days", "1 days", "3 days"], name="xxx") + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + +class TestTimedeltaIndex: + def test_insert_empty(self): + # Corner case inserting with length zero doesnt raise IndexError + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + idx[:0].insert(0, td) + idx[:0].insert(1, td) + idx[:0].insert(-1, td) + + def test_insert(self): + + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") + + # preserve freq + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) + + # reset freq to None + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) 
+ def test_insert_nat(self, null): + # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.datetime64("NaT")) + + def test_insert_dont_cast_strings(self): + # To match DatetimeIndex and PeriodIndex behavior, dont try to + # parse strings to Timedelta + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + assert result.dtype == object + assert result[0] == "1 Day" + + def test_delete(self): + idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx") + + # preserve freq + expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") + expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") + + # reset freq to None + expected_1 = TimedeltaIndex( + ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError)): + # either depending on numpy version + idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") + + # preserve freq + expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") + expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") + + # reset freq to None + expected_3_5 = TimedeltaIndex( + ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + def test_get_loc(self): + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 + + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc(idx[1], method="nearest", tolerance="foo") + + with pytest.raises(ValueError, match="tolerance size must match"): + idx.get_loc( + idx[1], + method="nearest", + tolerance=[ + Timedelta(0).to_timedelta64(), + Timedelta(0).to_timedelta64(), + ], + ) + + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: + assert idx.get_loc("1 day 1 hour", method) == loc + + # GH 16909 + assert idx.get_loc(idx[1].to_timedelta64()) == 1 + + # GH 16896 + assert idx.get_loc("0 days") == 0 + + def test_get_loc_nat(self): + tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) + 
+ assert tidx.get_loc(pd.NaT) == 1 + assert tidx.get_loc(None) == 1 + assert tidx.get_loc(float("nan")) == 1 + assert tidx.get_loc(np.nan) == 1 + + def test_get_indexer(self): + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_ops.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_ops.py new file mode 100644 index 0000000..25f27da --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_ops.py @@ -0,0 +1,316 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.generic import ABCDateOffset + +import pandas as pd +from pandas import Series, TimedeltaIndex, timedelta_range +import pandas._testing as tm +from pandas.tests.base.test_ops import Ops + +from pandas.tseries.offsets import Day, Hour + + +class TestTimedeltaIndexOps(Ops): + def setup_method(self, method): + super().setup_method(method) + mask = lambda x: isinstance(x, TimedeltaIndex) + self.is_valid_objs = [o for o in self.objs if mask(o)] + self.not_valid_objs = [] + + def test_ops_properties(self): + f = lambda x: isinstance(x, TimedeltaIndex) + self.check_ops_properties(TimedeltaIndex._field_ops, f) + self.check_ops_properties(TimedeltaIndex._object_ops, f) + + def test_value_counts_unique(self): + # GH 7735 + + idx = timedelta_range("1 days 09:00:00", freq="H", periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) + + exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = timedelta_range("1 days 09:00:00", freq="H", periods=10) + tm.assert_index_equal(idx.unique(), expected) + + idx = TimedeltaIndex( + [ + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 08:00:00", + "1 days 08:00:00", + pd.NaT, + ] + ) + + exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) + expected = Series([3, 2], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) + expected = Series([3, 2, 1], index=exp_idx) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) + + tm.assert_index_equal(idx.unique(), exp_idx) + + def test_nonunique_contains(self): + # GH 9512 + for idx in map( + TimedeltaIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["00:01:00", "00:01:00", "00:02:00"], + ["00:01:00", "00:01:00", "00:00:01"], + ), + ): + assert idx[0] in idx + + def test_unknown_attribute(self): + # see gh-9680 + tdi = pd.timedelta_range(start=0, periods=10, freq="1s") + ts = 
pd.Series(np.random.normal(size=10), index=tdi) + assert "foo" not in ts.__dict__.keys() + msg = "'Series' object has no attribute 'foo'" + with pytest.raises(AttributeError, match=msg): + ts.foo + + def test_order(self): + # GH 10295 + idx1 = TimedeltaIndex(["1 day", "2 day", "3 day"], freq="D", name="idx") + idx2 = TimedeltaIndex(["1 hour", "2 hour", "3 hour"], freq="H", name="idx") + + for idx in [idx1, idx2]: + ordered = idx.sort_values() + tm.assert_index_equal(ordered, idx) + assert ordered.freq == idx.freq + + ordered = idx.sort_values(ascending=False) + expected = idx[::-1] + tm.assert_index_equal(ordered, expected) + assert ordered.freq == expected.freq + assert ordered.freq.n == -1 + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, idx) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) + assert ordered.freq == idx.freq + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, idx[::-1]) + assert ordered.freq == expected.freq + assert ordered.freq.n == -1 + + idx1 = TimedeltaIndex( + ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" + ) + exp1 = TimedeltaIndex( + ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" + ) + + idx2 = TimedeltaIndex( + ["1 day", "3 day", "5 day", "2 day", "1 day"], name="idx2" + ) + + # TODO(wesm): unused? + # exp2 = TimedeltaIndex(['1 day', '1 day', '2 day', + # '3 day', '5 day'], name='idx2') + + # idx3 = TimedeltaIndex([pd.NaT, '3 minute', '5 minute', + # '2 minute', pd.NaT], name='idx3') + # exp3 = TimedeltaIndex([pd.NaT, pd.NaT, '2 minute', '3 minute', + # '5 minute'], name='idx3') + + for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: + ordered = idx.sort_values() + tm.assert_index_equal(ordered, expected) + assert ordered.freq is None + + ordered = idx.sort_values(ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + assert ordered.freq is None + + ordered, indexer = idx.sort_values(return_indexer=True) + tm.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq is None + + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) + tm.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) + assert ordered.freq is None + + def test_drop_duplicates_metadata(self): + # GH 10115 + idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") + result = idx.drop_duplicates() + tm.assert_index_equal(idx, result) + assert idx.freq == result.freq + + idx_dup = idx.append(idx) + assert idx_dup.freq is None # freq is reset + result = idx_dup.drop_duplicates() + tm.assert_index_equal(idx, result) + assert result.freq is None + + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep="last") + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep="last") + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = 
Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + + @pytest.mark.parametrize( + "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"] + ) + def test_infer_freq(self, freq): + # GH#11018 + idx = pd.timedelta_range("1", freq=freq, periods=10) + result = pd.TimedeltaIndex(idx.asi8, freq="infer") + tm.assert_index_equal(idx, result) + assert result.freq == freq + + def test_shift(self): + pass # handled in test_arithmetic.py + + def test_repeat(self): + index = pd.timedelta_range("1 days", periods=2, freq="D") + exp = pd.TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + def test_nat(self): + assert pd.TimedeltaIndex._na_value is pd.NaT + assert pd.TimedeltaIndex([])._na_value is pd.NaT + + idx = pd.TimedeltaIndex(["1 days", "2 days"]) + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert idx.hasnans is False + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) + + idx = pd.TimedeltaIndex(["1 days", "NaT"]) + assert idx._can_hold_na + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans is True + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) + + def test_equals(self): + # GH 13107 + idx = pd.TimedeltaIndex(["1 days", "2 days", "NaT"]) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = pd.TimedeltaIndex(["2 days", "1 days", "NaT"]) + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.astype(object).equals(idx2.astype(object)) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # Check that we dont raise OverflowError on comparisons outside the + # implementation range + oob = pd.Index([timedelta(days=10 ** 6)] * 3, dtype=object) + assert not idx.equals(oob) + assert not idx2.equals(oob) + + # FIXME: oob.apply(np.timedelta64) incorrectly overflows + oob2 = pd.Index([np.timedelta64(x) for x in oob], dtype=object) + assert not idx.equals(oob2) + assert not idx2.equals(oob2) + + @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) + def test_freq_setter(self, values, freq): + # GH 20678 + idx = TimedeltaIndex(values) + + # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, ABCDateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_freq_setter_errors(self): + # GH 20678 + idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with 
pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with a non-fixed frequency + msg = r"<2 \* BusinessDays> is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + idx._data.freq = "2B" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_partial_slicing.py new file mode 100644 index 0000000..29e2c7d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timedelta, timedelta_range +import pandas._testing as tm + + +class TestSlicing: + def test_slice_keeps_name(self): + # GH4226 + dr = pd.timedelta_range("1d", "5d", freq="H", name="timebucket") + assert dr[1:].name == dr.name + + def test_partial_slice(self): + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s["5 day":"6 day"] + expected = s.iloc[86:134] + tm.assert_series_equal(result, expected) + + result = s["5 day":] + expected = s.iloc[86:] + tm.assert_series_equal(result, expected) + + result = s[:"6 day"] + expected = s.iloc[:134] + tm.assert_series_equal(result, expected) + + result = s["6 days, 23:11:12"] + assert result == s.iloc[133] + + msg = r"^Timedelta\('50 days 00:00:00'\)$" + with pytest.raises(KeyError, match=msg): + s["50 days"] + + def test_partial_slice_high_reso(self): + + # higher reso + rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) + s = Series(np.arange(len(rng)), index=rng) + + result = s["1 day 10:11:12":] + expected = s.iloc[0:] + tm.assert_series_equal(result, expected) + + result = s["1 day 10:11:12.001":] + expected = s.iloc[1000:] + tm.assert_series_equal(result, expected) + + result = s["1 days, 10:11:12.001001"] + assert result == s.iloc[1001] + + def test_slice_with_negative_step(self): + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + + assert_slices_equivalent(SLC[Timedelta(hours=7) :: -1], SLC[7::-1]) + assert_slices_equivalent(SLC["7 hours"::-1], SLC[7::-1]) + + assert_slices_equivalent(SLC[: Timedelta(hours=7) : -1], SLC[:6:-1]) + assert_slices_equivalent(SLC[:"7 hours":-1], SLC[:6:-1]) + + assert_slices_equivalent(SLC["15 hours":"7 hours":-1], SLC[15:6:-1]) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC["15 hours" : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : "7 hours" : -1], SLC[15:6:-1] + ) + + assert_slices_equivalent(SLC["7 hours":"15 hours":-1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + ts.loc[::0] diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py 
b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py new file mode 100644 index 0000000..44f4a2a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -0,0 +1,71 @@ +""" +Tests for TimedeltaIndex methods behaving like their Timedelta counterparts +""" + +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range +import pandas._testing as tm + + +class TestVectorizedTimedelta: + def test_tdi_total_seconds(self): + # GH#10939 + # test index + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + expt = [ + 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, + 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456.0 / 1e9, + ] + tm.assert_almost_equal(rng.total_seconds(), Index(expt)) + + # test Series + ser = Series(rng) + s_expt = Series(expt, index=[0, 1]) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + # with nat + ser[1] = np.nan + s_expt = Series( + [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, np.nan], + index=[0, 1], + ) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + # with both nat + ser = Series([np.nan, np.nan], dtype="timedelta64[ns]") + tm.assert_series_equal( + ser.dt.total_seconds(), Series([np.nan, np.nan], index=[0, 1]) + ) + + def test_tdi_round(self): + td = pd.timedelta_range(start="16801 days", periods=5, freq="30Min") + elt = td[1] + + expected_rng = TimedeltaIndex( + [ + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 01:00:00"), + Timedelta("16801 days 02:00:00"), + Timedelta("16801 days 02:00:00"), + ] + ) + expected_elt = expected_rng[1] + + tm.assert_index_equal(td.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt + + msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + td.round(freq="foo") + with pytest.raises(ValueError, match=msg): + elt.round(freq="foo") + + msg = " is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + td.round(freq="M") + with pytest.raises(ValueError, match=msg): + elt.round(freq="M") diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_setops.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_setops.py new file mode 100644 index 0000000..0aa784c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_setops.py @@ -0,0 +1,260 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Int64Index, TimedeltaIndex, timedelta_range +import pandas._testing as tm + +from pandas.tseries.offsets import Hour + + +class TestTimedeltaIndex: + def test_union(self): + + i1 = timedelta_range("1day", periods=5) + i2 = timedelta_range("3day", periods=5) + result = i1.union(i2) + expected = timedelta_range("1day", periods=7) + tm.assert_index_equal(result, expected) + + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = timedelta_range(start="1 day", periods=10, freq="D") + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_union_sort_false(self): + tdi = timedelta_range("1day", periods=5) + + left = tdi[3:] + right = tdi[:3] + + # Check that we are testing the desired code path + assert left._can_fast_union(right) + + result = left.union(right) + tm.assert_index_equal(result, tdi) + + result = left.union(right, sort=False) + expected = pd.TimedeltaIndex(["4 Days", "5 Days", "1 Days", "2 Day", "3 
Days"]) + tm.assert_index_equal(result, expected) + + def test_union_coverage(self): + + idx = TimedeltaIndex(["3d", "1d", "2d"]) + ordered = TimedeltaIndex(idx.sort_values(), freq="infer") + result = ordered.union(idx) + tm.assert_index_equal(result, ordered) + + result = ordered[:0].union(ordered) + tm.assert_index_equal(result, ordered) + assert result.freq == ordered.freq + + def test_union_bug_1730(self): + + rng_a = timedelta_range("1 day", periods=4, freq="3H") + rng_b = timedelta_range("1 day", periods=4, freq="4H") + + result = rng_a.union(rng_b) + exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) + tm.assert_index_equal(result, exp) + + def test_union_bug_1745(self): + + left = TimedeltaIndex(["1 day 15:19:49.695000"]) + right = TimedeltaIndex( + ["2 day 13:04:21.322000", "1 day 15:27:24.873000", "1 day 15:31:05.350000"] + ) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(left) | set(right))) + tm.assert_index_equal(result, exp) + + def test_union_bug_4564(self): + + left = timedelta_range("1 day", "30d") + right = left + pd.offsets.Minute(15) + + result = left.union(right) + exp = TimedeltaIndex(sorted(set(left) | set(right))) + tm.assert_index_equal(result, exp) + + def test_union_freq_infer(self): + # When taking the union of two TimedeltaIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # DatetimeIndex behavior. + tdi = pd.timedelta_range("1 Day", periods=5) + left = tdi[[0, 1, 3, 4]] + right = tdi[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, tdi) + assert result.freq == "D" + + def test_intersection_bug_1708(self): + index_1 = timedelta_range("1 day", periods=4, freq="h") + index_2 = index_1 + pd.offsets.Hour(5) + + result = index_1 & index_2 + assert len(result) == 0 + + index_1 = timedelta_range("1 day", periods=4, freq="h") + index_2 = index_1 + pd.offsets.Hour(1) + + result = index_1 & index_2 + expected = timedelta_range("1 day 01:00:00", periods=3, freq="h") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_equal(self, sort): + # GH 24471 Test intersection outcome given the sort keyword + # for equal indicies intersection should return the original index + first = timedelta_range("1 day", periods=4, freq="h") + second = timedelta_range("1 day", periods=4, freq="h") + intersect = first.intersection(second, sort=sort) + if sort is None: + tm.assert_index_equal(intersect, second.sort_values()) + assert tm.equalContents(intersect, second) + + # Corner cases + inter = first.intersection(first, sort=sort) + assert inter is first + + @pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)]) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_zero_length(self, period_1, period_2, sort): + # GH 24471 test for non overlap the intersection should be zero length + index_1 = timedelta_range("1 day", periods=period_1, freq="h") + index_2 = timedelta_range("1 day", periods=period_2, freq="h") + expected = timedelta_range("1 day", periods=0, freq="h") + result = index_1.intersection(index_2, sort=sort) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_zero_length_input_index(self, sort): + # GH 24966 test for 0-len intersections are copied + index_1 = timedelta_range("1 day", periods=0, freq="h") + index_2 = timedelta_range("1 day", periods=3, freq="h") + result = index_1.intersection(index_2, 
sort=sort) + assert index_1 is not result + assert index_2 is not result + tm.assert_copy(result, index_1) + + @pytest.mark.parametrize( + "rng, expected", + # if target has the same name, it is preserved + [ + ( + timedelta_range("1 day", periods=5, freq="h", name="idx"), + timedelta_range("1 day", periods=4, freq="h", name="idx"), + ), + # if target name is different, it will be reset + ( + timedelta_range("1 day", periods=5, freq="h", name="other"), + timedelta_range("1 day", periods=4, freq="h", name=None), + ), + # if no overlap exists return empty index + ( + timedelta_range("1 day", periods=10, freq="h", name="idx")[5:], + TimedeltaIndex([], name="idx"), + ), + ], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, rng, expected, sort): + # GH 4690 (with tz) + base = timedelta_range("1 day", periods=4, freq="h", name="idx") + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "rng, expected", + # part intersection works + [ + ( + TimedeltaIndex(["5 hour", "2 hour", "4 hour", "9 hour"], name="idx"), + TimedeltaIndex(["2 hour", "4 hour"], name="idx"), + ), + # reordered part intersection + ( + TimedeltaIndex(["2 hour", "5 hour", "5 hour", "1 hour"], name="other"), + TimedeltaIndex(["1 hour", "2 hour"], name=None), + ), + # reveresed index + ( + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx")[ + ::-1 + ], + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx"), + ), + ], + ) + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection_non_monotonic(self, rng, expected, sort): + # 24471 non-monotonic + base = TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx") + result = base.intersection(rng, sort=sort) + if sort is None: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + # if reveresed order, frequency is still the same + if all(base == rng[::-1]) and sort is None: + assert isinstance(result.freq, Hour) + else: + assert result.freq is None + + +class TestTimedeltaIndexDifference: + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_sort(self, sort): + + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) + + other = timedelta_range("1 days", "4 days", freq="D") + idx_diff = index.difference(other, sort) + + expected = TimedeltaIndex(["5 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 
days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["1 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_shift.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_shift.py new file mode 100644 index 0000000..98933ff --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_shift.py @@ -0,0 +1,75 @@ +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import TimedeltaIndex +import pandas._testing as tm + + +class TestTimedeltaIndexShift: + + # ------------------------------------------------------------- + # TimedeltaIndex.shift is used by __add__/__sub__ + + def test_tdi_shift_empty(self): + # GH#9903 + idx = pd.TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) + + def test_tdi_shift_hours(self): + # GH#9903 + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + + def test_tdi_shift_minutes(self): + # GH#9903 + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="T"), idx) + exp = pd.TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="T"), exp) + exp = pd.TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="T"), exp) + + def test_tdi_shift_int(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(1) + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_tdi_shift_nonstandard_freq(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_no_freq(self): + # GH#19147 + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) + with pytest.raises(NullFrequencyError): + tdi.shift(2) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta.py new file mode 100644 index 0000000..3b52b93 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -0,0 +1,300 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Int64Index, + Series, + Timedelta, + TimedeltaIndex, + date_range, + timedelta_range, +) +import pandas._testing as tm + +from ..datetimelike import DatetimeLike + +randn = np.random.randn + + +class TestTimedeltaIndex(DatetimeLike): + _holder = 
TimedeltaIndex + + @pytest.fixture + def indices(self): + return tm.makeTimedeltaIndex(10) + + def create_index(self): + return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + + def test_numeric_compat(self): + # Dummy method to override super's version; this test is now done + # in test_arithmetic.py + pass + + def test_shift(self): + pass # this is handled in test_arithmetic.py + + def test_pickle_compat_construction(self): + pass + + def test_fillna_timedelta(self): + # GH 11343 + idx = pd.TimedeltaIndex(["1 day", pd.NaT, "3 day"]) + + exp = pd.TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(pd.Timedelta("2 day")), exp) + + exp = pd.TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(pd.Timedelta("3 hour")) + + exp = pd.Index( + [pd.Timedelta("1 day"), "x", pd.Timedelta("3 day")], dtype=object + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + def test_isin(self): + + index = tm.makeTimedeltaIndex(4) + result = index.isin(index) + assert result.all() + + result = index.isin(list(index)) + assert result.all() + + tm.assert_almost_equal( + index.isin([index[2], 5]), np.array([False, False, True, False]) + ) + + def test_factorize(self): + idx1 = TimedeltaIndex(["1 day", "1 day", "2 day", "2 day", "3 day", "3 day"]) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = TimedeltaIndex(["1 day", "2 day", "3 day"]) + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + # freq must be preserved + idx3 = timedelta_range("1 day", periods=4, freq="s") + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + + def test_join_self(self, join_type): + index = timedelta_range("1 day", periods=10) + joined = index.join(index, how=join_type) + tm.assert_index_equal(index, joined) + + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe( + 10, + 10, + data_gen_f=lambda *args, **kwargs: randn(), + r_idx_type="i", + c_idx_type="td", + ) + str(df) + + cols = df.columns.join(df.index, how="outer") + joined = cols.join(df.columns) + assert cols.dtype == np.dtype("O") + assert cols.dtype == joined.dtype + tm.assert_index_equal(cols, joined) + + def test_sort_values(self): + + idx = TimedeltaIndex(["4d", "1d", "2d"]) + + ordered = idx.sort_values() + assert ordered.is_monotonic + + ordered = idx.sort_values(ascending=False) + assert ordered[::-1].is_monotonic + + ordered, dexer = idx.sort_values(return_indexer=True) + assert ordered.is_monotonic + + tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), check_dtype=False) + + ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) + assert ordered[::-1].is_monotonic + + tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) + + def test_argmin_argmax(self): + idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) + assert idx.argmin() == 1 + assert idx.argmax() == 0 + + def test_misc_coverage(self): + + rng = timedelta_range("1 day", periods=5) + result = rng.groupby(rng.days) + assert isinstance(list(result.values())[0][0], Timedelta) + + idx = TimedeltaIndex(["3d", "1d", "2d"]) + assert not idx.equals(list(idx)) + + non_td = Index(list("abc")) + assert not idx.equals(list(non_td)) + + def test_map(self): + # test_map_dictlike 
generally tests + + rng = timedelta_range("1 day", periods=10) + + f = lambda x: x.days + result = rng.map(f) + exp = Int64Index([f(x) for x in rng]) + tm.assert_index_equal(result, exp) + + def test_pass_TimedeltaIndex_to_index(self): + + rng = timedelta_range("1 days", "10 days") + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pytimedelta(), dtype=object) + + tm.assert_numpy_array_equal(idx.values, expected.values) + + def test_pickle(self): + + rng = timedelta_range("1 days", periods=10) + rng_p = tm.round_trip_pickle(rng) + tm.assert_index_equal(rng, rng_p) + + def test_hash_error(self): + index = timedelta_range("1 days", periods=10) + with pytest.raises( + TypeError, match=(f"unhashable type: {repr(type(index).__name__)}") + ): + hash(index) + + def test_append_join_nondatetimeindex(self): + rng = timedelta_range("1 days", periods=10) + idx = Index(["a", "b", "c", "d"]) + + result = rng.append(idx) + assert isinstance(result[0], Timedelta) + + # it works + rng.join(idx, how="outer") + + def test_append_numpy_bug_1681(self): + + td = timedelta_range("1 days", "10 days", freq="2D") + a = DataFrame() + c = DataFrame({"A": "foo", "B": td}, index=td) + str(c) + + result = a.append(c) + assert (result["B"] == td).all() + + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None + + def test_fields(self): + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) + tm.assert_index_equal( + rng.seconds, + Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], dtype="int64"), + ) + tm.assert_index_equal( + rng.microseconds, Index([100 * 1000 + 123, 100 * 1000 + 123], dtype="int64") + ) + tm.assert_index_equal(rng.nanoseconds, Index([456, 456], dtype="int64")) + + msg = "'TimedeltaIndex' object has no attribute '{}'" + with pytest.raises(AttributeError, match=msg.format("hours")): + rng.hours + with pytest.raises(AttributeError, match=msg.format("minutes")): + rng.minutes + with pytest.raises(AttributeError, match=msg.format("milliseconds")): + rng.milliseconds + + # with nat + s = Series(rng) + s[1] = np.nan + + tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) + tm.assert_series_equal( + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + ) + + # preserve name (GH15589) + rng.name = "name" + assert rng.days.name == "name" + + def test_freq_conversion(self): + + # doc example + + # series + td = Series(date_range("20130101", periods=4)) - Series( + date_range("20121201", periods=4) + ) + td[2] += timedelta(minutes=5, seconds=3) + td[3] = np.nan + + result = td / np.timedelta64(1, "D") + expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) + tm.assert_series_equal(result, expected) + + result = td.astype("timedelta64[D]") + expected = Series([31, 31, 31, np.nan]) + tm.assert_series_equal(result, expected) + + result = td / np.timedelta64(1, "s") + expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) + tm.assert_series_equal(result, expected) + + result = td.astype("timedelta64[s]") + tm.assert_series_equal(result, expected) + + # tdi + td = TimedeltaIndex(td) + + result = td / np.timedelta64(1, "D") + expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) + tm.assert_index_equal(result, expected) + + result = td.astype("timedelta64[D]") + 
expected = Index([31, 31, 31, np.nan]) + tm.assert_index_equal(result, expected) + + result = td / np.timedelta64(1, "s") + expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) + tm.assert_index_equal(result, expected) + + result = td.astype("timedelta64[s]") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) + def test_unit_m_y_raises(self, unit): + msg = "Units 'M' and 'Y' are no longer supported" + with pytest.raises(ValueError, match=msg): + TimedeltaIndex([1, 3, 7], unit) + + +class TestTimeSeries: + def test_series_box_timedelta(self): + rng = timedelta_range("1 day 1 s", periods=5, freq="h") + s = Series(rng) + assert isinstance(s[1], Timedelta) + assert isinstance(s.iat[2], Timedelta) diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py new file mode 100644 index 0000000..1cef9de --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -0,0 +1,80 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import timedelta_range, to_timedelta +import pandas._testing as tm + +from pandas.tseries.offsets import Day, Second + + +class TestTimedeltas: + def test_timedelta_range(self): + + expected = to_timedelta(np.arange(5), unit="D") + result = timedelta_range("0 days", periods=5, freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(11), unit="D") + result = timedelta_range("0 days", "10 days", freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(5), unit="D") + Second(2) + Day() + result = timedelta_range("1 days, 00:00:02", "5 days, 00:00:02", freq="D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta([1, 3, 5, 7, 9], unit="D") + Second(2) + result = timedelta_range("1 days, 00:00:02", periods=5, freq="2D") + tm.assert_index_equal(result, expected) + + expected = to_timedelta(np.arange(50), unit="T") * 30 + result = timedelta_range("0 days", freq="30T", periods=50) + tm.assert_index_equal(result, expected) + + # GH 11776 + arr = np.arange(10).reshape(2, 5) + df = pd.DataFrame(np.arange(10).reshape(2, 5)) + for arg in (arr, df): + with pytest.raises(TypeError, match="1-d array"): + to_timedelta(arg) + for errors in ["ignore", "raise", "coerce"]: + with pytest.raises(TypeError, match="1-d array"): + to_timedelta(arg, errors=errors) + + # issue10583 + df = pd.DataFrame(np.random.normal(size=(10, 4))) + df.index = pd.timedelta_range(start="0s", periods=10, freq="s") + expected = df.loc[pd.Timedelta("0s") :, :] + result = df.loc["0s":, :] + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( + "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12T"), (7, "16H"), (9, "12H")] + ) + def test_linspace_behavior(self, periods, freq): + # GH 20976 + result = timedelta_range(start="0 days", end="4 days", periods=periods) + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + + def test_errors(self): + # not enough params + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days") + + with pytest.raises(ValueError, match=msg): + timedelta_range(end="5 days") + + with pytest.raises(ValueError, match=msg): + timedelta_range(periods=2) + + with 
pytest.raises(ValueError, match=msg): + timedelta_range() + + # too many params + with pytest.raises(ValueError, match=msg): + timedelta_range(start="0 days", end="5 days", periods=10, freq="H") diff --git a/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_tools.py b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_tools.py new file mode 100644 index 0000000..477fc09 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexes/timedeltas/test_tools.py @@ -0,0 +1,146 @@ +from datetime import time, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, TimedeltaIndex, isna, to_timedelta +import pandas._testing as tm + + +class TestTimedeltas: + def test_to_timedelta(self): + + result = to_timedelta(["", ""]) + assert isna(result).all() + + # pass thru + result = to_timedelta(np.array([np.timedelta64(1, "s")])) + expected = pd.Index(np.array([np.timedelta64(1, "s")])) + tm.assert_index_equal(result, expected) + + # Series + expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) + result = to_timedelta(Series(["1d", "1days 00:00:01"])) + tm.assert_series_equal(result, expected) + + # with units + result = TimedeltaIndex( + [np.timedelta64(0, "ns"), np.timedelta64(10, "s").astype("m8[ns]")] + ) + expected = to_timedelta([0, 10], unit="s") + tm.assert_index_equal(result, expected) + + # arrays of various dtypes + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="s") + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="m") + expected = TimedeltaIndex([np.timedelta64(1, "m")] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="h") + expected = TimedeltaIndex([np.timedelta64(1, "h")] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype="timedelta64[s]") + result = to_timedelta(arr) + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) + tm.assert_index_equal(result, expected) + + arr = np.array([1] * 5, dtype="timedelta64[D]") + result = to_timedelta(arr) + expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) + tm.assert_index_equal(result, expected) + + def test_to_timedelta_invalid(self): + + # bad value for errors parameter + msg = "errors must be one of" + with pytest.raises(ValueError, match=msg): + to_timedelta(["foo"], errors="never") + + # these will error + msg = "invalid unit abbreviation: foo" + with pytest.raises(ValueError, match=msg): + to_timedelta([1, 2], unit="foo") + with pytest.raises(ValueError, match=msg): + to_timedelta(1, unit="foo") + + # time not supported ATM + msg = ( + "Value must be Timedelta, string, integer, float, timedelta or convertible" + ) + with pytest.raises(ValueError, match=msg): + to_timedelta(time(second=1)) + assert to_timedelta(time(second=1), errors="coerce") is pd.NaT + + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): + to_timedelta(["foo", "bar"]) + tm.assert_index_equal( + TimedeltaIndex([pd.NaT, pd.NaT]), + to_timedelta(["foo", "bar"], errors="coerce"), + ) + + tm.assert_index_equal( + TimedeltaIndex(["1 day", pd.NaT, "1 min"]), + to_timedelta(["1 day", "bar", "1 min"], errors="coerce"), + ) + + # gh-13613: these should not error because errors='ignore' + invalid_data = "apple" + assert invalid_data == to_timedelta(invalid_data, errors="ignore") + + invalid_data = ["apple", "1 
days"] + tm.assert_numpy_array_equal( + np.array(invalid_data, dtype=object), + to_timedelta(invalid_data, errors="ignore"), + ) + + invalid_data = pd.Index(["apple", "1 days"]) + tm.assert_index_equal(invalid_data, to_timedelta(invalid_data, errors="ignore")) + + invalid_data = Series(["apple", "1 days"]) + tm.assert_series_equal( + invalid_data, to_timedelta(invalid_data, errors="ignore") + ) + + def test_to_timedelta_via_apply(self): + # GH 5458 + expected = Series([np.timedelta64(1, "s")]) + result = Series(["00:00:01"]).apply(to_timedelta) + tm.assert_series_equal(result, expected) + + result = Series([to_timedelta("00:00:01")]) + tm.assert_series_equal(result, expected) + + def test_to_timedelta_on_missing_values(self): + # GH5438 + timedelta_NaT = np.timedelta64("NaT") + + actual = pd.to_timedelta(Series(["00:00:01", np.nan])) + expected = Series( + [np.timedelta64(1000000000, "ns"), timedelta_NaT], dtype=" obj.ndim - 1: + return + + try: + rs = getattr(obj, method1).__getitem__(_axify(obj, key1, axis)) + + try: + xp = self.get_result(obj=obj, method=method2, key=key2, axis=axis) + except (KeyError, IndexError): + # TODO: why is this allowed? + return + + if is_scalar(rs) and is_scalar(xp): + assert rs == xp + else: + tm.assert_equal(rs, xp) + + except (IndexError, TypeError, KeyError) as detail: + + # if we are in fails, the ok, otherwise raise it + if fails is not None: + if isinstance(detail, fails): + result = f"ok ({type(detail).__name__})" + return + + result = type(detail).__name__ + raise AssertionError(result, detail) + + if typs is None: + typs = self._typs + + if axes is None: + axes = [0, 1] + elif not isinstance(axes, (tuple, list)): + assert isinstance(axes, int) + axes = [axes] + + # check + for kind in self._kinds: + + d = getattr(self, kind) + for ax in axes: + for typ in typs: + if typ not in self._typs: + continue + + obj = d[typ] + _eq(axis=ax, obj=obj, key1=key1, key2=key2) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/conftest.py b/venv/Lib/site-packages/pandas/tests/indexing/conftest.py new file mode 100644 index 0000000..142beda --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/conftest.py @@ -0,0 +1,23 @@ +import numpy as np +import pytest + +from pandas._libs import index as libindex + + +@pytest.fixture( + params=[ + (libindex.Int64Engine, np.int64), + (libindex.Int32Engine, np.int32), + (libindex.Int16Engine, np.int16), + (libindex.Int8Engine, np.int8), + (libindex.UInt64Engine, np.uint64), + (libindex.UInt32Engine, np.uint32), + (libindex.UInt16Engine, np.uint16), + (libindex.UInt8Engine, np.uint8), + (libindex.Float64Engine, np.float64), + (libindex.Float32Engine, np.float32), + ], + ids=lambda x: x[0].__name__, +) +def numeric_indexing_engine_type_and_dtype(request): + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/indexing/interval/__init__.py b/venv/Lib/site-packages/pandas/tests/indexing/interval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval.py b/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval.py new file mode 100644 index 0000000..6340209 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval.py @@ -0,0 +1,149 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, IntervalIndex, Series +import pandas._testing as tm + + +class TestIntervalIndex: + def setup_method(self, method): + self.s = Series(np.arange(5), 
IntervalIndex.from_breaks(np.arange(6))) + + def test_getitem_with_scalar(self): + + s = self.s + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + tm.assert_series_equal(expected, s[0.1:2.5]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s[[2, 3, 4]]) + tm.assert_series_equal(expected, s[[1.5, 3, 4]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s[s >= 2]) + + @pytest.mark.parametrize("direction", ["increasing", "decreasing"]) + def test_nonoverlapping_monotonic(self, direction, closed): + tpls = [(0, 1), (2, 3), (4, 5)] + if direction == "decreasing": + tpls = tpls[::-1] + + idx = IntervalIndex.from_tuples(tpls, closed=closed) + s = Series(list("abc"), idx) + + for key, expected in zip(idx.left, s): + if idx.closed_left: + assert s[key] == expected + assert s.loc[key] == expected + else: + with pytest.raises(KeyError, match=str(key)): + s[key] + with pytest.raises(KeyError, match=str(key)): + s.loc[key] + + for key, expected in zip(idx.right, s): + if idx.closed_right: + assert s[key] == expected + assert s.loc[key] == expected + else: + with pytest.raises(KeyError, match=str(key)): + s[key] + with pytest.raises(KeyError, match=str(key)): + s.loc[key] + + for key, expected in zip(idx.mid, s): + assert s[key] == expected + assert s.loc[key] == expected + + def test_non_matching(self): + s = self.s + + # this is a departure from our current + # indexing scheme, but simpler + with pytest.raises(KeyError, match="^$"): + s.loc[[-1, 3, 4, 5]] + + with pytest.raises(KeyError, match="^$"): + s.loc[[-1, 3]] + + def test_large_series(self): + s = Series( + np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) + ) + + result1 = s.loc[:80000] + result2 = s.loc[0:80000] + result3 = s.loc[0:80000:1] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + def test_loc_getitem_frame(self): + + df = DataFrame({"A": range(10)}) + s = pd.cut(df.A, 5) + df["B"] = s + df = df.set_index("B") + + result = df.loc[4] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match="10"): + df.loc[10] + + # single list-like + result = df.loc[[4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + # non-unique + result = df.loc[[4, 5]] + expected = df.take([4, 5, 4, 5]) + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match="^$"): + df.loc[[10]] + + # partial missing + with pytest.raises(KeyError, match="^$"): + df.loc[[10, 4]] + + +class TestIntervalIndexInsideMultiIndex: + def test_mi_intervalindex_slicing_with_scalar(self): + # GH#27456 + idx = pd.MultiIndex.from_arrays( + [ + pd.Index(["FC", "FC", "FC", "FC", "OWNER", "OWNER", "OWNER", "OWNER"]), + pd.Index( + ["RID1", "RID1", "RID2", "RID2", "RID1", "RID1", "RID2", "RID2"] + ), + pd.IntervalIndex.from_arrays( + [0, 1, 10, 11, 0, 1, 10, 11], [1, 2, 11, 12, 1, 2, 11, 12] + ), + ] + ) + + idx.names = ["Item", "RID", "MP"] + df = pd.DataFrame({"value": [1, 2, 3, 4, 5, 6, 7, 8]}) + df.index = idx + query_df = pd.DataFrame( + { + "Item": ["FC", "OWNER", "FC", "OWNER", "OWNER"], + "RID": ["RID1", "RID1", "RID1", "RID2", "RID2"], + "MP": [0.2, 1.5, 1.6, 11.1, 10.9], + } + ) + + query_df = query_df.sort_index() + + idx = pd.MultiIndex.from_arrays([query_df.Item, query_df.RID, query_df.MP]) + query_df.index = idx + result = df.value.loc[query_df.index] + expected = pd.Series([1, 6, 
2, 8, 7], index=idx, name="value") + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval_new.py b/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval_new.py new file mode 100644 index 0000000..43036fb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/interval/test_interval_new.py @@ -0,0 +1,248 @@ +import re + +import numpy as np +import pytest + +from pandas import Interval, IntervalIndex, Series +import pandas._testing as tm + + +class TestIntervalIndex: + def setup_method(self, method): + self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_loc_with_interval(self): + + # loc with single label / list of labels: + # - Intervals: only exact matches + # - scalars: those that contain it + + s = self.s + + expected = 0 + result = s.loc[Interval(0, 1)] + assert result == expected + result = s[Interval(0, 1)] + assert result == expected + + expected = s.iloc[3:5] + result = s.loc[[Interval(3, 4), Interval(4, 5)]] + tm.assert_series_equal(expected, result) + result = s[[Interval(3, 4), Interval(4, 5)]] + tm.assert_series_equal(expected, result) + + # missing or not exact + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")): + s.loc[Interval(3, 5, closed="left")] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='left')")): + s[Interval(3, 5, closed="left")] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + s[Interval(3, 5)] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + s.loc[Interval(3, 5)] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + s[Interval(3, 5)] + + with pytest.raises( + KeyError, match=re.escape("Interval(-2, 0, closed='right')") + ): + s.loc[Interval(-2, 0)] + + with pytest.raises( + KeyError, match=re.escape("Interval(-2, 0, closed='right')") + ): + s[Interval(-2, 0)] + + with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")): + s.loc[Interval(5, 6)] + + with pytest.raises(KeyError, match=re.escape("Interval(5, 6, closed='right')")): + s[Interval(5, 6)] + + def test_loc_with_scalar(self): + + # loc with single label / list of labels: + # - Intervals: only exact matches + # - scalars: those that contain it + + s = self.s + + assert s.loc[1] == 0 + assert s.loc[1.5] == 1 + assert s.loc[2] == 1 + + assert s[1] == 0 + assert s[1.5] == 1 + assert s[2] == 1 + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) + tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) + + expected = s.iloc[[1, 1, 2, 1]] + tm.assert_series_equal(expected, s.loc[[1.5, 2, 2.5, 1.5]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s.loc[s >= 2]) + + def test_loc_with_slices(self): + + # loc with slices: + # - Interval objects: only works with exact matches + # - scalars: only works for non-overlapping, monotonic intervals, + # and start/stop select location based on the interval that + # contains them: + # (slice_loc(start, stop) == (idx.get_loc(start), idx.get_loc(stop)) + + s = self.s + + # slice of interval + + expected = s.iloc[:3] + result = s.loc[Interval(0, 1) : Interval(2, 3)] + tm.assert_series_equal(expected, result) + result = s[Interval(0, 1) : Interval(2, 3)] + tm.assert_series_equal(expected, result) + + expected = s.iloc[3:] + result = s.loc[Interval(3, 4) :] + 
tm.assert_series_equal(expected, result) + result = s[Interval(3, 4) :] + tm.assert_series_equal(expected, result) + + msg = "Interval objects are not currently supported" + with pytest.raises(NotImplementedError, match=msg): + s.loc[Interval(3, 6) :] + + with pytest.raises(NotImplementedError, match=msg): + s[Interval(3, 6) :] + + with pytest.raises(NotImplementedError, match=msg): + s.loc[Interval(3, 4, closed="left") :] + + with pytest.raises(NotImplementedError, match=msg): + s[Interval(3, 4, closed="left") :] + + # TODO with non-existing intervals ? + # s.loc[Interval(-1, 0):Interval(2, 3)] + + # slice of scalar + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s.loc[:3]) + tm.assert_series_equal(expected, s.loc[:2.5]) + tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) + + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + tm.assert_series_equal(expected, s[0.1:2.5]) + + # slice of scalar with step != 1 + with pytest.raises(ValueError): + s[0:4:2] + + def test_loc_with_overlap(self): + + idx = IntervalIndex.from_tuples([(1, 5), (3, 7)]) + s = Series(range(len(idx)), index=idx) + + # scalar + expected = s + result = s.loc[4] + tm.assert_series_equal(expected, result) + + result = s[4] + tm.assert_series_equal(expected, result) + + result = s.loc[[4]] + tm.assert_series_equal(expected, result) + + result = s[[4]] + tm.assert_series_equal(expected, result) + + # interval + expected = 0 + result = s.loc[Interval(1, 5)] + result == expected + + result = s[Interval(1, 5)] + result == expected + + expected = s + result = s.loc[[Interval(1, 5), Interval(3, 7)]] + tm.assert_series_equal(expected, result) + + result = s[[Interval(1, 5), Interval(3, 7)]] + tm.assert_series_equal(expected, result) + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + s.loc[Interval(3, 5)] + + with pytest.raises(KeyError, match="^$"): + s.loc[[Interval(3, 5)]] + + with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): + s[Interval(3, 5)] + + with pytest.raises(KeyError, match="^$"): + s[[Interval(3, 5)]] + + # slices with interval (only exact matches) + expected = s + result = s.loc[Interval(1, 5) : Interval(3, 7)] + tm.assert_series_equal(expected, result) + + result = s[Interval(1, 5) : Interval(3, 7)] + tm.assert_series_equal(expected, result) + + msg = "'can only get slices from an IntervalIndex if bounds are" + " non-overlapping and all monotonic increasing or decreasing'" + with pytest.raises(KeyError, match=msg): + s.loc[Interval(1, 6) : Interval(3, 8)] + + with pytest.raises(KeyError, match=msg): + s[Interval(1, 6) : Interval(3, 8)] + + # slices with scalar raise for overlapping intervals + # TODO KeyError is the appropriate error? 
+ with pytest.raises(KeyError, match=msg): + s.loc[1:4] + + def test_non_unique(self): + + idx = IntervalIndex.from_tuples([(1, 3), (3, 7)]) + s = Series(range(len(idx)), index=idx) + + result = s.loc[Interval(1, 3)] + assert result == 0 + + result = s.loc[[Interval(1, 3)]] + expected = s.iloc[0:1] + tm.assert_series_equal(expected, result) + + def test_non_unique_moar(self): + + idx = IntervalIndex.from_tuples([(1, 3), (1, 3), (3, 7)]) + s = Series(range(len(idx)), index=idx) + + expected = s.iloc[[0, 1]] + result = s.loc[Interval(1, 3)] + tm.assert_series_equal(expected, result) + + expected = s + result = s.loc[Interval(1, 3) :] + tm.assert_series_equal(expected, result) + + expected = s + result = s[Interval(1, 3) :] + tm.assert_series_equal(expected, result) + + expected = s.iloc[[0, 1]] + result = s[[Interval(1, 3)]] + tm.assert_series_equal(expected, result) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/__init__.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/conftest.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/conftest.py new file mode 100644 index 0000000..e6d5a9e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/conftest.py @@ -0,0 +1,30 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +@pytest.fixture +def multiindex_dataframe_random_data(): + """DataFrame with 2 level MultiIndex with random data""" + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) + + +@pytest.fixture +def multiindex_year_month_day_dataframe_random_data(): + """DataFrame with 3 level MultiIndex (year, month, day) covering + first 100 business days from 2000-01-01 with random data""" + tdf = tm.makeTimeDataFrame(100) + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() + # use Int64Index, to make sure things work + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index.set_names(["year", "month", "day"], inplace=True) + return ymd diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py new file mode 100644 index 0000000..8bfba8c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm +import pandas.core.common as com + + +def test_detect_chained_assignment(): + # Inplace ops, originally from: + # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + a = [12, 23] + b = [123, None] + c = [1234, 2345] + d = [12345, 23456] + tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] + events = { + ("eyes", "left"): a, + ("eyes", "right"): b, + ("ears", "left"): c, + ("ears", "right"): d, + } + multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) + zed = DataFrame(events, index=["a", "b"], columns=multiind) + + with 
pytest.raises(com.SettingWithCopyError): + zed["eyes"]["right"].fillna(value=555, inplace=True) + + +def test_cache_updating(): + # 5216 + # make sure that we don't try to set a dead cache + a = np.random.rand(10, 3) + df = DataFrame(a, columns=["x", "y", "z"]) + tuples = [(i, j) for i in range(5) for j in range(2)] + index = MultiIndex.from_tuples(tuples) + df.index = index + + # setting via chained assignment + # but actually works, since everything is a view + df.loc[0]["z"].iloc[0] = 1.0 + result = df.loc[(0, 0), "z"] + assert result == 1 + + # correct setting + df.loc[(0, 0), "z"] = 2 + result = df.loc[(0, 0), "z"] + assert result == 2 + + +def test_indexer_caching(): + # GH5727 + # make sure that indexers are in the _internal_names_set + n = 1000001 + arrays = (range(n), range(n)) + index = MultiIndex.from_tuples(zip(*arrays)) + s = Series(np.zeros(n), index=index) + str(s) + + # setitem + expected = Series(np.ones(n), index=index) + s = Series(np.zeros(n), index=index) + s[s == 0] = 1 + tm.assert_series_equal(s, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_datetime.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_datetime.py new file mode 100644 index 0000000..907d20c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_datetime.py @@ -0,0 +1,22 @@ +from datetime import datetime + +import numpy as np + +from pandas import Index, Period, Series, period_range + + +def test_multiindex_period_datetime(): + # GH4861, using datetime in period of multiindex raises exception + + idx1 = Index(["a", "a", "a", "b", "b"]) + idx2 = period_range("2012-01", periods=len(idx1), freq="M") + s = Series(np.random.randn(len(idx1)), [idx1, idx2]) + + # try Period as index + expected = s.iloc[0] + result = s.loc["a", Period("2012-01")] + assert result == expected + + # try datetime as index + result = s.loc["a", datetime(2012, 1, 1)] + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_getitem.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_getitem.py new file mode 100644 index 0000000..8ea825d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_getitem.py @@ -0,0 +1,252 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm +from pandas.core.indexing import IndexingError + +# ---------------------------------------------------------------------------- +# test indexing of Series with multi-level Index +# ---------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "access_method", + [lambda s, x: s[:, x], lambda s, x: s.loc[:, x], lambda s, x: s.xs(x, level=1)], +) +@pytest.mark.parametrize( + "level1_value, expected", + [(0, Series([1], index=[0])), (1, Series([2, 3], index=[1, 2]))], +) +def test_series_getitem_multiindex(access_method, level1_value, expected): + + # GH 6018 + # series regression getitem with a multi-index + + s = Series([1, 2, 3]) + s.index = MultiIndex.from_tuples([(0, 0), (1, 1), (2, 1)]) + result = access_method(s, level1_value) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("level0_value", ["D", "A"]) +def test_series_getitem_duplicates_multiindex(level0_value): + # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise + # the appropriate error, only in PY3 of course! 
+ + index = MultiIndex( + levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) + arr = np.random.randn(len(index), 1) + df = DataFrame(arr, index=index, columns=["val"]) + + # confirm indexing on missing value raises KeyError + if level0_value != "A": + with pytest.raises(KeyError, match=r"^'A'$"): + df.val["A"] + + with pytest.raises(KeyError, match=r"^'X'$"): + df.val["X"] + + result = df.val[level0_value] + expected = Series( + arr.ravel()[0:3], name="val", index=Index([26, 37, 57], name="day") + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("indexer", [lambda s: s[2000, 3], lambda s: s.loc[2000, 3]]) +def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + + result = indexer(s) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer", [lambda s: s[2000, 3, 10], lambda s: s.loc[2000, 3, 10]] +) +def test_series_getitem_returns_scalar( + multiindex_year_month_day_dataframe_random_data, indexer +): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.iloc[49] + + result = indexer(s) + assert result == expected + + +@pytest.mark.parametrize( + "indexer,expected_error,expected_error_msg", + [ + (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), + (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), + (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), + (lambda s: s[len(s)], IndexError, "index out of bounds"), + ( + lambda s: s.iloc[len(s)], + IndexError, + "single positional indexer is out-of-bounds", + ), + ], +) +def test_series_getitem_indexing_errors( + multiindex_year_month_day_dataframe_random_data, + indexer, + expected_error, + expected_error_msg, +): + s = multiindex_year_month_day_dataframe_random_data["A"] + with pytest.raises(expected_error, match=expected_error_msg): + indexer(s) + + +def test_series_getitem_corner_generator( + multiindex_year_month_day_dataframe_random_data, +): + s = multiindex_year_month_day_dataframe_random_data["A"] + result = s[(x > 0 for x in s)] + expected = s[s > 0] + tm.assert_series_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# test indexing of DataFrame with multi-level Index +# ---------------------------------------------------------------------------- + + +def test_getitem_simple(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data.T + expected = df.values[:, 0] + result = df["foo", "one"].values + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer,expected_error_msg", + [ + (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"), + (lambda df: df["foobar"], r"^'foobar'$"), + ], +) +def test_frame_getitem_simple_key_error( + multiindex_dataframe_random_data, indexer, expected_error_msg +): + df = multiindex_dataframe_random_data.T + with pytest.raises(KeyError, match=expected_error_msg): + indexer(df) + + +def test_frame_getitem_multicolumn_empty_level(): + df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) + df.columns = [ + ["level1 item1", "level1 item2"], 
+ ["", "level2 item2"], + ["level3 item1", "level3 item2"], + ] + + result = df["level1 item1"] + expected = DataFrame( + [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer,expected_slice", + [ + (lambda df: df["foo"], slice(3)), + (lambda df: df["bar"], slice(3, 5)), + (lambda df: df.loc[:, "bar"], slice(3, 5)), + ], +) +def test_frame_getitem_toplevel( + multiindex_dataframe_random_data, indexer, expected_slice +): + df = multiindex_dataframe_random_data.T + expected = df.reindex(columns=df.columns[expected_slice]) + expected.columns = expected.columns.droplevel(0) + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_frame_mixed_depth_get(): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df["a"] + expected = df["a", "", ""].rename("a") + tm.assert_series_equal(result, expected) + + result = df["routine1", "result1"] + expected = df["routine1", "result1", ""] + expected = expected.rename(("routine1", "result1")) + tm.assert_series_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# test indexing of DataFrame with multi-level Index with duplicates +# ---------------------------------------------------------------------------- + + +@pytest.fixture +def dataframe_with_duplicate_index(): + """Fixture for DataFrame used in tests for gh-4145 and gh-4146""" + data = [["a", "d", "e", "c", "f", "b"], [1, 4, 5, 3, 6, 2], [1, 4, 5, 3, 6, 2]] + index = ["h1", "h3", "h5"] + columns = MultiIndex( + levels=[["A", "B"], ["A1", "A2", "B1", "B2"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]], + names=["main", "sub"], + ) + return DataFrame(data, index=index, columns=columns) + + +@pytest.mark.parametrize( + "indexer", [lambda df: df[("A", "A1")], lambda df: df.loc[:, ("A", "A1")]] +) +def test_frame_mi_access(dataframe_with_duplicate_index, indexer): + # GH 4145 + df = dataframe_with_duplicate_index + index = Index(["h1", "h3", "h5"]) + columns = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"]) + expected = DataFrame([["a", 1, 1]], index=columns, columns=index).T + + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_frame_mi_access_returns_series(dataframe_with_duplicate_index): + # GH 4146, not returning a block manager when selecting a unique index + # from a duplicate index + # as of 4879, this returns a Series (which is similar to what happens + # with a non-unique) + df = dataframe_with_duplicate_index + expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1") + result = df["A"]["A1"] + tm.assert_series_equal(result, expected) + + +def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index): + # selecting a non_unique from the 2nd level + df = dataframe_with_duplicate_index + expected = DataFrame( + [["d", 4, 4], ["e", 5, 5]], + index=Index(["B2", "B2"], name="sub"), + columns=["h1", "h3", "h5"], + ).T + result = df["A"]["B2"] + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_iloc.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_iloc.py new file mode 100644 index 0000000..9859c72 --- /dev/null +++ 
b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_iloc.py @@ -0,0 +1,171 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +@pytest.fixture +def simple_multiindex_dataframe(): + """ + Factory function to create simple 3 x 3 dataframe with + both columns and row MultiIndex using supplied data or + random data by default. + """ + + def _simple_multiindex_dataframe(data=None): + if data is None: + data = np.random.randn(3, 3) + return DataFrame( + data, columns=[[2, 2, 4], [6, 8, 10]], index=[[4, 4, 8], [8, 10, 12]] + ) + + return _simple_multiindex_dataframe + + +@pytest.mark.parametrize( + "indexer, expected", + [ + ( + lambda df: df.iloc[0], + lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8)), + ), + ( + lambda df: df.iloc[2], + lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12)), + ), + ( + lambda df: df.iloc[:, 2], + lambda arr: Series(arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10)), + ), + ], +) +def test_iloc_returns_series(indexer, expected, simple_multiindex_dataframe): + arr = np.random.randn(3, 3) + df = simple_multiindex_dataframe(arr) + result = indexer(df) + expected = expected(arr) + tm.assert_series_equal(result, expected) + + +def test_iloc_returns_dataframe(simple_multiindex_dataframe): + df = simple_multiindex_dataframe() + result = df.iloc[[0, 1]] + expected = df.xs(4, drop_level=False) + tm.assert_frame_equal(result, expected) + + +def test_iloc_returns_scalar(simple_multiindex_dataframe): + arr = np.random.randn(3, 3) + df = simple_multiindex_dataframe(arr) + result = df.iloc[2, 2] + expected = arr[2, 2] + assert result == expected + + +def test_iloc_getitem_multiple_items(): + # GH 5528 + tup = zip(*[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) + index = MultiIndex.from_tuples(tup) + df = DataFrame(np.random.randn(4, 4), index=index) + result = df.iloc[[2, 3]] + expected = df.xs("b", drop_level=False) + tm.assert_frame_equal(result, expected) + + +def test_iloc_getitem_labels(): + # this is basically regular indexing + arr = np.random.randn(4, 3) + df = DataFrame( + arr, + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j", "k"], ["X", "X", "Y", "Y"]], + ) + result = df.iloc[2, 2] + expected = arr[2, 2] + assert result == expected + + +def test_frame_getitem_slice(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.iloc[:4] + expected = df[:4] + tm.assert_frame_equal(result, expected) + + +def test_frame_setitem_slice(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + df.iloc[:4] = 0 + + assert (df.values[:4] == 0).all() + assert (df.values[4:] != 0).all() + + +def test_indexing_ambiguity_bug_1678(): + # GH 1678 + columns = MultiIndex.from_tuples( + [("Ohio", "Green"), ("Ohio", "Red"), ("Colorado", "Green")] + ) + index = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + + df = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns) + + result = df.iloc[:, 1] + expected = df.loc[:, ("Ohio", "Red")] + tm.assert_series_equal(result, expected) + + +def test_iloc_integer_locations(): + # GH 13797 + data = [ + ["str00", "str01"], + ["str10", "str11"], + ["str20", "srt21"], + ["str30", "str31"], + ["str40", "str41"], + ] + + index = MultiIndex.from_tuples( + [("CC", "A"), ("CC", "B"), ("CC", "B"), ("BB", "a"), ("BB", "b")] + ) + + expected = DataFrame(data) + df = DataFrame(data, index=index) + + result = DataFrame([[df.iloc[r, c] 
for c in range(2)] for r in range(5)]) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, indexes, values, expected_k", + [ + # test without indexer value in first level of MultiIndex + ([[2, 22, 5], [2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]), + # test like code sample 1 in the issue + ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], [755, 1066]), + # test like code sample 2 in the issue + ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]), + # test like code sample 3 in the issue + ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], [8, 15, 13]), + ], +) +def test_iloc_setitem_int_multiindex_series(data, indexes, values, expected_k): + # GH17148 + df = DataFrame(data=data, columns=["i", "j", "k"]) + df = df.set_index(["i", "j"]) + + series = df.k.copy() + for i, v in zip(indexes, values): + series.iloc[i] += v + + df["k"] = expected_k + expected = df.k + tm.assert_series_equal(series, expected) + + +def test_getitem_iloc(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.iloc[2] + expected = df.xs(df.index[2]) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py new file mode 100644 index 0000000..8ea1ceb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -0,0 +1,92 @@ +import warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +@pytest.mark.slow +@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") +def test_multiindex_get_loc(): # GH7724, GH2646 + + with warnings.catch_warnings(record=True): + + # test indexing into a multi-index before & past the lexsort depth + from numpy.random import randint, choice, randn + + cols = ["jim", "joe", "jolie", "joline", "jolia"] + + def validate(mi, df, key): + mask = np.ones(len(df)).astype("bool") + + # test for all partials of this key + for i, k in enumerate(key): + mask &= df.iloc[:, i] == k + + if not mask.any(): + assert key[: i + 1] not in mi.index + continue + + assert key[: i + 1] in mi.index + right = df[mask].copy() + + if i + 1 != len(key): # partial key + right.drop(cols[: i + 1], axis=1, inplace=True) + right.set_index(cols[i + 1 : -1], inplace=True) + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) + + else: # full key + right.set_index(cols[:-1], inplace=True) + if len(right) == 1: # single hit + right = Series( + right["jolia"].values, name=right.index[0], index=["jolia"] + ) + tm.assert_series_equal(mi.loc[key[: i + 1]], right) + else: # multi hit + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) + + def loop(mi, df, keys): + for key in keys: + validate(mi, df, key) + + n, m = 1000, 50 + + vals = [ + randint(0, 10, n), + choice(list("abcdefghij"), n), + choice(pd.date_range("20141009", periods=10).tolist(), n), + choice(list("ZYXWVUTSRQ"), n), + randn(n), + ] + vals = list(map(tuple, zip(*vals))) + + # bunch of keys for testing + keys = [ + randint(0, 11, m), + choice(list("abcdefghijk"), m), + choice(pd.date_range("20141009", periods=11).tolist(), m), + choice(list("ZYXWVUTSRQP"), m), + ] + keys = list(map(tuple, zip(*keys))) + keys += list(map(lambda t: t[:-1], vals[:: n // m])) + + # covers both unique index and non-unique index + df = DataFrame(vals, columns=cols) + a, b = pd.concat([df, 
df]), df.drop_duplicates(subset=cols[:-1]) + + for frame in a, b: + for i in range(5): # lexsort depth + df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) + mi = df.set_index(cols[:-1]) + assert not mi.index.lexsort_depth < i + loop(mi, df, keys) + + +@pytest.mark.slow +def test_large_mi_dataframe_indexing(): + # GH10645 + result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) + assert not (10 ** 6, 0) in result diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_ix.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_ix.py new file mode 100644 index 0000000..01b0b39 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_ix.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +from pandas import DataFrame, MultiIndex +import pandas._testing as tm + + +class TestMultiIndex: + def test_frame_setitem_loc(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + frame.loc[("bar", "two"), "B"] = 5 + assert frame.loc[("bar", "two"), "B"] == 5 + + # with integer labels + df = frame.copy() + df.columns = list(range(3)) + df.loc[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 + + def test_loc_general(self): + + # GH 2817 + data = { + "amount": {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + "col": {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + "year": {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}, + } + df = DataFrame(data).set_index(keys=["col", "year"]) + key = 4.0, 2012 + + # emits a PerformanceWarning, ok + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(df.loc[key], df.iloc[2:]) + + # this is ok + df.sort_index(inplace=True) + res = df.loc[key] + + # col has float dtype, result should be Float64Index + index = MultiIndex.from_arrays([[4.0] * 3, [2012] * 3], names=["col", "year"]) + expected = DataFrame({"amount": [222, 333, 444]}, index=index) + tm.assert_frame_equal(res, expected) + + def test_loc_multiindex_missing_label_raises(self): + # GH 21593 + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + with pytest.raises(KeyError, match=r"^2$"): + df.loc[2] + + def test_series_loc_getitem_fancy( + self, multiindex_year_month_day_dataframe_random_data + ): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.reindex(s.index[49:51]) + result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_loc.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_loc.py new file mode 100644 index 0000000..3b8aa96 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_loc.py @@ -0,0 +1,470 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm +from pandas.core.indexing import IndexingError + + +@pytest.fixture +def single_level_multiindex(): + """single level MultiIndex""" + return MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) + + +@pytest.fixture +def frame_random_data_integer_multi_index(): + levels = [[0, 1], [0, 1, 2]] + codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, codes=codes) + return DataFrame(np.random.randn(6, 2), index=index) + + +class TestMultiIndexLoc: + def test_loc_getitem_series(self): + # 
GH14730 + # passing a series as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = Series([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + result = x.loc[[1, 3]] + tm.assert_series_equal(result, expected) + + # GH15424 + y1 = Series([1, 3], index=[1, 2]) + result = x.loc[y1] + tm.assert_series_equal(result, expected) + + empty = Series(data=[], dtype=np.float64) + expected = Series( + [], + index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype=np.float64, + ) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + def test_loc_getitem_array(self): + # GH15434 + # passing an array as a key with a MultiIndex + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) + x = Series(index=index, data=range(9), dtype=np.float64) + y = np.array([1, 3]) + expected = Series( + data=[0, 1, 2, 6, 7, 8], + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) + result = x.loc[y] + tm.assert_series_equal(result, expected) + + # empty array: + empty = np.array([]) + expected = Series( + [], + index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype="float64", + ) + result = x.loc[empty] + tm.assert_series_equal(result, expected) + + # 0-dim array (scalar): + scalar = np.int64(1) + expected = Series(data=[0, 1, 2], index=["A", "B", "C"], dtype=np.float64) + result = x.loc[scalar] + tm.assert_series_equal(result, expected) + + def test_loc_multiindex_labels(self): + df = DataFrame( + np.random.randn(3, 3), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) + + # the first 2 rows + expected = df.iloc[[0, 1]].droplevel(0) + result = df.loc["i"] + tm.assert_frame_equal(result, expected) + + # 2nd (last) column + expected = df.iloc[:, [2]].droplevel(0, axis=1) + result = df.loc[:, "j"] + tm.assert_frame_equal(result, expected) + + # bottom right corner + expected = df.iloc[[2], [2]].droplevel(0).droplevel(0, axis=1) + result = df.loc["j"].loc[:, "j"] + tm.assert_frame_equal(result, expected) + + # with a tuple + expected = df.iloc[[0, 1]] + result = df.loc[("i", "X")] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_ints(self): + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + expected = df.iloc[[0, 1]].droplevel(0) + result = df.loc[4] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_missing_label_raises(self): + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + with pytest.raises(KeyError, match=r"^2$"): + df.loc[2] + + @pytest.mark.parametrize("key, pos", [([2, 4], [0, 1]), ([2], []), ([2, 3], [])]) + def test_loc_multiindex_list_missing_label(self, key, pos): + # GH 27148 - lists with missing labels do not raise: + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) + + expected = df.iloc[pos] + result = df.loc[key] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_too_many_dims_raises(self): + # GH 14885 + s = Series( + range(8), + index=MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + ) + + with pytest.raises(KeyError, match=r"^\('a', 
'b'\)$"): + s.loc["a", "b"] + with pytest.raises(KeyError, match=r"^\('a', 'd', 'g'\)$"): + s.loc["a", "d", "g"] + with pytest.raises(IndexingError, match="Too many indexers"): + s.loc["a", "d", "g", "j"] + + def test_loc_multiindex_indexer_none(self): + + # GH6788 + # multi-index indexer is None (meaning take all) + attributes = ["Attribute" + str(i) for i in range(1)] + attribute_values = ["Value" + str(i) for i in range(5)] + + index = MultiIndex.from_product([attributes, attribute_values]) + df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 + df = DataFrame(df, columns=index) + result = df[attributes] + tm.assert_frame_equal(result, df) + + # GH 7349 + # loc with a multi-index seems to be doing fallback + df = DataFrame( + np.arange(12).reshape(-1, 1), + index=MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]]), + ) + + expected = df.loc[([1, 2],), :] + result = df.loc[[1, 2]] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_incomplete(self): + + # GH 7399 + # incomplete indexers + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.loc[:, "a":"c"] + + result = s.loc[0:4, "a":"c"] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + result = s.loc[:4, "a":"c"] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + result = s.loc[0:, "a":"c"] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + + # GH 7400 + # multiindexer gettitem with list of indexers skips wrong element + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.iloc[[6, 7, 8, 12, 13, 14]] + result = s.loc[2:4:2, "a":"c"] + tm.assert_series_equal(result, expected) + + def test_get_loc_single_level(self, single_level_multiindex): + single_level = single_level_multiindex + s = Series(np.random.randn(len(single_level)), index=single_level) + for k in single_level.values: + s[k] + + def test_loc_getitem_int_slice(self): + # GH 3053 + # loc should treat integer slices like label slices + + index = MultiIndex.from_product([[6, 7, 8], ["a", "b"]]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[6:8, :] + expected = df + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([[10, 20, 30], ["a", "b"]]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[20:30, :] + expected = df.iloc[2:] + tm.assert_frame_equal(result, expected) + + # doc examples + result = df.loc[10, :] + expected = df.iloc[0:2] + expected.index = ["a", "b"] + tm.assert_frame_equal(result, expected) + + result = df.loc[:, 10] + expected = df[10] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer_type_1", (list, tuple, set, slice, np.ndarray, Series, Index) + ) + @pytest.mark.parametrize( + "indexer_type_2", (list, tuple, set, slice, np.ndarray, Series, Index) + ) + def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): + # GH #19686 + # .loc should work with nested indexers which can be + # any list-like objects (see `pandas.api.types.is_list_like`) or slices + + def convert_nested_indexer(indexer_type, keys): + if indexer_type == np.ndarray: + return np.array(keys) + if indexer_type == slice: + return slice(*keys) + return indexer_type(keys) + + a = [10, 20, 30] + b = [1, 2, 3] + index = MultiIndex.from_product([a, b]) + df = DataFrame( + np.arange(len(index), dtype="int64"), index=index, 
columns=["Data"] + ) + + keys = ([10, 20], [2, 3]) + types = (indexer_type_1, indexer_type_2) + + # check indexers with all the combinations of nested objects + # of all the valid types + indexer = tuple( + convert_nested_indexer(indexer_type, k) + for indexer_type, k in zip(types, keys) + ) + + result = df.loc[indexer, "Data"] + expected = Series( + [1, 2, 4, 5], name="Data", index=MultiIndex.from_product(keys) + ) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer, pos", + [ + ([], []), # empty ok + (["A"], slice(3)), + (["A", "D"], slice(3)), + (["D", "E"], []), # no values found - fine + (["D"], []), # same, with single item list: GH 27148 + (pd.IndexSlice[:, ["foo"]], slice(2, None, 3)), + (pd.IndexSlice[:, ["foo", "bah"]], slice(2, None, 3)), + ], +) +def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos): + # GH 7866 + # multi-index slicing with missing indexers + idx = MultiIndex.from_product( + [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + s = Series(np.arange(9, dtype="int64"), index=idx).sort_index() + expected = s.iloc[pos] + result = s.loc[indexer] + tm.assert_series_equal(result, expected) + + +def test_series_loc_getitem_fancy(multiindex_year_month_day_dataframe_random_data): + s = multiindex_year_month_day_dataframe_random_data["A"] + expected = s.reindex(s.index[49:51]) + result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns_indexer", [([], slice(None)), (["foo"], [])]) +def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): + # GH 8737 + # empty indexer + multi_index = MultiIndex.from_product((["foo", "bar", "baz"], ["alpha", "beta"])) + df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index) + df = df.sort_index(level=0, axis=1) + + expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0]) + result = df.loc[:, columns_indexer] + tm.assert_frame_equal(result, expected) + + +def test_loc_getitem_duplicates_multiindex_non_scalar_type_object(): + # regression from < 0.14.0 + # GH 7914 + df = DataFrame( + [[np.mean, np.median], ["mean", "median"]], + columns=MultiIndex.from_tuples([("functs", "mean"), ("functs", "median")]), + index=["function", "name"], + ) + result = df.loc["function", ("functs", "mean")] + expected = np.mean + assert result == expected + + +def test_loc_getitem_tuple_plus_slice(): + # GH 671 + df = DataFrame( + { + "a": np.arange(10), + "b": np.arange(10), + "c": np.random.randn(10), + "d": np.random.randn(10), + } + ).set_index(["a", "b"]) + expected = df.loc[0, 0] + result = df.loc[(0, 0), :] + tm.assert_series_equal(result, expected) + + +def test_loc_getitem_int(frame_random_data_integer_multi_index): + df = frame_random_data_integer_multi_index + result = df.loc[1] + expected = df[-3:] + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + + +def test_loc_getitem_int_raises_exception(frame_random_data_integer_multi_index): + df = frame_random_data_integer_multi_index + with pytest.raises(KeyError, match=r"^3$"): + df.loc[3] + + +def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + + # test setup - check key not in dataframe + with pytest.raises(KeyError, match=r"^\('bar', 'three'\)$"): + df.loc[("bar", "three"), "B"] + + # in theory should be inserting in a sorted space???? 
+ df.loc[("bar", "three"), "B"] = 0 + expected = 0 + result = df.sort_index().loc[("bar", "three"), "B"] + assert result == expected + + +def test_loc_setitem_single_column_slice(): + # case from https://github.com/pandas-dev/pandas/issues/27841 + df = DataFrame( + "string", + index=list("abcd"), + columns=MultiIndex.from_product([["Main"], ("another", "one")]), + ) + df["labels"] = "a" + df.loc[:, "labels"] = df.index + tm.assert_numpy_array_equal(np.asarray(df["labels"]), np.asarray(df.index)) + + # test with non-object block + df = DataFrame( + np.nan, + index=range(4), + columns=MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]), + ) + expected = df.copy() + df.loc[:, "B"] = np.arange(4) + expected.iloc[:, 2] = np.arange(4) + tm.assert_frame_equal(df, expected) + + +def test_loc_nan_multiindex(): + # GH 5286 + tups = [ + ("Good Things", "C", np.nan), + ("Good Things", "R", np.nan), + ("Bad Things", "C", np.nan), + ("Bad Things", "T", np.nan), + ("Okay Things", "N", "B"), + ("Okay Things", "N", "D"), + ("Okay Things", "B", np.nan), + ("Okay Things", "D", np.nan), + ] + df = DataFrame( + np.ones((8, 4)), + columns=Index(["d1", "d2", "d3", "d4"]), + index=MultiIndex.from_tuples(tups, names=["u1", "u2", "u3"]), + ) + result = df.loc["Good Things"].loc["C"] + expected = DataFrame( + np.ones((1, 4)), + index=Index([np.nan], dtype="object", name="u3"), + columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_period_string_indexing(): + # GH 9892 + a = pd.period_range("2013Q1", "2013Q4", freq="Q") + i = (1111, 2222, 3333) + idx = pd.MultiIndex.from_product((a, i), names=("Periode", "CVR")) + df = pd.DataFrame( + index=idx, + columns=( + "OMS", + "OMK", + "RES", + "DRIFT_IND", + "OEVRIG_IND", + "FIN_IND", + "VARE_UD", + "LOEN_UD", + "FIN_UD", + ), + ) + result = df.loc[("2013Q1", 1111), "OMS"] + expected = pd.Series( + [np.nan], + dtype=object, + name="OMS", + index=pd.MultiIndex.from_tuples( + [(pd.Period("2013Q1"), 1111)], names=["Periode", "CVR"] + ), + ) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py new file mode 100644 index 0000000..0064187 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_multiindex.py @@ -0,0 +1,113 @@ +import numpy as np + +import pandas._libs.index as _index +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm + + +class TestMultiIndexBasic: + def test_multiindex_perf_warn(self): + + df = DataFrame( + { + "jim": [0, 0, 1, 1], + "joe": ["x", "x", "z", "y"], + "jolie": np.random.rand(4), + } + ).set_index(["jim", "joe"]) + + with tm.assert_produces_warning(PerformanceWarning): + df.loc[(1, "z")] + + df = df.iloc[[2, 1, 3, 0]] + with tm.assert_produces_warning(PerformanceWarning): + df.loc[(0,)] + + def test_multiindex_contains_dropped(self): + # GH 19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" 
in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + def test_indexing_over_hashtable_size_cutoff(self): + n = 10000 + + old_cutoff = _index._SIZE_CUTOFF + _index._SIZE_CUTOFF = 20000 + + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 + + _index._SIZE_CUTOFF = old_cutoff + + def test_multi_nan_indexing(self): + + # GH 3588 + df = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + result = df.set_index(["a", "b"], drop=False) + expected = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + }, + index=[ + Index(["R1", "R2", np.nan, "R4"], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) + tm.assert_frame_equal(result, expected) + + def test_contains(self): + # GH 24570 + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx + + def test_nested_tuples_duplicates(self): + # GH#30892 + + dti = pd.to_datetime(["20190101", "20190101", "20190102"]) + idx = pd.Index(["a", "a", "c"]) + mi = pd.MultiIndex.from_arrays([dti, idx], names=["index1", "index2"]) + + df = pd.DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi) + + expected = pd.DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi) + + df2 = df.copy(deep=True) + df2.loc[(dti[0], "a"), "c2"] = 1.0 + tm.assert_frame_equal(df2, expected) + + df3 = df.copy(deep=True) + df3.loc[[(dti[0], "a")], "c2"] = 1.0 + tm.assert_frame_equal(df3, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_partial.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_partial.py new file mode 100644 index 0000000..9d181bd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_partial.py @@ -0,0 +1,194 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex +import pandas._testing as tm + + +class TestMultiIndexPartial: + def test_getitem_partial_int(self): + # GH 12416 + # with single item + l1 = [10, 20] + l2 = ["a", "b"] + df = DataFrame(index=range(2), columns=MultiIndex.from_product([l1, l2])) + expected = DataFrame(index=range(2), columns=l2) + result = df[20] + tm.assert_frame_equal(result, expected) + + # with list + expected = DataFrame( + index=range(2), columns=MultiIndex.from_product([l1[1:], l2]) + ) + result = df[[20]] + tm.assert_frame_equal(result, expected) + + # missing item: + with pytest.raises(KeyError, match="1"): + df[1] + with pytest.raises(KeyError, match=r"'\[1\] not in index'"): + df[[1]] + + def test_series_slice_partial(self): + pass + + def test_xs_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + result = frame.xs("foo") + result2 = frame.loc["foo"] + expected = frame.T["foo"].T + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + result = ymd.xs((2000, 4)) + expected = ymd.loc[2000, 4] + tm.assert_frame_equal(result, expected) + + # ex from #1796 + index = MultiIndex( + levels=[["foo", "bar"], ["one", "two"], [-1, 1]], + codes=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 
1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + ) + df = DataFrame(np.random.randn(8, 4), index=index, columns=list("abcd")) + + result = df.xs(["foo", "one"]) + expected = df.loc["foo", "one"] + tm.assert_frame_equal(result, expected) + + def test_getitem_partial(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + ymd = ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.codes[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + tm.assert_frame_equal(result, expected) + + def test_fancy_slice_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): + frame = multiindex_dataframe_random_data + result = frame.loc["bar":"baz"] + expected = frame[3:7] + tm.assert_frame_equal(result, expected) + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[(2000, 2):(2000, 4)] + lev = ymd.index.codes[1] + expected = ymd[(lev >= 1) & (lev <= 3)] + tm.assert_frame_equal(result, expected) + + def test_getitem_partial_column_select(self): + idx = MultiIndex( + codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[["a", "b"], ["x", "y"], ["p", "q"]], + ) + df = DataFrame(np.random.rand(3, 2), index=idx) + + result = df.loc[("a", "y"), :] + expected = df.loc[("a", "y")] + tm.assert_frame_equal(result, expected) + + result = df.loc[("a", "y"), [1, 0]] + expected = df.loc[("a", "y")][[1, 0]] + tm.assert_frame_equal(result, expected) + + with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): + df.loc[("a", "foo"), :] + + def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): + # GH #397 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd.copy() + exp = ymd.copy() + df.loc[2000, 4] = 0 + exp.loc[2000, 4].values[:] = 0 + tm.assert_frame_equal(df, exp) + + df["A"].loc[2000, 4] = 1 + exp["A"].loc[2000, 4].values[:] = 1 + tm.assert_frame_equal(df, exp) + + df.loc[2000] = 5 + exp.loc[2000].values[:] = 5 + tm.assert_frame_equal(df, exp) + + # this works...for now + df["A"].iloc[14] = 5 + assert df["A"][14] == 5 + + # --------------------------------------------------------------------- + # AMBIGUOUS CASES! + + def test_partial_loc_missing(self, multiindex_year_month_day_dataframe_random_data): + pytest.skip("skipping for now") + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[2000, 0] + expected = ymd.loc[2000]["A"] + tm.assert_series_equal(result, expected) + + # need to put in some work here + + # self.ymd.loc[2000, 0] = 0 + # assert (self.ymd.loc[2000]['A'] == 0).all() + + # Pretty sure the second (and maybe even the first) is already wrong. 
+ with pytest.raises(Exception): + ymd.loc[(2000, 6)] + with pytest.raises(Exception): + ymd.loc[(2000, 6), 0] + + # --------------------------------------------------------------------- + + def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + expected = frame.copy() + result = frame.copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame.copy() + result = frame.copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_series_equal(result, expected) + + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 + tm.assert_series_equal(result, expected) + + +def test_loc_getitem_partial_both_axis(): + # gh-12660 + iterables = [["a", "b"], [2, 1]] + columns = MultiIndex.from_product(iterables, names=["col1", "col2"]) + rows = MultiIndex.from_product(iterables, names=["row1", "row2"]) + df = DataFrame(np.random.randn(4, 4), index=rows, columns=columns) + expected = df.iloc[:2, 2:].droplevel("row1").droplevel("col1", axis=1) + result = df.loc["a", "b"] + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_set_ops.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_set_ops.py new file mode 100644 index 0000000..f2cbfad --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_set_ops.py @@ -0,0 +1,41 @@ +from numpy.random import randn + +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestMultiIndexSetOps: + def test_multiindex_symmetric_difference(self): + # GH 13490 + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) + result = idx ^ idx + assert result.names == idx.names + + idx2 = idx.copy().rename(["A", "B"]) + result = idx ^ idx2 + assert result.names == [None, None] + + def test_mixed_depth_insert(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df.copy() + expected = df.copy() + result["b"] = [1, 2, 3, 4] + expected["b", "", ""] = [1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_dataframe_insert_column_all_na(self): + # GH #1534 + mix = MultiIndex.from_tuples([("1a", "2a"), ("1a", "2b"), ("1a", "2c")]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) + s = Series({(1, 1): 1, (1, 2): 2}) + df["new"] = s + assert df["new"].isna().all() diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_setitem.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_setitem.py new file mode 100644 index 0000000..aebd1ad --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_setitem.py @@ -0,0 +1,443 @@ +import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna +import pandas._testing as tm +import pandas.core.common as com + + 
+class TestMultiIndexSetItem: + def test_setitem_multiindex(self): + for index_fn in ("loc",): + + def assert_equal(a, b): + assert a == b + + def check(target, indexers, value, compare_fn, expected=None): + fn = getattr(target, index_fn) + fn.__setitem__(indexers, value) + result = fn.__getitem__(indexers) + if expected is None: + expected = value + compare_fn(result, expected) + + # GH7190 + index = MultiIndex.from_product( + [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"] + ) + t, n = 0, 2 + df = DataFrame( + np.nan, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check(target=df, indexers=((t, n), "X"), value=0, compare_fn=assert_equal) + + df = DataFrame( + -999, columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index + ) + check(target=df, indexers=((t, n), "X"), value=1, compare_fn=assert_equal) + + df = DataFrame( + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index + ) + check(target=df, indexers=((t, n), "X"), value=2, compare_fn=assert_equal) + + # gh-7218: assigning with 0-dim arrays + df = DataFrame( + -999, columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index + ) + check( + target=df, + indexers=((t, n), "X"), + value=np.array(3), + compare_fn=assert_equal, + expected=3, + ) + + # GH5206 + df = DataFrame( + np.arange(25).reshape(5, 5), columns="A,B,C,D,E".split(","), dtype=float + ) + df["F"] = 99 + row_selection = df["A"] % 2 == 0 + col_selection = ["B", "C"] + df.loc[row_selection, col_selection] = df["F"] + output = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"]) + tm.assert_frame_equal(df.loc[row_selection, col_selection], output) + check( + target=df, + indexers=(row_selection, col_selection), + value=df["F"], + compare_fn=tm.assert_frame_equal, + expected=output, + ) + + # GH11372 + idx = MultiIndex.from_product( + [["A", "B", "C"], date_range("2015-01-01", "2015-04-01", freq="MS")] + ) + cols = MultiIndex.from_product( + [["foo", "bar"], date_range("2016-01-01", "2016-02-01", freq="MS")] + ) + + df = DataFrame(np.random.random((12, 4)), index=idx, columns=cols) + + subidx = MultiIndex.from_tuples( + [("A", Timestamp("2015-01-01")), ("A", Timestamp("2015-02-01"))] + ) + subcols = MultiIndex.from_tuples( + [("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01"))] + ) + + vals = DataFrame(np.random.random((2, 2)), index=subidx, columns=subcols) + check( + target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # set all columns + vals = DataFrame(np.random.random((2, 4)), index=subidx, columns=cols) + check( + target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # identity + copy = df.copy() + check( + target=df, + indexers=(df.index, df.columns), + value=df, + compare_fn=tm.assert_frame_equal, + expected=copy, + ) + + def test_multiindex_setitem(self): + + # GH 3738 + # setting with a multi-index right hand side + arrays = [ + np.array(["bar", "bar", "baz", "qux", "qux", "bar"]), + np.array(["one", "two", "one", "one", "two", "one"]), + np.arange(0, 6, 1), + ] + + df_orig = DataFrame( + np.random.randn(6, 3), index=arrays, columns=["A", "B", "C"] + ).sort_index() + + expected = df_orig.loc[["bar"]] * 2 + df = df_orig.copy() + df.loc[["bar"]] *= 2 + tm.assert_frame_equal(df.loc[["bar"]], expected) + + # raise because these have differing levels + with pytest.raises(TypeError): + df.loc["bar"] *= 2 + + # from SO + # 
https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + df_orig = DataFrame.from_dict( + { + "price": { + ("DE", "Coal", "Stock"): 2, + ("DE", "Gas", "Stock"): 4, + ("DE", "Elec", "Demand"): 1, + ("FR", "Gas", "Stock"): 5, + ("FR", "Solar", "SupIm"): 0, + ("FR", "Wind", "SupIm"): 0, + } + } + ) + df_orig.index = MultiIndex.from_tuples( + df_orig.index, names=["Sit", "Com", "Type"] + ) + + expected = df_orig.copy() + expected.iloc[[0, 2, 3]] *= 2 + + idx = pd.IndexSlice + df = df_orig.copy() + df.loc[idx[:, :, "Stock"], :] *= 2 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, :, "Stock"], "price"] *= 2 + tm.assert_frame_equal(df, expected) + + def test_multiindex_assignment(self): + + # GH3777 part 2 + + # mixed dtype + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) + df["d"] = np.nan + arr = np.array([0.0, 1.0]) + + df.loc[4, "d"] = arr + tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) + + # single dtype + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) + + df.loc[4, "c"] = arr + exp = Series(arr, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.loc[4, "c"], exp) + + # scalar ok + df.loc[4, "c"] = 10 + exp = Series(10, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.loc[4, "c"], exp) + + # invalid assignments + with pytest.raises(ValueError): + df.loc[4, "c"] = [0, 1, 2, 3] + + with pytest.raises(ValueError): + df.loc[4, "c"] = [0] + + # groupby example + NUM_ROWS = 100 + NUM_COLS = 10 + col_names = ["A" + num for num in map(str, np.arange(NUM_COLS).tolist())] + index_cols = col_names[:5] + + df = DataFrame( + np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, + columns=col_names, + ) + df = df.set_index(index_cols).sort_index() + grp = df.groupby(level=index_cols[:4]) + df["new_col"] = np.nan + + f_index = np.arange(5) + + def f(name, df2): + return Series(np.arange(df2.shape[0]), name=df2.index.values[0]).reindex( + f_index + ) + + # TODO(wesm): unused? 
+ # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T + + # we are actually operating on a copy here + # but in this case, that's ok + for name, df2 in grp: + new_vals = np.arange(df2.shape[0]) + df.loc[name, "new_col"] = new_vals + + def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd["A"] + + s[2000, 3] = np.nan + assert isna(s.values[42:65]).all() + assert notna(s.values[:42]).all() + assert notna(s.values[65:]).all() + + s[2000, 3, 10] = np.nan + assert isna(s[49]) + + def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + tm.assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + tm.assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + tm.assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + tm.assert_almost_equal(df.values, values) + + with pytest.raises(TypeError, match="boolean values only"): + df[df * 0] = 2 + + def test_frame_getitem_setitem_multislice(self): + levels = [["t1", "t2"], ["a", "b", "c"]] + codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"]) + df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx) + + result = df.loc[:, "value"] + tm.assert_series_equal(df["value"], result) + + result = df.loc[df.index[1:3], "value"] + tm.assert_series_equal(df["value"][1:3], result) + + result = df.loc[:, :] + tm.assert_frame_equal(df, result) + + result = df + df.loc[:, "value"] = 10 + result["value"] = 10 + tm.assert_frame_equal(df, result) + + df.loc[:, :] = 10 + tm.assert_frame_equal(df, result) + + def test_frame_setitem_multi_column(self): + df = DataFrame(randn(10, 4), columns=[["a", "a", "b", "b"], [0, 1, 0, 1]]) + + cp = df.copy() + cp["a"] = cp["b"] + tm.assert_frame_equal(cp["a"], cp["b"]) + + # set with ndarray + cp = df.copy() + cp["a"] = cp["b"].values + tm.assert_frame_equal(cp["a"], cp["b"]) + + # --------------------------------------- + # #1803 + columns = MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]) + df = DataFrame(index=[1, 3, 5], columns=columns) + + # Works, but adds a column instead of updating the two existing ones + df["A"] = 0.0 # Doesn't work + assert (df["A"].values == 0).all() + + # it broadcasts + df["B", "1"] = [1, 2, 3] + df["A"] = df["B", "1"] + + sliced_a1 = df["A", "1"] + sliced_a2 = df["A", "2"] + sliced_b1 = df["B", "1"] + tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) + tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) + assert sliced_a1.name == ("A", "1") + assert sliced_a2.name == ("A", "2") + assert sliced_b1.name == ("B", "1") + + def test_getitem_setitem_tuple_plus_columns( + self, multiindex_year_month_day_dataframe_random_data + ): + # GH #1013 + ymd = multiindex_year_month_day_dataframe_random_data + df = ymd[:5] + + result = df.loc[(2000, 1, 6), ["A", "B", "C"]] + expected = df.loc[2000, 1, 6][["A", "B", "C"]] + tm.assert_series_equal(result, expected) + + def test_getitem_setitem_slice_integers(self): + index = MultiIndex( + levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] + ) + + frame = DataFrame( + np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"] + ) + res 
= frame.loc[1:2] + exp = frame.reindex(frame.index[2:]) + tm.assert_frame_equal(res, exp) + + frame.loc[1:2] = 7 + assert (frame.loc[1:2] == 7).values.all() + + series = Series(np.random.randn(len(index)), index=index) + + res = series.loc[1:2] + exp = series.reindex(series.index[2:]) + tm.assert_series_equal(res, exp) + + series.loc[1:2] = 7 + assert (series.loc[1:2] == 7).values.all() + + def test_setitem_change_dtype(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + dft = frame.T + s = dft["foo", "two"] + dft["foo", "two"] = s > s.median() + tm.assert_series_equal(dft["foo", "two"], s > s.median()) + # assert isinstance(dft._data.blocks[1].items, MultiIndex) + + reindexed = dft.reindex(columns=[("foo", "two")]) + tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) + + def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + subset = frame.index[[1, 4, 5]] + + frame.loc[subset] = 99 + assert (frame.loc[subset].values == 99).all() + + col = frame["B"] + col[subset] = 97 + assert (frame.loc[subset, "B"] == 97).all() + + def test_nonunique_assignment_1750(self): + df = DataFrame( + [[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD") + ) + + df = df.set_index(["A", "B"]) + ix = MultiIndex.from_tuples([(1, 1)]) + + df.loc[ix, "C"] = "_" + + assert (df.xs((1, 1))["C"] == "_").all() + + def test_astype_assignment_with_dups(self): + + # GH 4686 + # assignment with dups that has a dtype change + cols = MultiIndex.from_tuples([("A", "1"), ("B", "1"), ("A", "2")]) + df = DataFrame(np.arange(3).reshape((1, 3)), columns=cols, dtype=object) + index = df.index.copy() + + df["A"] = df["A"].astype(np.float64) + tm.assert_index_equal(df.index, index) + + +def test_frame_setitem_view_direct(multiindex_dataframe_random_data): + # this works because we are modifying the underlying array + # really a no-no + df = multiindex_dataframe_random_data.T + df["foo"].values[:] = 0 + assert (df["foo"].values == 0).all() + + +def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): + # will raise/warn as its chained assignment + df = multiindex_dataframe_random_data.T + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 + + +def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data.T + expected = frame + df = frame.copy() + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 + + result = df + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_slice.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_slice.py new file mode 100644 index 0000000..6fa9d3b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_slice.py @@ -0,0 +1,740 @@ +import numpy as np +import pytest + +from pandas.errors import UnsortedIndexError + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +import pandas._testing as tm +from pandas.core.indexing import _non_reducing_slice +from pandas.tests.indexing.common import _mklbl + + +class TestMultiIndexSlicers: + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + ix = MultiIndex.from_product( + [_mklbl("A", 5), _mklbl("B", 
7), _mklbl("C", 4), _mklbl("D", 2)] + ) + df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) + + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") + and (c == "C1" or c == "C2" or c == "C3") + ] + ] + result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :] + tm.assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + + df = DataFrame( + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) + df = df.sort_index(axis=0).sort_index(axis=1) + + # identity + result = df.loc[(slice(None), slice(None)), :] + tm.assert_frame_equal(result, df) + result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + result = df.loc[:, (slice(None), slice(None))] + tm.assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), 1), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # columns + result = df.loc[:, (slice(None), ["foo"])] + expected = df.iloc[:, [1, 3]] + tm.assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None), 1), (slice(None), ["foo"])] + expected = df.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(result, expected) + + result = df.loc["A", "a"] + expected = DataFrame( + dict(bar=[1, 5, 9], foo=[0, 4, 8]), + index=Index([1, 2, 3], name="two"), + columns=Index(["bar", "foo"], name="lvl1"), + ) + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), [1, 2]), :] + expected = df.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.to_numpy())), index=ix) + result = s.loc["A1":"A3", :, ["C1", "C3"]] + expected = s.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in s.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError): + df.loc[(slice(None), np.array([True, False])), :] + + # ambiguous notation + # this is interpreted as slicing on both axes (GH #16396) + result = df.loc[slice(None), [1]] + expected = df.iloc[:, []] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), [1]), :] + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + # not lexsorted + assert df.index.lexsort_depth == 2 + df = df.sort_index(level=1, axis=0) + assert df.index.lexsort_depth == 0 + + msg = ( + "MultiIndex slicing requires the index to be " + r"lexsorted: slicing on levels \[1\], lexsort depth 0" + ) + with pytest.raises(UnsortedIndexError, match=msg): + df.loc[(slice(None), slice("bar")), :] + + # GH 16734: not sorted, but no real slicing + result 
= df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] + tm.assert_frame_equal(result, df.iloc[[1, 3], :]) + + def test_multiindex_slicers_non_unique(self): + + # GH 7106 + # non-unique mi index support + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 3], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + assert not df.index.is_unique + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.loc[(slice(None), slice(None), 1), :] + tm.assert_frame_equal(result, expected) + + # this is equivalent of an xs expression + result = df.xs(1, level=2, drop_level=False) + tm.assert_frame_equal(result, expected) + + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 2], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + assert not df.index.is_unique + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.loc[(slice(None), slice(None), 1), :] + assert not result.index.is_unique + tm.assert_frame_equal(result, expected) + + # GH12896 + # numpy-implementation dependent bug + ints = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 12, + 13, + 14, + 14, + 16, + 17, + 18, + 19, + 200000, + 200000, + ] + n = len(ints) + idx = MultiIndex.from_arrays([["a"] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + tm.assert_series_equal(result, expected) + + def test_multiindex_slicers_datetimelike(self): + + # GH 7429 + # buggy/inconsistent behavior when slicing with datetime-like + import datetime + + dates = [ + datetime.datetime(2012, 1, 1, 12, 12, 12) + datetime.timedelta(days=i) + for i in range(6) + ] + freq = [1, 2] + index = MultiIndex.from_product([dates, freq], names=["date", "frequency"]) + + df = DataFrame( + np.arange(6 * 2 * 4, dtype="int64").reshape(-1, 4), + index=index, + columns=list("ABCD"), + ) + + # multi-axis slicing + idx = pd.IndexSlice + expected = df.iloc[[0, 2, 4], [0, 1]] + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + slice(1, 1), + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + idx[ + Timestamp("2012-01-01 12:12:12") : Timestamp("2012-01-03 12:12:12") + ], + idx[1:1], + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + 1, + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + # with strings + result = df.loc[ + (slice("2012-01-01 12:12:12", "2012-01-03 12:12:12"), slice(1, 1)), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + (idx["2012-01-01 12:12:12":"2012-01-03 12:12:12"], 1), idx["A", "B"] + ] + tm.assert_frame_equal(result, expected) + + def test_multiindex_slicers_edges(self): + # GH 8132 + # various edge cases + df = DataFrame( + { + "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5, + "B": ["B0", "B0", "B1", "B1", "B2"] * 3, + "DATE": [ + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + 
"2013-09-03", + "2013-10-01", + "2013-07-09", + "2013-08-06", + "2013-09-03", + ], + "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2], + } + ) + + df["DATE"] = pd.to_datetime(df["DATE"]) + df1 = df.set_index(["A", "B", "DATE"]) + df1 = df1.sort_index() + + # A1 - Get all values under "A0" and "A1" + result = df1.loc[(slice("A1")), :] + expected = df1.iloc[0:10] + tm.assert_frame_equal(result, expected) + + # A2 - Get all values from the start to "A2" + result = df1.loc[(slice("A2")), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # A3 - Get all values under "B1" or "B2" + result = df1.loc[(slice(None), slice("B1", "B2")), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] + tm.assert_frame_equal(result, expected) + + # A4 - Get all values between 2013-07-02 and 2013-07-09 + result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + # B1 - Get all values in B0 that are also under A0, A1 and A2 + result = df1.loc[(slice("A2"), slice("B0")), :] + expected = df1.iloc[[0, 1, 5, 6, 10, 11]] + tm.assert_frame_equal(result, expected) + + # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for + # the As) + result = df1.loc[(slice(None), slice("B2")), :] + expected = df1 + tm.assert_frame_equal(result, expected) + + # B3 - Get all values from B1 to B2 and up to 2013-08-06 + result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :] + expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] + tm.assert_frame_equal(result, expected) + + # B4 - Same as A4 but the start of the date slice is not a key. + # shows indexing on a partial selection slice + result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :] + expected = df1.iloc[[1, 2, 6, 7, 12]] + tm.assert_frame_equal(result, expected) + + def test_per_axis_per_level_doc_examples(self): + + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + # not sorted + with pytest.raises(UnsortedIndexError): + df.loc["A1", ("a", slice("foo"))] + + # GH 16734: not sorted, but no real slicing + tm.assert_frame_equal( + df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]] + ) + + df = df.sort_index(axis=1) + + # slicing + df.loc["A1", (slice(None), "foo")] + df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")] + + # setitem + df.loc(axis=0)[:, :, ["C1", "C3"]] = -10 + + def 
test_loc_axis_arguments(self): + + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = ( + DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + .sort_index() + .sort_index(axis=1) + ) + + # axis 0 + result = df.loc(axis=0)["A1":"A3", :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="index")[:, :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="columns")[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] + tm.assert_frame_equal(result, expected) + + # invalid axis + with pytest.raises(ValueError): + df.loc(axis=-1)[:, :, ["C1", "C3"]] + + with pytest.raises(ValueError): + df.loc(axis=2)[:, :, ["C1", "C3"]] + + with pytest.raises(ValueError): + df.loc(axis="foo")[:, :, ["C1", "C3"]] + + def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1":"a2"] + expected = df.iloc[:, :-3] + + tm.assert_frame_equal(result, expected) + + def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1"] + expected = df.iloc[:, :3] + expected.columns = ["b1", "b2", "b3"] + + tm.assert_frame_equal(result, expected) + + def test_loc_ax_single_level_indexer_simple_df(self): + + # GH29519 + # test single level indexing on single index column data frame + df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + result = df.loc(axis=1)["a"] + expected = pd.Series(np.array([0, 3, 6]), name="a") + tm.assert_series_equal(result, expected) + + def test_per_axis_per_level_setitem(self): + + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + + df_orig = DataFrame( + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) + df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None), slice(None)), :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, :] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + 
tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:, (slice(None), slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:, :] = 100 + tm.assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None), [1]), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), :] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:, 1] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:, (slice(None), ["foo"])] = 100 + expected = df_orig.copy() + expected.iloc[:, [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:, 1], idx[:, ["foo"]]] = 100 + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc["A", "a"] = 100 + expected = df_orig.copy() + expected.iloc[0:3, 0:2] = 100 + tm.assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100, 100], [100, 100]], dtype="int64" + ) + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = 100 + tm.assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + + with pytest.raises(ValueError): + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100], [100, 100]], dtype="int64" + ) + + with pytest.raises(ValueError): + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [100, 100, 100, 100], dtype="int64" + ) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] = ( + df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5 + ) + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[ + (slice(None), 1), (slice(None), ["foo"]) + ] + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy() + rhs.loc[:, ("c", "bah")] = 10 + df = df_orig.copy() + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs + expected = df_orig.copy() + expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] + tm.assert_frame_equal(df, expected) + + def test_multiindex_label_slicing_with_negative_step(self): + s = Series( + np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)]) + ) + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + + assert_slices_equivalent(SLC[::-1], SLC[::-1]) + + assert_slices_equivalent(SLC["d"::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[("d",)::-1], SLC[15::-1]) + + assert_slices_equivalent(SLC[:"d":-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:("d",):-1], SLC[:11:-1]) + + assert_slices_equivalent(SLC["d":"b":-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):"b":-1], 
SLC[15:3:-1]) + assert_slices_equivalent(SLC["d":("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC["b":"d":-1], SLC[:0]) + + assert_slices_equivalent(SLC[("c", 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:("c", 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]) + + def test_multiindex_slice_first_level(self): + # GH 12697 + freq = ["a", "b", "c", "d"] + idx = MultiIndex.from_product([freq, np.arange(500)]) + df = DataFrame(list(range(2000)), index=idx, columns=["Test"]) + df_slice = df.loc[pd.IndexSlice[:, 30:70], :] + result = df_slice.loc["a"] + expected = DataFrame(list(range(30, 71)), columns=["Test"], index=range(30, 71)) + tm.assert_frame_equal(result, expected) + result = df_slice.loc["d"] + expected = DataFrame( + list(range(1530, 1571)), columns=["Test"], index=range(30, 71) + ) + tm.assert_frame_equal(result, expected) + + def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd["A"] + result = s[5:] + expected = s.reindex(s.index[5:]) + tm.assert_series_equal(result, expected) + + exp = ymd["A"].copy() + s[5:] = 0 + exp.values[5:] = 0 + tm.assert_numpy_array_equal(s.values, exp.values) + + result = ymd[5:] + expected = ymd.reindex(s.index[5:]) + tm.assert_frame_equal(result, expected) + + def test_non_reducing_slice_on_multiindex(self): + # GH 19861 + dic = { + ("a", "d"): [1, 4], + ("a", "c"): [2, 3], + ("b", "c"): [3, 2], + ("b", "d"): [4, 1], + } + df = pd.DataFrame(dic, index=[0, 1]) + idx = pd.IndexSlice + slice_ = idx[:, idx["b", "d"]] + tslice_ = _non_reducing_slice(slice_) + + result = df.loc[tslice_] + expected = pd.DataFrame({("b", "d"): [4, 1]}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_sorted.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_sorted.py new file mode 100644 index 0000000..4bec0f4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_sorted.py @@ -0,0 +1,97 @@ +import numpy as np +from numpy.random import randn + +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestMultiIndexSorted: + def test_getitem_multilevel_index_tuple_not_sorted(self): + index_columns = list("abc") + df = DataFrame( + [[0, 1, 0, "x"], [0, 0, 1, "y"]], columns=index_columns + ["data"] + ) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=["a", "b", "c"]) + xp = Series(["x"], index=xp_idx, name="data") + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, : np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted2(self): + # 13431 + df = DataFrame( + { + "col1": ["b", "d", "b", "a"], + "col2": [3, 1, 1, 2], + "data": ["one", "two", "three", "four"], + } + ) + + df2 = df.set_index(["col1", "col2"]) + df2_original = df2.copy() + + df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) + assert not df2.index.is_lexsorted() + assert not df2.index.is_monotonic + + assert 
df2_original.index.equals(df2.index) + expected = df2.sort_index() + assert expected.index.is_lexsorted() + assert expected.index.is_monotonic + + result = df2.sort_index(level=0) + assert result.index.is_lexsorted() + assert result.index.is_monotonic + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + df["foo", "four"] = "foo" + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df["foo"] + result2 = df.loc[:, "foo"] + expected = df.reindex(columns=df.columns[arrays[0] == "foo"]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + result = df.xs("foo") + result2 = df.loc["foo"] + expected = df.reindex(df.index[arrays[0] == "foo"]) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s["qux"] + result2 = s.loc["qux"] + expected = s[arrays[0] == "qux"] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_xs.py b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_xs.py new file mode 100644 index 0000000..db8c0c6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/multiindex/test_xs.py @@ -0,0 +1,245 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +import pandas._testing as tm +import pandas.core.common as com + + +@pytest.fixture +def four_level_index_dataframe(): + arr = np.array( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], + ] + ) + index = MultiIndex( + levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], + codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], + names=["one", "two", "three", "four"], + ) + return DataFrame(arr, index=index, columns=list("ABCDE")) + + +@pytest.mark.parametrize( + "key, level, exp_arr, exp_index", + [ + ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), + ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), + ], +) +def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): + # see gh-2903 + arr = np.random.randn(4, 4) + index = MultiIndex( + levels=[["a", "b"], ["bar", "foo", "hello", "world"]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + names=["lvl0", "lvl1"], + ) + df = DataFrame(arr, columns=index) + result = df.xs(key, level=level, axis=1) + expected = DataFrame(exp_arr(arr), columns=exp_index) + tm.assert_frame_equal(result, expected) + + +def test_xs_values(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")).values + expected = df.values[4] + tm.assert_almost_equal(result, expected) + + +def test_xs_loc_equality(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")) + expected = 
df.loc[("bar", "two")] + tm.assert_series_equal(result, expected) + + +def test_xs_missing_values_in_index(): + # see gh-6574 + # missing values in returned index should be preserved + acc = [ + ("a", "abcde", 1), + ("b", "bbcde", 2), + ("y", "yzcde", 25), + ("z", "xbcde", 24), + ("z", None, 26), + ("z", "zbcde", 25), + ("z", "ybcde", 26), + ] + df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) + expected = DataFrame( + {"cnt": [24, 26, 25, 26]}, + index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), + ) + + result = df.xs("z", level="a1") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) +def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): + # see gh-13719 + frame = multiindex_dataframe_random_data + df = concat([frame] * 2) + assert df.index.is_unique is False + expected = concat([frame.xs("one", level="second")] * 2) + + result = df.xs(key, level=level) + tm.assert_frame_equal(result, expected) + + +def test_xs_level(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs("two", level="second") + expected = df[df.index.get_level_values(1) == "two"] + expected.index = Index(["foo", "bar", "baz", "qux"], name="first") + tm.assert_frame_equal(result, expected) + + +def test_xs_level_eq_2(): + arr = np.random.randn(3, 5) + index = MultiIndex( + levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], + codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], + ) + df = DataFrame(arr, index=index) + expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) + result = df.xs("c", level=2) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer", + [ + lambda df: df.xs(("a", 4), level=["one", "four"]), + lambda df: df.xs("a").xs(4, level="four"), + ], +) +def test_xs_level_multiple(indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] + expected_index = MultiIndex( + levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): + # this is a copy in 0.14 + df = multiindex_dataframe_random_data + result = df.xs("two", level="second") + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + + +def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): + # this is a copy in 0.14 + df = four_level_index_dataframe + result = df.xs(("a", 4), level=["one", "four"]) + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + + +def test_xs_integer_key(): + # see gh-2107 + dates = range(20111201, 20111205) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) + df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) + + result = df.xs(20111201, level="date") + expected = df.loc[20111201, :] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer", [lambda df: 
df.xs("a", level=0), lambda df: df.xs("a")] +) +def test_xs_level0(indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + ] + expected_index = MultiIndex( + levels=[["b", "q"], [10.0032, 20.0], [4, 5]], + codes=[[0, 1], [0, 1], [1, 0]], + names=["two", "three", "four"], + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) + + result = indexer(df) + tm.assert_frame_equal(result, expected) + + +def test_xs_level_series(multiindex_dataframe_random_data): + # this test is not explicitly testing .xs functionality + # TODO: move to another module or refactor + df = multiindex_dataframe_random_data + s = df["A"] + result = s[:, "two"] + expected = df.xs("two", level=1)["A"] + tm.assert_series_equal(result, expected) + + +def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): + # this test is not explicitly testing .xs functionality + # TODO: move to another module or refactor + df = multiindex_year_month_day_dataframe_random_data + s = df["A"] + result = s[2000, 5] + expected = df.loc[2000, 5]["A"] + tm.assert_series_equal(result, expected) + + +def test_xs_level_series_slice_not_implemented( + multiindex_year_month_day_dataframe_random_data, +): + # this test is not explicitly testing .xs functionality + # TODO: move to another module or refactor + # not implementing this for now + df = multiindex_year_month_day_dataframe_random_data + s = df["A"] + + msg = r"\(2000, slice\(3, 4, None\)\)" + with pytest.raises(TypeError, match=msg): + s[2000, 3:4] + + +def test_series_getitem_multiindex_xs(): + # GH6258 + dt = list(date_range("20130903", periods=3)) + idx = MultiIndex.from_product([list("AB"), dt]) + s = Series([1, 3, 4, 1, 3, 4], index=idx) + expected = Series([1, 1], index=list("AB")) + + result = s.xs("20130903", level=1) + tm.assert_series_equal(result, expected) + + +def test_series_getitem_multiindex_xs_by_label(): + # GH5684 + idx = MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] + ) + s = Series([1, 2, 3, 4], index=idx) + s.index.set_names(["L1", "L2"], inplace=True) + expected = Series([1, 3], index=["a", "b"]) + expected.index.set_names(["L1"], inplace=True) + + result = s.xs("one", level="L2") + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_callable.py b/venv/Lib/site-packages/pandas/tests/indexing/test_callable.py new file mode 100644 index 0000000..621417e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_callable.py @@ -0,0 +1,260 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +class TestIndexingCallable: + def test_frame_loc_callable(self): + # GH 11485 + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) + # iloc cannot use boolean Series (see GH3635) + + # return bool indexer + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) + + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) + + res = df.loc[lambda x: x.B == "b", :] + 
tm.assert_frame_equal(res, df.loc[df.B == "b", :]) + + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] + tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) + + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] + tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) + + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) + + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) + + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) + + # scalar + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] + + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] + + def test_frame_loc_callable_mixture(self): + # GH 11485 + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) + + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) + + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) + + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) + + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) + + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) + + def test_frame_loc_callable_labels(self): + # GH 11485 + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return label + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) + + res = df.loc[ + lambda x: ["A", "C"], + ] # noqa: E231 + tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 + + res = df.loc[lambda x: ["A", "C"], :] + tm.assert_frame_equal(res, df.loc[["A", "C"], :]) + + res = df.loc[lambda x: ["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[lambda x: ["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + # mixture + res = df.loc[["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + res = df.loc[lambda x: ["A", "C"], "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) + + res = df.loc[lambda x: ["A", "C"], ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) + + def test_frame_loc_callable_setitem(self): + # GH 11485 + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return label + res = df.copy() + res.loc[lambda x: ["A", "C"]] = -20 + exp = df.copy() + exp.loc[["A", "C"]] = -20 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], :] = 20 + exp = df.copy() + exp.loc[["A", "C"], :] = 20 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], lambda x: "X"] = -1 + exp = df.copy() + exp.loc[["A", "C"], 
"X"] = -1 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], lambda x: ["X"]] = [5, 10] + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = [5, 10] + tm.assert_frame_equal(res, exp) + + # mixture + res = df.copy() + res.loc[["A", "C"], lambda x: "X"] = np.array([-1, -2]) + exp = df.copy() + exp.loc[["A", "C"], "X"] = np.array([-1, -2]) + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[["A", "C"], lambda x: ["X"]] = 10 + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = 10 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], "X"] = -2 + exp = df.copy() + exp.loc[["A", "C"], "X"] = -2 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.loc[lambda x: ["A", "C"], ["X"]] = -4 + exp = df.copy() + exp.loc[["A", "C"], ["X"]] = -4 + tm.assert_frame_equal(res, exp) + + def test_frame_iloc_callable(self): + # GH 11485 + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return location + res = df.iloc[lambda x: [1, 3]] + tm.assert_frame_equal(res, df.iloc[[1, 3]]) + + res = df.iloc[lambda x: [1, 3], :] + tm.assert_frame_equal(res, df.iloc[[1, 3], :]) + + res = df.iloc[lambda x: [1, 3], lambda x: 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[lambda x: [1, 3], lambda x: [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + # mixture + res = df.iloc[[1, 3], lambda x: 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[[1, 3], lambda x: [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + res = df.iloc[lambda x: [1, 3], 0] + tm.assert_series_equal(res, df.iloc[[1, 3], 0]) + + res = df.iloc[lambda x: [1, 3], [0]] + tm.assert_frame_equal(res, df.iloc[[1, 3], [0]]) + + def test_frame_iloc_callable_setitem(self): + # GH 11485 + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + + # return location + res = df.copy() + res.iloc[lambda x: [1, 3]] = 0 + exp = df.copy() + exp.iloc[[1, 3]] = 0 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], :] = -1 + exp = df.copy() + exp.iloc[[1, 3], :] = -1 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], lambda x: 0] = 5 + exp = df.copy() + exp.iloc[[1, 3], 0] = 5 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], lambda x: [0]] = 25 + exp = df.copy() + exp.iloc[[1, 3], [0]] = 25 + tm.assert_frame_equal(res, exp) + + # mixture + res = df.copy() + res.iloc[[1, 3], lambda x: 0] = -3 + exp = df.copy() + exp.iloc[[1, 3], 0] = -3 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[[1, 3], lambda x: [0]] = -5 + exp = df.copy() + exp.iloc[[1, 3], [0]] = -5 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], 0] = 10 + exp = df.copy() + exp.iloc[[1, 3], 0] = 10 + tm.assert_frame_equal(res, exp) + + res = df.copy() + res.iloc[lambda x: [1, 3], [0]] = [-5, -5] + exp = df.copy() + exp.iloc[[1, 3], [0]] = [-5, -5] + tm.assert_frame_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_categorical.py b/venv/Lib/site-packages/pandas/tests/indexing/test_categorical.py new file mode 100644 index 0000000..8c8dece --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_categorical.py @@ -0,0 +1,822 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_categorical_dtype +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + 
CategoricalIndex, + DataFrame, + Index, + Interval, + Series, + Timedelta, + Timestamp, + conftest, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT + + +class TestCategoricalIndex: + def setup_method(self, method): + + self.df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cab"))), + } + ).set_index("B") + self.df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cabe"))), + } + ).set_index("B") + self.df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=True))), + } + ).set_index("B") + self.df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), + } + ).set_index("B") + + def test_loc_scalar(self): + result = self.df.loc["a"] + expected = DataFrame( + {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))} + ).set_index("B") + tm.assert_frame_equal(result, expected) + + df = self.df.copy() + df.loc["a"] = 20 + expected = DataFrame( + { + "A": [20, 20, 2, 3, 4, 20], + "B": (Series(list("aabbca")).astype(CDT(list("cab")))), + } + ).set_index("B") + tm.assert_frame_equal(df, expected) + + # value not in the categories + with pytest.raises(KeyError, match=r"^'d'$"): + df.loc["d"] + + msg = "cannot append a non-category item to a CategoricalIndex" + with pytest.raises(TypeError, match=msg): + df.loc["d"] = 10 + + msg = ( + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" + ) + with pytest.raises(TypeError, match=msg): + df.loc["d", "A"] = 10 + with pytest.raises(TypeError, match=msg): + df.loc["d", "C"] = 10 + + msg = ( + r"cannot do label indexing on with these indexers \[1\] of " + ) + with pytest.raises(TypeError, match=msg): + df.loc[1] + + def test_getitem_scalar(self): + + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) + + s = Series([1, 2], index=cats) + + expected = s.iloc[0] + result = s[cats[0]] + assert result == expected + + def test_slicing_directly(self): + cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) + sliced = cat[3] + assert sliced == "d" + sliced = cat[3:5] + expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(sliced._codes, expected._codes) + tm.assert_index_equal(sliced.categories, expected.categories) + + def test_slicing(self): + cat = Series(Categorical([1, 2, 3, 4])) + reversed = cat[::-1] + exp = np.array([4, 3, 2, 1], dtype=np.int64) + tm.assert_numpy_array_equal(reversed.__array__(), exp) + + df = DataFrame({"value": (np.arange(100) + 1).astype("int64")}) + df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) + + expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10) + result = df.iloc[10] + tm.assert_series_equal(result, expected) + + expected = DataFrame( + {"value": np.arange(11, 21).astype("int64")}, + index=np.arange(10, 20).astype("int64"), + ) + expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100]) + result = df.iloc[10:20] + tm.assert_frame_equal(result, expected) + + expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8) + result = df.loc[8] + tm.assert_series_equal(result, expected) + + def test_slicing_and_getting_ops(self): + + # systematically test the slicing operations: + # for all slicing ops: + # - returning a dataframe + # - returning a column + # - returning a row + # - returning a 
single value + + cats = Categorical( + ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"] + ) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 2, 3, 4, 5, 6, 7] + df = DataFrame({"cats": cats, "values": values}, index=idx) + + # the expected values + cats2 = Categorical(["b", "c"], categories=["a", "b", "c"]) + idx2 = Index(["j", "k"]) + values2 = [3, 4] + + # 2:4,: | "j":"k",: + exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + # :,"cats" | :,0 + exp_col = Series(cats, index=idx, name="cats") + + # "j",: | 2,: + exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j") + + # "j","cats | 2,0 + exp_val = "b" + + # iloc + # frame + res_df = df.iloc[2:4, :] + tm.assert_frame_equal(res_df, exp_df) + assert is_categorical_dtype(res_df["cats"]) + + # row + res_row = df.iloc[2, :] + tm.assert_series_equal(res_row, exp_row) + assert isinstance(res_row["cats"], str) + + # col + res_col = df.iloc[:, 0] + tm.assert_series_equal(res_col, exp_col) + assert is_categorical_dtype(res_col) + + # single value + res_val = df.iloc[2, 0] + assert res_val == exp_val + + # loc + # frame + res_df = df.loc["j":"k", :] + tm.assert_frame_equal(res_df, exp_df) + assert is_categorical_dtype(res_df["cats"]) + + # row + res_row = df.loc["j", :] + tm.assert_series_equal(res_row, exp_row) + assert isinstance(res_row["cats"], str) + + # col + res_col = df.loc[:, "cats"] + tm.assert_series_equal(res_col, exp_col) + assert is_categorical_dtype(res_col) + + # single value + res_val = df.loc["j", "cats"] + assert res_val == exp_val + + # ix + # frame + # res_df = df.loc["j":"k",[0,1]] # doesn't work? + res_df = df.loc["j":"k", :] + tm.assert_frame_equal(res_df, exp_df) + assert is_categorical_dtype(res_df["cats"]) + + # row + res_row = df.loc["j", :] + tm.assert_series_equal(res_row, exp_row) + assert isinstance(res_row["cats"], str) + + # col + res_col = df.loc[:, "cats"] + tm.assert_series_equal(res_col, exp_col) + assert is_categorical_dtype(res_col) + + # single value + res_val = df.loc["j", df.columns[0]] + assert res_val == exp_val + + # iat + res_val = df.iat[2, 0] + assert res_val == exp_val + + # at + res_val = df.at["j", "cats"] + assert res_val == exp_val + + # fancy indexing + exp_fancy = df.iloc[[2]] + + res_fancy = df[df["cats"] == "b"] + tm.assert_frame_equal(res_fancy, exp_fancy) + res_fancy = df[df["values"] == 3] + tm.assert_frame_equal(res_fancy, exp_fancy) + + # get_value + res_val = df.at["j", "cats"] + assert res_val == exp_val + + # i : int, slice, or sequence of integers + res_row = df.iloc[2] + tm.assert_series_equal(res_row, exp_row) + assert isinstance(res_row["cats"], str) + + res_df = df.iloc[slice(2, 4)] + tm.assert_frame_equal(res_df, exp_df) + assert is_categorical_dtype(res_df["cats"]) + + res_df = df.iloc[[2, 3]] + tm.assert_frame_equal(res_df, exp_df) + assert is_categorical_dtype(res_df["cats"]) + + res_col = df.iloc[:, 0] + tm.assert_series_equal(res_col, exp_col) + assert is_categorical_dtype(res_col) + + res_df = df.iloc[:, slice(0, 2)] + tm.assert_frame_equal(res_df, df) + assert is_categorical_dtype(res_df["cats"]) + + res_df = df.iloc[:, [0, 1]] + tm.assert_frame_equal(res_df, df) + assert is_categorical_dtype(res_df["cats"]) + + def test_slicing_doc_examples(self): + + # GH 7918 + cats = Categorical( + ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"] + ) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 2, 2, 2, 3, 4, 5] + df = DataFrame({"cats": cats, "values": values}, index=idx) + 
+ result = df.iloc[2:4, :] + expected = DataFrame( + { + "cats": Categorical(["b", "b"], categories=["a", "b", "c"]), + "values": [2, 2], + }, + index=["j", "k"], + ) + tm.assert_frame_equal(result, expected) + + result = df.iloc[2:4, :].dtypes + expected = Series(["category", "int64"], ["cats", "values"]) + tm.assert_series_equal(result, expected) + + result = df.loc["h":"j", "cats"] + expected = Series( + Categorical(["a", "b", "b"], categories=["a", "b", "c"]), + index=["h", "i", "j"], + name="cats", + ) + tm.assert_series_equal(result, expected) + + result = df.loc["h":"j", df.columns[0:1]] + expected = DataFrame( + {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])}, + index=["h", "i", "j"], + ) + tm.assert_frame_equal(result, expected) + + def test_getitem_category_type(self): + # GH 14580 + # test iloc() on Series with Categorical data + + s = Series([1, 2, 3]).astype("category") + + # get slice + result = s.iloc[0:2] + expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3])) + tm.assert_series_equal(result, expected) + + # get list of indexes + result = s.iloc[[0, 1]] + expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3])) + tm.assert_series_equal(result, expected) + + # get boolean array + result = s.iloc[[True, False, False]] + expected = Series([1]).astype(CategoricalDtype([1, 2, 3])) + tm.assert_series_equal(result, expected) + + def test_loc_listlike(self): + + # list of labels + result = self.df.loc[["c", "a"]] + expected = self.df.iloc[[4, 0, 1, 5]] + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = self.df2.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + tm.assert_frame_equal(result, expected, check_index_type=True) + + # element in the categories but not in the values + with pytest.raises(KeyError, match=r"^'e'$"): + self.df2.loc["e"] + + # assign is ok + df = self.df2.copy() + df.loc["e"] = 20 + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) + tm.assert_frame_equal(result, expected) + + df = self.df2.copy() + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + tm.assert_frame_equal(result, expected, check_index_type=True) + + # not all labels in the categories + with pytest.raises( + KeyError, + match=( + "'a list-indexer must only include values that are in the categories'" + ), + ): + self.df2.loc[["a", "d"]] + + def test_loc_listlike_dtypes(self): + # GH 11586 + + # unique categories and codes + index = CategoricalIndex(["a", "b", "c"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) + + # unique slice + res = df.loc[["a", "b"]] + exp_index = CategoricalIndex(["a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[["a", "a", "b"]] + + exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp, check_index_type=True) + + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): + 
df.loc[["a", "x"]] + + # duplicated categories and codes + index = CategoricalIndex(["a", "b", "a"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) + + # unique slice + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"]) + ) + tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[["a", "a", "b"]] + exp = DataFrame( + {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"]), + ) + tm.assert_frame_equal(res, exp, check_index_type=True) + + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): + df.loc[["a", "x"]] + + # contains unused category + index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) + + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [5, 7, 6]}, + index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")), + ) + tm.assert_frame_equal(res, exp, check_index_type=True) + + res = df.loc[["a", "e"]] + exp = DataFrame( + {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, + index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), + ) + tm.assert_frame_equal(res, exp, check_index_type=True) + + # duplicated slice + res = df.loc[["a", "a", "b"]] + exp = DataFrame( + {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")), + ) + tm.assert_frame_equal(res, exp, check_index_type=True) + + msg = "a list-indexer must only include values that are in the categories" + with pytest.raises(KeyError, match=msg): + df.loc[["a", "x"]] + + def test_get_indexer_array(self): + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") + result = ci.get_indexer(arr) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_getitem_with_listlike(self): + # GH 16115 + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) + + expected = DataFrame( + [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats + ) + dummies = pd.get_dummies(cats) + result = dummies[list(dummies.columns)] + tm.assert_frame_equal(result, expected) + + def test_setitem_listlike(self): + + # GH 9469 + # properly coerce the input indexers + np.random.seed(1) + c = Categorical( + np.random.randint(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) + indexer = np.array([100000]).astype(np.int64) + c[indexer] = -1000 + + # we are asserting the code result here + # which maps to the -1000 category + result = 
c.codes[np.array([100000]).astype(np.int64)] + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) + + def test_ix_categorical_index(self): + # GH 12531 + df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) + cdf = df.copy() + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) + + expect = Series(df.loc["A", :], index=cdf.columns, name="A") + tm.assert_series_equal(cdf.loc["A", :], expect) + + expect = Series(df.loc[:, "X"], index=cdf.index, name="X") + tm.assert_series_equal(cdf.loc[:, "X"], expect) + + exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"]) + expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index) + tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) + + exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"]) + expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) + tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) + + # non-unique + df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX")) + cdf = df.copy() + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) + + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + tm.assert_frame_equal(cdf.loc["A", :], expect) + + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + tm.assert_frame_equal(cdf.loc[:, "X"], expect) + + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) + + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) + + def test_read_only_source(self): + # GH 10043 + rw_array = np.eye(10) + rw_df = DataFrame(rw_array) + + ro_array = np.eye(10) + ro_array.setflags(write=False) + ro_df = DataFrame(ro_array) + + tm.assert_frame_equal(rw_df.iloc[[1, 2, 3]], ro_df.iloc[[1, 2, 3]]) + tm.assert_frame_equal(rw_df.iloc[[1]], ro_df.iloc[[1]]) + tm.assert_series_equal(rw_df.iloc[1], ro_df.iloc[1]) + tm.assert_frame_equal(rw_df.iloc[1:3], ro_df.iloc[1:3]) + + tm.assert_frame_equal(rw_df.loc[[1, 2, 3]], ro_df.loc[[1, 2, 3]]) + tm.assert_frame_equal(rw_df.loc[[1]], ro_df.loc[[1]]) + tm.assert_series_equal(rw_df.loc[1], ro_df.loc[1]) + tm.assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) + + def test_reindexing(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") + + # reindexing + # convert to a regular index + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # since we are 
actually reindexing with a Categorical + # then return a Categorical + cats = list("cabe") + + result = df.reindex(Categorical(["a", "e"], categories=cats)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # give back the type of categorical that we received + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # passed duplicate indexers are not allowed + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + self.df2.reindex(["a", "b"]) + + # args NotImplemented ATM + msg = r"argument {} is not implemented for CategoricalIndex\.reindex" + with pytest.raises(NotImplementedError, match=msg.format("method")): + df.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + df.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + df.reindex(["a"], limit=2) + + def test_loc_slice(self): + # GH9748 + with pytest.raises(KeyError, match="1"): + self.df.loc[1:5] + + result = self.df.loc["b":"c"] + expected = self.df.iloc[[2, 3, 4]] + tm.assert_frame_equal(result, expected) + + def test_loc_and_at_with_categorical_index(self): + # GH 20629 + s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"])) + assert s.loc["A"] == 1 + assert s.at["A"] == 1 + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=pd.CategoricalIndex(["A", "B", "C"]) + ) + assert df.loc["B", 1] == 4 + assert df.at["B", 1] == 4 + + def test_boolean_selection(self): + + df3 = self.df3 + df4 = self.df4 + + result = df3[df3.index == "a"] + expected = df3.iloc[[]] + tm.assert_frame_equal(result, expected) + + result = df4[df4.index == "a"] + expected = df4.iloc[[]] + tm.assert_frame_equal(result, expected) + + result = df3[df3.index == 1] + expected = df3.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df4[df4.index == 1] + expected = df4.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # since we have an ordered categorical + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=True, + # name='B') + result = df3[df3.index < 2] + expected = df3.iloc[[4]] + tm.assert_frame_equal(result, expected) + + result = 
df3[df3.index > 1] + expected = df3.iloc[[]] + tm.assert_frame_equal(result, expected) + + # unordered + # cannot be compared + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=False, + # name='B') + msg = "Unordered Categoricals can only compare equality or not" + with pytest.raises(TypeError, match=msg): + df4[df4.index < 2] + with pytest.raises(TypeError, match=msg): + df4[df4.index > 1] + + def test_indexing_with_category(self): + + # https://github.com/pandas-dev/pandas/issues/12564 + # consistent result if comparing as Dataframe + + cat = DataFrame({"A": ["foo", "bar", "baz"]}) + exp = DataFrame({"A": [True, False, False]}) + + res = cat[["A"]] == "foo" + tm.assert_frame_equal(res, exp) + + cat["A"] = cat["A"].astype("category") + + res = cat[["A"]] == "foo" + tm.assert_frame_equal(res, exp) + + def test_map_with_dict_or_series(self): + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = pd.CategoricalIndex(orig_values, name="XXX") + expected = pd.CategoricalIndex( + new_values, name="XXX", categories=[3.0, 2, "one"] + ) + + mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} + output = cur_index.map(mapper) + # Order of categories in output can be different + tm.assert_index_equal(expected, output) + + @pytest.mark.parametrize( + "idx_values", + [ + # python types + [1, 2, 3], + [-1, -2, -3], + [1.5, 2.5, 3.5], + [-1.5, -2.5, -3.5], + # numpy int/uint + *[np.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_INT_DTYPES], + # numpy floats + *[np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in conftest.FLOAT_DTYPES], + # numpy object + np.array([1, "b", 3.5], dtype=object), + # pandas scalars + [Interval(1, 4), Interval(4, 6), Interval(6, 9)], + [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], + [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + # pandas Integer arrays + *[pd.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_EA_INT_DTYPES], + # other pandas arrays + pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, + pd.date_range("2019-01-01", periods=3).array, + pd.timedelta_range(start="1d", periods=3).array, + ], + ) + def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): + # GH-17569 + cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) + df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) + sl = slice(idx_values[0], idx_values[1]) + + # scalar selection + result = df.loc[idx_values[0]] + expected = Series(["foo"], index=["A"], name=idx_values[0]) + tm.assert_series_equal(result, expected) + + # list selection + result = df.loc[idx_values[:2]] + expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) + tm.assert_frame_equal(result, expected) + + # slice selection + result = df.loc[sl] + expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) + tm.assert_frame_equal(result, expected) + + # scalar assignment + result = df.copy() + result.loc[idx_values[0]] = "qux" + expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) + + # list assignment + result = df.copy() + result.loc[idx_values[:2], "A"] = ["qux", "qux2"] + expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) + + # slice assignment + result = 
df.copy() + result.loc[sl, "A"] = ["qux", "qux2"] + expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_chaining_and_caching.py b/venv/Lib/site-packages/pandas/tests/indexing/test_chaining_and_caching.py new file mode 100644 index 0000000..e845487 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_chaining_and_caching.py @@ -0,0 +1,394 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp, date_range, option_context +import pandas._testing as tm +import pandas.core.common as com + + +class TestCaching: + def test_slice_consolidate_invalidate_item_cache(self): + + # this is chained assignment, but will 'work' + with option_context("chained_assignment", None): + + # #3970 + df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5}) + + # Creates a second float block + df["cc"] = 0.0 + + # caches a reference to the 'bb' series + df["bb"] + + # repr machinery triggers consolidation + repr(df) + + # Assignment to wrong series + df["bb"].iloc[0] = 0.17 + df._clear_item_cache() + tm.assert_almost_equal(df["bb"][0], 0.17) + + def test_setitem_cache_updating(self): + # GH 5424 + cont = ["one", "two", "three", "four", "five", "six", "seven"] + + for do_ref in [False, False]: + df = DataFrame({"a": cont, "b": cont[3:] + cont[:3], "c": np.arange(7)}) + + # ref the cache + if do_ref: + df.loc[0, "c"] + + # set it + df.loc[7, "c"] = 1 + + assert df.loc[0, "c"] == 0.0 + assert df.loc[7, "c"] == 1.0 + + # GH 7084 + # not updating cache on series setting with slices + expected = DataFrame( + {"A": [600, 600, 600]}, index=date_range("5/7/2014", "5/9/2014") + ) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + df = DataFrame({"C": ["A", "A", "A"], "D": [100, 200, 300]}) + + # loop through df to update out + six = Timestamp("5/7/2014") + eix = Timestamp("5/9/2014") + for ix, row in df.iterrows(): + out.loc[six:eix, row["C"]] = out.loc[six:eix, row["C"]] + row["D"] + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + + # try via a chain indexing + # this actually works + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + for ix, row in df.iterrows(): + v = out[row["C"]][six:eix] + row["D"] + out[row["C"]][six:eix] = v + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + for ix, row in df.iterrows(): + out.loc[six:eix, row["C"]] += row["D"] + + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + + +class TestChaining: + def test_setitem_chained_setfault(self): + + # GH6026 + data = ["right", "left", "left", "left", "right", "left", "timeout"] + mdata = ["right", "left", "left", "left", "right", "left", "none"] + + df = DataFrame({"response": np.array(data)}) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) + + recarray = np.rec.fromarrays([data], names=["response"]) + df = DataFrame(recarray) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) + + df = DataFrame({"response": data, "response1": data}) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": 
data})) + + # GH 6056 + expected = DataFrame(dict(A=[np.nan, "bar", "bah", "foo", "bar"])) + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) + df["A"].iloc[0] = np.nan + result = df.head() + tm.assert_frame_equal(result, expected) + + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) + df.A.iloc[0] = np.nan + result = df.head() + tm.assert_frame_equal(result, expected) + + def test_detect_chained_assignment(self): + + pd.set_option("chained_assignment", "raise") + + # work with the chain + expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) + df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64") + assert df._is_copy is None + + df["A"][0] = -5 + df["A"][1] = -6 + tm.assert_frame_equal(df, expected) + + # test with the chaining + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) + assert df._is_copy is None + + with pytest.raises(com.SettingWithCopyError): + df["A"][0] = -5 + + with pytest.raises(com.SettingWithCopyError): + df["A"][1] = np.nan + + assert df["A"]._is_copy is None + + # Using a copy (the chain), fails + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) + + with pytest.raises(com.SettingWithCopyError): + df.loc[0]["A"] = -5 + + # Doc example + df = DataFrame( + { + "a": ["one", "one", "two", "three", "two", "one", "six"], + "c": Series(range(7), dtype="int64"), + } + ) + assert df._is_copy is None + + with pytest.raises(com.SettingWithCopyError): + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 + + expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + + with pytest.raises(com.SettingWithCopyError): + df["A"][0] = 111 + + with pytest.raises(com.SettingWithCopyError): + df.loc[0]["A"] = 111 + + df.loc[0, "A"] = 111 + tm.assert_frame_equal(df, expected) + + # gh-5475: Make sure that is_copy is picked up reconstruction + df = DataFrame({"A": [1, 2]}) + assert df._is_copy is None + + with tm.ensure_clean("__tmp__pickle") as path: + df.to_pickle(path) + df2 = pd.read_pickle(path) + df2["B"] = df2["A"] + df2["B"] = df2["A"] + + # gh-5597: a spurious raise as we are setting the entire column here + from string import ascii_letters as letters + + def random_text(nobs=100): + df = [] + for i in range(nobs): + idx = np.random.randint(len(letters), size=2) + idx.sort() + + df.append([letters[idx[0] : idx[1]]]) + + return DataFrame(df, columns=["letters"]) + + df = random_text(100000) + + # Always a copy + x = df.iloc[[0, 1, 2]] + assert x._is_copy is not None + + x = df.iloc[[0, 1, 2, 4]] + assert x._is_copy is not None + + # Explicitly copy + indexer = df.letters.apply(lambda x: len(x) > 10) + df = df.loc[indexer].copy() + + assert df._is_copy is None + df["letters"] = df["letters"].apply(str.lower) + + # Implicitly take + df = random_text(100000) + indexer = df.letters.apply(lambda x: len(x) > 10) + df = df.loc[indexer] + + assert df._is_copy is not None + df["letters"] = df["letters"].apply(str.lower) + + # Implicitly take 2 + df = random_text(100000) + indexer = df.letters.apply(lambda x: len(x) > 10) + + df = df.loc[indexer] + assert df._is_copy is not None + df.loc[:, "letters"] = df["letters"].apply(str.lower) + + # Should be ok even though it's a copy! 
+ assert df._is_copy is None + + df["letters"] = df["letters"].apply(str.lower) + assert df._is_copy is None + + df = random_text(100000) + indexer = df.letters.apply(lambda x: len(x) > 10) + df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower) + + # an identical take, so no copy + df = DataFrame({"a": [1]}).dropna() + assert df._is_copy is None + df["a"] += 1 + + df = DataFrame(np.random.randn(10, 4)) + s = df.iloc[:, 0].sort_values() + + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + tm.assert_series_equal(s, df[0].sort_values()) + + # see gh-6025: false positives + df = DataFrame({"column1": ["a", "a", "a"], "column2": [4, 8, 9]}) + str(df) + + df["column1"] = df["column1"] + "b" + str(df) + + df = df[df["column2"] != 8] + str(df) + + df["column1"] = df["column1"] + "c" + str(df) + + # from SO: + # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + df = DataFrame(np.arange(0, 9), columns=["count"]) + df["group"] = "b" + + with pytest.raises(com.SettingWithCopyError): + df.iloc[0:5]["group"] = "a" + + # Mixed type setting but same dtype & changing dtype + df = DataFrame( + dict( + A=date_range("20130101", periods=5), + B=np.random.randn(5), + C=np.arange(5, dtype="int64"), + D=list("abcde"), + ) + ) + + with pytest.raises(com.SettingWithCopyError): + df.loc[2]["D"] = "foo" + + with pytest.raises(com.SettingWithCopyError): + df.loc[2]["C"] = "foo" + + with pytest.raises(com.SettingWithCopyError): + df["C"][2] = "foo" + + def test_setting_with_copy_bug(self): + + # operating on a copy + df = DataFrame( + {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + ) + mask = pd.isna(df.c) + + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df[["c"]][mask] = df[["b"]][mask] + + # invalid warning as we are returning a new object + # GH 8730 + df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])}) + df2 = df1[["x"]] + + # this should not raise + df2["y"] = ["g", "h", "i"] + + def test_detect_chained_assignment_warnings(self): + with option_context("chained_assignment", "warn"): + df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + + with tm.assert_produces_warning(com.SettingWithCopyWarning): + df.loc[0]["A"] = 111 + + def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): + # xref gh-13017. 
+ with option_context("chained_assignment", "warn"): + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) + + with tm.assert_produces_warning(com.SettingWithCopyWarning): + df.c.loc[df.c > 0] = None + + expected = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) + tm.assert_frame_equal(df, expected) + + def test_chained_getitem_with_lists(self): + + # GH6394 + # Regression in chained getitem indexing with embedded list-like from + # 0.12 + def check(result, expected): + tm.assert_numpy_array_equal(result, expected) + assert isinstance(result, np.ndarray) + + df = DataFrame({"A": 5 * [np.zeros(3)], "B": 5 * [np.ones(3)]}) + expected = df["A"].iloc[2] + result = df.loc[2, "A"] + check(result, expected) + result2 = df.iloc[2]["A"] + check(result2, expected) + result3 = df["A"].loc[2] + check(result3, expected) + result4 = df["A"].iloc[2] + check(result4, expected) + + def test_cache_updating(self): + # GH 4939, make sure to update the cache on setitem + + df = tm.makeDataFrame() + df["A"] # cache series + df.loc["Hello Friend"] = df.iloc[0] + assert "Hello Friend" in df["A"].index + assert "Hello Friend" in df["B"].index + + # 10264 + df = DataFrame( + np.zeros((5, 5), dtype="int64"), + columns=["a", "b", "c", "d", "e"], + index=range(5), + ) + df["f"] = 0 + df.f.values[3] = 1 + + # TODO(wesm): unused? + # y = df.iloc[np.arange(2, len(df))] + + df.f.values[3] = 2 + expected = DataFrame( + np.zeros((5, 6), dtype="int64"), + columns=["a", "b", "c", "d", "e", "f"], + index=range(5), + ) + expected.at[3, "f"] = 2 + tm.assert_frame_equal(df, expected) + expected = Series([0, 0, 0, 2, 0], name="f") + tm.assert_series_equal(df.f, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_check_indexer.py b/venv/Lib/site-packages/pandas/tests/indexing/test_check_indexer.py new file mode 100644 index 0000000..82f8c12 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_check_indexer.py @@ -0,0 +1,97 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.indexers import check_array_indexer + + +@pytest.mark.parametrize( + "indexer, expected", + [ + # integer + ([1, 2], np.array([1, 2], dtype=np.intp)), + (np.array([1, 2], dtype="int64"), np.array([1, 2], dtype=np.intp)), + (pd.array([1, 2], dtype="Int32"), np.array([1, 2], dtype=np.intp)), + (pd.Index([1, 2]), np.array([1, 2], dtype=np.intp)), + # boolean + ([True, False, True], np.array([True, False, True], dtype=np.bool_)), + (np.array([True, False, True]), np.array([True, False, True], dtype=np.bool_)), + ( + pd.array([True, False, True], dtype="boolean"), + np.array([True, False, True], dtype=np.bool_), + ), + # other + ([], np.array([], dtype=np.intp)), + ], +) +def test_valid_input(indexer, expected): + array = np.array([1, 2, 3]) + result = check_array_indexer(array, indexer) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], +) +def test_bool_raise_missing_values(indexer): + array = np.array([1, 2, 3]) + + msg = "Cannot mask with a boolean indexer containing NA values" + with pytest.raises(ValueError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", + [ + [True, False], + pd.array([True, False], dtype="boolean"), + np.array([True, False], dtype=np.bool_), + ], +) +def test_bool_raise_length(indexer): + array = np.array([1, 2, 3]) + + msg = 
"Boolean index has wrong length" + with pytest.raises(IndexError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")], +) +def test_int_raise_missing_values(indexer): + array = np.array([1, 2, 3]) + + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", + [ + [0.0, 1.0], + np.array([1.0, 2.0], dtype="float64"), + np.array([True, False], dtype=object), + pd.Index([True, False], dtype=object), + pd.array(["a", "b"], dtype="string"), + ], +) +def test_raise_invalid_array_dtypes(indexer): + array = np.array([1, 2, 3]) + + msg = "arrays used as indices must be of integer or boolean type" + with pytest.raises(IndexError, match=msg): + check_array_indexer(array, indexer) + + +@pytest.mark.parametrize( + "indexer", [None, Ellipsis, slice(0, 3), (None,)], +) +def test_pass_through_non_array_likes(indexer): + array = np.array([1, 2, 3]) + + result = check_array_indexer(array, indexer) + assert result == indexer diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_coercion.py b/venv/Lib/site-packages/pandas/tests/indexing/test_coercion.py new file mode 100644 index 0000000..b904755 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_coercion.py @@ -0,0 +1,1087 @@ +import itertools +from typing import Dict, List + +import numpy as np +import pytest + +import pandas.compat as compat + +import pandas as pd +import pandas._testing as tm + +############################################################### +# Index / Series common tests which may trigger dtype coercions +############################################################### + + +@pytest.fixture(autouse=True, scope="class") +def check_comprehensiveness(request): + # Iterate over combination of dtype, method and klass + # and ensure that each are contained within a collected test + cls = request.cls + combos = itertools.product(cls.klasses, cls.dtypes, [cls.method]) + + def has_test(combo): + klass, dtype, method = combo + cls_funcs = request.node.session.items + return any( + klass in x.name and dtype in x.name and method in x.name for x in cls_funcs + ) + + for combo in combos: + if not has_test(combo): + msg = "test method is not defined: {0}, {1}" + raise AssertionError(msg.format(cls.__name__, combo)) + + yield + + +class CoercionBase: + + klasses = ["index", "series"] + dtypes = [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "timedelta64", + "period", + ] + + @property + def method(self): + raise NotImplementedError(self) + + def _assert(self, left, right, dtype): + # explicitly check dtype to avoid any unexpected result + if isinstance(left, pd.Series): + tm.assert_series_equal(left, right) + elif isinstance(left, pd.Index): + tm.assert_index_equal(left, right) + else: + raise NotImplementedError + assert left.dtype == dtype + assert right.dtype == dtype + + +class TestSetitemCoercion(CoercionBase): + + method = "setitem" + + def _assert_setitem_series_conversion( + self, original_series, loc_value, expected_series, expected_dtype + ): + """ test series value's coercion triggered by assignment """ + temp = original_series.copy() + temp[1] = loc_value + tm.assert_series_equal(temp, expected_series) + # check dtype explicitly for sure + assert temp.dtype == expected_dtype + + # .loc works different rule, temporary disable + # temp = 
original_series.copy() + # temp.loc[1] = loc_value + # tm.assert_series_equal(temp, expected_series) + + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) + def test_setitem_series_object(self, val, exp_dtype): + obj = pd.Series(list("abcd")) + assert obj.dtype == np.object + + exp = pd.Series(["a", val, "c", "d"]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) + def test_setitem_series_int64(self, val, exp_dtype): + obj = pd.Series([1, 2, 3, 4]) + assert obj.dtype == np.int64 + + if exp_dtype is np.float64: + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64) + pytest.xfail("GH12747 The result must be float") + + exp = pd.Series([1, val, 3, 4]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", [(np.int32(1), np.int8), (np.int16(2 ** 9), np.int16)] + ) + def test_setitem_series_int8(self, val, exp_dtype): + obj = pd.Series([1, 2, 3, 4], dtype=np.int8) + assert obj.dtype == np.int8 + + if exp_dtype is np.int16: + exp = pd.Series([1, 0, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, val, exp, np.int8) + pytest.xfail("BUG: it must be Series([1, 1, 3, 4], dtype=np.int16") + + exp = pd.Series([1, val, 3, 4], dtype=np.int8) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_setitem_series_float64(self, val, exp_dtype): + obj = pd.Series([1.1, 2.2, 3.3, 4.4]) + assert obj.dtype == np.float64 + + exp = pd.Series([1.1, val, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_setitem_series_complex128(self, val, exp_dtype): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + assert obj.dtype == np.complex128 + + exp = pd.Series([1 + 1j, val, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.int64), + (3, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.bool), + ], + ) + def test_setitem_series_bool(self, val, exp_dtype): + obj = pd.Series([True, False, True, False]) + assert obj.dtype == np.bool + + if exp_dtype is np.int64: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be int") + elif exp_dtype is np.float64: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be float") + elif exp_dtype is np.complex128: + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, np.bool) + pytest.xfail("TODO_GH12747 The result must be complex") + + exp = pd.Series([True, val, True, False]) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (1, np.object), + ("x", np.object), + ], + ) + def 
test_setitem_series_datetime64(self, val, exp_dtype): + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Pacific"), np.object), + (pd.Timestamp("2012-01-01"), np.object), + (1, np.object), + ], + ) + def test_setitem_series_datetime64tz(self, val, exp_dtype): + tz = "US/Eastern" + obj = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", + [(pd.Timedelta("12 day"), "timedelta64[ns]"), (1, np.object), ("x", np.object)], + ) + def test_setitem_series_timedelta64(self, val, exp_dtype): + obj = pd.Series( + [ + pd.Timedelta("1 day"), + pd.Timedelta("2 day"), + pd.Timedelta("3 day"), + pd.Timedelta("4 day"), + ] + ) + assert obj.dtype == "timedelta64[ns]" + + exp = pd.Series( + [pd.Timedelta("1 day"), val, pd.Timedelta("3 day"), pd.Timedelta("4 day")] + ) + self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) + + def _assert_setitem_index_conversion( + self, original_series, loc_key, expected_index, expected_dtype + ): + """ test index's coercion triggered by assign key """ + temp = original_series.copy() + temp[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + # check dtype explicitly for sure + assert temp.index.dtype == expected_dtype + + temp = original_series.copy() + temp.loc[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + # check dtype explicitly for sure + assert temp.index.dtype == expected_dtype + + @pytest.mark.parametrize( + "val,exp_dtype", [("x", np.object), (5, IndexError), (1.1, np.object)] + ) + def test_setitem_index_object(self, val, exp_dtype): + obj = pd.Series([1, 2, 3, 4], index=list("abcd")) + assert obj.index.dtype == np.object + + if exp_dtype is IndexError: + temp = obj.copy() + with pytest.raises(exp_dtype): + temp[5] = 5 + else: + exp_index = pd.Index(list("abcd") + [val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", np.object)] + ) + def test_setitem_index_int64(self, val, exp_dtype): + obj = pd.Series([1, 2, 3, 4]) + assert obj.index.dtype == np.int64 + + exp_index = pd.Index([0, 1, 2, 3, val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", [(5, IndexError), (5.1, np.float64), ("x", np.object)] + ) + def test_setitem_index_float64(self, val, exp_dtype): + obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) + assert obj.index.dtype == np.float64 + + if exp_dtype is IndexError: + # float + int -> int + temp = obj.copy() + 
with pytest.raises(exp_dtype): + temp[5] = 5 + pytest.xfail("TODO_GH12747 The result must be float") + + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + + def test_setitem_series_period(self): + pass + + def test_setitem_index_complex128(self): + pass + + def test_setitem_index_bool(self): + pass + + def test_setitem_index_datetime64(self): + pass + + def test_setitem_index_datetime64tz(self): + pass + + def test_setitem_index_timedelta64(self): + pass + + def test_setitem_index_period(self): + pass + + +class TestInsertIndexCoercion(CoercionBase): + + klasses = ["index"] + method = "insert" + + def _assert_insert_conversion(self, original, value, expected, expected_dtype): + """ test coercion triggered by insert """ + target = original.copy() + res = target.insert(1, value) + tm.assert_index_equal(res, expected) + assert res.dtype == expected_dtype + + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.object), + (1.1, 1.1, np.object), + (False, False, np.object), + ("x", "x", np.object), + ], + ) + def test_insert_index_object(self, insert, coerced_val, coerced_dtype): + obj = pd.Index(list("abcd")) + assert obj.dtype == np.object + + exp = pd.Index(["a", coerced_val, "b", "c", "d"]) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.int64), + (1.1, 1.1, np.float64), + (False, 0, np.int64), + ("x", "x", np.object), + ], + ) + def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): + obj = pd.Int64Index([1, 2, 3, 4]) + assert obj.dtype == np.int64 + + exp = pd.Index([1, coerced_val, 2, 3, 4]) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1.0, np.float64), + (1.1, 1.1, np.float64), + (False, 0.0, np.float64), + ("x", "x", np.object), + ], + ) + def test_insert_index_float64(self, insert, coerced_val, coerced_dtype): + obj = pd.Float64Index([1.0, 2.0, 3.0, 4.0]) + assert obj.dtype == np.float64 + + exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0]) + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + ], + ids=["datetime64", "datetime64tz"], + ) + def test_insert_index_datetimes(self, fill_val, exp_dtype): + obj = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz + ) + assert obj.dtype == exp_dtype + + exp = pd.DatetimeIndex( + ["2011-01-01", fill_val.date(), "2011-01-02", "2011-01-03", "2011-01-04"], + tz=fill_val.tz, + ) + self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) + + if fill_val.tz: + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01")) + + msg = "Timezones don't match" + with pytest.raises(ValueError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + + else: + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + + msg = "cannot insert DatetimeIndex with incompatible label" + with pytest.raises(TypeError, match=msg): + obj.insert(1, 1) + + pytest.xfail("ToDo: must coerce to object") + + def 
test_insert_index_timedelta64(self): + obj = pd.TimedeltaIndex(["1 day", "2 day", "3 day", "4 day"]) + assert obj.dtype == "timedelta64[ns]" + + # timedelta64 + timedelta64 => timedelta64 + exp = pd.TimedeltaIndex(["1 day", "10 day", "2 day", "3 day", "4 day"]) + self._assert_insert_conversion( + obj, pd.Timedelta("10 day"), exp, "timedelta64[ns]" + ) + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01")) + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with pytest.raises(TypeError, match=msg): + obj.insert(1, 1) + + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (pd.Period("2012-01", freq="M"), "2012-01", "period[M]"), + (pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01"), np.object), + (1, 1, np.object), + ("x", "x", np.object), + ], + ) + def test_insert_index_period(self, insert, coerced_val, coerced_dtype): + obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") + assert obj.dtype == "period[M]" + + data = [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + if isinstance(insert, pd.Period): + exp = pd.PeriodIndex(data, freq="M") + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + else: + msg = r"Unexpected keyword arguments {'freq'}" + with pytest.raises(TypeError, match=msg): + pd.Index(data, freq="M") + + def test_insert_index_complex128(self): + pass + + def test_insert_index_bool(self): + pass + + +class TestWhereCoercion(CoercionBase): + + method = "where" + + def _assert_where_conversion( + self, original, cond, values, expected, expected_dtype + ): + """ test coercion triggered by where """ + target = original.copy() + res = target.where(cond, values) + self._assert(res, expected, expected_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) + def test_where_object(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series + obj = klass(list("abcd")) + assert obj.dtype == np.object + cond = klass([True, False, True, False]) + + if fill_val is True and klass is pd.Series: + ret_val = 1 + else: + ret_val = fill_val + + exp = klass(["a", ret_val, "c", ret_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + if fill_val is True: + values = klass([True, False, True, True]) + else: + values = klass(fill_val * x for x in [5, 6, 7, 8]) + + exp = klass(["a", values[1], "c", values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) + def test_where_int64(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series + if klass is pd.Index and exp_dtype is np.complex128: + pytest.skip("Complex Index not supported") + obj = klass([1, 2, 3, 4]) + assert obj.dtype == np.int64 + cond = klass([True, False, True, False]) + + exp = klass([1, fill_val, 3, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + if fill_val is True: + values = klass([True, False, True, True]) + else: + values = klass(x * fill_val for x in [5, 6, 7, 8]) + exp = klass([1, values[1], 3, values[3]]) + self._assert_where_conversion(obj, cond, values, 
exp, exp_dtype) + + @pytest.mark.parametrize( + "fill_val, exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_where_float64(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series + if klass is pd.Index and exp_dtype is np.complex128: + pytest.skip("Complex Index not supported") + obj = klass([1.1, 2.2, 3.3, 4.4]) + assert obj.dtype == np.float64 + cond = klass([True, False, True, False]) + + exp = klass([1.1, fill_val, 3.3, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + if fill_val is True: + values = klass([True, False, True, True]) + else: + values = klass(x * fill_val for x in [5, 6, 7, 8]) + exp = klass([1.1, values[1], 3.3, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_where_series_complex128(self, fill_val, exp_dtype): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + assert obj.dtype == np.complex128 + cond = pd.Series([True, False, True, False]) + + exp = pd.Series([1 + 1j, fill_val, 3 + 3j, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + if fill_val is True: + values = pd.Series([True, False, True, True]) + else: + values = pd.Series(x * fill_val for x in [5, 6, 7, 8]) + exp = pd.Series([1 + 1j, values[1], 3 + 3j, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.bool)], + ) + def test_where_series_bool(self, fill_val, exp_dtype): + + obj = pd.Series([True, False, True, False]) + assert obj.dtype == np.bool + cond = pd.Series([True, False, True, False]) + + exp = pd.Series([True, fill_val, True, fill_val]) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + if fill_val is True: + values = pd.Series([True, False, True, True]) + else: + values = pd.Series(x * fill_val for x in [5, 6, 7, 8]) + exp = pd.Series([True, values[1], True, values[3]]) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + ], + ids=["datetime64", "datetime64tz"], + ) + def test_where_series_datetime64(self, fill_val, exp_dtype): + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + cond = pd.Series([True, False, True, False]) + + exp = pd.Series( + [pd.Timestamp("2011-01-01"), fill_val, pd.Timestamp("2011-01-03"), fill_val] + ) + self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) + + values = pd.Series(pd.date_range(fill_val, periods=4)) + if fill_val.tz: + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02 00:00", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04 00:00", tz="US/Eastern"), + ] + ) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + values[1], + pd.Timestamp("2011-01-03"), + values[3], + ] + ) + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + def test_where_index_datetime(self): + 
fill_val = pd.Timestamp("2012-01-01") + exp_dtype = "datetime64[ns]" + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + cond = pd.Index([True, False, True, False]) + + msg = "Index\\(\\.\\.\\.\\) must be called with a collection of some kind" + with pytest.raises(TypeError, match=msg): + obj.where(cond, fill_val) + + values = pd.Index(pd.date_range(fill_val, periods=4)) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04"), + ] + ) + + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + @pytest.mark.xfail(reason="GH 22839: do not ignore timezone, must be object") + def test_where_index_datetimetz(self): + fill_val = pd.Timestamp("2012-01-01", tz="US/Eastern") + exp_dtype = np.object + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + cond = pd.Index([True, False, True, False]) + + msg = "Index\\(\\.\\.\\.\\) must be called with a collection of some kind" + with pytest.raises(TypeError, match=msg): + obj.where(cond, fill_val) + + values = pd.Index(pd.date_range(fill_val, periods=4)) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04", tz="US/Eastern"), + ], + dtype=exp_dtype, + ) + + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) + + def test_where_index_complex128(self): + pass + + def test_where_index_bool(self): + pass + + def test_where_series_datetime64tz(self): + pass + + def test_where_series_timedelta64(self): + pass + + def test_where_series_period(self): + pass + + def test_where_index_datetime64tz(self): + pass + + def test_where_index_timedelta64(self): + pass + + def test_where_index_period(self): + pass + + +class TestFillnaSeriesCoercion(CoercionBase): + + # not indexing, but place here for consistency + + method = "fillna" + + def test_has_comprehensive_tests(self): + pass + + def _assert_fillna_conversion(self, original, value, expected, expected_dtype): + """ test coercion triggered by fillna """ + target = original.copy() + res = target.fillna(value) + self._assert(res, expected, expected_dtype) + + @pytest.mark.parametrize( + "fill_val, fill_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) + def test_fillna_object(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series + obj = klass(["a", np.nan, "c", "d"]) + assert obj.dtype == np.object + + exp = klass(["a", fill_val, "c", "d"]) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_fillna_float64(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series + obj = klass([1.1, np.nan, 3.3, 4.4]) + assert obj.dtype == np.float64 + + exp = klass([1.1, fill_val, 3.3, 4.4]) + # float + complex -> we don't support a complex Index + # complex for Series, + # object for Index + if fill_dtype == np.complex128 and klass == pd.Index: + fill_dtype = np.object + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, 
np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) + def test_fillna_series_complex128(self, fill_val, fill_dtype): + obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) + assert obj.dtype == np.complex128 + + exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + (1, np.object), + ("x", np.object), + ], + ids=["datetime64", "datetime64tz", "object", "object"], + ) + def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series + obj = klass( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = klass( + [ + pd.Timestamp("2011-01-01"), + fill_val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01"), np.object), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), np.object), + (1, np.object), + ("x", np.object), + ], + ) + def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series + tz = "US/Eastern" + + obj = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.NaT, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + fill_val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) + + def test_fillna_series_int64(self): + pass + + def test_fillna_index_int64(self): + pass + + def test_fillna_series_bool(self): + pass + + def test_fillna_index_bool(self): + pass + + def test_fillna_series_timedelta64(self): + pass + + def test_fillna_series_period(self): + pass + + def test_fillna_index_timedelta64(self): + pass + + def test_fillna_index_period(self): + pass + + +class TestReplaceSeriesCoercion(CoercionBase): + + klasses = ["series"] + method = "replace" + + rep: Dict[str, List] = {} + rep["object"] = ["a", "b"] + rep["int64"] = [4, 5] + rep["float64"] = [1.1, 2.2] + rep["complex128"] = [1 + 1j, 2 + 2j] + rep["bool"] = [True, False] + rep["datetime64[ns]"] = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-03")] + + for tz in ["UTC", "US/Eastern"]: + # to test tz => different tz replacement + key = "datetime64[ns, {0}]".format(tz) + rep[key] = [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + ] + + rep["timedelta64[ns]"] = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] + + @pytest.mark.parametrize("how", ["dict", "series"]) + @pytest.mark.parametrize( + "to_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + "datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], + ids=[ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "datetime64tz", + "timedelta64", + ], + ) + @pytest.mark.parametrize( + "from_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + 
"datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], + ) + def test_replace_series(self, how, to_key, from_key): + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") + assert obj.dtype == from_key + + if from_key.startswith("datetime") and to_key.startswith("datetime"): + # tested below + return + elif from_key in ["datetime64[ns, US/Eastern]", "datetime64[ns, UTC]"]: + # tested below + return + + if how == "dict": + replacer = dict(zip(self.rep[from_key], self.rep[to_key])) + elif how == "series": + replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) + else: + raise ValueError + + result = obj.replace(replacer) + + if (from_key == "float64" and to_key in ("int64")) or ( + from_key == "complex128" and to_key in ("int64", "float64") + ): + + if compat.is_platform_32bit() or compat.is_platform_windows(): + pytest.skip( + "32-bit platform buggy: {0} -> {1}".format(from_key, to_key) + ) + + # Expected: do not downcast by replacement + exp = pd.Series(self.rep[to_key], index=index, name="yyy", dtype=from_key) + + else: + exp = pd.Series(self.rep[to_key], index=index, name="yyy") + assert exp.dtype == to_key + + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize("how", ["dict", "series"]) + @pytest.mark.parametrize( + "to_key", + ["timedelta64[ns]", "bool", "object", "complex128", "float64", "int64"], + ) + @pytest.mark.parametrize( + "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"] + ) + def test_replace_series_datetime_tz(self, how, to_key, from_key): + index = pd.Index([3, 4], name="xyz") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") + assert obj.dtype == from_key + + if how == "dict": + replacer = dict(zip(self.rep[from_key], self.rep[to_key])) + elif how == "series": + replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) + else: + raise ValueError + + result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") + assert exp.dtype == to_key + + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize("how", ["dict", "series"]) + @pytest.mark.parametrize( + "to_key", + ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], + ) + @pytest.mark.parametrize( + "from_key", + ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], + ) + def test_replace_series_datetime_datetime(self, how, to_key, from_key): + index = pd.Index([3, 4], name="xyz") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") + assert obj.dtype == from_key + + if how == "dict": + replacer = dict(zip(self.rep[from_key], self.rep[to_key])) + elif how == "series": + replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) + else: + raise ValueError + + result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") + assert exp.dtype == to_key + + tm.assert_series_equal(result, exp) + + def test_replace_series_period(self): + pass diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_datetime.py b/venv/Lib/site-packages/pandas/tests/indexing/test_datetime.py new file mode 100644 index 0000000..42f9923 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_datetime.py @@ -0,0 +1,352 @@ +from datetime import datetime, timedelta + +from dateutil import tz +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDatetimeIndex: + def 
test_setitem_with_datetime_tz(self): + # 16889 + # support .loc with alignment and tz-aware DatetimeIndex + mask = np.array([True, False, True, False]) + + idx = date_range("20010101", periods=4, tz="UTC") + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") + + result = df.copy() + result.loc[mask, :] = df.loc[mask, :] + tm.assert_frame_equal(result, df) + + result = df.copy() + result.loc[mask] = df.loc[mask] + tm.assert_frame_equal(result, df) + + idx = date_range("20010101", periods=4) + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") + + result = df.copy() + result.loc[mask, :] = df.loc[mask, :] + tm.assert_frame_equal(result, df) + + result = df.copy() + result.loc[mask] = df.loc[mask] + tm.assert_frame_equal(result, df) + + def test_indexing_with_datetime_tz(self): + + # GH#8260 + # support datetime64 with tz + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + dr = date_range("20130110", periods=3) + df = DataFrame({"A": idx, "B": dr}) + df["C"] = idx + df.iloc[1, 1] = pd.NaT + df.iloc[1, 2] = pd.NaT + + # indexing + result = df.iloc[1] + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT, pd.NaT], + index=list("ABC"), + dtype="object", + name=1, + ) + tm.assert_series_equal(result, expected) + result = df.loc[1] + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT, pd.NaT], + index=list("ABC"), + dtype="object", + name=1, + ) + tm.assert_series_equal(result, expected) + + # indexing - fast_xs + df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")}) + result = df.iloc[5] + expected = Series( + [Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], index=["a"], name=5 + ) + tm.assert_series_equal(result, expected) + + result = df.loc[5] + tm.assert_series_equal(result, expected) + + # indexing - boolean + result = df[df.a > df.a[3]] + expected = df.iloc[4:] + tm.assert_frame_equal(result, expected) + + # indexing - setting an element + df = DataFrame( + data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), + columns=["time"], + ) + df["new_col"] = ["new", "old"] + df.time = df.set_index("time").index.tz_localize("UTC") + v = df[df.new_col == "new"].set_index("time").index.tz_convert("US/Pacific") + + # trying to set a single element on a part of a different timezone + # this converts to object + df2 = df.copy() + df2.loc[df2.new_col == "new", "time"] = v + + expected = Series([v[0], df.loc[1, "time"]], name="time") + tm.assert_series_equal(df2.time, expected) + + v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") + df.loc[df.new_col == "new", "time"] = v + tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) + + def test_consistency_with_tz_aware_scalar(self): + # xef gh-12938 + # various ways of indexing the same tz-aware scalar + df = Series([Timestamp("2016-03-30 14:35:25", tz="Europe/Brussels")]).to_frame() + + df = pd.concat([df, df]).reset_index(drop=True) + expected = Timestamp("2016-03-30 14:35:25+0200", tz="Europe/Brussels") + + result = df[0][0] + assert result == expected + + result = df.iloc[0, 0] + assert result == expected + + result = df.loc[0, 0] + assert result == expected + + result = df.iat[0, 0] + assert result == expected + + result = df.at[0, 0] + assert result == expected + + result = df[0].loc[0] + assert result == expected + + result = df[0].at[0] + assert result == expected + + def test_indexing_with_datetimeindex_tz(self): + + # GH 12050 + # indexing on a series with a datetimeindex 
with tz + index = date_range("2015-01-01", periods=2, tz="utc") + + ser = Series(range(2), index=index, dtype="int64") + + # list-like indexing + + for sel in (index, list(index)): + # getitem + tm.assert_series_equal(ser[sel], ser) + + # setitem + result = ser.copy() + result[sel] = 1 + expected = Series(1, index=index) + tm.assert_series_equal(result, expected) + + # .loc getitem + tm.assert_series_equal(ser.loc[sel], ser) + + # .loc setitem + result = ser.copy() + result.loc[sel] = 1 + expected = Series(1, index=index) + tm.assert_series_equal(result, expected) + + # single element indexing + + # getitem + assert ser[index[1]] == 1 + + # setitem + result = ser.copy() + result[index[1]] = 5 + expected = Series([0, 5], index=index) + tm.assert_series_equal(result, expected) + + # .loc getitem + assert ser.loc[index[1]] == 1 + + # .loc setitem + result = ser.copy() + result.loc[index[1]] = 5 + expected = Series([0, 5], index=index) + tm.assert_series_equal(result, expected) + + def test_partial_setting_with_datetimelike_dtype(self): + + # GH9478 + # a datetimeindex alignment issue with partial setting + df = DataFrame( + np.arange(6.0).reshape(3, 2), + columns=list("AB"), + index=date_range("1/1/2000", periods=3, freq="1H"), + ) + expected = df.copy() + expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] + + mask = df.A < 1 + df.loc[mask, "C"] = df.loc[mask].index + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_datetime(self): + + # GH 9516 + dt1 = Timestamp("20130101 09:00:00") + dt2 = Timestamp("20130101 10:00:00") + + for conv in [ + lambda x: x, + lambda x: x.to_datetime64(), + lambda x: x.to_pydatetime(), + lambda x: np.datetime64(x), + ]: + + df = DataFrame() + df.loc[conv(dt1), "one"] = 100 + df.loc[conv(dt2), "one"] = 200 + + expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) + tm.assert_frame_equal(df, expected) + + def test_series_partial_set_datetime(self): + # GH 11497 + + idx = date_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") + + result = ser.loc[[Timestamp("2011-01-01"), Timestamp("2011-01-02")]] + exp = Series([0.1, 0.2], index=idx, name="s") + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-01"), + ] + exp = Series( + [0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name="idx"), name="s" + ) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [ + Timestamp("2011-01-03"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ] + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[keys] + + def test_series_partial_set_period(self): + # GH 11497 + + idx = pd.period_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") + + result = ser.loc[ + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-01-02", freq="D")] + ] + exp = Series([0.1, 0.2], index=idx, name="s") + tm.assert_series_equal(result, exp, check_index_type=True) + + keys = [ + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-01", freq="D"), + ] + exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name="idx"), name="s") + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + + keys = [ + pd.Period("2011-01-03", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-03", freq="D"), + ] + with pytest.raises(KeyError, match="with any missing labels"): + 
ser.loc[keys] + + def test_nanosecond_getitem_setitem_with_tz(self): + # GH 11679 + data = ["2016-06-28 08:30:00.123456789"] + index = pd.DatetimeIndex(data, dtype="datetime64[ns, America/Chicago]") + df = DataFrame({"a": [10]}, index=index) + result = df.loc[df.index[0]] + expected = Series(10, index=["a"], name=df.index[0]) + tm.assert_series_equal(result, expected) + + result = df.copy() + result.loc[df.index[0], "a"] = -1 + expected = DataFrame(-1, index=index, columns=["a"]) + tm.assert_frame_equal(result, expected) + + def test_loc_getitem_across_dst(self): + # GH 21846 + idx = pd.date_range( + "2017-10-29 01:30:00", tz="Europe/Berlin", periods=5, freq="30 min" + ) + series2 = pd.Series([0, 1, 2, 3, 4], index=idx) + + t_1 = pd.Timestamp( + "2017-10-29 02:30:00+02:00", tz="Europe/Berlin", freq="30min" + ) + t_2 = pd.Timestamp( + "2017-10-29 02:00:00+01:00", tz="Europe/Berlin", freq="30min" + ) + result = series2.loc[t_1:t_2] + expected = pd.Series([2, 3], index=idx[2:4]) + tm.assert_series_equal(result, expected) + + result = series2[t_1] + expected = 2 + assert result == expected + + def test_loc_incremental_setitem_with_dst(self): + # GH 20724 + base = datetime(2015, 11, 1, tzinfo=tz.gettz("US/Pacific")) + idxs = [base + timedelta(seconds=i * 900) for i in range(16)] + result = pd.Series([0], index=[idxs[0]]) + for ts in idxs: + result.loc[ts] = 1 + expected = pd.Series(1, index=idxs) + tm.assert_series_equal(result, expected) + + def test_loc_setitem_with_existing_dst(self): + # GH 18308 + start = pd.Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") + end = pd.Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") + ts = pd.Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") + idx = pd.date_range(start, end, closed="left", freq="H") + result = pd.DataFrame(index=idx, columns=["value"]) + result.loc[ts, "value"] = 12 + expected = pd.DataFrame( + [np.nan] * len(idx) + [12], + index=idx.append(pd.DatetimeIndex([ts])), + columns=["value"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + def test_loc_str_slicing(self): + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") + ser = ix.to_series() + result = ser.loc[:"2017-12"] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) + + def test_loc_label_slicing(self): + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") + ser = ix.to_series() + result = ser.loc[: ix[-2]] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_floats.py b/venv/Lib/site-packages/pandas/tests/indexing/test_floats.py new file mode 100644 index 0000000..2cc8232 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_floats.py @@ -0,0 +1,925 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series +import pandas._testing as tm + + +class TestFloatIndexers: + def check(self, result, original, indexer, getitem): + """ + comparator for results + we need to take care if we are indexing on a + Series or a frame + """ + if isinstance(original, Series): + expected = original.iloc[indexer] + else: + if getitem: + expected = original.iloc[:, indexer] + else: + expected = original.iloc[indexer] + + tm.assert_almost_equal(result, expected) + + def test_scalar_error(self): + + # GH 4892 + # float_indexers should raise exceptions + # on appropriate Index types & accessors + # this duplicates the code below + # but is specifically testing 
for the error + # message + + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + tm.makeIntIndex, + tm.makeRangeIndex, + ]: + + i = index(5) + + s = Series(np.arange(len(i)), index=i) + + msg = "Cannot index by location index" + with pytest.raises(TypeError, match=msg): + s.iloc[3.0] + + msg = ( + "cannot do positional indexing on {klass} with these " + r"indexers \[3\.0\] of {kind}".format(klass=type(i), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s.iloc[3.0] = 0 + + def test_scalar_non_numeric(self): + + # GH 4892 + # float_indexers should raise exceptions + # on appropriate Index types & accessors + + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: + + i = index(5) + + for s in [ + Series(np.arange(len(i)), index=i), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: + + # getting + for idxr, getitem in [(lambda x: x.iloc, False), (lambda x: x, True)]: + + # gettitem on a DataFrame is a KeyError as it is indexing + # via labels on the columns + if getitem and isinstance(s, DataFrame): + error = KeyError + msg = r"^3(\.0)?$" + else: + error = TypeError + msg = ( + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}|" + "Cannot index by location index with a " + "non-integer key".format(klass=type(i), kind=str(float)) + ) + with pytest.raises(error, match=msg): + idxr(s)[3.0] + + # label based can be a TypeError or KeyError + if s.index.inferred_type in { + "categorical", + "string", + "unicode", + "mixed", + }: + error = KeyError + msg = r"^3$" + else: + error = TypeError + msg = ( + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) + ) + with pytest.raises(error, match=msg): + s.loc[3.0] + + # contains + assert 3.0 not in s + + # setting with a float fails with iloc + msg = ( + r"cannot do (label|index|positional) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s.iloc[3.0] = 0 + + # setting with an indexer + if s.index.inferred_type in ["categorical"]: + # Value or Type Error + pass + elif s.index.inferred_type in ["datetime64", "timedelta64", "period"]: + + # these should prob work + # and are inconsistent between series/dataframe ATM + # for idxr in [lambda x: x]: + # s2 = s.copy() + # + # with pytest.raises(TypeError): + # idxr(s2)[3.0] = 0 + pass + + else: + + s2 = s.copy() + s2.loc[3.0] = 10 + assert s2.index.is_object() + + for idxr in [lambda x: x]: + s2 = s.copy() + idxr(s2)[3.0] = 0 + assert s2.index.is_object() + + # fallsback to position selection, series only + s = Series(np.arange(len(i)), index=i) + s[3] + msg = ( + r"cannot do (label|index) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[3.0] + + def test_scalar_with_mixed(self): + + s2 = Series([1, 2, 3], index=["a", "b", "c"]) + s3 = Series([1, 2, 3], index=["a", "b", 1.5]) + + # lookup in a pure stringstr + # with an invalid indexer + for idxr in [lambda x: x, lambda x: x.iloc]: + + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}|" + "Cannot index by 
location index with a non-integer key".format( + klass=str(Index), kind=str(float) + ) + ) + with pytest.raises(TypeError, match=msg): + idxr(s2)[1.0] + + with pytest.raises(KeyError, match=r"^1$"): + s2.loc[1.0] + + result = s2.loc["b"] + expected = 2 + assert result == expected + + # mixed index so we have label + # indexing + for idxr in [lambda x: x]: + + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}".format(klass=str(Index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + idxr(s3)[1.0] + + result = idxr(s3)[1] + expected = 2 + assert result == expected + + msg = "Cannot index by location index with a non-integer key" + with pytest.raises(TypeError, match=msg): + s3.iloc[1.0] + with pytest.raises(KeyError, match=r"^1$"): + s3.loc[1.0] + + result = s3.loc[1.5] + expected = 3 + assert result == expected + + def test_scalar_integer(self): + + # test how scalar float indexers work on int indexes + + # integer index + for i in [Int64Index(range(5)), RangeIndex(5)]: + + for s in [ + Series(np.arange(len(i))), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: + + # coerce to equal int + for idxr, getitem in [(lambda x: x.loc, False), (lambda x: x, True)]: + + result = idxr(s)[3.0] + self.check(result, s, 3, getitem) + + # coerce to equal int + for idxr, getitem in [(lambda x: x.loc, False), (lambda x: x, True)]: + + if isinstance(s, Series): + + def compare(x, y): + assert x == y + + expected = 100 + else: + compare = tm.assert_series_equal + if getitem: + expected = Series(100, index=range(len(s)), name=3) + else: + expected = Series(100.0, index=range(len(s)), name=3) + + s2 = s.copy() + idxr(s2)[3.0] = 100 + + result = idxr(s2)[3.0] + compare(result, expected) + + result = idxr(s2)[3] + compare(result, expected) + + # contains + # coerce to equal int + assert 3.0 in s + + def test_scalar_float(self): + + # scalar float indexers work on a float index + index = Index(np.arange(5.0)) + for s in [ + Series(np.arange(len(index)), index=index), + DataFrame( + np.random.randn(len(index), len(index)), index=index, columns=index + ), + ]: + + # assert all operations except for iloc are ok + indexer = index[3] + for idxr, getitem in [(lambda x: x.loc, False), (lambda x: x, True)]: + + # getting + result = idxr(s)[indexer] + self.check(result, s, 3, getitem) + + # setting + s2 = s.copy() + + result = idxr(s2)[indexer] + self.check(result, s, 3, getitem) + + # random integer is a KeyError + with pytest.raises(KeyError, match=r"^3\.5$"): + idxr(s)[3.5] + + # contains + assert 3.0 in s + + # iloc succeeds with an integer + expected = s.iloc[3] + s2 = s.copy() + + s2.iloc[3] = expected + result = s2.iloc[3] + self.check(result, s, 3, False) + + # iloc raises with a float + msg = "Cannot index by location index with a non-integer key" + with pytest.raises(TypeError, match=msg): + s.iloc[3.0] + + msg = ( + r"cannot do positional indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=str(Float64Index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s2.iloc[3.0] = 0 + + def test_slice_non_numeric(self): + + # GH 4892 + # float_indexers should raise exceptions + # on appropriate Index types & accessors + + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: + + index = index(5) + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: + + # getitem 
+ for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s.iloc[l] + + for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: + + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) + with pytest.raises(TypeError, match=msg): + idxr(s)[l] + + # setitem + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s.iloc[l] = 0 + + for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers" + r" \[(3|4)(\.0)?\]" + r" of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) + with pytest.raises(TypeError, match=msg): + idxr(s)[l] = 0 + + def test_slice_integer(self): + + # same as above, but for Integer based indexes + # these coerce to a like integer + # oob indicates if we are out of bounds + # of positional indexing + for index, oob in [ + (Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True), + ]: + + # s is an in-range index + s = Series(range(5), index=index) + + # getitem + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + for idxr in [lambda x: x.loc]: + + result = idxr(s)[l] + + # these are all label indexing + # except getitem which is positional + # empty + if oob: + indexer = slice(0, 0) + else: + indexer = slice(3, 5) + self.check(result, s, indexer, False) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] + + # getitem out-of-bounds + for l in [slice(-6, 6), slice(-6.0, 6.0)]: + + for idxr in [lambda x: x.loc]: + result = idxr(s)[l] + + # these are all label indexing + # except getitem which is positional + # empty + if oob: + indexer = slice(0, 0) + else: + indexer = slice(-6, 6) + self.check(result, s, indexer, False) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[-6\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[slice(-6.0, 6.0)] + + # getitem odd floats + for l, res1 in [ + (slice(2.5, 4), slice(3, 5)), + (slice(2, 3.5), slice(2, 4)), + (slice(2.5, 3.5), slice(3, 4)), + ]: + + for idxr in [lambda x: x.loc]: + + result = idxr(s)[l] + if oob: + res = slice(0, 0) + else: + res = res1 + + self.check(result, s, res, False) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|3)\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] + + # setitem + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + for idxr in [lambda x: x.loc]: + sc = s.copy() + idxr(sc)[l] = 0 + result = idxr(sc)[l].values.ravel() + assert (result == 0).all() + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these 
indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] = 0 + + def test_integer_positional_indexing(self): + """ make sure that we are raising on positional indexing + w.r.t. an integer index """ + + s = Series(range(2, 6), index=range(2, 6)) + + result = s[2:4] + expected = s.iloc[2:4] + tm.assert_series_equal(result, expected) + + for idxr in [lambda x: x, lambda x: x.iloc]: + + for l in [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)]: + + klass = RangeIndex + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|4)\.0\] of " + "{kind}".format(klass=str(klass), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + idxr(s)[l] + + def test_slice_integer_frame_getitem(self): + + # similar to above, but on the getitem dim (of a DataFrame) + for index in [Int64Index(range(5)), RangeIndex(5)]: + + s = DataFrame(np.random.randn(5, 2), index=index) + + def f(idxr): + + # getitem + for l in [slice(0.0, 1), slice(0, 1.0), slice(0.0, 1.0)]: + + result = idxr(s)[l] + indexer = slice(0, 2) + self.check(result, s, indexer, False) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(0|1)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] + + # getitem out-of-bounds + for l in [slice(-10, 10), slice(-10.0, 10.0)]: + + result = idxr(s)[l] + self.check(result, s, slice(-10, 10), True) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[-10\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[slice(-10.0, 10.0)] + + # getitem odd floats + for l, res in [ + (slice(0.5, 1), slice(1, 2)), + (slice(0, 0.5), slice(0, 1)), + (slice(0.5, 1.5), slice(1, 2)), + ]: + + result = idxr(s)[l] + self.check(result, s, res, False) + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[0\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] + + # setitem + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + sc = s.copy() + idxr(sc)[l] = 0 + result = idxr(sc)[l].values.ravel() + assert (result == 0).all() + + # positional indexing + msg = ( + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) + ) + with pytest.raises(TypeError, match=msg): + s[l] = 0 + + f(lambda x: x.loc) + + def test_slice_float(self): + + # same as above, but for floats + index = Index(np.arange(5.0)) + 0.1 + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: + + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + expected = s.iloc[3:4] + for idxr in [lambda x: x.loc, lambda x: x]: + + # getitem + result = idxr(s)[l] + if isinstance(s, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + # setitem + s2 = s.copy() + idxr(s2)[l] = 0 + result = idxr(s2)[l].values.ravel() + assert (result == 0).all() + + def test_floating_index_doc_example(self): + + index = Index([1.5, 2, 3, 4.5, 5]) + s = Series(range(5), index=index) + assert s[3] == 2 + assert s.loc[3] == 2 + assert s.loc[3] == 2 + assert s.iloc[3] == 3 + + def test_floating_misc(self): + + # related 236 + # scalar/slicing of a float index + s = 
Series(np.arange(5), index=np.arange(5) * 2.5, dtype=np.int64) + + # label based slicing + result1 = s[1.0:3.0] + result2 = s.loc[1.0:3.0] + result3 = s.loc[1.0:3.0] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + # exact indexing when found + result1 = s[5.0] + result2 = s.loc[5.0] + result3 = s.loc[5.0] + assert result1 == result2 + assert result1 == result3 + + result1 = s[5] + result2 = s.loc[5] + result3 = s.loc[5] + assert result1 == result2 + assert result1 == result3 + + assert s[5.0] == s[5] + + # value not found (and no fallbacking at all) + + # scalar integers + with pytest.raises(KeyError, match=r"^4\.0$"): + s.loc[4] + with pytest.raises(KeyError, match=r"^4\.0$"): + s.loc[4] + with pytest.raises(KeyError, match=r"^4\.0$"): + s[4] + + # fancy floats/integers create the correct entry (as nan) + # fancy tests + expected = Series([2, 0], index=Float64Index([5.0, 0.0])) + for fancy_idx in [[5.0, 0.0], np.array([5.0, 0.0])]: # float + tm.assert_series_equal(s[fancy_idx], expected) + tm.assert_series_equal(s.loc[fancy_idx], expected) + tm.assert_series_equal(s.loc[fancy_idx], expected) + + expected = Series([2, 0], index=Index([5, 0], dtype="int64")) + for fancy_idx in [[5, 0], np.array([5, 0])]: # int + tm.assert_series_equal(s[fancy_idx], expected) + tm.assert_series_equal(s.loc[fancy_idx], expected) + tm.assert_series_equal(s.loc[fancy_idx], expected) + + # all should return the same as we are slicing 'the same' + result1 = s.loc[2:5] + result2 = s.loc[2.0:5.0] + result3 = s.loc[2.0:5] + result4 = s.loc[2.1:5] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result4) + + # previously this did fallback indexing + result1 = s[2:5] + result2 = s[2.0:5.0] + result3 = s[2.0:5] + result4 = s[2.1:5] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result4) + + result1 = s.loc[2:5] + result2 = s.loc[2.0:5.0] + result3 = s.loc[2.0:5] + result4 = s.loc[2.1:5] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result4) + + # combined test + result1 = s.loc[2:5] + result2 = s.loc[2:5] + result3 = s[2:5] + + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + # list selection + result1 = s[[0.0, 5, 10]] + result2 = s.loc[[0.0, 5, 10]] + result3 = s.loc[[0.0, 5, 10]] + result4 = s.iloc[[0, 2, 4]] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result4) + + with pytest.raises(KeyError, match="with any missing labels"): + s[[1.6, 5, 10]] + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[1.6, 5, 10]] + + with pytest.raises(KeyError, match="with any missing labels"): + s[[0, 1, 2]] + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[0, 1, 2]] + + result1 = s.loc[[2.5, 5]] + result2 = s.loc[[2.5, 5]] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, Series([1, 2], index=[2.5, 5.0])) + + result1 = s[[2.5]] + result2 = s.loc[[2.5]] + result3 = s.loc[[2.5]] + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + tm.assert_series_equal(result1, Series([1], index=[2.5])) + + def test_floating_tuples(self): + # see gh-13509 + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name="foo") + + result = s[0.0] + assert result == (1, 1) + + 
expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name="foo") + + result = s[0.0] + tm.assert_series_equal(result, expected) + + def test_float64index_slicing_bug(self): + # GH 5557, related to slicing a float index + ser = { + 256: 2321.0, + 1: 78.0, + 2: 2716.0, + 3: 0.0, + 4: 369.0, + 5: 0.0, + 6: 269.0, + 7: 0.0, + 8: 0.0, + 9: 0.0, + 10: 3536.0, + 11: 0.0, + 12: 24.0, + 13: 0.0, + 14: 931.0, + 15: 0.0, + 16: 101.0, + 17: 78.0, + 18: 9643.0, + 19: 0.0, + 20: 0.0, + 21: 0.0, + 22: 63761.0, + 23: 0.0, + 24: 446.0, + 25: 0.0, + 26: 34773.0, + 27: 0.0, + 28: 729.0, + 29: 78.0, + 30: 0.0, + 31: 0.0, + 32: 3374.0, + 33: 0.0, + 34: 1391.0, + 35: 0.0, + 36: 361.0, + 37: 0.0, + 38: 61808.0, + 39: 0.0, + 40: 0.0, + 41: 0.0, + 42: 6677.0, + 43: 0.0, + 44: 802.0, + 45: 0.0, + 46: 2691.0, + 47: 0.0, + 48: 3582.0, + 49: 0.0, + 50: 734.0, + 51: 0.0, + 52: 627.0, + 53: 70.0, + 54: 2584.0, + 55: 0.0, + 56: 324.0, + 57: 0.0, + 58: 605.0, + 59: 0.0, + 60: 0.0, + 61: 0.0, + 62: 3989.0, + 63: 10.0, + 64: 42.0, + 65: 0.0, + 66: 904.0, + 67: 0.0, + 68: 88.0, + 69: 70.0, + 70: 8172.0, + 71: 0.0, + 72: 0.0, + 73: 0.0, + 74: 64902.0, + 75: 0.0, + 76: 347.0, + 77: 0.0, + 78: 36605.0, + 79: 0.0, + 80: 379.0, + 81: 70.0, + 82: 0.0, + 83: 0.0, + 84: 3001.0, + 85: 0.0, + 86: 1630.0, + 87: 7.0, + 88: 364.0, + 89: 0.0, + 90: 67404.0, + 91: 9.0, + 92: 0.0, + 93: 0.0, + 94: 7685.0, + 95: 0.0, + 96: 1017.0, + 97: 0.0, + 98: 2831.0, + 99: 0.0, + 100: 2963.0, + 101: 0.0, + 102: 854.0, + 103: 0.0, + 104: 0.0, + 105: 0.0, + 106: 0.0, + 107: 0.0, + 108: 0.0, + 109: 0.0, + 110: 0.0, + 111: 0.0, + 112: 0.0, + 113: 0.0, + 114: 0.0, + 115: 0.0, + 116: 0.0, + 117: 0.0, + 118: 0.0, + 119: 0.0, + 120: 0.0, + 121: 0.0, + 122: 0.0, + 123: 0.0, + 124: 0.0, + 125: 0.0, + 126: 67744.0, + 127: 22.0, + 128: 264.0, + 129: 0.0, + 260: 197.0, + 268: 0.0, + 265: 0.0, + 269: 0.0, + 261: 0.0, + 266: 1198.0, + 267: 0.0, + 262: 2629.0, + 258: 775.0, + 257: 0.0, + 263: 0.0, + 259: 0.0, + 264: 163.0, + 250: 10326.0, + 251: 0.0, + 252: 1228.0, + 253: 0.0, + 254: 2769.0, + 255: 0.0, + } + + # smoke test for the repr + s = Series(ser) + result = s.value_counts() + str(result) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_iloc.py b/venv/Lib/site-packages/pandas/tests/indexing/test_iloc.py new file mode 100644 index 0000000..a6bf0ef --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_iloc.py @@ -0,0 +1,690 @@ +""" test positional based indexing with iloc """ + +from datetime import datetime +from warnings import catch_warnings, simplefilter + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat, date_range, isna +import pandas._testing as tm +from pandas.api.types import is_scalar +from pandas.core.indexing import IndexingError +from pandas.tests.indexing.common import Base + + +class TestiLoc(Base): + def test_iloc_exceeds_bounds(self): + + # GH6296 + # iloc should allow indexers that exceed the bounds + df = DataFrame(np.random.random_sample((20, 5)), columns=list("ABCDE")) + + # lists of positions should raise IndexError! 
+ msg = "positional indexers are out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.iloc[:, [0, 1, 2, 3, 4, 5]] + with pytest.raises(IndexError, match=msg): + df.iloc[[1, 30]] + with pytest.raises(IndexError, match=msg): + df.iloc[[1, -30]] + with pytest.raises(IndexError, match=msg): + df.iloc[[100]] + + s = df["A"] + with pytest.raises(IndexError, match=msg): + s.iloc[[100]] + with pytest.raises(IndexError, match=msg): + s.iloc[[-100]] + + # still raise on a single indexer + msg = "single positional indexer is out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.iloc[30] + with pytest.raises(IndexError, match=msg): + df.iloc[-30] + + # GH10779 + # single positive/negative indexer exceeding Series bounds should raise + # an IndexError + with pytest.raises(IndexError, match=msg): + s.iloc[30] + with pytest.raises(IndexError, match=msg): + s.iloc[-30] + + # slices are ok + result = df.iloc[:, 4:10] # 0 < start < len < stop + expected = df.iloc[:, 4:] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -4:-10] # stop < 0 < start < len + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down) + expected = df.iloc[:, :4:-1] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down) + expected = df.iloc[:, 4::-1] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -10:4] # start < 0 < stop < len + expected = df.iloc[:, :4] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:4] # 0 < stop < len < start + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down) + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 10:11] # 0 < len < start < stop + expected = df.iloc[:, :0] + tm.assert_frame_equal(result, expected) + + # slice bounds exceeding is ok + result = s.iloc[18:30] + expected = s.iloc[18:] + tm.assert_series_equal(result, expected) + + result = s.iloc[30:] + expected = s.iloc[:0] + tm.assert_series_equal(result, expected) + + result = s.iloc[30::-1] + expected = s.iloc[::-1] + tm.assert_series_equal(result, expected) + + # doc example + def check(result, expected): + str(result) + result.dtypes + tm.assert_frame_equal(result, expected) + + dfl = DataFrame(np.random.randn(5, 2), columns=list("AB")) + check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) + check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) + check(dfl.iloc[4:6], dfl.iloc[[4]]) + + msg = "positional indexers are out-of-bounds" + with pytest.raises(IndexError, match=msg): + dfl.iloc[[4, 5, 6]] + msg = "single positional indexer is out-of-bounds" + with pytest.raises(IndexError, match=msg): + dfl.iloc[:, 4] + + @pytest.mark.parametrize("index,columns", [(np.arange(20), list("ABCDE"))]) + @pytest.mark.parametrize( + "index_vals,column_vals", + [ + ([slice(None), ["A", "D"]]), + (["1", "2"], slice(None)), + ([datetime(2019, 1, 1)], slice(None)), + ], + ) + def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): + # GH 25753 + df = DataFrame( + np.random.randn(len(index), len(columns)), index=index, columns=columns + ) + msg = ".iloc requires numeric indexers, got" + with pytest.raises(IndexError, match=msg): + df.iloc[index_vals, column_vals] + + def test_iloc_getitem_int(self): + # integer + self.check_result( + "iloc", + 2, + "iloc", + 2, + typs=["labels", "mixed", "ts", "floats", "empty"], + 
fails=IndexError, + ) + + def test_iloc_getitem_neg_int(self): + # neg integer + self.check_result( + "iloc", + -1, + "iloc", + -1, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + @pytest.mark.parametrize("dims", [1, 2]) + def test_iloc_getitem_invalid_scalar(self, dims): + # GH 21982 + + if dims == 1: + s = Series(np.arange(10)) + else: + s = DataFrame(np.arange(100).reshape(10, 10)) + + with pytest.raises(TypeError, match="Cannot index by location index"): + s.iloc["a"] + + def test_iloc_array_not_mutating_negative_indices(self): + + # GH 21867 + array_with_neg_numbers = np.array([1, 2, -1]) + array_copy = array_with_neg_numbers.copy() + df = pd.DataFrame( + {"A": [100, 101, 102], "B": [103, 104, 105], "C": [106, 107, 108]}, + index=[1, 2, 3], + ) + df.iloc[array_with_neg_numbers] + tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy) + df.iloc[:, array_with_neg_numbers] + tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy) + + def test_iloc_getitem_list_int(self): + self.check_result( + "iloc", + [0, 1, 2], + "iloc", + [0, 1, 2], + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + # array of ints (GH5006), make sure that a single indexer is returning + # the correct type + + def test_iloc_getitem_neg_int_can_reach_first_index(self): + # GH10547 and GH10779 + # negative integers should be able to reach index 0 + df = DataFrame({"A": [2, 3, 5], "B": [7, 11, 13]}) + s = df["A"] + + expected = df.iloc[0] + result = df.iloc[-3] + tm.assert_series_equal(result, expected) + + expected = df.iloc[[0]] + result = df.iloc[[-3]] + tm.assert_frame_equal(result, expected) + + expected = s.iloc[0] + result = s.iloc[-3] + assert result == expected + + expected = s.iloc[[0]] + result = s.iloc[[-3]] + tm.assert_series_equal(result, expected) + + # check the length 1 Series case highlighted in GH10547 + expected = Series(["a"], index=["A"]) + result = expected.iloc[[-1]] + tm.assert_series_equal(result, expected) + + def test_iloc_getitem_dups(self): + # GH 6766 + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = concat([df1, df2], axis=1) + + # cross-sectional indexing + result = df.iloc[0, 0] + assert isna(result) + + result = df.iloc[0, :] + expected = Series([np.nan, 1, 3, 3], index=["A", "B", "A", "B"], name=0) + tm.assert_series_equal(result, expected) + + def test_iloc_getitem_array(self): + # TODO: test something here? + pass + + def test_iloc_getitem_bool(self): + # TODO: test something here? + pass + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) + def test_iloc_getitem_bool_diff_len(self, index): + # GH26658 + s = Series([1, 2, 3]) + msg = "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + with pytest.raises(IndexError, match=msg): + _ = s.iloc[index] + + def test_iloc_getitem_slice(self): + # TODO: test something here? 
+ pass + + def test_iloc_getitem_slice_dups(self): + + df1 = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df2 = DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ) + + # axis=1 + df = concat([df1, df2], axis=1) + tm.assert_frame_equal(df.iloc[:, :4], df1) + tm.assert_frame_equal(df.iloc[:, 4:], df2) + + df = concat([df2, df1], axis=1) + tm.assert_frame_equal(df.iloc[:, :2], df2) + tm.assert_frame_equal(df.iloc[:, 2:], df1) + + exp = concat([df2, df1.iloc[:, [0]]], axis=1) + tm.assert_frame_equal(df.iloc[:, 0:3], exp) + + # axis=0 + df = concat([df, df], axis=0) + tm.assert_frame_equal(df.iloc[0:10, :2], df2) + tm.assert_frame_equal(df.iloc[0:10, 2:], df1) + tm.assert_frame_equal(df.iloc[10:, :2], df2) + tm.assert_frame_equal(df.iloc[10:, 2:], df1) + + def test_iloc_setitem(self): + df = self.frame_ints + + df.iloc[1, 1] = 1 + result = df.iloc[1, 1] + assert result == 1 + + df.iloc[:, 2:3] = 0 + expected = df.iloc[:, 2:3] + result = df.iloc[:, 2:3] + tm.assert_frame_equal(result, expected) + + # GH5771 + s = Series(0, index=[4, 5, 6]) + s.iloc[1:2] += 1 + expected = Series([0, 1, 0], index=[4, 5, 6]) + tm.assert_series_equal(s, expected) + + def test_iloc_setitem_list(self): + + # setitem with an iloc list + df = DataFrame( + np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"] + ) + df.iloc[[0, 1], [1, 2]] + df.iloc[[0, 1], [1, 2]] += 100 + + expected = DataFrame( + np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), + index=["A", "B", "C"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_pandas_object(self): + # GH 17193 + s_orig = Series([0, 1, 2, 3]) + expected = Series([0, -1, -2, 3]) + + s = s_orig.copy() + s.iloc[Series([1, 2])] = [-1, -2] + tm.assert_series_equal(s, expected) + + s = s_orig.copy() + s.iloc[pd.Index([1, 2])] = [-1, -2] + tm.assert_series_equal(s, expected) + + def test_iloc_setitem_dups(self): + + # GH 6766 + # iloc with a mask aligning from another iloc + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = concat([df1, df2], axis=1) + + expected = df.fillna(3) + expected["A"] = expected["A"].astype("float64") + inds = np.isnan(df.iloc[:, 0]) + mask = inds[inds].index + df.iloc[mask, 0] = df.iloc[mask, 2] + tm.assert_frame_equal(df, expected) + + # del a dup column across blocks + expected = DataFrame({0: [1, 2], 1: [3, 4]}) + expected.columns = ["B", "B"] + del df["A"] + tm.assert_frame_equal(df, expected) + + # assign back to self + df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] + tm.assert_frame_equal(df, expected) + + # reversed x 2 + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) + tm.assert_frame_equal(df, expected) + + # TODO: GH#27620 this test used to compare iloc against ix; check if this + # is redundant with another test comparing iloc against loc + def test_iloc_getitem_frame(self): + df = DataFrame( + np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0, 8, 2) + ) + + result = df.iloc[2] + exp = df.loc[4] + tm.assert_series_equal(result, exp) + + result = df.iloc[2, 2] + exp = df.loc[4, 4] + assert result == exp + + # slice + result = df.iloc[4:8] + expected = df.loc[8:14] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:, 2:3] + expected = df.loc[:, 4:5] + tm.assert_frame_equal(result, expected) + + # list of integers + 
result = df.iloc[[0, 1, 3]] + expected = df.loc[[0, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.iloc[[0, 1, 3], [0, 1]] + expected = df.loc[[0, 2, 6], [0, 2]] + tm.assert_frame_equal(result, expected) + + # neg indices + result = df.iloc[[-1, 1, 3], [-1, 1]] + expected = df.loc[[18, 2, 6], [6, 2]] + tm.assert_frame_equal(result, expected) + + # dups indices + result = df.iloc[[-1, -1, 1, 3], [-1, 1]] + expected = df.loc[[18, 18, 2, 6], [6, 2]] + tm.assert_frame_equal(result, expected) + + # with index-like + s = Series(index=range(1, 5), dtype=object) + result = df.iloc[s.index] + expected = df.loc[[2, 4, 6, 8]] + tm.assert_frame_equal(result, expected) + + def test_iloc_getitem_labelled_frame(self): + # try with labelled frame + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) + + result = df.iloc[1, 1] + exp = df.loc["b", "B"] + assert result == exp + + result = df.iloc[:, 2:3] + expected = df.loc[:, ["C"]] + tm.assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1, -1] + exp = df.loc["j", "D"] + assert result == exp + + # out-of-bounds exception + msg = "single positional indexer is out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.iloc[10, 5] + + # trying to use a label + msg = ( + r"Location based indexing can only have \[integer, integer" + r" slice \(START point is INCLUDED, END point is EXCLUDED\)," + r" listlike of integers, boolean array\] types" + ) + with pytest.raises(ValueError, match=msg): + df.iloc["j", "D"] + + def test_iloc_getitem_doc_issue(self): + + # multi axis slicing issue with single block + # surfaced in GH 6059 + + arr = np.random.randn(6, 4) + index = date_range("20130101", periods=6) + columns = list("ABCD") + df = DataFrame(arr, index=index, columns=columns) + + # defines ref_locs + df.describe() + + result = df.iloc[3:5, 0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) + tm.assert_frame_equal(result, expected) + + # for dups + df.columns = list("aaaa") + result = df.iloc[3:5, 0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=list("aa")) + tm.assert_frame_equal(result, expected) + + # related + arr = np.random.randn(6, 4) + index = list(range(0, 12, 2)) + columns = list(range(0, 8, 2)) + df = DataFrame(arr, index=index, columns=columns) + + df._data.blocks[0].mgr_locs + result = df.iloc[1:5, 2:4] + str(result) + result.dtypes + expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) + tm.assert_frame_equal(result, expected) + + def test_iloc_setitem_series(self): + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) + + df.iloc[1, 1] = 1 + result = df.iloc[1, 1] + assert result == 1 + + df.iloc[:, 2:3] = 0 + expected = df.iloc[:, 2:3] + result = df.iloc[:, 2:3] + tm.assert_frame_equal(result, expected) + + s = Series(np.random.randn(10), index=range(0, 20, 2)) + + s.iloc[1] = 1 + result = s.iloc[1] + assert result == 1 + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + tm.assert_series_equal(result, expected) + + s = Series([-1] * 6) + s.iloc[0::2] = [0, 2, 4] + s.iloc[1::2] = [1, 3, 5] + result = s + expected = Series([0, 1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + def test_iloc_setitem_list_of_lists(self): + + # GH 7551 + # list-of-list is set incorrectly in mixed vs. 
single dtyped frames + df = DataFrame( + dict(A=np.arange(5, dtype="int64"), B=np.arange(5, 10, dtype="int64")) + ) + df.iloc[2:4] = [[10, 11], [12, 13]] + expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) + + df = DataFrame(dict(A=list("abcde"), B=np.arange(5, 10, dtype="int64"))) + df.iloc[2:4] = [["x", 11], ["y", 13]] + expected = DataFrame(dict(A=["a", "b", "x", "y", "e"], B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [[0], slice(None, 1, None), np.array([0])]) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) + def test_iloc_setitem_with_scalar_index(self, indexer, value): + # GH #19474 + # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated + # elementwisely, not using "setter('A', ['Z'])". + + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + df.iloc[0, indexer] = value + result = df.iloc[0, 0] + + assert is_scalar(result) and result == "Z" + + def test_iloc_mask(self): + + # GH 3631, iloc with a mask (of a series) should raise + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "iLocation based boolean indexing cannot use an indexable as a mask" + with pytest.raises(ValueError, match=msg): + df.iloc[mask] + mask.index = range(len(mask)) + msg = "iLocation based boolean indexing on an integer type is not available" + with pytest.raises(NotImplementedError, match=msg): + df.iloc[mask] + + # ndarray ok + result = df.iloc[np.array([True] * len(mask), dtype=bool)] + tm.assert_frame_equal(result, df) + + # the possibilities + locs = np.arange(4) + nums = 2 ** locs + reps = [bin(num) for num in nums] + df = DataFrame({"locs": locs, "nums": nums}, reps) + + expected = { + (None, ""): "0b1100", + (None, ".loc"): "0b1100", + (None, ".iloc"): "0b1100", + ("index", ""): "0b11", + ("index", ".loc"): "0b11", + ("index", ".iloc"): ( + "iLocation based boolean indexing cannot use an indexable as a mask" + ), + ("locs", ""): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the indexed " + "object do not match).", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the " + "indexed object do not match).", + ("locs", ".iloc"): ( + "iLocation based boolean indexing on an " + "integer type is not available" + ), + } + + # UserWarnings from reindex of a boolean mask + with catch_warnings(record=True): + simplefilter("ignore", UserWarning) + result = dict() + for idx in [None, "index", "locs"]: + mask = (df.nums > 2).values + if idx: + mask = Series(mask, list(reversed(getattr(df, idx)))) + for method in ["", ".loc", ".iloc"]: + try: + if method: + accessor = getattr(df, method[1:]) + else: + accessor = df + ans = str(bin(accessor[mask]["nums"].sum())) + except (ValueError, IndexingError, NotImplementedError) as e: + ans = str(e) + + key = tuple([idx, method]) + r = expected.get(key) + if r != ans: + raise AssertionError( + "[{key}] does not match [{ans}], received [{r}]".format( + key=key, ans=ans, r=r + ) + ) + + def test_iloc_non_unique_indexing(self): + + # GH 4017, non-unique indexing (on the axis) + df = DataFrame({"A": [0.1] * 3000, "B": [1] * 3000}) + idx = np.arange(30) * 99 + expected = df.iloc[idx] + + df3 = concat([df, 2 * df, 3 * df]) + result = df3.iloc[idx] + + tm.assert_frame_equal(result, expected) + + df2 = DataFrame({"A": [0.1] * 1000, "B": [1] * 1000}) + df2 = concat([df2, 2 * df2, 3 * df2]) + + with 
pytest.raises(KeyError, match="with any missing labels"): + df2.loc[idx] + + def test_iloc_empty_list_indexer_is_ok(self): + + df = tm.makeCustomDataframe(5, 2) + # vertical empty + tm.assert_frame_equal( + df.iloc[:, []], + df.iloc[:, :0], + check_index_type=True, + check_column_type=True, + ) + # horizontal empty + tm.assert_frame_equal( + df.iloc[[], :], + df.iloc[:0, :], + check_index_type=True, + check_column_type=True, + ) + # horizontal empty + tm.assert_frame_equal( + df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) + + def test_identity_slice_returns_new_object(self): + # GH13873 + original_df = DataFrame({"a": [1, 2, 3]}) + sliced_df = original_df.iloc[:] + assert sliced_df is not original_df + + # should be a shallow copy + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() + + original_series = Series([1, 2, 3, 4, 5, 6]) + sliced_series = original_series.iloc[:] + assert sliced_series is not original_series + + # should also be a shallow copy + original_series[:3] = [7, 8, 9] + assert all(sliced_series[:3] == [7, 8, 9]) + + def test_indexing_zerodim_np_array(self): + # GH24919 + df = DataFrame([[1, 2], [3, 4]]) + result = df.iloc[np.array(0)] + s = pd.Series([1, 2], name=0) + tm.assert_series_equal(result, s) + + def test_series_indexing_zerodim_np_array(self): + # GH24919 + s = Series([1, 2]) + result = s.iloc[np.array(0)] + assert result == 1 diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_indexing.py b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing.py new file mode 100644 index 0000000..448a060 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing.py @@ -0,0 +1,1186 @@ +""" test fancy indexing & misc """ + +from datetime import datetime +import re +import weakref + +import numpy as np +import pytest + +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype + +import pandas as pd +from pandas import DataFrame, Index, NaT, Series +import pandas._testing as tm +from pandas.core.generic import NDFrame +from pandas.core.indexers import validate_indices +from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.tests.indexing.common import Base, _mklbl + +# ------------------------------------------------------------------------ +# Indexing test cases + + +class TestFancy(Base): + """ pure get/set item & fancy indexing """ + + def test_setitem_ndarray_1d(self): + # GH5508 + + # len of indexer vs length of the 1d ndarray + df = DataFrame(index=Index(np.arange(1, 11))) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) + + # invalid + with pytest.raises(ValueError): + df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) + + # valid + df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) + + result = df.loc[df.index[2:6], "bar"] + expected = Series( + [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar" + ) + tm.assert_series_equal(result, expected) + + # dtype getting changed? 
+ df = DataFrame(index=Index(np.arange(1, 11))) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) + + with pytest.raises(ValueError): + df[2:5] = np.arange(1, 4) * 1j + + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "getitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + ], + ) + def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): + # GH 25567 + obj = obj(index) + idxr = idxr(obj) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "Cannot index with multidimensional key|" + r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" + "Index data must be 1-dimensional" + ) + + if ( + isinstance(obj, Series) + and idxr_id == "getitem" + and index.inferred_type + in [ + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + idxr[nd3] + else: + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(DeprecationWarning): + idxr[nd3] + + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "setitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + ], + ) + def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): + # GH 25567 + obj = obj(index) + idxr = idxr(obj) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "'pandas._libs.interval.IntervalTree' object has no attribute " + "'set_value'|" # AttributeError + "unhashable type: 'numpy.ndarray'|" # TypeError + "No matching signature found|" # TypeError + r"^\[\[\[|" # pandas.core.indexing.IndexingError + "Index data must be 1-dimensional" + ) + + if (idxr_id == "iloc") or ( + ( + isinstance(obj, Series) + and idxr_id == "setitem" + and index.inferred_type + in [ + "floating", + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ) + ): + idxr[nd3] = 0 + else: + err = (ValueError, AttributeError) + with pytest.raises(err, match=msg): + idxr[nd3] = 0 + + def test_inf_upcast(self): + # GH 16957 + # We should be able to use np.inf as a key + # np.inf should cause an index to convert to float + + # Test with np.inf in rows + df = DataFrame(columns=[0]) + df.loc[1] = 1 + df.loc[2] = 2 + df.loc[np.inf] = 3 + + # make sure we can look up the value + assert df.loc[np.inf, 0] == 3 + + result = df.index + expected = pd.Float64Index([1, 2, np.inf]) + tm.assert_index_equal(result, expected) + + # Test with np.inf in columns + df = DataFrame() + df.loc[0, 0] = 1 + df.loc[1, 1] = 2 + df.loc[0, np.inf] = 3 + + result = df.columns + expected = pd.Float64Index([0, 1, np.inf]) + tm.assert_index_equal(result, expected) + + def test_setitem_dtype_upcast(self): + + # GH3216 + df = DataFrame([{"a": 1}, {"a": 3, "b": 
2}]) + df["c"] = np.nan + assert df["c"].dtype == np.float64 + + df.loc[0, "c"] = "foo" + expected = DataFrame( + [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] + ) + tm.assert_frame_equal(df, expected) + + # GH10280 + df = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + + for val in [3.14, "wxyz"]: + left = df.copy() + left.loc["a", "bar"] = val + right = DataFrame( + [[0, val, 2], [3, 4, 5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + + tm.assert_frame_equal(left, right) + assert is_integer_dtype(left["foo"]) + assert is_integer_dtype(left["baz"]) + + left = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3) / 10.0, + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + left.loc["a", "bar"] = "wxyz" + + right = DataFrame( + [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + + tm.assert_frame_equal(left, right) + assert is_float_dtype(left["foo"]) + assert is_float_dtype(left["baz"]) + + def test_dups_fancy_indexing(self): + + # GH 3455 + + df = tm.makeCustomDataframe(10, 3) + df.columns = ["a", "a", "b"] + result = df[["b", "a"]].columns + expected = Index(["b", "a", "a"]) + tm.assert_index_equal(result, expected) + + # across dtypes + df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) + df.head() + str(df) + result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) + result.columns = list("aaaaaaa") + + # TODO(wesm): unused? + df_v = df.iloc[:, 4] # noqa + res_v = result.iloc[:, 4] # noqa + + tm.assert_frame_equal(df, result) + + # GH 3561, dups not in selected order + df = DataFrame( + {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")}, + index=["A", "A", "B", "C"], + ) + rows = ["C", "B"] + expected = DataFrame( + {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows + ) + result = df.loc[rows] + tm.assert_frame_equal(result, expected) + + result = df.loc[Index(rows)] + tm.assert_frame_equal(result, expected) + + rows = ["C", "B", "E"] + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[rows] + + # see GH5553, make sure we use the right indexer + rows = ["F", "G", "H", "C", "B", "E"] + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[rows] + + # List containing only missing label + dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='object')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] + + # ToDo: check_index_type can be True after GH 11497 + + # GH 4619; duplicate indexer with missing label + df = DataFrame({"A": [0, 1, 2]}) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[[0, 8, 0]] + + df = DataFrame({"A": list("abc")}) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[[0, 8, 0]] + + # non unique with non unique selector + df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[["A", "A", "E"]] + + def test_dups_fancy_indexing2(self): + # GH 5835 + # dups on index and missing values + df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"]) + + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[:, ["A", "B", "C"]] + + # GH 6504, multi-axis indexing + df = DataFrame( + np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"] + ) + + 
expected = df.iloc[0:6] + result = df.loc[[1, 2]] + tm.assert_frame_equal(result, expected) + + expected = df + result = df.loc[:, ["a", "b"]] + tm.assert_frame_equal(result, expected) + + expected = df.iloc[0:6, :] + result = df.loc[[1, 2], ["a", "b"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc]) + def test_duplicate_int_indexing(self, case): + # GH 17347 + s = pd.Series(range(3), index=[1, 1, 3]) + expected = s[1] + result = case(s)[[1]] + tm.assert_series_equal(result, expected) + + def test_indexing_mixed_frame_bug(self): + + # GH3492 + df = DataFrame( + {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}} + ) + + # this works, new column is created correctly + df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x) + + # this does not work, ie column test is not changed + idx = df["test"] == "_" + temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x) + df.loc[idx, "test"] = temp + assert df.iloc[0, 2] == "-----" + + def test_multitype_list_index_access(self): + # GH 10610 + df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) + + with pytest.raises(KeyError, match=re.escape("'[-8, 26] not in index'")): + df[[22, 26, -8]] + assert df[21].shape[0] == df.shape[0] + + def test_set_index_nan(self): + + # GH 3586 + df = DataFrame( + { + "PRuid": { + 17: "nonQC", + 18: "nonQC", + 19: "nonQC", + 20: "10", + 21: "11", + 22: "12", + 23: "13", + 24: "24", + 25: "35", + 26: "46", + 27: "47", + 28: "48", + 29: "59", + 30: "10", + }, + "QC": { + 17: 0.0, + 18: 0.0, + 19: 0.0, + 20: np.nan, + 21: np.nan, + 22: np.nan, + 23: np.nan, + 24: 1.0, + 25: np.nan, + 26: np.nan, + 27: np.nan, + 28: np.nan, + 29: np.nan, + 30: np.nan, + }, + "data": { + 17: 7.9544899999999998, + 18: 8.0142609999999994, + 19: 7.8591520000000008, + 20: 0.86140349999999999, + 21: 0.87853110000000001, + 22: 0.8427041999999999, + 23: 0.78587700000000005, + 24: 0.73062459999999996, + 25: 0.81668560000000001, + 26: 0.81927080000000008, + 27: 0.80705009999999999, + 28: 0.81440240000000008, + 29: 0.80140849999999997, + 30: 0.81307740000000006, + }, + "year": { + 17: 2006, + 18: 2007, + 19: 2008, + 20: 1985, + 21: 1985, + 22: 1985, + 23: 1985, + 24: 1985, + 25: 1985, + 26: 1985, + 27: 1985, + 28: 1985, + 29: 1985, + 30: 1986, + }, + } + ).reset_index() + + result = ( + df.set_index(["year", "PRuid", "QC"]) + .reset_index() + .reindex(columns=df.columns) + ) + tm.assert_frame_equal(result, df) + + def test_multi_assign(self): + + # GH 3626, an assignment of a sub-df to a df + df = DataFrame( + { + "FC": ["a", "b", "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": list(range(6)), + "col2": list(range(6, 12)), + } + ) + df.iloc[1, 0] = np.nan + df2 = df.copy() + + mask = ~df2.FC.isna() + cols = ["col1", "col2"] + + dft = df2 * 2 + dft.iloc[3, 3] = np.nan + + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": Series([0, 1, 4, 6, 8, 10]), + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) + + # frame on rhs + df2.loc[mask, cols] = dft.loc[mask, cols] + tm.assert_frame_equal(df2, expected) + + df2.loc[mask, cols] = dft.loc[mask, cols] + tm.assert_frame_equal(df2, expected) + + # with an ndarray on rhs + # coerces to float64 because values has float64 dtype + # GH 14001 + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) + 
df2 = df.copy() + df2.loc[mask, cols] = dft.loc[mask, cols].values + tm.assert_frame_equal(df2, expected) + df2.loc[mask, cols] = dft.loc[mask, cols].values + tm.assert_frame_equal(df2, expected) + + # broadcasting on the rhs is required + df = DataFrame( + dict( + A=[1, 2, 0, 0, 0], + B=[0, 0, 0, 10, 11], + C=[0, 0, 0, 10, 11], + D=[3, 4, 5, 6, 7], + ) + ) + + expected = df.copy() + mask = expected["A"] == 0 + for col in ["A", "B"]: + expected.loc[mask, col] = df["D"] + + df.loc[df["A"] == 0, ["A", "B"]] = df["D"] + tm.assert_frame_equal(df, expected) + + def test_setitem_list(self): + + # GH 6043 + # iloc with a list + df = DataFrame(index=[0, 1], columns=[0]) + df.iloc[1, 0] = [1, 2, 3] + df.iloc[1, 0] = [1, 2] + + result = DataFrame(index=[0, 1], columns=[0]) + result.iloc[1, 0] = [1, 2] + + tm.assert_frame_equal(result, df) + + # iloc with an object + class TO: + def __init__(self, value): + self.value = value + + def __str__(self) -> str: + return "[{0}]".format(self.value) + + __repr__ = __str__ + + def __eq__(self, other) -> bool: + return self.value == other.value + + def view(self): + return self + + df = DataFrame(index=[0, 1], columns=[0]) + df.iloc[1, 0] = TO(1) + df.iloc[1, 0] = TO(2) + + result = DataFrame(index=[0, 1], columns=[0]) + result.iloc[1, 0] = TO(2) + + tm.assert_frame_equal(result, df) + + # remains object dtype even after setting it back + df = DataFrame(index=[0, 1], columns=[0]) + df.iloc[1, 0] = TO(1) + df.iloc[1, 0] = np.nan + result = DataFrame(index=[0, 1], columns=[0]) + + tm.assert_frame_equal(result, df) + + def test_string_slice(self): + # GH 14424 + # string indexing against datetimelike with object + # dtype should properly raises KeyError + df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) + assert df.index.is_all_dates + with pytest.raises(KeyError, match="'2011'"): + df["2011"] + + with pytest.raises(KeyError, match="'2011'"): + df.loc["2011", 0] + + df = DataFrame() + assert not df.index.is_all_dates + with pytest.raises(KeyError, match="'2011'"): + df["2011"] + + with pytest.raises(KeyError, match="'2011'"): + df.loc["2011", 0] + + def test_astype_assignment(self): + + # GH4312 (iloc) + df_orig = DataFrame( + [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + + df = df_orig.copy() + df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + tm.assert_frame_equal(df, expected) + + # GH5702 (loc) + df = df_orig.copy() + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + expected = DataFrame( + [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + tm.assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) + expected = DataFrame( + [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + tm.assert_frame_equal(df, expected) + + # full replacements / no nans + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.iloc[:, 0] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.loc[:, "A"] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) + tm.assert_frame_equal(df, expected) + + 
@pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), 2), + (Index([0, 1, "2"]), "2"), + (Index([0, 1, 2, np.inf, 4]), 4), + (Index([0, 1, 2, np.nan, 4]), 4), + (Index([0, 1, 2, np.inf]), np.inf), + (Index([0, 1, 2, np.nan]), np.nan), + ], + ) + def test_index_contains(self, index, val): + assert val in index + + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), "2"), + (Index([0, 1, "2"]), 2), + (Index([0, 1, 2, np.inf]), 4), + (Index([0, 1, 2, np.nan]), 4), + (Index([0, 1, 2, np.inf]), np.nan), + (Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (pd.Int64Index([0, 1, 2]), np.inf), + (pd.Int64Index([0, 1, 2]), np.nan), + (pd.UInt64Index([0, 1, 2]), np.inf), + (pd.UInt64Index([0, 1, 2]), np.nan), + ], + ) + def test_index_not_contains(self, index, val): + assert val not in index + + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] + ) + def test_mixed_index_contains(self, index, val): + # GH 19860 + assert val in index + + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] + ) + def test_mixed_index_not_contains(self, index, val): + # GH 19860 + assert val not in index + + def test_contains_with_float_index(self): + # GH#22085 + integer_index = pd.Int64Index([0, 1, 2, 3]) + uinteger_index = pd.UInt64Index([0, 1, 2, 3]) + float_index = pd.Float64Index([0.1, 1.1, 2.2, 3.3]) + + for index in (integer_index, uinteger_index): + assert 1.1 not in index + assert 1.0 in index + assert 1 in index + + assert 1.1 in float_index + assert 1.0 not in float_index + assert 1 not in float_index + + def test_index_type_coercion(self): + + # GH 11836 + # if we have an index type and set it with something that looks + # to numpy like the same, but is actually, not + # (e.g. 
setting with a float or string '0') + # then we need to coerce to object + + # integer indexes + for s in [Series(range(5)), Series(range(5), index=range(1, 6))]: + + assert s.index.is_integer() + + for indexer in [lambda x: x.loc, lambda x: x]: + s2 = s.copy() + indexer(s2)[0.1] = 0 + assert s2.index.is_floating() + assert indexer(s2)[0.1] == 0 + + s2 = s.copy() + indexer(s2)[0.0] = 0 + exp = s.index + if 0 not in s: + exp = Index(s.index.tolist() + [0]) + tm.assert_index_equal(s2.index, exp) + + s2 = s.copy() + indexer(s2)["0"] = 0 + assert s2.index.is_object() + + for s in [Series(range(5), index=np.arange(5.0))]: + + assert s.index.is_floating() + + for idxr in [lambda x: x.loc, lambda x: x]: + + s2 = s.copy() + idxr(s2)[0.1] = 0 + assert s2.index.is_floating() + assert idxr(s2)[0.1] == 0 + + s2 = s.copy() + idxr(s2)[0.0] = 0 + tm.assert_index_equal(s2.index, s.index) + + s2 = s.copy() + idxr(s2)["0"] = 0 + assert s2.index.is_object() + + +class TestMisc(Base): + def test_float_index_to_mixed(self): + df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)}) + df["a"] = 10 + tm.assert_frame_equal( + DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10}), df + ) + + def test_float_index_non_scalar_assignment(self): + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) + df.loc[df.index[:2]] = 1 + expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) + tm.assert_frame_equal(expected, df) + + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) + df2 = df.copy() + df.loc[df.index] = df.loc[df.index] + tm.assert_frame_equal(df, df2) + + def test_float_index_at_iat(self): + s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) + for el, item in s.items(): + assert s.at[el] == item + for i in range(len(s)): + assert s.iat[i] == i + 1 + + def test_mixed_index_assignment(self): + # GH 19860 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) + s.at["a"] = 11 + assert s.iat[0] == 11 + s.at[1] = 22 + assert s.iat[3] == 22 + + def test_mixed_index_no_fallback(self): + # GH 19860 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) + with pytest.raises(KeyError, match="^0$"): + s.at[0] + with pytest.raises(KeyError, match="^4$"): + s.at[4] + + def test_rhs_alignment(self): + # GH8258, tests that both rows & columns are aligned to what is + # assigned to. 
covers both uniform data-type & multi-type cases + def run_tests(df, rhs, right): + # label, index, slice + lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4) + lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3) + + left = df.copy() + left.loc[lbl_one, lbl_two] = rhs + tm.assert_frame_equal(left, right) + + left = df.copy() + left.iloc[idx_one, idx_two] = rhs + tm.assert_frame_equal(left, right) + + left = df.copy() + left.iloc[slice_one, slice_two] = rhs + tm.assert_frame_equal(left, right) + + xs = np.arange(20).reshape(5, 4) + cols = ["jim", "joe", "jolie", "joline"] + df = DataFrame(xs, columns=cols, index=list("abcde")) + + # right hand side; permute the indices and multiplpy by -2 + rhs = -2 * df.iloc[3:0:-1, 2:0:-1] + + # expected `right` result; just multiply by -2 + right = df.copy() + right.iloc[1:4, 1:3] *= -2 + + # run tests with uniform dtypes + run_tests(df, rhs, right) + + # make frames multi-type & re-run tests + for frame in [df, rhs, right]: + frame["joe"] = frame["joe"].astype("float64") + frame["jolie"] = frame["jolie"].map("@{0}".format) + + run_tests(df, rhs, right) + + def test_str_label_slicing_with_negative_step(self): + SLC = pd.IndexSlice + + def assert_slices_equivalent(l_slc, i_slc): + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + + if not idx.is_integer: + # For integer indices, .loc and plain getitem are position-based. + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + + for idx in [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: + idx = Index(idx) + s = Series(np.arange(20), index=idx) + assert_slices_equivalent(SLC[idx[9] :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC[: idx[9] : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[idx[9] : idx[13] : -1], SLC[:0]) + + def test_slice_with_zero_step_raises(self): + s = Series(np.arange(20), index=_mklbl("A", 20)) + with pytest.raises(ValueError, match="slice step cannot be zero"): + s[::0] + with pytest.raises(ValueError, match="slice step cannot be zero"): + s.loc[::0] + + def test_indexing_assignment_dict_already_exists(self): + df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}).set_index("z") + expected = df.copy() + rhs = dict(x=9, y=99) + df.loc[5] = rhs + expected.loc[5] = [9, 99] + tm.assert_frame_equal(df, expected) + + def test_indexing_dtypes_on_empty(self): + # Check that .iloc returns correct dtypes GH9983 + df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]}) + df2 = df.iloc[[], :] + + assert df2.loc[:, "a"].dtype == np.int64 + tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0]) + + def test_range_in_series_indexing(self): + # range can cause an indexing error + # GH 11652 + for x in [5, 999999, 1000000]: + s = Series(index=range(x), dtype=np.float64) + s.loc[range(1)] = 42 + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + + s.loc[range(2)] = 43 + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) + + def test_non_reducing_slice(self): + df = DataFrame([[0, 1], [2, 3]]) + + slices = [ + # pd.IndexSlice[:, :], + pd.IndexSlice[:, 1], + pd.IndexSlice[1, :], + pd.IndexSlice[[1], [1]], + pd.IndexSlice[1, [1]], + pd.IndexSlice[[1], 1], + pd.IndexSlice[1], + pd.IndexSlice[1, 1], + slice(None, None, None), + [0, 1], + np.array([0, 1]), + Series([0, 1]), + ] + for slice_ in slices: + tslice_ = _non_reducing_slice(slice_) + assert isinstance(df.loc[tslice_], DataFrame) + 
+ def test_list_slice(self): + # like dataframe getitem + slices = [["A"], Series(["A"]), np.array(["A"])] + df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) + expected = pd.IndexSlice[:, ["A"]] + for subset in slices: + result = _non_reducing_slice(subset) + tm.assert_frame_equal(df.loc[result], df.loc[expected]) + + def test_maybe_numeric_slice(self): + df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) + result = _maybe_numeric_slice(df, slice_=None) + expected = pd.IndexSlice[:, ["A"]] + assert result == expected + + result = _maybe_numeric_slice(df, None, include_bool=True) + expected = pd.IndexSlice[:, ["A", "C"]] + result = _maybe_numeric_slice(df, [1]) + expected = [1] + assert result == expected + + def test_partial_boolean_frame_indexing(self): + # GH 17170 + df = DataFrame( + np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC") + ) + index_df = DataFrame(1, index=list("ab"), columns=list("AB")) + result = df[index_df.notnull()] + expected = DataFrame( + np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]), + index=list("abc"), + columns=list("ABC"), + ) + tm.assert_frame_equal(result, expected) + + def test_no_reference_cycle(self): + df = DataFrame({"a": [0, 1], "b": [2, 3]}) + for name in ("loc", "iloc", "at", "iat"): + getattr(df, name) + wr = weakref.ref(df) + del df + assert wr() is None + + +class TestSeriesNoneCoercion: + EXPECTED_RESULTS = [ + # For numeric series, we should coerce to NaN. + ([1, 2, 3], [np.nan, 2, 3]), + ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), + # For datetime series, we should coerce to NaT. + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + # For objects, we should preserve the None value. + (["foo", "bar", "baz"], [None, "bar", "baz"]), + ] + + def test_coercion_with_setitem(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series[0] = None + + expected_series = Series(expected_result) + tm.assert_series_equal(start_series, expected_series) + + def test_coercion_with_loc_setitem(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series.loc[0] = None + + expected_series = Series(expected_result) + tm.assert_series_equal(start_series, expected_series) + + def test_coercion_with_setitem_and_series(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series[start_series == start_series[0]] = None + + expected_series = Series(expected_result) + tm.assert_series_equal(start_series, expected_series) + + def test_coercion_with_loc_and_series(self): + for start_data, expected_result in self.EXPECTED_RESULTS: + start_series = Series(start_data) + start_series.loc[start_series == start_series[0]] = None + + expected_series = Series(expected_result) + tm.assert_series_equal(start_series, expected_series) + + +class TestDataframeNoneCoercion: + EXPECTED_SINGLE_ROW_RESULTS = [ + # For numeric series, we should coerce to NaN. + ([1, 2, 3], [np.nan, 2, 3]), + ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), + # For datetime series, we should coerce to NaT. + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + # For objects, we should preserve the None value. 
+ (["foo", "bar", "baz"], [None, "bar", "baz"]), + ] + + def test_coercion_with_loc(self): + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[0, ["foo"]] = None + + expected_dataframe = DataFrame({"foo": expected_result}) + tm.assert_frame_equal(start_dataframe, expected_dataframe) + + def test_coercion_with_setitem_and_dataframe(self): + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None + + expected_dataframe = DataFrame({"foo": expected_result}) + tm.assert_frame_equal(start_dataframe, expected_dataframe) + + def test_none_coercion_loc_and_dataframe(self): + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[ + start_dataframe["foo"] == start_dataframe["foo"][0] + ] = None + + expected_dataframe = DataFrame({"foo": expected_result}) + tm.assert_frame_equal(start_dataframe, expected_dataframe) + + def test_none_coercion_mixed_dtypes(self): + start_dataframe = DataFrame( + { + "a": [1, 2, 3], + "b": [1.0, 2.0, 3.0], + "c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": ["a", "b", "c"], + } + ) + start_dataframe.iloc[0] = None + + exp = DataFrame( + { + "a": [np.nan, 2, 3], + "b": [np.nan, 2.0, 3.0], + "c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": [None, "b", "c"], + } + ) + tm.assert_frame_equal(start_dataframe, exp) + + +def test_validate_indices_ok(): + indices = np.asarray([0, 1]) + validate_indices(indices, 2) + validate_indices(indices[:0], 0) + validate_indices(np.array([-1, -1]), 0) + + +def test_validate_indices_low(): + indices = np.asarray([0, -2]) + with pytest.raises(ValueError, match="'indices' contains"): + validate_indices(indices, 2) + + +def test_validate_indices_high(): + indices = np.asarray([0, 1, 2]) + with pytest.raises(IndexError, match="indices are out"): + validate_indices(indices, 2) + + +def test_validate_indices_empty(): + with pytest.raises(IndexError, match="indices are out"): + validate_indices(np.array([0, 1]), 0) + + +def test_extension_array_cross_section(): + # A cross-section of a homogeneous EA should be an EA + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([1, 2]), + "B": pd.core.arrays.integer_array([3, 4]), + }, + index=["a", "b"], + ) + expected = pd.Series( + pd.core.arrays.integer_array([1, 3]), index=["A", "B"], name="a" + ) + result = df.loc["a"] + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + +def test_extension_array_cross_section_converts(): + df = pd.DataFrame( + {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])}, + index=["a", "b"], + ) + result = df.loc["a"] + expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a") + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "idxr, error, error_message", + [ + (lambda x: x, AbstractMethodError, None), + ( + lambda x: x.loc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_NAMES'", + ), + ( + lambda x: x.iloc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_NAMES'", + ), + ], +) +def test_ndframe_indexing_raises(idxr, error, error_message): + # GH 25567 + frame = NDFrame(np.random.randint(5, 
size=(2, 2, 2))) + with pytest.raises(error, match=error_message): + idxr(frame)[0] + + +def test_readonly_indices(): + # GH#17192 iloc with read-only array raising TypeError + df = pd.DataFrame({"data": np.ones(100, dtype="float64")}) + indices = np.array([1, 3, 6]) + indices.flags.writeable = False + + result = df.iloc[indices] + expected = df.loc[[1, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df["data"].iloc[indices] + expected = df["data"].loc[[1, 3, 6]] + tm.assert_series_equal(result, expected) + + +def test_1tuple_without_multiindex(): + ser = pd.Series(range(5)) + key = (slice(3),) + + result = ser[key] + expected = ser[key[0]] + tm.assert_series_equal(result, expected) + + +def test_duplicate_index_mistyped_key_raises_keyerror(): + # GH#29189 float_index.get_loc(None) should raise KeyError, not TypeError + ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + with pytest.raises(KeyError): + ser[None] + + with pytest.raises(KeyError): + ser.index.get_loc(None) + + with pytest.raises(KeyError): + ser.index._engine.get_loc(None) + + +def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): + # GH 30567 + ser = pd.Series([None] * 10) + mask = [False] * 3 + [True] * 5 + [False] * 2 + ser[mask] = range(5) + result = ser + expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_engines.py b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_engines.py new file mode 100644 index 0000000..edb5d7d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_engines.py @@ -0,0 +1,163 @@ +import numpy as np + +from pandas._libs import algos as libalgos, index as libindex + +import pandas._testing as tm + + +class TestNumericEngine: + def test_is_monotonic(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + num = 1000 + arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype) + + # monotonic increasing + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_monotonic_increasing is True + assert engine.is_monotonic_decreasing is False + + # monotonic decreasing + engine = engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is True + + # neither monotonic increasing or decreasing + arr = np.array([1] * num + [2] * num + [1] * num, dtype=dtype) + engine = engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is False + + def test_is_unique(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + # unique + arr = np.array([1, 3, 2], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_unique is True + + # not unique + arr = np.array([1, 2, 1], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.is_unique is False + + def test_get_loc(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + # unique + arr = np.array([1, 2, 3], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.get_loc(2) == 1 + + # monotonic + num = 1000 + arr = np.array([1] * num + [2] * num + [3] * num, dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + assert engine.get_loc(2) == slice(1000, 2000) + + # not 
monotonic + arr = np.array([1, 2, 3] * num, dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + expected = np.array([False, True, False] * num, dtype=bool) + result = engine.get_loc(2) + assert (result == expected).all() + + def test_get_backfill_indexer(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + arr = np.array([1, 5, 10], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + + new = np.arange(12, dtype=dtype) + result = engine.get_backfill_indexer(new) + + expected = libalgos.backfill(arr, new) + tm.assert_numpy_array_equal(result, expected) + + def test_get_pad_indexer(self, numeric_indexing_engine_type_and_dtype): + engine_type, dtype = numeric_indexing_engine_type_and_dtype + + arr = np.array([1, 5, 10], dtype=dtype) + engine = engine_type(lambda: arr, len(arr)) + + new = np.arange(12, dtype=dtype) + result = engine.get_pad_indexer(new) + + expected = libalgos.pad(arr, new) + tm.assert_numpy_array_equal(result, expected) + + +class TestObjectEngine: + engine_type = libindex.ObjectEngine + dtype = np.object_ + values = list("abc") + + def test_is_monotonic(self): + + num = 1000 + arr = np.array(["a"] * num + ["a"] * num + ["c"] * num, dtype=self.dtype) + + # monotonic increasing + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_monotonic_increasing is True + assert engine.is_monotonic_decreasing is False + + # monotonic decreasing + engine = self.engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is True + + # neither monotonic increasing or decreasing + arr = np.array(["a"] * num + ["b"] * num + ["a"] * num, dtype=self.dtype) + engine = self.engine_type(lambda: arr[::-1], len(arr)) + assert engine.is_monotonic_increasing is False + assert engine.is_monotonic_decreasing is False + + def test_is_unique(self): + # unique + arr = np.array(self.values, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_unique is True + + # not unique + arr = np.array(["a", "b", "a"], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.is_unique is False + + def test_get_loc(self): + # unique + arr = np.array(self.values, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.get_loc("b") == 1 + + # monotonic + num = 1000 + arr = np.array(["a"] * num + ["b"] * num + ["c"] * num, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + assert engine.get_loc("b") == slice(1000, 2000) + + # not monotonic + arr = np.array(self.values * num, dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + expected = np.array([False, True, False] * num, dtype=bool) + result = engine.get_loc("b") + assert (result == expected).all() + + def test_get_backfill_indexer(self): + arr = np.array(["a", "e", "j"], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + + new = np.array(list("abcdefghij"), dtype=self.dtype) + result = engine.get_backfill_indexer(new) + + expected = libalgos.backfill["object"](arr, new) + tm.assert_numpy_array_equal(result, expected) + + def test_get_pad_indexer(self): + arr = np.array(["a", "e", "j"], dtype=self.dtype) + engine = self.engine_type(lambda: arr, len(arr)) + + new = np.array(list("abcdefghij"), dtype=self.dtype) + result = engine.get_pad_indexer(new) + + expected = libalgos.pad["object"](arr, new) + tm.assert_numpy_array_equal(result, expected) diff --git 
a/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_slow.py b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_slow.py new file mode 100644 index 0000000..2ffa44b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_indexing_slow.py @@ -0,0 +1,14 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestIndexingSlow: + @pytest.mark.slow + def test_large_dataframe_indexing(self): + # GH10692 + result = DataFrame({"x": range(10 ** 6)}, dtype="int64") + result.loc[len(result)] = len(result) + 1 + expected = DataFrame({"x": range(10 ** 6 + 1)}, dtype="int64") + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_loc.py b/venv/Lib/site-packages/pandas/tests/indexing/test_loc.py new file mode 100644 index 0000000..e5930b2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_loc.py @@ -0,0 +1,1004 @@ +""" test label based indexing with loc """ +from io import StringIO +import re + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm +from pandas.api.types import is_scalar +from pandas.tests.indexing.common import Base + + +class TestLoc(Base): + def test_loc_getitem_dups(self): + # GH 5678 + # repeated getitems on a dup index returning a ndarray + df = DataFrame( + np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)] + ) + expected = df.loc["A", 0] + result = df.loc[:, 0].loc["A"] + tm.assert_series_equal(result, expected) + + def test_loc_getitem_dups2(self): + + # GH4726 + # dup indexing with iloc/loc + df = DataFrame( + [[1, 2, "foo", "bar", Timestamp("20130101")]], + columns=["a", "a", "a", "a", "a"], + index=[1], + ) + expected = Series( + [1, 2, "foo", "bar", Timestamp("20130101")], + index=["a", "a", "a", "a", "a"], + name=1, + ) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + result = df.loc[1] + tm.assert_series_equal(result, expected) + + def test_loc_setitem_dups(self): + + # GH 6541 + df_orig = DataFrame( + { + "me": list("rttti"), + "foo": list("aaade"), + "bar": np.arange(5, dtype="float64") * 1.34 + 2, + "bar2": np.arange(5, dtype="float64") * -0.34 + 2, + } + ).set_index("me") + + indexer = tuple(["r", ["bar", "bar2"]]) + df = df_orig.copy() + df.loc[indexer] *= 2.0 + tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + + indexer = tuple(["r", "bar"]) + df = df_orig.copy() + df.loc[indexer] *= 2.0 + assert df.loc[indexer] == 2.0 * df_orig.loc[indexer] + + indexer = tuple(["t", ["bar", "bar2"]]) + df = df_orig.copy() + df.loc[indexer] *= 2.0 + tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + + def test_loc_setitem_slice(self): + # GH10503 + + # assigning the same type should not change the type + df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")}) + ix = df1["a"] == 1 + newb1 = df1.loc[ix, "b"] + 1 + df1.loc[ix, "b"] = newb1 + expected = DataFrame( + {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")} + ) + tm.assert_frame_equal(df1, expected) + + # assigning a new type should get the inferred type + df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") + ix = df1["a"] == 1 + newb2 = df2.loc[ix, "b"] + df1.loc[ix, "b"] = newb2 + expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") + tm.assert_frame_equal(df2, expected) + + def test_loc_getitem_int(self): + + # int label + 
self.check_result("loc", 2, "loc", 2, typs=["label"], fails=KeyError) + + def test_loc_getitem_label(self): + + # label + self.check_result("loc", "c", "loc", "c", typs=["empty"], fails=KeyError) + + def test_loc_getitem_label_out_of_range(self): + + # out of range label + self.check_result( + "loc", + "f", + "loc", + "f", + typs=["ints", "uints", "labels", "mixed", "ts"], + fails=KeyError, + ) + self.check_result("loc", "f", "ix", "f", typs=["floats"], fails=KeyError) + self.check_result("loc", "f", "loc", "f", typs=["floats"], fails=KeyError) + self.check_result( + "loc", 20, "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, + ) + self.check_result("loc", 20, "loc", 20, typs=["labels"], fails=TypeError) + self.check_result("loc", 20, "loc", 20, typs=["ts"], axes=0, fails=TypeError) + self.check_result("loc", 20, "loc", 20, typs=["floats"], axes=0, fails=KeyError) + + def test_loc_getitem_label_list(self): + # TODO: test something here? + # list of labels + pass + + def test_loc_getitem_label_list_with_missing(self): + self.check_result( + "loc", [0, 1, 2], "loc", [0, 1, 2], typs=["empty"], fails=KeyError, + ) + self.check_result( + "loc", + [0, 2, 10], + "ix", + [0, 2, 10], + typs=["ints", "uints", "floats"], + axes=0, + fails=KeyError, + ) + + self.check_result( + "loc", + [3, 6, 7], + "ix", + [3, 6, 7], + typs=["ints", "uints", "floats"], + axes=1, + fails=KeyError, + ) + + # GH 17758 - MultiIndex and missing keys + self.check_result( + "loc", + [(1, 3), (1, 4), (2, 5)], + "ix", + [(1, 3), (1, 4), (2, 5)], + typs=["multi"], + axes=0, + fails=KeyError, + ) + + def test_getitem_label_list_with_missing(self): + s = Series(range(3), index=["a", "b", "c"]) + + # consistency + with pytest.raises(KeyError, match="with any missing labels"): + s[["a", "d"]] + + s = Series(range(3)) + with pytest.raises(KeyError, match="with any missing labels"): + s[[0, 3]] + + def test_loc_getitem_label_list_fails(self): + # fails + self.check_result( + "loc", + [20, 30, 40], + "loc", + [20, 30, 40], + typs=["ints", "uints"], + axes=1, + fails=KeyError, + ) + + def test_loc_getitem_label_array_like(self): + # TODO: test something? + # array like + pass + + def test_loc_getitem_bool(self): + # boolean indexers + b = [True, False, True, False] + + self.check_result("loc", b, "loc", b, typs=["empty"], fails=IndexError) + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) + def test_loc_getitem_bool_diff_len(self, index): + # GH26658 + s = Series([1, 2, 3]) + msg = "Boolean index has wrong length: {} instead of {}".format( + len(index), len(s) + ) + with pytest.raises(IndexError, match=msg): + _ = s.loc[index] + + def test_loc_getitem_int_slice(self): + # TODO: test something here? + pass + + def test_loc_to_fail(self): + + # GH3449 + df = DataFrame( + np.random.random((3, 3)), index=["a", "b", "c"], columns=["e", "f", "g"] + ) + + # raise a KeyError? 
+ msg = ( + r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + df.loc[[1, 2], [1, 2]] + + # GH 7496 + # loc should not fallback + + s = Series(dtype=object) + s.loc[1] = 1 + s.loc["a"] = 2 + + with pytest.raises(KeyError, match=r"^-1$"): + s.loc[-1] + + msg = ( + r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + s.loc[[-1, -2]] + + msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" + with pytest.raises(KeyError, match=msg): + s.loc[["4"]] + + s.loc[-1] = 3 + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[-1, -2]] + + s["a"] = 2 + msg = ( + r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + s.loc[[-2]] + + del s["a"] + + with pytest.raises(KeyError, match=msg): + s.loc[[-2]] = 0 + + # inconsistency between .loc[values] and .loc[values,:] + # GH 7999 + df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) + + msg = ( + r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + df.loc[[3], :] + + with pytest.raises(KeyError, match=msg): + df.loc[[3]] + + def test_loc_getitem_list_with_fail(self): + # 15747 + # should KeyError if *any* missing labels + + s = Series([1, 2, 3]) + + s.loc[[2]] + + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Int64Index([3], dtype='int64')] are in the [index]\"" + ), + ): + s.loc[[3]] + + # a non-match and a match + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[2, 3]] + + def test_loc_getitem_label_slice(self): + + # label slices (with ints) + + # real label slices + + # GH 14316 + + self.check_result( + "loc", + slice(1, 3), + "loc", + slice(1, 3), + typs=["labels", "mixed", "empty", "ts", "floats"], + fails=TypeError, + ) + + self.check_result( + "loc", + slice("20130102", "20130104"), + "loc", + slice("20130102", "20130104"), + typs=["ts"], + axes=1, + fails=TypeError, + ) + + self.check_result( + "loc", + slice(2, 8), + "loc", + slice(2, 8), + typs=["mixed"], + axes=0, + fails=TypeError, + ) + self.check_result( + "loc", + slice(2, 8), + "loc", + slice(2, 8), + typs=["mixed"], + axes=1, + fails=KeyError, + ) + + self.check_result( + "loc", + slice(2, 4, 2), + "loc", + slice(2, 4, 2), + typs=["mixed"], + axes=0, + fails=TypeError, + ) + + def test_loc_index(self): + # gh-17131 + # a boolean index should index like a boolean numpy array + + df = DataFrame( + np.random.random(size=(5, 10)), + index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"], + ) + + mask = df.index.map(lambda x: "alpha" in x) + expected = df.loc[np.array(mask)] + + result = df.loc[mask] + tm.assert_frame_equal(result, expected) + + result = df.loc[mask.values] + tm.assert_frame_equal(result, expected) + + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + + def test_loc_general(self): + + df = DataFrame( + np.random.rand(4, 4), + columns=["A", "B", "C", "D"], + index=["A", "B", "C", "D"], + ) + + # want this to work + result = df.loc[:, "A":"B"].iloc[0:2, :] + assert (result.columns == ["A", "B"]).all() + assert (result.index == ["A", "B"]).all() + + # mixed type + result = DataFrame({"a": [Timestamp("20130101")], "b": [1]}).iloc[0] + expected = Series([Timestamp("20130101"), 1], index=["a", "b"], name=0) + 
tm.assert_series_equal(result, expected) + assert result.dtype == object + + def test_loc_setitem_consistency(self): + # GH 6149 + # coerce similarly for setitem and loc when rows have a null-slice + expected = DataFrame( + { + "date": Series(0, index=range(5), dtype=np.int64), + "val": Series(range(5), dtype=np.int64), + } + ) + + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 0 + tm.assert_frame_equal(df, expected) + + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array(0, dtype=np.int64) + tm.assert_frame_equal(df, expected) + + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array([0, 0, 0, 0, 0], dtype=np.int64) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + { + "date": Series("foo", index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = "foo" + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + { + "date": Series(1.0, index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 1.0 + tm.assert_frame_equal(df, expected) + + # GH 15494 + # setting on frame with single row + df = DataFrame({"date": Series([Timestamp("20180101")])}) + df.loc[:, "date"] = "string" + expected = DataFrame({"date": Series(["string"])}) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_consistency_empty(self): + # empty (essentially noops) + expected = DataFrame(columns=["x", "y"]) + expected["x"] = expected["x"].astype(np.int64) + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = 1 + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=["x", "y"]) + df["x"] = 1 + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_consistency_slice_column_len(self): + # .loc[:,column] setting with slice == len of the column + # GH10408 + data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat +Level_1,,,Something,StartDate,EndDate,Yes/No,SomethingElse +Region,Site,RespondentID,,,,, +Region_1,Site_1,3987227376,A,5/25/2015 10:59,5/25/2015 11:22,Yes, +Region_1,Site_1,3980680971,A,5/21/2015 9:40,5/21/2015 9:52,Yes,Yes +Region_1,Site_2,3977723249,A,5/20/2015 8:27,5/20/2015 8:41,Yes, +Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No""" + + df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2]) + df.loc[:, ("Respondent", "StartDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + df.loc[:, ("Respondent", "EndDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + df.loc[:, ("Respondent", "Duration")] = ( + df.loc[:, ("Respondent", "EndDate")] + - df.loc[:, ("Respondent", "StartDate")] + ) + + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ].astype("timedelta64[s]") + expected = Series( + [1380, 720, 840, 2160.0], index=df.index, name=("Respondent", "Duration") + ) + tm.assert_series_equal(df[("Respondent", "Duration")], expected) + + @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"]) + def test_loc_assign_non_ns_datetime(self, unit): + # 
GH 27395, non-ns dtype assignment via .loc should work + # and return the same result when using simple assignment + df = DataFrame( + { + "timestamp": [ + np.datetime64("2017-02-11 12:41:29"), + np.datetime64("1991-11-07 04:22:37"), + ] + } + ) + + df.loc[:, unit] = df.loc[:, "timestamp"].values.astype( + "datetime64[{unit}]".format(unit=unit) + ) + df["expected"] = df.loc[:, "timestamp"].values.astype( + "datetime64[{unit}]".format(unit=unit) + ) + expected = Series(df.loc[:, "expected"], name=unit) + tm.assert_series_equal(df.loc[:, unit], expected) + + def test_loc_modify_datetime(self): + # see gh-28837 + df = DataFrame.from_dict( + {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} + ) + + df["date_dt"] = pd.to_datetime(df["date"], unit="ms", cache=True) + + df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] + df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] + + expected = DataFrame( + [ + [1485264372711, "2017-01-24 13:26:12.711", "2017-01-24 13:26:12.711"], + [1485265925110, "2017-01-24 13:52:05.110", "2017-01-24 13:52:05.110"], + [1540215845888, "2018-10-22 13:44:05.888", "2018-10-22 13:44:05.888"], + [1540282121025, "2018-10-23 08:08:41.025", "2018-10-23 08:08:41.025"], + ], + columns=["date", "date_dt", "date_dt_cp"], + ) + + columns = ["date_dt", "date_dt_cp"] + expected[columns] = expected[columns].apply(pd.to_datetime) + + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_frame(self): + df = self.frame_labels + + result = df.iloc[0, 0] + + df.loc["a", "A"] = 1 + result = df.loc["a", "A"] + assert result == 1 + + result = df.iloc[0, 0] + assert result == 1 + + df.loc[:, "B":"D"] = 0 + expected = df.loc[:, "B":"D"] + result = df.iloc[:, 1:] + tm.assert_frame_equal(result, expected) + + # GH 6254 + # setting issue + df = DataFrame(index=[3, 5, 4], columns=["A"]) + df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") + expected = DataFrame(dict(A=Series([1, 2, 3], index=[4, 3, 5]))).reindex( + index=[3, 5, 4] + ) + tm.assert_frame_equal(df, expected) + + # GH 6252 + # setting with an empty frame + keys1 = ["@" + str(i) for i in range(5)] + val1 = np.arange(5, dtype="int64") + + keys2 = ["@" + str(i) for i in range(4)] + val2 = np.arange(4, dtype="int64") + + index = list(set(keys1).union(keys2)) + df = DataFrame(index=index) + df["A"] = np.nan + df.loc[keys1, "A"] = val1 + + df["B"] = np.nan + df.loc[keys2, "B"] = val2 + + expected = DataFrame( + dict(A=Series(val1, index=keys1), B=Series(val2, index=keys2)) + ).reindex(index=index) + tm.assert_frame_equal(df, expected) + + # GH 8669 + # invalid coercion of nan -> int + df = DataFrame({"A": [1, 2, 3], "B": np.nan}) + df.loc[df.B > df.A, "B"] = df.A + expected = DataFrame({"A": [1, 2, 3], "B": np.nan}) + tm.assert_frame_equal(df, expected) + + # GH 6546 + # setting with mixed labels + df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]}) + + result = df.loc[0, [1, 2]] + expected = Series([1, 3], index=[1, 2], dtype=object, name=0) + tm.assert_series_equal(result, expected) + + expected = DataFrame({1: [5, 2], 2: [6, 4], "a": ["a", "b"]}) + df.loc[0, [1, 2]] = [5, 6] + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_frame_multiples(self): + # multiple setting + df = DataFrame( + {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} + ) + rhs = df.loc[1:2] + rhs.index = df.index[0:2] + df.loc[0:1] = rhs + expected = DataFrame( + {"A": ["bar", "baz", "baz"], "B": Series([1, 2, 2], dtype=np.int64)} + ) + tm.assert_frame_equal(df, expected) + + # multiple setting 
with frame on rhs (with M8) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + expected = DataFrame( + { + "date": [ + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000103"), + ], + "val": Series([0, 1, 0, 1, 2], dtype=np.int64), + } + ) + rhs = df.loc[0:2] + rhs.index = df.index[2:5] + df.loc[2:4] = rhs + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "indexer", [["A"], slice(None, "A", None), np.array(["A"])] + ) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) + def test_loc_setitem_with_scalar_index(self, indexer, value): + # GH #19474 + # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated + # elementwisely, not using "setter('A', ['Z'])". + + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + df.loc[0, indexer] = value + result = df.loc[0, "A"] + + assert is_scalar(result) and result == "Z" + + def test_loc_coercion(self): + + # 12411 + df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) + expected = df.dtypes + + result = df.iloc[[0]] + tm.assert_series_equal(result.dtypes, expected) + + result = df.iloc[[1]] + tm.assert_series_equal(result.dtypes, expected) + + # 12045 + import datetime + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) + expected = df.dtypes + + result = df.iloc[[0]] + tm.assert_series_equal(result.dtypes, expected) + + result = df.iloc[[1]] + tm.assert_series_equal(result.dtypes, expected) + + # 11594 + df = DataFrame({"text": ["some words"] + [None] * 9}) + expected = df.dtypes + + result = df.iloc[0:2] + tm.assert_series_equal(result.dtypes, expected) + + result = df.iloc[3:] + tm.assert_series_equal(result.dtypes, expected) + + def test_setitem_new_key_tz(self): + # GH#12862 should not raise on assigning the second value + vals = [ + pd.to_datetime(42).tz_localize("UTC"), + pd.to_datetime(666).tz_localize("UTC"), + ] + expected = pd.Series(vals, index=["foo", "bar"]) + + ser = pd.Series(dtype=object) + ser["foo"] = vals[0] + ser["bar"] = vals[1] + + tm.assert_series_equal(ser, expected) + + ser = pd.Series(dtype=object) + ser.loc["foo"] = vals[0] + ser.loc["bar"] = vals[1] + + tm.assert_series_equal(ser, expected) + + def test_loc_non_unique(self): + # GH3659 + # non-unique indexer with loc slice + # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs + + # these are going to raise because the we are non monotonic + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ) + msg = "'Cannot get left slice bound for non-unique label: 1'" + with pytest.raises(KeyError, match=msg): + df.loc[1:] + msg = "'Cannot get left slice bound for non-unique label: 0'" + with pytest.raises(KeyError, match=msg): + df.loc[0:] + msg = "'Cannot get left slice bound for non-unique label: 1'" + with pytest.raises(KeyError, match=msg): + df.loc[1:2] + + # monotonic are ok + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ).sort_index(axis=0) + result = df.loc[1:] + expected = DataFrame({"A": [2, 4, 5, 6], "B": [4, 6, 7, 8]}, index=[1, 1, 2, 3]) + tm.assert_frame_equal(result, expected) + + result = df.loc[0:] + tm.assert_frame_equal(result, df) + + result = df.loc[1:2] + expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2]) + tm.assert_frame_equal(result, expected) + + def 
test_loc_non_unique_memory_error(self): + + # GH 4280 + # non_unique index with a large selection triggers a memory error + + columns = list("ABCDEFG") + + def gen_test(l, l2): + return pd.concat( + [ + DataFrame( + np.random.randn(l, len(columns)), + index=np.arange(l), + columns=columns, + ), + DataFrame( + np.ones((l2, len(columns))), index=[0] * l2, columns=columns + ), + ] + ) + + def gen_expected(df, mask): + len_mask = len(mask) + return pd.concat( + [ + df.take([0]), + DataFrame( + np.ones((len_mask, len(columns))), + index=[0] * len_mask, + columns=columns, + ), + df.take(mask[1:]), + ] + ) + + df = gen_test(900, 100) + assert df.index.is_unique is False + + mask = np.arange(100) + result = df.loc[mask] + expected = gen_expected(df, mask) + tm.assert_frame_equal(result, expected) + + df = gen_test(900000, 100000) + assert df.index.is_unique is False + + mask = np.arange(100000) + result = df.loc[mask] + expected = gen_expected(df, mask) + tm.assert_frame_equal(result, expected) + + def test_loc_name(self): + # GH 3880 + df = DataFrame([[1, 1], [1, 1]]) + df.index.name = "index_name" + result = df.iloc[[0, 1]].index.name + assert result == "index_name" + + result = df.loc[[0, 1]].index.name + assert result == "index_name" + + def test_loc_empty_list_indexer_is_ok(self): + + df = tm.makeCustomDataframe(5, 2) + # vertical empty + tm.assert_frame_equal( + df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True + ) + # horizontal empty + tm.assert_frame_equal( + df.loc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) + # horizontal empty + tm.assert_frame_equal( + df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) + + def test_identity_slice_returns_new_object(self): + # GH13873 + original_df = DataFrame({"a": [1, 2, 3]}) + sliced_df = original_df.loc[:] + assert sliced_df is not original_df + assert original_df[:] is not original_df + + # should be a shallow copy + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() + + # These should not return copies + assert original_df is original_df.loc[:, :] + df = DataFrame(np.random.randn(10, 4)) + assert df[0] is df.loc[:, 0] + + # Same tests for Series + original_series = Series([1, 2, 3, 4, 5, 6]) + sliced_series = original_series.loc[:] + assert sliced_series is not original_series + assert original_series[:] is not original_series + + original_series[:3] = [7, 8, 9] + assert all(sliced_series[:3] == [7, 8, 9]) + + def test_loc_uint64(self): + # GH20722 + # Test whether loc accept uint64 max value as index. 
+ s = pd.Series( + [1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max] + ) + + result = s.loc[np.iinfo("uint64").max - 1] + expected = s.iloc[0] + assert result == expected + + result = s.loc[[np.iinfo("uint64").max - 1]] + expected = s.iloc[[0]] + tm.assert_series_equal(result, expected) + + result = s.loc[[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]] + tm.assert_series_equal(result, s) + + def test_loc_setitem_empty_append(self): + # GH6173, various appends to an empty dataframe + + data = [1, 2, 3] + expected = DataFrame({"x": data, "y": [None] * len(data)}) + + # appends to fit length of data + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = data + tm.assert_frame_equal(df, expected) + + # only appends one value + expected = DataFrame({"x": [1.0], "y": [np.nan]}) + df = DataFrame(columns=["x", "y"], dtype=np.float) + df.loc[0, "x"] = expected.loc[0, "x"] + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_empty_append_raises(self): + # GH6173, various appends to an empty dataframe + + data = [1, 2] + df = DataFrame(columns=["x", "y"]) + msg = ( + r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " + r"are in the \[index\]" + ) + with pytest.raises(KeyError, match=msg): + df.loc[[0, 1], "x"] = data + + msg = "cannot copy sequence with size 2 to array axis with dimension 0" + with pytest.raises(ValueError, match=msg): + df.loc[0:2, "x"] = data + + def test_indexing_zerodim_np_array(self): + # GH24924 + df = DataFrame([[1, 2], [3, 4]]) + result = df.loc[np.array(0)] + s = pd.Series([1, 2], name=0) + tm.assert_series_equal(result, s) + + def test_series_indexing_zerodim_np_array(self): + # GH24924 + s = Series([1, 2]) + result = s.loc[np.array(0)] + assert result == 1 + + def test_loc_reverse_assignment(self): + # GH26939 + data = [1, 2, 3, 4, 5, 6] + [None] * 4 + expected = Series(data, index=range(2010, 2020)) + + result = pd.Series(index=range(2010, 2020), dtype=np.float64) + result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] + + tm.assert_series_equal(result, expected) + + +def test_series_loc_getitem_label_list_missing_values(): + # gh-11428 + key = np.array( + ["2001-01-04", "2001-01-02", "2001-01-04", "2001-01-14"], dtype="datetime64" + ) + s = Series([2, 5, 8, 11], date_range("2001-01-01", freq="D", periods=4)) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[key] + + +@pytest.mark.parametrize( + "columns, column_key, expected_columns, check_column_type", + [ + ([2011, 2012, 2013], [2011, 2012], [0, 1], True), + ([2011, 2012, "All"], [2011, 2012], [0, 1], False), + ([2011, 2012, "All"], [2011, "All"], [0, 2], True), + ], +) +def test_loc_getitem_label_list_integer_labels( + columns, column_key, expected_columns, check_column_type +): + # gh-14836 + df = DataFrame(np.random.rand(3, 3), columns=columns, index=list("ABC")) + expected = df.iloc[:, expected_columns] + result = df.loc[["A", "B", "C"], column_key] + tm.assert_frame_equal(result, expected, check_column_type=check_column_type) + + +def test_loc_setitem_float_intindex(): + # GH 8720 + rand_data = np.random.randn(8, 4) + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) + expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + tm.assert_frame_equal(result, expected) + + +def test_loc_axis_1_slice(): + # GH 10586 + cols = [(yr, m) for yr in [2014, 
2015] for m in [7, 8, 9, 10]] + df = pd.DataFrame( + np.ones((10, 8)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples(cols), + ) + result = df.loc(axis=1)[(2014, 9):(2015, 8)] + expected = pd.DataFrame( + np.ones((10, 4)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples( + [(2014, 9), (2014, 10), (2015, 7), (2015, 8)] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_na_indexing.py b/venv/Lib/site-packages/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000..befe4fe --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_partial.py b/venv/Lib/site-packages/pandas/tests/indexing/test_partial.py new file mode 100644 index 0000000..5fda759 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_partial.py @@ -0,0 +1,527 @@ +""" +test setting *parts* of objects both positionally and label based + +TODO: these should be split among the indexer tests +""" + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, date_range +import pandas._testing as tm + + +class TestPartialSetting: + def test_partial_setting(self): + + # GH2578, allow ix and friends to partially set + + # series + s_orig = Series([1, 2, 3]) + + s = s_orig.copy() + s[5] = 5 + expected = Series([1, 2, 3, 5], 
index=[0, 1, 2, 5]) + tm.assert_series_equal(s, expected) + + s = s_orig.copy() + s.loc[5] = 5 + expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) + tm.assert_series_equal(s, expected) + + s = s_orig.copy() + s[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) + tm.assert_series_equal(s, expected) + + s = s_orig.copy() + s.loc[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) + tm.assert_series_equal(s, expected) + + # iloc/iat raise + s = s_orig.copy() + + with pytest.raises(IndexError): + s.iloc[3] = 5.0 + + with pytest.raises(IndexError): + s.iat[3] = 5.0 + + # ## frame ## + + df_orig = DataFrame( + np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" + ) + + # iloc/iat raise + df = df_orig.copy() + + with pytest.raises(IndexError): + df.iloc[4, 2] = 5.0 + + with pytest.raises(IndexError): + df.iat[4, 2] = 5.0 + + # row setting where it exists + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) + df = df_orig.copy() + df.iloc[1] = df.iloc[2] + tm.assert_frame_equal(df, expected) + + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) + df = df_orig.copy() + df.loc[1] = df.loc[2] + tm.assert_frame_equal(df, expected) + + # like 2578, partial setting with dtype preservation + expected = DataFrame(dict({"A": [0, 2, 4, 4], "B": [1, 3, 5, 5]})) + df = df_orig.copy() + df.loc[3] = df.loc[2] + tm.assert_frame_equal(df, expected) + + # single dtype frame, overwrite + expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]})) + df = df_orig.copy() + df.loc[:, "B"] = df.loc[:, "A"] + tm.assert_frame_equal(df, expected) + + # mixed dtype frame, overwrite + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) + df = df_orig.copy() + df["B"] = df["B"].astype(np.float64) + df.loc[:, "B"] = df.loc[:, "A"] + tm.assert_frame_equal(df, expected) + + # single dtype frame, partial setting + expected = df_orig.copy() + expected["C"] = df["A"] + df = df_orig.copy() + df.loc[:, "C"] = df.loc[:, "A"] + tm.assert_frame_equal(df, expected) + + # mixed frame, partial setting + expected = df_orig.copy() + expected["C"] = df["A"] + df = df_orig.copy() + df.loc[:, "C"] = df.loc[:, "A"] + tm.assert_frame_equal(df, expected) + + # GH 8473 + dates = date_range("1/1/2000", periods=8) + df_orig = DataFrame( + np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"] + ) + + expected = pd.concat( + [df_orig, DataFrame({"A": 7}, index=[dates[-1] + dates.freq])], sort=True + ) + df = df_orig.copy() + df.loc[dates[-1] + dates.freq, "A"] = 7 + tm.assert_frame_equal(df, expected) + df = df_orig.copy() + df.at[dates[-1] + dates.freq, "A"] = 7 + tm.assert_frame_equal(df, expected) + + exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq]) + expected = pd.concat([df_orig, exp_other], axis=1) + + df = df_orig.copy() + df.loc[dates[-1] + dates.freq, 0] = 7 + tm.assert_frame_equal(df, expected) + df = df_orig.copy() + df.at[dates[-1] + dates.freq, 0] = 7 + tm.assert_frame_equal(df, expected) + + def test_partial_setting_mixed_dtype(self): + + # in a mixed dtype environment, try to preserve dtypes + # by appending + df = DataFrame([[True, 1], [False, 2]], columns=["female", "fitness"]) + + s = df.loc[1].copy() + s.name = 2 + expected = df.append(s) + + df.loc[2] = df.loc[1] + tm.assert_frame_equal(df, expected) + + # columns will align + df = DataFrame(columns=["A", "B"]) + df.loc[0] = Series(1, index=range(4)) + tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) + + # columns will align + df = DataFrame(columns=["A", "B"]) + 
df.loc[0] = Series(1, index=["B"]) + + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") + tm.assert_frame_equal(df, exp) + + # list-like must conform + df = DataFrame(columns=["A", "B"]) + + with pytest.raises(ValueError): + df.loc[0] = [1, 2, 3] + + # TODO: #15657, these are left as object and not coerced + df = DataFrame(columns=["A", "B"]) + df.loc[3] = [6, 7] + + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object") + tm.assert_frame_equal(df, exp) + + def test_series_partial_set(self): + # partial set with new index + # Regression from GH4825 + ser = Series([0.1, 0.2], index=[1, 2]) + + # loc equiv to .reindex + expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) + with pytest.raises(KeyError, match="with any missing labels"): + result = ser.loc[[3, 2, 3]] + + result = ser.reindex([3, 2, 3]) + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, "x"]) + with pytest.raises(KeyError, match="with any missing labels"): + result = ser.loc[[3, 2, 3, "x"]] + + result = ser.reindex([3, 2, 3, "x"]) + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) + result = ser.loc[[2, 2, 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, "x", 1]) + with pytest.raises(KeyError, match="with any missing labels"): + result = ser.loc[[2, 2, "x", 1]] + + result = ser.reindex([2, 2, "x", 1]) + tm.assert_series_equal(result, expected, check_index_type=True) + + # raises as nothing in in the index + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + ser.loc[[3, 3, 3]] + + expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, 3]] + + result = ser.reindex([2, 2, 3]) + tm.assert_series_equal(result, expected, check_index_type=True) + + s = Series([0.1, 0.2, 0.3], index=[1, 2, 3]) + expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[3, 4, 4]] + + result = s.reindex([3, 4, 4]) + tm.assert_series_equal(result, expected, check_index_type=True) + + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) + expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[5, 3, 3]] + + result = s.reindex([5, 3, 3]) + tm.assert_series_equal(result, expected, check_index_type=True) + + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) + expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[5, 4, 4]] + + result = s.reindex([5, 4, 4]) + tm.assert_series_equal(result, expected, check_index_type=True) + + s = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]) + expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[7, 2, 2]] + + result = s.reindex([7, 2, 2]) + tm.assert_series_equal(result, expected, check_index_type=True) + + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) + expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[4, 5, 5]] + + result = s.reindex([4, 5, 5]) + tm.assert_series_equal(result, 
expected, check_index_type=True) + + # iloc + expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) + result = ser.iloc[[1, 1, 0, 0]] + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_series_partial_set_with_name(self): + # GH 11497 + + idx = Index([1, 2], dtype="int64", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") + + # loc + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[3, 2, 3]] + + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[3, 2, 3, "x"]] + + exp_idx = Index([2, 2, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1], index=exp_idx, name="s") + result = ser.loc[[2, 2, 1]] + tm.assert_series_equal(result, expected, check_index_type=True) + + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, "x", 1]] + + # raises as nothing in in the index + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," + r" name='idx'\)\] are in the \[index\]\"" + ) + with pytest.raises(KeyError, match=msg): + ser.loc[[3, 3, 3]] + + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, 3]] + + idx = Index([1, 2, 3], dtype="int64", name="idx") + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3], index=idx, name="s").loc[[3, 4, 4]] + + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 3, 3]] + + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 4, 4]] + + idx = Index([4, 5, 6, 7], dtype="int64", name="idx") + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[7, 2, 2]] + + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[4, 5, 5]] + + # iloc + exp_idx = Index([2, 2, 1, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name="s") + result = ser.iloc[[1, 1, 0, 0]] + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_partial_set_invalid(self): + + # GH 4940 + # allow only setting of 'valid' values + + orig = tm.makeTimeDataFrame() + df = orig.copy() + + # don't allow not string inserts + with pytest.raises(TypeError): + df.loc[100.0, :] = df.iloc[0] + + with pytest.raises(TypeError): + df.loc[100, :] = df.iloc[0] + + # allow object conversion here + df = orig.copy() + df.loc["a", :] = df.iloc[0] + exp = orig.append(Series(df.iloc[0], name="a")) + tm.assert_frame_equal(df, exp) + tm.assert_index_equal(df.index, Index(orig.index.tolist() + ["a"])) + assert df.index.dtype == "object" + + def test_partial_set_empty_series(self): + + # GH5226 + + # partially set with an empty object series + s = Series(dtype=object) + s.loc[1] = 1 + tm.assert_series_equal(s, Series([1], index=[1])) + s.loc[3] = 3 + tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) + + s = Series(dtype=object) + s.loc[1] = 1.0 + tm.assert_series_equal(s, Series([1.0], index=[1])) + s.loc[3] = 3.0 + tm.assert_series_equal(s, Series([1.0, 3.0], index=[1, 3])) + + s = Series(dtype=object) + s.loc["foo"] = 1 + tm.assert_series_equal(s, Series([1], index=["foo"])) + s.loc["bar"] = 3 + tm.assert_series_equal(s, Series([1, 3], 
index=["foo", "bar"])) + s.loc[3] = 4 + tm.assert_series_equal(s, Series([1, 3, 4], index=["foo", "bar", 3])) + + def test_partial_set_empty_frame(self): + + # partially set with an empty object + # frame + df = DataFrame() + + with pytest.raises(ValueError): + df.loc[1] = 1 + + with pytest.raises(ValueError): + df.loc[1] = Series([1], index=["foo"]) + + with pytest.raises(ValueError): + df.loc[:, 1] = 1 + + # these work as they don't really change + # anything but the index + # GH5632 + expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) + + def f(): + df = DataFrame(index=Index([], dtype="object")) + df["foo"] = Series([], dtype="object") + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + df["foo"] = Series(df.index) + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame() + df["foo"] = df.index + return df + + tm.assert_frame_equal(f(), expected) + + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") + + def f(): + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = [] + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = Series(np.arange(len(df)), dtype="float64") + return df + + tm.assert_frame_equal(f(), expected) + + def f(): + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = range(len(df)) + return df + + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") + tm.assert_frame_equal(f(), expected) + + df = DataFrame() + tm.assert_index_equal(df.columns, Index([], dtype=object)) + df2 = DataFrame() + df2[1] = Series([1], index=["foo"]) + df.loc[:, 1] = Series([1], index=["foo"]) + tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1])) + tm.assert_frame_equal(df, df2) + + # no index to start + expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0]) + + df = DataFrame(columns=["A", "B"]) + df[0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=["A", "B"]) + df.loc[:, 0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_row(self): + # GH5720, GH5744 + # don't create rows when empty + expected = DataFrame(columns=["A", "B", "New"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["New"] = expected["New"].astype("float64") + + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + y["New"] = np.nan + tm.assert_frame_equal(y, expected) + # tm.assert_frame_equal(y,expected) + + expected = DataFrame(columns=["a", "b", "c c", "d"]) + expected["d"] = expected["d"].astype("int64") + df = DataFrame(columns=["a", "b", "c c"]) + df["d"] = 3 + tm.assert_frame_equal(df, expected) + tm.assert_series_equal(df["c c"], Series(name="c c", dtype=object)) + + # reindex columns is ok + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + result = y.reindex(columns=["A", "B", "C"]) + expected = DataFrame(columns=["A", "B", "C"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["C"] = expected["C"].astype("float64") + tm.assert_frame_equal(result, expected) + + def 
test_partial_set_empty_frame_set_series(self): + # GH 5756 + # setting with empty Series + df = DataFrame(Series(dtype=object)) + tm.assert_frame_equal(df, DataFrame({0: Series(dtype=object)})) + + df = DataFrame(Series(name="foo", dtype=object)) + tm.assert_frame_equal(df, DataFrame({"foo": Series(dtype=object)})) + + def test_partial_set_empty_frame_empty_copy_assignment(self): + # GH 5932 + # copy on empty with assignment fails + df = DataFrame(index=[0]) + df = df.copy() + df["a"] = 0 + expected = DataFrame(0, index=[0], columns=["a"]) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_empty_consistencies(self): + # GH 6171 + # consistency on empty frames + df = DataFrame(columns=["x", "y"]) + df["x"] = [1, 2] + expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) + tm.assert_frame_equal(df, expected, check_dtype=False) + + df = DataFrame(columns=["x", "y"]) + df["x"] = ["1", "2"] + expected = DataFrame(dict(x=["1", "2"], y=[np.nan, np.nan]), dtype=object) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=["x", "y"]) + df.loc[0, "x"] = 1 + expected = DataFrame(dict(x=[1], y=[np.nan])) + tm.assert_frame_equal(df, expected, check_dtype=False) diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_scalar.py b/venv/Lib/site-packages/pandas/tests/indexing/test_scalar.py new file mode 100644 index 0000000..a567fb9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_scalar.py @@ -0,0 +1,247 @@ +""" test scalar indexing, including at and iat """ + +import numpy as np +import pytest + +from pandas import DataFrame, Series, Timedelta, Timestamp, date_range +import pandas._testing as tm +from pandas.tests.indexing.common import Base + + +class TestScalar(Base): + def test_at_and_iat_get(self): + def _check(f, func, values=False): + + if f is not None: + indicies = self.generate_indices(f, values) + for i in indicies: + result = getattr(f, func)[i] + expected = self.get_value(func, f, i, values) + tm.assert_almost_equal(result, expected) + + for kind in self._kinds: + + d = getattr(self, kind) + + # iat + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) + + for f in [d["labels"], d["ts"], d["floats"]]: + if f is not None: + msg = "iAt based indexing can only have integer indexers" + with pytest.raises(ValueError, match=msg): + self.check_values(f, "iat") + + # at + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") + + def test_at_and_iat_set(self): + def _check(f, func, values=False): + + if f is not None: + indicies = self.generate_indices(f, values) + for i in indicies: + getattr(f, func)[i] = 1 + expected = self.get_value(func, f, i, values) + tm.assert_almost_equal(expected, 1) + + for kind in self._kinds: + + d = getattr(self, kind) + + # iat + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) + + for f in [d["labels"], d["ts"], d["floats"]]: + if f is not None: + msg = "iAt based indexing can only have integer indexers" + with pytest.raises(ValueError, match=msg): + _check(f, "iat") + + # at + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") + + def test_at_iat_coercion(self): + + # as timestamp is not a tuple! 
+ dates = date_range("1/1/2000", periods=8) + df = DataFrame(np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"]) + s = df["A"] + + result = s.at[dates[5]] + xp = s.values[5] + assert result == xp + + # GH 7729 + # make sure we are boxing the returns + s = Series(["2014-01-01", "2014-02-02"], dtype="datetime64[ns]") + expected = Timestamp("2014-02-02") + + for r in [lambda: s.iat[1], lambda: s.iloc[1]]: + result = r() + assert result == expected + + s = Series(["1 days", "2 days"], dtype="timedelta64[ns]") + expected = Timedelta("2 days") + + for r in [lambda: s.iat[1], lambda: s.iloc[1]]: + result = r() + assert result == expected + + def test_iat_invalid_args(self): + pass + + def test_imethods_with_dups(self): + + # GH6493 + # iat/iloc with dups + + s = Series(range(5), index=[1, 1, 2, 2, 3], dtype="int64") + result = s.iloc[2] + assert result == 2 + result = s.iat[2] + assert result == 2 + + msg = "index 10 is out of bounds for axis 0 with size 5" + with pytest.raises(IndexError, match=msg): + s.iat[10] + msg = "index -10 is out of bounds for axis 0 with size 5" + with pytest.raises(IndexError, match=msg): + s.iat[-10] + + result = s.iloc[[2, 3]] + expected = Series([2, 3], [2, 2], dtype="int64") + tm.assert_series_equal(result, expected) + + df = s.to_frame() + result = df.iloc[2] + expected = Series(2, index=[0], name=2) + tm.assert_series_equal(result, expected) + + result = df.iat[2, 0] + assert result == 2 + + def test_at_to_fail(self): + # at should not fallback + # GH 7814 + s = Series([1, 2, 3], index=list("abc")) + result = s.at["a"] + assert result == 1 + msg = ( + "At based indexing on an non-integer index can only have " + "non-integer indexers" + ) + with pytest.raises(ValueError, match=msg): + s.at[0] + + df = DataFrame({"A": [1, 2, 3]}, index=list("abc")) + result = df.at["a", "A"] + assert result == 1 + with pytest.raises(ValueError, match=msg): + df.at["a", 0] + + s = Series([1, 2, 3], index=[3, 2, 1]) + result = s.at[1] + assert result == 3 + msg = "At based indexing on an integer index can only have integer indexers" + with pytest.raises(ValueError, match=msg): + s.at["a"] + + df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) + result = df.at[1, 0] + assert result == 3 + with pytest.raises(ValueError, match=msg): + df.at["a", 0] + + # GH 13822, incorrect error string with non-unique columns when missing + # column is accessed + df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}) + df.columns = ["x", "x", "z"] + + # Check that we get the correct value in the KeyError + with pytest.raises(KeyError, match=r"\['y'\] not in index"): + df[["x", "y", "z"]] + + def test_at_with_tz(self): + # gh-15822 + df = DataFrame( + { + "name": ["John", "Anderson"], + "date": [ + Timestamp(2017, 3, 13, 13, 32, 56), + Timestamp(2017, 2, 16, 12, 10, 3), + ], + } + ) + df["date"] = df["date"].dt.tz_localize("Asia/Shanghai") + + expected = Timestamp("2017-03-13 13:32:56+0800", tz="Asia/Shanghai") + + result = df.loc[0, "date"] + assert result == expected + + result = df.at[0, "date"] + assert result == expected + + def test_series_set_tz_timestamp(self, tz_naive_fixture): + # GH 25506 + ts = Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) + result = Series(ts) + result.at[1] = ts + expected = Series([ts, ts]) + tm.assert_series_equal(result, expected) + + def test_mixed_index_at_iat_loc_iloc_series(self): + # GH 19860 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) + for el, item in s.items(): + assert s.at[el] == s.loc[el] == item + for i in range(len(s)): 
+ assert s.iat[i] == s.iloc[i] == i + 1 + + with pytest.raises(KeyError, match="^4$"): + s.at[4] + with pytest.raises(KeyError, match="^4$"): + s.loc[4] + + def test_mixed_index_at_iat_loc_iloc_dataframe(self): + # GH 19860 + df = DataFrame( + [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], columns=["a", "b", "c", 1, 2] + ) + for rowIdx, row in df.iterrows(): + for el, item in row.items(): + assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item + + for row in range(2): + for i in range(5): + assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i + + with pytest.raises(KeyError, match="^3$"): + df.at[0, 3] + with pytest.raises(KeyError, match="^3$"): + df.loc[0, 3] + + def test_iat_setter_incompatible_assignment(self): + # GH 23236 + result = DataFrame({"a": [0, 1], "b": [4, 5]}) + result.iat[0, 0] = None + expected = DataFrame({"a": [None, 1], "b": [4, 5]}) + tm.assert_frame_equal(result, expected) + + def test_getitem_zerodim_np_array(self): + # GH24924 + # dataframe __getitem__ + df = DataFrame([[1, 2], [3, 4]]) + result = df[np.array(0)] + expected = Series([1, 3], name=0) + tm.assert_series_equal(result, expected) + + # series __getitem__ + s = Series([1, 2]) + result = s[np.array(0)] + assert result == 1 diff --git a/venv/Lib/site-packages/pandas/tests/indexing/test_timedelta.py b/venv/Lib/site-packages/pandas/tests/indexing/test_timedelta.py new file mode 100644 index 0000000..dd47501 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/indexing/test_timedelta.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestTimedeltaIndexing: + def test_boolean_indexing(self): + # GH 14946 + df = pd.DataFrame({"x": range(10)}) + df.index = pd.to_timedelta(range(10), unit="s") + conditions = [df["x"] > 3, df["x"] == 3, df["x"] < 3] + expected_data = [ + [0, 1, 2, 3, 10, 10, 10, 10, 10, 10], + [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], + [10, 10, 10, 3, 4, 5, 6, 7, 8, 9], + ] + for cond, data in zip(conditions, expected_data): + result = df.assign(x=df.mask(cond, 10).astype("int64")) + expected = pd.DataFrame( + data, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( + "indexer, expected", + [ + (0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]), + ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9]), + ], + ) + def test_list_like_indexing(self, indexer, expected): + # GH 16637 + df = pd.DataFrame({"x": range(10)}, dtype="int64") + df.index = pd.to_timedelta(range(10), unit="s") + + df.loc[df.index[indexer], "x"] = 20 + + expected = pd.DataFrame( + expected, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) + + tm.assert_frame_equal(expected, df) + + def test_string_indexing(self): + # GH 16896 + df = pd.DataFrame({"x": range(3)}, index=pd.to_timedelta(range(3), unit="days")) + expected = df.iloc[0] + sliced = df.loc["0 days"] + tm.assert_series_equal(sliced, expected) + + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) + def test_masked_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") + series[series == series[0]] = value + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) + def test_listlike_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") + series.iloc[0] 
= value + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "start,stop, expected_slice", + [ + [np.timedelta64(0, "ns"), None, slice(0, 11)], + [np.timedelta64(1, "D"), np.timedelta64(6, "D"), slice(1, 7)], + [None, np.timedelta64(4, "D"), slice(0, 5)], + ], + ) + def test_numpy_timedelta_scalar_indexing(self, start, stop, expected_slice): + # GH 20393 + s = pd.Series(range(11), pd.timedelta_range("0 days", "10 days")) + result = s.loc[slice(start, stop)] + expected = s.iloc[expected_slice] + tm.assert_series_equal(result, expected) + + def test_roundtrip_thru_setitem(self): + # PR 23462 + dt1 = pd.Timedelta(0) + dt2 = pd.Timedelta(28767471428571405) + df = pd.DataFrame({"dt": pd.Series([dt1, dt2])}) + df_copy = df.copy() + s = pd.Series([dt1]) + + expected = df["dt"].iloc[1].value + df.loc[[True, False]] = s + result = df["dt"].iloc[1].value + + assert expected == result + tm.assert_frame_equal(df, df_copy) + + def test_loc_str_slicing(self): + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") + ser = ix.to_series() + result = ser.loc[:"1 days"] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) + + def test_loc_slicing(self): + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") + ser = ix.to_series() + result = ser.loc[: ix[-2]] + expected = ser.iloc[:-1] + + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/internals/__init__.py b/venv/Lib/site-packages/pandas/tests/internals/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/internals/test_internals.py b/venv/Lib/site-packages/pandas/tests/internals/test_internals.py new file mode 100644 index 0000000..15b1434 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/internals/test_internals.py @@ -0,0 +1,1285 @@ +from collections import OrderedDict +from datetime import date, datetime +import itertools +import operator +import re + +import numpy as np +import pytest + +from pandas._libs.internals import BlockPlacement + +import pandas as pd +from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm +import pandas.core.algorithms as algos +from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray +from pandas.core.internals import BlockManager, SingleBlockManager, make_block + + +@pytest.fixture +def mgr(): + return create_mgr( + "a: f8; b: object; c: f8; d: object; e: f8;" + "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;" + "k: M8[ns, US/Eastern]; l: M8[ns, CET];" + ) + + +def assert_block_equal(left, right): + tm.assert_numpy_array_equal(left.values, right.values) + assert left.dtype == right.dtype + assert isinstance(left.mgr_locs, BlockPlacement) + assert isinstance(right.mgr_locs, BlockPlacement) + tm.assert_numpy_array_equal(left.mgr_locs.as_array, right.mgr_locs.as_array) + + +def get_numeric_mat(shape): + arr = np.arange(shape[0]) + return np.lib.stride_tricks.as_strided( + x=arr, shape=shape, strides=(arr.itemsize,) + (0,) * (len(shape) - 1) + ).copy() + + +N = 10 + + +def create_block(typestr, placement, item_shape=None, num_offset=0): + """ + Supported typestr: + + * float, f8, f4, f2 + * int, i8, i4, i2, i1 + * uint, u8, u4, u2, u1 + * complex, c16, c8 + * bool + * object, string, O + * datetime, dt, M8[ns], M8[ns, tz] + * timedelta, td, m8[ns] + * sparse (SparseArray with fill_value=0.0) + * sparse_na (SparseArray 
with fill_value=np.nan) + * category, category2 + + """ + placement = BlockPlacement(placement) + num_items = len(placement) + + if item_shape is None: + item_shape = (N,) + + shape = (num_items,) + item_shape + + mat = get_numeric_mat(shape) + + if typestr in ( + "float", + "f8", + "f4", + "f2", + "int", + "i8", + "i4", + "i2", + "i1", + "uint", + "u8", + "u4", + "u2", + "u1", + ): + values = mat.astype(typestr) + num_offset + elif typestr in ("complex", "c16", "c8"): + values = 1.0j * (mat.astype(typestr) + num_offset) + elif typestr in ("object", "string", "O"): + values = np.reshape( + ["A{i:d}".format(i=i) for i in mat.ravel() + num_offset], shape + ) + elif typestr in ("b", "bool"): + values = np.ones(shape, dtype=np.bool_) + elif typestr in ("datetime", "dt", "M8[ns]"): + values = (mat * 1e9).astype("M8[ns]") + elif typestr.startswith("M8[ns"): + # datetime with tz + m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr) + assert m is not None, "incompatible typestr -> {0}".format(typestr) + tz = m.groups()[0] + assert num_items == 1, "must have only 1 num items for a tz-aware" + values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) + elif typestr in ("timedelta", "td", "m8[ns]"): + values = (mat * 1).astype("m8[ns]") + elif typestr in ("category",): + values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4]) + elif typestr in ("category2",): + values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"]) + elif typestr in ("sparse", "sparse_na"): + # FIXME: doesn't support num_rows != 10 + assert shape[-1] == 10 + assert all(s == 1 for s in shape[:-1]) + if typestr.endswith("_na"): + fill_value = np.nan + else: + fill_value = 0.0 + values = SparseArray( + [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6], + fill_value=fill_value, + ) + arr = values.sp_values.view() + arr += num_offset - 1 + else: + raise ValueError(f'Unsupported typestr: "{typestr}"') + + return make_block(values, placement=placement, ndim=len(shape)) + + +def create_single_mgr(typestr, num_rows=None): + if num_rows is None: + num_rows = N + + return SingleBlockManager( + create_block(typestr, placement=slice(0, num_rows), item_shape=()), + np.arange(num_rows), + ) + + +def create_mgr(descr, item_shape=None): + """ + Construct BlockManager from string description. + + String description syntax looks similar to np.matrix initializer. 
It looks + like this:: + + a,b,c: f8; d,e,f: i8 + + Rules are rather simple: + + * see list of supported datatypes in `create_block` method + * components are semicolon-separated + * each component is `NAME,NAME,NAME: DTYPE_ID` + * whitespace around colons & semicolons are removed + * components with same DTYPE_ID are combined into single block + * to force multiple blocks with same dtype, use '-SUFFIX':: + + 'a:f8-1; b:f8-2; c:f8-foobar' + + """ + if item_shape is None: + item_shape = (N,) + + offset = 0 + mgr_items = [] + block_placements = OrderedDict() + for d in descr.split(";"): + d = d.strip() + if not len(d): + continue + names, blockstr = d.partition(":")[::2] + blockstr = blockstr.strip() + names = names.strip().split(",") + + mgr_items.extend(names) + placement = list(np.arange(len(names)) + offset) + try: + block_placements[blockstr].extend(placement) + except KeyError: + block_placements[blockstr] = placement + offset += len(names) + + mgr_items = Index(mgr_items) + + blocks = [] + num_offset = 0 + for blockstr, placement in block_placements.items(): + typestr = blockstr.split("-")[0] + blocks.append( + create_block( + typestr, placement, item_shape=item_shape, num_offset=num_offset + ) + ) + num_offset += len(placement) + + return BlockManager( + sorted(blocks, key=lambda b: b.mgr_locs[0]), + [mgr_items] + [np.arange(n) for n in item_shape], + ) + + +class TestBlock: + def setup_method(self, method): + # self.fblock = get_float_ex() # a,c,e + # self.cblock = get_complex_ex() # + # self.oblock = get_obj_ex() + # self.bool_block = get_bool_ex() + # self.int_block = get_int_ex() + + self.fblock = create_block("float", [0, 2, 4]) + self.cblock = create_block("complex", [7]) + self.oblock = create_block("object", [1, 3]) + self.bool_block = create_block("bool", [5]) + self.int_block = create_block("int", [6]) + + def test_constructor(self): + int32block = create_block("i4", [0]) + assert int32block.dtype == np.int32 + + def test_pickle(self): + def _check(blk): + assert_block_equal(tm.round_trip_pickle(blk), blk) + + _check(self.fblock) + _check(self.cblock) + _check(self.oblock) + _check(self.bool_block) + + def test_mgr_locs(self): + assert isinstance(self.fblock.mgr_locs, BlockPlacement) + tm.assert_numpy_array_equal( + self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64) + ) + + def test_attrs(self): + assert self.fblock.shape == self.fblock.values.shape + assert self.fblock.dtype == self.fblock.values.dtype + assert len(self.fblock) == len(self.fblock.values) + + def test_merge(self): + avals = tm.randn(2, 10) + bvals = tm.randn(2, 10) + + ref_cols = Index(["e", "a", "b", "d", "f"]) + + ablock = make_block(avals, ref_cols.get_indexer(["e", "b"])) + bblock = make_block(bvals, ref_cols.get_indexer(["a", "d"])) + merged = ablock.merge(bblock) + tm.assert_numpy_array_equal( + merged.mgr_locs.as_array, np.array([0, 1, 2, 3], dtype=np.int64) + ) + tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals)) + tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals)) + + # TODO: merge with mixed type? 
+ + def test_copy(self): + cop = self.fblock.copy() + assert cop is not self.fblock + assert_block_equal(self.fblock, cop) + + def test_reindex_index(self): + pass + + def test_reindex_cast(self): + pass + + def test_insert(self): + pass + + def test_delete(self): + newb = self.fblock.copy() + newb.delete(0) + assert isinstance(newb.mgr_locs, BlockPlacement) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64) + ) + assert (newb.values[0] == 1).all() + + newb = self.fblock.copy() + newb.delete(1) + assert isinstance(newb.mgr_locs, BlockPlacement) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64) + ) + assert (newb.values[1] == 2).all() + + newb = self.fblock.copy() + newb.delete(2) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64) + ) + assert (newb.values[1] == 1).all() + + newb = self.fblock.copy() + with pytest.raises(Exception): + newb.delete(3) + + +class TestDatetimeBlock: + def test_can_hold_element(self): + block = create_block("datetime", [0]) + + # We will check that block._can_hold_element iff arr.__setitem__ works + arr = pd.array(block.values.ravel()) + + # coerce None + assert block._can_hold_element(None) + arr[0] = None + assert arr[0] is pd.NaT + + # coerce different types of datetime objects + vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)] + for val in vals: + assert block._can_hold_element(val) + arr[0] = val + + val = date(2010, 10, 10) + assert not block._can_hold_element(val) + with pytest.raises(TypeError): + arr[0] = val + + +class TestBlockManager: + def test_constructor_corner(self): + pass + + def test_attrs(self): + mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2") + assert mgr.nblocks == 2 + assert len(mgr) == 6 + + def test_is_mixed_dtype(self): + assert not create_mgr("a,b:f8").is_mixed_type + assert not create_mgr("a:f8-1; b:f8-2").is_mixed_type + + assert create_mgr("a,b:f8; c,d: f4").is_mixed_type + assert create_mgr("a,b:f8; c,d: object").is_mixed_type + + def test_duplicate_ref_loc_failure(self): + tmp_mgr = create_mgr("a:bool; a: f8") + + axes, blocks = tmp_mgr.axes, tmp_mgr.blocks + + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([0]) + + # test trying to create block manager with overlapping ref locs + with pytest.raises(AssertionError): + BlockManager(blocks, axes) + + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([1]) + mgr = BlockManager(blocks, axes) + mgr.iget(1) + + def test_contains(self, mgr): + assert "a" in mgr + assert "baz" not in mgr + + def test_pickle(self, mgr): + + mgr2 = tm.round_trip_pickle(mgr) + tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + # share ref_items + # assert mgr2.blocks[0].ref_items is mgr2.blocks[1].ref_items + + # GH2431 + assert hasattr(mgr2, "_is_consolidated") + assert hasattr(mgr2, "_known_consolidated") + + # reset to False on load + assert not mgr2._is_consolidated + assert not mgr2._known_consolidated + + def test_non_unique_pickle(self): + + mgr = create_mgr("a,a,a:f8") + mgr2 = tm.round_trip_pickle(mgr) + tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + mgr = create_mgr("a: f8; a: i8") + mgr2 = tm.round_trip_pickle(mgr) + tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + def test_categorical_block_pickle(self): + mgr = create_mgr("a: category") + mgr2 = tm.round_trip_pickle(mgr) + tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + smgr = create_single_mgr("category") + smgr2 = tm.round_trip_pickle(smgr) + 
tm.assert_series_equal(Series(smgr), Series(smgr2)) + + def test_get(self): + cols = Index(list("abc")) + values = np.random.rand(3, 3) + block = make_block(values=values.copy(), placement=np.arange(3)) + mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) + + tm.assert_almost_equal(mgr.get("a").internal_values(), values[0]) + tm.assert_almost_equal(mgr.get("b").internal_values(), values[1]) + tm.assert_almost_equal(mgr.get("c").internal_values(), values[2]) + + def test_set(self): + mgr = create_mgr("a,b,c: int", item_shape=(3,)) + + mgr.set("d", np.array(["foo"] * 3)) + mgr.set("b", np.array(["bar"] * 3)) + tm.assert_numpy_array_equal(mgr.get("a").internal_values(), np.array([0] * 3)) + tm.assert_numpy_array_equal( + mgr.get("b").internal_values(), np.array(["bar"] * 3, dtype=np.object_) + ) + tm.assert_numpy_array_equal(mgr.get("c").internal_values(), np.array([2] * 3)) + tm.assert_numpy_array_equal( + mgr.get("d").internal_values(), np.array(["foo"] * 3, dtype=np.object_) + ) + + def test_set_change_dtype(self, mgr): + mgr.set("baz", np.zeros(N, dtype=bool)) + + mgr.set("baz", np.repeat("foo", N)) + assert mgr.get("baz").dtype == np.object_ + + mgr2 = mgr.consolidate() + mgr2.set("baz", np.repeat("foo", N)) + assert mgr2.get("baz").dtype == np.object_ + + mgr2.set("quux", tm.randn(N).astype(int)) + assert mgr2.get("quux").dtype == np.int_ + + mgr2.set("quux", tm.randn(N)) + assert mgr2.get("quux").dtype == np.float_ + + def test_set_change_dtype_slice(self): # GH8850 + cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")]) + df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) + df["2nd"] = df["2nd"] * 2.0 + + blocks = df._to_dict_of_blocks() + assert sorted(blocks.keys()) == ["float64", "int64"] + tm.assert_frame_equal( + blocks["float64"], DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]) + ) + tm.assert_frame_equal(blocks["int64"], DataFrame([[3], [6]], columns=cols[2:])) + + def test_copy(self, mgr): + cp = mgr.copy(deep=False) + for blk, cp_blk in zip(mgr.blocks, cp.blocks): + + # view assertion + assert cp_blk.equals(blk) + if isinstance(blk.values, np.ndarray): + assert cp_blk.values.base is blk.values.base + else: + # DatetimeTZBlock has DatetimeIndex values + assert cp_blk.values._data.base is blk.values._data.base + + cp = mgr.copy(deep=True) + for blk, cp_blk in zip(mgr.blocks, cp.blocks): + + # copy assertion we either have a None for a base or in case of + # some blocks it is an array (e.g. datetimetz), but was copied + assert cp_blk.equals(blk) + if not isinstance(cp_blk.values, np.ndarray): + assert cp_blk.values._data.base is not blk.values._data.base + else: + assert cp_blk.values.base is None and blk.values.base is None + + def test_sparse(self): + mgr = create_mgr("a: sparse-1; b: sparse-2") + # what to test here? + assert mgr.as_array().dtype == np.float64 + + def test_sparse_mixed(self): + mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8") + assert len(mgr.blocks) == 3 + assert isinstance(mgr, BlockManager) + + # what to test here? 
+ + def test_as_array_float(self): + mgr = create_mgr("c: f4; d: f2; e: f8") + assert mgr.as_array().dtype == np.float64 + + mgr = create_mgr("c: f4; d: f2") + assert mgr.as_array().dtype == np.float32 + + def test_as_array_int_bool(self): + mgr = create_mgr("a: bool-1; b: bool-2") + assert mgr.as_array().dtype == np.bool_ + + mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") + assert mgr.as_array().dtype == np.int64 + + mgr = create_mgr("c: i4; d: i2; e: u1") + assert mgr.as_array().dtype == np.int32 + + def test_as_array_datetime(self): + mgr = create_mgr("h: datetime-1; g: datetime-2") + assert mgr.as_array().dtype == "M8[ns]" + + def test_as_array_datetime_tz(self): + mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]") + assert mgr.get("h").dtype == "datetime64[ns, US/Eastern]" + assert mgr.get("g").dtype == "datetime64[ns, CET]" + assert mgr.as_array().dtype == "object" + + @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"]) + def test_astype(self, t): + # coerce all + mgr = create_mgr("c: f4; d: f2; e: f8") + + t = np.dtype(t) + tmgr = mgr.astype(t) + assert tmgr.get("c").dtype.type == t + assert tmgr.get("d").dtype.type == t + assert tmgr.get("e").dtype.type == t + + # mixed + mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") + + t = np.dtype(t) + tmgr = mgr.astype(t, errors="ignore") + assert tmgr.get("c").dtype.type == t + assert tmgr.get("e").dtype.type == t + assert tmgr.get("f").dtype.type == t + assert tmgr.get("g").dtype.type == t + + assert tmgr.get("a").dtype.type == np.object_ + assert tmgr.get("b").dtype.type == np.object_ + if t != np.int64: + assert tmgr.get("d").dtype.type == np.datetime64 + else: + assert tmgr.get("d").dtype.type == t + + def test_convert(self): + def _compare(old_mgr, new_mgr): + """ compare the blocks, numeric compare ==, object don't """ + old_blocks = set(old_mgr.blocks) + new_blocks = set(new_mgr.blocks) + assert len(old_blocks) == len(new_blocks) + + # compare non-numeric + for b in old_blocks: + found = False + for nb in new_blocks: + if (b.values == nb.values).all(): + found = True + break + assert found + + for b in new_blocks: + found = False + for ob in old_blocks: + if (b.values == ob.values).all(): + found = True + break + assert found + + # noops + mgr = create_mgr("f: i8; g: f8") + new_mgr = mgr.convert() + _compare(mgr, new_mgr) + + # convert + mgr = create_mgr("a,b,foo: object; f: i8; g: f8") + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) + new_mgr = mgr.convert(numeric=True) + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int64 + assert new_mgr.get("g").dtype == np.float64 + + mgr = create_mgr( + "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2" + ) + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) + new_mgr = mgr.convert(numeric=True) + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int32 + assert new_mgr.get("bool").dtype == np.bool_ + assert new_mgr.get("dt").dtype.type, np.datetime64 + assert new_mgr.get("i").dtype == np.int64 + assert 
new_mgr.get("g").dtype == np.float64 + assert new_mgr.get("h").dtype == np.float16 + + def test_interleave(self): + + # self + for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: + mgr = create_mgr("a: {0}".format(dtype)) + assert mgr.as_array().dtype == dtype + mgr = create_mgr("a: {0}; b: {0}".format(dtype)) + assert mgr.as_array().dtype == dtype + + # will be converted according the actual dtype of the underlying + mgr = create_mgr("a: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2; b: category2") + assert mgr.as_array().dtype == "object" + + # combinations + mgr = create_mgr("a: f8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f8; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8; d: object") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: bool; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: complex") + assert mgr.as_array().dtype == "complex" + mgr = create_mgr("a: f8; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: m8[ns]") + assert mgr.as_array().dtype == "object" + + def test_interleave_non_unique_cols(self): + df = DataFrame( + [[pd.Timestamp("20130101"), 3.5], [pd.Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) + + df_unique = df.copy() + df_unique.columns = ["x", "y"] + assert df_unique.values.shape == df.values.shape + tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) + tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) + + def test_consolidate(self): + pass + + def test_consolidate_ordering_issues(self, mgr): + mgr.set("f", tm.randn(N)) + mgr.set("d", tm.randn(N)) + mgr.set("b", tm.randn(N)) + mgr.set("g", tm.randn(N)) + mgr.set("h", tm.randn(N)) + + # we have datetime/tz blocks in mgr + cons = mgr.consolidate() + assert cons.nblocks == 4 + cons = mgr.consolidate().get_numeric_data() + assert cons.nblocks == 1 + assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) + tm.assert_numpy_array_equal( + cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64) + ) + + def test_reindex_index(self): + # TODO: should this be pytest.skip? 
+ pass + + def test_reindex_items(self): + # mgr is not consolidated, f8 & f8-2 blocks + mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2") + + reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) + assert reindexed.nblocks == 2 + tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) + tm.assert_almost_equal( + mgr.get("g").internal_values(), reindexed.get("g").internal_values() + ) + tm.assert_almost_equal( + mgr.get("c").internal_values(), reindexed.get("c").internal_values() + ) + tm.assert_almost_equal( + mgr.get("a").internal_values(), reindexed.get("a").internal_values() + ) + tm.assert_almost_equal( + mgr.get("d").internal_values(), reindexed.get("d").internal_values() + ) + + def test_get_numeric_data(self): + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([1, 2, 3], dtype=np.object_)) + + numeric = mgr.get_numeric_data() + tm.assert_index_equal( + numeric.items, pd.Index(["int", "float", "complex", "bool"]) + ) + tm.assert_almost_equal( + mgr.get("float").internal_values(), numeric.get("float").internal_values() + ) + + # Check sharing + numeric.set("float", np.array([100.0, 200.0, 300.0])) + tm.assert_almost_equal( + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) + + numeric2 = mgr.get_numeric_data(copy=True) + tm.assert_index_equal( + numeric.items, pd.Index(["int", "float", "complex", "bool"]) + ) + numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) + tm.assert_almost_equal( + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) + + def test_get_bool_data(self): + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([True, False, True], dtype=np.object_)) + + bools = mgr.get_bool_data() + tm.assert_index_equal(bools.items, pd.Index(["bool"])) + tm.assert_almost_equal( + mgr.get("bool").internal_values(), bools.get("bool").internal_values() + ) + + bools.set("bool", np.array([True, False, True])) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) + + # Check sharing + bools2 = mgr.get_bool_data(copy=True) + bools2.set("bool", np.array([False, True, False])) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) + + def test_unicode_repr_doesnt_raise(self): + repr(create_mgr("b,\u05d0: object")) + + def test_missing_unicode_key(self): + df = DataFrame({"a": [1]}) + try: + df.loc[:, "\u05d0"] # should not raise UnicodeEncodeError + except KeyError: + pass # this is the expected exception + + def test_equals(self): + # unique items + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) + assert bm1.equals(bm2) + + bm1 = create_mgr("a,a,a: i8-1; b,b,b: i8-2") + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) + assert bm1.equals(bm2) + + def test_equals_block_order_different_dtypes(self): + # GH 9330 + + mgr_strings = [ + "a:i8;b:f8", # basic case + "a:i8;b:f8;c:c8;d:b", # many types + "a:i8;e:dt;f:td;g:string", # more types + "a:i8;b:category;c:category2;d:category2", # categories + "c:sparse;d:sparse_na;b:f8", # sparse + ] + + for mgr_string in mgr_strings: + bm = create_mgr(mgr_string) + block_perms = itertools.permutations(bm.blocks) + for bm_perm in block_perms: + bm_this = BlockManager(bm_perm, bm.axes) + assert 
bm.equals(bm_this) + assert bm_this.equals(bm) + + def test_single_mgr_ctor(self): + mgr = create_single_mgr("f8", num_rows=5) + assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] + + def test_validate_bool_args(self): + invalid_values = [1, "True", [1, 2, 3], 5.0] + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") + + for value in invalid_values: + with pytest.raises(ValueError): + bm1.replace_list([1], [2], inplace=value) + + +class TestIndexing: + # Nosetests-style data-driven tests. + # + # This test applies different indexing routines to block managers and + # compares the outcome to the result of same operations on np.ndarray. + # + # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests + # and are disabled. + + MANAGERS = [ + create_single_mgr("f8", N), + create_single_mgr("i8", N), + # 2-dim + create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)), + # 3-dim + create_mgr("a,b,c,d,e,f: f8", item_shape=(N, N)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N, N)), + ] + + # MANAGERS = [MANAGERS[6]] + + def test_get_slice(self): + def assert_slice_ok(mgr, axis, slobj): + mat = mgr.as_array() + + # we maybe using an ndarray to test slicing and + # might not be the full length of the axis + if isinstance(slobj, np.ndarray): + ax = mgr.axes[axis] + if len(ax) and len(slobj) and len(slobj) != len(ax): + slobj = np.concatenate( + [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] + ) + sliced = mgr.get_slice(slobj, axis=axis) + mat_slobj = (slice(None),) * axis + (slobj,) + tm.assert_numpy_array_equal( + mat[mat_slobj], sliced.as_array(), check_dtype=False + ) + tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # slice + assert_slice_ok(mgr, ax, slice(None)) + assert_slice_ok(mgr, ax, slice(3)) + assert_slice_ok(mgr, ax, slice(100)) + assert_slice_ok(mgr, ax, slice(1, 4)) + assert_slice_ok(mgr, ax, slice(3, 0, -2)) + + # boolean mask + assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) + + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) + assert_slice_ok( + mgr, ax, np.array([True, True, False], dtype=np.bool_) + ) + + # fancy indexer + assert_slice_ok(mgr, ax, []) + assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) + + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, [0, 1, 2]) + assert_slice_ok(mgr, ax, [-1, -2, -3]) + + def test_take(self): + def assert_take_ok(mgr, axis, indexer): + mat = mgr.as_array() + taken = mgr.take(indexer, axis) + tm.assert_numpy_array_equal( + np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + ) + tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # take/fancy indexer + assert_take_ok(mgr, ax, indexer=[]) + assert_take_ok(mgr, ax, indexer=[0, 0, 0]) + assert_take_ok(mgr, ax, indexer=list(range(mgr.shape[ax]))) + + if mgr.shape[ax] >= 3: + assert_take_ok(mgr, ax, indexer=[0, 1, 2]) + assert_take_ok(mgr, ax, indexer=[-1, -2, -3]) + + def test_reindex_axis(self): + def assert_reindex_axis_is_ok(mgr, axis, 
new_labels, fill_value): + mat = mgr.as_array() + indexer = mgr.axes[axis].get_indexer_for(new_labels) + + reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) + tm.assert_numpy_array_equal( + algos.take_nd(mat, indexer, axis, fill_value=fill_value), + reindexed.as_array(), + check_dtype=False, + ) + tm.assert_index_equal(reindexed.axes[axis], new_labels) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + for fill_value in (None, np.nan, 100.0): + assert_reindex_axis_is_ok(mgr, ax, pd.Index([]), fill_value) + assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value) + assert_reindex_axis_is_ok( + mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value + ) + assert_reindex_axis_is_ok( + mgr, ax, pd.Index(["foo", "bar", "baz"]), fill_value + ) + assert_reindex_axis_is_ok( + mgr, ax, pd.Index(["foo", mgr.axes[ax][0], "baz"]), fill_value + ) + + if mgr.shape[ax] >= 3: + assert_reindex_axis_is_ok( + mgr, ax, mgr.axes[ax][:-3], fill_value + ) + assert_reindex_axis_is_ok( + mgr, ax, mgr.axes[ax][-3::-1], fill_value + ) + assert_reindex_axis_is_ok( + mgr, ax, mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value + ) + + def test_reindex_indexer(self): + def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): + mat = mgr.as_array() + reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) + reindexed = mgr.reindex_indexer( + new_labels, indexer, axis, fill_value=fill_value + ) + tm.assert_numpy_array_equal( + reindexed_mat, reindexed.as_array(), check_dtype=False + ) + tm.assert_index_equal(reindexed.axes[axis], new_labels) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + for fill_value in (None, np.nan, 100.0): + assert_reindex_indexer_is_ok(mgr, ax, pd.Index([]), [], fill_value) + assert_reindex_indexer_is_ok( + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value + ) + assert_reindex_indexer_is_ok( + mgr, + ax, + pd.Index(["foo"] * mgr.shape[ax]), + np.arange(mgr.shape[ax]), + fill_value, + ) + assert_reindex_indexer_is_ok( + mgr, + ax, + mgr.axes[ax][::-1], + np.arange(mgr.shape[ax]), + fill_value, + ) + assert_reindex_indexer_is_ok( + mgr, + ax, + mgr.axes[ax], + np.arange(mgr.shape[ax])[::-1], + fill_value, + ) + assert_reindex_indexer_is_ok( + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value + ) + assert_reindex_indexer_is_ok( + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [-1, 0, -1], + fill_value, + ) + assert_reindex_indexer_is_ok( + mgr, + ax, + pd.Index(["foo", mgr.axes[ax][0], "baz"]), + [-1, -1, -1], + fill_value, + ) + + if mgr.shape[ax] >= 3: + assert_reindex_indexer_is_ok( + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [0, 1, 2], + fill_value, + ) + + # test_get_slice(slice_like, axis) + # take(indexer, axis) + # reindex_axis(new_labels, axis) + # reindex_indexer(new_labels, indexer, axis) + + +class TestBlockPlacement: + def test_slice_len(self): + assert len(BlockPlacement(slice(0, 4))) == 4 + assert len(BlockPlacement(slice(0, 4, 2))) == 2 + assert len(BlockPlacement(slice(0, 3, 2))) == 2 + + assert len(BlockPlacement(slice(0, 1, 2))) == 1 + assert len(BlockPlacement(slice(1, 0, -1))) == 1 + + def test_zero_step_raises(self): + with pytest.raises(ValueError): + BlockPlacement(slice(1, 1, 0)) + with pytest.raises(ValueError): + BlockPlacement(slice(1, 2, 0)) + + def test_unbounded_slice_raises(self): + def assert_unbounded_slice_error(slc): + with pytest.raises(ValueError, match="unbounded slice"): + BlockPlacement(slc) + + assert_unbounded_slice_error(slice(None, None)) + 
assert_unbounded_slice_error(slice(10, None)) + assert_unbounded_slice_error(slice(None, None, -1)) + assert_unbounded_slice_error(slice(None, 10, -1)) + + # These are "unbounded" because negative index will change depending on + # container shape. + assert_unbounded_slice_error(slice(-1, None)) + assert_unbounded_slice_error(slice(None, -1)) + assert_unbounded_slice_error(slice(-1, -1)) + assert_unbounded_slice_error(slice(-1, None, -1)) + assert_unbounded_slice_error(slice(None, -1, -1)) + assert_unbounded_slice_error(slice(-1, -1, -1)) + + def test_not_slice_like_slices(self): + def assert_not_slice_like(slc): + assert not BlockPlacement(slc).is_slice_like + + assert_not_slice_like(slice(0, 0)) + assert_not_slice_like(slice(100, 0)) + + assert_not_slice_like(slice(100, 100, -1)) + assert_not_slice_like(slice(0, 100, -1)) + + assert not BlockPlacement(slice(0, 0)).is_slice_like + assert not BlockPlacement(slice(100, 100)).is_slice_like + + def test_array_to_slice_conversion(self): + def assert_as_slice_equals(arr, slc): + assert BlockPlacement(arr).as_slice == slc + + assert_as_slice_equals([0], slice(0, 1, 1)) + assert_as_slice_equals([100], slice(100, 101, 1)) + + assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) + assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) + assert_as_slice_equals([0, 100], slice(0, 200, 100)) + + assert_as_slice_equals([2, 1], slice(2, 0, -1)) + + def test_not_slice_like_arrays(self): + def assert_not_slice_like(arr): + assert not BlockPlacement(arr).is_slice_like + + assert_not_slice_like([]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, -2, -3]) + assert_not_slice_like([-10]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, 0, 1, 2]) + assert_not_slice_like([-2, 0, 2, 4]) + assert_not_slice_like([1, 0, -1]) + assert_not_slice_like([1, 1, 1]) + + def test_slice_iter(self): + assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2] + assert list(BlockPlacement(slice(0, 0))) == [] + assert list(BlockPlacement(slice(3, 0))) == [] + + def test_slice_to_array_conversion(self): + def assert_as_array_equals(slc, asarray): + tm.assert_numpy_array_equal( + BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) + ) + + assert_as_array_equals(slice(0, 3), [0, 1, 2]) + assert_as_array_equals(slice(0, 0), []) + assert_as_array_equals(slice(3, 0), []) + + assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + + def test_blockplacement_add(self): + bpl = BlockPlacement(slice(0, 5)) + assert bpl.add(1).as_slice == slice(1, 6, 1) + assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2) + assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5] + + def test_blockplacement_add_int(self): + def assert_add_equals(val, inc, result): + assert list(BlockPlacement(val).add(inc)) == result + + assert_add_equals(slice(0, 0), 0, []) + assert_add_equals(slice(1, 4), 0, [1, 2, 3]) + assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) + assert_add_equals([1, 2, 4], 0, [1, 2, 4]) + + assert_add_equals(slice(0, 0), 10, []) + assert_add_equals(slice(1, 4), 10, [11, 12, 13]) + assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) + assert_add_equals([1, 2, 4], 10, [11, 12, 14]) + + assert_add_equals(slice(0, 0), -1, []) + assert_add_equals(slice(1, 4), -1, [0, 1, 2]) + assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + + with pytest.raises(ValueError): + BlockPlacement(slice(1, 4)).add(-10) + with pytest.raises(ValueError): + BlockPlacement([1, 2, 4]).add(-10) + + +class DummyElement: + def __init__(self, value, dtype): + self.value = value + self.dtype = 
np.dtype(dtype) + + def __array__(self): + return np.array(self.value, dtype=self.dtype) + + def __str__(self) -> str: + return "DummyElement({}, {})".format(self.value, self.dtype) + + def __repr__(self) -> str: + return str(self) + + def astype(self, dtype, copy=False): + self.dtype = dtype + return self + + def view(self, dtype): + return type(self)(self.value.view(dtype), dtype) + + def any(self, axis=None): + return bool(self.value) + + +class TestCanHoldElement: + @pytest.mark.parametrize( + "value, dtype", + [ + (1, "i8"), + (1.0, "f8"), + (2 ** 63, "f8"), + (1j, "complex128"), + (2 ** 63, "complex128"), + (True, "bool"), + (np.timedelta64(20, "ns"), "= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + + moto = pytest.importorskip("moto") + + test_s3_files = [ + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, "rb") as f: + conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) + + try: + s3 = moto.mock_s3() + s3.start() + + # see gh-16135 + bucket = "pandas-test" + conn = boto3.resource("s3", region_name="us-east-1") + + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket="cant_get_it", ACL="private") + add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() + yield conn + finally: + s3.stop() diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/__init__.py b/venv/Lib/site-packages/pandas/tests/io/excel/__init__.py new file mode 100644 index 0000000..5501723 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/__init__.py @@ -0,0 +1,6 @@ +import pytest + +pytestmark = pytest.mark.filterwarnings( + # Looks like tree.getiterator is deprecated in favor of tree.iter + "ignore:This method will be removed in future versions:PendingDeprecationWarning" +) diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/conftest.py b/venv/Lib/site-packages/pandas/tests/io/excel/conftest.py new file mode 100644 index 0000000..0455e0d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/conftest.py @@ -0,0 +1,65 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas._testing as tm + +from pandas.io.parsers import read_csv + + +@pytest.fixture +def frame(float_frame): + """ + Returns the first ten items in fixture "float_frame". + """ + return float_frame[:10] + + +@pytest.fixture +def tsframe(): + return tm.makeTimeDataFrame()[:5] + + +@pytest.fixture(params=[True, False]) +def merge_cells(request): + return request.param + + +@pytest.fixture +def df_ref(datapath): + """ + Obtain the reference data from read_csv with the Python engine. + """ + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") + return df_ref + + +@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"]) +def read_ext(request): + """ + Valid extensions for reading Excel files. + """ + return request.param + + +@pytest.fixture(autouse=True) +def check_for_file_leaks(): + """ + Fixture to run around every test to ensure that we are not leaking files. 
+ + See also + -------- + _test_decorators.check_file_leaks + """ + # GH#30162 + psutil = td.safe_import("psutil") + if not psutil: + yield + + else: + proc = psutil.Process() + flist = proc.open_files() + yield + flist2 = proc.open_files() + assert flist == flist2 diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_odf.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_odf.py new file mode 100644 index 0000000..b9a3e8b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_odf.py @@ -0,0 +1,46 @@ +import functools + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +pytest.importorskip("odf") + + +@pytest.fixture(autouse=True) +def cd_and_set_engine(monkeypatch, datapath): + func = functools.partial(pd.read_excel, engine="odf") + monkeypatch.setattr(pd, "read_excel", func) + monkeypatch.chdir(datapath("io", "data", "excel")) + + +def test_read_invalid_types_raises(): + # the invalid_value_type.ods required manually editing + # of the included content.xml file + with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"): + pd.read_excel("invalid_value_type.ods") + + +def test_read_writer_table(): + # Also test reading tables from an text OpenDocument file + # (.odt) + index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") + expected = pd.DataFrame( + [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]], + index=index, + columns=["Column 1", "Unnamed: 2", "Column 3"], + ) + + result = pd.read_excel("writertable.odt", "Table1", index_col=0) + + tm.assert_frame_equal(result, expected) + + +def test_nonexistent_sheetname_raises(read_ext): + # GH-27676 + # Specifying a non-existent sheet_name parameter should throw an error + # with the sheet name. + with pytest.raises(ValueError, match="sheet xyz not found"): + pd.read_excel("blank.ods", sheet_name="xyz") diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_openpyxl.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_openpyxl.py new file mode 100644 index 0000000..10ed192 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_openpyxl.py @@ -0,0 +1,124 @@ +import os + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter, _OpenpyxlWriter + +openpyxl = pytest.importorskip("openpyxl") + +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) + + +def test_to_excel_styleconverter(ext): + from openpyxl import styles + + hstyle = { + "font": {"color": "00FF0000", "bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}}, + "number_format": {"format_code": "0.00"}, + "protection": {"locked": True, "hidden": False}, + } + + font_color = styles.Color("00FF0000") + font = styles.Font(bold=True, color=font_color) + side = styles.Side(style=styles.borders.BORDER_THIN) + border = styles.Border(top=side, right=side, bottom=side, left=side) + alignment = styles.Alignment(horizontal="center", vertical="top") + fill_color = styles.Color(rgb="006666FF", tint=0.3) + fill = styles.PatternFill(patternType="solid", fgColor=fill_color) + + number_format = "0.00" + + protection = styles.Protection(locked=True, hidden=False) + + kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) + assert kw["font"] == font + assert kw["border"] == border + assert kw["alignment"] == 
alignment + assert kw["fill"] == fill + assert kw["number_format"] == number_format + assert kw["protection"] == protection + + +def test_write_cells_merge_styled(ext): + from pandas.io.formats.excel import ExcelCell + + sheet_name = "merge_styled" + + sty_b1 = {"font": {"color": "00FF0000"}} + sty_a2 = {"font": {"color": "0000FF00"}} + + initial_cells = [ + ExcelCell(col=1, row=0, val=42, style=sty_b1), + ExcelCell(col=0, row=1, val=99, style=sty_a2), + ] + + sty_merged = {"font": {"color": "000000FF", "bold": True}} + sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) + openpyxl_sty_merged = sty_kwargs["font"] + merge_cells = [ + ExcelCell( + col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged + ) + ] + + with tm.ensure_clean(ext) as path: + writer = _OpenpyxlWriter(path) + writer.write_cells(initial_cells, sheet_name=sheet_name) + writer.write_cells(merge_cells, sheet_name=sheet_name) + + wks = writer.sheets[sheet_name] + xcell_b1 = wks["B1"] + xcell_a2 = wks["A2"] + assert xcell_b1.font == openpyxl_sty_merged + assert xcell_a2.font == openpyxl_sty_merged + + +@pytest.mark.parametrize( + "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] +) +def test_write_append_mode(ext, mode, expected): + df = DataFrame([1], columns=["baz"]) + + with tm.ensure_clean(ext) as f: + wb = openpyxl.Workbook() + wb.worksheets[0].title = "foo" + wb.worksheets[0]["A1"].value = "foo" + wb.create_sheet("bar") + wb.worksheets[1]["A1"].value = "bar" + wb.save(f) + + writer = ExcelWriter(f, engine="openpyxl", mode=mode) + df.to_excel(writer, sheet_name="baz", index=False) + writer.save() + + wb2 = openpyxl.load_workbook(f) + result = [sheet.title for sheet in wb2.worksheets] + assert result == expected + + for index, cell_value in enumerate(expected): + assert wb2.worksheets[index]["A1"].value == cell_value + + +def test_to_excel_with_openpyxl_engine(ext, tmpdir): + # GH 29854 + # TODO: Fix this once newer version of openpyxl fixes the bug + df1 = DataFrame({"A": np.linspace(1, 10, 10)}) + df2 = DataFrame({"B": np.linspace(1, 20, 10)}) + df = pd.concat([df1, df2], axis=1) + styled = df.style.applymap( + lambda val: "color: %s" % "red" if val < 0 else "black" + ).highlight_max() + + filename = tmpdir / "styled.xlsx" + styled.to_excel(filename, engine="openpyxl") + + assert filename.exists() + os.remove(filename) diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_readers.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_readers.py new file mode 100644 index 0000000..8d00ef1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_readers.py @@ -0,0 +1,1045 @@ +from collections import OrderedDict +import contextlib +from datetime import datetime, time +from functools import partial +import os +from urllib.error import URLError +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm + + +@contextlib.contextmanager +def ignore_xlrd_time_clock_warning(): + """ + Context manager to ignore warnings raised by the xlrd library, + regarding the deprecation of `time.clock` in Python 3.7. 
+ """ + with warnings.catch_warnings(): + warnings.filterwarnings( + action="ignore", + message="time.clock has been deprecated", + category=DeprecationWarning, + ) + yield + + +read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] +engine_params = [ + # Add any engines to test here + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param( + "xlrd", + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param( + "openpyxl", + marks=[ + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), + ], + ), + pytest.param( + None, + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), + pytest.param("odf", marks=td.skip_if_no("odf")), +] + + +def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: + """ + Filter out invalid (engine, ext) pairs instead of skipping, as that + produces 500+ pytest.skips. + """ + engine = engine.values[0] + if engine == "openpyxl" and read_ext == ".xls": + return False + if engine == "odf" and read_ext != ".ods": + return False + if read_ext == ".ods" and engine != "odf": + return False + if engine == "pyxlsb" and read_ext != ".xlsb": + return False + if read_ext == ".xlsb" and engine != "pyxlsb": + return False + return True + + +def _transfer_marks(engine, read_ext): + """ + engine gives us a pytest.param objec with some marks, read_ext is just + a string. We need to generate a new pytest.param inheriting the marks. + """ + values = engine.values + (read_ext,) + new_param = pytest.param(values, marks=engine.marks) + return new_param + + +@pytest.fixture( + autouse=True, + params=[ + _transfer_marks(eng, ext) + for eng in engine_params + for ext in read_ext_params + if _is_valid_engine_ext_pair(eng, ext) + ], +) +def engine_and_read_ext(request): + """ + Fixture for Excel reader engine and read_ext, only including valid pairs. + """ + return request.param + + +@pytest.fixture +def engine(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return engine + + +@pytest.fixture +def read_ext(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return read_ext + + +class TestReaders: + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, monkeypatch): + """ + Change directory and set engine for read_excel calls. 
+ """ + func = partial(pd.read_excel, engine=engine) + monkeypatch.chdir(datapath("io", "data", "excel")) + monkeypatch.setattr(pd, "read_excel", func) + + def test_usecols_int(self, read_ext, df_ref): + df_ref = df_ref.reindex(columns=["A", "B", "C"]) + + # usecols as int + msg = "Passing an integer for `usecols`" + with pytest.raises(ValueError, match=msg): + with ignore_xlrd_time_clock_warning(): + pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) + + # usecols as int + with pytest.raises(ValueError, match=msg): + with ignore_xlrd_time_clock_warning(): + pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 + ) + + def test_usecols_list(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + df_ref = df_ref.reindex(columns=["B", "C"]) + df1 = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=[0, 2, 3] + ) + df2 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=[0, 2, 3] + ) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + def test_usecols_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + df1 = df_ref.reindex(columns=["A", "B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A:D" + ) + + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C,D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C,D" + ) + # TODO add index to xls file + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C:D" + ) + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + @pytest.mark.parametrize( + "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] + ) + def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + expected = df_ref[["A", "C"]] + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=usecols + ) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) + def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): + expected = df_ref[["B", "D"]] + expected.index = range(len(expected)) + + result = pd.read_excel("test1" + read_ext, "Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_read_excel_without_slicing(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + 
expected = df_ref + result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + expected = df_ref[["C", "D"]] + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E" + ) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str_invalid(self, read_ext): + msg = "Invalid column name: E1" + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, "Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, read_ext): + msg = "list indices must be integers.*, not str" + + with pytest.raises(TypeError, match=msg): + pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=["A"], usecols=["A", "C"] + ) + + def test_index_col_empty(self, read_ext): + # see gh-9208 + result = pd.read_excel("test1" + read_ext, "Sheet3", index_col=["A", "B", "C"]) + expected = DataFrame( + columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_col", [None, 2]) + def test_index_col_with_unnamed(self, read_ext, index_col): + # see gh-18792 + result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col) + expected = DataFrame( + [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"] + ) + if index_col: + expected = expected.set_index(expected.columns[index_col]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_pass_non_existent_column(self, read_ext): + msg = ( + "Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]" + ) + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, usecols=["E"]) + + def test_usecols_wrong_type(self, read_ext): + msg = ( + "'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable." 
+ ) + + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, usecols=["E1", 0]) + + def test_excel_stop_iterator(self, read_ext): + + parsed = pd.read_excel("test2" + read_ext, "Sheet1") + expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) + tm.assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + parsed = pd.read_excel("test3" + read_ext, "Sheet1") + expected = DataFrame([[np.nan]], columns=["Test"]) + tm.assert_frame_equal(parsed, expected) + + def test_excel_table(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) + df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0) + # TODO add index to file + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + df3 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_reader_special_dtypes(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ( + "DateCol", + [ + datetime(2013, 10, 30), + datetime(2013, 10, 31), + datetime(1905, 1, 1), + datetime(2013, 12, 14), + datetime(2015, 3, 14), + ], + ), + ] + ) + ) + basename = "test_types" + + # should read in correctly and infer types + actual = pd.read_excel(basename + read_ext, "Sheet1") + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected["IntCol"] = float_expected["IntCol"].astype(float) + float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 + actual = pd.read_excel(basename + read_ext, "Sheet1", convert_float=False) + tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = pd.read_excel(basename + read_ext, "Sheet1", index_col=icol) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + + # convert_float and converters should be different but both accepted + expected["StrCol"] = expected["StrCol"].apply(str) + actual = pd.read_excel( + basename + read_ext, "Sheet1", converters={"StrCol": str} + ) + tm.assert_frame_equal(actual, expected) + + no_convert_float = float_expected.copy() + no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) + actual = pd.read_excel( + basename + read_ext, + "Sheet1", + convert_float=False, + converters={"StrCol": str}, + ) + tm.assert_frame_equal(actual, no_convert_float) + + # GH8212 - support for converters and missing values + def test_reader_converters(self, read_ext): + + basename = "test_converters" + + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ["Found", "Found", "Found", "Not found", 
"Found"]), + ("StrCol", ["1", np.nan, "3", "4", "5"]), + ] + ) + ) + + converters = { + "IntCol": lambda x: int(x) if x != "" else -1000, + "FloatCol": lambda x: 10 * x if x else np.nan, + 2: lambda x: "Found" if x != "" else "Not found", + 3: lambda x: str(x) if x else "", + } + + # should read in correctly and set types of single cells (not array + # dtypes) + actual = pd.read_excel(basename + read_ext, "Sheet1", converters=converters) + tm.assert_frame_equal(actual, expected) + + def test_reader_dtype(self, read_ext): + # GH 8212 + basename = "testdtype" + actual = pd.read_excel(basename + read_ext) + + expected = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ).reindex(columns=["a", "b", "c", "d"]) + + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str} + ) + + expected["a"] = expected["a"].astype("float64") + expected["b"] = expected["b"].astype("float32") + expected["c"] = ["001", "002", "003", "004"] + tm.assert_frame_equal(actual, expected) + + with pytest.raises(ValueError): + pd.read_excel(basename + read_ext, dtype={"d": "int64"}) + + @pytest.mark.parametrize( + "dtype,expected", + [ + ( + None, + DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ), + ), + ( + {"a": "float64", "b": "float32", "c": str, "d": str}, + DataFrame( + { + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"], + } + ), + ), + ], + ) + def test_reader_dtype_str(self, read_ext, dtype, expected): + # see gh-20377 + basename = "testdtype" + + actual = pd.read_excel(basename + read_ext, dtype=dtype) + tm.assert_frame_equal(actual, expected) + + def test_reading_all_sheets(self, read_ext): + # Test reading all sheetnames by setting sheetname to None, + # Ensure a dict is returned. + # See PR #9450 + basename = "test_multisheet" + dfs = pd.read_excel(basename + read_ext, sheet_name=None) + # ensure this is not alphabetical to test order preservation + expected_keys = ["Charlie", "Alpha", "Beta"] + tm.assert_contains_all(expected_keys, dfs.keys()) + # Issue 9930 + # Ensure sheet order is preserved + assert expected_keys == list(dfs.keys()) + + def test_reading_multiple_specific_sheets(self, read_ext): + # Test reading specific sheetnames by specifying a mixed list + # of integers and strings, and confirm that duplicated sheet + # references (positions/names) are removed properly. + # Ensure a dict is returned + # See PR #9450 + basename = "test_multisheet" + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, "Charlie", "Charlie"] + dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) + expected_keys = list(set(expected_keys)) + tm.assert_contains_all(expected_keys, dfs.keys()) + assert len(expected_keys) == len(dfs.keys()) + + def test_reading_all_sheets_with_blank(self, read_ext): + # Test reading all sheetnames by setting sheetname to None, + # In the case where some sheets are blank. 
+ # Issue #11711 + basename = "blank_with_header" + dfs = pd.read_excel(basename + read_ext, sheet_name=None) + expected_keys = ["Sheet1", "Sheet2", "Sheet3"] + tm.assert_contains_all(expected_keys, dfs.keys()) + + # GH6403 + def test_read_excel_blank(self, read_ext): + actual = pd.read_excel("blank" + read_ext, "Sheet1") + tm.assert_frame_equal(actual, DataFrame()) + + def test_read_excel_blank_with_header(self, read_ext): + expected = DataFrame(columns=["col_1", "col_2"]) + actual = pd.read_excel("blank_with_header" + read_ext, "Sheet1") + tm.assert_frame_equal(actual, expected) + + def test_date_conversion_overflow(self, read_ext): + # GH 10001 : pandas.ExcelFile ignore parse_dates=False + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + expected = pd.DataFrame( + [ + [pd.Timestamp("2016-03-12"), "Marc Johnson"], + [pd.Timestamp("2016-03-16"), "Jack Black"], + [1e20, "Timothy Brown"], + ], + columns=["DateColWithBigInt", "StringCol"], + ) + + if pd.read_excel.keywords["engine"] == "openpyxl": + pytest.xfail("Maybe not supported by openpyxl") + + result = pd.read_excel("testdateoverflow" + read_ext) + tm.assert_frame_equal(result, expected) + + def test_sheet_name(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + filename = "test1" + sheet_name = "Sheet1" + + if pd.read_excel.keywords["engine"] == "openpyxl": + pytest.xfail("Maybe not supported by openpyxl") + + df1 = pd.read_excel( + filename + read_ext, sheet_name=sheet_name, index_col=0 + ) # doc + with ignore_xlrd_time_clock_warning(): + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) + + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + def test_excel_read_buffer(self, read_ext): + + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0) + with open(pth, "rb") as f: + actual = pd.read_excel(f, "Sheet1", index_col=0) + tm.assert_frame_equal(expected, actual) + + def test_bad_engine_raises(self, read_ext): + bad_engine = "foo" + with pytest.raises(ValueError, match="Unknown engine: foo"): + pd.read_excel("", engine=bad_engine) + + @tm.network + def test_read_from_http_url(self, read_ext): + url = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/excel/test1" + read_ext + ) + url_table = pd.read_excel(url) + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_not_us_locale + def test_read_from_s3_url(self, read_ext, s3_resource): + # Bucket "pandas-test" created in tests/io/conftest.py + with open("test1" + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) + + url = "s3://pandas-test/test1" + read_ext + url_table = pd.read_excel(url) + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + + @pytest.mark.slow + # ignore warning from old xlrd + @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") + def test_read_from_file_url(self, read_ext, datapath): + + # FILE + localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext) + local_table = pd.read_excel(localtable) + + try: + url_table = pd.read_excel("file://localhost/" + localtable) + except URLError: + # fails on some systems + import platform + + pytest.skip("failing 
on {}".format(" ".join(platform.uname()).strip())) + + tm.assert_frame_equal(url_table, local_table) + + def test_read_from_pathlib_path(self, read_ext): + + # GH12655 + from pathlib import Path + + str_path = "test1" + read_ext + expected = pd.read_excel(str_path, "Sheet1", index_col=0) + + path_obj = Path("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no("py.path") + @td.check_file_leaks + def test_read_from_py_localpath(self, read_ext): + + # GH12655 + from py.path import local as LocalPath + + str_path = os.path.join("test1" + read_ext) + expected = pd.read_excel(str_path, "Sheet1", index_col=0) + + path_obj = LocalPath().join("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_reader_seconds(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + # Test reading times with and without milliseconds. GH5945. + expected = DataFrame.from_dict( + { + "Time": [ + time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54), + ] + } + ) + + actual = pd.read_excel("times_1900" + read_ext, "Sheet1") + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel("times_1904" + read_ext, "Sheet1") + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex(self, read_ext): + # see gh-4679 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = "testmultiindex" + read_ext + + # "mi_column" sheet + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + ) + + actual = pd.read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # "mi_index" sheet + expected.index = mi + expected.columns = ["a", "b", "c", "d"] + + actual = pd.read_excel(mi_file, "mi_index", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "both" sheet + expected.columns = mi + + actual = pd.read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = pd.read_excel(mi_file, "mi_index_name", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "mi_column_name" sheet + expected.index = list(range(4)) + expected.columns = mi.set_names(["c1", "c2"]) + actual = pd.read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # see gh-11317 + # "name_with_int" sheet + expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) + + actual = pd.read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = 
pd.read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_skiprows" sheet + actual = pd.read_excel( + mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2 + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex_header_only(self, read_ext): + # see gh-11733. + # + # Don't try to parse a header name if there isn't one. + mi_file = "testmultiindex" + read_ext + result = pd.read_excel(mi_file, "index_col_none", header=[0, 1]) + + exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) + expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) + tm.assert_frame_equal(result, expected) + + def test_excel_old_index_format(self, read_ext): + # see gh-4679 + filename = "test_index_name_pre17" + read_ext + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array( + [ + [None, None, None, None, None], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex( + levels=[ + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None], + ) + si = Index( + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None + ) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(filename, "single_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(filename, "multi_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. 
+ data = np.array( + [ + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex( + levels=[ + ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None], + ) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(filename, "single_no_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + def test_read_excel_bool_header_arg(self, read_ext): + # GH 6114 + for arg in [True, False]: + with pytest.raises(TypeError): + pd.read_excel("test1" + read_ext, header=arg) + + def test_read_excel_chunksize(self, read_ext): + # GH 8011 + with pytest.raises(NotImplementedError): + pd.read_excel("test1" + read_ext, chunksize=100) + + def test_read_excel_skiprows_list(self, read_ext): + # GH 4903 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=[0, 2] + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=np.array([0, 2]) + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows(self, read_ext): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + expected = pd.read_excel("test1" + read_ext) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): + # GH 16645 + expected = pd.read_excel("test1" + read_ext) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self, read_ext): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with pytest.raises(ValueError, match=msg): + pd.read_excel("test1" + read_ext, nrows="5") + + def test_read_excel_squeeze(self, read_ext): + # GH 12157 + f = "test_squeeze" + read_ext + + actual = pd.read_excel(f, "two_columns", index_col=0, squeeze=True) + expected = pd.Series([2, 3, 4], [4, 5, 6], name="b") + expected.index.name = "a" + tm.assert_series_equal(actual, expected) + + actual = pd.read_excel(f, "two_columns", squeeze=True) + expected = pd.DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(f, "one_column", squeeze=True) + expected = pd.Series([1, 2, 3], name="a") + tm.assert_series_equal(actual, expected) + + +class TestExcelFileRead: + @pytest.fixture(autouse=True) + def cd_and_set_engine(self, engine, datapath, 
monkeypatch): + """ + Change directory and set engine for ExcelFile objects. + """ + func = partial(pd.ExcelFile, engine=engine) + monkeypatch.chdir(datapath("io", "data", "excel")) + monkeypatch.setattr(pd, "ExcelFile", func) + + def test_excel_passes_na(self, read_ext): + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + # 13967 + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("na_filter", [None, True, False]) + def test_excel_passes_na_filter(self, read_ext, na_filter): + # gh-25453 + kwargs = {} + + if na_filter is not None: + kwargs["na_filter"] = na_filter + + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"], **kwargs + ) + + if na_filter is False: + expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]] + else: + expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]] + + expected = DataFrame(expected, columns=["Test"]) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"]) + @td.check_file_leaks + def test_unexpected_kwargs_raises(self, read_ext, arg): + # gh-17964 + kwarg = {arg: "Sheet1"} + msg = r"unexpected keyword argument `{}`".format(arg) + + with pd.ExcelFile("test1" + read_ext) as excel: + with pytest.raises(TypeError, match=msg): + pd.read_excel(excel, **kwarg) + + def test_excel_table_sheet_by_index(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. 
+ if read_ext == ".xlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + with pd.ExcelFile("test1" + read_ext) as excel: + df1 = pd.read_excel(excel, 0, index_col=0) + df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + with pd.ExcelFile("test1" + read_ext) as excel: + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + + with pd.ExcelFile("test1" + read_ext) as excel: + df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + with pd.ExcelFile("test1" + read_ext) as excel: + df3 = excel.parse(0, index_col=0, skipfooter=1) + + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_sheet_name(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. + if read_ext == ".xlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + filename = "test1" + sheet_name = "Sheet1" + + with pd.ExcelFile(filename + read_ext) as excel: + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + + with pd.ExcelFile(filename + read_ext) as excel: + df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) + + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + + def test_excel_read_buffer(self, engine, read_ext): + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0, engine=engine) + + with open(pth, "rb") as f: + with pd.ExcelFile(f) as xls: + actual = pd.read_excel(xls, "Sheet1", index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_reader_closes_file(self, engine, read_ext): + f = open("test1" + read_ext, "rb") + with pd.ExcelFile(f) as xlsx: + # parses okay + pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) + + assert f.closed + + def test_conflicting_excel_engines(self, read_ext): + # GH 26566 + msg = "Engine should not be specified when passing an ExcelFile" + + with pd.ExcelFile("test1" + read_ext) as xl: + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, engine="foo") + + def test_excel_read_binary(self, engine, read_ext): + # GH 15914 + expected = pd.read_excel("test1" + read_ext, engine=engine) + + with open("test1" + read_ext, "rb") as f: + data = f.read() + + actual = pd.read_excel(data, engine=engine) + tm.assert_frame_equal(expected, actual) diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_style.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_style.py new file mode 100644 index 0000000..88f4c37 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_style.py @@ -0,0 +1,169 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter +from pandas.io.formats.excel import ExcelFormatter + + +@pytest.mark.parametrize( + "engine", + [ + pytest.param( + "xlwt", + marks=pytest.mark.xfail( + reason="xlwt does not support openpyxl-compatible style dicts" + ), + ), + "xlsxwriter", + "openpyxl", + ], +) +def test_styler_to_excel(engine): + def style(df): + # XXX: RGB colors not supported in xlwt + return DataFrame( + [ + ["font-weight: bold", "", ""], + ["", "color: blue", ""], + ["", "", "text-decoration: 
underline"], + ["border-style: solid", "", ""], + ["", "font-style: italic", ""], + ["", "", "text-align: right"], + ["background-color: red", "", ""], + ["number-format: 0%", "", ""], + ["", "", ""], + ["", "", ""], + ["", "", ""], + ], + index=df.index, + columns=df.columns, + ) + + def assert_equal_style(cell1, cell2, engine): + if engine in ["xlsxwriter", "openpyxl"]: + pytest.xfail( + reason=( + "GH25351: failing on some attribute " + "comparisons in {}".format(engine) + ) + ) + # XXX: should find a better way to check equality + assert cell1.alignment.__dict__ == cell2.alignment.__dict__ + assert cell1.border.__dict__ == cell2.border.__dict__ + assert cell1.fill.__dict__ == cell2.fill.__dict__ + assert cell1.font.__dict__ == cell2.font.__dict__ + assert cell1.number_format == cell2.number_format + assert cell1.protection.__dict__ == cell2.protection.__dict__ + + def custom_converter(css): + # use bold iff there is custom style attached to the cell + if css.strip(" \n;"): + return {"font": {"bold": True}} + return {} + + pytest.importorskip("jinja2") + pytest.importorskip(engine) + + # Prepare spreadsheets + + df = DataFrame(np.random.randn(11, 3)) + with tm.ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path: + writer = ExcelWriter(path, engine=engine) + df.to_excel(writer, sheet_name="frame") + df.style.to_excel(writer, sheet_name="unstyled") + styled = df.style.apply(style, axis=None) + styled.to_excel(writer, sheet_name="styled") + ExcelFormatter(styled, style_converter=custom_converter).write( + writer, sheet_name="custom" + ) + writer.save() + + if engine not in ("openpyxl", "xlsxwriter"): + # For other engines, we only smoke test + return + openpyxl = pytest.importorskip("openpyxl") + wb = openpyxl.load_workbook(path) + + # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled + n_cells = 0 + for col1, col2 in zip(wb["frame"].columns, wb["unstyled"].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + assert cell1.value == cell2.value + assert_equal_style(cell1, cell2, engine) + n_cells += 1 + + # ensure iteration actually happened: + assert n_cells == (11 + 1) * (3 + 1) + + # (2) check styling with default converter + + # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF + alpha = "00" if engine == "openpyxl" else "FF" + + n_cells = 0 + for col1, col2 in zip(wb["frame"].columns, wb["styled"].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + ref = "{cell2.column}{cell2.row:d}".format(cell2=cell2) + # XXX: this isn't as strong a test as ideal; we should + # confirm that differences are exclusive + if ref == "B2": + assert not cell1.font.bold + assert cell2.font.bold + elif ref == "C3": + assert cell1.font.color.rgb != cell2.font.color.rgb + assert cell2.font.color.rgb == alpha + "0000FF" + elif ref == "D4": + assert cell1.font.underline != cell2.font.underline + assert cell2.font.underline == "single" + elif ref == "B5": + assert not cell1.border.left.style + assert ( + cell2.border.top.style + == cell2.border.right.style + == cell2.border.bottom.style + == cell2.border.left.style + == "medium" + ) + elif ref == "C6": + assert not cell1.font.italic + assert cell2.font.italic + elif ref == "D7": + assert cell1.alignment.horizontal != cell2.alignment.horizontal + assert cell2.alignment.horizontal == "right" + elif ref == "B8": + assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb + assert cell1.fill.patternType != cell2.fill.patternType + assert cell2.fill.fgColor.rgb == 
alpha + "FF0000" + assert cell2.fill.patternType == "solid" + elif ref == "B9": + assert cell1.number_format == "General" + assert cell2.number_format == "0%" + else: + assert_equal_style(cell1, cell2, engine) + + assert cell1.value == cell2.value + n_cells += 1 + + assert n_cells == (11 + 1) * (3 + 1) + + # (3) check styling with custom converter + n_cells = 0 + for col1, col2 in zip(wb["frame"].columns, wb["custom"].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + ref = "{cell2.column}{cell2.row:d}".format(cell2=cell2) + if ref in ("B2", "C3", "D4", "B5", "C6", "D7", "B8", "B9"): + assert not cell1.font.bold + assert cell2.font.bold + else: + assert_equal_style(cell1, cell2, engine) + + assert cell1.value == cell2.value + n_cells += 1 + + assert n_cells == (11 + 1) * (3 + 1) diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_writers.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_writers.py new file mode 100644 index 0000000..55b987a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_writers.py @@ -0,0 +1,1273 @@ +from datetime import date, datetime, timedelta +from functools import partial +from io import BytesIO +import os + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, get_option, set_option +import pandas._testing as tm + +from pandas.io.excel import ( + ExcelFile, + ExcelWriter, + _OpenpyxlWriter, + _XlsxWriter, + _XlwtWriter, + register_writer, +) + + +@pytest.fixture +def path(ext): + """ + Fixture to open file for use in each test case. + """ + with tm.ensure_clean(ext) as file_path: + yield file_path + + +@pytest.fixture +def set_engine(engine, ext): + """ + Fixture to set engine for use in each test case. + + Rather than requiring `engine=...` to be provided explicitly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. 
+ """ + option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) + prev_engine = get_option(option_name) + set_option(option_name, engine) + yield + set_option(option_name, prev_engine) # Roll back option change + + +@td.skip_if_no("xlrd") +@pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) +class TestRoundTrip: + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], + ) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) + + with tm.ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = pd.read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], + ) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) + + with tm.ensure_clean(ext) as path: + df.to_excel(path, "with_header", index=False, header=True) + result = pd.read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("openpyxl") + @td.skip_if_no("xlwt") + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) + + with tm.ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, "Data_no_head", header=False, index=False) + refdf.to_excel(writer, "Data_with_head", index=False) + + refdf.columns = ["A", "B"] + + with ExcelFile(pth) as reader: + xlsdf_no_head = pd.read_excel( + reader, "Data_no_head", header=None, names=["A", "B"] + ) + xlsdf_with_head = pd.read_excel( + reader, "Data_with_head", index_col=None, names=["A", "B"] + ) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. 
+ def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with tm.ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in dfs.items(): + df.to_excel(ew, sheetname) + + dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with tm.ensure_clean(ext) as path: + df = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0}, + } + ) + + expected = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0}, + } + ) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame( + { + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + expected = pd.DataFrame( + { + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip( + self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + ): + # see gh-4679 + with tm.ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip( + "Column index name cannot be serialized unless it's a MultiIndex" + ) + + # Empty name case current read in as + # unnamed levels, not Nones. 
+ check_names = r_idx_names or r_idx_levels <= 1 + + df = tm.makeCustomDataframe( + 5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + ) + df.to_excel(pth) + + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) + tm.assert_frame_equal(df, act, check_names=check_names) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} + ) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with tm.ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = pd.read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") + res = pd.read_excel( + pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 + ) + tm.assert_frame_equal(df, res) + + +@td.skip_if_no("xlrd") +@pytest.mark.parametrize( + "engine,ext", + [ + pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), + pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), + pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), + ], +) +@pytest.mark.usefixtures("set_engine") +class TestExcelWriter: + def test_excel_sheet_size(self, path): + + # GH 26080 + breaking_row_count = 2 ** 20 + 1 + breaking_col_count = 2 ** 14 + 1 + # purposely using two arrays to prevent memory issues while testing + row_arr = np.zeros(shape=(breaking_row_count, 1)) + col_arr = np.zeros(shape=(1, breaking_col_count)) + row_df = pd.DataFrame(row_arr) + col_df = pd.DataFrame(col_arr) + + msg = "sheet is too large" + with pytest.raises(ValueError, match=msg): + row_df.to_excel(path) + + with pytest.raises(ValueError, match=msg): + col_df.to_excel(path) + + def test_excel_sheet_by_name_raise(self, path): + import xlrd + + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(path) + + xl = ExcelFile(path) + df = pd.read_excel(xl, 0, index_col=0) + + tm.assert_frame_equal(gt, df) + + with pytest.raises(xlrd.XLRDError): + pd.read_excel(xl, "0") + + def test_excel_writer_context_manager(self, frame, path): + with ExcelWriter(path) as writer: + frame.to_excel(writer, "Data1") + frame2 = frame.copy() + frame2.columns = frame.columns[::-1] + frame2.to_excel(writer, "Data2") + + with ExcelFile(path) as reader: + found_df = pd.read_excel(reader, "Data1", index_col=0) + found_df2 = pd.read_excel(reader, "Data2", index_col=0) + + tm.assert_frame_equal(found_df, frame) + tm.assert_frame_equal(found_df2, frame2) + + def test_roundtrip(self, frame, path): + frame = frame.copy() + frame["A"][:5] = np.nan + + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) + + # test roundtrip + frame.to_excel(path, "test1") 
+ recons = pd.read_excel(path, "test1", index_col=0) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, "test1", index=False) + recons = pd.read_excel(path, "test1", index_col=None) + recons.index = frame.index + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, "test1", na_rep="NA") + recons = pd.read_excel(path, "test1", index_col=0, na_values=["NA"]) + tm.assert_frame_equal(frame, recons) + + # GH 3611 + frame.to_excel(path, "test1", na_rep="88") + recons = pd.read_excel(path, "test1", index_col=0, na_values=["88"]) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, "test1", na_rep="88") + recons = pd.read_excel(path, "test1", index_col=0, na_values=[88, 88.0]) + tm.assert_frame_equal(frame, recons) + + # GH 6573 + frame.to_excel(path, "Sheet1") + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(frame, recons) + + frame.to_excel(path, "0") + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(frame, recons) + + # GH 8825 Pandas Series should provide to_excel method + s = frame["A"] + s.to_excel(path) + recons = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(s.to_frame(), recons) + + def test_mixed(self, frame, path): + mixed_frame = frame.copy() + mixed_frame["foo"] = "bar" + + mixed_frame.to_excel(path, "test1") + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(mixed_frame, recons) + + def test_ts_frame(self, tsframe, path): + df = tsframe + + df.to_excel(path, "test1") + reader = ExcelFile(path) + + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(df, recons) + + def test_basics_with_nan(self, frame, path): + frame = frame.copy() + frame["A"][:5] = np.nan + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) + + @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) + def test_int_types(self, np_type, path): + # Test np.int values read come back as int + # (rather than float which is Excel's format). + df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) + df.to_excel(path, "test1") + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0) + + int_frame = df.astype(np.int64) + tm.assert_frame_equal(int_frame, recons) + + recons2 = pd.read_excel(path, "test1", index_col=0) + tm.assert_frame_equal(int_frame, recons2) + + # Test with convert_float=False comes back as float. + float_frame = df.astype(float) + recons = pd.read_excel(path, "test1", convert_float=False, index_col=0) + tm.assert_frame_equal( + recons, float_frame, check_index_type=False, check_column_type=False + ) + + @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) + def test_float_types(self, np_type, path): + # Test np.float values read come back as float. + df = DataFrame(np.random.random_sample(10), dtype=np_type) + df.to_excel(path, "test1") + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) + + tm.assert_frame_equal(df, recons, check_dtype=False) + + @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) + def test_bool_types(self, np_type, path): + # Test np.bool values read come back as float. 
+ df = DataFrame([1, 0, True, False], dtype=np_type) + df.to_excel(path, "test1") + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) + + tm.assert_frame_equal(df, recons) + + def test_inf_roundtrip(self, path): + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df.to_excel(path, "test1") + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0) + + tm.assert_frame_equal(df, recons) + + def test_sheets(self, frame, tsframe, path): + frame = frame.copy() + frame["A"][:5] = np.nan + + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) + + # Test writing to separate sheets + writer = ExcelWriter(path) + frame.to_excel(writer, "test1") + tsframe.to_excel(writer, "test2") + writer.save() + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(frame, recons) + recons = pd.read_excel(reader, "test2", index_col=0) + tm.assert_frame_equal(tsframe, recons) + assert 2 == len(reader.sheet_names) + assert "test1" == reader.sheet_names[0] + assert "test2" == reader.sheet_names[1] + + def test_colaliases(self, frame, path): + frame = frame.copy() + frame["A"][:5] = np.nan + + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) + + # column aliases + col_aliases = Index(["AA", "X", "Y", "Z"]) + frame.to_excel(path, "test1", header=col_aliases) + reader = ExcelFile(path) + rs = pd.read_excel(reader, "test1", index_col=0) + xp = frame.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + def test_roundtrip_indexlabels(self, merge_cells, frame, path): + frame = frame.copy() + frame["A"][:5] = np.nan + + frame.to_excel(path, "test1") + frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", index=False) + + # test index_label + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] + assert df.index.names == recons.index.names + + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel( + path, + "test1", + index_label=["test", "dummy", "dummy2"], + merge_cells=merge_cells, + ) + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] + assert df.index.names == recons.index.names + + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] + tm.assert_frame_equal(df, recons.astype(bool)) + + frame.to_excel( + path, + "test1", + columns=["A", "B", "C", "D"], + index=False, + merge_cells=merge_cells, + ) + # take 'A' and 'B' as indexes (same row as cols 'C', 'D') + df = frame.copy() + df = df.set_index(["A", "B"]) + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) + tm.assert_frame_equal(df, recons, check_less_precise=True) + + def test_excel_roundtrip_indexname(self, merge_cells, path): + df = DataFrame(np.random.randn(10, 4)) + df.index.name = "foo" + + df.to_excel(path, 
merge_cells=merge_cells) + + xf = ExcelFile(path) + result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) + + tm.assert_frame_equal(result, df) + assert result.index.name == "foo" + + def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): + # datetime.date, not sure what to test here exactly + tsf = tsframe.copy() + + tsf.index = [x.date() for x in tsframe.index] + tsf.to_excel(path, "test1", merge_cells=merge_cells) + + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=0) + + tm.assert_frame_equal(tsframe, recons) + + def test_excel_date_datetime_format(self, engine, ext, path): + # see gh-4133 + # + # Excel output format strings + df = DataFrame( + [ + [date(2014, 1, 31), date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + df_expected = DataFrame( + [ + [datetime(2014, 1, 31), datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + + with tm.ensure_clean(ext) as filename2: + writer1 = ExcelWriter(path) + writer2 = ExcelWriter( + filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS", + ) + + df.to_excel(writer1, "test1") + df.to_excel(writer2, "test1") + + writer1.close() + writer2.close() + + reader1 = ExcelFile(path) + reader2 = ExcelFile(filename2) + + rs1 = pd.read_excel(reader1, "test1", index_col=0) + rs2 = pd.read_excel(reader2, "test1", index_col=0) + + tm.assert_frame_equal(rs1, rs2) + + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. + tm.assert_frame_equal(rs2, df_expected) + + def test_to_excel_interval_no_labels(self, path): + # see gh-19242 + # + # Test writing Interval without labels. + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) + expected = df.copy() + + df["new"] = pd.cut(df[0], 10) + expected["new"] = pd.cut(expected[0], 10).astype(str) + + df.to_excel(path, "test1") + reader = ExcelFile(path) + + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_interval_labels(self, path): + # see gh-19242 + # + # Test writing Interval with labels. + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) + expected = df.copy() + intervals = pd.cut( + df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] + ) + df["new"] = intervals + expected["new"] = pd.Series(list(intervals)) + + df.to_excel(path, "test1") + reader = ExcelFile(path) + + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_timedelta(self, path): + # see gh-19242, gh-9155 + # + # Test writing timedelta to xls. 
+ df = DataFrame( + np.random.randint(-10, 10, size=(20, 1)), columns=["A"], dtype=np.int64 + ) + expected = df.copy() + + df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) + expected["new"] = expected["A"].apply( + lambda x: timedelta(seconds=x).total_seconds() / float(86400) + ) + + df.to_excel(path, "test1") + reader = ExcelFile(path) + + recons = pd.read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_periodindex(self, tsframe, path): + xp = tsframe.resample("M", kind="period").mean() + + xp.to_excel(path, "sht1") + + reader = ExcelFile(path) + rs = pd.read_excel(reader, "sht1", index_col=0) + tm.assert_frame_equal(xp, rs.to_period("M")) + + def test_to_excel_multiindex(self, merge_cells, frame, path): + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + frame.to_excel(path, "test1", header=False) + frame.to_excel(path, "test1", columns=["A", "B"]) + + # round trip + frame.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) + df = pd.read_excel(reader, "test1", index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + + # GH13511 + def test_to_excel_multiindex_nan_label(self, merge_cells, path): + df = pd.DataFrame( + {"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)} + ) + df = df.set_index(["A", "B"]) + + df.to_excel(path, merge_cells=merge_cells) + df1 = pd.read_excel(path, index_col=[0, 1]) + tm.assert_frame_equal(df, df1) + + # Test for Issue 11328. If column indices are integers, make + # sure they are handled correctly for either setting of + # merge_cells + def test_to_excel_multiindex_cols(self, merge_cells, frame, path): + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) + frame.columns = new_cols_index + header = [0, 1] + if not merge_cells: + header = 0 + + # round trip + frame.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) + df = pd.read_excel(reader, "test1", header=header, index_col=[0, 1]) + if not merge_cells: + fm = frame.columns.format(sparsify=False, adjoin=False, names=False) + frame.columns = [".".join(map(str, q)) for q in zip(*fm)] + tm.assert_frame_equal(frame, df) + + def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): + # try multiindex with dates + new_index = [tsframe.index, np.arange(len(tsframe.index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.index.names = ["time", "foo"] + tsframe.to_excel(path, "test1", merge_cells=merge_cells) + reader = ExcelFile(path) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) + + tm.assert_frame_equal(tsframe, recons) + assert recons.index.names == ("time", "foo") + + def test_to_excel_multiindex_no_write_index(self, path): + # Test writing and re-reading a MI without the index. GH 5616. + + # Initial non-MI frame. + frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) + + # Add a MI. + frame2 = frame1.copy() + multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) + frame2.index = multi_index + + # Write out to Excel without the index. + frame2.to_excel(path, "test1", index=False) + + # Read it back in. + reader = ExcelFile(path) + frame3 = pd.read_excel(reader, "test1") + + # Test that it is the same as the initial frame. 
+ tm.assert_frame_equal(frame1, frame3) + + def test_to_excel_float_format(self, path): + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(path, "test1", float_format="%.2f") + + reader = ExcelFile(path) + result = pd.read_excel(reader, "test1", index_col=0) + + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) + + def test_to_excel_output_encoding(self, ext): + # Avoid mixed inferred_type. + df = DataFrame( + [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], + index=["A\u0192", "B"], + columns=["X\u0193", "Y", "Z"], + ) + + with tm.ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: + df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") + result = pd.read_excel(filename, "TestSheet", encoding="utf8", index_col=0) + tm.assert_frame_equal(result, df) + + def test_to_excel_unicode_filename(self, ext, path): + with tm.ensure_clean("\u0192u." + ext) as filename: + try: + f = open(filename, "wb") + except UnicodeEncodeError: + pytest.skip("No unicode file names on this system") + else: + f.close() + + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + df.to_excel(filename, "test1", float_format="%.2f") + + reader = ExcelFile(filename) + result = pd.read_excel(reader, "test1", index_col=0) + + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) + + # FIXME: dont leave commented-out + # def test_to_excel_header_styling_xls(self, engine, ext): + + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 + # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + + # try: + # import xlwt + # import xlrd + # except ImportError: + # pytest.skip + + # filename = '__tmp_to_excel_header_styling_xls__.xls' + # pdf.to_excel(filename, 'test1') + + # wbk = xlrd.open_workbook(filename, + # formatting_info=True) + # assert ["test1"] == wbk.sheet_names() + # ws = wbk.sheet_by_name('test1') + # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells + # for i in range(0, 2): + # for j in range(0, 7): + # xfx = ws.cell_xf_index(0, 0) + # cell_xf = wbk.xf_list[xfx] + # font = wbk.font_list + # assert 1 == font[cell_xf.font_index].bold + # assert 1 == cell_xf.border.top_line_style + # assert 1 == cell_xf.border.right_line_style + # assert 1 == cell_xf.border.bottom_line_style + # assert 1 == cell_xf.border.left_line_style + # assert 2 == cell_xf.alignment.hor_align + # os.remove(filename) + # def test_to_excel_header_styling_xlsx(self, engine, ext): + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 
+ # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + # try: + # import openpyxl + # from openpyxl.cell import get_column_letter + # except ImportError: + # pytest.skip + # if openpyxl.__version__ < '1.6.1': + # pytest.skip + # # test xlsx_styling + # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx' + # pdf.to_excel(filename, 'test1') + # wbk = openpyxl.load_workbook(filename) + # assert ["test1"] == wbk.get_sheet_names() + # ws = wbk.get_sheet_by_name('test1') + # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))] + # xlsaddrs += ["A%s" % i for i in range(1, 6)] + # xlsaddrs += ["B1", "D1", "F1"] + # for xlsaddr in xlsaddrs: + # cell = ws.cell(xlsaddr) + # assert cell.style.font.bold + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.top.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.right.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.bottom.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.left.border_style) + # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER == + # cell.style.alignment.horizontal) + # mergedcells_addrs = ["C1", "E1", "G1"] + # for maddr in mergedcells_addrs: + # assert ws.cell(maddr).merged + # os.remove(filename) + + @pytest.mark.parametrize("use_headers", [True, False]) + @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) + @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) + def test_excel_010_hemstring( + self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path + ): + def roundtrip(data, header=True, parser_hdr=0, index=True): + data.to_excel(path, header=header, merge_cells=merge_cells, index=index) + + xf = ExcelFile(path) + return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) + + # Basic test. + parser_header = 0 if use_headers else None + res = roundtrip(DataFrame([0]), use_headers, parser_header) + + assert res.shape == (1, 2) + assert res.iloc[0, 0] is not np.nan + + # More complex tests with multi-index. + nrows = 5 + ncols = 3 + + # ensure limited functionality in 0.10 + # override of gh-2370 until sorted out in 0.11 + + df = tm.makeCustomDataframe( + nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels + ) + + # This if will be removed once multi-column Excel writing + # is implemented. For now fixing gh-9794. + if c_idx_nlevels > 1: + with pytest.raises(NotImplementedError): + roundtrip(df, use_headers, index=False) + else: + res = roundtrip(df, use_headers) + + if use_headers: + assert res.shape == (nrows, ncols + r_idx_nlevels) + else: + # First row taken as columns. + assert res.shape == (nrows - 1, ncols + r_idx_nlevels) + + # No NaNs. + for r in range(len(res.index)): + for c in range(len(res.columns)): + assert res.iloc[r, c] is not np.nan + + def test_duplicated_columns(self, path): + # see gh-5235 + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) + df.to_excel(path, "test1") + expected = DataFrame( + [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] + ) + + # By default, we mangle. + result = pd.read_excel(path, "test1", index_col=0) + tm.assert_frame_equal(result, expected) + + # Explicitly, we pass in the parameter. 
+ result = pd.read_excel(path, "test1", index_col=0, mangle_dupe_cols=True) + tm.assert_frame_equal(result, expected) + + # see gh-11007, gh-10970 + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) + df.to_excel(path, "test1") + + result = pd.read_excel(path, "test1", index_col=0) + expected = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] + ) + tm.assert_frame_equal(result, expected) + + # see gh-10982 + df.to_excel(path, "test1", index=False, header=False) + result = pd.read_excel(path, "test1", header=None) + + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + tm.assert_frame_equal(result, expected) + + msg = "Setting mangle_dupe_cols=False is not supported yet" + with pytest.raises(ValueError, match=msg): + pd.read_excel(path, "test1", header=None, mangle_dupe_cols=False) + + def test_swapped_columns(self, path): + # Test for issue #5427. + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + write_frame.to_excel(path, "test1", columns=["B", "A"]) + + read_frame = pd.read_excel(path, "test1", header=0) + + tm.assert_series_equal(write_frame["A"], read_frame["A"]) + tm.assert_series_equal(write_frame["B"], read_frame["B"]) + + def test_invalid_columns(self, path): + # see gh-10982 + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + + with pytest.raises(KeyError, match="Not all names specified"): + write_frame.to_excel(path, "test1", columns=["B", "C"]) + + with pytest.raises( + KeyError, match="'passes columns are not ALL present dataframe'" + ): + write_frame.to_excel(path, "test1", columns=["C", "D"]) + + def test_comment_arg(self, path): + # see gh-18735 + # + # Test the comment argument functionality to pd.read_excel. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, "test_c") + + # Read file without comment arg. + result1 = pd.read_excel(path, "test_c", index_col=0) + + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + + result2 = pd.read_excel(path, "test_c", comment="#", index_col=0) + tm.assert_frame_equal(result1, result2) + + def test_comment_default(self, path): + # Re issue #18735 + # Test the comment argument default to pd.read_excel + + # Create file to read in + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, "test_c") + + # Read file with default and explicit comment=None + result1 = pd.read_excel(path, "test_c") + result2 = pd.read_excel(path, "test_c", comment=None) + tm.assert_frame_equal(result1, result2) + + def test_comment_used(self, path): + # see gh-18735 + # + # Test the comment argument is working as expected when used. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(path, "test_c") + + # Test read_frame_comment against manually produced expected output. 
+ expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) + result = pd.read_excel(path, "test_c", comment="#", index_col=0) + tm.assert_frame_equal(result, expected) + + def test_comment_empty_line(self, path): + # Re issue #18735 + # Test that pd.read_excel ignores commented lines at the end of file + + df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) + df.to_excel(path, index=False) + + # Test that all-comment lines at EoF are ignored + expected = DataFrame({"a": [1], "b": [2]}) + result = pd.read_excel(path, comment="#") + tm.assert_frame_equal(result, expected) + + def test_datetimes(self, path): + + # Test writing and reading datetimes. For issue #9139. (xref #9185) + datetimes = [ + datetime(2013, 1, 13, 1, 2, 3), + datetime(2013, 1, 13, 2, 45, 56), + datetime(2013, 1, 13, 4, 29, 49), + datetime(2013, 1, 13, 6, 13, 42), + datetime(2013, 1, 13, 7, 57, 35), + datetime(2013, 1, 13, 9, 41, 28), + datetime(2013, 1, 13, 11, 25, 21), + datetime(2013, 1, 13, 13, 9, 14), + datetime(2013, 1, 13, 14, 53, 7), + datetime(2013, 1, 13, 16, 37, 0), + datetime(2013, 1, 13, 18, 20, 52), + ] + + write_frame = DataFrame({"A": datetimes}) + write_frame.to_excel(path, "Sheet1") + read_frame = pd.read_excel(path, "Sheet1", header=0) + + tm.assert_series_equal(write_frame["A"], read_frame["A"]) + + def test_bytes_io(self, engine): + # see gh-7074 + bio = BytesIO() + df = DataFrame(np.random.randn(10, 2)) + + # Pass engine explicitly, as there is no file path to infer from. + writer = ExcelWriter(bio, engine=engine) + df.to_excel(writer) + writer.save() + + bio.seek(0) + reread_df = pd.read_excel(bio, index_col=0) + tm.assert_frame_equal(df, reread_df) + + def test_write_lists_dict(self, path): + # see gh-8188. + df = DataFrame( + { + "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"], + } + ) + df.to_excel(path, "Sheet1") + read = pd.read_excel(path, "Sheet1", header=0, index_col=0) + + expected = df.copy() + expected.mixed = expected.mixed.apply(str) + expected.numeric = expected.numeric.astype("int64") + + tm.assert_frame_equal(read, expected) + + def test_true_and_false_value_options(self, path): + # see gh-13347 + df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) + expected = df.replace({"foo": True, "bar": False}) + + df.to_excel(path) + read_frame = pd.read_excel( + path, true_values=["foo"], false_values=["bar"], index_col=0 + ) + tm.assert_frame_equal(read_frame, expected) + + def test_freeze_panes(self, path): + # see gh-15160 + expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) + expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) + + result = pd.read_excel(path, index_col=0) + tm.assert_frame_equal(result, expected) + + def test_path_path_lib(self, engine, ext): + df = tm.makeDataFrame() + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) + + def test_path_local_path(self, engine, ext): + df = tm.makeDataFrame() + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) + + def test_merged_cell_custom_objects(self, merge_cells, path): + # see GH-27006 + mi = MultiIndex.from_tuples( + [ + (pd.Period("2018"), pd.Period("2018Q1")), + (pd.Period("2018"), 
pd.Period("2018Q2")), + ] + ) + expected = DataFrame(np.ones((2, 2)), columns=mi) + expected.to_excel(path) + result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) + # need to convert PeriodIndexes to standard Indexes for assert equal + expected.columns.set_levels( + [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], + level=[0, 1], + inplace=True, + ) + expected.index = expected.index.astype(np.float64) + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize("dtype", [None, object]) + def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): + # GH 27008, GH 7056 + tz = tz_aware_fixture + data = pd.Timestamp("2019", tz=tz) + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(path) + + data = data.to_pydatetime() + df = DataFrame([data], dtype=dtype) + with pytest.raises(ValueError, match="Excel does not support"): + df.to_excel(path) + + +class TestExcelWriterEngineTests: + @pytest.mark.parametrize( + "klass,ext", + [ + pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param(_XlwtWriter, ".xls", marks=td.skip_if_no("xlwt")), + ], + ) + def test_ExcelWriter_dispatch(self, klass, ext): + with tm.ensure_clean(ext) as path: + writer = ExcelWriter(path) + if ext == ".xlsx" and td.safe_import("xlsxwriter"): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) + + def test_ExcelWriter_dispatch_raises(self): + with pytest.raises(ValueError, match="No engine"): + ExcelWriter("nothing") + + def test_register_writer(self): + # some awkward mocking to test out dispatch and such actually works + called_save = [] + called_write_cells = [] + + class DummyClass(ExcelWriter): + called_save = False + called_write_cells = False + supported_extensions = ["xlsx", "xls"] + engine = "dummy" + + def save(self): + called_save.append(True) + + def write_cells(self, *args, **kwargs): + called_write_cells.append(True) + + def check_called(func): + func() + assert len(called_save) >= 1 + assert len(called_write_cells) >= 1 + del called_save[:] + del called_write_cells[:] + + with pd.option_context("io.excel.xlsx.writer", "dummy"): + register_writer(DummyClass) + writer = ExcelWriter("something.xlsx") + assert isinstance(writer, DummyClass) + df = tm.makeCustomDataframe(1, 1) + check_called(lambda: df.to_excel("something.xlsx")) + check_called(lambda: df.to_excel("something.xls", engine="dummy")) + + +@td.skip_if_no("xlrd") +@td.skip_if_no("openpyxl") +class TestFSPath: + def test_excelfile_fspath(self): + with tm.ensure_clean("foo.xlsx") as path: + df = DataFrame({"A": [1, 2]}) + df.to_excel(path) + xl = ExcelFile(path) + result = os.fspath(xl) + assert result == path + + def test_excelwriter_fspath(self): + with tm.ensure_clean("foo.xlsx") as path: + writer = ExcelWriter(path) + assert os.fspath(writer) == str(path) diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_xlrd.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlrd.py new file mode 100644 index 0000000..cc7e231 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlrd.py @@ -0,0 +1,43 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.io.excel import ExcelFile + +xlrd = pytest.importorskip("xlrd") +xlwt = pytest.importorskip("xlwt") + + 
+@pytest.fixture(autouse=True) +def skip_ods_and_xlsb_files(read_ext): + if read_ext == ".ods": + pytest.skip("Not valid for xlrd") + if read_ext == ".xlsb": + pytest.skip("Not valid for xlrd") + + +def test_read_xlrd_book(read_ext, frame): + df = frame + + engine = "xlrd" + sheet_name = "SheetA" + + with tm.ensure_clean(read_ext) as pth: + df.to_excel(pth, sheet_name) + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine=engine) as xl: + result = pd.read_excel(xl, sheet_name, index_col=0) + tm.assert_frame_equal(df, result) + + result = pd.read_excel(book, sheet_name=sheet_name, engine=engine, index_col=0) + tm.assert_frame_equal(df, result) + + +# TODO: test for openpyxl as well +def test_excel_table_sheet_by_index(datapath, read_ext): + path = datapath("io", "data", "excel", "test1{}".format(read_ext)) + with pd.ExcelFile(path) as excel: + with pytest.raises(xlrd.XLRDError): + pd.read_excel(excel, "asdf") diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_xlsxwriter.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlsxwriter.py new file mode 100644 index 0000000..b6f7914 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlsxwriter.py @@ -0,0 +1,64 @@ +import warnings + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter + +xlsxwriter = pytest.importorskip("xlsxwriter") + +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) + + +def test_column_format(ext): + # Test that column formats are applied to cells. Test for issue #9167. + # Applicable to xlsxwriter only. + with warnings.catch_warnings(): + # Ignore the openpyxl lxml warning. + warnings.simplefilter("ignore") + openpyxl = pytest.importorskip("openpyxl") + + with tm.ensure_clean(ext) as path: + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) + + writer = ExcelWriter(path) + frame.to_excel(writer) + + # Add a number format to col B and ensure it is applied to cells. + num_format = "#,##0" + write_workbook = writer.book + write_worksheet = write_workbook.worksheets()[0] + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) + writer.save() + + read_workbook = openpyxl.load_workbook(path) + try: + read_worksheet = read_workbook["Sheet1"] + except TypeError: + # compat + read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") + + # Get the number format from the cell. + try: + cell = read_worksheet["B2"] + except TypeError: + # compat + cell = read_worksheet.cell("B2") + + try: + read_num_format = cell.number_format + except AttributeError: + read_num_format = cell.style.number_format._format_code + + assert read_num_format == num_format + + +def test_write_append_mode_raises(ext): + msg = "Append mode is not supported with xlsxwriter!" 
+ + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine="xlsxwriter", mode="a") diff --git a/venv/Lib/site-packages/pandas/tests/io/excel/test_xlwt.py b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlwt.py new file mode 100644 index 0000000..01feab0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/excel/test_xlwt.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter, _XlwtWriter + +xlwt = pytest.importorskip("xlwt") + +pytestmark = pytest.mark.parametrize("ext,", [".xls"]) + + +def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): + # MultiIndex as columns is not yet implemented 9794 + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) + df = DataFrame(np.random.randn(10, 3), columns=cols) + with pytest.raises(NotImplementedError): + with tm.ensure_clean(ext) as path: + df.to_excel(path, index=False) + + +def test_excel_multiindex_columns_and_index_true(ext): + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) + df = pd.DataFrame(np.random.randn(10, 3), columns=cols) + with tm.ensure_clean(ext) as path: + df.to_excel(path, index=True) + + +def test_excel_multiindex_index(ext): + # MultiIndex as index works so assert no error #9794 + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) + df = DataFrame(np.random.randn(3, 10), index=cols) + with tm.ensure_clean(ext) as path: + df.to_excel(path, index=False) + + +def test_to_excel_styleconverter(ext): + hstyle = { + "font": {"bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + } + + xls_style = _XlwtWriter._convert_to_style(hstyle) + assert xls_style.font.bold + assert xlwt.Borders.THIN == xls_style.borders.top + assert xlwt.Borders.THIN == xls_style.borders.right + assert xlwt.Borders.THIN == xls_style.borders.bottom + assert xlwt.Borders.THIN == xls_style.borders.left + assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz + assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert + + +def test_write_append_mode_raises(ext): + msg = "Append mode is not supported with xlwt!" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine="xlwt", mode="a") diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/__init__.py b/venv/Lib/site-packages/pandas/tests/io/formats/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_console.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_console.py new file mode 100644 index 0000000..e56d148 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_console.py @@ -0,0 +1,72 @@ +import locale + +import pytest + +from pandas._config import detect_console_encoding + + +class MockEncoding: # TODO(py27): replace with mock + """ + Used to add a side effect when accessing the 'encoding' property. If the + side effect is a str in nature, the value will be returned. Otherwise, the + side effect should be an exception that will be raised. 
+ """ + + def __init__(self, encoding): + super().__init__() + self.val = encoding + + @property + def encoding(self): + return self.raise_or_return(self.val) + + @staticmethod + def raise_or_return(val): + if isinstance(val, str): + return val + else: + raise val + + +@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]]) +def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled): + # Ensures that when sys.stdout.encoding or sys.stdin.encoding is used when + # they have values filled. + # GH 21552 + with monkeypatch.context() as context: + context.setattr("sys.{}".format(empty), MockEncoding("")) + context.setattr("sys.{}".format(filled), MockEncoding(filled)) + assert detect_console_encoding() == filled + + +@pytest.mark.parametrize("encoding", [AttributeError, IOError, "ascii"]) +def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): + # GH 21552 + with monkeypatch.context() as context: + context.setattr("locale.getpreferredencoding", lambda: "foo") + context.setattr("sys.stdout", MockEncoding(encoding)) + assert detect_console_encoding() == "foo" + + +@pytest.mark.parametrize( + "std,locale", + [ + ["ascii", "ascii"], + ["ascii", locale.Error], + [AttributeError, "ascii"], + [AttributeError, locale.Error], + [IOError, "ascii"], + [IOError, locale.Error], + ], +) +def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): + # When both the stdout/stdin encoding and locale preferred encoding checks + # fail (or return 'ascii', we should default to the sys default encoding. + # GH 21552 + with monkeypatch.context() as context: + context.setattr( + "locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale) + ) + context.setattr("sys.stdout", MockEncoding(std)) + context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding") + assert detect_console_encoding() == "sysDefaultEncoding" diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_css.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_css.py new file mode 100644 index 0000000..7008cef --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_css.py @@ -0,0 +1,232 @@ +import pytest + +import pandas._testing as tm + +from pandas.io.formats.css import CSSResolver, CSSWarning + + +def assert_resolves(css, props, inherited=None): + resolve = CSSResolver() + actual = resolve(css, inherited=inherited) + assert props == actual + + +def assert_same_resolution(css1, css2, inherited=None): + resolve = CSSResolver() + resolved1 = resolve(css1, inherited=inherited) + resolved2 = resolve(css2, inherited=inherited) + assert resolved1 == resolved2 + + +@pytest.mark.parametrize( + "name,norm,abnorm", + [ + ( + "whitespace", + "hello: world; foo: bar", + " \t hello \t :\n world \n ; \n foo: \tbar\n\n", + ), + ("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"), + ("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"), + ("empty-list", "", ";"), + ], +) +def test_css_parse_normalisation(name, norm, abnorm): + assert_same_resolution(norm, abnorm) + + +@pytest.mark.parametrize( + "invalid_css,remainder", + [ + # No colon + ("hello-world", ""), + ("border-style: solid; hello-world", "border-style: solid"), + ( + "border-style: solid; hello-world; font-weight: bold", + "border-style: solid; font-weight: bold", + ), + # Unclosed string fail + # Invalid size + ("font-size: blah", "font-size: 1em"), + ("font-size: 1a2b", "font-size: 1em"), + ("font-size: 1e5pt", "font-size: 1em"), + 
("font-size: 1+6pt", "font-size: 1em"), + ("font-size: 1unknownunit", "font-size: 1em"), + ("font-size: 10", "font-size: 1em"), + ("font-size: 10 pt", "font-size: 1em"), + ], +) +def test_css_parse_invalid(invalid_css, remainder): + with tm.assert_produces_warning(CSSWarning): + assert_same_resolution(invalid_css, remainder) + + # TODO: we should be checking that in other cases no warnings are raised + + +@pytest.mark.parametrize( + "shorthand,expansions", + [ + ("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]), + ("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]), + ( + "border-width", + [ + "border-top-width", + "border-right-width", + "border-bottom-width", + "border-left-width", + ], + ), + ( + "border-color", + [ + "border-top-color", + "border-right-color", + "border-bottom-color", + "border-left-color", + ], + ), + ( + "border-style", + [ + "border-top-style", + "border-right-style", + "border-bottom-style", + "border-left-style", + ], + ), + ], +) +def test_css_side_shorthands(shorthand, expansions): + top, right, bottom, left = expansions + + assert_resolves( + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + ) + + assert_resolves( + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + ) + + assert_resolves( + f"{shorthand}: 1pt 4pt 2pt", + {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"}, + ) + + assert_resolves( + f"{shorthand}: 1pt 4pt 2pt 0pt", + {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, + ) + + with tm.assert_produces_warning(CSSWarning): + assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {}) + + +@pytest.mark.parametrize( + "style,inherited,equiv", + [ + ("margin: 1px; margin: 2px", "", "margin: 2px"), + ("margin: 1px", "margin: 2px", "margin: 1px"), + ("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"), + ( + "margin: 1px; margin-top: 2px", + "", + "margin-left: 1px; margin-right: 1px; " + + "margin-bottom: 1px; margin-top: 2px", + ), + ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), + ("margin: 1px", "margin-top: 2px", "margin: 1px"), + ( + "margin: 1px; margin-top: inherit", + "margin: 2px", + "margin: 1px; margin-top: 2px", + ), + ], +) +def test_css_precedence(style, inherited, equiv): + resolve = CSSResolver() + inherited_props = resolve(inherited) + style_props = resolve(style, inherited=inherited_props) + equiv_props = resolve(equiv) + assert style_props == equiv_props + + +@pytest.mark.parametrize( + "style,equiv", + [ + ( + "margin: 1px; margin-top: inherit", + "margin-bottom: 1px; margin-right: 1px; margin-left: 1px", + ), + ("margin-top: inherit", ""), + ("margin-top: initial", ""), + ], +) +def test_css_none_absent(style, equiv): + assert_same_resolution(style, equiv) + + +@pytest.mark.parametrize( + "size,resolved", + [ + ("xx-small", "6pt"), + ("x-small", f"{7.5:f}pt"), + ("small", f"{9.6:f}pt"), + ("medium", "12pt"), + ("large", f"{13.5:f}pt"), + ("x-large", "18pt"), + ("xx-large", "24pt"), + ("8px", "6pt"), + ("1.25pc", "15pt"), + (".25in", "18pt"), + ("02.54cm", "72pt"), + ("25.4mm", "72pt"), + ("101.6q", "72pt"), + ("101.6q", "72pt"), + ], +) +@pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size +def test_css_absolute_font_size(size, relative_to, resolved): + if relative_to is None: + inherited = None + else: + inherited = {"font-size": relative_to} + assert_resolves( + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, + ) + + 
+@pytest.mark.parametrize( + "size,relative_to,resolved", + [ + ("1em", None, "12pt"), + ("1.0em", None, "12pt"), + ("1.25em", None, "15pt"), + ("1em", "16pt", "16pt"), + ("1.0em", "16pt", "16pt"), + ("1.25em", "16pt", "20pt"), + ("1rem", "16pt", "12pt"), + ("1.0rem", "16pt", "12pt"), + ("1.25rem", "16pt", "15pt"), + ("100%", None, "12pt"), + ("125%", None, "15pt"), + ("100%", "16pt", "16pt"), + ("125%", "16pt", "20pt"), + ("2ex", None, "12pt"), + ("2.0ex", None, "12pt"), + ("2.50ex", None, "15pt"), + ("inherit", "16pt", "16pt"), + ("smaller", None, "10pt"), + ("smaller", "18pt", "15pt"), + ("larger", None, f"{14.4:f}pt"), + ("larger", "15pt", "18pt"), + ], +) +def test_css_relative_font_size(size, relative_to, resolved): + if relative_to is None: + inherited = None + else: + inherited = {"font-size": relative_to} + assert_resolves( + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, + ) diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_eng_formatting.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_eng_formatting.py new file mode 100644 index 0000000..6801316 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_eng_formatting.py @@ -0,0 +1,235 @@ +import numpy as np + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +import pandas.io.formats.format as fmt + + +class TestEngFormatter: + def test_eng_float_formatter(self): + df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) + + fmt.set_eng_float_format() + result = df.to_string() + expected = ( + " A\n" + "0 1.410E+00\n" + "1 141.000E+00\n" + "2 14.100E+03\n" + "3 1.410E+06" + ) + assert result == expected + + fmt.set_eng_float_format(use_eng_prefix=True) + result = df.to_string() + expected = " A\n0 1.410\n1 141.000\n2 14.100k\n3 1.410M" + assert result == expected + + fmt.set_eng_float_format(accuracy=0) + result = df.to_string() + expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06" + assert result == expected + + tm.reset_display_options() + + def compare(self, formatter, input, output): + formatted_input = formatter(input) + assert formatted_input == output + + def compare_all(self, formatter, in_out): + """ + Parameters: + ----------- + formatter: EngFormatter under test + in_out: list of tuples. Each tuple = (number, expected_formatting) + + It is tested if 'formatter(number) == expected_formatting'. + *number* should be >= 0 because formatter(-number) == fmt is also + tested. 
*fmt* is derived from *expected_formatting* + """ + for input, output in in_out: + self.compare(formatter, input, output) + self.compare(formatter, -input, "-" + output[1:]) + + def test_exponents_with_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + f = np.sqrt(2) + in_out = [ + (f * 10 ** -24, " 1.414y"), + (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), + (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), + (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), + (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), + (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), + (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), + (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), + (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), + (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), + (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), + (f * 10 ** -3, " 1.414m"), + (f * 10 ** -2, " 14.142m"), + (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), + (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), + (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), + (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), + (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), + (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), + (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), + (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), + (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), + (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), + (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), + (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), + (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), + (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y"), + ] + self.compare_all(formatter, in_out) + + def test_exponents_without_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + f = np.pi + in_out = [ + (f * 10 ** -24, " 3.1416E-24"), + (f * 10 ** -23, " 31.4159E-24"), + (f * 10 ** -22, " 314.1593E-24"), + (f * 10 ** -21, " 3.1416E-21"), + (f * 10 ** -20, " 31.4159E-21"), + (f * 10 ** -19, " 314.1593E-21"), + (f * 10 ** -18, " 3.1416E-18"), + (f * 10 ** -17, " 31.4159E-18"), + (f * 10 ** -16, " 314.1593E-18"), + (f * 10 ** -15, " 3.1416E-15"), + (f * 10 ** -14, " 31.4159E-15"), + (f * 10 ** -13, " 314.1593E-15"), + (f * 10 ** -12, " 3.1416E-12"), + (f * 10 ** -11, " 31.4159E-12"), + (f * 10 ** -10, " 314.1593E-12"), + (f * 10 ** -9, " 3.1416E-09"), + (f * 10 ** -8, " 31.4159E-09"), + (f * 10 ** -7, " 314.1593E-09"), + (f * 10 ** -6, " 3.1416E-06"), + (f * 10 ** -5, " 31.4159E-06"), + (f * 10 ** -4, " 314.1593E-06"), + (f * 10 ** -3, " 3.1416E-03"), + (f * 10 ** -2, " 31.4159E-03"), + (f * 10 ** -1, " 314.1593E-03"), + (f * 10 ** 0, " 3.1416E+00"), + (f * 10 ** 1, " 31.4159E+00"), + (f * 10 ** 2, " 314.1593E+00"), + (f * 10 ** 3, " 3.1416E+03"), + (f * 10 ** 4, " 31.4159E+03"), + (f * 10 ** 5, " 314.1593E+03"), + (f * 10 ** 6, " 3.1416E+06"), + (f * 10 ** 7, " 31.4159E+06"), + (f * 10 ** 8, " 314.1593E+06"), + (f * 10 ** 9, " 3.1416E+09"), + (f * 10 ** 10, " 31.4159E+09"), + (f * 10 ** 11, " 314.1593E+09"), + (f * 10 ** 12, " 3.1416E+12"), + (f * 10 ** 13, " 31.4159E+12"), + (f * 10 ** 14, " 314.1593E+12"), + (f * 10 ** 15, " 3.1416E+15"), + (f * 10 ** 16, " 31.4159E+15"), + (f * 10 ** 17, " 314.1593E+15"), + (f * 10 ** 18, " 3.1416E+18"), + (f * 10 ** 19, " 31.4159E+18"), + (f * 10 ** 20, " 
314.1593E+18"), + (f * 10 ** 21, " 3.1416E+21"), + (f * 10 ** 22, " 31.4159E+21"), + (f * 10 ** 23, " 314.1593E+21"), + (f * 10 ** 24, " 3.1416E+24"), + (f * 10 ** 25, " 31.4159E+24"), + (f * 10 ** 26, " 314.1593E+24"), + ] + self.compare_all(formatter, in_out) + + def test_rounding(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + in_out = [ + (5.55555, " 5.556"), + (55.5555, " 55.556"), + (555.555, " 555.555"), + (5555.55, " 5.556k"), + (55555.5, " 55.556k"), + (555555, " 555.555k"), + ] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + in_out = [ + (5.55555, " 5.6"), + (55.5555, " 55.6"), + (555.555, " 555.6"), + (5555.55, " 5.6k"), + (55555.5, " 55.6k"), + (555555, " 555.6k"), + ] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + in_out = [ + (5.55555, " 6"), + (55.5555, " 56"), + (555.555, " 556"), + (5555.55, " 6k"), + (55555.5, " 56k"), + (555555, " 556k"), + ] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + result = formatter(0) + assert result == " 0.000" + + def test_nan(self): + # Issue #11981 + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + result = formatter(np.nan) + assert result == "NaN" + + df = pd.DataFrame( + { + "a": [1.5, 10.3, 20.5], + "b": [50.3, 60.67, 70.12], + "c": [100.2, 101.33, 120.33], + } + ) + pt = df.pivot_table(values="a", index="b", columns="c") + fmt.set_eng_float_format(accuracy=1) + result = pt.to_string() + assert "NaN" in result + tm.reset_display_options() + + def test_inf(self): + # Issue #11981 + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + result = formatter(np.inf) + assert result == "inf" diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_format.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_format.py new file mode 100644 index 0000000..9795648 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_format.py @@ -0,0 +1,3279 @@ +""" +Test output formatting for Series/DataFrame, including to_string & reprs +""" + +from datetime import datetime +from io import StringIO +import itertools +from operator import methodcaller +import os +from pathlib import Path +import re +from shutil import get_terminal_size +import sys +import textwrap + +import dateutil +import numpy as np +import pytest +import pytz + +from pandas.compat import is_platform_32bit, is_platform_windows + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + get_option, + option_context, + read_csv, + reset_option, + set_option, +) +import pandas._testing as tm + +import pandas.io.formats.format as fmt +import pandas.io.formats.printing as printing + +use_32bit_repr = is_platform_windows() or is_platform_32bit() + + +@pytest.fixture(params=["string", "pathlike", "buffer"]) +def filepath_or_buffer_id(request): + """ + A fixture yielding test ids for filepath_or_buffer testing. + """ + return request.param + + +@pytest.fixture +def filepath_or_buffer(filepath_or_buffer_id, tmp_path): + """ + A fixture yielding a string representing a filepath, a path-like object + and a StringIO buffer. Also checks that buffer is not closed. 
+ """ + if filepath_or_buffer_id == "buffer": + buf = StringIO() + yield buf + assert not buf.closed + else: + assert isinstance(tmp_path, Path) + if filepath_or_buffer_id == "pathlike": + yield tmp_path / "foo" + else: + yield str(tmp_path / "foo") + + +@pytest.fixture +def assert_filepath_or_buffer_equals( + filepath_or_buffer, filepath_or_buffer_id, encoding +): + """ + Assertion helper for checking filepath_or_buffer. + """ + + def _assert_filepath_or_buffer_equals(expected): + if filepath_or_buffer_id == "string": + with open(filepath_or_buffer, encoding=encoding) as f: + result = f.read() + elif filepath_or_buffer_id == "pathlike": + result = filepath_or_buffer.read_text(encoding=encoding) + elif filepath_or_buffer_id == "buffer": + result = filepath_or_buffer.getvalue() + assert result == expected + + return _assert_filepath_or_buffer_equals + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + +def has_info_repr(df): + r = repr(df) + c1 = r.split("\n")[0].startswith(" + # 2. Index + # 3. Columns + # 4. dtype + # 5. memory usage + # 6. trailing newline + nv = len(r.split("\n")) == 6 + return has_info and nv + + +def has_horizontally_truncated_repr(df): + try: # Check header row + fst_line = np.array(repr(df).splitlines()[0].split()) + cand_col = np.where(fst_line == "...")[0][0] + except IndexError: + return False + # Make sure each row has this ... in the same place + r = repr(df) + for ix, l in enumerate(r.splitlines()): + if not r.split()[cand_col] == "...": + return False + return True + + +def has_vertically_truncated_repr(df): + r = repr(df) + only_dot_row = False + for row in r.splitlines(): + if re.match(r"^[\.\ ]+$", row): + only_dot_row = True + return only_dot_row + + +def has_truncated_repr(df): + return has_horizontally_truncated_repr(df) or has_vertically_truncated_repr(df) + + +def has_doubly_truncated_repr(df): + return has_horizontally_truncated_repr(df) and has_vertically_truncated_repr(df) + + +def has_expanded_repr(df): + r = repr(df) + for line in r.split("\n"): + if line.endswith("\\"): + return True + return False + + +@pytest.mark.filterwarnings("ignore::FutureWarning:.*format") +class TestDataFrameFormatting: + def test_repr_embedded_ndarray(self): + arr = np.empty(10, dtype=[("err", object)]) + for i in range(len(arr)): + arr["err"][i] = np.random.randn(i) + + df = DataFrame(arr) + repr(df["err"]) + repr(df) + df.to_string() + + def test_eng_float_formatter(self, float_frame): + df = float_frame + df.loc[5] = 0 + + fmt.set_eng_float_format() + repr(df) + + fmt.set_eng_float_format(use_eng_prefix=True) + repr(df) + + fmt.set_eng_float_format(accuracy=0) + repr(df) + tm.reset_display_options() + + def test_show_null_counts(self): + + df = DataFrame(1, columns=range(10), index=range(10)) + df.iloc[1, 1] = np.nan + + def check(null_counts, result): + buf = StringIO() + df.info(buf=buf, null_counts=null_counts) + assert ("non-null" in buf.getvalue()) is result + + with option_context( + "display.max_info_rows", 20, "display.max_info_columns", 20 + ): + check(None, True) + check(True, True) + check(False, False) + + with option_context("display.max_info_rows", 5, "display.max_info_columns", 5): + check(None, False) + check(True, False) + check(False, False) + + def test_repr_tuples(self): + buf = StringIO() + + df = DataFrame({"tups": list(zip(range(10), range(10)))}) + repr(df) + df.to_string(col_space=10, buf=buf) + + def test_repr_truncation(self): + max_len = 20 + with option_context("display.max_colwidth", max_len): + df = 
DataFrame( + { + "A": np.random.randn(10), + "B": [ + tm.rands(np.random.randint(max_len - 1, max_len + 1)) + for i in range(10) + ], + } + ) + r = repr(df) + r = r[r.find("\n") + 1 :] + + adj = fmt._get_adjustment() + + for line, value in zip(r.split("\n"), df["B"]): + if adj.len(value) + 1 > max_len: + assert "..." in line + else: + assert "..." not in line + + with option_context("display.max_colwidth", 999999): + assert "..." not in repr(df) + + with option_context("display.max_colwidth", max_len + 2): + assert "..." not in repr(df) + + def test_repr_chop_threshold(self): + df = DataFrame([[0.1, 0.5], [0.5, -0.1]]) + pd.reset_option("display.chop_threshold") # default None + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" + + with option_context("display.chop_threshold", 0.2): + assert repr(df) == " 0 1\n0 0.0 0.5\n1 0.5 0.0" + + with option_context("display.chop_threshold", 0.6): + assert repr(df) == " 0 1\n0 0.0 0.0\n1 0.0 0.0" + + with option_context("display.chop_threshold", None): + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" + + def test_repr_chop_threshold_column_below(self): + # GH 6839: validation case + + df = pd.DataFrame([[10, 20, 30, 40], [8e-10, -1e-11, 2e-9, -2e-11]]).T + + with option_context("display.chop_threshold", 0): + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 -1.000000e-11\n" + "2 30.0 2.000000e-09\n" + "3 40.0 -2.000000e-11" + ) + + with option_context("display.chop_threshold", 1e-8): + assert repr(df) == ( + " 0 1\n" + "0 10.0 0.000000e+00\n" + "1 20.0 0.000000e+00\n" + "2 30.0 0.000000e+00\n" + "3 40.0 0.000000e+00" + ) + + with option_context("display.chop_threshold", 5e-11): + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 0.000000e+00\n" + "2 30.0 2.000000e-09\n" + "3 40.0 0.000000e+00" + ) + + def test_repr_obeys_max_seq_limit(self): + with option_context("display.max_seq_items", 2000): + assert len(printing.pprint_thing(list(range(1000)))) > 1000 + + with option_context("display.max_seq_items", 5): + assert len(printing.pprint_thing(list(range(1000)))) < 100 + + def test_repr_set(self): + assert printing.pprint_thing({1}) == "{1}" + + def test_repr_is_valid_construction_code(self): + # for the case of Index, where the repr is traditional rather then + # stylized + idx = Index(["a", "b"]) + res = eval("pd." + repr(idx)) + tm.assert_series_equal(Series(res), Series(idx)) + + def test_repr_should_return_str(self): + # https://docs.python.org/3/reference/datamodel.html#object.__repr__ + # "...The return value must be a string object." 
+ + # (str on py2.x, str (unicode) on py3) + + data = [8, 5, 3, 5] + index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] + cols = ["\u03c8"] + df = DataFrame(data, columns=cols, index=index1) + assert type(df.__repr__()) == str # both py2 / 3 + + def test_repr_no_backslash(self): + with option_context("mode.sim_interactive", True): + df = DataFrame(np.random.randn(10, 4)) + assert "\\" not in repr(df) + + def test_expand_frame_repr(self): + df_small = DataFrame("hello", index=[0], columns=[0]) + df_wide = DataFrame("hello", index=[0], columns=range(10)) + df_tall = DataFrame("hello", index=range(30), columns=range(5)) + + with option_context("mode.sim_interactive", True): + with option_context( + "display.max_columns", + 10, + "display.width", + 20, + "display.max_rows", + 20, + "display.show_dimensions", + True, + ): + with option_context("display.expand_frame_repr", True): + assert not has_truncated_repr(df_small) + assert not has_expanded_repr(df_small) + assert not has_truncated_repr(df_wide) + assert has_expanded_repr(df_wide) + assert has_vertically_truncated_repr(df_tall) + assert has_expanded_repr(df_tall) + + with option_context("display.expand_frame_repr", False): + assert not has_truncated_repr(df_small) + assert not has_expanded_repr(df_small) + assert not has_horizontally_truncated_repr(df_wide) + assert not has_expanded_repr(df_wide) + assert has_vertically_truncated_repr(df_tall) + assert not has_expanded_repr(df_tall) + + def test_repr_non_interactive(self): + # in non interactive mode, there can be no dependency on the + # result of terminal auto size detection + df = DataFrame("hello", index=range(1000), columns=range(5)) + + with option_context( + "mode.sim_interactive", False, "display.width", 0, "display.max_rows", 5000 + ): + assert not has_truncated_repr(df) + assert not has_expanded_repr(df) + + def test_repr_truncates_terminal_size(self, monkeypatch): + # see gh-21180 + + terminal_size = (118, 96) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) + + index = range(5) + columns = pd.MultiIndex.from_tuples( + [ + ("This is a long title with > 37 chars.", "cat"), + ("This is a loooooonger title with > 43 chars.", "dog"), + ] + ) + df = pd.DataFrame(1, index=index, columns=columns) + + result = repr(df) + + h1, h2 = result.split("\n")[:2] + assert "long" in h1 + assert "loooooonger" in h1 + assert "cat" in h2 + assert "dog" in h2 + + # regular columns + df2 = pd.DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) + result = repr(df2) + + assert df2.columns[0] in result.split("\n")[0] + + def test_repr_truncates_terminal_size_full(self, monkeypatch): + # GH 22984 ensure entire window is filled + terminal_size = (80, 24) + df = pd.DataFrame(np.random.rand(1, 7)) + + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) + assert "..." not in str(df) + + def test_repr_truncation_column_size(self): + # dataframe with last column very wide -> check it is not used to + # determine size of truncation (...) column + df = pd.DataFrame( + { + "a": [108480, 30830], + "b": [12345, 12345], + "c": [12345, 12345], + "d": [12345, 12345], + "e": ["a" * 50] * 2, + } + ) + assert "..." in str(df) + assert " ... 
" not in str(df) + + def test_repr_max_columns_max_rows(self): + term_width, term_height = get_terminal_size() + if term_width < 10 or term_height < 10: + pytest.skip(f"terminal size too small, {term_width} x {term_height}") + + def mkframe(n): + index = [f"{i:05d}" for i in range(n)] + return DataFrame(0, index, index) + + df6 = mkframe(6) + df10 = mkframe(10) + with option_context("mode.sim_interactive", True): + with option_context("display.width", term_width * 2): + with option_context("display.max_rows", 5, "display.max_columns", 5): + assert not has_expanded_repr(mkframe(4)) + assert not has_expanded_repr(mkframe(5)) + assert not has_expanded_repr(df6) + assert has_doubly_truncated_repr(df6) + + with option_context("display.max_rows", 20, "display.max_columns", 10): + # Out off max_columns boundary, but no extending + # since not exceeding width + assert not has_expanded_repr(df6) + assert not has_truncated_repr(df6) + + with option_context("display.max_rows", 9, "display.max_columns", 10): + # out vertical bounds can not result in expanded repr + assert not has_expanded_repr(df10) + assert has_vertically_truncated_repr(df10) + + # width=None in terminal, auto detection + with option_context( + "display.max_columns", + 100, + "display.max_rows", + term_width * 20, + "display.width", + None, + ): + df = mkframe((term_width // 7) - 2) + assert not has_expanded_repr(df) + df = mkframe((term_width // 7) + 2) + printing.pprint_thing(df._repr_fits_horizontal_()) + assert has_expanded_repr(df) + + def test_repr_min_rows(self): + df = pd.DataFrame({"a": range(20)}) + + # default setting no truncation even if above min_rows + assert ".." not in repr(df) + assert ".." not in df._repr_html_() + + df = pd.DataFrame({"a": range(61)}) + + # default of max_rows 60 triggers truncation if above + assert ".." in repr(df) + assert ".." in df._repr_html_() + + with option_context("display.max_rows", 10, "display.min_rows", 4): + # truncated after first two rows + assert ".." in repr(df) + assert "2 " not in repr(df) + assert "..." in df._repr_html_() + assert "
255{40 + h}31
'.format(unit=unit) + assert expected in h + + +def test_html_repr_min_rows_default(datapath): + # gh-27991 + + # default setting no truncation even if above min_rows + df = pd.DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected + + # default of max_rows 60 triggers truncation if above + df = pd.DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + + +@pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], +) +def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): + # gh-27991 + + df = pd.DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_to_latex.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_latex.py new file mode 100644 index 0000000..bd68103 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_latex.py @@ -0,0 +1,884 @@ +import codecs +from datetime import datetime + +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestToLatex: + def test_to_latex_filename(self, float_frame): + with tm.ensure_clean("test.tex") as path: + float_frame.to_latex(path) + + with open(path, "r") as f: + assert float_frame.to_latex() == f.read() + + # test with utf-8 and encoding option (GH 7061) + df = DataFrame([["au\xdfgangen"]]) + with tm.ensure_clean("test.tex") as path: + df.to_latex(path, encoding="utf-8") + with codecs.open(path, "r", encoding="utf-8") as f: + assert df.to_latex() == f.read() + + # test with utf-8 without encoding option + with tm.ensure_clean("test.tex") as path: + df.to_latex(path) + with codecs.open(path, "r", encoding="utf-8") as f: + assert df.to_latex() == f.read() + + def test_to_latex(self, float_frame): + # it works! 
+ float_frame.to_latex() + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex() + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule + a & b \\ +\midrule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withoutindex_result == withoutindex_expected + + def test_to_latex_format(self, float_frame): + # GH Bug #9402 + float_frame.to_latex(column_format="ccc") + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(column_format="ccc") + withindex_expected = r"""\begin{tabular}{ccc} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + def test_to_latex_empty(self): + df = DataFrame() + result = df.to_latex() + expected = r"""\begin{tabular}{l} +\toprule +Empty DataFrame +Columns: Index([], dtype='object') +Index: Index([], dtype='object') \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + result = df.to_latex(longtable=True) + expected = r"""\begin{longtable}{l} +\toprule +Empty DataFrame +Columns: Index([], dtype='object') +Index: Index([], dtype='object') \\ +\end{longtable} +""" + assert result == expected + + def test_to_latex_with_formatters(self): + df = DataFrame( + { + "datetime64": [ + datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3), + ], + "float": [1.0, 2.0, 3.0], + "int": [1, 2, 3], + "object": [(1, 2), True, False], + } + ) + + formatters = { + "datetime64": lambda x: x.strftime("%Y-%m"), + "float": lambda x: "[{x: 4.1f}]".format(x=x), + "int": lambda x: "0x{x:x}".format(x=x), + "object": lambda x: "-{x!s}-".format(x=x), + "__index__": lambda x: "index: {x}".format(x=x), + } + result = df.to_latex(formatters=dict(formatters)) + + expected = r"""\begin{tabular}{llrrl} +\toprule +{} & datetime64 & float & int & object \\ +\midrule +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + def test_to_latex_multiindex(self): + df = DataFrame({("x", "y"): ["a"]}) + result = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & x \\ +{} & y \\ +\midrule +0 & a \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + result = df.T.to_latex() + expected = r"""\begin{tabular}{lll} +\toprule + & & 0 \\ +\midrule +x & y & a \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + df = DataFrame.from_dict( + { + ("c1", 0): pd.Series({x: x for x in range(4)}), + ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c2", 0): pd.Series({x: x for x in range(4)}), + ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c3", 0): pd.Series({x: x for x in range(4)}), + } + ).T + result = df.to_latex() + expected = r"""\begin{tabular}{llrrrr} +\toprule + & & 0 & 1 & 2 & 3 \\ +\midrule +c1 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c2 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ +c3 & 0 & 0 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + # GH 14184 + df = df.T + df.columns.names = ["a", "b"] + result = df.to_latex() + expected = r"""\begin{tabular}{lrrrrr} +\toprule +a & \multicolumn{2}{l}{c1} & 
\multicolumn{2}{l}{c2} & c3 \\ +b & 0 & 1 & 0 & 1 & 0 \\ +\midrule +0 & 0 & 4 & 0 & 4 & 0 \\ +1 & 1 & 5 & 1 & 5 & 1 \\ +2 & 2 & 6 & 2 & 6 & 2 \\ +3 & 3 & 7 & 3 & 7 & 3 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + # GH 10660 + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + result = df.set_index(["a", "b"]).to_latex() + expected = r"""\begin{tabular}{llr} +\toprule + & & c \\ +a & b & \\ +\midrule +0 & a & 1 \\ + & b & 2 \\ +1 & a & 3 \\ + & b & 4 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + result = df.groupby("a").describe().to_latex() + expected = r"""\begin{tabular}{lrrrrrrrr} +\toprule +{} & \multicolumn{8}{l}{c} \\ +{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ +a & & & & & & & & \\ +\midrule +0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ +1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ +\bottomrule +\end{tabular} +""" + + assert result == expected + + def test_to_latex_multiindex_dupe_level(self): + # see gh-14484 + # + # If an index is repeated in subsequent rows, it should be + # replaced with a blank in the created table. This should + # ONLY happen if all higher order indices (to the left) are + # equal too. In this test, 'c' has to be printed both times + # because the higher order index 'A' != 'B'. + df = pd.DataFrame( + index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] + ) + result = df.to_latex() + expected = r"""\begin{tabular}{lll} +\toprule + & & col \\ +\midrule +A & c & NaN \\ +B & c & NaN \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + def test_to_latex_multicolumnrow(self): + df = pd.DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) + result = df.to_latex() + expected = r"""\begin{tabular}{lrrrrr} +\toprule +{} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ +{} & 0 & 1 & 0 & 1 & 0 \\ +\midrule +0 & 0 & 5 & 0 & 5 & 0 \\ +1 & 1 & 6 & 1 & 6 & 1 \\ +2 & 2 & 7 & 2 & 7 & 2 \\ +3 & 3 & 8 & 3 & 8 & 3 \\ +4 & 4 & 9 & 4 & 9 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + result = df.to_latex(multicolumn=False) + expected = r"""\begin{tabular}{lrrrrr} +\toprule +{} & c1 & & c2 & & c3 \\ +{} & 0 & 1 & 0 & 1 & 0 \\ +\midrule +0 & 0 & 5 & 0 & 5 & 0 \\ +1 & 1 & 6 & 1 & 6 & 1 \\ +2 & 2 & 7 & 2 & 7 & 2 \\ +3 & 3 & 8 & 3 & 8 & 3 \\ +4 & 4 & 9 & 4 & 9 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + result = df.T.to_latex(multirow=True) + expected = r"""\begin{tabular}{llrrrrr} +\toprule + & & 0 & 1 & 2 & 3 & 4 \\ +\midrule +\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ +\bottomrule +\end{tabular} +""" + assert result == expected + + df.index = df.T.index + result = df.T.to_latex(multirow=True, multicolumn=True, multicolumn_format="c") + expected = r"""\begin{tabular}{llrrrrr} +\toprule + & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ + & & 0 & 1 & 0 & 1 & 0 \\ +\midrule +\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ +\cline{1-7} +c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ +\bottomrule +\end{tabular} +""" + assert 
result == expected + + def test_to_latex_escape(self): + a = "a" + b = "b" + + test_dict = {"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}} + + unescaped_result = DataFrame(test_dict).to_latex(escape=False) + escaped_result = DataFrame(test_dict).to_latex() # default: escape=True + + unescaped_expected = r"""\begin{tabular}{lll} +\toprule +{} & co$e^x$ & co^l1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +""" + + escaped_expected = r"""\begin{tabular}{lll} +\toprule +{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +""" + + assert unescaped_result == unescaped_expected + assert escaped_result == escaped_expected + + def test_to_latex_special_escape(self): + df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"]) + + escaped_result = df.to_latex() + escaped_expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & a\textbackslash b\textbackslash c \\ +1 & \textasciicircum a\textasciicircum b\textasciicircum c \\ +2 & \textasciitilde a\textasciitilde b\textasciitilde c \\ +\bottomrule +\end{tabular} +""" + assert escaped_result == escaped_expected + + def test_to_latex_longtable(self): + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(longtable=True) + withindex_expected = r"""\begin{longtable}{lrl} +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False, longtable=True) + withoutindex_expected = r"""\begin{longtable}{rl} +\toprule + a & b \\ +\midrule +\endhead +\midrule +\multicolumn{2}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot + 1 & b1 \\ + 2 & b2 \\ +\end{longtable} +""" + + assert withoutindex_result == withoutindex_expected + + df = DataFrame({"a": [1, 2]}) + with1column_result = df.to_latex(index=False, longtable=True) + assert r"\multicolumn{1}" in with1column_result + + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + with3columns_result = df.to_latex(index=False, longtable=True) + assert r"\multicolumn{3}" in with3columns_result + + def test_to_latex_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{table/tabular} environment" + the_label = "tab:table_tabular" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(caption=the_caption) + + expected_c = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = df.to_latex(label=the_label) + + expected_l = r"""\begin{table} +\centering +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(caption=the_caption, label=the_label) + + expected_cl = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ 
+\bottomrule +\end{tabular} +\end{table} +""" + assert result_cl == expected_cl + + def test_to_latex_longtable_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{longtable} environment" + the_label = "tab:longtable" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(longtable=True, caption=the_caption) + + expected_c = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = df.to_latex(longtable=True, label=the_label) + + expected_l = r"""\begin{longtable}{lrl} +\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) + + expected_cl = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_cl == expected_cl + + def test_to_latex_escape_special_chars(self): + special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] + df = DataFrame(data=special_characters) + observed = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & \& \\ +1 & \% \\ +2 & \$ \\ +3 & \# \\ +4 & \_ \\ +5 & \{ \\ +6 & \} \\ +7 & \textasciitilde \\ +8 & \textasciicircum \\ +9 & \textbackslash \\ +\bottomrule +\end{tabular} +""" + + assert observed == expected + + def test_to_latex_no_header(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(header=False) + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(index=False, header=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withoutindex_result == withoutindex_expected + + def test_to_latex_specified_header(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(header=["AA", "BB"]) + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & AA & BB \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule +AA & BB \\ +\midrule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withoutindex_result == withoutindex_expected + + withoutescape_result = df.to_latex(header=["$A$", "$B$"], escape=False) + withoutescape_expected = r"""\begin{tabular}{lrl} +\toprule +{} & $A$ & $B$ \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert 
withoutescape_result == withoutescape_expected + + with pytest.raises(ValueError): + df.to_latex(header=["A"]) + + def test_to_latex_decimal(self, float_frame): + # GH 12031 + float_frame.to_latex() + + df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(decimal=",") + + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1,0 & b1 \\ +1 & 2,1 & b2 \\ +\bottomrule +\end{tabular} +""" + + assert withindex_result == withindex_expected + + def test_to_latex_series(self): + s = Series(["a", "b", "c"]) + withindex_result = s.to_latex() + withindex_expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & a \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + assert withindex_result == withindex_expected + + def test_to_latex_bold_rows(self): + # GH 16707 + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + observed = df.to_latex(bold_rows=True) + expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +\textbf{0} & 1 & b1 \\ +\textbf{1} & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected + + def test_to_latex_no_bold_rows(self): + # GH 16707 + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + observed = df.to_latex(bold_rows=False) + expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected + + @pytest.mark.parametrize("name0", [None, "named0"]) + @pytest.mark.parametrize("name1", [None, "named1"]) + @pytest.mark.parametrize("axes", [[0], [1], [0, 1]]) + def test_to_latex_multiindex_names(self, name0, name1, axes): + # GH 18667 + names = [name0, name1] + mi = pd.MultiIndex.from_product([[1, 2], [3, 4]]) + df = pd.DataFrame(-1, index=mi.copy(), columns=mi.copy()) + for idx in axes: + df.axes[idx].names = names + + idx_names = tuple(n or "{}" for n in names) + idx_names_row = ( + "{idx_names[0]} & {idx_names[1]} & & & & \\\\\n".format( + idx_names=idx_names + ) + if (0 in axes and any(names)) + else "" + ) + placeholder = "{}" if any(names) and 1 in axes else " " + col_names = [n if (bool(n) and 1 in axes) else placeholder for n in names] + observed = df.to_latex() + expected = r"""\begin{tabular}{llrrrr} +\toprule + & %s & \multicolumn{2}{l}{1} & \multicolumn{2}{l}{2} \\ + & %s & 3 & 4 & 3 & 4 \\ +%s\midrule +1 & 3 & -1 & -1 & -1 & -1 \\ + & 4 & -1 & -1 & -1 & -1 \\ +2 & 3 & -1 & -1 & -1 & -1 \\ + & 4 & -1 & -1 & -1 & -1 \\ +\bottomrule +\end{tabular} +""" % tuple( + list(col_names) + [idx_names_row] + ) + assert observed == expected + + @pytest.mark.parametrize("one_row", [True, False]) + def test_to_latex_multiindex_nans(self, one_row): + # GH 14249 + df = pd.DataFrame({"a": [None, 1], "b": [2, 3], "c": [4, 5]}) + if one_row: + df = df.iloc[[0]] + observed = df.set_index(["a", "b"]).to_latex() + expected = r"""\begin{tabular}{llr} +\toprule + & & c \\ +a & b & \\ +\midrule +NaN & 2 & 4 \\ +""" + if not one_row: + expected += r"""1.0 & 3 & 5 \\ +""" + expected += r"""\bottomrule +\end{tabular} +""" + assert observed == expected + + def test_to_latex_non_string_index(self): + # GH 19981 + observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex() + expected = r"""\begin{tabular}{llr} +\toprule + & & 2 \\ +0 & 1 & \\ +\midrule +1 & 2 & 3 \\ + & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected + + def test_to_latex_midrule_location(self): + # GH 18326 + df = pd.DataFrame({"a": [1, 2]}) + df.index.name = "foo" + observed = 
df.to_latex(index_names=False) + expected = r"""\begin{tabular}{lr} +\toprule +{} & a \\ +\midrule +0 & 1 \\ +1 & 2 \\ +\bottomrule +\end{tabular} +""" + + assert observed == expected + + def test_to_latex_multiindex_empty_name(self): + # GH 18669 + mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) + df = pd.DataFrame(-1, index=mi, columns=range(4)) + observed = df.to_latex() + expected = r"""\begin{tabular}{lrrrr} +\toprule + & 0 & 1 & 2 & 3 \\ +{} & & & & \\ +\midrule +1 & -1 & -1 & -1 & -1 \\ +2 & -1 & -1 & -1 & -1 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected + + def test_to_latex_float_format_no_fixed_width(self): + + # GH 21625 + df = DataFrame({"x": [0.19999]}) + expected = r"""\begin{tabular}{lr} +\toprule +{} & x \\ +\midrule +0 & 0.200 \\ +\bottomrule +\end{tabular} +""" + assert df.to_latex(float_format="%.3f") == expected + + # GH 22270 + df = DataFrame({"x": [100.0]}) + expected = r"""\begin{tabular}{lr} +\toprule +{} & x \\ +\midrule +0 & 100 \\ +\bottomrule +\end{tabular} +""" + assert df.to_latex(float_format="%.0f") == expected + + def test_to_latex_multindex_header(self): + # GH 16718 + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + observed = df.to_latex(header=["r1", "r2"]) + expected = r"""\begin{tabular}{llrr} +\toprule + & & r1 & r2 \\ +a & b & & \\ +\midrule +0 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected diff --git a/venv/Lib/site-packages/pandas/tests/io/formats/test_to_markdown.py b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_markdown.py new file mode 100644 index 0000000..8893e42 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/formats/test_to_markdown.py @@ -0,0 +1,55 @@ +from io import StringIO + +import pytest + +import pandas as pd + +pytest.importorskip("tabulate") + + +def test_simple(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf) + result = buf.getvalue() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_other_tablefmt(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, tablefmt="jira") + result = buf.getvalue() + assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + + +def test_other_headers(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, headers=["foo", "bar"]) + result = buf.getvalue() + assert result == ( + "| foo | bar |\n|------:|------:|\n| 0 " + "| 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_series(): + buf = StringIO() + s = pd.Series([1, 2, 3], name="foo") + s.to_markdown(buf=buf) + result = buf.getvalue() + assert result == ( + "| | foo |\n|---:|------:|\n| 0 | 1 " + "|\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_no_buf(capsys): + df = pd.DataFrame([1, 2, 3]) + result = df.to_markdown() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) diff --git a/venv/Lib/site-packages/pandas/tests/io/generate_legacy_storage_files.py b/venv/Lib/site-packages/pandas/tests/io/generate_legacy_storage_files.py new file mode 100644 index 0000000..6ef0e04 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/generate_legacy_storage_files.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python + +""" +self-contained to write legacy storage pickle files + +To use this script. Create an environment where you want +generate pickles, say its for 0.20.3, with your pandas clone +in ~/pandas + +. 
activate pandas_0.20.3 +cd ~/ + +$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \ + pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ pickle + +This script generates a storage file for the current arch, system, +and python version + pandas version: 0.20.3 + output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ + storage format: pickle +created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle + +The idea here is you are using the *current* version of the +generate_legacy_storage_files with an *older* version of pandas to +generate a pickle file. We will then check this file into a current +branch, and test using test_pickle.py. This will load the *older* +pickles and test versus the current data that is generated +(with master). These are then compared. + +If we have cases where we changed the signature (e.g. we renamed +offset -> freq in Timestamp). Then we have to conditionally execute +in the generate_legacy_storage_files.py to make it +run under the older AND the newer version. + +""" + +from datetime import timedelta +from distutils.version import LooseVersion +import os +import pickle +import platform as pl +import sys + +import numpy as np + +import pandas +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + NaT, + Period, + RangeIndex, + Series, + Timestamp, + bdate_range, + date_range, + period_range, + timedelta_range, +) + +from pandas.tseries.offsets import ( + FY5253, + BusinessDay, + BusinessHour, + CustomBusinessDay, + DateOffset, + Day, + Easter, + Hour, + LastWeekOfMonth, + Minute, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) + +try: + # TODO: remove try/except when 0.24.0 is the legacy version. 
+ from pandas.arrays import SparseArray +except ImportError: + from pandas.core.sparse.api import SparseArray + + +_loose_version = LooseVersion(pandas.__version__) + + +def _create_sp_series(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + bseries = Series(SparseArray(arr, kind="block")) + bseries.name = "bseries" + return bseries + + +def _create_sp_tsseries(): + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + arr[7:12] = nan + arr[-1:] = nan + + date_index = bdate_range("1/1/2011", periods=len(arr)) + bseries = Series(SparseArray(arr, kind="block"), index=date_index) + bseries.name = "btsseries" + return bseries + + +def _create_sp_frame(): + nan = np.nan + + data = { + "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C": np.arange(10).astype(np.int64), + "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } + + dates = bdate_range("1/1/2011", periods=10) + return DataFrame(data, index=dates).apply(SparseArray) + + +def create_data(): + """ create the pickle data """ + + data = { + "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], + } + + scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M")) + + index = dict( + int=Index(np.arange(10)), + date=date_range("20130101", periods=10), + period=period_range("2013-01-01", freq="M", periods=10), + float=Index(np.arange(10, dtype=np.float64)), + uint=Index(np.arange(10, dtype=np.uint64)), + timedelta=timedelta_range("00:00:00", freq="30T", periods=10), + ) + + index["range"] = RangeIndex(10) + + if _loose_version >= LooseVersion("0.21"): + from pandas import interval_range + + index["interval"] = interval_range(0, periods=10) + + mi = dict( + reg2=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ), + names=["first", "second"], + ) + ) + + series = dict( + float=Series(data["A"]), + int=Series(data["B"]), + mixed=Series(data["E"]), + ts=Series( + np.arange(10).astype(np.int64), index=date_range("20130101", periods=10) + ), + mi=Series( + np.arange(5).astype(np.float64), + index=MultiIndex.from_tuples( + tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] + ), + ), + dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), + cat=Series(Categorical(["foo", "bar", "baz"])), + dt=Series(date_range("20130101", periods=5)), + dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")), + period=Series([Period("2000Q1")] * 5), + ) + + mixed_dup_df = DataFrame(data) + mixed_dup_df.columns = list("ABCDA") + frame = dict( + float=DataFrame({"A": series["float"], "B": series["float"] + 1}), + int=DataFrame({"A": series["int"], "B": series["int"] + 1}), + mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}), + mi=DataFrame( + {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)}, + index=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "baz"], + ["one", "two", "one", "two", "three"], + ] + ) + ), + names=["first", "second"], + ), + ), + dup=DataFrame( + np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] + ), + cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}), + cat_and_float=DataFrame( + { + "A": Categorical(["foo", 
"bar", "baz"]), + "B": np.arange(3).astype(np.int64), + } + ), + mixed_dup=mixed_dup_df, + dt_mixed_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ), + dt_mixed2_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + "C": Timestamp("20130603", tz="UTC"), + }, + index=range(5), + ), + ) + + cat = dict( + int8=Categorical(list("abcdefg")), + int16=Categorical(np.arange(1000)), + int32=Categorical(np.arange(10000)), + ) + + timestamp = dict( + normal=Timestamp("2011-01-01"), + nat=NaT, + tz=Timestamp("2011-01-01", tz="US/Eastern"), + ) + + timestamp["freq"] = Timestamp("2011-01-01", freq="D") + timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M") + + off = { + "DateOffset": DateOffset(years=1), + "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824), + "BusinessDay": BusinessDay(offset=timedelta(seconds=9)), + "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"), + "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"), + "SemiMonthBegin": SemiMonthBegin(day_of_month=9), + "SemiMonthEnd": SemiMonthEnd(day_of_month=24), + "MonthBegin": MonthBegin(1), + "MonthEnd": MonthEnd(1), + "QuarterBegin": QuarterBegin(1), + "QuarterEnd": QuarterEnd(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "YearEnd": YearEnd(1), + "Week": Week(1), + "Week_Tues": Week(2, normalize=False, weekday=1), + "WeekOfMonth": WeekOfMonth(week=3, weekday=4), + "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3), + "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"), + "Easter": Easter(), + "Hour": Hour(1), + "Minute": Minute(1), + } + + return dict( + series=series, + frame=frame, + index=index, + scalars=scalars, + mi=mi, + sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), + sp_frame=dict(float=_create_sp_frame()), + cat=cat, + timestamp=timestamp, + offsets=off, + ) + + +def create_pickle_data(): + data = create_data() + + return data + + +def platform_name(): + return "_".join( + [ + str(pandas.__version__), + str(pl.machine()), + str(pl.system().lower()), + str(pl.python_version()), + ] + ) + + +def write_legacy_pickles(output_dir): + + version = pandas.__version__ + + print( + "This script generates a storage file for the current arch, system, " + "and python version" + ) + print(" pandas version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + print(" storage format: pickle") + + pth = "{0}.pickle".format(platform_name()) + + fh = open(os.path.join(output_dir, pth), "wb") + pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) + fh.close() + + print("created pickle file: {pth}".format(pth=pth)) + + +def write_legacy_file(): + # force our cwd to be the first searched + sys.path.insert(0, ".") + + if not (3 <= len(sys.argv) <= 4): + exit( + "Specify output directory and storage type: generate_legacy_" + "storage_files.py " + ) + + output_dir = str(sys.argv[1]) + storage_type = str(sys.argv[2]) + + if storage_type == "pickle": + write_legacy_pickles(output_dir=output_dir) + else: + exit("storage_type must be one of {'pickle'}") + + +if __name__ == "__main__": + write_legacy_file() diff --git a/venv/Lib/site-packages/pandas/tests/io/json/__init__.py b/venv/Lib/site-packages/pandas/tests/io/json/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/io/json/conftest.py b/venv/Lib/site-packages/pandas/tests/io/json/conftest.py new file mode 
100644 index 0000000..4e848cd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/json/conftest.py @@ -0,0 +1,9 @@ +import pytest + + +@pytest.fixture(params=["split", "records", "index", "columns", "values"]) +def orient(request): + """ + Fixture for orients excluding the table format. + """ + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/io/json/test_compression.py b/venv/Lib/site-packages/pandas/tests/io/json/test_compression.py new file mode 100644 index 0000000..182c21e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/json/test_compression.py @@ -0,0 +1,113 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +def test_compression_roundtrip(compression): + df = pd.DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + + with tm.ensure_clean() as path: + df.to_json(path, compression=compression) + tm.assert_frame_equal(df, pd.read_json(path, compression=compression)) + + # explicitly ensure file was compressed. + with tm.decompress_file(path, compression) as fh: + result = fh.read().decode("utf8") + tm.assert_frame_equal(df, pd.read_json(result)) + + +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") + uncompressed_df = pd.read_json(uncompressed_path) + + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") + compressed_df = pd.read_json(compressed_path, compression="zip") + + tm.assert_frame_equal(uncompressed_df, compressed_df) + + +@td.skip_if_not_us_locale +def test_with_s3_url(compression, s3_resource): + # Bucket "pandas-test" created in tests/io/conftest.py + + df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') + + with tm.ensure_clean() as path: + df.to_json(path, compression=compression) + with open(path, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) + + roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) + tm.assert_frame_equal(df, roundtripped_df) + + +def test_lines_with_compression(compression): + + with tm.ensure_clean() as path: + df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') + df.to_json(path, orient="records", lines=True, compression=compression) + roundtripped_df = pd.read_json(path, lines=True, compression=compression) + tm.assert_frame_equal(df, roundtripped_df) + + +def test_chunksize_with_compression(compression): + + with tm.ensure_clean() as path: + df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') + df.to_json(path, orient="records", lines=True, compression=compression) + + res = pd.read_json(path, lines=True, chunksize=1, compression=compression) + roundtripped_df = pd.concat(res) + tm.assert_frame_equal(df, roundtripped_df) + + +def test_write_unsupported_compression_type(): + df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') + with tm.ensure_clean() as path: + msg = "Unrecognized compression type: unsupported" + with pytest.raises(ValueError, match=msg): + df.to_json(path, compression="unsupported") + + +def test_read_unsupported_compression_type(): + with tm.ensure_clean() as path: + msg = "Unrecognized compression type: unsupported" + with pytest.raises(ValueError, match=msg): + pd.read_json(path, compression="unsupported") + + +@pytest.mark.parametrize("to_infer", [True, False]) +@pytest.mark.parametrize("read_infer", [True, False]) +def test_to_json_compression(compression_only, read_infer, 
to_infer): + # see gh-15008 + compression = compression_only + + if compression == "zip": + pytest.skip(f"{compression} is not supported for to_csv") + + # We'll complete file extension subsequently. + filename = "test." + + if compression == "gzip": + filename += "gz" + else: + # xz --> .xz + # bz2 --> .bz2 + filename += compression + + df = pd.DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_json(path, compression=to_compression) + result = pd.read_json(path, compression=read_compression) + tm.assert_frame_equal(result, df) diff --git a/venv/Lib/site-packages/pandas/tests/io/json/test_json_table_schema.py b/venv/Lib/site-packages/pandas/tests/io/json/test_json_table_schema.py new file mode 100644 index 0000000..2ac2acc --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/json/test_json_table_schema.py @@ -0,0 +1,724 @@ +"""Tests for Table Schema integration.""" +from collections import OrderedDict +import json + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.json._table_schema import ( + as_json_table_type, + build_table_schema, + convert_json_field_to_pandas_type, + convert_pandas_type_to_json_field, + set_default_names, +) + + +class TestBuildSchema: + def setup_method(self, method): + self.df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=4), + "D": pd.timedelta_range("1H", periods=4, freq="T"), + }, + index=pd.Index(range(4), name="idx"), + ) + + def test_build_table_schema(self): + result = build_table_schema(self.df, version=False) + expected = { + "fields": [ + {"name": "idx", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["idx"], + } + assert result == expected + result = build_table_schema(self.df) + assert "pandas_version" in result + + def test_series(self): + s = pd.Series([1, 2, 3], name="foo") + result = build_table_schema(s, version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "foo", "type": "integer"}, + ], + "primaryKey": ["index"], + } + assert result == expected + result = build_table_schema(s) + assert "pandas_version" in result + + def test_series_unnamed(self): + result = build_table_schema(pd.Series([1, 2, 3]), version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "values", "type": "integer"}, + ], + "primaryKey": ["index"], + } + assert result == expected + + def test_multiindex(self): + df = self.df.copy() + idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) + df.index = idx + + result = build_table_schema(df, version=False) + expected = { + "fields": [ + {"name": "level_0", "type": "string"}, + {"name": "level_1", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["level_0", "level_1"], + } + assert result == expected + + df.index.names = ["idx0", None] + expected["fields"][0]["name"] = "idx0" + expected["primaryKey"] = ["idx0", "level_1"] + result = build_table_schema(df, version=False) + assert result == expected 
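# --- Editorial aside (not part of the vendored diff above) ---------------------------
# A minimal, hedged sketch of the Table Schema machinery these tests exercise: what
# build_table_schema reports for a small frame and how orient="table" round-trips it.
# It assumes a pandas version where pandas.io.json._table_schema.build_table_schema and
# DataFrame.to_json(orient="table") are available (they are imported at the top of this
# test module); the frame used here is illustrative only.
import pandas as pd
from pandas.io.json._table_schema import build_table_schema

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}, index=pd.Index([0, 1], name="idx"))

# Each column (and the index) becomes a JSON Table Schema field; the index supplies
# the primary key.
schema = build_table_schema(df, version=False)
# {'fields': [{'name': 'idx', 'type': 'integer'},
#             {'name': 'A', 'type': 'integer'},
#             {'name': 'B', 'type': 'string'}],
#  'primaryKey': ['idx']}

# orient="table" serialises that schema alongside the data, so read_json can restore
# dtypes and the index on the way back (the tests below check this with
# tm.assert_frame_equal).
payload = df.to_json(orient="table")
roundtripped = pd.read_json(payload, orient="table")
assert roundtripped.equals(df)
# --------------------------------------------------------------------------------------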
+ + +class TestTableSchemaType: + @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64]) + def test_as_json_table_type_int_data(self, int_type): + int_data = [1, 2, 3] + assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer" + + @pytest.mark.parametrize( + "float_type", [np.float, np.float16, np.float32, np.float64] + ) + def test_as_json_table_type_float_data(self, float_type): + float_data = [1.0, 2.0, 3.0] + assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number" + + @pytest.mark.parametrize("bool_type", [bool, np.bool]) + def test_as_json_table_type_bool_data(self, bool_type): + bool_data = [True, False] + assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean" + + @pytest.mark.parametrize( + "date_data", + [ + pd.to_datetime(["2016"]), + pd.to_datetime(["2016"], utc=True), + pd.Series(pd.to_datetime(["2016"])), + pd.Series(pd.to_datetime(["2016"], utc=True)), + pd.period_range("2016", freq="A", periods=3), + ], + ) + def test_as_json_table_type_date_data(self, date_data): + assert as_json_table_type(date_data) == "datetime" + + @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) + def test_as_json_table_type_string_data(self, str_data): + assert as_json_table_type(str_data) == "string" + + @pytest.mark.parametrize( + "cat_data", + [ + pd.Categorical(["a"]), + pd.Categorical([1]), + pd.Series(pd.Categorical([1])), + pd.CategoricalIndex([1]), + pd.Categorical([1]), + ], + ) + def test_as_json_table_type_categorical_data(self, cat_data): + assert as_json_table_type(cat_data) == "any" + + # ------ + # dtypes + # ------ + @pytest.mark.parametrize("int_dtype", [np.int, np.int16, np.int32, np.int64]) + def test_as_json_table_type_int_dtypes(self, int_dtype): + assert as_json_table_type(int_dtype) == "integer" + + @pytest.mark.parametrize( + "float_dtype", [np.float, np.float16, np.float32, np.float64] + ) + def test_as_json_table_type_float_dtypes(self, float_dtype): + assert as_json_table_type(float_dtype) == "number" + + @pytest.mark.parametrize("bool_dtype", [bool, np.bool]) + def test_as_json_table_type_bool_dtypes(self, bool_dtype): + assert as_json_table_type(bool_dtype) == "boolean" + + @pytest.mark.parametrize( + "date_dtype", + [ + np.datetime64, + np.dtype(" str: + return self.hexed + + hexed = "574b4454ba8c5eb4f98a8f45" + binthing = BinaryThing(hexed) + + # verify the proper conversion of printable content + df_printable = DataFrame({"A": [binthing.hexed]}) + assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}' + + # check if non-printable content throws appropriate Exception + df_nonprintable = DataFrame({"A": [binthing]}) + msg = "Unsupported UTF-8 sequence length when encoding string" + with pytest.raises(OverflowError, match=msg): + df_nonprintable.to_json() + + # the same with multiple columns threw segfaults + df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"]) + with pytest.raises(OverflowError): + df_mixed.to_json() + + # default_handler should resolve exceptions for non-string types + result = df_nonprintable.to_json(default_handler=str) + expected = f'{{"A":{{"0":"{hexed}"}}}}' + assert result == expected + assert ( + df_mixed.to_json(default_handler=str) + == f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}' + ) + + def test_label_overflow(self): + # GH14256: buffer length not checked when writing label + result = pd.DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json() + expected = f'{{"{"bar" * 
100000}":{{"0":1}},"foo":{{"0":1337}}}}' + assert result == expected + + def test_series_non_unique_index(self): + s = Series(["a", "b"], index=[1, 1]) + + msg = "Series index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + s.to_json(orient="index") + + tm.assert_series_equal( + s, read_json(s.to_json(orient="split"), orient="split", typ="series") + ) + unser = read_json(s.to_json(orient="records"), orient="records", typ="series") + tm.assert_numpy_array_equal(s.values, unser.values) + + def test_series_default_orient(self): + assert self.series.to_json() == self.series.to_json(orient="index") + + @pytest.mark.parametrize("numpy", [True, False]) + def test_series_roundtrip_simple(self, orient, numpy): + data = self.series.to_json(orient=orient) + result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + expected = self.series.copy() + + if orient in ("values", "records"): + expected = expected.reset_index(drop=True) + if orient != "split": + expected.name = None + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [False, None]) + @pytest.mark.parametrize("numpy", [True, False]) + def test_series_roundtrip_object(self, orient, numpy, dtype): + data = self.objSeries.to_json(orient=orient) + result = pd.read_json( + data, typ="series", orient=orient, numpy=numpy, dtype=dtype + ) + expected = self.objSeries.copy() + + if orient in ("values", "records"): + expected = expected.reset_index(drop=True) + if orient != "split": + expected.name = None + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("numpy", [True, False]) + def test_series_roundtrip_empty(self, orient, numpy): + data = self.empty_series.to_json(orient=orient) + result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + expected = self.empty_series.copy() + + # TODO: see what causes inconsistency + if orient in ("values", "records"): + expected = expected.reset_index(drop=True) + else: + expected.index = expected.index.astype(float) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("numpy", [True, False]) + def test_series_roundtrip_timeseries(self, orient, numpy): + data = self.ts.to_json(orient=orient) + result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + expected = self.ts.copy() + + if orient in ("values", "records"): + expected = expected.reset_index(drop=True) + if orient != "split": + expected.name = None + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [np.float64, np.int]) + @pytest.mark.parametrize("numpy", [True, False]) + def test_series_roundtrip_numeric(self, orient, numpy, dtype): + s = Series(range(6), index=["a", "b", "c", "d", "e", "f"]) + data = s.to_json(orient=orient) + result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) + + expected = s.copy() + if orient in ("values", "records"): + expected = expected.reset_index(drop=True) + + tm.assert_series_equal(result, expected) + + def test_series_to_json_except(self): + s = Series([1, 2, 3]) + msg = "Invalid value 'garbage' for option 'orient'" + with pytest.raises(ValueError, match=msg): + s.to_json(orient="garbage") + + def test_series_from_json_precise_float(self): + s = Series([4.56, 4.56, 4.56]) + result = read_json(s.to_json(), typ="series", precise_float=True) + tm.assert_series_equal(result, s, check_index_type=False) + + def test_series_with_dtype(self): + # GH 21986 + s = Series([4.56, 4.56, 4.56]) + result = read_json(s.to_json(), typ="series", 
dtype=np.int64) + expected = Series([4] * 3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype,expected", + [ + (True, Series(["2000-01-01"], dtype="datetime64[ns]")), + (False, Series([946684800000])), + ], + ) + def test_series_with_dtype_datetime(self, dtype, expected): + s = Series(["2000-01-01"], dtype="datetime64[ns]") + data = s.to_json() + result = pd.read_json(data, typ="series", dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_frame_from_json_precise_float(self): + df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) + result = read_json(df.to_json(), precise_float=True) + tm.assert_frame_equal( + result, df, check_index_type=False, check_column_type=False + ) + + def test_typ(self): + + s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64") + result = read_json(s.to_json(), typ=None) + tm.assert_series_equal(result, s) + + def test_reconstruction_index(self): + + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + result = read_json(df.to_json()) + + tm.assert_frame_equal(result, df) + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"]) + result = read_json(df.to_json()) + tm.assert_frame_equal(result, df) + + def test_path(self): + with tm.ensure_clean("test.json") as path: + for df in [ + self.frame, + self.frame2, + self.intframe, + self.tsframe, + self.mixed_frame, + ]: + df.to_json(path) + read_json(path) + + def test_axis_dates(self): + + # frame + json = self.tsframe.to_json() + result = read_json(json) + tm.assert_frame_equal(result, self.tsframe) + + # series + json = self.ts.to_json() + result = read_json(json, typ="series") + tm.assert_series_equal(result, self.ts, check_names=False) + assert result.name is None + + def test_convert_dates(self): + + # frame + df = self.tsframe.copy() + df["date"] = Timestamp("20130101") + + json = df.to_json() + result = read_json(json) + tm.assert_frame_equal(result, df) + + df["foo"] = 1.0 + json = df.to_json(date_unit="ns") + + result = read_json(json, convert_dates=False) + expected = df.copy() + expected["date"] = expected["date"].values.view("i8") + expected["foo"] = expected["foo"].astype("int64") + tm.assert_frame_equal(result, expected) + + # series + ts = Series(Timestamp("20130101"), index=self.ts.index) + json = ts.to_json() + result = read_json(json, typ="series") + tm.assert_series_equal(result, ts) + + @pytest.mark.parametrize("date_format", ["epoch", "iso"]) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize( + "date_typ", [datetime.date, datetime.datetime, pd.Timestamp] + ) + def test_date_index_and_values(self, date_format, as_object, date_typ): + data = [date_typ(year=2020, month=1, day=1), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + result = ser.to_json(date_format=date_format) + + if date_format == "epoch": + expected = '{"1577836800000":1577836800000,"null":null}' + else: + expected = ( + '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}' + ) + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + assert result == expected + + @pytest.mark.parametrize( + "infer_word", + [ + "trade_time", + "date", + "datetime", + "sold_at", + "modified", + "timestamp", + "timestamps", + ], + ) + def test_convert_dates_infer(self, infer_word): + # GH10747 + from pandas.io.json import dumps + + data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}] + expected = DataFrame( + [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] 
+ ) + result = read_json(dumps(data))[["id", infer_word]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_frame(self, date, date_unit): + df = self.tsframe.copy() + + df["date"] = Timestamp(date) + df.iloc[1, df.columns.get_loc("date")] = pd.NaT + df.iloc[5, df.columns.get_loc("date")] = pd.NaT + if date_unit: + json = df.to_json(date_format="iso", date_unit=date_unit) + else: + json = df.to_json(date_format="iso") + result = read_json(json) + expected = df.copy() + expected.index = expected.index.tz_localize("UTC") + expected["date"] = expected["date"].dt.tz_localize("UTC") + tm.assert_frame_equal(result, expected) + + def test_date_format_frame_raises(self): + df = self.tsframe.copy() + msg = "Invalid value 'foo' for option 'date_unit'" + with pytest.raises(ValueError, match=msg): + df.to_json(date_format="iso", date_unit="foo") + + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_series(self, date, date_unit): + ts = Series(Timestamp(date), index=self.ts.index) + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format="iso", date_unit=date_unit) + else: + json = ts.to_json(date_format="iso") + result = read_json(json, typ="series") + expected = ts.copy() + expected.index = expected.index.tz_localize("UTC") + expected = expected.dt.tz_localize("UTC") + tm.assert_series_equal(result, expected) + + def test_date_format_series_raises(self): + ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) + msg = "Invalid value 'foo' for option 'date_unit'" + with pytest.raises(ValueError, match=msg): + ts.to_json(date_format="iso", date_unit="foo") + + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_date_unit(self, unit): + df = self.tsframe.copy() + df["date"] = Timestamp("20130101 20:43:42") + dl = df.columns.get_loc("date") + df.iloc[1, dl] = Timestamp("19710101 20:43:42") + df.iloc[2, dl] = Timestamp("21460101 20:43:42") + df.iloc[4, dl] = pd.NaT + + json = df.to_json(date_format="epoch", date_unit=unit) + + # force date unit + result = read_json(json, date_unit=unit) + tm.assert_frame_equal(result, df) + + # detect date unit + result = read_json(json, date_unit=None) + tm.assert_frame_equal(result, df) + + def test_weird_nested_json(self): + # this used to core dump the parser + s = r"""{ + "status": "success", + "data": { + "posts": [ + { + "id": 1, + "title": "A blog post", + "body": "Some useful content" + }, + { + "id": 2, + "title": "Another blog post", + "body": "More content" + } + ] + } + }""" + + read_json(s) + + def test_doc_example(self): + dfj2 = DataFrame(np.random.randn(5, 2), columns=list("AB")) + dfj2["date"] = Timestamp("20130101") + dfj2["ints"] = range(5) + dfj2["bools"] = True + dfj2.index = pd.date_range("20130101", periods=5) + + json = dfj2.to_json() + result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) + tm.assert_frame_equal(result, result) + + def test_misc_example(self): + + # parsing unordered input fails + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True) + expected = DataFrame([[1, 2], [1, 2]], 
columns=["a", "b"]) + + error_msg = """DataFrame\\.index are different + +DataFrame\\.index values are different \\(100\\.0 %\\) +\\[left\\]: Index\\(\\['a', 'b'\\], dtype='object'\\) +\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)""" + with pytest.raises(AssertionError, match=error_msg): + tm.assert_frame_equal(result, expected, check_index_type=False) + + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]') + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @tm.network + @pytest.mark.single + def test_round_trip_exception_(self): + # GH 3867 + csv = "https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv" + df = pd.read_csv(csv) + s = df.to_json() + result = pd.read_json(s) + tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df) + + @tm.network + @pytest.mark.single + @pytest.mark.parametrize( + "field,dtype", + [ + ["created_at", pd.DatetimeTZDtype(tz="UTC")], + ["closed_at", "datetime64[ns]"], + ["updated_at", pd.DatetimeTZDtype(tz="UTC")], + ], + ) + def test_url(self, field, dtype): + url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" # noqa + result = read_json(url, convert_dates=True) + assert result[field].dtype == dtype + + def test_timedelta(self): + converter = lambda x: pd.to_timedelta(x, unit="ms") + + s = Series([timedelta(23), timedelta(seconds=5)]) + assert s.dtype == "timedelta64[ns]" + + result = pd.read_json(s.to_json(), typ="series").apply(converter) + tm.assert_series_equal(result, s) + + s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1])) + assert s.dtype == "timedelta64[ns]" + result = pd.read_json(s.to_json(), typ="series").apply(converter) + tm.assert_series_equal(result, s) + + frame = DataFrame([timedelta(23), timedelta(seconds=5)]) + assert frame[0].dtype == "timedelta64[ns]" + tm.assert_frame_equal(frame, pd.read_json(frame.to_json()).apply(converter)) + + frame = DataFrame( + { + "a": [timedelta(days=23), timedelta(seconds=5)], + "b": [1, 2], + "c": pd.date_range(start="20130101", periods=2), + } + ) + + result = pd.read_json(frame.to_json(date_unit="ns")) + result["a"] = pd.to_timedelta(result.a, unit="ns") + result["c"] = pd.to_datetime(result.c) + tm.assert_frame_equal(frame, result) + + def test_mixed_timedelta_datetime(self): + frame = DataFrame( + {"a": [timedelta(23), pd.Timestamp("20130101")]}, dtype=object + ) + + expected = DataFrame( + {"a": [pd.Timedelta(frame.a[0]).value, pd.Timestamp(frame.a[1]).value]} + ) + result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_default_handler(self): + value = object() + frame = DataFrame({"a": [7, value]}) + expected = DataFrame({"a": [7, str(value)]}) + result = pd.read_json(frame.to_json(default_handler=str)) + tm.assert_frame_equal(expected, result, check_index_type=False) + + def test_default_handler_indirect(self): + from pandas.io.json import dumps + + def default(obj): + if isinstance(obj, complex): + return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)] + return str(obj) + + df_list = [ + 9, + DataFrame( + {"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]}, + columns=["a", "b"], + ), + ] + expected = ( + '[9,[[1,null],["STR",null],[[["mathjs","Complex"],' + '["re",4.0],["im",-5.0]],"N\\/A"]]]' + ) + assert dumps(df_list, default_handler=default, orient="values") == expected + + def test_default_handler_numpy_unsupported_dtype(self): + # 
GH12554 to_json raises 'Unhandled numpy dtype 15' + df = DataFrame( + {"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]}, + columns=["a", "b"], + ) + expected = ( + '[["(1+0j)","(nan+0j)"],' + '["(2.3+0j)","(nan+0j)"],' + '["(4-5j)","(1.2+0j)"]]' + ) + assert df.to_json(default_handler=str, orient="values") == expected + + def test_default_handler_raises(self): + msg = "raisin" + + def my_handler_raises(obj): + raise TypeError(msg) + + with pytest.raises(TypeError, match=msg): + DataFrame({"a": [1, 2, object()]}).to_json( + default_handler=my_handler_raises + ) + with pytest.raises(TypeError, match=msg): + DataFrame({"a": [1, 2, complex(4, -5)]}).to_json( + default_handler=my_handler_raises + ) + + def test_categorical(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]}) + df["B"] = df["A"] + expected = df.to_json() + + df["B"] = df["A"].astype("category") + assert expected == df.to_json() + + s = df["A"] + sc = df["B"] + assert s.to_json() == sc.to_json() + + def test_datetime_tz(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") + tz_naive = tz_range.tz_convert("utc").tz_localize(None) + + df = DataFrame({"A": tz_range, "B": pd.date_range("20130101", periods=3)}) + + df_naive = df.copy() + df_naive["A"] = tz_naive + expected = df_naive.to_json() + assert expected == df.to_json() + + stz = Series(tz_range) + s_naive = Series(tz_naive) + assert stz.to_json() == s_naive.to_json() + + def test_sparse(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + df = pd.DataFrame(np.random.randn(10, 4)) + df.loc[:8] = np.nan + + sdf = df.astype("Sparse") + expected = df.to_json() + assert expected == sdf.to_json() + + s = pd.Series(np.random.randn(10)) + s.loc[:8] = np.nan + ss = s.astype("Sparse") + + expected = s.to_json() + assert expected == ss.to_json() + + @pytest.mark.parametrize( + "ts", + [ + Timestamp("2013-01-10 05:00:00Z"), + Timestamp("2013-01-10 00:00:00", tz="US/Eastern"), + Timestamp("2013-01-10 00:00:00-0500"), + ], + ) + def test_tz_is_utc(self, ts): + from pandas.io.json import dumps + + exp = '"2013-01-10T05:00:00.000Z"' + + assert dumps(ts, iso_dates=True) == exp + dt = ts.to_pydatetime() + assert dumps(dt, iso_dates=True) == exp + + @pytest.mark.parametrize( + "tz_range", + [ + pd.date_range("2013-01-01 05:00:00Z", periods=2), + pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"), + pd.date_range("2013-01-01 00:00:00-0500", periods=2), + ], + ) + def test_tz_range_is_utc(self, tz_range): + from pandas.io.json import dumps + + exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' + dfexp = ( + '{"DT":{' + '"0":"2013-01-01T05:00:00.000Z",' + '"1":"2013-01-02T05:00:00.000Z"}}' + ) + + assert dumps(tz_range, iso_dates=True) == exp + dti = pd.DatetimeIndex(tz_range) + assert dumps(dti, iso_dates=True) == exp + df = DataFrame({"DT": dti}) + result = dumps(df, iso_dates=True) + assert result == dfexp + + def test_read_inline_jsonl(self): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @td.skip_if_not_us_locale + def test_read_s3_jsonl(self, s3_resource): + # GH17200 + + result = read_json("s3n://pandas-test/items.jsonl", lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + def 
test_read_local_jsonl(self): + # GH17200 + with tm.ensure_clean("tmp_items.json") as path: + with open(path, "w") as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(path, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_read_jsonl_unicode_chars(self): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_read_json_large_numbers(self): + # GH18842 + json = '{"articleId": "1404366058080022500245"}' + json = StringIO(json) + result = read_json(json, typ="series") + expected = Series(1.404366e21, index=["articleId"]) + tm.assert_series_equal(result, expected) + + json = '{"0": {"articleId": "1404366058080022500245"}}' + json = StringIO(json) + result = read_json(json) + expected = DataFrame(1.404366e21, index=["articleId"], columns=[0]) + tm.assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + tm.assert_frame_equal(pd.read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' + assert result == expected + tm.assert_frame_equal(pd.read_json(result, lines=True), df) + + # TODO: there is a near-identical test for pytables; can we share? 
+ def test_latin_encoding(self): + # GH 13774 + pytest.skip("encoding not implemented in .to_json(), xref #13774") + + values = [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ] + + values = [ + [x.decode("latin-1") if isinstance(x, bytes) else x for x in y] + for y in values + ] + + examples = [] + for dtype in ["category", object]: + for val in values: + examples.append(Series(val, dtype=dtype)) + + def roundtrip(s, encoding="latin-1"): + with tm.ensure_clean("test.json") as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + tm.assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + + def test_data_frame_size_after_to_json(self): + # GH15344 + df = DataFrame({"a": [str(1)]}) + + size_before = df.memory_usage(index=True, deep=True).sum() + df.to_json() + size_after = df.memory_usage(index=True, deep=True).sum() + + assert size_before == size_after + + @pytest.mark.parametrize( + "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] + ) + @pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]]) + def test_from_json_to_json_table_index_and_columns(self, index, columns): + # GH25433 GH25435 + expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) + dfjson = expected.to_json(orient="table") + result = pd.read_json(dfjson, orient="table") + tm.assert_frame_equal(result, expected) + + def test_from_json_to_json_table_dtypes(self): + # GH21345 + expected = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) + dfjson = expected.to_json(orient="table") + result = pd.read_json(dfjson, orient="table") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) + def test_read_json_table_dtype_raises(self, dtype): + # GH21345 + df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]}) + dfjson = df.to_json(orient="table") + msg = "cannot pass both dtype and orient='table'" + with pytest.raises(ValueError, match=msg): + pd.read_json(dfjson, orient="table", dtype=dtype) + + def test_read_json_table_convert_axes_raises(self): + # GH25433 GH25435 + df = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."]) + dfjson = df.to_json(orient="table") + msg = "cannot pass both convert_axes and orient='table'" + with pytest.raises(ValueError, match=msg): + pd.read_json(dfjson, orient="table", convert_axes=True) + + @pytest.mark.parametrize( + "data, expected", + [ + ( + DataFrame([[1, 2], [4, 5]], columns=["a", "b"]), + {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, + ), + ( + DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo"), + {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, + ), + ( + DataFrame( + [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] + ), + {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]}, + ), + (Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}), + ( + Series([1, 2, 3], name="A").rename_axis("foo"), + {"name": "A", "data": [1, 2, 3]}, + ), + ( + Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]), + {"name": "A", "data": [1, 2]}, + ), + ], + ) + def test_index_false_to_json_split(self, data, expected): + # GH 17394 + # Testing index=False 
in to_json with orient='split' + + result = data.to_json(orient="split", index=False) + result = json.loads(result) + + assert result == expected + + @pytest.mark.parametrize( + "data", + [ + (DataFrame([[1, 2], [4, 5]], columns=["a", "b"])), + (DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")), + ( + DataFrame( + [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]] + ) + ), + (Series([1, 2, 3], name="A")), + (Series([1, 2, 3], name="A").rename_axis("foo")), + (Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])), + ], + ) + def test_index_false_to_json_table(self, data): + # GH 17394 + # Testing index=False in to_json with orient='table' + + result = data.to_json(orient="table", index=False) + result = json.loads(result) + + expected = { + "schema": pd.io.json.build_table_schema(data, index=False), + "data": DataFrame(data).to_dict(orient="records"), + } + + assert result == expected + + @pytest.mark.parametrize("orient", ["records", "index", "columns", "values"]) + def test_index_false_error_to_json(self, orient): + # GH 17394 + # Testing error message from to_json with index=False + + df = pd.DataFrame([[1, 2], [4, 5]], columns=["a", "b"]) + + msg = "'index=False' is only valid when 'orient' is 'split' or 'table'" + with pytest.raises(ValueError, match=msg): + df.to_json(orient=orient, index=False) + + @pytest.mark.parametrize("orient", ["split", "table"]) + @pytest.mark.parametrize("index", [True, False]) + def test_index_false_from_json_to_json(self, orient, index): + # GH25170 + # Test index=False in from_json to_json + expected = DataFrame({"a": [1, 2], "b": [3, 4]}) + dfjson = expected.to_json(orient=orient, index=index) + result = read_json(dfjson, orient=orient) + tm.assert_frame_equal(result, expected) + + def test_read_timezone_information(self): + # GH 25546 + result = read_json( + '{"2019-01-01T11:00:00.000Z":88}', typ="series", orient="index" + ) + expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + expected = f'{{"{key}":{{"0":1}}}}' + result = df.to_json(date_format=date_format) + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected + + @pytest.mark.parametrize("indent", [1, 2, 4]) + def test_to_json_indent(self, indent): + # GH 12004 + df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) + + result = df.to_json(indent=indent) + spaces = " " * indent + expected = f"""{{ +{spaces}"a":{{ +{spaces}{spaces}"0":"foo", +{spaces}{spaces}"1":"baz" +{spaces}}}, +{spaces}"b":{{ +{spaces}{spaces}"0":"bar", +{spaces}{spaces}"1":"qux" +{spaces}}} +}}""" + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ( + "split", + """{ + "columns":[ + "a", + "b" + ], + "index":[ + 0, 
+ 1 + ], + "data":[ + [ + "foo", + "bar" + ], + [ + "baz", + "qux" + ] + ] +}""", + ), + ( + "records", + """[ + { + "a":"foo", + "b":"bar" + }, + { + "a":"baz", + "b":"qux" + } +]""", + ), + ( + "index", + """{ + "0":{ + "a":"foo", + "b":"bar" + }, + "1":{ + "a":"baz", + "b":"qux" + } +}""", + ), + ( + "columns", + """{ + "a":{ + "0":"foo", + "1":"baz" + }, + "b":{ + "0":"bar", + "1":"qux" + } +}""", + ), + ( + "values", + """[ + [ + "foo", + "bar" + ], + [ + "baz", + "qux" + ] +]""", + ), + ( + "table", + """{ + "schema":{ + "fields":[ + { + "name":"index", + "type":"integer" + }, + { + "name":"a", + "type":"string" + }, + { + "name":"b", + "type":"string" + } + ], + "primaryKey":[ + "index" + ], + "pandas_version":"0.20.0" + }, + "data":[ + { + "index":0, + "a":"foo", + "b":"bar" + }, + { + "index":1, + "a":"baz", + "b":"qux" + } + ] +}""", + ), + ], + ) + def test_json_indent_all_orients(self, orient, expected): + # GH 12004 + df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) + result = df.to_json(orient=orient, indent=4) + assert result == expected + + def test_json_negative_indent_raises(self): + with pytest.raises(ValueError, match="must be a nonnegative integer"): + pd.DataFrame().to_json(indent=-1) + + def test_emca_262_nan_inf_support(self): + # GH 12213 + data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' + result = pd.read_json(data) + expected = pd.DataFrame( + ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ) + tm.assert_frame_equal(result, expected) + + def test_deprecate_numpy_argument_read_json(self): + # GH 28512 + expected = DataFrame([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/json/test_readlines.py b/venv/Lib/site-packages/pandas/tests/io/json/test_readlines.py new file mode 100644 index 0000000..e531457 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/json/test_readlines.py @@ -0,0 +1,181 @@ +from io import StringIO + +import pytest + +import pandas as pd +from pandas import DataFrame, read_json +import pandas._testing as tm + +from pandas.io.json._json import JsonReader + + +@pytest.fixture +def lines_json_df(): + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + return df.to_json(lines=True, orient="records") + + +def test_read_jsonl(): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_read_jsonl_unicode_chars(): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_to_jsonl(): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], 
columns=["a", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + tm.assert_frame_equal(read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) + result = df.to_json(orient="records", lines=True) + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' + assert result == expected + tm.assert_frame_equal(read_json(result, lines=True), df) + + +@pytest.mark.parametrize("chunksize", [1, 1.0]) +def test_readjson_chunks(lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) + # GH17048: memory usage when lines=True + + unchunked = read_json(StringIO(lines_json_df), lines=True) + reader = read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) + chunked = pd.concat(reader) + + tm.assert_frame_equal(chunked, unchunked) + + +def test_readjson_chunksize_requires_lines(lines_json_df): + msg = "chunksize can only be passed if lines=True" + with pytest.raises(ValueError, match=msg): + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) + + +def test_readjson_chunks_series(): + # Test reading line-format JSON to Series with chunksize param + s = pd.Series({"A": 1, "B": 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ="Series") + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json(strio, lines=True, typ="Series", chunksize=1)) + + tm.assert_series_equal(chunked, unchunked) + + +def test_readjson_each_chunk(lines_json_df): + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. 
+ chunks = list(pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)) + assert chunks[0].shape == (2, 2) + assert chunks[1].shape == (1, 2) + + +def test_readjson_chunks_from_file(): + with tm.ensure_clean("test.json") as path: + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) + unchunked = pd.read_json(path, lines=True) + tm.assert_frame_equal(unchunked, chunked) + + +@pytest.mark.parametrize("chunksize", [None, 1]) +def test_readjson_chunks_closes(chunksize): + with tm.ensure_clean("test.json") as path: + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + reader = JsonReader( + path, + orient=None, + typ="frame", + dtype=True, + convert_axes=True, + convert_dates=True, + keep_default_dates=True, + numpy=False, + precise_float=False, + date_unit=None, + encoding=None, + lines=True, + chunksize=chunksize, + compression=None, + ) + reader.read() + assert ( + reader.open_stream.closed + ), f"didn't close stream with chunksize = {chunksize}" + + +@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) +def test_readjson_invalid_chunksize(lines_json_df, chunksize): + msg = r"'chunksize' must be an integer >=1" + + with pytest.raises(ValueError, match=msg): + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) + + +@pytest.mark.parametrize("chunksize", [None, 1, 2]) +def test_readjson_chunks_multiple_empty_lines(chunksize): + j = """ + + {"A":1,"B":4} + + + + {"A":2,"B":5} + + + + + + + + {"A":3,"B":6} + """ + orig = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) + tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") + + +def test_readjson_unicode(monkeypatch): + with tm.ensure_clean("test.json") as path: + monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + with open(path, "w", encoding="utf-8") as f: + f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') + + result = read_json(path) + expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/json/test_ujson.py b/venv/Lib/site-packages/pandas/tests/io/json/test_ujson.py new file mode 100644 index 0000000..bedd600 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/json/test_ujson.py @@ -0,0 +1,1079 @@ +import calendar +import datetime +import decimal +import json +import locale +import math +import re +import time + +import dateutil +import numpy as np +import pytest +import pytz + +import pandas._libs.json as ujson +from pandas._libs.tslib import Timestamp +import pandas.compat as compat + +from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range +import pandas._testing as tm + + +def _clean_dict(d): + """ + Sanitize dictionary for JSON by converting all keys to strings. + + Parameters + ---------- + d : dict + The dictionary to convert. + + Returns + ------- + cleaned_dict : dict + """ + + return {str(k): v for k, v in d.items()} + + +@pytest.fixture( + params=[None, "split", "records", "values", "index"] # Column indexed by default. 
+) +def orient(request): + return request.param + + +@pytest.fixture(params=[None, True]) +def numpy(request): + return request.param + + +class TestUltraJSONTests: + @pytest.mark.skipif( + compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" + ) + def test_encode_decimal(self): + sut = decimal.Decimal("1337.1337") + encoded = ujson.encode(sut, double_precision=15) + decoded = ujson.decode(encoded) + assert decoded == 1337.1337 + + sut = decimal.Decimal("0.95") + encoded = ujson.encode(sut, double_precision=1) + assert encoded == "1.0" + + decoded = ujson.decode(encoded) + assert decoded == 1.0 + + sut = decimal.Decimal("0.94") + encoded = ujson.encode(sut, double_precision=1) + assert encoded == "0.9" + + decoded = ujson.decode(encoded) + assert decoded == 0.9 + + sut = decimal.Decimal("1.95") + encoded = ujson.encode(sut, double_precision=1) + assert encoded == "2.0" + + decoded = ujson.decode(encoded) + assert decoded == 2.0 + + sut = decimal.Decimal("-1.95") + encoded = ujson.encode(sut, double_precision=1) + assert encoded == "-2.0" + + decoded = ujson.decode(encoded) + assert decoded == -2.0 + + sut = decimal.Decimal("0.995") + encoded = ujson.encode(sut, double_precision=2) + assert encoded == "1.0" + + decoded = ujson.decode(encoded) + assert decoded == 1.0 + + sut = decimal.Decimal("0.9995") + encoded = ujson.encode(sut, double_precision=3) + assert encoded == "1.0" + + decoded = ujson.decode(encoded) + assert decoded == 1.0 + + sut = decimal.Decimal("0.99999999999999944") + encoded = ujson.encode(sut, double_precision=15) + assert encoded == "1.0" + + decoded = ujson.decode(encoded) + assert decoded == 1.0 + + @pytest.mark.parametrize("ensure_ascii", [True, False]) + def test_encode_string_conversion(self, ensure_ascii): + string_input = "A string \\ / \b \f \n \r \t </script> &" + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' + html_encoded = ( + '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' + ) + + def helper(expected_output, **encode_kwargs): + output = ujson.encode( + string_input, ensure_ascii=ensure_ascii, **encode_kwargs + ) + + assert output == expected_output + assert string_input == json.loads(output) + assert string_input == ujson.decode(output) + + # Default behavior assumes encode_html_chars=False. + helper(not_html_encoded) + + # Make sure explicit encode_html_chars=False works. + helper(not_html_encoded, encode_html_chars=False) + + # Make sure explicit encode_html_chars=True does the encoding. + helper(html_encoded, encode_html_chars=True) + + @pytest.mark.parametrize( + "long_number", [-4342969734183514, -12345678901234.56789012, -528656961.4399388] + ) + def test_double_long_numbers(self, long_number): + sut = {"a": long_number} + encoded = ujson.encode(sut, double_precision=15) + + decoded = ujson.decode(encoded) + assert sut == decoded + + def test_encode_non_c_locale(self): + lc_category = locale.LC_NUMERIC + + # We just need one of these locales to work.
+ for new_locale in ("it_IT.UTF-8", "Italian_Italy"): + if tm.can_set_locale(new_locale, lc_category): + with tm.set_locale(new_locale, lc_category): + assert ujson.loads(ujson.dumps(4.78e60)) == 4.78e60 + assert ujson.loads("4.78", precise_float=True) == 4.78 + break + + def test_decimal_decode_test_precise(self): + sut = {"a": 4.56} + encoded = ujson.encode(sut) + decoded = ujson.decode(encoded, precise_float=True) + assert sut == decoded + + def test_encode_double_tiny_exponential(self): + num = 1e-40 + assert num == ujson.decode(ujson.encode(num)) + num = 1e-100 + assert num == ujson.decode(ujson.encode(num)) + num = -1e-45 + assert num == ujson.decode(ujson.encode(num)) + num = -1e-145 + assert np.allclose(num, ujson.decode(ujson.encode(num))) + + @pytest.mark.parametrize("unicode_key", ["key1", "بن"]) + def test_encode_dict_with_unicode_keys(self, unicode_key): + unicode_dict = {unicode_key: "value1"} + assert unicode_dict == ujson.decode(ujson.encode(unicode_dict)) + + @pytest.mark.parametrize( + "double_input", [math.pi, -math.pi] # Should work with negatives too. + ) + def test_encode_double_conversion(self, double_input): + output = ujson.encode(double_input) + assert round(double_input, 5) == round(json.loads(output), 5) + assert round(double_input, 5) == round(ujson.decode(output), 5) + + def test_encode_with_decimal(self): + decimal_input = 1.0 + output = ujson.encode(decimal_input) + + assert output == "1.0" + + def test_encode_array_of_nested_arrays(self): + nested_input = [[[[]]]] * 20 + output = ujson.encode(nested_input) + + assert nested_input == json.loads(output) + assert nested_input == ujson.decode(output) + + nested_input = np.array(nested_input) + tm.assert_numpy_array_equal( + nested_input, ujson.decode(output, numpy=True, dtype=nested_input.dtype) + ) + + def test_encode_array_of_doubles(self): + doubles_input = [31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 + output = ujson.encode(doubles_input) + + assert doubles_input == json.loads(output) + assert doubles_input == ujson.decode(output) + + tm.assert_numpy_array_equal( + np.array(doubles_input), ujson.decode(output, numpy=True) + ) + + def test_double_precision(self): + double_input = 30.012345678901234 + output = ujson.encode(double_input, double_precision=15) + + assert double_input == json.loads(output) + assert double_input == ujson.decode(output) + + for double_precision in (3, 9): + output = ujson.encode(double_input, double_precision=double_precision) + rounded_input = round(double_input, double_precision) + + assert rounded_input == json.loads(output) + assert rounded_input == ujson.decode(output) + + @pytest.mark.parametrize("invalid_val", [20, -1, "9", None]) + def test_invalid_double_precision(self, invalid_val): + double_input = 30.12345678901234567890 + expected_exception = ValueError if isinstance(invalid_val, int) else TypeError + + with pytest.raises(expected_exception): + ujson.encode(double_input, double_precision=invalid_val) + + def test_encode_string_conversion2(self): + string_input = "A string \\ / \b \f \n \r \t" + output = ujson.encode(string_input) + + assert string_input == json.loads(output) + assert string_input == ujson.decode(output) + assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"' + + @pytest.mark.parametrize( + "unicode_input", + ["Räksmörgås اسامة بن محمد بن عوض بن لادن", "\xe6\x97\xa5\xd1\x88"], + ) + def test_encode_unicode_conversion(self, unicode_input): + enc = ujson.encode(unicode_input) + dec = ujson.decode(enc) + + assert enc == 
json.dumps(unicode_input) + assert dec == json.loads(enc) + + def test_encode_control_escaping(self): + escaped_input = "\x19" + enc = ujson.encode(escaped_input) + dec = ujson.decode(enc) + + assert escaped_input == dec + assert enc == json.dumps(escaped_input) + + def test_encode_unicode_surrogate_pair(self): + surrogate_input = "\xf0\x90\x8d\x86" + enc = ujson.encode(surrogate_input) + dec = ujson.decode(enc) + + assert enc == json.dumps(surrogate_input) + assert dec == json.loads(enc) + + def test_encode_unicode_4bytes_utf8(self): + four_bytes_input = "\xf0\x91\x80\xb0TRAILINGNORMAL" + enc = ujson.encode(four_bytes_input) + dec = ujson.decode(enc) + + assert enc == json.dumps(four_bytes_input) + assert dec == json.loads(enc) + + def test_encode_unicode_4bytes_utf8highest(self): + four_bytes_input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" + enc = ujson.encode(four_bytes_input) + + dec = ujson.decode(enc) + + assert enc == json.dumps(four_bytes_input) + assert dec == json.loads(enc) + + def test_encode_array_in_array(self): + arr_in_arr_input = [[[[]]]] + output = ujson.encode(arr_in_arr_input) + + assert arr_in_arr_input == json.loads(output) + assert output == json.dumps(arr_in_arr_input) + assert arr_in_arr_input == ujson.decode(output) + + tm.assert_numpy_array_equal( + np.array(arr_in_arr_input), ujson.decode(output, numpy=True) + ) + + @pytest.mark.parametrize( + "num_input", + [ + 31337, + -31337, # Negative number. + -9223372036854775808, # Large negative number. + ], + ) + def test_encode_num_conversion(self, num_input): + output = ujson.encode(num_input) + assert num_input == json.loads(output) + assert output == json.dumps(num_input) + assert num_input == ujson.decode(output) + + def test_encode_list_conversion(self): + list_input = [1, 2, 3, 4] + output = ujson.encode(list_input) + + assert list_input == json.loads(output) + assert list_input == ujson.decode(output) + + tm.assert_numpy_array_equal( + np.array(list_input), ujson.decode(output, numpy=True) + ) + + def test_encode_dict_conversion(self): + dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4} + output = ujson.encode(dict_input) + + assert dict_input == json.loads(output) + assert dict_input == ujson.decode(output) + + @pytest.mark.parametrize("builtin_value", [None, True, False]) + def test_encode_builtin_values_conversion(self, builtin_value): + output = ujson.encode(builtin_value) + assert builtin_value == json.loads(output) + assert output == json.dumps(builtin_value) + assert builtin_value == ujson.decode(output) + + def test_encode_datetime_conversion(self): + datetime_input = datetime.datetime.fromtimestamp(time.time()) + output = ujson.encode(datetime_input, date_unit="s") + expected = calendar.timegm(datetime_input.utctimetuple()) + + assert int(expected) == json.loads(output) + assert int(expected) == ujson.decode(output) + + def test_encode_date_conversion(self): + date_input = datetime.date.fromtimestamp(time.time()) + output = ujson.encode(date_input, date_unit="s") + + tup = (date_input.year, date_input.month, date_input.day, 0, 0, 0) + expected = calendar.timegm(tup) + + assert int(expected) == json.loads(output) + assert int(expected) == ujson.decode(output) + + @pytest.mark.parametrize( + "test", + [datetime.time(), datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243)], + ) + def test_encode_time_conversion_basic(self, test): + output = ujson.encode(test) + expected = f'"{test.isoformat()}"' + assert expected == output + + def test_encode_time_conversion_pytz(self): + # see gh-11473: to_json segfaults 
with timezone-aware datetimes + test = datetime.time(10, 12, 15, 343243, pytz.utc) + output = ujson.encode(test) + expected = f'"{test.isoformat()}"' + assert expected == output + + def test_encode_time_conversion_dateutil(self): + # see gh-11473: to_json segfaults with timezone-aware datetimes + test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc()) + output = ujson.encode(test) + expected = f'"{test.isoformat()}"' + assert expected == output + + @pytest.mark.parametrize( + "decoded_input", [NaT, np.datetime64("NaT"), np.nan, np.inf, -np.inf] + ) + def test_encode_as_null(self, decoded_input): + assert ujson.encode(decoded_input) == "null", "Expected null" + + def test_datetime_units(self): + val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) + stamp = Timestamp(val) + + roundtrip = ujson.decode(ujson.encode(val, date_unit="s")) + assert roundtrip == stamp.value // 10 ** 9 + + roundtrip = ujson.decode(ujson.encode(val, date_unit="ms")) + assert roundtrip == stamp.value // 10 ** 6 + + roundtrip = ujson.decode(ujson.encode(val, date_unit="us")) + assert roundtrip == stamp.value // 10 ** 3 + + roundtrip = ujson.decode(ujson.encode(val, date_unit="ns")) + assert roundtrip == stamp.value + + msg = "Invalid value 'foo' for option 'date_unit'" + with pytest.raises(ValueError, match=msg): + ujson.encode(val, date_unit="foo") + + def test_encode_to_utf8(self): + unencoded = "\xe6\x97\xa5\xd1\x88" + + enc = ujson.encode(unencoded, ensure_ascii=False) + dec = ujson.decode(enc) + + assert enc == json.dumps(unencoded, ensure_ascii=False) + assert dec == json.loads(enc) + + def test_decode_from_unicode(self): + unicode_input = '{"obj": 31337}' + + dec1 = ujson.decode(unicode_input) + dec2 = ujson.decode(str(unicode_input)) + + assert dec1 == dec2 + + def test_encode_recursion_max(self): + # 8 is the max recursion depth + + class O2: + member = 0 + pass + + class O1: + member = 0 + pass + + decoded_input = O1() + decoded_input.member = O2() + decoded_input.member.member = decoded_input + + with pytest.raises(OverflowError): + ujson.encode(decoded_input) + + def test_decode_jibberish(self): + jibberish = "fdsa sda v9sa fdsa" + + with pytest.raises(ValueError): + ujson.decode(jibberish) + + @pytest.mark.parametrize( + "broken_json", + [ + "[", # Broken array start. + "{", # Broken object start. + "]", # Broken array end. + "}", # Broken object end. + ], + ) + def test_decode_broken_json(self, broken_json): + with pytest.raises(ValueError): + ujson.decode(broken_json) + + @pytest.mark.parametrize("too_big_char", ["[", "{"]) + def test_decode_depth_too_big(self, too_big_char): + with pytest.raises(ValueError): + ujson.decode(too_big_char * (1024 * 1024)) + + @pytest.mark.parametrize( + "bad_string", + [ + '"TESTING', # Unterminated. + '"TESTING\\"', # Unterminated escape. + "tru", # Broken True. + "fa", # Broken False. + "n", # Broken None. + ], + ) + def test_decode_bad_string(self, bad_string): + with pytest.raises(ValueError): + ujson.decode(bad_string) + + @pytest.mark.parametrize("broken_json", ['{{1337:""}}', '{{"key":"}', "[[[true"]) + def test_decode_broken_json_leak(self, broken_json): + for _ in range(1000): + with pytest.raises(ValueError): + ujson.decode(broken_json) + + @pytest.mark.parametrize( + "invalid_dict", + [ + "{{{{31337}}}}", # No key. + '{{{{"key":}}}}', # No value. + '{{{{"key"}}}}', # No colon or value. 
+ ], + ) + def test_decode_invalid_dict(self, invalid_dict): + with pytest.raises(ValueError): + ujson.decode(invalid_dict) + + @pytest.mark.parametrize( + "numeric_int_as_str", ["31337", "-31337"] # Should work with negatives. + ) + def test_decode_numeric_int(self, numeric_int_as_str): + assert int(numeric_int_as_str) == ujson.decode(numeric_int_as_str) + + def test_encode_null_character(self): + wrapped_input = "31337 \x00 1337" + output = ujson.encode(wrapped_input) + + assert wrapped_input == json.loads(output) + assert output == json.dumps(wrapped_input) + assert wrapped_input == ujson.decode(output) + + alone_input = "\x00" + output = ujson.encode(alone_input) + + assert alone_input == json.loads(output) + assert output == json.dumps(alone_input) + assert alone_input == ujson.decode(output) + assert '" \\u0000\\r\\n "' == ujson.dumps(" \u0000\r\n ") + + def test_decode_null_character(self): + wrapped_input = '"31337 \\u0000 31337"' + assert ujson.decode(wrapped_input) == json.loads(wrapped_input) + + def test_encode_list_long_conversion(self): + long_input = [ + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + ] + output = ujson.encode(long_input) + + assert long_input == json.loads(output) + assert long_input == ujson.decode(output) + + tm.assert_numpy_array_equal( + np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) + ) + + def test_encode_long_conversion(self): + long_input = 9223372036854775807 + output = ujson.encode(long_input) + + assert long_input == json.loads(output) + assert output == json.dumps(long_input) + assert long_input == ujson.decode(output) + + @pytest.mark.parametrize( + "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] + ) + def test_decode_numeric_int_exp(self, int_exp): + assert ujson.decode(int_exp) == json.loads(int_exp) + + def test_loads_non_str_bytes_raises(self): + msg = "Expected 'str' or 'bytes'" + with pytest.raises(TypeError, match=msg): + ujson.loads(None) + + def test_encode_numeric_overflow(self): + with pytest.raises(OverflowError): + ujson.encode(12839128391289382193812939) + + def test_encode_numeric_overflow_nested(self): + class Nested: + x = 12839128391289382193812939 + + for _ in range(0, 100): + with pytest.raises(OverflowError): + ujson.encode(Nested()) + + @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) + def test_decode_number_with_32bit_sign_bit(self, val): + # Test that numbers that fit within 32 bits but would have the + # sign bit set (2**31 <= x < 2**32) are decoded properly. + doc = f'{{"id": {val}}}' + assert ujson.decode(doc)["id"] == val + + def test_encode_big_escape(self): + # Make sure no Exception is raised. + for _ in range(10): + base = "\u00e5".encode("utf-8") + escape_input = base * 1024 * 1024 * 2 + ujson.encode(escape_input) + + def test_decode_big_escape(self): + # Make sure no Exception is raised. 
+ for _ in range(10): + base = "\u00e5".encode("utf-8") + quote = b'"' + + escape_input = quote + (base * 1024 * 1024 * 2) + quote + ujson.decode(escape_input) + + def test_to_dict(self): + d = {"key": 31337} + + class DictTest: + def toDict(self): + return d + + o = DictTest() + output = ujson.encode(o) + + dec = ujson.decode(output) + assert dec == d + + def test_default_handler(self): + class _TestObject: + def __init__(self, val): + self.val = val + + @property + def recursive_attr(self): + return _TestObject("recursive_attr") + + def __str__(self) -> str: + return str(self.val) + + msg = "Maximum recursion level reached" + with pytest.raises(OverflowError, match=msg): + ujson.encode(_TestObject("foo")) + assert '"foo"' == ujson.encode(_TestObject("foo"), default_handler=str) + + def my_handler(_): + return "foobar" + + assert '"foobar"' == ujson.encode( + _TestObject("foo"), default_handler=my_handler + ) + + def my_handler_raises(_): + raise TypeError("I raise for anything") + + with pytest.raises(TypeError, match="I raise for anything"): + ujson.encode(_TestObject("foo"), default_handler=my_handler_raises) + + def my_int_handler(_): + return 42 + + assert ( + ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_int_handler) + ) + == 42 + ) + + def my_obj_handler(_): + return datetime.datetime(2013, 2, 3) + + assert ujson.decode( + ujson.encode(datetime.datetime(2013, 2, 3)) + ) == ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_obj_handler) + ) + + obj_list = [_TestObject("foo"), _TestObject("bar")] + assert json.loads(json.dumps(obj_list, default=str)) == ujson.decode( + ujson.encode(obj_list, default_handler=str) + ) + + +class TestNumpyJSONTests: + @pytest.mark.parametrize("bool_input", [True, False]) + def test_bool(self, bool_input): + b = np.bool(bool_input) + assert ujson.decode(ujson.encode(b)) == b + + def test_bool_array(self): + bool_array = np.array( + [True, False, True, True, False, True, False, False], dtype=np.bool + ) + output = np.array(ujson.decode(ujson.encode(bool_array)), dtype=np.bool) + tm.assert_numpy_array_equal(bool_array, output) + + def test_int(self, any_int_dtype): + klass = np.dtype(any_int_dtype).type + num = klass(1) + + assert klass(ujson.decode(ujson.encode(num))) == num + + def test_int_array(self, any_int_dtype): + arr = np.arange(100, dtype=np.int) + arr_input = arr.astype(any_int_dtype) + + arr_output = np.array( + ujson.decode(ujson.encode(arr_input)), dtype=any_int_dtype + ) + tm.assert_numpy_array_equal(arr_input, arr_output) + + def test_int_max(self, any_int_dtype): + if any_int_dtype in ("int64", "uint64") and compat.is_platform_32bit(): + pytest.skip("Cannot test 64-bit integer on 32-bit platform") + + klass = np.dtype(any_int_dtype).type + + # uint64 max will always overflow, + # as it's encoded to signed. 
+ if any_int_dtype == "uint64": + num = np.iinfo("int64").max + else: + num = np.iinfo(any_int_dtype).max + + assert klass(ujson.decode(ujson.encode(num))) == num + + def test_float(self, float_dtype): + klass = np.dtype(float_dtype).type + num = klass(256.2013) + + assert klass(ujson.decode(ujson.encode(num))) == num + + def test_float_array(self, float_dtype): + arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) + float_input = arr.astype(float_dtype) + + float_output = np.array( + ujson.decode(ujson.encode(float_input, double_precision=15)), + dtype=float_dtype, + ) + tm.assert_almost_equal(float_input, float_output) + + def test_float_max(self, float_dtype): + klass = np.dtype(float_dtype).type + num = klass(np.finfo(float_dtype).max / 10) + + tm.assert_almost_equal( + klass(ujson.decode(ujson.encode(num, double_precision=15))), num + ) + + def test_array_basic(self): + arr = np.arange(96) + arr = arr.reshape((2, 2, 2, 2, 3, 2)) + + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + @pytest.mark.parametrize("shape", [(10, 10), (5, 5, 4), (100, 1)]) + def test_array_reshaped(self, shape): + arr = np.arange(100) + arr = arr.reshape(shape) + + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + def test_array_list(self): + arr_list = [ + "a", + list(), + dict(), + dict(), + list(), + 42, + 97.8, + ["a", "b"], + {"key": "val"}, + ] + arr = np.array(arr_list, dtype=object) + result = np.array(ujson.decode(ujson.encode(arr)), dtype=object) + tm.assert_numpy_array_equal(result, arr) + + def test_array_float(self): + dtype = np.float32 + + arr = np.arange(100.202, 200.202, 1, dtype=dtype) + arr = arr.reshape((5, 5, 4)) + + arr_out = np.array(ujson.decode(ujson.encode(arr)), dtype=dtype) + tm.assert_almost_equal(arr, arr_out) + + arr_out = ujson.decode(ujson.encode(arr), numpy=True, dtype=dtype) + tm.assert_almost_equal(arr, arr_out) + + def test_0d_array(self): + # gh-18878 + msg = re.escape("array(1) (0d array) is not JSON serializable at the moment") + with pytest.raises(TypeError, match=msg): + ujson.encode(np.array(1)) + + @pytest.mark.parametrize( + "bad_input,exc_type,kwargs", + [ + ([{}, []], ValueError, {}), + ([42, None], TypeError, {}), + ([["a"], 42], ValueError, {}), + ([42, {}, "a"], TypeError, {}), + ([42, ["a"], 42], ValueError, {}), + (["a", "b", [], "c"], ValueError, {}), + ([{"a": "b"}], ValueError, dict(labelled=True)), + ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), + ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)), + ], + ) + def test_array_numpy_except(self, bad_input, exc_type, kwargs): + with pytest.raises(exc_type): + ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs) + + def test_array_numpy_labelled(self): + labelled_input = {"a": []} + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) + assert (np.empty((1, 0)) == output[0]).all() + assert (np.array(["a"]) == output[1]).all() + assert output[2] is None + + labelled_input = [{"a": 42}] + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) + assert (np.array(["a"]) == output[2]).all() + assert (np.array([42]) == output[0]).all() + assert output[1] is None + + # see gh-10837: write out the dump explicitly + # so there is no dependency on iteration order + input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, 
{"a": 2.4, "b": 78}]' + output = ujson.loads(input_dumps, numpy=True, labelled=True) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + assert (expected_vals == output[0]).all() + assert output[1] is None + assert (np.array(["a", "b"]) == output[2]).all() + + input_dumps = ( + '{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' + '"3": {"a": 2.4, "b": 78}}' + ) + output = ujson.loads(input_dumps, numpy=True, labelled=True) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + assert (expected_vals == output[0]).all() + assert (np.array(["1", "2", "3"]) == output[1]).all() + assert (np.array(["a", "b"]) == output[2]).all() + + +class TestPandasJSONTests: + def test_dataframe(self, orient, numpy): + if orient == "records" and numpy: + pytest.skip("Not idiomatic pandas") + + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) + encode_kwargs = {} if orient is None else dict(orient=orient) + decode_kwargs = {} if numpy is None else dict(numpy=numpy) + + output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) + + # Ensure proper DataFrame initialization. + if orient == "split": + dec = _clean_dict(output) + output = DataFrame(**dec) + else: + output = DataFrame(output) + + # Corrections to enable DataFrame comparison. + if orient == "values": + df.columns = [0, 1, 2] + df.index = [0, 1] + elif orient == "records": + df.index = [0, 1] + elif orient == "index": + df = df.transpose() + + tm.assert_frame_equal(output, df, check_dtype=False) + + def test_dataframe_nested(self, orient): + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) + + nested = {"df1": df, "df2": df.copy()} + kwargs = {} if orient is None else dict(orient=orient) + + exp = { + "df1": ujson.decode(ujson.encode(df, **kwargs)), + "df2": ujson.decode(ujson.encode(df, **kwargs)), + } + assert ujson.decode(ujson.encode(nested, **kwargs)) == exp + + def test_dataframe_numpy_labelled(self, orient): + if orient in ("split", "values"): + pytest.skip("Incompatible with labelled=True") + + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["a", "b"], + columns=["x", "y", "z"], + dtype=np.int, + ) + kwargs = {} if orient is None else dict(orient=orient) + + output = DataFrame( + *ujson.decode(ujson.encode(df, **kwargs), numpy=True, labelled=True) + ) + + if orient is None: + df = df.T + elif orient == "records": + df.index = [0, 1] + + tm.assert_frame_equal(output, df) + + def test_series(self, orient, numpy): + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() + + encode_kwargs = {} if orient is None else dict(orient=orient) + decode_kwargs = {} if numpy is None else dict(numpy=numpy) + + output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs) + + if orient == "split": + dec = _clean_dict(output) + output = Series(**dec) + else: + output = Series(output) + + if orient in (None, "index"): + s.name = None + output = output.sort_values() + s.index = ["6", "7", "8", "9", "10", "15"] + elif orient in ("records", "values"): + s.name = None + s.index = [0, 1, 2, 3, 4, 5] + + tm.assert_series_equal(output, s, check_dtype=False) + + def test_series_nested(self, orient): + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() + nested = {"s1": s, "s2": s.copy()} + kwargs = {} if orient is None else dict(orient=orient) + + exp = { + "s1": ujson.decode(ujson.encode(s, **kwargs)), + "s2": 
ujson.decode(ujson.encode(s, **kwargs)), + } + assert ujson.decode(ujson.encode(nested, **kwargs)) == exp + + def test_index(self): + i = Index([23, 45, 18, 98, 43, 11], name="index") + + # Column indexed. + output = Index(ujson.decode(ujson.encode(i)), name="index") + tm.assert_index_equal(i, output) + + output = Index(ujson.decode(ujson.encode(i), numpy=True), name="index") + tm.assert_index_equal(i, output) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) + output = Index(**dec) + + tm.assert_index_equal(i, output) + assert i.name == output.name + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), numpy=True)) + output = Index(**dec) + + tm.assert_index_equal(i, output) + assert i.name == output.name + + output = Index(ujson.decode(ujson.encode(i, orient="values")), name="index") + tm.assert_index_equal(i, output) + + output = Index( + ujson.decode(ujson.encode(i, orient="values"), numpy=True), name="index" + ) + tm.assert_index_equal(i, output) + + output = Index(ujson.decode(ujson.encode(i, orient="records")), name="index") + tm.assert_index_equal(i, output) + + output = Index( + ujson.decode(ujson.encode(i, orient="records"), numpy=True), name="index" + ) + tm.assert_index_equal(i, output) + + output = Index(ujson.decode(ujson.encode(i, orient="index")), name="index") + tm.assert_index_equal(i, output) + + output = Index( + ujson.decode(ujson.encode(i, orient="index"), numpy=True), name="index" + ) + tm.assert_index_equal(i, output) + + def test_datetime_index(self): + date_unit = "ns" + + rng = date_range("1/1/2000", periods=20) + encoded = ujson.encode(rng, date_unit=date_unit) + + decoded = DatetimeIndex(np.array(ujson.decode(encoded))) + tm.assert_index_equal(rng, decoded) + + ts = Series(np.random.randn(len(rng)), index=rng) + decoded = Series(ujson.decode(ujson.encode(ts, date_unit=date_unit))) + + idx_values = decoded.index.values.astype(np.int64) + decoded.index = DatetimeIndex(idx_values) + tm.assert_series_equal(ts, decoded) + + @pytest.mark.parametrize( + "invalid_arr", + [ + "[31337,]", # Trailing comma. + "[,31337]", # Leading comma. + "[]]", # Unmatched bracket. + "[,]", # Only comma. 
+ ], + ) + def test_decode_invalid_array(self, invalid_arr): + with pytest.raises(ValueError): + ujson.decode(invalid_arr) + + @pytest.mark.parametrize("arr", [[], [31337]]) + def test_decode_array(self, arr): + assert arr == ujson.decode(str(arr)) + + @pytest.mark.parametrize("extreme_num", [9223372036854775807, -9223372036854775808]) + def test_decode_extreme_numbers(self, extreme_num): + assert extreme_num == ujson.decode(str(extreme_num)) + + @pytest.mark.parametrize( + "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] + ) + def test_decode_too_extreme_numbers(self, too_extreme_num): + with pytest.raises(ValueError): + ujson.decode(too_extreme_num) + + def test_decode_with_trailing_whitespaces(self): + assert {} == ujson.decode("{}\n\t ") + + def test_decode_with_trailing_non_whitespaces(self): + with pytest.raises(ValueError): + ujson.decode("{}\n\t a") + + def test_decode_array_with_big_int(self): + with pytest.raises(ValueError): + ujson.loads("[18446098363113800555]") + + @pytest.mark.parametrize( + "float_number", + [ + 1.1234567893, + 1.234567893, + 1.34567893, + 1.4567893, + 1.567893, + 1.67893, + 1.7893, + 1.893, + 1.3, + ], + ) + @pytest.mark.parametrize("sign", [-1, 1]) + def test_decode_floating_point(self, sign, float_number): + float_number *= sign + tm.assert_almost_equal( + float_number, ujson.loads(str(float_number)), check_less_precise=15 + ) + + def test_encode_big_set(self): + s = set() + + for x in range(0, 100000): + s.add(x) + + # Make sure no Exception is raised. + ujson.encode(s) + + def test_encode_empty_set(self): + assert "[]" == ujson.encode(set()) + + def test_encode_set(self): + s = {1, 2, 3, 4, 5, 6, 7, 8, 9} + enc = ujson.encode(s) + dec = ujson.decode(enc) + + for v in dec: + assert v in s diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/__init__.py b/venv/Lib/site-packages/pandas/tests/io/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/conftest.py b/venv/Lib/site-packages/pandas/tests/io/parser/conftest.py new file mode 100644 index 0000000..15967e3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/conftest.py @@ -0,0 +1,123 @@ +import os +from typing import List, Optional + +import pytest + +from pandas import read_csv, read_table + + +class BaseParser: + engine: Optional[str] = None + low_memory = True + float_precision_choices: List[Optional[str]] = [] + + def update_kwargs(self, kwargs): + kwargs = kwargs.copy() + kwargs.update(dict(engine=self.engine, low_memory=self.low_memory)) + + return kwargs + + def read_csv(self, *args, **kwargs): + kwargs = self.update_kwargs(kwargs) + return read_csv(*args, **kwargs) + + def read_table(self, *args, **kwargs): + kwargs = self.update_kwargs(kwargs) + return read_table(*args, **kwargs) + + +class CParser(BaseParser): + engine = "c" + float_precision_choices = [None, "high", "round_trip"] + + +class CParserHighMemory(CParser): + low_memory = False + + +class CParserLowMemory(CParser): + low_memory = True + + +class PythonParser(BaseParser): + engine = "python" + float_precision_choices = [None] + + +@pytest.fixture +def csv_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. + """ + return datapath("io", "parser", "data") + + +@pytest.fixture +def csv1(csv_dir_path): + """ + The path to the data file "test1.csv" needed for parser tests. 
+ """ + return os.path.join(csv_dir_path, "test1.csv") + + +_cParserHighMemory = CParserHighMemory() +_cParserLowMemory = CParserLowMemory() +_pythonParser = PythonParser() + +_py_parsers_only = [_pythonParser] +_c_parsers_only = [_cParserHighMemory, _cParserLowMemory] +_all_parsers = [*_c_parsers_only, *_py_parsers_only] + +_py_parser_ids = ["python"] +_c_parser_ids = ["c_high", "c_low"] +_all_parser_ids = [*_c_parser_ids, *_py_parser_ids] + + +@pytest.fixture(params=_all_parsers, ids=_all_parser_ids) +def all_parsers(request): + """ + Fixture all of the CSV parsers. + """ + return request.param + + +@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) +def c_parser_only(request): + """ + Fixture all of the CSV parsers using the C engine. + """ + return request.param + + +@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) +def python_parser_only(request): + """ + Fixture all of the CSV parsers using the Python engine. + """ + return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. + """ + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_c_parser_only.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_c_parser_only.py new file mode 100644 index 0000000..1737f14 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_c_parser_only.py @@ -0,0 +1,610 @@ +""" +Tests that apply specifically to the CParser. Unless specifically stated +as a CParser-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the Python parser can accept +further arguments when parsing. +""" + +from io import BytesIO, StringIO, TextIOWrapper +import mmap +import os +import tarfile + +import numpy as np +import pytest + +from pandas.errors import ParserError +import pandas.util._test_decorators as td + +from pandas import DataFrame, concat +import pandas._testing as tm + + +@pytest.mark.parametrize( + "malformed", + ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], + ids=["words pointer", "stream pointer", "lines pointer"], +) +def test_buffer_overflow(c_parser_only, malformed): + # see gh-9205: test certain malformed input files that cause + # buffer overflows in tokenizer.c + msg = "Buffer overflow caught - possible malformed input file." + parser = c_parser_only + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(malformed)) + + +def test_buffer_rd_bytes(c_parser_only): + # see gh-12098: src->buffer in the C parser can be freed twice leading + # to a segfault if a corrupt gzip file is read with 'read_csv', and the + # buffer is filled more than once before gzip raises an Exception. 
+ + data = ( + "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" + "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" + "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" + "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" + ) + parser = c_parser_only + + for _ in range(100): + try: + parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True) + except Exception: + pass + + +def test_delim_whitespace_custom_terminator(c_parser_only): + # See gh-12912 + data = "a b c~1 2 3~4 5 6~7 8 9" + parser = c_parser_only + + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + +def test_dtype_and_names_error(c_parser_only): + # see gh-8833: passing both dtype and names + # resulting in an error reporting issue + parser = c_parser_only + data = """ +1.0 1 +2.0 2 +3.0 3 +""" + # base cases + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) + tm.assert_frame_equal(result, expected) + + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # fallback casting + result = parser.read_csv( + StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} + ) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) + expected["a"] = expected["a"].astype(np.int32) + tm.assert_frame_equal(result, expected) + + data = """ +1.0 1 +nan 2 +3.0 3 +""" + # fallback casting, but not castable + with pytest.raises(ValueError, match="cannot safely convert"): + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) + + +@pytest.mark.parametrize( + "match,kwargs", + [ + # For each of these cases, all of the dtypes are valid, just unsupported. 
+ ( + ( + "the dtype datetime64 is not supported for parsing, " + "pass this column using parse_dates instead" + ), + dict(dtype={"A": "datetime64", "B": "float64"}), + ), + ( + ( + "the dtype datetime64 is not supported for parsing, " + "pass this column using parse_dates instead" + ), + dict(dtype={"A": "datetime64", "B": "float64"}, parse_dates=["B"]), + ), + ( + "the dtype timedelta64 is not supported for parsing", + dict(dtype={"A": "timedelta64", "B": "float64"}), + ), + ("the dtype 262144b) + parser = c_parser_only + header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" + data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" + header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" + data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" + test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 + + df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) + + assert df.size == 1010 * 10 + + +def test_float_precision_round_trip_with_text(c_parser_only): + # see gh-15140 + parser = c_parser_only + df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") + tm.assert_frame_equal(df, DataFrame({0: ["a"]})) + + +def test_large_difference_in_columns(c_parser_only): + # see gh-14125 + parser = c_parser_only + + count = 10000 + large_row = ("X," * count)[:-1] + "\n" + normal_row = "XXXXXX XXXXXX,111111111111111\n" + test_input = (large_row + normal_row * 6)[:-1] + + result = parser.read_csv(StringIO(test_input), header=None, usecols=[0]) + rows = test_input.split("\n") + + expected = DataFrame([row.split(",")[0] for row in rows]) + tm.assert_frame_equal(result, expected) + + +def test_data_after_quote(c_parser_only): + # see gh-15910 + parser = c_parser_only + + data = 'a\n1\n"b"a' + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"a": ["1", "ba"]}) + tm.assert_frame_equal(result, expected) + + +def test_comment_whitespace_delimited(c_parser_only, capsys): + parser = c_parser_only + test_input = """\ +1 2 +2 2 3 +3 2 3 # 3 fields +4 2 3# 3 fields +5 2 # 2 fields +6 2# 2 fields +7 # 1 field, NaN +8# 1 field, NaN +9 2 3 # skipped line +# comment""" + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + error_bad_lines=False, + ) + captured = capsys.readouterr() + # skipped lines 2, 3, 4, 9 + for line_num in (2, 3, 4, 9): + assert "Skipping line {}".format(line_num) in captured.err + expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) + tm.assert_frame_equal(df, expected) + + +def test_file_like_no_next(c_parser_only): + # gh-16530: the file-like need not have a "next" or "__next__" + # attribute despite having an "__iter__" attribute. + # + # NOTE: This is only true for the C engine, not Python engine. 
+ class NoNextBuffer(StringIO): + def __next__(self): + raise AttributeError("No next method") + + next = __next__ + + parser = c_parser_only + data = "a\n1" + + expected = DataFrame({"a": [1]}) + result = parser.read_csv(NoNextBuffer(data)) + + tm.assert_frame_equal(result, expected) + + +def test_buffer_rd_bytes_bad_unicode(c_parser_only): + # see gh-22748 + t = BytesIO(b"\xB0") + t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") + msg = "'utf-8' codec can't encode character" + with pytest.raises(UnicodeError, match=msg): + c_parser_only.read_csv(t, encoding="UTF-8") + + +@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) +def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): + # see gh-16530 + # + # Unfortunately, Python's CSV library can't handle + # tarfile objects (expects string, not bytes when + # iterating through a file-like). + parser = c_parser_only + tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix) + + with tarfile.open(tar_path, "r") as tar: + data_file = tar.extractfile("tar_data.csv") + + out = parser.read_csv(data_file) + expected = DataFrame({"a": [1]}) + tm.assert_frame_equal(out, expected) + + +@pytest.mark.high_memory +def test_bytes_exceed_2gb(c_parser_only): + # see gh-16798 + # + # Read from a "CSV" that has a column larger than 2GB. + parser = c_parser_only + + if parser.low_memory: + pytest.skip("not a high_memory test") + + csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) + df = parser.read_csv(csv) + assert not df.empty + + +def test_chunk_whitespace_on_boundary(c_parser_only): + # see gh-9735: this issue is C parser-specific (bug when + # parsing whitespace and characters at chunk boundary) + # + # This test case has a field too large for the Python parser / CSV library. + parser = c_parser_only + + chunk1 = "a" * (1024 * 256 - 2) + "\na" + chunk2 = "\n a" + result = parser.read_csv(StringIO(chunk1 + chunk2), header=None) + + expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"]) + tm.assert_frame_equal(result, expected) + + +def test_file_handles_mmap(c_parser_only, csv1): + # gh-14418 + # + # Don't close user provided file handles. 
+ parser = c_parser_only + + with open(csv1, "r") as f: + m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + parser.read_csv(m) + + assert not m.closed + m.close() + + +def test_file_binary_mode(c_parser_only): + # see gh-23779 + parser = c_parser_only + expected = DataFrame([[1, 2, 3], [4, 5, 6]]) + + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write("1,2,3\n4,5,6") + + with open(path, "rb") as f: + result = parser.read_csv(f, header=None) + tm.assert_frame_equal(result, expected) + + +def test_unix_style_breaks(c_parser_only): + # GH 11020 + parser = c_parser_only + with tm.ensure_clean() as path: + with open(path, "w", newline="\n") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + expected = DataFrame(columns=["col_1", "col_2", "col_3"]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_comment.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_comment.py new file mode 100644 index 0000000..60e32d7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_comment.py @@ -0,0 +1,136 @@ +""" +Tests that comments are properly handled during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("na_values", [None, ["NaN"]]) +def test_comment(all_parsers, na_values): + parser = all_parsers + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "read_kwargs", [dict(), dict(lineterminator="*"), dict(delim_whitespace=True)] +) +def test_line_comment(all_parsers, read_kwargs): + parser = all_parsers + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + if read_kwargs.get("delim_whitespace"): + data = data.replace(",", " ") + elif read_kwargs.get("lineterminator"): + if parser.engine != "c": + pytest.skip("Custom terminator not supported with Python engine") + + data = data.replace("\n", read_kwargs.get("lineterminator")) + + read_kwargs["comment"] = "#" + result = parser.read_csv(StringIO(data), **read_kwargs) + + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + tm.assert_frame_equal(result, expected) + + +def test_comment_skiprows(all_parsers): + parser = all_parsers + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # This should ignore the first four lines (including comments). + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", skiprows=4) + tm.assert_frame_equal(result, expected) + + +def test_comment_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Header should begin at the second non-comment line. 
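+    # With comment="#" the two comment lines are dropped entirely, so
+    # header=1 selects "A,B,C" (the second remaining line) as the header row.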
+ expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", header=1) + tm.assert_frame_equal(result, expected) + + +def test_comment_skiprows_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Skiprows should skip the first 4 lines (including comments), + # while header should start from the second non-commented line, + # starting with line 5. + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) +def test_custom_comment_char(all_parsers, comment_char): + parser = all_parsers + data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + result = parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", ["infer", None]) +def test_comment_first_line(all_parsers, header): + # see gh-4623 + parser = all_parsers + data = "# notes\na,b,c\n# more notes\n1,2,3" + + if header is None: + expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]}) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), comment="#", header=header) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_common.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_common.py new file mode 100644 index 0000000..4c02a37 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_common.py @@ -0,0 +1,2072 @@ +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" +import codecs +import csv +from datetime import datetime +from io import StringIO +import os +import platform +from tempfile import TemporaryFile +from urllib.error import URLError + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp +from pandas.errors import DtypeWarning, EmptyDataError, ParserError + +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +import pandas._testing as tm + +from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser + + +def test_override_set_noconvert_columns(): + # see gh-17351 + # + # Usecols needs to be sorted in _set_noconvert_columns based + # on the test_usecols_with_parse_dates test from test_usecols.py + class MyTextFileReader(TextFileReader): + def __init__(self): + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == "integer": + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. 
+ # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + + parse_dates = [[1, 2]] + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + parser = MyTextFileReader() + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } + parser._engine = MyCParserWrapper(StringIO(data), **parser.options) + + result = parser.read() + tm.assert_frame_equal(result, expected) + + +def test_empty_decimal_marker(all_parsers): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + # Parsers support only length-1 decimals + msg = "Only length-1 decimal markers supported" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), decimal="") + + +def test_bad_stream_exception(all_parsers, csv_dir_path): + # see gh-13652 + # + # This test validates that both the Python engine and C engine will + # raise UnicodeDecodeError instead of C engine raising ParserError + # and swallowing the exception that caused read to fail. + path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup("utf-8") + parser = all_parsers + msg = "'utf-8' codec can't decode byte" + + # Stream must be binary UTF8. + with open(path, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: + + with pytest.raises(UnicodeDecodeError, match=msg): + parser.read_csv(stream) + + +def test_read_csv_local(all_parsers, csv1): + prefix = "file:///" if compat.is_platform_windows() else "file://" + parser = all_parsers + + fname = prefix + str(os.path.abspath(csv1)) + result = parser.read_csv(fname, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_1000_sep(all_parsers): + parser = all_parsers + data = """A|B|C +1|2,334|5 +10|13|10. +""" + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + + result = parser.read_csv(StringIO(data), sep="|", thousands=",") + tm.assert_frame_equal(result, expected) + + +def test_squeeze(all_parsers): + data = """\ +a,1 +b,2 +c,3 +""" + parser = all_parsers + index = Index(["a", "b", "c"], name=0) + expected = Series([1, 2, 3], name=1, index=index) + + result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) + tm.assert_series_equal(result, expected) + + # see gh-8217 + # + # Series should not be a view. 
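+    # gh-8217: squeeze=True must return an independent Series, not a view
+    # onto the parsed data.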
+ assert not result._is_view + + +def test_malformed(all_parsers): + # see gh-6607 + parser = all_parsers + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#") + + +@pytest.mark.parametrize("nrows", [5, 3, None]) +def test_malformed_chunks(all_parsers, nrows): + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + parser = all_parsers + msg = "Expected 3 fields in line 6, saw 5" + reader = parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) + + with pytest.raises(ParserError, match=msg): + reader.read(nrows) + + +def test_unnamed_columns(all_parsers): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_csv_mixed_type(all_parsers): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + parser = all_parsers + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_low_memory_no_rows_with_index(all_parsers): + # see gh-21141 + parser = all_parsers + + if not parser.low_memory: + pytest.skip("This is a low-memory specific test") + + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_dataframe(all_parsers, csv1): + parser = all_parsers + result = parser.read_csv(csv1, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_no_index_name(all_parsers, csv_dir_path): + parser = all_parsers + csv2 = os.path.join(csv_dir_path, "test2.csv") + result = parser.read_csv(csv2, index_col=0, parse_dates=True) + + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_wrong_num_columns(all_parsers): + # Too few columns. 
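+    # The header declares 6 columns, but the second data row has 7 fields,
+    # so the parser raises "Expected 6 fields in line 3, saw 7".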
+ data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + parser = all_parsers + msg = "Expected 6 fields in line 3, saw 7" + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_read_duplicate_index_explicit(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +def test_read_duplicate_index_implicit(all_parsers): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + dict(true_values=["yes", "Yes", "YES"], false_values=["no", "NO", "No"]), + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + dict(true_values=["foo"], false_values=["bar"]), + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) +def test_parse_bool(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_int_conversion(all_parsers): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [3, 3.0]) +def test_read_nrows(all_parsers, nrows): + # see gh-10476 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) + parser = all_parsers + + result = parser.read_csv(StringIO(data), nrows=nrows) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) +def test_read_nrows_bad(all_parsers, nrows): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + msg = r"'nrows' must be an integer >=0" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + + +@pytest.mark.parametrize("index_col", [0, "index"]) +def test_read_chunksize_with_index(all_parsers, index_col): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 
+baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) + expected = expected.set_index("index") + + chunks = list(reader) + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) +def test_read_chunksize_bad(all_parsers, chunksize): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + msg = r"'chunksize' must be an integer >=1" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=chunksize) + + +@pytest.mark.parametrize("chunksize", [2, 8]) +def test_read_chunksize_and_nrows(all_parsers, chunksize): + # see gh-15755 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0, nrows=5) + + reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) + expected = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(concat(reader), expected) + + +def test_read_chunksize_and_nrows_changing_size(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0, nrows=5) + + reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs) + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) + + +def test_get_chunk_passed_chunksize(all_parsers): + parser = all_parsers + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + + reader = parser.read_csv(StringIO(data), chunksize=2) + result = reader.get_chunk() + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) +def test_read_chunksize_compat(all_parsers, kwargs): + # see gh-12185 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs) + + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(concat(reader), result) + + +def test_read_chunksize_jagged_names(all_parsers): + # see gh-23509 + parser = all_parsers + data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) + + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4) + + result = concat(reader) + tm.assert_frame_equal(result, expected) + + +def test_read_data_list(all_parsers): + parser = all_parsers + kwargs = dict(index_col=0) + data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" + + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] + expected = parser.read_csv(StringIO(data), **kwargs) + + parser = 
TextParser(data_list, chunksize=2, **kwargs) + result = parser.read() + + tm.assert_frame_equal(result, expected) + + +def test_iterator(all_parsers): + # see gh-6607 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + expected = parser.read_csv(StringIO(data), **kwargs) + reader = parser.read_csv(StringIO(data), iterator=True, **kwargs) + + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, expected[3:]) + + +def test_iterator2(all_parsers): + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + reader = parser.read_csv(StringIO(data), iterator=True) + result = list(reader) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result[0], expected) + + +def test_reader_list(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + lines = list(csv.reader(StringIO(data))) + reader = TextParser(lines, chunksize=2, **kwargs) + + expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_reader_list_skiprows(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + lines = list(csv.reader(StringIO(data))) + reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs) + + expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[1:3]) + + +def test_iterator_stop_on_chunksize(all_parsers): + # gh-3967: stopping iteration when chunksize is specified + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + reader = parser.read_csv(StringIO(data), chunksize=1) + result = list(reader) + + assert len(result) == 3 + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(concat(result), expected) + + +@pytest.mark.parametrize( + "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] +) +def test_iterator_skipfooter_errors(all_parsers, kwargs): + msg = "'skipfooter' not supported for 'iteration'" + parser = all_parsers + data = "a\n1\n2" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, **kwargs) + + +def test_nrows_skipfooter_errors(all_parsers): + msg = "'skipfooter' not supported with 'nrows'" + data = "a\n1\n2\n3\n4\n5\n6" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, nrows=5) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""", + dict(index_col=0, names=["index", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", 
"foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""", + dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) +def test_pass_names_with_index(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_no_level_names(all_parsers, index_col): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + headless_data = "\n".join(data.split("\n")[1:]) + + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) + expected = parser.read_csv(StringIO(data), index_col=index_col) + + # No index names in headless data. + expected.index.names = [None] * 2 + tm.assert_frame_equal(result, expected) + + +def test_multi_index_no_level_names_implicit(all_parsers): + parser = all_parsers + data = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) +@pytest.mark.parametrize("round_trip", [True, False]) +def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): + # see gh-14545 + parser = all_parsers + data = expected.to_csv(index=False) if round_trip else data + + result = parser.read_csv(StringIO(data), header=header) + tm.assert_frame_equal(result, expected) + + +def test_no_unnamed_index(all_parsers): + parser = all_parsers + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + result = parser.read_csv(StringIO(data), sep=" ") + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_parse_simple_list(all_parsers): + parser = all_parsers + data = """foo +bar baz +qux foo +foo +bar""" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) + tm.assert_frame_equal(result, expected) + + +@tm.network +def test_url(all_parsers, csv_dir_path): + # TODO: FTP testing + parser = all_parsers + kwargs = dict(sep="\t") + + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) + url_result = parser.read_csv(url, **kwargs) + 
+ local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + tm.assert_frame_equal(url_result, local_result) + + +@pytest.mark.slow +def test_local_file(all_parsers, csv_dir_path): + parser = all_parsers + kwargs = dict(sep="\t") + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + url = "file://localhost/" + local_path + + try: + url_result = parser.read_csv(url, **kwargs) + tm.assert_frame_equal(url_result, local_result) + except URLError: + # Fails on some systems. + pytest.skip("Failing on: " + " ".join(platform.uname())) + + +def test_path_path_lib(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +def test_path_local_path(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) + tm.assert_frame_equal(df, result) + + +def test_nonexistent_path(all_parsers): + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError + # GH#29233 "File foo" instead of "File b'foo'" + parser = all_parsers + path = "{}.csv".format(tm.rands(10)) + + msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" + with pytest.raises(FileNotFoundError, match=msg) as e: + parser.read_csv(path) + + filename = e.value.filename + + assert path == filename + + +def test_missing_trailing_delimiters(all_parsers): + parser = all_parsers + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + +def test_skip_initial_space(all_parsers): + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) + tm.assert_frame_equal(result, expected) + + +def test_trailing_delimiters(all_parsers): + # see gh-2442 + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_escapechar(all_parsers): + # https://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 
hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa + + parser = all_parsers + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) + + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) + + +def test_int64_min_issues(all_parsers): + # see gh-2599 + parser = all_parsers + data = "A,B\n0,0\n0," + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_parse_integers_above_fp_precision(all_parsers): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) + tm.assert_frame_equal(result, expected) + + +def test_chunks_have_consistent_numerical_type(all_parsers): + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + # Coercions should work without warnings. + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data)) + + assert type(result.a[0]) is np.float64 + assert result.a.dtype == np.float + + +def test_warn_if_chunks_have_mismatched_type(all_parsers): + warning_type = None + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. 
+ if parser.engine == "c" and parser.low_memory: + warning_type = DtypeWarning + + with tm.assert_produces_warning(warning_type): + df = parser.read_csv(StringIO(data)) + assert df.a.dtype == np.object + + +@pytest.mark.parametrize("sep", [" ", r"\s+"]) +def test_integer_overflow_bug(all_parsers, sep): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=None, sep=sep) + expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) + tm.assert_frame_equal(result, expected) + + +def test_catch_too_many_names(all_parsers): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + parser = all_parsers + msg = ( + "Too many columns specified: expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did not match " + "number of header fields in the file" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + + +def test_ignore_leading_whitespace(all_parsers): + # see gh-3374, gh-6607 + parser = all_parsers + data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + result = parser.read_csv(StringIO(data), sep=r"\s+") + + expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_chunk_begins_with_newline_whitespace(all_parsers): + # see gh-10022 + parser = all_parsers + data = "\n hello\nworld\n" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([" hello", "world"]) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index(all_parsers): + # see gh-10184 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame(columns=["y"], index=Index([], name="x")) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_multi_index(all_parsers): + # see gh-10467 + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=["x", "y"]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_reversed_multi_index(all_parsers): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=[1, 0]) + + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) + tm.assert_frame_equal(result, expected) + + +def test_float_parser(all_parsers): + # see gh-9565 + parser = all_parsers + data = "45e-1,4.5,45.,inf,-inf" + result = parser.read_csv(StringIO(data), header=None) + + expected = DataFrame([[float(s) for s in data.split(",")]]) + tm.assert_frame_equal(result, expected) + + +def test_scientific_no_exponent(all_parsers): + # see gh-12215 + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) + data = df.to_csv(index=False) + parser = all_parsers + + for precision in parser.float_precision_choices: + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) + + +@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) +def test_int64_overflow(all_parsers, conv): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + parser = all_parsers + + if conv is None: + # 13007854817840016671868 > UINT64_MAX, so this + # will 
overflow and return object as the dtype. + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) + tm.assert_frame_equal(result, expected) + else: + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. + msg = ( + "(Python int too large to convert to C long)|" + "(long too big to convert)|" + "(int too big to convert)" + ) + + with pytest.raises(OverflowError, match=msg): + parser.read_csv(StringIO(data), converters={"ID": conv}) + + +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] +) +def test_int64_uint64_range(all_parsers, val): + # These numbers fall right inside the int64-uint64 + # range, so they should be parsed as string. + parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) +def test_outside_int64_uint64_range(all_parsers, val): + # These numbers fall just outside the int64-uint64 + # range, so they should be parsed as string. + parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([str(val)]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) +def test_numeric_range_too_wide(all_parsers, exp_data): + # No numerical dtype can hold both negative and uint64 + # values, so they should be cast as string. 
+ parser = all_parsers + data = "\n".join(exp_data) + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("iterator", [True, False]) +def test_empty_with_nrows_chunksize(all_parsers, iterator): + # see gh-9535 + parser = all_parsers + expected = DataFrame(columns=["foo", "bar"]) + + nrows = 10 + data = StringIO("foo,bar\n") + + if iterator: + result = next(iter(parser.read_csv(data, chunksize=nrows))) + else: + result = parser.read_csv(data, nrows=nrows) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + dict(skiprows=[2]), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#", skip_blank_lines=False), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + dict(skip_blank_lines=False), + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + dict(skip_blank_lines=False), + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + dict(escapechar="\\"), + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + "eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) +def test_eof_states(all_parsers, data, kwargs, expected, msg): + # see gh-10728, gh-10548 + parser = all_parsers + + if expected is None: + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) +def test_uneven_lines_with_usecols(all_parsers, usecols): + # see gh-12203 + parser = all_parsers + data = r"""a,b,c +0,1,2 +3,4,5,6,7 +8,9,10""" + + if usecols is None: + # Make sure that an error is still raised + # when the "usecols" parameter is not provided. 
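+        # The second data row has 5 fields against a 3-column header, which is
+        # an error unless usecols trims the extra columns.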
+ msg = r"Expected \d+ fields in line \d+, saw \d+" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + else: + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. + ("", dict(), None), + ("", dict(usecols=["X"]), None), + ( + ",,", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"]), + ), + ], +) +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): + # see gh-12493 + parser = all_parsers + + if expected is None: + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + dict( + header=None, + delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], + skip_blank_lines=True, + ), + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. + ( + dict( + delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True + ), + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) +def test_trailing_spaces(all_parsers, kwargs, expected): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa + parser = all_parsers + + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_sep_with_delim_whitespace(all_parsers): + # see gh-6607 + data = "a b c\n1 2 3" + parser = all_parsers + + with pytest.raises(ValueError, match="you can only specify one"): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + + +@pytest.mark.parametrize("delim_whitespace", [True, False]) +def test_single_char_leading_whitespace(all_parsers, delim_whitespace): + # see gh-9710 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b\n""" + + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): + parser = all_parsers + data = """\ +A,B,C +1,2.,4. 
+ + +5.,NaN,10.0 + +-70,.4,1 +""" + + if sep == r"\s+": + data = data.replace(",", " ") + + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) + expected = DataFrame(exp_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_whitespace_lines(all_parsers): + parser = all_parsers + data = """ + +\t \t\t +\t +A,B,C +\t 1,2.,4. +5.,NaN,10.0 +""" + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) +def test_whitespace_regex_separator(all_parsers, data, expected): + # see gh-6607 + parser = all_parsers + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_verbose_read(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + # Engines are verbose in different ways. + parser.read_csv(StringIO(data), verbose=True) + captured = capsys.readouterr() + + if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 3 NA values in column a\n" + + +def test_verbose_read2(all_parsers, capsys): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + parser.read_csv(StringIO(data), verbose=True, index_col=0) + captured = capsys.readouterr() + + # Engines are verbose in different ways. + if parser.engine == "c": + assert "Tokenization took:" in captured.out + assert "Parser memory cleanup took:" in captured.out + else: # Python engine + assert captured.out == "Filled 1 NA values in column a\n" + + +def test_iteration_open_handle(all_parsers): + parser = all_parsers + kwargs = dict(squeeze=True, header=None) + + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + + with open(path, "r") as f: + for line in f: + if "CCC" in line: + break + + result = parser.read_csv(f, **kwargs) + expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. 
+""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): + parser = all_parsers + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + tm.assert_frame_equal(result, expected) + + +def test_euro_decimal_format(all_parsers): + parser = all_parsers + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + result = parser.read_csv(StringIO(data), sep=";", decimal=",") + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("na_filter", [True, False]) +def test_inf_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) +def test_raise_on_no_columns(all_parsers, nrows): + parser = all_parsers + data = "\n" * nrows + + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_memory_map(all_parsers, csv_dir_path): + mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") + parser = all_parsers + + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) + + result = parser.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(result, expected) + + +def test_null_byte_char(all_parsers): + # see gh-2741 + data = "\x00,foo" + names = ["a", "b"] + parser = all_parsers + + if parser.engine == "c": + expected = DataFrame([[np.nan, "foo"]], columns=names) + out = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), names=names) + + +def test_temporary_file(all_parsers): + # see gh-13398 + parser = all_parsers + data = "0 0" + + new_file = TemporaryFile("w+") + new_file.write(data) + new_file.flush() + new_file.seek(0) + + result = parser.read_csv(new_file, sep=r"\s+", header=None) + new_file.close() + + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte(all_parsers): + # see gh-5500 + parser = all_parsers + data = "a,b\n1\x1a,2" + + expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) + result = 
parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte_to_file(all_parsers): + # see gh-16559 + parser = all_parsers + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) + path = "__{}__.csv".format(tm.rands(10)) + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, expected) + + +def test_sub_character(all_parsers, csv_dir_path): + # see gh-16893 + filename = os.path.join(csv_dir_path, "sub_char.csv") + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + + parser = all_parsers + result = parser.read_csv(filename) + tm.assert_frame_equal(result, expected) + + +def test_file_handle_string_io(all_parsers): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + data = "a,b\n1,2" + + fh = StringIO(data) + parser.read_csv(fh) + assert not fh.closed + + +def test_file_handles_with_open(all_parsers, csv1): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +def test_invalid_file_buffer_class(all_parsers): + # see gh-15337 + class InvalidBuffer: + pass + + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(InvalidBuffer()) + + +def test_invalid_file_buffer_mock(all_parsers): + # see gh-15337 + parser = all_parsers + msg = "Invalid file path or buffer object type" + + class Foo: + pass + + with pytest.raises(ValueError, match=msg): + parser.read_csv(Foo()) + + +def test_valid_file_buffer_seems_invalid(all_parsers): + # gh-16135: we want to ensure that "tell" and "seek" + # aren't actually being used when we call `read_csv` + # + # Thus, while the object may look "invalid" (these + # methods are attributes of the `StringIO` class), + # it is still a valid file-object for our purposes. + class NoSeekTellBuffer(StringIO): + def tell(self): + raise AttributeError("No tell method") + + def seek(self, pos, whence=0): + raise AttributeError("No seek method") + + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(NoSeekTellBuffer(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs", + [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. 
+) +@pytest.mark.parametrize( + "warn_kwargs", [dict(), dict(warn_bad_lines=True), dict(warn_bad_lines=False)] +) +def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): + # see gh-15925 + parser = all_parsers + kwargs.update(**warn_kwargs) + data = "a\n1\n1,2,3\n4\n5,6,7" + + msg = "Expected 1 fields in line 3, saw 3" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +def test_warn_bad_lines(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert "Skipping line 3" in captured.err + assert "Skipping line 5" in captured.err + + +def test_suppress_error_output(all_parsers, capsys): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv( + StringIO(data), error_bad_lines=False, warn_bad_lines=False + ) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert captured.err == "" + + +@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) +def test_filename_with_special_chars(all_parsers, filename): + # see gh-15086. + parser = all_parsers + df = DataFrame({"a": [1, 2, 3]}) + + with tm.ensure_clean(filename) as path: + df.to_csv(path, index=False) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, df) + + +def test_read_csv_memory_growth_chunksize(all_parsers): + # see gh-24805 + # + # Let's just make sure that we don't crash + # as we iteratively process all chunks. + parser = all_parsers + + with tm.ensure_clean() as path: + with open(path, "w") as f: + for i in range(1000): + f.write(str(i) + "\n") + + result = parser.read_csv(path, chunksize=20) + + for _ in result: + pass + + +def test_read_table_equivalency_to_read_csv(all_parsers): + # see gh-21948 + # As of 0.25.0, read_table is undeprecated + parser = all_parsers + data = "a\tb\n1\t2\n3\t4" + expected = parser.read_csv(StringIO(data), sep="\t") + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_first_row_bom(all_parsers): + # see gh-26545 + parser = all_parsers + data = '''\ufeff"Head1" "Head2" "Head3"''' + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_compression.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_compression.py new file mode 100644 index 0000000..dc03370 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_compression.py @@ -0,0 +1,151 @@ +""" +Tests compressed data parsing functionality for all +of the parsers defined in parsers.py +""" + +import os +import zipfile + +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def buffer(request): + return 
request.param + + +@pytest.fixture +def parser_and_data(all_parsers, csv1): + parser = all_parsers + + with open(csv1, "rb") as f: + data = f.read() + expected = parser.read_csv(csv1) + + return parser, data, expected + + +@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) +def test_zip(parser_and_data, compression): + parser, data, expected = parser_and_data + + with tm.ensure_clean("test_file.zip") as path: + with zipfile.ZipFile(path, mode="w") as tmp: + tmp.writestr("test_file", data) + + if compression == "zip2": + with open(path, "rb") as f: + result = parser.read_csv(f, compression="zip") + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("compression", ["zip", "infer"]) +def test_zip_error_multiple_files(parser_and_data, compression): + parser, data, expected = parser_and_data + + with tm.ensure_clean("combined_zip.zip") as path: + inner_file_names = ["test_file", "second_file"] + + with zipfile.ZipFile(path, mode="w") as tmp: + for file_name in inner_file_names: + tmp.writestr(file_name, data) + + with pytest.raises(ValueError, match="Multiple files"): + parser.read_csv(path, compression=compression) + + +def test_zip_error_no_files(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with zipfile.ZipFile(path, mode="w"): + pass + + with pytest.raises(ValueError, match="Zero files"): + parser.read_csv(path, compression="zip") + + +def test_zip_error_invalid_zip(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with open(path, "wb") as f: + with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"): + parser.read_csv(f, compression="zip") + + +@pytest.mark.parametrize("filename", [None, "test.{ext}"]) +def test_compression(parser_and_data, compression_only, buffer, filename): + parser, data, expected = parser_and_data + compress_type = compression_only + + ext = "gz" if compress_type == "gzip" else compress_type + filename = filename if filename is None else filename.format(ext=ext) + + if filename and buffer: + pytest.skip("Cannot deduce compression from buffer of compressed data.") + + with tm.ensure_clean(filename=filename) as path: + tm.write_to_compressed(compress_type, path, data) + compression = "infer" if filename else compress_type + + if buffer: + with open(path, "rb") as f: + result = parser.read_csv(f, compression=compression) + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("ext", [None, "gz", "bz2"]) +def test_infer_compression(all_parsers, csv1, buffer, ext): + # see gh-9770 + parser = all_parsers + kwargs = dict(index_col=0, parse_dates=True) + + expected = parser.read_csv(csv1, **kwargs) + kwargs["compression"] = "infer" + + if buffer: + with open(csv1) as f: + result = parser.read_csv(f, **kwargs) + else: + ext = "." 
+ ext if ext else "" + result = parser.read_csv(csv1 + ext, **kwargs) + + tm.assert_frame_equal(result, expected) + + +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") + + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") + expected = pd.DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) +def test_invalid_compression(all_parsers, invalid_compression): + parser = all_parsers + compress_kwargs = dict(compression=invalid_compression) + + msg = "Unrecognized compression type: {compression}".format(**compress_kwargs) + + with pytest.raises(ValueError, match=msg): + parser.read_csv("test_file.zip", **compress_kwargs) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_converters.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_converters.py new file mode 100644 index 0000000..88b400d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_converters.py @@ -0,0 +1,160 @@ +""" +Tests column conversion functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +from dateutil.parser import parse +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index +import pandas._testing as tm + + +def test_converters_type_must_be_dict(all_parsers): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +""" + + with pytest.raises(TypeError, match="Type converters.+"): + parser.read_csv(StringIO(data), converters=0) + + +@pytest.mark.parametrize("column", [3, "D"]) +@pytest.mark.parametrize( + "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer. +) +def test_converters(all_parsers, column, converter): + parser = all_parsers + data = """A,B,C,D +a,1,2,01/01/2009 +b,3,4,01/02/2009 +c,4,5,01/03/2009 +""" + result = parser.read_csv(StringIO(data), converters={column: converter}) + + expected = parser.read_csv(StringIO(data)) + expected["D"] = expected["D"].map(converter) + + tm.assert_frame_equal(result, expected) + + +def test_converters_no_implicit_conv(all_parsers): + # see gh-2184 + parser = all_parsers + data = """000102,1.2,A\n001245,2,B""" + + converters = {0: lambda x: x.strip()} + result = parser.read_csv(StringIO(data), header=None, converters=converters) + + # Column 0 should not be casted to numeric and should remain as object. 
+ expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]]) + tm.assert_frame_equal(result, expected) + + +def test_converters_euro_decimal_format(all_parsers): + # see gh-583 + converters = dict() + parser = all_parsers + + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,7387 +2;121,12;14897,76;DEF;uyt;0,3773 +3;878,158;108013,434;GHI;rez;2,7356""" + converters["Number1"] = converters["Number2"] = converters[ + "Number3" + ] = lambda x: float(x.replace(",", ".")) + + result = parser.read_csv(StringIO(data), sep=";", converters=converters) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], + [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], + [3, 878.158, 108013.434, "GHI", "rez", 2.7356], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) + tm.assert_frame_equal(result, expected) + + +def test_converters_corner_with_nans(all_parsers): + parser = all_parsers + data = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + # Example converters. + def convert_days(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_days_sentinel(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_score(x): + x = x.strip() + + if not x: + return np.nan + + if x.find("-") > 0: + val_min, val_max = map(int, x.split("-")) + val = 0.5 * (val_min + val_max) + else: + val = float(x) + + return val + + results = [] + + for day_converter in [convert_days, convert_days_sentinel]: + result = parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) + assert pd.isna(result["days"][1]) + results.append(result) + + tm.assert_frame_equal(results[0], results[1]) + + +def test_converter_index_col_bug(all_parsers): + # see gh-1835 + parser = all_parsers + data = "A;B\n1;2\n3;4" + + rs = parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x} + ) + + xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) + tm.assert_frame_equal(rs, xp) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_dialect.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_dialect.py new file mode 100644 index 0000000..cc65def --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_dialect.py @@ -0,0 +1,144 @@ +""" +Tests that dialects are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import csv +from io import StringIO + +import pytest + +from pandas.errors import ParserWarning + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture +def custom_dialect(): + dialect_name = "weird" + dialect_kwargs = dict( + doublequote=False, + escapechar="~", + delimiter=":", + skipinitialspace=False, + quotechar="~", + quoting=3, + ) + return dialect_name, dialect_kwargs + + +def test_dialect(all_parsers): + parser = all_parsers + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + df = parser.read_csv(StringIO(data), dialect=dia) + + data = """\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +""" + exp = parser.read_csv(StringIO(data)) + exp.replace("a", '"a', inplace=True) + tm.assert_frame_equal(df, exp) + + +def test_dialect_str(all_parsers): + dialect_name = 
"mydialect" + parser = all_parsers + data = """\ +fruit:vegetable +apple:broccoli +pear:tomato +""" + exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) + + with tm.with_csv_dialect(dialect_name, delimiter=":"): + df = parser.read_csv(StringIO(data), dialect=dialect_name) + tm.assert_frame_equal(df, exp) + + +def test_invalid_dialect(all_parsers): + class InvalidDialect: + pass + + data = "a\n1" + parser = all_parsers + msg = "Invalid dialect" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=InvalidDialect) + + +@pytest.mark.parametrize( + "arg", + [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], +) +@pytest.mark.parametrize("value", ["dialect", "default", "other"]) +def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value): + # see gh-23761. + dialect_name, dialect_kwargs = custom_dialect + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + data = "a:b\n1:2" + + warning_klass = None + kwds = dict() + + # arg=None tests when we pass in the dialect without any other arguments. + if arg is not None: + if "value" == "dialect": # No conflict --> no warning. + kwds[arg] = dialect_kwargs[arg] + elif "value" == "default": # Default --> no warning. + from pandas.io.parsers import _parser_defaults + + kwds[arg] = _parser_defaults[arg] + else: # Non-default + conflict with dialect --> warning. + warning_klass = ParserWarning + kwds[arg] = "blah" + + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + with tm.assert_produces_warning(warning_klass): + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwds) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,warning_klass", + [ + (dict(sep=","), None), # sep is default --> sep_override=True + (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False + (dict(delimiter=":"), None), # No conflict + (dict(delimiter=None), None), # Default arguments --> sep_override=True + (dict(delimiter=","), ParserWarning), # Conflict + (dict(delimiter="."), ParserWarning), # Conflict + ], + ids=[ + "sep-override-true", + "sep-override-false", + "delimiter-no-conflict", + "delimiter-default-arg", + "delimiter-conflict", + "delimiter-conflict2", + ], +) +def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass): + # see gh-23761. 
+ dialect_name, dialect_kwargs = custom_dialect + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + data = "a:b\n1:2" + + with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + with tm.assert_produces_warning(warning_klass): + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwargs) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_dtypes.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_dtypes.py new file mode 100644 index 0000000..11dcf7f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_dtypes.py @@ -0,0 +1,584 @@ +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import ParserWarning + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) + + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +def test_dtype_all_columns_empty(all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) + + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) + + +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +@pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) +def test_categorical_dtype(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) +def test_categorical_dtype_single(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) + 
actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_unsorted(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_missing(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.slow +def test_categorical_dtype_high_cardinality_numeric(all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True + ) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_latin1(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_utf16(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "," + + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_infer_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_explicit_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ["a", "b", "c"] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] + ), + ] + dtype = CategoricalDtype(cats) + actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) +def test_categorical_category_dtype(all_parsers, categories, ordered): + parser = all_parsers + data = 
"""a,b +1,a +1,b +1,b +2,c""" + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_category_dtype_unsorted(all_parsers): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) + + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_numeric(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} + + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_datetime(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))} + + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timestamp(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} + + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timedelta(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) +def test_categorical_dtype_coerces_boolean(all_parsers, data): + # see gh-20498 + parser = all_parsers + dtype = {"b": CategoricalDtype([False, True])} + expected = DataFrame({"b": Categorical([True, False, None, False])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_unexpected_categories(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_empty_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=np.object)}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) + + expected = DataFrame( + {"two": 
np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_multi_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two,three" + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] + ) + expected = DataFrame({"three": np.empty(0, dtype=np.object)}, index=exp_idx) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) + expected.index = expected.index.astype(object) + + with pytest.raises(ValueError, match="Duplicate names"): + data = "" + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), + ( + dict(a="category", b="category"), + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + dict(a=np.int64, b=np.int32), + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], +) +def test_empty_dtype(all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) + + +def test_boolean_dtype(all_parsers): + parser = all_parsers + data = "\n".join( + [ + "a", + "True", + "TRUE", + "true", + "False", + "FALSE", + "false", + "NaN", + "nan", + "NA", + "null", + "NULL", + ] + ) + + result = parser.read_csv(StringIO(data), dtype="boolean") + expected = pd.DataFrame( + { + "a": pd.array( + [True, True, True, False, False, False, None, None, None, None, None], + dtype="boolean", + ) + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_encoding.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_encoding.py new file mode 100644 index 0000000..33abf4b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_encoding.py @@ -0,0 +1,172 @@ +""" +Tests encoding functionality during parsing +for all of the parsers defined in parsers.py +""" + +from io import BytesIO +import os +import tempfile + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_bytes_io_input(all_parsers): + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) + expected = DataFrame([["\u0141aski, Jan", 1]]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def 
test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = """skip this +skip this too +A,B,C +1,2,3 +4,5,6""".replace( + ",", sep + ) + path = "__{}__.csv".format(tm.rands(10)) + kwargs = dict(sep=sep, skiprows=2) + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + from io import TextIOWrapper + + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + bytes_buffer = BytesIO(data.encode(utf8)) + bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) + + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + + bytes_buffer.close() + tm.assert_frame_equal(result, expected) + + +def test_utf16_example(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + result = parser.read_csv(path, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" + assert got == expected + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) +def test_utf8_bom(all_parsers, data, kwargs, expected): + # see gh-4793 + parser = all_parsers + bom = "\ufeff" + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = encoding_fmt.format(utf_value) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. 
+ parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + expected = DataFrame({"foo": ["bar"]}) + + with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_header.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_header.py new file mode 100644 index 0000000..7dc106e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_header.py @@ -0,0 +1,573 @@ +""" +Tests that the file header is properly handled or inferred +during parsing for all of the parsers defined in parsers.py +""" + +from collections import namedtuple +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import ParserError + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +def test_read_with_bad_header(all_parsers): + parser = all_parsers + msg = r"but only \d+ lines in file" + + with pytest.raises(ValueError, match=msg): + s = StringIO(",,") + parser.read_csv(s, header=[10]) + + +def test_negative_header(all_parsers): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + with pytest.raises( + ValueError, + match="Passing negative integer to header is invalid. 
" + "For no header, use header=None instead", + ): + parser.read_csv(StringIO(data), header=-1) + + +@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])]) +def test_negative_multi_index_header(all_parsers, header): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 + 6,7,8,9,10 + 11,12,13,14,15 + """ + with pytest.raises( + ValueError, match="cannot specify multi-index header with negative integers" + ): + parser.read_csv(StringIO(data), header=header) + + +@pytest.mark.parametrize("header", [True, False]) +def test_bool_header_arg(all_parsers, header): + # see gh-6114 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b""" + msg = "Passing a bool to header is invalid" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +def test_no_header_prefix(all_parsers): + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + result = parser.read_csv(StringIO(data), prefix="Field", header=None) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + columns=["Field0", "Field1", "Field2", "Field3", "Field4"], + ) + tm.assert_frame_equal(result, expected) + + +def test_header_with_index_col(all_parsers): + parser = all_parsers + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ["A", "B", "C"] + result = parser.read_csv(StringIO(data), names=names) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + +def test_header_not_first_line(all_parsers): + parser = all_parsers + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data), header=2, index_col=0) + expected = parser.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(result, expected) + + +def test_header_multi_index(all_parsers): + parser = all_parsers + expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + ( + dict(index_col=["foo", "bar"]), + ( + "index_col must only contain " + "row numbers when specifying " + "a multi-index header" + ), + ), + ( + dict(index_col=[0, 1], names=["foo", "bar"]), + ("cannot specify names when specifying a multi-index header"), + ), + ( + dict(index_col=[0, 1], usecols=["foo", "bar"]), + ("cannot specify usecols when specifying a multi-index header"), + ), + ], +) +def test_header_multi_index_invalid(all_parsers, kwargs, msg): + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs) + + 
+_TestTuple = namedtuple("names", ["first", "second"]) + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=3, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=3, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) +def test_header_multi_index_common_format1(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) +def test_header_multi_index_common_format2(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) +def test_header_multi_index_common_format3(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) + expected = expected.reset_index(drop=True) + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_header_multi_index_common_format_malformed1(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=["a", "q"], + ), + ) + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +def test_header_multi_index_common_format_malformed2(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", 
"t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, "q"], + ), + ) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +def test_header_multi_index_common_format_malformed3(all_parsers): + parser = all_parsers + expected = DataFrame( + np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), + index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["s", "t", "u", "v"]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, "q"], + ), + ) + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] +) +def test_header_names_backward_compat(all_parsers, data, header): + # see gh-2539 + parser = all_parsers + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) +def test_read_only_header_no_rows(all_parsers, kwargs): + # See gh-7773 + parser = all_parsers + expected = DataFrame(columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO("a,b,c"), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,names", + [ + (dict(), [0, 1, 2, 3, 4]), + (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), + ( + dict(names=["foo", "bar", "baz", "quux", "panda"]), + ["foo", "bar", "baz", "quux", "panda"], + ), + ], +) +def test_no_header(all_parsers, kwargs, names): + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names + ) + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", [["a", "b"], "string_header"]) +def test_non_int_header(all_parsers, header): + # see gh-16338 + msg = "header must be integer or list of integers" + data = """1,2\n3,4""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +def test_singleton_header(all_parsers): + # see gh-7757 + data = """a,b,c\n0,1,2\n1,2,3""" + parser = all_parsers + + expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) + result = parser.read_csv(StringIO(data), header=[0]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + "A,A,A,B\none,one,one,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B\none,one,one.1,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", + DataFrame( + [[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [ + ("A", "one"), + ("A", "one.1"), + ("A", "one.1.1"), + ("B", "two"), + ("B", "two.1"), + ] + ), + ), + ), + ], +) +def test_mangles_multi_index(all_parsers, data, expected): + # see 
gh-18062 + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [None, [0]]) +@pytest.mark.parametrize( + "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] +) +def test_multi_index_unnamed(all_parsers, index_col, columns): + # see gh-23687 + # + # When specifying a multi-index header, make sure that + # we don't error just because one of the rows in our header + # has ALL column names containing the string "Unnamed". The + # correct condition to check is whether the row contains + # ALL columns that did not have names (and instead were given + # placeholder ones). + parser = all_parsers + header = [0, 1] + + if index_col is None: + data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" + else: + data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" + + if columns is None: + msg = ( + r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns" + ) + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=header, index_col=index_col) + else: + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) + template = "Unnamed: {i}_level_0" + exp_columns = [] + + for i, col in enumerate(columns): + if not col: # Unnamed. + col = template.format(i=i if index_col is None else i + 1) + + exp_columns.append(col) + + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_multiindex_columns(all_parsers): + # GH#6051 + parser = all_parsers + + s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81" + s2 = ( + "Male, Male, Male, Female, Female\n" + "R, R, L, R, R\n" + ".86, .67, .88, .78, .81\n" + ".86, .67, .88, .78, .82" + ) + + mi = MultiIndex.from_tuples( + [ + ("Male", "R"), + (" Male", " R"), + (" Male", " L"), + (" Female", " R"), + (" Female", " R.1"), + ] + ) + expected = DataFrame( + [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi + ) + + df1 = parser.read_csv(StringIO(s1), header=[0, 1]) + tm.assert_frame_equal(df1, expected.iloc[:1]) + df2 = parser.read_csv(StringIO(s2), header=[0, 1]) + tm.assert_frame_equal(df2, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_index_col.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_index_col.py new file mode 100644 index 0000000..f67a658 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_index_col.py @@ -0,0 +1,186 @@ +""" +Tests that the specified index column (a.k.a "index_col") +is properly handled or inferred during parsing for all of +the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +@pytest.mark.parametrize("with_header", [True, False]) +def test_index_col_named(all_parsers, with_header): + parser = all_parsers + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 
22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + + if with_header: + data = header + no_header + + result = parser.read_csv(StringIO(data), index_col="ID") + expected = parser.read_csv(StringIO(data), header=0).set_index("ID") + tm.assert_frame_equal(result, expected) + else: + data = no_header + msg = "Index ID invalid" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), index_col="ID") + + +def test_index_col_named2(all_parsers): + parser = all_parsers + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + + expected = DataFrame( + {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]}, + index=Index(["hello", "world", "foo"], name="message"), + ) + names = ["a", "b", "c", "d", "message"] + + result = parser.read_csv(StringIO(data), names=names, index_col=["message"]) + tm.assert_frame_equal(result, expected) + + +def test_index_col_is_true(all_parsers): + # see gh-9798 + data = "a,b\n1,2" + parser = all_parsers + + msg = "The value of index_col couldn't be 'True'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), index_col=True) + + +def test_infer_index_col(all_parsers): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index_col,kwargs", + [ + (None, dict(columns=["x", "y", "z"])), + (False, dict(columns=["x", "y", "z"])), + (0, dict(columns=["y", "z"], index=Index([], name="x"))), + (1, dict(columns=["x", "z"], index=Index([], name="y"))), + ("x", dict(columns=["y", "z"], index=Index([], name="x"))), + ("y", dict(columns=["x", "z"], index=Index([], name="y"))), + ( + [0, 1], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + ["x", "y"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + [1, 0], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ( + ["y", "x"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ], +) +def test_index_col_empty_data(all_parsers, index_col, kwargs): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=index_col) + + expected = DataFrame(**kwargs) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_col_false(all_parsers): + # see gh-10413 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame(columns=["x", "y"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index_names", + [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ], +) +def test_multi_index_naming(all_parsers, index_names): + parser = all_parsers + + # We don't want empty index names being replaced with "Unnamed: 0" + data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) + result = parser.read_csv(StringIO(data), index_col=[0, 1]) + + expected = DataFrame( + {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]]) + ) + expected.index.names = [name if name else None for name in index_names] + tm.assert_frame_equal(result, expected) + + 
+def test_multi_index_naming_not_all_at_beginning(all_parsers): + parser = all_parsers + data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" + result = parser.read_csv(StringIO(data), index_col=[0, 2]) + + expected = DataFrame( + {"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_no_multi_index_level_names_empty(all_parsers): + # GH 10984 + parser = all_parsers + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) + with tm.ensure_clean() as path: + expected.to_csv(path) + result = parser.read_csv(path, index_col=[0, 1, 2]) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_mangle_dupes.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_mangle_dupes.py new file mode 100644 index 0000000..5c4e642 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_mangle_dupes.py @@ -0,0 +1,132 @@ +""" +Tests that duplicate columns are handled appropriately when parsed by the +CSV engine. In general, the expected result is that they are either thoroughly +de-duplicated (if mangling requested) or ignored otherwise. +""" +from io import StringIO + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) +def test_basic(all_parsers, kwargs): + # TODO: add test for condition "mangle_dupe_cols=False" + # once it is actually supported (gh-12935) + parser = all_parsers + + data = "a,a,b,b,b\n1,2,3,4,5" + result = parser.read_csv(StringIO(data), sep=",", **kwargs) + + expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) + tm.assert_frame_equal(result, expected) + + +def test_basic_names(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_basic_names_raise(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "0,1,2\n3,4,5" + with pytest.raises(ValueError, match="Duplicate names"): + parser.read_csv(StringIO(data), names=["a", "b", "a"]) + + +@pytest.mark.parametrize( + "data,expected", + [ + ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), + ( + "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", + DataFrame( + [[1, 2, 3, 4, 5, 6]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", + DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) +def test_thorough_mangle_columns(all_parsers, data, expected): + # see gh-17060 + parser = all_parsers + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,names,expected", + [ + ( + "a,b,b\n1,2,3", + ["a.1", "a.1", "a.1.1"], + DataFrame( + [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] + ), + ), + ( + "a,b,c,d,e,f\n1,2,3,4,5,6", + ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + DataFrame( + [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + 
"a,b,c,d,e,f,g\n1,2,3,4,5,6,7", + ["a", "a", "a.3", "a.1", "a.2", "a", "a"], + DataFrame( + [ + ["a", "b", "c", "d", "e", "f", "g"], + ["1", "2", "3", "4", "5", "6", "7"], + ], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) +def test_thorough_mangle_names(all_parsers, data, names, expected): + # see gh-17095 + parser = all_parsers + + with pytest.raises(ValueError, match="Duplicate names"): + parser.read_csv(StringIO(data), names=names) + + +def test_mangled_unnamed_placeholders(all_parsers): + # xref gh-13017 + orig_key = "0" + parser = all_parsers + + orig_value = [1, 2, 3] + df = DataFrame({orig_key: orig_value}) + + # This test recursively updates `df`. + for i in range(3): + expected = DataFrame() + + for j in range(i + 1): + expected["Unnamed: 0" + ".1" * j] = [0, 1, 2] + + expected[orig_key] = orig_value + df = parser.read_csv(StringIO(df.to_csv())) + + tm.assert_frame_equal(df, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_multi_thread.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_multi_thread.py new file mode 100644 index 0000000..64ccaf6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_multi_thread.py @@ -0,0 +1,146 @@ +""" +Tests multithreading behaviour for reading and +parsing files for each parser defined in parsers.py +""" +from io import BytesIO +from multiprocessing.pool import ThreadPool + +import numpy as np + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +def _construct_dataframe(num_rows): + """ + Construct a DataFrame for testing. + + Parameters + ---------- + num_rows : int + The number of rows for our DataFrame. + + Returns + ------- + df : DataFrame + """ + df = DataFrame(np.random.rand(num_rows, 5), columns=list("abcde")) + df["foo"] = "foo" + df["bar"] = "bar" + df["baz"] = "baz" + df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s") + df["int"] = np.arange(num_rows, dtype="int64") + return df + + +def test_multi_thread_string_io_read_csv(all_parsers): + # see gh-11786 + parser = all_parsers + max_row_range = 10000 + num_files = 100 + + bytes_to_df = [ + "\n".join( + ["{i:d},{i:d},{i:d}".format(i=i) for i in range(max_row_range)] + ).encode() + for _ in range(num_files) + ] + files = [BytesIO(b) for b in bytes_to_df] + + # Read all files in many threads. + pool = ThreadPool(8) + + results = pool.map(parser.read_csv, files) + first_result = results[0] + + for result in results: + tm.assert_frame_equal(first_result, result) + + +def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): + """ + Generate a DataFrame via multi-thread. + + Parameters + ---------- + parser : BaseParser + The parser object to use for reading the data. + path : str + The location of the CSV file to read. + num_rows : int + The number of rows to read per task. + num_tasks : int + The number of tasks to use for reading this DataFrame. + + Returns + ------- + df : DataFrame + """ + + def reader(arg): + """ + Create a reader for part of the CSV. + + Parameters + ---------- + arg : tuple + A tuple of the following: + + * start : int + The starting row to start for parsing CSV + * nrows : int + The number of rows to read. 
+ + Returns + ------- + df : DataFrame + """ + start, nrows = arg + + if not start: + return parser.read_csv( + path, index_col=0, header=0, nrows=nrows, parse_dates=["date"] + ) + + return parser.read_csv( + path, + index_col=0, + header=None, + skiprows=int(start) + 1, + nrows=nrows, + parse_dates=[9], + ) + + tasks = [ + (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) + ] + + pool = ThreadPool(processes=num_tasks) + results = pool.map(reader, tasks) + + header = results[0].columns + + for r in results[1:]: + r.columns = header + + final_dataframe = pd.concat(results) + return final_dataframe + + +def test_multi_thread_path_multipart_read_csv(all_parsers): + # see gh-11786 + num_tasks = 4 + num_rows = 100000 + + parser = all_parsers + file_name = "__thread_pool_reader__.csv" + df = _construct_dataframe(num_rows) + + with tm.ensure_clean(file_name) as path: + df.to_csv(path) + + final_dataframe = _generate_multi_thread_dataframe( + parser, path, num_rows, num_tasks + ) + tm.assert_frame_equal(df, final_dataframe) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_na_values.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_na_values.py new file mode 100644 index 0000000..f9a083d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_na_values.py @@ -0,0 +1,567 @@ +""" +Tests that NA values are properly handled during +parsing for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs.parsers import STR_NA_VALUES + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +def test_string_nas(all_parsers): + parser = all_parsers + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(result, expected) + + +def test_detect_string_na(all_parsers): + parser = all_parsers + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = DataFrame( + [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] + ) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_values", + [ + ["-999.0", "-999"], + [-999, -999.0], + [-999.0, -999], + ["-999.0"], + ["-999"], + [-999.0], + [-999], + ], +) +@pytest.mark.parametrize( + "data", + [ + """A,B +-999,1.2 +2,-999 +3,4.5 +""", + """A,B +-999,1.200 +2,-999.000 +3,4.500 +""", + ], +) +def test_non_string_na_values(all_parsers, data, na_values): + # see gh-3611: with an odd float format, we can't match + # the string "999.0" exactly but still need float matching + parser = all_parsers + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) + + result = parser.read_csv(StringIO(data), na_values=na_values) + tm.assert_frame_equal(result, expected) + + +def test_default_na_values(all_parsers): + _NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A", + "N/A", + "n/a", + "NA", + "", + "#NA", + "NULL", + "null", + "NaN", + "nan", + "-NaN", + "-nan", + "#N/A N/A", + "", + } + assert _NA_VALUES == STR_NA_VALUES + + parser = all_parsers + nv = len(_NA_VALUES) + + def f(i, v): + if i == 0: + buf = "" + elif i > 0: + buf = "".join([","] * i) + + buf = "{0}{1}".format(buf, v) + + if i < nv - 1: + buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1))) + + return buf + + data = StringIO("\n".join(f(i, v) for i, v in 
enumerate(_NA_VALUES)))
+    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
+
+    result = parser.read_csv(data, header=None)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
+def test_custom_na_values(all_parsers, na_values):
+    parser = all_parsers
+    data = """A,B,C
+ignore,this,row
+1,NA,3
+-1.#IND,5,baz
+7,8,NaN
+"""
+    expected = DataFrame(
+        [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"]
+    )
+    result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_bool_na_values(all_parsers):
+    data = """A,B,C
+True,False,True
+NA,True,False
+False,NA,True"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data))
+    expected = DataFrame(
+        {
+            "A": np.array([True, np.nan, False], dtype=object),
+            "B": np.array([False, True, np.nan], dtype=object),
+            "C": [True, False, True],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_na_value_dict(all_parsers):
+    data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+    parser = all_parsers
+    df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]})
+    expected = DataFrame(
+        {
+            "A": [np.nan, "bar", np.nan, "bar"],
+            "B": [np.nan, "foo", np.nan, "foo"],
+            "C": [np.nan, "foo", np.nan, "foo"],
+        }
+    )
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize(
+    "index_col,expected",
+    [
+        (
+            [0],
+            DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
+        ),
+        (
+            [0, 2],
+            DataFrame(
+                {"b": [np.nan], "d": [5]},
+                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
+            ),
+        ),
+        (
+            ["a", "c"],
+            DataFrame(
+                {"b": [np.nan], "d": [5]},
+                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
+            ),
+        ),
+    ],
+)
+def test_na_value_dict_multi_index(all_parsers, index_col, expected):
+    data = """\
+a,b,c,d
+0,NA,1,5
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "kwargs,expected",
+    [
+        (
+            dict(),
+            DataFrame(
+                {
+                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
+                    "B": [1, 2, 3, 4, 5, 6, 7],
+                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
+                }
+            ),
+        ),
+        (
+            dict(na_values={"A": [], "C": []}, keep_default_na=False),
+            DataFrame(
+                {
+                    "A": ["a", "b", "", "d", "e", "nan", "g"],
+                    "B": [1, 2, 3, 4, 5, 6, 7],
+                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
+                }
+            ),
+        ),
+        (
+            dict(na_values=["a"], keep_default_na=False),
+            DataFrame(
+                {
+                    "A": [np.nan, "b", "", "d", "e", "nan", "g"],
+                    "B": [1, 2, 3, 4, 5, 6, 7],
+                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
+                }
+            ),
+        ),
+        (
+            dict(na_values={"A": [], "C": []}),
+            DataFrame(
+                {
+                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
+                    "B": [1, 2, 3, 4, 5, 6, 7],
+                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
+                }
+            ),
+        ),
+    ],
+)
+def test_na_values_keep_default(all_parsers, kwargs, expected):
+    data = """\
+A,B,C
+a,1,one
+b,2,two
+,3,three
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_na_values_no_keep_default(all_parsers):
+    # see gh-4318: passing na_values=None and
+    # keep_default_na=False yields "None" as a na_value
+    data = """\
+A,B,C
+a,1,None
+b,2,two
+,3,None
+d,4,nan
+e,5,five
+nan,6,
+g,7,seven
+"""
+    parser =
all_parsers + result = parser.read_csv(StringIO(data), keep_default_na=False) + + expected = DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["None", "two", "None", "nan", "five", "", "seven"], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_values(all_parsers): + # see gh-19227 + data = "a,b\n,2" + parser = all_parsers + result = parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + expected = DataFrame({"a": [""], "b": [np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_no_keep_default_na_dict_na_scalar_values(all_parsers): + # see gh-19227 + # + # Scalar values shouldn't cause the parsing to crash or fail. + data = "a,b\n1,2" + parser = all_parsers + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + expected = DataFrame({"a": [1], "b": [np.nan]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) +def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): + # see gh-19227 + data = """\ +113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 +729639,"qwer","",asdfkj,466.681,,252.373 +""" + parser = all_parsers + expected = DataFrame( + { + 0: [np.nan, 729639.0], + 1: [np.nan, "qwer"], + 2: ["/blaha", np.nan], + 3: ["kjsdkj", "asdfkj"], + 4: [412.166, 466.681], + 5: ["225.874", ""], + 6: [np.nan, 252.373], + } + ) + + result = parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_filter,row_data", + [ + (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), + (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), + ], +) +def test_na_values_na_filter_override(all_parsers, na_filter, row_data): + data = """\ +A,B +1,A +nan,B +3,C +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) + + expected = DataFrame(row_data, columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +def test_na_trailing_columns(all_parsers): + parser = all_parsers + data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + # Trailing columns should be all NaN. 
+ result = parser.read_csv(StringIO(data)) + expected = DataFrame( + [ + ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], + ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], + ], + columns=[ + "Date", + "Currency", + "Symbol", + "Type", + "Units", + "UnitPrice", + "Cost", + "Tax", + ], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_values,row_data", + [ + (1, [[np.nan, 2.0], [2.0, np.nan]]), + ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), + ], +) +def test_na_values_scalar(all_parsers, na_values, row_data): + # see gh-12224 + parser = all_parsers + names = ["a", "b"] + data = "1,2\n2,1" + + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) + expected = DataFrame(row_data, columns=names) + tm.assert_frame_equal(result, expected) + + +def test_na_values_dict_aliasing(all_parsers): + parser = all_parsers + na_values = {"a": 2, "b": 1} + na_values_copy = na_values.copy() + + names = ["a", "b"] + data = "1,2\n2,1" + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) + + tm.assert_frame_equal(result, expected) + tm.assert_dict_equal(na_values, na_values_copy) + + +def test_na_values_dict_col_index(all_parsers): + # see gh-14203 + data = "a\nfoo\n1" + parser = all_parsers + na_values = {0: "foo"} + + result = parser.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({"a": [np.nan, 1]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + str(2 ** 63) + "\n" + str(2 ** 63 + 1), + dict(na_values=[2 ** 63]), + DataFrame([str(2 ** 63), str(2 ** 63 + 1)]), + ), + (str(2 ** 63) + ",1" + "\n,2", dict(), DataFrame([[str(2 ** 63), 1], ["", 2]])), + (str(2 ** 63) + "\n1", dict(na_values=[2 ** 63]), DataFrame([np.nan, 1])), + ], +) +def test_na_values_uint64(all_parsers, data, kwargs, expected): + # see gh-14983 + parser = all_parsers + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_empty_na_values_no_default_with_index(all_parsers): + # see gh-15835 + data = "a,1\nb,2" + parser = all_parsers + expected = DataFrame({"1": [2]}, index=Index(["b"], name="a")) + + result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] +) +def test_no_na_filter_on_index(all_parsers, na_filter, index_data): + # see gh-5239 + # + # Don't parse NA-values in index unless na_filter=True + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) + result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +def test_inf_na_values_with_int_index(all_parsers): + # see gh-17128 + parser = all_parsers + data = "idx,col1,col2\n1,3,4\n2,inf,-inf" + + # Don't fail with OverflowError with inf's and integer index column. 
+ out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"]) + expected = DataFrame( + {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx") + ) + tm.assert_frame_equal(out, expected) + + +@pytest.mark.parametrize("na_filter", [True, False]) +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): + # see gh-20377 + parser = all_parsers + data = "a,b,c\n1,,3\n4,5,6" + + # na_filter=True --> missing value becomes NaN. + # na_filter=False --> missing value remains empty string. + empty = np.nan if na_filter else "" + expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]}) + + result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, na_values", + [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", "foo"), + ("false,1\nfoo,1\ntrue", ["foo"]), + ("false,1\nfoo,1\ntrue", {"a": "foo"}), + ], +) +def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): + parser = all_parsers + msg = ( + "(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0)" + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + names=["a", "b"], + dtype={"a": "bool"}, + na_values=na_values, + ) + + +def test_str_nan_dropped(all_parsers): + # see gh-21131 + parser = all_parsers + + data = """File: small.csv,, +10010010233,0123,654 +foo,,bar +01001000155,4530,898""" + + result = parser.read_csv( + StringIO(data), + header=None, + names=["col1", "col2", "col3"], + dtype={"col1": str, "col2": str, "col3": str}, + ).dropna() + + expected = DataFrame( + { + "col1": ["10010010233", "01001000155"], + "col2": ["0123", "4530"], + "col3": ["654", "898"], + }, + index=[1, 3], + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_network.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_network.py new file mode 100644 index 0000000..b716447 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_network.py @@ -0,0 +1,213 @@ +""" +Tests parsers ability to read and parse non-local files +and hence require a network connection to be read. 
+""" +from io import BytesIO, StringIO +import logging + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import read_csv + + +@pytest.mark.network +@pytest.mark.parametrize( + "compress_type, extension", + [("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")], +) +@pytest.mark.parametrize("mode", ["explicit", "infer"]) +@pytest.mark.parametrize("engine", ["python", "c"]) +def test_compressed_urls(salaries_table, compress_type, extension, mode, engine): + check_compressed_urls(salaries_table, compress_type, extension, mode, engine) + + +@tm.network +def check_compressed_urls(salaries_table, compression, extension, mode, engine): + # test reading compressed urls with various engines and + # extension inference + base_url = ( + "https://github.com/pandas-dev/pandas/raw/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) + + url = base_url + extension + + if mode != "explicit": + compression = mode + + url_table = read_csv(url, sep="\t", compression=compression, engine=engine) + tm.assert_frame_equal(url_table, salaries_table) + + +@pytest.fixture +def tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath("io", "parser", "data", "tips.csv")) + + +@pytest.mark.usefixtures("s3_resource") +@td.skip_if_not_us_locale() +class TestS3: + def test_parse_public_s3_bucket(self, tips_df): + pytest.importorskip("s3fs") + + # more of an integration test due to the not-public contents portion + # can probably mock this though. + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + # Read public file from bucket with not-public contents + df = read_csv("s3://cant_get_it/tips.csv") + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(df, tips_df) + + def test_parse_public_s3n_bucket(self, tips_df): + + # Read from AWS s3 as "s3n" URL + df = read_csv("s3n://pandas-test/tips.csv", nrows=10) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3a_bucket(self, tips_df): + # Read from AWS s3 as "s3a" URL + df = read_csv("s3a://pandas-test/tips.csv", nrows=10) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3_bucket_nrows(self, tips_df): + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) + assert isinstance(df, DataFrame) + assert not df.empty + tm.assert_frame_equal(tips_df.iloc[:10], df) + + def test_parse_public_s3_bucket_chunked(self, tips_df): + # Read with a chunksize + chunksize = 5 + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df_reader = read_csv( + "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + ) + assert df_reader.chunksize == chunksize + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. 
+                df = df_reader.get_chunk()
+                assert isinstance(df, DataFrame)
+                assert not df.empty
+                true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
+
+    def test_parse_public_s3_bucket_chunked_python(self, tips_df):
+        # Read with a chunksize using the Python parser
+        chunksize = 5
+        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
+            df_reader = read_csv(
+                "s3://pandas-test/tips.csv" + ext,
+                chunksize=chunksize,
+                compression=comp,
+                engine="python",
+            )
+            assert df_reader.chunksize == chunksize
+            for i_chunk in [0, 1, 2]:
+                # Read a couple of chunks and make sure we see them properly.
+                df = df_reader.get_chunk()
+                assert isinstance(df, DataFrame)
+                assert not df.empty
+                true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
+                tm.assert_frame_equal(true_df, df)
+
+    def test_parse_public_s3_bucket_python(self, tips_df):
+        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
+            df = read_csv(
+                "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp
+            )
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(df, tips_df)
+
+    def test_infer_s3_compression(self, tips_df):
+        for ext in ["", ".gz", ".bz2"]:
+            df = read_csv(
+                "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer"
+            )
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(df, tips_df)
+
+    def test_parse_public_s3_bucket_nrows_python(self, tips_df):
+        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
+            df = read_csv(
+                "s3://pandas-test/tips.csv" + ext,
+                engine="python",
+                nrows=10,
+                compression=comp,
+            )
+            assert isinstance(df, DataFrame)
+            assert not df.empty
+            tm.assert_frame_equal(tips_df.iloc[:10], df)
+
+    def test_s3_fails(self):
+        with pytest.raises(IOError):
+            read_csv("s3://nyqpug/asdf.csv")
+
+        # Receive a permission error when trying to read a private bucket.
+        # It's irrelevant here that this isn't actually a table.
+        with pytest.raises(IOError):
+            read_csv("s3://cant_get_it/file.csv")
+
+    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
+        # see gh-16135
+
+        s3_object = s3_resource.meta.client.get_object(
+            Bucket="pandas-test", Key="tips.csv"
+        )
+
+        result = read_csv(BytesIO(s3_object["Body"].read()), encoding="utf8")
+        assert isinstance(result, DataFrame)
+        assert not result.empty
+
+        expected = read_csv(tips_file)
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_csv_chunked_download(self, s3_resource, caplog):
+        # 8 MB, S3FS uses 5MB chunks
+        import s3fs
+
+        df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
+        buf = BytesIO()
+        str_buf = StringIO()
+
+        df.to_csv(str_buf)
+
+        buf = BytesIO(str_buf.getvalue().encode("utf-8"))
+
+        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)
+
+        # Possibly some state leaking in between tests.
+        # If we don't clear this cache, we saw `GetObject operation: Forbidden`.
+        # Presumably the s3fs instance is being cached, with the directory listing
+        # from *before* we add the large-file.csv in the pandas-test bucket.
+ s3fs.S3FileSystem.clear_instance_cache() + + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv("s3://pandas-test/large-file.csv", nrows=5) + # log of fetch_range (start, stop) + assert (0, 5505024) in (x.args[-2:] for x in caplog.records) + + def test_read_s3_with_hash_in_key(self, tips_df): + # GH 25945 + result = read_csv("s3://pandas-test/tips#1.csv") + tm.assert_frame_equal(tips_df, result) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_parse_dates.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_parse_dates.py new file mode 100644 index 0000000..b01b22e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_parse_dates.py @@ -0,0 +1,1518 @@ +""" +Tests date parsing functionality for all of the +parsers defined in parsers.py +""" + +from datetime import date, datetime +from io import StringIO + +from dateutil.parser import parse as du_parse +from hypothesis import given, settings, strategies as st +import numpy as np +import pytest +import pytz + +from pandas._libs.tslib import Timestamp +from pandas._libs.tslibs import parsing +from pandas._libs.tslibs.parsing import parse_datetime_string +from pandas.compat import is_platform_windows +from pandas.compat.numpy import np_array_datetime64_compat + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range + +import pandas.io.date_converters as conv + +# constant +_DEFAULT_DATETIME = datetime(1, 1, 1) + +# Strategy for hypothesis +if is_platform_windows(): + date_strategy = st.datetimes(min_value=datetime(1900, 1, 1)) +else: + date_strategy = st.datetimes() + + +def test_separator_date_conflict(all_parsers): + # Regression test for gh-4678 + # + # Make sure thousands separator and + # date parsing do not conflict. + parser = all_parsers + data = "06-02-2013;13:00;1-000.215" + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] + ) + + df = parser.read_csv( + StringIO(data), + sep=";", + thousands="-", + parse_dates={"Date": [0, 1]}, + header=None, + ) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col_custom(all_parsers, keep_date_col): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + + def date_parser(*date_cols): + """ + Test date parser. + + Parameters + ---------- + date_cols : args + The list of data columns to parse. 
+ + Returns + ------- + parsed : Series + """ + return parsing.try_parse_dates(parsing._concat_date_cols(date_cols)) + + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=date_parser, + prefix="X", + parse_dates={"actual": [1, 2], "nominal": [1, 3]}, + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "actual", + "nominal", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) + + if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + elif parser.engine == "python": + expected["X1"] = expected["X1"].astype(np.int64) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("container", [list, tuple, Index, Series]) +@pytest.mark.parametrize("dim", [1, 2]) +def test_concat_date_col_fail(container, dim): + msg = "not all elements from date_cols are numpy arrays" + value = "19990127" + + date_cols = tuple(container([value]) for _ in range(dim)) + + with pytest.raises(ValueError, match=msg): + parsing._concat_date_cols(date_cols) + + +@pytest.mark.parametrize("keep_date_col", [True, False]) +def test_multiple_date_col(all_parsers, keep_date_col): + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), + header=None, + prefix="X", + parse_dates=[[1, 2], [1, 3]], + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 
0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "X1_X2", + "X1_X3", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) + + if not keep_date_col: + expected = expected.drop(["X1", "X2", "X3"], axis=1) + elif parser.engine == "python": + expected["X1"] = expected["X1"].astype(np.int64) + + tm.assert_frame_equal(result, expected) + + +def test_date_col_as_index_col(all_parsers): + data = """\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), header=None, prefix="X", parse_dates=[1], index_col=1 + ) + + index = Index( + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 22, 0), + ], + name="X1", + ) + expected = DataFrame( + [ + ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], + ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], + ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], + ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], + ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], + ], + columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], + index=index, + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_int_cast(all_parsers): + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) + parse_dates = {"actual": [1, 2], "nominal": [1, 3]} + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=conv.parse_date_time, + parse_dates=parse_dates, + prefix="X", + ) + expected = DataFrame( + [ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", "X0", "X4"], + ) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. 
+ result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_col_timestamp_parse(all_parsers): + parser = all_parsers + data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 +05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" + + result = parser.read_csv( + StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp + ) + expected = DataFrame( + [ + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 1, + "E", + 0, + np.nan, + 1306.25, + ], + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 8, + "E", + 0, + np.nan, + 1306.25, + ], + ], + columns=["0_1", 2, 3, 4, 5, 6, 7], + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_with_header(all_parsers): + parser = all_parsers + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,parse_dates,msg", + [ + ( + """\ +date_NominalTime,date,NominalTime +KORD1,19990127, 19:00:00 +KORD2,19990127, 20:00:00""", + [[1, 2]], + ("New date column already in dict date_NominalTime"), + ), + ( + """\ +ID,date,nominalTime +KORD,19990127, 19:00:00 +KORD,19990127, 20:00:00""", + dict(ID=[1, 2]), + "Date column ID already in dict", + ), + ], +) +def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), parse_dates=parse_dates) + + +def test_date_parser_int_bug(all_parsers): + # see gh-3071 + parser = all_parsers + data = ( + "posix_timestamp,elapsed,sys,user,queries,query_time,rows," + "accountid,userid,contactid,level,silo,method\n" + "1343103150,0.062353,0,4,6,0.01690,3," + "12345,1,-1,3,invoice_InvoiceResource,search\n" + ) + + result = parser.read_csv( + StringIO(data), + index_col=0, + parse_dates=[0], + date_parser=lambda x: datetime.utcfromtimestamp(int(x)), + ) + expected = DataFrame( + [ + [ + 0.062353, + 0, + 4, + 6, + 0.01690, + 3, + 12345, + 1, + -1, + 3, + "invoice_InvoiceResource", + "search", + ] + ], + columns=[ + "elapsed", + "sys", + "user", + "queries", + "query_time", + "rows", + "accountid", + "userid", + 
"contactid", + "level", + "silo", + "method", + ], + index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), + ) + tm.assert_frame_equal(result, expected) + + +def test_nat_parse(all_parsers): + # see gh-3062 + parser = all_parsers + df = DataFrame( + dict({"A": np.arange(10, dtype="float64"), "B": pd.Timestamp("20010101")}) + ) + df.iloc[3:6, :] = np.nan + + with tm.ensure_clean("__nat_parse_.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, index_col=0, parse_dates=["B"]) + tm.assert_frame_equal(result, df) + + +def test_csv_custom_parser(all_parsers): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d") + ) + expected = parser.read_csv(StringIO(data), parse_dates=True) + tm.assert_frame_equal(result, expected) + + +def test_parse_dates_implicit_first_col(all_parsers): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), parse_dates=True) + + expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + tm.assert_frame_equal(result, expected) + + +def test_parse_dates_string(all_parsers): + data = """date,A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) + index = date_range("1/1/2009", periods=3) + index.name = "date" + + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +# Bug in https://github.com/dateutil/dateutil/issues/217 +# has been addressed, but we just don't pass in the `yearfirst` +@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") +@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) +def test_yy_format_with_year_first(all_parsers, parse_dates): + data = """date,time,B,C +090131,0010,1,2 +090228,1020,3,4 +090331,0830,5,6 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + index = DatetimeIndex( + [ + datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0), + ], + dtype=object, + name="date_time", + ) + expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) +def test_parse_dates_column_list(all_parsers, parse_dates): + data = "a,b,c\n01/01/2010,1,15/02/2010" + parser = all_parsers + + expected = DataFrame( + {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} + ) + expected = expected.set_index(["a", "b"]) + + result = parser.read_csv( + StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_parse_dates(all_parsers, index_col): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + parser = all_parsers + index = MultiIndex.from_product( + [ + (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + ("one", "two", "three"), + ], + names=["index1", "index2"], + ) + + # Out of order. 
+ if index_col == [1, 0]: + index = index.swaplevel(0, 1) + + expected = DataFrame( + [ + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ], + columns=["A", "B", "C"], + index=index, + ) + result = parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) +def test_parse_dates_custom_euro_format(all_parsers, kwargs): + parser = all_parsers + data = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + if "dayfirst" in kwargs: + df = parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + header=0, + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + exp_index = Index( + [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], + name="time", + ) + expected = DataFrame( + {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, + index=exp_index, + columns=["Q", "NTU"], + ) + tm.assert_frame_equal(df, expected) + else: + msg = "got an unexpected keyword argument 'day_first'" + with pytest.raises(TypeError, match=msg): + parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + skiprows=[0], + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + + +def test_parse_tz_aware(all_parsers): + # See gh-1693 + parser = all_parsers + data = "Date,x\n2012-06-13T01:39:00Z,0.5" + + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + expected = DataFrame( + {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") + ) + tm.assert_frame_equal(result, expected) + assert result.index.tz is pytz.utc + + +@pytest.mark.parametrize( + "parse_dates,index_col", + [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], +) +def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): + parser = all_parsers + data = """ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD1", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD2", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD3", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD4", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD5", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD6", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) + expected = expected.set_index("nominal") + + if not isinstance(parse_dates, dict): + expected.index.name = "date_NominalTime" + + result = 
parser.read_csv( + StringIO(data), parse_dates=parse_dates, index_col=index_col + ) + tm.assert_frame_equal(result, expected) + + +def test_multiple_date_cols_chunked(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], + ) + expected = expected.set_index("nominal") + + reader = parser.read_csv( + StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal", + chunksize=2, + ) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_multiple_date_col_named_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + with_indices = parser.read_csv( + StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" + ) + with_names = parser.read_csv( + StringIO(data), + index_col="nominal", + parse_dates={"nominal": ["date", "nominalTime"]}, + ) + tm.assert_frame_equal(with_indices, with_names) + + +def test_multiple_date_col_multiple_index_compat(all_parsers): + parser = all_parsers + data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + result = parser.read_csv( + StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + ) + expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + + expected = 
expected.set_index(["nominal", "ID"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col="C")]) +def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): + # see gh-5636 + parser = all_parsers + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) + data = """A,B,C + 1,2,2003-11-1""" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), parse_dates="C", **kwargs) + + +@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3, 3}]) +def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): + parser = all_parsers + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) + data = """A,B,C + 1,2,2003-11-1""" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), parse_dates=(1,)) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["nan", "0", ""]) +def test_bad_date_parse(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly + parser = all_parsers + s = StringIO(("{value},\n".format(value=value)) * 50000) + + parser.read_csv( + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + infer_datetime_format=False, + cache_dates=cache_dates, + ) + + +def test_parse_dates_empty_string(all_parsers): + # see gh-2263 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) + + expected = DataFrame( + [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "a\n04.15.2016", + dict(parse_dates=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"]), + ), + ( + "a\n04.15.2016", + dict(parse_dates=True, index_col=0), + DataFrame(index=DatetimeIndex(["2016-04-15"], name="a")), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=["a", "b"]), + DataFrame( + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + ), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=True, index_col=[0, 1]), + DataFrame( + index=MultiIndex.from_tuples( + [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + ) + ), + ), + ], +) +def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): + # see gh-14066 + parser = all_parsers + + result = parser.read_csv(StringIO(data), thousands=".", **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_time_multi_level_column_name(all_parsers): + data = """\ +D,T,A,B +date, time,a,b +2001-01-05, 09:00:00, 0.0, 10. +2001-01-06, 00:00:00, 1.0, 11. +""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=conv.parse_date_time, + ) + + expected_data = [ + [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], + ] + expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """\ +date,time,a,b +2001-01-05, 10:00:00, 0.0, 10. +2001-01-05, 00:00:00, 1., 11. 
+""", + dict(header=0, parse_dates={"date_time": [0, 1]}), + DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], + [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], + ], + columns=["date_time", "a", "b"], + ), + ), + ( + ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ), + dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), + DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + 0.81, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + 0.01, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", 0, 4], + ), + ), + ], +) +def test_parse_date_time(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, **kwargs) + + # Python can sometimes be flaky about how + # the aggregated columns are entered, so + # this standardizes the order. + result = result[expected.columns] + tm.assert_frame_equal(result, expected) + + +def test_parse_date_fields(all_parsers): + parser = all_parsers + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=conv.parse_date_fields, + ) + + expected = DataFrame( + [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], + columns=["ymd", "a"], + ) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_all_fields(all_parsers): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0,0.0,10. +2001,01,5,10,0,00,1.,11. +""" + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_datetime_fractional_seconds(all_parsers): + parser = all_parsers + data = """\ +year,month,day,hour,minute,second,a,b +2001,01,05,10,00,0.123456,0.0,10. +2001,01,5,10,0,0.500000,1.,11. +""" + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_generic(all_parsers): + parser = all_parsers + data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
+ + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=lambda y, m: date(year=int(y), month=int(m), day=1), + ) + expected = DataFrame( + [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], + columns=["ym", "day", "a"], + ) + tm.assert_frame_equal(result, expected) + + +def test_date_parser_resolution_if_not_ns(all_parsers): + # see gh-10245 + parser = all_parsers + data = """\ +date,time,prn,rxstatus +2013-11-03,19:00:00,126,00E80000 +2013-11-03,19:00:00,23,00E80000 +2013-11-03,19:00:00,13,00E80000 +""" + + def date_parser(dt, time): + return np_array_datetime64_compat(dt + "T" + time + "Z", dtype="datetime64[s]") + + result = parser.read_csv( + StringIO(data), + date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"], + ) + + datetimes = np_array_datetime64_compat( + ["2013-11-03T19:00:00Z"] * 3, dtype="datetime64[s]" + ) + expected = DataFrame( + data={"rxstatus": ["00E80000"] * 3}, + index=MultiIndex.from_tuples( + [(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)], + names=["datetime", "prn"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_parse_date_column_with_empty_string(all_parsers): + # see gh-6428 + parser = all_parsers + data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " + result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) + + expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] + expected = DataFrame(expected_data, columns=["case", "opdate"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + "a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), + ), + ( + "a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), + ), + ], +) +@pytest.mark.parametrize("parse_dates", [True, False]) +def test_parse_date_float(all_parsers, data, expected, parse_dates): + # see gh-2697 + # + # Date parsing should fail, so we leave the data untouched + # (i.e. float precision should remain unchanged). 
+ parser = all_parsers + + result = parser.read_csv(StringIO(data), parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_parse_timezone(all_parsers): + # see gh-22256 + parser = all_parsers + data = """dt,val + 2018-01-04 09:01:00+09:00,23350 + 2018-01-04 09:02:00+09:00,23400 + 2018-01-04 09:03:00+09:00,23400 + 2018-01-04 09:04:00+09:00,23400 + 2018-01-04 09:05:00+09:00,23400""" + result = parser.read_csv(StringIO(data), parse_dates=["dt"]) + + dti = pd.date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=pytz.FixedOffset(540), + ) + expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} + + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "date_string", + ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], +) +def test_invalid_parse_delimited_date(all_parsers, date_string): + parser = all_parsers + expected = DataFrame({0: [date_string]}, dtype="object") + result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 thus replacement + ("13/02/2019", False, datetime(2019, 2, 13)), + ("13/02/2019", True, datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 thus there will be no replacement + ("02/13/2019", False, datetime(2019, 2, 13)), + ("02/13/2019", True, datetime(2019, 2, 13)), + # %d/%m/%Y; dayfirst==True thus replacement + ("04/02/2019", True, datetime(2019, 2, 4)), + ], +) +def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected): + parser = all_parsers + expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + result = parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + tm.assert_frame_equal(result, expected) + + +def _helper_hypothesis_delimited_date(call, date_string, **kwargs): + msg, result = None, None + try: + result = call(date_string, **kwargs) + except ValueError as er: + msg = str(er) + pass + return msg, result + + +@given(date_strategy) +@settings(deadline=None) +@pytest.mark.parametrize("delimiter", list(" -./")) +@pytest.mark.parametrize("dayfirst", [True, False]) +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_datetime): + if date_format == "%m %Y" and delimiter == ".": + pytest.skip( + "parse_datetime_string cannot reliably tell whether \ + e.g. 
%m.%Y is a float or a date, thus we skip it" + ) + result, expected = None, None + except_in_dateutil, except_out_dateutil = None, None + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) + + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parse_datetime_string, date_string, dayfirst=dayfirst + ) + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + date_string, + default=_DEFAULT_DATETIME, + dayfirst=dayfirst, + yearfirst=False, + ) + + assert except_out_dateutil == except_in_dateutil + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_python_parser_only.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_python_parser_only.py new file mode 100644 index 0000000..7367b19 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_python_parser_only.py @@ -0,0 +1,298 @@ +""" +Tests that apply specifically to the Python parser. Unless specifically +stated as a Python-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the C parser can accept further +arguments when parsing. +""" + +import csv +from io import BytesIO, StringIO + +import pytest + +from pandas.errors import ParserError + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +def test_default_separator(python_parser_only): + # see gh-17333 + # + # csv.Sniffer in Python treats "o" as separator. + data = "aob\n1o2\n3o4" + parser = python_parser_only + expected = DataFrame({"a": [1, 3], "b": [2, 4]}) + + result = parser.read_csv(StringIO(data), sep=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True]) +def test_invalid_skipfooter_non_int(python_parser_only, skipfooter): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter must be an integer" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + +def test_invalid_skipfooter_negative(python_parser_only): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter cannot be negative" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=-1) + + +@pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")]) +def test_sniff_delimiter(python_parser_only, kwargs): + data = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_sniff_delimiter_encoding(python_parser_only, encoding): + parser = python_parser_only + data = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + + if encoding is not None: + from io import TextIOWrapper + + data = data.encode(encoding) + data = BytesIO(data) + data = TextIOWrapper(data, encoding=encoding) + else: + data = StringIO(data) + + result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + +def test_single_line(python_parser_only): 
+ # see gh-6607: sniff separator + parser = python_parser_only + result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) + + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)]) +def test_skipfooter(python_parser_only, kwargs): + # see gh-6607 + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), **kwargs) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")] +) +def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): + # see gh-6607 + parser = python_parser_only + + with open(csv1, "rb") as f: + data = f.read() + + data = data.replace(b",", b"::") + expected = parser.read_csv(csv1) + + module = pytest.importorskip(compression) + klass = getattr(module, klass) + + with tm.ensure_clean() as path: + tmp = klass(path, mode="wb") + tmp.write(data) + tmp.close() + + result = parser.read_csv(path, sep="::", compression=compression) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index(python_parser_only): + # see gh-6607 + data = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + parser = python_parser_only + + expected = DataFrame( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], + ], + columns=["A", "B", "C", "D", "E"], + index=MultiIndex.from_tuples( + [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], + names=["one", "two", "three", "four"], + ), + ) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index2(python_parser_only): + # see gh-6893 + data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9" + parser = python_parser_only + + expected = DataFrame.from_records( + [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], + columns=list("abcABC"), + index=list("abc"), + ) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("add_footer", [True, False]) +def test_skipfooter_with_decimal(python_parser_only, add_footer): + # see gh-6971 + data = "1#2\n3#4" + parser = python_parser_only + expected = DataFrame({"a": [1.2, 3.4]}) + + if add_footer: + # The stray footer line should not mess with the + # casting of the first two lines if we skip it. 
+ kwargs = dict(skipfooter=1) + data += "\nFooter" + else: + kwargs = dict() + + result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] +) +@pytest.mark.parametrize( + "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] +) +def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): + # see gh-3404 + expected = DataFrame({"a": [1], "b": [2]}) + parser = python_parser_only + + data = "1" + sep + "2" + encoded_data = data.encode(encoding) + + result = parser.read_csv( + BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +def test_multi_char_sep_quotes(python_parser_only, quoting): + # see gh-13374 + kwargs = dict(sep=",,") + parser = python_parser_only + + data = 'a,,b\n1,,a\n2,,"2,,b"' + msg = "ignored when a multi-char delimiter is used" + + def fail_read(): + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting, **kwargs) + + if quoting == csv.QUOTE_NONE: + # We expect no match, so there should be an assertion + # error out of the inner context manager. + with pytest.raises(AssertionError): + fail_read() + else: + fail_read() + + +def test_none_delimiter(python_parser_only, capsys): + # see gh-13374 and gh-17465 + parser = python_parser_only + data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" + expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}) + + # We expect the third line in the data to be + # skipped because it is malformed, but we do + # not expect any errors to occur. + result = parser.read_csv( + StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False + ) + tm.assert_frame_equal(result, expected) + + captured = capsys.readouterr() + assert "Skipping line 3" in captured.err + + +@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) +@pytest.mark.parametrize("skipfooter", [0, 1]) +def test_skipfooter_bad_row(python_parser_only, data, skipfooter): + # see gh-13879 and gh-15910 + msg = "parsing errors in the skipped footer rows" + parser = python_parser_only + + def fail_read(): + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + if skipfooter: + fail_read() + else: + # We expect no match, so there should be an assertion + # error out of the inner context manager. 
+ with pytest.raises(AssertionError): + fail_read() + + +def test_malformed_skipfooter(python_parser_only): + parser = python_parser_only + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_quoting.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_quoting.py new file mode 100644 index 0000000..14773df --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_quoting.py @@ -0,0 +1,159 @@ +""" +Tests that quoting specifications are properly handled +during parsing for all of the parsers defined in parsers.py +""" + +import csv +from io import StringIO + +import pytest + +from pandas.errors import ParserError + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'), + ( + dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), + "quotechar must be set if quoting enabled", + ), + (dict(quotechar=2), '"quotechar" must be string, not int'), + ], +) +def test_bad_quote_char(all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) +def test_bad_quoting(all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting) + + +def test_quote_char_basic(all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) +def test_quote_char_various(all_parsers, quote_char): + parser = all_parsers + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) + + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) + + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +@pytest.mark.parametrize("quote_char", ["", None]) +def test_null_quote_char(all_parsers, quoting, quote_char): + kwargs = dict(quotechar=quote_char, quoting=quoting) + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. + msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + (dict(), [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. 
+ (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]), + ], +) +def test_quoting_various(all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) +def test_double_quote(all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' + + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) +def test_quotechar_unicode(all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), quotechar=quotechar) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("balanced", [True, False]) +def test_unbalanced_quoting(all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = 'a,b,c\n1,2,"3' + + if balanced: + # Re-balance the quoting and read in without errors. + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_read_fwf.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_read_fwf.py new file mode 100644 index 0000000..27aef23 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_read_fwf.py @@ -0,0 +1,618 @@ +""" +Tests the 'read_fwf' function in parsers.py. This +test suite is independent of the others because the +engine is set to 'python-fwf' internally. 
+""" + +from datetime import datetime +from io import BytesIO, StringIO + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, DatetimeIndex +import pandas._testing as tm + +from pandas.io.parsers import EmptyDataError, read_csv, read_fwf + + +def test_basic(): + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + result = read_fwf(StringIO(data)) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [201160, 364.136849, 183.628767, 11806.2], + [201161, 413.836124, 184.375703, 11916.8], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) + tm.assert_frame_equal(result, expected) + + +def test_colspecs(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(data), colspecs=colspecs) + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_widths(): + data = """\ +A B C D E +2011 58 360.242940 149.910199 11950.7 +2011 59 444.953632 166.985655 11788.4 +2011 60 364.136849 183.628767 11806.2 +2011 61 413.836124 184.375703 11916.8 +2011 62 502.953953 173.237159 12468.3 +""" + result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_non_space_filler(): + # From Thomas Kluyver: + # + # Apparently, some non-space filler characters can be seen, this is + # supported by specifying the 'delimiter' character: + # + # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html + data = """\ +A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E +201158~~~~360.242940~~~149.910199~~~11950.7 +201159~~~~444.953632~~~166.985655~~~11788.4 +201160~~~~364.136849~~~183.628767~~~11806.2 +201161~~~~413.836124~~~184.375703~~~11916.8 +201162~~~~502.953953~~~173.237159~~~12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~") + + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) + tm.assert_frame_equal(result, expected) + + +def test_over_specified(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 
184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + + with pytest.raises(ValueError, match="must specify only one of"): + read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7]) + + +def test_under_specified(): + data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + with pytest.raises(ValueError, match="Must specify either"): + read_fwf(StringIO(data), colspecs=None, widths=None) + + +def test_read_csv_compat(): + csv_data = """\ +A,B,C,D,E +2011,58,360.242940,149.910199,11950.7 +2011,59,444.953632,166.985655,11788.4 +2011,60,364.136849,183.628767,11806.2 +2011,61,413.836124,184.375703,11916.8 +2011,62,502.953953,173.237159,12468.3 +""" + expected = read_csv(StringIO(csv_data), engine="python") + + fwf_data = """\ +A B C D E +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + result = read_fwf(StringIO(fwf_data), colspecs=colspecs) + tm.assert_frame_equal(result, expected) + + +def test_bytes_io_input(): + result = read_fwf( + BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8" + ) + expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) + tm.assert_frame_equal(result, expected) + + +def test_fwf_colspecs_is_list_or_tuple(): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + msg = "column specifications must be a list or tuple.+" + + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",") + + +def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + msg = "Each column specification must be.+" + + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), [("a", 1)]) + + +@pytest.mark.parametrize( + "colspecs,exp_data", + [ + ([(0, 3), (3, None)], [[123, 456], [456, 789]]), + ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), + ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), + ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), + ], +) +def test_fwf_colspecs_none(colspecs, exp_data): + # see gh-7079 + data = """\ +123456 +456789 +""" + expected = DataFrame(exp_data) + + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "infer_nrows,exp_data", + [ + # infer_nrows --> colspec == [(2, 3), (5, 6)] + (1, [[1, 2], [3, 8]]), + # infer_nrows > number of rows + (10, [[1, 2], [123, 98]]), + ], +) +def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): + # see gh-15138 + data = """\ + 1 2 +123 98 +""" + expected = DataFrame(exp_data) + + result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None) + tm.assert_frame_equal(result, expected) + + +def test_fwf_regression(): + # see gh-3594 + # + # Turns out "T060" is parsable as a datetime slice! 
+ tz_list = [1, 10, 20, 30, 60, 80, 100] + widths = [16] + [8] * len(tz_list) + names = ["SST"] + ["T{z:03d}".format(z=z) for z in tz_list[1:]] + + data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 +2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 +2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 +2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 +2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 +""" + + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), + ) + expected = DataFrame( + [ + [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], + [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], + [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], + [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], + [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], + ], + index=DatetimeIndex( + [ + "2009-06-13 20:20:00", + "2009-06-13 20:30:00", + "2009-06-13 20:40:00", + "2009-06-13 20:50:00", + "2009-06-13 21:00:00", + ] + ), + columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], + ) + tm.assert_frame_equal(result, expected) + + +def test_fwf_for_uint8(): + data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 +1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa + df = read_fwf( + StringIO(data), + colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], + names=["time", "pri", "pgn", "dst", "src", "data"], + converters={ + "pgn": lambda x: int(x, 16), + "src": lambda x: int(x, 16), + "dst": lambda x: int(x, 16), + "data": lambda x: len(x.split(" ")), + }, + ) + + expected = DataFrame( + [ + [1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8], + ], + columns=["time", "pri", "pgn", "dst", "src", "data"], + ) + expected["dst"] = expected["dst"].astype(object) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("comment", ["#", "~", "!"]) +def test_fwf_comment(comment): + data = """\ + 1 2. 4 #hello world + 5 NaN 10.0 +""" + data = data.replace("#", comment) + + colspecs = [(0, 3), (4, 9), (9, 25)] + expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) + + result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("thousands", [",", "#", "~"]) +def test_fwf_thousands(thousands): + data = """\ + 1 2,334.0 5 +10 13 10. +""" + data = data.replace(",", thousands) + + colspecs = [(0, 3), (3, 11), (12, 16)] + expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) + + result = read_fwf( + StringIO(data), header=None, colspecs=colspecs, thousands=thousands + ) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("header", [True, False]) +def test_bool_header_arg(header): + # see gh-6114 + data = """\ +MyColumn + a + b + a + b""" + + msg = "Passing a bool to header is invalid" + with pytest.raises(TypeError, match=msg): + read_fwf(StringIO(data), header=header) + + +def test_full_file(): + # File with all values. 
+ test = """index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar +2000-01-05T00:00:00 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0.487094399463 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz +2000-01-11T00:00:00 0.157160753327 34 foo""" + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_missing(): + # File with missing values. + test = """index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar + 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz + 34""" + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_spaces(): + # File with spaces in columns. + test = """ +Account Name Balance CreditLimit AccountCreated +101 Keanu Reeves 9315.45 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 Jennifer Love Hewitt 0 17000.00 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 5000.00 2/5/2007 +""".strip( + "\r\n" + ) + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_full_file_with_spaces_and_missing(): + # File with spaces and missing values in columns. + test = """ +Account Name Balance CreditLimit AccountCreated +101 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 +""".strip( + "\r\n" + ) + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_messed_up_data(): + # Completely messed up file. 
+ test = """ + Account Name Balance Credit Limit Account Created + 101 10000.00 1/17/1998 + 312 Gerard Butler 90.00 1000.00 + + 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 + 317 Bill Murray 789.65 +""".strip( + "\r\n" + ) + colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + + result = read_fwf(StringIO(test)) + tm.assert_frame_equal(result, expected) + + +def test_multiple_delimiters(): + test = r""" +col1~~~~~col2 col3++++++++++++++++++col4 +~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves + 33+++122.33\\\bar.........Gerard Butler +++44~~~~12.01 baz~~Jennifer Love Hewitt +~~55 11+++foo++++Jada Pinkett-Smith +..66++++++.03~~~bar Bill Murray +""".strip( + "\r\n" + ) + delimiter = " +~.\\" + colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) + expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) + + result = read_fwf(StringIO(test), delimiter=delimiter) + tm.assert_frame_equal(result, expected) + + +def test_variable_width_unicode(): + data = """ +שלום שלום +ום שלל +של ום +""".strip( + "\r\n" + ) + encoding = "utf8" + kwargs = dict(header=None, encoding=encoding) + + expected = read_fwf( + BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs + ) + result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [dict(), {"a": "float64", "b": str, "c": "int32"}]) +def test_dtype(dtype): + data = """ a b c +1 2 3.2 +3 4 5.2 +""" + colspecs = [(0, 5), (5, 10), (10, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) + + expected = pd.DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] + ) + + for col, dt in dtype.items(): + expected[col] = expected[col].astype(dt) + + tm.assert_frame_equal(result, expected) + + +def test_skiprows_inference(): + # see gh-11256 + data = """ +Text contained in the file header + +DataCol1 DataCol2 + 0.0 1.0 + 101.6 956.1 +""".strip() + skiprows = 2 + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + result = read_fwf(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_by_index_inference(): + data = """ +To be skipped +Not To Be Skipped +Once more to be skipped +123 34 8 123 +456 78 9 456 +""".strip() + skiprows = [0, 2] + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + result = read_fwf(StringIO(data), skiprows=skiprows) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_inference_empty(): + data = """ +AA BBB C +12 345 6 +78 901 2 +""".strip() + + msg = "No rows from which to infer column width" + with pytest.raises(EmptyDataError, match=msg): + read_fwf(StringIO(data), skiprows=3) + + +def test_whitespace_preservation(): + # see gh-16772 + header = None + csv_data = """ + a ,bbb + cc,dd """ + + fwf_data = """ + a bbb + ccdd """ + result = read_fwf( + StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + ) + expected = read_csv(StringIO(csv_data), header=header) + tm.assert_frame_equal(result, expected) + + +def test_default_delimiter(): + header = None + csv_data = """ +a,bbb +cc,dd""" + + fwf_data = """ +a \tbbb +cc\tdd """ + result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) + expected = read_csv(StringIO(csv_data), header=header) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("infer", [True, False, None]) +def 
test_fwf_compression(compression_only, infer): + data = """1111111111 + 2222222222 + 3333333333""".strip() + + compression = compression_only + extension = "gz" if compression == "gzip" else compression + + kwargs = dict(widths=[5, 5], names=["one", "two"]) + expected = read_fwf(StringIO(data), **kwargs) + + data = bytes(data, encoding="utf-8") + + with tm.ensure_clean(filename="tmp." + extension) as path: + tm.write_to_compressed(compression, path, data) + + if infer is not None: + kwargs["compression"] = "infer" if infer else compression + + result = read_fwf(path, **kwargs) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_skiprows.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_skiprows.py new file mode 100644 index 0000000..fdccef1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_skiprows.py @@ -0,0 +1,252 @@ +""" +Tests that skipped rows are properly handled during +parsing for all of the parsers defined in parsers.py +""" + +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError + +from pandas import DataFrame, Index +import pandas._testing as tm + + +@pytest.mark.parametrize("skiprows", [list(range(6)), 6]) +def test_skip_rows_bug(all_parsers, skiprows): + # see gh-505 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. +1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_deep_skip_rows(all_parsers): + # see gh-4382 + parser = all_parsers + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] + ) + + result = parser.read_csv(StringIO(data), skiprows=[6, 8]) + condensed_result = parser.read_csv(StringIO(condensed_data)) + tm.assert_frame_equal(result, condensed_result) + + +def test_skip_rows_blank(all_parsers): + # see gh-9832 + parser = all_parsers + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) + tm.assert_frame_equal(data, expected) + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""", + dict(skiprows=[1]), + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + dict(quotechar="~", skiprows=[2]), + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + dict(quotechar="~", skiprows=[1, 3]), + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) +def test_skip_row_with_newline(all_parsers, data, kwargs, expected): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_row_with_quote(all_parsers): + # see gh-12775 and gh-10911 + parser = all_parsers + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + + result = parser.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""", + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""", + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""", + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) +def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): + # see gh-12775 and gh-10911 + parser = all_parsers + result = parser.read_csv(StringIO(data), skiprows=[1]) + + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) +def test_skiprows_lineterminator(all_parsers, line_terminator): + # see gh-9079 + parser = all_parsers + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) + + if parser.engine == "python" and line_terminator == "\r": + pytest.skip("'CR' not respect with the Python parser yet") + + data = data.replace("\n", line_terminator) + result = parser.read_csv( + StringIO(data), + skiprows=1, + 
delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) + tm.assert_frame_equal(result, expected) + + +def test_skiprows_infield_quote(all_parsers): + # see gh-14459 + parser = all_parsers + data = 'a"\nb"\na\n1' + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(), DataFrame({"1": [3, 5]})), + (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})), + ], +) +def test_skip_rows_callable(all_parsers, kwargs, expected): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_skip_rows_skip_all(all_parsers): + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + msg = "No columns to parse from file" + + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: True) + + +def test_skip_rows_bad_callable(all_parsers): + msg = "by zero" + parser = all_parsers + data = "a\n1\n2\n3\n4\n5" + + with pytest.raises(ZeroDivisionError, match=msg): + parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_textreader.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_textreader.py new file mode 100644 index 0000000..8d5af85 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_textreader.py @@ -0,0 +1,347 @@ +""" +Tests the TextReader class in parsers.pyx, which +is integral to the C engine in parsers.py +""" +from io import BytesIO, StringIO +import os + +import numpy as np +import pytest + +import pandas._libs.parsers as parser +from pandas._libs.parsers import TextReader + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.parsers import TextFileReader, read_csv + + +class TestTextReader: + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "parser", "data") + self.csv1 = os.path.join(self.dirpath, "test1.csv") + self.csv2 = os.path.join(self.dirpath, "test2.csv") + self.xls1 = os.path.join(self.dirpath, "test.xls") + + def test_file_handle(self): + with open(self.csv1, "rb") as f: + reader = TextReader(f) + reader.read() + + def test_string_filename(self): + reader = TextReader(self.csv1, header=None) + reader.read() + + def test_file_handle_mmap(self): + with open(self.csv1, "rb") as f: + reader = TextReader(f, memory_map=True, header=None) + reader.read() + + def test_StringIO(self): + with open(self.csv1, "rb") as f: + text = f.read() + src = BytesIO(text) + reader = TextReader(src, header=None) + reader.read() + + def test_string_factorize(self): + # should this be optional? 
+ data = "a\nb\na\nb\na" + reader = TextReader(StringIO(data), header=None) + result = reader.read() + assert len(set(map(id, result[0]))) == 2 + + def test_skipinitialspace(self): + data = "a, b\na, b\na, b\na, b" + + reader = TextReader(StringIO(data), skipinitialspace=True, header=None) + result = reader.read() + + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b", "b"], dtype=np.object_) + ) + + def test_parse_booleans(self): + data = "True\nFalse\nTrue\nTrue" + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + assert result[0].dtype == np.bool_ + + def test_delimit_whitespace(self): + data = 'a b\na\t\t "b"\n"a"\t \t b' + + reader = TextReader(StringIO(data), delim_whitespace=True, header=None) + result = reader.read() + + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b"], dtype=np.object_) + ) + + def test_embedded_newline(self): + data = 'a\n"hello\nthere"\nthis' + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) + tm.assert_numpy_array_equal(result[0], expected) + + def test_euro_decimal(self): + data = "12345,67\n345,678" + + reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) + result = reader.read() + + expected = np.array([12345.67, 345.678]) + tm.assert_almost_equal(result[0], expected) + + def test_integer_thousands(self): + data = "123,456\n12,500" + + reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) + result = reader.read() + + expected = np.array([123456, 12500], dtype=np.int64) + tm.assert_almost_equal(result[0], expected) + + def test_integer_thousands_alt(self): + data = "123.456\n12.500" + + reader = TextFileReader( + StringIO(data), delimiter=":", thousands=".", header=None + ) + result = reader.read() + + expected = DataFrame([123456, 12500]) + tm.assert_frame_equal(result, expected) + + def test_skip_bad_lines(self, capsys): + # too many lines, see #2430 for why + data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" + + reader = TextReader(StringIO(data), delimiter=":", header=None) + msg = r"Error tokenizing data\. 
C error: Expected 3 fields in line 4, saw 4" + with pytest.raises(parser.ParserError, match=msg): + reader.read() + + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=False, + ) + result = reader.read() + expected = { + 0: np.array(["a", "d", "g", "l"], dtype=object), + 1: np.array(["b", "e", "h", "m"], dtype=object), + 2: np.array(["c", "f", "i", "n"], dtype=object), + } + assert_array_dicts_equal(result, expected) + + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=True, + ) + reader.read() + captured = capsys.readouterr() + + assert "Skipping line 4" in captured.err + assert "Skipping line 6" in captured.err + + def test_header_not_enough_lines(self): + data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" + + reader = TextReader(StringIO(data), delimiter=",", header=2) + header = reader.header + expected = [["a", "b", "c"]] + assert header == expected + + recs = reader.read() + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64), + } + assert_array_dicts_equal(recs, expected) + + def test_escapechar(self): + data = '\\"hello world"\n\\"hello world"\n\\"hello world"' + + reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") + result = reader.read() + expected = {0: np.array(['"hello world"'] * 3, dtype=object)} + assert_array_dicts_equal(result, expected) + + def test_eof_has_eol(self): + # handling of new line at EOF + pass + + def test_na_substitution(self): + pass + + def test_numpy_string_dtype(self): + data = """\ +a,1 +aa,2 +aaa,3 +aaaa,4 +aaaaa,5""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=",", header=None, **kwds) + + reader = _make_reader(dtype="S5,i4") + result = reader.read() + + assert result[0].dtype == "S5" + + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5") + assert (result[0] == ex_values).all() + assert result[1].dtype == "i4" + + reader = _make_reader(dtype="S4") + result = reader.read() + assert result[0].dtype == "S4" + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4") + assert (result[0] == ex_values).all() + assert result[1].dtype == "S4" + + def test_pass_dtype(self): + data = """\ +one,two +1,a +2,b +3,c +4,d""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=",", **kwds) + + reader = _make_reader(dtype={"one": "u1", 1: "S1"}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "S1" + + reader = _make_reader(dtype={"one": np.uint8, 1: object}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "O" + + reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}) + result = reader.read() + assert result[0].dtype == "u1" + assert result[1].dtype == "O" + + def test_usecols(self): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=",", **kwds) + + reader = _make_reader(usecols=(1, 2)) + result = reader.read() + + exp = _make_reader().read() + assert len(result) == 2 + assert (result[1] == exp[1]).all() + assert (result[2] == exp[2]).all() + + def test_cr_delimited(self): + def _test(text, **kwargs): + nice_text = text.replace("\r", "\r\n") + result = TextReader(StringIO(text), **kwargs).read() + expected = TextReader(StringIO(nice_text), **kwargs).read() + 
assert_array_dicts_equal(result, expected) + + data = "a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12" + _test(data, delimiter=",") + + data = "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12" + _test(data, delim_whitespace=True) + + data = "a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12" + _test(data, delimiter=",") + + sample = ( + "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r" + "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r" + ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0" + ) + _test(sample, delimiter=",") + + data = "A B C\r 2 3\r4 5 6" + _test(data, delim_whitespace=True) + + data = "A B C\r2 3\r4 5 6" + _test(data, delim_whitespace=True) + + def test_empty_field_eof(self): + data = "a,b,c\n1,2,3\n4,," + + result = TextReader(StringIO(data), delimiter=",").read() + + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array(["2", ""], dtype=object), + 2: np.array(["3", ""], dtype=object), + } + assert_array_dicts_equal(result, expected) + + # GH5664 + a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"]) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) + c = DataFrame( + [ + [1, 2, 3, 4], + [6, np.nan, np.nan, np.nan], + [8, 9, 10, 11], + [13, 14, np.nan, np.nan], + ], + columns=list("abcd"), + index=[0, 5, 7, 12], + ) + + for _ in range(100): + df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") + tm.assert_frame_equal(df, a) + + df = read_csv( + StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" + ) + tm.assert_frame_equal(df, b) + + df = read_csv( + StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), + names=list("abcd"), + engine="c", + ) + tm.assert_frame_equal(df, c) + + def test_empty_csv_input(self): + # GH14867 + df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) + assert isinstance(df, TextFileReader) + + +def assert_array_dicts_equal(left, right): + for k, v in left.items(): + tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k])) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_unsupported.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_unsupported.py new file mode 100644 index 0000000..267fae7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_unsupported.py @@ -0,0 +1,123 @@ +""" +Tests that features that are currently unsupported in +either the Python or C parser are actually enforced +and are clearly communicated to the user. + +Ultimately, the goal is to remove test cases from this +test suite as new feature support is added to the parsers. 
+""" +from io import StringIO + +import pytest + +from pandas.errors import ParserError + +import pandas._testing as tm + +import pandas.io.parsers as parsers +from pandas.io.parsers import read_csv + + +@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) +def python_engine(request): + return request.param + + +class TestUnsupportedFeatures: + def test_mangle_dupe_cols_false(self): + # see gh-12935 + data = "a b c\n1 2 3" + msg = "is not supported" + + for engine in ("c", "python"): + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) + + def test_c_engine(self): + # see gh-6607 + data = "a b c\n1 2 3" + msg = "does not support" + + # specify C engine with unsupported options (raise) + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", sep=r"\s") + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128)) + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine="c", skipfooter=1) + + # specify C-unsupported options without python-unsupported options + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), sep=r"\s") + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), sep="\t", quotechar=chr(128)) + with tm.assert_produces_warning(parsers.ParserWarning): + read_csv(StringIO(data), skipfooter=1) + + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + msg = "Error tokenizing data" + + with pytest.raises(ParserError, match=msg): + read_csv(StringIO(text), sep="\\s+") + with pytest.raises(ParserError, match=msg): + read_csv(StringIO(text), engine="c", sep="\\s+") + + msg = "Only length-1 thousands markers supported" + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), thousands=",,") + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), thousands="") + + msg = "Only length-1 line terminators supported" + data = "a,b,c~~1,2,3~~4,5,6" + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), lineterminator="~~") + + def test_python_engine(self, python_engine): + from pandas.io.parsers import _python_unsupported as py_unsupported + + data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + + for default in py_unsupported: + msg = ( + f"The {repr(default)} option is not " + f"supported with the {repr(python_engine)} engine" + ) + + kwargs = {default: object()} + with pytest.raises(ValueError, match=msg): + read_csv(StringIO(data), engine=python_engine, **kwargs) + + def test_python_engine_file_no_next(self, python_engine): + # see gh-16530 + class NoNextBuffer: + def __init__(self, csv_data): + self.data = csv_data + + def __iter__(self): + return self + + def read(self): + return self.data + + data = "a\n1" + msg = "The 'python' engine cannot iterate" + + with pytest.raises(ValueError, match=msg): + read_csv(NoNextBuffer(data), engine=python_engine) diff --git a/venv/Lib/site-packages/pandas/tests/io/parser/test_usecols.py b/venv/Lib/site-packages/pandas/tests/io/parser/test_usecols.py new file mode 100644 index 0000000..979eb47 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/parser/test_usecols.py @@ -0,0 +1,572 @@ +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp + +from pandas import DataFrame, Index +import pandas._testing as tm + +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns expected but not found: {0}" +) + + +def test_raise_on_mixed_dtype_usecols(all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) +def test_usecols(all_parsers, usecols): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_names(all_parsers): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + names = ["foo", "bar"] + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] +) +def test_usecols_relative_to_names(all_parsers, names, usecols): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_relative_to_names2(all_parsers): + # see gh-5766 + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_name_length_conflict(all_parsers): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + msg = ( + "Number of passed names did not match number of header fields in the file" + if parser.engine == "python" + else "Passed header names mismatches usecols" + ) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) + + +def test_usecols_single_string(all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz +1000, 2000, 3000 +4000, 5000, 6000""" + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") + + +@pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] +) +def test_usecols_index_col_false(all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", ["b", 0]) +@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) +def test_usecols_index_col_conflict(all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + tm.assert_frame_equal(result, expected) 
+ + +def test_usecols_index_col_conflict2(all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) + + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_implicit_index_col(all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_regex_sep(all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_whitespace(all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),), + ], +) +def test_usecols_with_integer_like_header(all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 +1000,2000,3000 +4000,5000,6000""" + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +def test_usecols_with_parse_dates(all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates2(all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 +2008-02-07 09:50,1042.54 +2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates3(all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j +2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + 
"g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates4(all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. + ], +) +def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_unicode_strings(all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "BBB": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["AAA", "BBB"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_single_byte_unicode_strings(all_parsers): + # see gh-13219 + data = """A,B,C,D +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "B": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]]) +def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) +def test_usecols_with_multi_byte_characters(all_parsers, usecols): + data = """あああ,いい,ううう,ええええ +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "いい": {0: 8, 1: 2, 2: 7}, + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +def test_empty_usecols(all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, 
expected) + + +def test_np_array_usecols(all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], +) +def test_callable_usecols(all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) +def test_incomplete_first_row(all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + dict(header=None), + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + dict(), + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], +) +def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + dict(), + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + dict(), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], dict(), None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + dict(), + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + dict(header=0, names=["A", "B", "C", "D"]), + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + dict(header=0, names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + dict(names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], +) +def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail( + reason="see gh-16469: works on the C engine but not the Python engine", strict=False +) +@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) +def 
test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/__init__.py b/venv/Lib/site-packages/pandas/tests/io/pytables/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/common.py b/venv/Lib/site-packages/pandas/tests/io/pytables/common.py new file mode 100644 index 0000000..d06f467 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/common.py @@ -0,0 +1,82 @@ +from contextlib import contextmanager +import os +import tempfile + +import pytest + +from pandas.io.pytables import HDFStore + +tables = pytest.importorskip("tables") +# set these parameters so we don't have file sharing +tables.parameters.MAX_NUMEXPR_THREADS = 1 +tables.parameters.MAX_BLOSC_THREADS = 1 +tables.parameters.MAX_THREADS = 1 + + +def safe_remove(path): + if path is not None: + try: + os.remove(path) + except OSError: + pass + + +def safe_close(store): + try: + if store is not None: + store.close() + except IOError: + pass + + +def create_tempfile(path): + """ create an unopened named temporary file """ + return os.path.join(tempfile.gettempdir(), path) + + +# contextmanager to ensure the file cleanup +@contextmanager +def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): + + try: + + # put in the temporary path if we don't have one already + if not len(os.path.dirname(path)): + path = create_tempfile(path) + + store = HDFStore( + path, mode=mode, complevel=complevel, complib=complib, fletcher32=False + ) + yield store + finally: + safe_close(store) + if mode == "w" or mode == "a": + safe_remove(path) + + +@contextmanager +def ensure_clean_path(path): + """ + return essentially a named temporary file that is not opened + and deleted on exiting; if path is a list, then create and + return list of filenames + """ + try: + if isinstance(path, list): + filenames = [create_tempfile(p) for p in path] + yield filenames + else: + filenames = [create_tempfile(path)] + yield filenames[0] + finally: + for f in filenames: + safe_remove(f) + + +def _maybe_remove(store, key): + """For tests using tables, try removing the table to be sure there is + no content from previous tests using the same table name.""" + try: + store.remove(key) + except (ValueError, KeyError): + pass diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/conftest.py b/venv/Lib/site-packages/pandas/tests/io/pytables/conftest.py new file mode 100644 index 0000000..214f95c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/conftest.py @@ -0,0 +1,17 @@ +import pytest + +import pandas._testing as tm + + +@pytest.fixture +def setup_path(): + """Fixture for setup path""" + return "tmp.__{}__.h5".format(tm.rands(10)) + + +@pytest.fixture(scope="module", autouse=True) +def setup_mode(): + """ Reset testing mode fixture""" + tm.reset_testing_mode() + yield + tm.set_testing_mode() diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/test_compat.py b/venv/Lib/site-packages/pandas/tests/io/pytables/test_compat.py new file mode 100644 index 0000000..c720038 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/test_compat.py @@ -0,0 +1,77 @@ +import pytest + +import pandas as pd 
+import pandas._testing as tm +from pandas.tests.io.pytables.common import ensure_clean_path + +tables = pytest.importorskip("tables") + + +@pytest.fixture +def pytables_hdf5_file(): + """ + Use PyTables to create a simple HDF5 file. + """ + table_schema = { + "c0": tables.Time64Col(pos=0), + "c1": tables.StringCol(5, pos=1), + "c2": tables.Int64Col(pos=2), + } + + t0 = 1_561_105_000.0 + + testsamples = [ + {"c0": t0, "c1": "aaaaa", "c2": 1}, + {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, + {"c0": t0 + 2, "c1": "ccccc", "c2": 10 ** 5}, + {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295}, + ] + + objname = "pandas_test_timeseries" + + with ensure_clean_path("written_with_pytables.h5") as path: + # The `ensure_clean_path` context mgr removes the temp file upon exit. + with tables.open_file(path, mode="w") as f: + t = f.create_table("/", name=objname, description=table_schema) + for sample in testsamples: + for key, value in sample.items(): + t.row[key] = value + t.row.append() + + yield path, objname, pd.DataFrame(testsamples) + + +class TestReadPyTablesHDF5: + """ + A group of tests which covers reading HDF5 files written by plain PyTables + (not written by pandas). + + Was introduced for regression-testing issue 11188. + """ + + def test_read_complete(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + result = pd.read_hdf(path, key=objname) + expected = df + tm.assert_frame_equal(result, expected) + + def test_read_with_start(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1) + expected = df[1:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + def test_read_with_stop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, stop=1) + expected = df[:1].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + def test_read_with_startstop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1, stop=2) + expected = df[1:2].reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/test_complex.py b/venv/Lib/site-packages/pandas/tests/io/pytables/test_complex.py new file mode 100644 index 0000000..543940e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/test_complex.py @@ -0,0 +1,185 @@ +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store + +from pandas.io.pytables import read_hdf + +# GH10447 + + +def test_complex_fixed(setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def 
test_complex_table(setup_path): + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +@td.xfail_non_writeable +def test_complex_mixed_fixed(setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_mixed_table(setup_path): + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") + tm.assert_frame_equal(df.loc[df.A > 2], result) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") + tm.assert_frame_equal(df, reread) + + +def test_complex_across_dimensions_fixed(setup_path): + with catch_warnings(record=True): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + objs = [s, df] + comps = [tm.assert_series_equal, tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") + comp(obj, reread) + + +def test_complex_across_dimensions(setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) + + with catch_warnings(record=True): + + objs = [df] + comps = [tm.assert_frame_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(setup_path) as path: + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") + comp(obj, reread) + + +def test_complex_indexing_error(setup_path): + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) + with ensure_clean_store(setup_path) as store: + with pytest.raises(TypeError): + store.append("df", df, data_columns=["C"]) + + +def test_complex_series_error(setup_path): + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(TypeError): + 
s.to_hdf(path, "obj", format="t") + + with ensure_clean_path(setup_path) as path: + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") + tm.assert_series_equal(s, reread) + + +def test_complex_append(setup_path): + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(pd.concat([df, df], 0), result) diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/test_pytables_missing.py b/venv/Lib/site-packages/pandas/tests/io/pytables/test_pytables_missing.py new file mode 100644 index 0000000..9adb0a6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/test_pytables_missing.py @@ -0,0 +1,14 @@ +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +@td.skip_if_installed("tables") +def test_pytables_raises(): + df = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(ImportError, match="tables"): + with tm.ensure_clean("foo.h5") as path: + df.to_hdf(path, "df") diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py b/venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py new file mode 100644 index 0000000..f56d042 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py @@ -0,0 +1,4796 @@ +import datetime +from datetime import timedelta +from distutils.version import LooseVersion +from io import BytesIO +import os +from pathlib import Path +import re +from warnings import catch_warnings, simplefilter + +import numpy as np +import pytest + +from pandas.compat import is_platform_little_endian, is_platform_windows +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_categorical_dtype + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + RangeIndex, + Series, + Timestamp, + bdate_range, + concat, + date_range, + isna, + timedelta_range, +) +import pandas._testing as tm +from pandas.tests.io.pytables.common import ( + _maybe_remove, + create_tempfile, + ensure_clean_path, + ensure_clean_store, + safe_close, + safe_remove, + tables, +) + +from pandas.io.pytables import ( + ClosedFileError, + HDFStore, + PossibleDataLossError, + Term, + read_hdf, +) + +from pandas.io import pytables as pytables # noqa: E402 isort:skip +from pandas.io.pytables import TableIterator # noqa: E402 isort:skip + + +_default_compressor = "blosc" +ignore_natural_naming_warning = pytest.mark.filterwarnings( + "ignore:object name:tables.exceptions.NaturalNameWarning" +) + + +@pytest.mark.single +class TestHDFStore: + def test_format_type(self, setup_path): + df = pd.DataFrame({"A": [1, 2]}) + with ensure_clean_path(setup_path) as path: + with HDFStore(path) as store: + store.put("a", df, format="fixed") + store.put("b", df, format="table") + + assert store.get_storer("a").format_type == "fixed" + assert store.get_storer("b").format_type == "table" + + def test_format_kwarg_in_constructor(self, setup_path): + # GH 13291 + + msg = "format is not a defined argument for HDFStore" + + with ensure_clean_path(setup_path) as path: + with pytest.raises(ValueError, match=msg): + HDFStore(path, format="table") + + def test_context(self, setup_path): + path = create_tempfile(setup_path) + try: + with HDFStore(path) as tbl: + raise 
ValueError("blah") + except ValueError: + pass + finally: + safe_remove(path) + + try: + with HDFStore(path) as tbl: + tbl["a"] = tm.makeDataFrame() + + with HDFStore(path) as tbl: + assert len(tbl) == 1 + assert type(tbl["a"]) == DataFrame + finally: + safe_remove(path) + + def test_conv_read_write(self, setup_path): + path = create_tempfile(setup_path) + try: + + def roundtrip(key, obj, **kwargs): + obj.to_hdf(path, key, **kwargs) + return read_hdf(path, key) + + o = tm.makeTimeSeries() + tm.assert_series_equal(o, roundtrip("series", o)) + + o = tm.makeStringSeries() + tm.assert_series_equal(o, roundtrip("string_series", o)) + + o = tm.makeDataFrame() + tm.assert_frame_equal(o, roundtrip("frame", o)) + + # table + df = DataFrame(dict(A=range(5), B=range(5))) + df.to_hdf(path, "table", append=True) + result = read_hdf(path, "table", where=["index>2"]) + tm.assert_frame_equal(df[df.index > 2], result) + + finally: + safe_remove(path) + + def test_long_strings(self, setup_path): + + # GH6166 + df = DataFrame( + {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) + ) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=["a"]) + + result = store.select("df") + tm.assert_frame_equal(df, result) + + def test_api(self, setup_path): + + # GH4584 + # API issue when to_hdf doesn't accept append AND format args + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path, "df", append=True, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path, "df", append=True) + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + # append to False + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True) + tm.assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", append=False, format="fixed") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False, format="f") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df", append=False) + tm.assert_frame_equal(read_hdf(path, "df"), df) + + df.to_hdf(path, "df") + tm.assert_frame_equal(read_hdf(path, "df"), df) + + with ensure_clean_store(setup_path) as store: + + path = store._path + df = tm.makeDataFrame() + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # append to False + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + # formats + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + tm.assert_frame_equal(store.select("df"), df) + + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + 
store.append("df", df.iloc[10:], append=True, format=None) + tm.assert_frame_equal(store.select("df"), df) + + with ensure_clean_path(setup_path) as path: + # Invalid. + df = tm.makeDataFrame() + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", append=True, format="f") + + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", append=True, format="fixed") + + msg = r"invalid HDFStore format specified \[foo\]" + + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=True, format="foo") + + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=False, format="foo") + + # File path doesn't exist + path = "" + msg = f"File {path} does not exist" + + with pytest.raises(FileNotFoundError, match=msg): + read_hdf(path, "df") + + def test_api_default_format(self, setup_path): + + # default_format option + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): + store.append("df2", df) + + pd.set_option("io.hdf.default_format", "table") + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table + + pd.set_option("io.hdf.default_format", None) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + + pd.set_option("io.hdf.default_format", "fixed") + df.to_hdf(path, "df") + with HDFStore(path) as store: + assert not store.get_storer("df").is_table + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df2", append=True) + + pd.set_option("io.hdf.default_format", "table") + df.to_hdf(path, "df3") + with HDFStore(path) as store: + assert store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) + with HDFStore(path) as store: + assert store.get_storer("df4").is_table + + pd.set_option("io.hdf.default_format", None) + + def test_keys(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + assert len(store) == 3 + expected = {"/a", "/b", "/c"} + assert set(store.keys()) == expected + assert set(store) == expected + + def test_keys_ignore_hdf_softlink(self, setup_path): + + # GH 20523 + # Puts a softlink into HDF file and rereads + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(dict(A=range(5), B=range(5))) + store.put("df", df) + + assert store.keys() == ["/df"] + + store._handle.create_soft_link(store._handle.root, "symlink", "df") + + # Should ignore the softlink + assert store.keys() == ["/df"] + + def test_iter_empty(self, setup_path): + + with ensure_clean_store(setup_path) as store: + # GH 12221 + assert list(store) == [] + + def test_repr(self, setup_path): + + with ensure_clean_store(setup_path) as store: + repr(store) + store.info() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() + + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = 
datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store["df"] = df + + # make a random group in hdf space + store._handle.create_group(store._handle.root, "bah") + + assert store.filename in repr(store) + assert store.filename in str(store) + store.info() + + # storers + with ensure_clean_store(setup_path) as store: + + df = tm.makeDataFrame() + store.append("df", df) + + s = store.get_storer("df") + repr(s) + str(s) + + @ignore_natural_naming_warning + def test_contains(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store + + # gh-2694: tables.NaturalNameWarning + with catch_warnings(record=True): + store["node())"] = tm.makeDataFrame() + assert "node())" in store + + def test_versioning(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" + + # write a file and wipe its versioning + _maybe_remove(store, "df2") + store.append("df2", df) + + # this is an error because its table_type is appendable, but no + # version info + store.get_node("df2")._v_attrs.pandas_version = None + + msg = "'NoneType' object has no attribute 'startswith'" + + with pytest.raises(Exception, match=msg): + store.select("df2") + + def test_mode(self, setup_path): + + df = tm.makeTimeDataFrame() + + def check(mode): + + with ensure_clean_path(setup_path) as path: + + # constructor + if mode in ["r", "r+"]: + with pytest.raises(IOError): + HDFStore(path, mode=mode) + + else: + store = HDFStore(path, mode=mode) + assert store._handle.mode == mode + store.close() + + with ensure_clean_path(setup_path) as path: + + # context + if mode in ["r", "r+"]: + with pytest.raises(IOError): + with HDFStore(path, mode=mode) as store: # noqa + pass + else: + with HDFStore(path, mode=mode) as store: + assert store._handle.mode == mode + + with ensure_clean_path(setup_path) as path: + + # conv write + if mode in ["r", "r+"]: + with pytest.raises(IOError): + df.to_hdf(path, "df", mode=mode) + df.to_hdf(path, "df", mode="w") + else: + df.to_hdf(path, "df", mode=mode) + + # conv read + if mode in ["w"]: + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." 
+ ) + with pytest.raises(ValueError, match=msg): + read_hdf(path, "df", mode=mode) + else: + result = read_hdf(path, "df", mode=mode) + tm.assert_frame_equal(result, df) + + def check_default_mode(): + + # read_hdf uses default mode + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") + tm.assert_frame_equal(result, df) + + check("r") + check("r+") + check("a") + check("w") + check_default_mode() + + def test_reopen_handle(self, setup_path): + + with ensure_clean_path(setup_path) as path: + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # invalid mode change + with pytest.raises(PossibleDataLossError): + store.open("w") + + store.close() + assert not store.is_open + + # truncation ok here + store.open("w") + assert store.is_open + assert len(store) == 0 + store.close() + assert not store.is_open + + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() + + # reopen as read + store.open("r") + assert store.is_open + assert len(store) == 1 + assert store._mode == "r" + store.close() + assert not store.is_open + + # reopen as append + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + # reopen as append (again) + store.open("a") + assert store.is_open + assert len(store) == 1 + assert store._mode == "a" + store.close() + assert not store.is_open + + def test_open_args(self, setup_path): + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + + # create an in memory store + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) + + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) + + store.close() + + # the file should not have actually been written + assert not os.path.exists(path) + + def test_flush(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + store.flush() + store.flush(fsync=True) + + def test_get(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeSeries() + left = store.get("a") + right = store["a"] + tm.assert_series_equal(left, right) + + left = store.get("/a") + right = store["/a"] + tm.assert_series_equal(left, right) + + with pytest.raises(KeyError, match="'No object named b in the file'"): + store.get("b") + + @pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], + ) + def test_walk(self, where, expected, setup_path): + # GH10143 + objs = { + "df1": pd.DataFrame([1, 2, 3]), + "df2": pd.DataFrame([4, 5, 6]), + "df3": pd.DataFrame([6, 7, 8]), + "df4": pd.DataFrame([9, 10, 11]), + "s1": pd.Series([10, 9, 8]), + # Next 3 items aren't pandas objects and should be ignored + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), + } + + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", 
objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) + # Create non-pandas objects + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) + + assert len(list(store.walk(where=where))) == len(expected) + for path, groups, leaves in store.walk(where=where): + assert path in expected + expected_groups, expected_frames = expected[path] + assert expected_groups == set(groups) + assert expected_frames == set(leaves) + for leaf in leaves: + frame_path = "/".join([path, leaf]) + obj = store.get(frame_path) + if "df" in leaf: + tm.assert_frame_equal(obj, objs[leaf]) + else: + tm.assert_series_equal(obj, objs[leaf]) + + def test_getattr(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + s = tm.makeTimeSeries() + store["a"] = s + + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store, "a") + tm.assert_series_equal(result, s) + + df = tm.makeTimeDataFrame() + store["df"] = df + result = store.df + tm.assert_frame_equal(result, df) + + # errors + for x in ["d", "mode", "path", "handle", "complib"]: + with pytest.raises(AttributeError): + getattr(store, x) + + # not stores + for x in ["mode", "path", "handle", "complib"]: + getattr(store, "_{x}".format(x=x)) + + def test_put(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") + + # not OK, not a table + with pytest.raises(ValueError): + store.put("b", df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False + # in this case + _maybe_remove(store, "f") + with pytest.raises(ValueError): + store.put("f", df[10:], append=True) + + # can't put to a table (use append instead) + with pytest.raises(ValueError): + store.put("c", df[10:], append=True) + + # overwrite table + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) + + def test_put_string_index(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + index = Index( + ["I am a very long string index: {i}".format(i=i) for i in range(20)] + ) + s = Series(np.arange(20), index=index) + df = DataFrame({"A": s, "B": s}) + + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + # mixed length + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + ["I am a very long string index: {i}".format(i=i) for i in range(20)] + ) + s = Series(np.arange(21), index=index) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) + + store["b"] = df + tm.assert_frame_equal(store["b"], df) + + def test_put_compression(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) + + # can't compress if format='fixed' + with pytest.raises(ValueError): + store.put("b", df, format="fixed", complib="zlib") + + @td.skip_if_windows_python_3 + def test_put_compression_blosc(self, setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + # can't 
compress if format='fixed' + with pytest.raises(ValueError): + store.put("b", df, format="fixed", complib="blosc") + + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) + + def test_complibs_default_settings(self, setup_path): + # GH15943 + df = tm.makeDataFrame() + + # Set complevel and check if complib is automatically set to + # default value + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complevel=9) + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "zlib" + + # Set complib and check to see if compression is disabled + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df", complib="zlib") + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if not setting complib or complevel results in no compression + with ensure_clean_path(setup_path) as tmpfile: + df.to_hdf(tmpfile, "df") + result = pd.read_hdf(tmpfile, "df") + tm.assert_frame_equal(result, df) + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + + # Check if file-defaults can be overridden on a per table basis + with ensure_clean_path(setup_path) as tmpfile: + store = pd.HDFStore(tmpfile) + store.append("dfc", df, complevel=9, complib="blosc") + store.append("df", df) + store.close() + + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): + assert node.filters.complevel == 0 + assert node.filters.complib is None + for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): + assert node.filters.complevel == 9 + assert node.filters.complib == "blosc" + + def test_complibs(self, setup_path): + # GH14478 + df = tm.makeDataFrame() + + # Building list of all complibs and complevels tuples + all_complibs = tables.filters.all_complibs + # Remove lzo if its not available on this platform + if not tables.which_lib_version("lzo"): + all_complibs.remove("lzo") + # Remove bzip2 if its not available on this platform + if not tables.which_lib_version("bzip2"): + all_complibs.remove("bzip2") + + all_levels = range(0, 10) + all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] + + for (lib, lvl) in all_tests: + with ensure_clean_path(setup_path) as tmpfile: + gname = "foo" + + # Write and read file to see if data is consistent + df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) + result = pd.read_hdf(tmpfile, gname) + tm.assert_frame_equal(result, df) + + # Open file and check metadata + # for correct amount of compression + h5table = tables.open_file(tmpfile, mode="r") + for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): + assert node.filters.complevel == lvl + if lvl == 0: + assert node.filters.complib is None + else: + assert node.filters.complib == lib + h5table.close() + + def test_put_integer(self, setup_path): + # non-date, non-string index + df = DataFrame(np.random.randn(50, 100)) + self._check_roundtrip(df, tm.assert_frame_equal, setup_path) + + @td.xfail_non_writeable 
+ def test_put_mixed_type(self, setup_path): + df = tm.makeTimeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + store.put("df", df) + + expected = store.get("df") + tm.assert_frame_equal(expected, df) + + @pytest.mark.filterwarnings( + "ignore:object name:tables.exceptions.NaturalNameWarning" + ) + def test_append(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # this is allowed by almost always don't want to do it + # tables.NaturalNameWarning): + with catch_warnings(record=True): + + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) + + _maybe_remove(store, "df2") + store.put("df2", df[:10], format="table") + store.append("df2", df[10:]) + tm.assert_frame_equal(store["df2"], df) + + _maybe_remove(store, "df3") + store.append("/df3", df[:10]) + store.append("/df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + # this is allowed by almost always don't want to do it + # tables.NaturalNameWarning + _maybe_remove(store, "/df3 foo") + store.append("/df3 foo", df[:10]) + store.append("/df3 foo", df[10:]) + tm.assert_frame_equal(store["df3 foo"], df) + + # dtype issues - mizxed type in a single object column + df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) + df["mixed_column"] = "testing" + df.loc[2, "mixed_column"] = np.nan + _maybe_remove(store, "df") + store.append("df", df) + tm.assert_frame_equal(store["df"], df) + + # uints - test storage of uints + uint_data = DataFrame( + { + "u08": Series( + np.random.randint(0, high=255, size=5), dtype=np.uint8 + ), + "u16": Series( + np.random.randint(0, high=65535, size=5), dtype=np.uint16 + ), + "u32": Series( + np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 + ), + "u64": Series( + [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], + dtype=np.uint64, + ), + }, + index=np.arange(5), + ) + _maybe_remove(store, "uints") + store.append("uints", uint_data) + tm.assert_frame_equal(store["uints"], uint_data) + + # uints - test storage of uints in indexable columns + _maybe_remove(store, "uints") + # 64-bit indices not yet supported + store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) + tm.assert_frame_equal(store["uints"], uint_data) + + def test_append_series(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # basic + ss = tm.makeStringSeries() + ts = tm.makeTimeSeries() + ns = Series(np.arange(100)) + + store.append("ss", ss) + result = store["ss"] + tm.assert_series_equal(result, ss) + assert result.name is None + + store.append("ts", ts) + result = store["ts"] + tm.assert_series_equal(result, ts) + assert result.name is None + + ns.name = "foo" + store.append("ns", ns) + result = store["ns"] + tm.assert_series_equal(result, ns) + assert result.name == ns.name + + # select on the values + expected = ns[ns > 60] + result = store.select("ns", "foo>60") + 
tm.assert_series_equal(result, expected) + + # select on the index and values + expected = ns[(ns > 70) & (ns.index < 90)] + result = store.select("ns", "foo>70 and index<90") + tm.assert_series_equal(result, expected) + + # multi-index + mi = DataFrame(np.random.randn(5, 1), columns=["A"]) + mi["B"] = np.arange(len(mi)) + mi["C"] = "foo" + mi.loc[3:5, "C"] = "bar" + mi.set_index(["C", "B"], inplace=True) + s = mi.stack() + s.index = s.index.droplevel(2) + store.append("mi", s) + tm.assert_series_equal(store["mi"], s) + + def test_store_index_types(self, setup_path): + # GH5386 + # test storing various index types + + with ensure_clean_store(setup_path) as store: + + def check(format, index): + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.index = index(len(df)) + + _maybe_remove(store, "df") + store.put("df", df, format=format) + tm.assert_frame_equal(df, store["df"]) + + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeIntIndex, + tm.makeDateIndex, + ]: + + check("table", index) + check("fixed", index) + + # period index currently broken for table + # seee GH7796 FIXME + check("fixed", tm.makePeriodIndex) + # check('table',tm.makePeriodIndex) + + # unicode + index = tm.makeUnicodeIndex + check("table", index) + check("fixed", index) + + @pytest.mark.skipif( + not is_platform_little_endian(), reason="reason platform is not little endian" + ) + def test_encoding(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame(dict(A="foo", B="bar"), index=range(5)) + df.loc[2, "A"] = np.nan + df.loc[3, "B"] = np.nan + _maybe_remove(store, "df") + store.append("df", df, encoding="ascii") + tm.assert_frame_equal(store["df"], df) + + expected = df.reindex(columns=["A"]) + result = store.select("df", Term("columns=A", encoding="ascii")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "val", + [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ], + ) + @pytest.mark.parametrize("dtype", ["category", object]) + def test_latin_encoding(self, setup_path, dtype, val): + enc = "latin-1" + nan_rep = "" + key = "data" + + val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] + ser = pd.Series(val, dtype=dtype) + + with ensure_clean_path(setup_path) as store: + ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) + retr = read_hdf(store, key) + + s_nan = ser.replace(nan_rep, np.nan) + + if is_categorical_dtype(s_nan): + assert is_categorical_dtype(retr) + tm.assert_series_equal( + s_nan, retr, check_dtype=False, check_categorical=False + ) + else: + tm.assert_series_equal(s_nan, retr) + + # FIXME: don't leave commented-out + # fails: + # for x in examples: + # roundtrip(s, nan_rep=b'\xf8\xfc') + + def test_append_some_nans(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + # some nans + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + 
tm.assert_frame_equal(store["df1"], df) + + # first column + df1 = df.copy() + df1.loc[:, "A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) + + # 2nd column + df2 = df.copy() + df2.loc[:, "A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) + + # datetimes + df3 = df.copy() + df3.loc[:, "E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) + + def test_append_all_nans(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) + df.loc[0:15, :] = np.nan + + # nan some entire rows (dropna=True) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # tests the option io.hdf.dropna_table + pd.set_option("io.hdf.dropna_table", False) + _maybe_remove(store, "df3") + store.append("df3", df[:10]) + store.append("df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + pd.set_option("io.hdf.dropna_table", True) + _maybe_remove(store, "df4") + store.append("df4", df[:10]) + store.append("df4", df[10:]) + tm.assert_frame_equal(store["df4"], df[-4:]) + + # nan some entire rows (string are still written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # nan some entire rows (but since we have dates they are still + # written!) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) + + df.loc[0:15, :] = np.nan + + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) + + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) + + # Test to make sure defaults are to not drop. 
+ # Corresponding to Issue 9382 + df_with_missing = DataFrame( + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) + + with ensure_clean_path(setup_path) as path: + df_with_missing.to_hdf(path, "df_with_missing", format="table") + reloaded = read_hdf(path, "df_with_missing") + tm.assert_frame_equal(df_with_missing, reloaded) + + def test_read_missing_key_close_store(self, setup_path): + # GH 25766 + with ensure_clean_path(setup_path) as path: + df = pd.DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") + + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + pd.read_hdf(path, "k2") + + # smoke test to test that file is properly closed after + # read with KeyError before another write + df.to_hdf(path, "k2") + + def test_read_missing_key_opened_store(self, setup_path): + # GH 28699 + with ensure_clean_path(setup_path) as path: + df = pd.DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") + + store = pd.HDFStore(path, "r") + + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + pd.read_hdf(store, "k2") + + # Test that the file is still open after a KeyError and that we can + # still read from it. + pd.read_hdf(store, "k1") + + def test_append_frame_column_oriented(self, setup_path): + with ensure_clean_store(setup_path) as store: + + # column oriented + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df1") + store.append("df1", df.iloc[:, :2], axes=["columns"]) + store.append("df1", df.iloc[:, 2:]) + tm.assert_frame_equal(store["df1"], df) + + result = store.select("df1", "columns=A") + expected = df.reindex(columns=["A"]) + tm.assert_frame_equal(expected, result) + + # selection on the non-indexable + result = store.select("df1", ("columns=A", "index=df.index[0:4]")) + expected = df.reindex(columns=["A"], index=df.index[0:4]) + tm.assert_frame_equal(expected, result) + + # this isn't supported + with pytest.raises(TypeError): + store.select("df1", "columns=A and index>df.index[4]") + + def test_append_with_different_block_ordering(self, setup_path): + + # GH 4096; using same frames, but different block orderings + with ensure_clean_store(setup_path) as store: + + for i in range(10): + + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df["index"] = range(10) + df["index"] += i * 10 + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + + if i % 2 == 0: + del df["int64"] + df["int64"] = Series([1] * len(df), dtype="int64") + if i % 3 == 0: + a = df.pop("A") + df["A"] = a + + df.set_index("index", inplace=True) + + store.append("df", df) + + # test a different ordering but with more fields (like invalid + # combinate) + with ensure_clean_store(setup_path) as store: + + df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + store.append("df", df) + + # store additional fields in different blocks + df["int16_2"] = Series([1] * len(df), dtype="int16") + with pytest.raises(ValueError): + store.append("df", df) + + # store multiple additional fields in different blocks + df["float_3"] = Series([1.0] * len(df), dtype="float64") + with pytest.raises(ValueError): + store.append("df", df) + + def test_append_with_strings(self, setup_path): + + with ensure_clean_store(setup_path) as store: + with catch_warnings(record=True): + + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, 
name).itemsize + == size + ) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big", df) + tm.assert_frame_equal(store.select("df_big"), df) + check_col("df_big", "values_block_1", 15) + + # appending smaller string ok + df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) + store.append("df_big", df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select("df_big"), expected) + check_col("df_big", "values_block_1", 15) + + # avoid truncation on elements + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big2", df, min_itemsize={"values": 50}) + tm.assert_frame_equal(store.select("df_big2"), df) + check_col("df_big2", "values_block_1", 50) + + # bigger string on next append + store.append("df_new", df) + df_new = DataFrame( + [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] + ) + with pytest.raises(ValueError): + store.append("df_new", df_new) + + # min_itemsize on Series index (GH 11412) + df = tm.makeMixedDataFrame().set_index("C") + store.append("ss", df["B"], min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss"), df["B"]) + + # same as above, with data_columns=True + store.append( + "ss2", df["B"], data_columns=True, min_itemsize={"index": 4} + ) + tm.assert_series_equal(store.select("ss2"), df["B"]) + + # min_itemsize in index without appending (GH 10381) + store.put("ss3", df, format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + store.append("ss3", df2) + tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) + + # same as above, with a Series + store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) + store.append("ss4", df2["B"]) + tm.assert_series_equal( + store.select("ss4"), pd.concat([df["B"], df2["B"]]) + ) + + # with nans + _maybe_remove(store, "df") + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[1:4, "string"] = np.nan + df["string2"] = "bar" + df.loc[4:8, "string2"] = np.nan + df["string3"] = "bah" + df.loc[1:, "string3"] = np.nan + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + with ensure_clean_store(setup_path) as store: + + def check_col(key, name, size): + assert getattr( + store.get_storer(key).table.description, name + ).itemsize, size + + df = DataFrame(dict(A="foo", B="bar"), index=range(10)) + + # a min_itemsize that creates a data_column + _maybe_remove(store, "df") + store.append("df", df, min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["B", "A"] + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) + check_col("df", "B", 200) + check_col("df", "values_block_0", 200) + assert store.get_storer("df").data_columns == ["B"] + + # infer the .typ on subsequent appends + _maybe_remove(store, "df") + store.append("df", df[:5], min_itemsize=200) + store.append("df", df[5:], min_itemsize=200) + tm.assert_frame_equal(store["df"], df) + + # invalid min_itemsize keys + df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) + _maybe_remove(store, "df") 
+ with pytest.raises(ValueError): + store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) + + def test_append_with_empty_string(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # with all empty strings (GH 12242) + df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) + store.append("df", df[:-1], min_itemsize={"x": 1}) + store.append("df", df[-1:], min_itemsize={"x": 1}) + tm.assert_frame_equal(store.select("df"), df) + + def test_to_hdf_with_min_itemsize(self, setup_path): + + with ensure_clean_path(setup_path) as path: + + # min_itemsize in index with to_hdf (GH 10381) + df = tm.makeMixedDataFrame().set_index("C") + df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + df2.to_hdf(path, "ss3", append=True, format="table") + tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) + + # same as above, with a Series + df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, "ss4", append=True, format="table") + tm.assert_series_equal( + pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) + ) + + @pytest.mark.parametrize( + "format", [pytest.param("fixed", marks=td.xfail_non_writeable), "table"] + ) + def test_to_hdf_errors(self, format, setup_path): + + data = ["\ud800foo"] + ser = pd.Series(data, index=pd.Index(data)) + with ensure_clean_path(setup_path) as path: + # GH 20835 + ser.to_hdf(path, "table", format=format, errors="surrogatepass") + + result = pd.read_hdf(path, "table", errors="surrogatepass") + tm.assert_series_equal(result, ser) + + def test_append_with_data_columns(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeTimeDataFrame() + df.iloc[0, df.columns.get_loc("B")] = 1.0 + _maybe_remove(store, "df") + store.append("df", df[:2], data_columns=["B"]) + store.append("df", df[2:]) + tm.assert_frame_equal(store["df"], df) + + # check that we have indices created + assert store._handle.root.df.table.cols.index.is_indexed is True + assert store._handle.root.df.table.cols.B.is_indexed is True + + # data column searching + result = store.select("df", "B>0") + expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select("df", "B>0 and index>df.index[3]") + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new["string"] = "foo" + df_new.loc[1:4, "string"] = np.nan + df_new.loc[5:6, "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key, name, size): + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], min_itemsize={"string": 30} + ) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], 
min_itemsize={"values": 30} + ) + check_col("df", "string", 30) + + with ensure_clean_store(setup_path) as store: + df_new["string2"] = "foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, "string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) + + with ensure_clean_store(setup_path) as store: + # multiple data columns + df_new = df.copy() + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" + + sl = df_new.columns.get_loc("string") + df_new.iloc[1:4, sl] = np.nan + df_new.iloc[5:6, sl] = "bar" + + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") + df_new.iloc[2:5, sl] = np.nan + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select( + "df", "string='foo' and string2='foo' and A>0 and B<0" + ) + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] + tm.assert_frame_equal(result, expected, check_index_type=False) + + # yield an empty frame + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(setup_path) as store: + # doc example + df_dc = df.copy() + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") + df_dc = df_dc._convert(datetime=True) + df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan + + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected, check_index_type=False) + + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(setup_path) as store: + # doc example part 2 + np.random.seed(1234) + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame( + np.random.randn(8, 3), index=index, columns=["A", "B", "C"] + ) + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() + df_dc["string2"] = "cool" + + # on-disk operations + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + + result = store.select("df_dc", "B>0") + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) + + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] + tm.assert_frame_equal(result, expected) + + def test_create_table_index(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + + # data columns + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df["string2"] = "bar" + 
store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True + + # specify index=columns + store.append( + "f2", df, index=["string"], data_columns=["string", "string2"] + ) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False + + # try to index a non-table + _maybe_remove(store, "f2") + store.put("f2", df) + with pytest.raises(TypeError): + store.create_table_index("f2") + + def test_append_hierarchical(self, setup_path): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + with ensure_clean_store(setup_path) as store: + store.append("mi", df) + result = store.select("mi") + tm.assert_frame_equal(result, df) + + # GH 3748 + result = store.select("mi", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path("test.hdf") as path: + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + def test_column_multiindex(self, setup_path): + # GH 4710 + # recreate multi-indexes properly + + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) + df = DataFrame(np.arange(12).reshape(3, 4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + with pytest.raises(ValueError): + store.put("df2", df, format="table", data_columns=["A"]) + with pytest.raises(ValueError): + store.put("df3", df, format="table", data_columns=True) + + # appending multi-column on existing table (see GH 6167) + with ensure_clean_store(setup_path) as store: + store.append("df2", df) + store.append("df2", df) + + tm.assert_frame_equal(store["df2"], concat((df, df))) + + # non_index_axes name + df = DataFrame( + np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") + ) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + + with ensure_clean_store(setup_path) as store: + + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) + + def test_store_multiindex(self, setup_path): + + # validate multi-index names + # GH 5527 + with ensure_clean_store(setup_path) as store: + + def make_index(names=None): + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) + + # no names + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # partial names + 
_maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + # series + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) + + # dup with column + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) + with pytest.raises(ValueError): + store.append("df", df) + + # dup within level + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) + with pytest.raises(ValueError): + store.append("df", df) + + # fully names + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + + def test_select_columns_in_where(self, setup_path): + + # GH 6169 + # recreate multi-indexes when columns is passed + # in the `where` argument + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo_name", "bar_name"], + ) + + # With a DataFrame + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + expected = df[["A"]] + + tm.assert_frame_equal(store.select("df", columns=["A"]), expected) + + tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) + + # With a Series + s = Series(np.random.randn(10), index=index, name="A") + with ensure_clean_store(setup_path) as store: + store.put("s", s, format="table") + tm.assert_series_equal(store.select("s", where="columns=['A']"), s) + + def test_mi_data_columns(self, setup_path): + # GH 14435 + idx = pd.MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + + with ensure_clean_store(setup_path) as store: + store.append("df", df, data_columns=True) + + actual = store.select("df", where="id == 1") + expected = df.iloc[[1], :] + tm.assert_frame_equal(actual, expected) + + def test_pass_spec_to_storer(self, setup_path): + + df = tm.makeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df) + with pytest.raises(TypeError): + store.select("df", columns=["A"]) + with pytest.raises(TypeError): + store.select("df", where=[("columns=A")]) + + @td.xfail_non_writeable + def test_append_misc(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + store.append("df", df, chunksize=1) + result = store.select("df") + tm.assert_frame_equal(result, df) + + store.append("df1", df, expectedrows=10) + result = store.select("df1") + tm.assert_frame_equal(result, df) + + # more chunksize in append tests + def check(obj, comparator): + for c in [10, 200, 1000]: + with ensure_clean_store(setup_path, mode="w") as store: + store.append("obj", obj, chunksize=c) + result = store.select("obj") + comparator(result, obj) + + df = tm.makeDataFrame() + df["string"] = "foo" + df["float322"] = 1.0 + df["float322"] = df["float322"].astype("float32") 
+ df["bool"] = df["float322"] > 0 + df["time1"] = Timestamp("20130101") + df["time2"] = Timestamp("20130102") + check(df, tm.assert_frame_equal) + + # empty frame, GH4273 + with ensure_clean_store(setup_path) as store: + + # 0 len + df_empty = DataFrame(columns=list("ABC")) + store.append("df", df_empty) + with pytest.raises(KeyError, match="'No object named df in the file'"): + store.select("df") + + # repeated append of 0/non-zero frames + df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) + store.append("df", df_empty) + tm.assert_frame_equal(store.select("df"), df) + + # store + df = DataFrame(columns=list("ABC")) + store.put("df2", df) + tm.assert_frame_equal(store.select("df2"), df) + + def test_append_raise(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # test append with invalid input to get good error messages + + # list in column + df = tm.makeDataFrame() + df["invalid"] = [["a"]] * len(df) + assert df.dtypes["invalid"] == np.object_ + with pytest.raises(TypeError): + store.append("df", df) + + # multiple invalid columns + df["invalid2"] = [["a"]] * len(df) + df["invalid3"] = [["a"]] * len(df) + with pytest.raises(TypeError): + store.append("df", df) + + # datetime with embedded nans as object + df = tm.makeDataFrame() + s = Series(datetime.datetime(2001, 1, 2), index=df.index) + s = s.astype(object) + s[0:5] = np.nan + df["invalid"] = s + assert df.dtypes["invalid"] == np.object_ + with pytest.raises(TypeError): + store.append("df", df) + + # directly ndarray + with pytest.raises(TypeError): + store.append("df", np.arange(10)) + + # series directly + with pytest.raises(TypeError): + store.append("df", Series(np.arange(10))) + + # appending an incompatible table + df = tm.makeDataFrame() + store.append("df", df) + + df["foo"] = "foo" + with pytest.raises(ValueError): + store.append("df", df) + + def test_table_index_incompatible_dtypes(self, setup_path): + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) + + with ensure_clean_store(setup_path) as store: + store.put("frame", df1, format="table") + with pytest.raises(TypeError): + store.put("frame", df2, format="table", append=True) + + def test_table_values_dtypes_roundtrip(self, setup_path): + + with ensure_clean_store(setup_path) as store: + df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") + store.append("df_f8", df1) + tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes) + + df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") + store.append("df_i8", df2) + tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes) + + # incompatible dtype + with pytest.raises(ValueError): + store.append("df_i8", df1) + + # check creation/storage/retrieval of float32 (a bit hacky to + # actually create them thought) + df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) + store.append("df_f4", df1) + tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes) + assert df1.dtypes[0] == "float32" + + # check with mixed dtypes + df1 = DataFrame( + { + c: Series(np.random.randint(5), dtype=c) + for c in ["float32", "float64", "int32", "int64", "int16", "int8"] + } + ) + df1["string"] = "foo" + df1["float322"] = 1.0 + df1["float322"] = df1["float322"].astype("float32") + df1["bool"] = df1["float32"] > 0 + df1["time1"] = Timestamp("20130101") + df1["time2"] = Timestamp("20130102") + + store.append("df_mixed_dtypes1", df1) + result = 
store.select("df_mixed_dtypes1").dtypes.value_counts() + result.index = [str(i) for i in result.index] + expected = Series( + { + "float32": 2, + "float64": 1, + "int32": 1, + "bool": 1, + "int16": 1, + "int8": 1, + "int64": 1, + "object": 1, + "datetime64[ns]": 2, + } + ) + result = result.sort_index() + expected = expected.sort_index() + tm.assert_series_equal(result, expected) + + def test_table_mixed_dtypes(self, setup_path): + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan + df = df._consolidate()._convert(datetime=True) + + with ensure_clean_store(setup_path) as store: + store.append("df1_mixed", df) + tm.assert_frame_equal(store.select("df1_mixed"), df) + + def test_unimplemented_dtypes_table_columns(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + dtypes = [("date", datetime.date(2001, 1, 2))] + + # currently not supported dtypes #### + for n, f in dtypes: + df = tm.makeDataFrame() + df[n] = f + with pytest.raises(TypeError): + store.append("df1_{n}".format(n=n), df) + + # frame + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["datetime1"] = datetime.date(2001, 1, 2) + df = df._consolidate()._convert(datetime=True) + + with ensure_clean_store(setup_path) as store: + # this fails because we have a date in the object block...... + with pytest.raises(TypeError): + store.append("df_unimplemented", df) + + @td.xfail_non_writeable + @pytest.mark.skipif( + LooseVersion(np.__version__) == LooseVersion("1.15.0"), + reason=( + "Skipping pytables test when numpy version is " + "exactly equal to 1.15.0: gh-22098" + ), + ) + def test_calendar_roundtrip_issue(self, setup_path): + + # 8591 + # doc example from tseries holiday section + weekmask_egypt = "Sun Mon Tue Wed Thu" + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, weekmask=weekmask_egypt + ) + dt = datetime.datetime(2013, 4, 30) + dts = date_range(dt, periods=5, freq=bday_egypt) + + s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) + + with ensure_clean_store(setup_path) as store: + + store.put("fixed", s) + result = store.select("fixed") + tm.assert_series_equal(result, s) + + store.append("table", s) + result = store.select("table") + tm.assert_series_equal(result, s) + + def test_roundtrip_tz_aware_index(self, setup_path): + # GH 17618 + time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern") + df = pd.DataFrame(data=[0], index=[time]) + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="fixed") + recons = store["frame"] + tm.assert_frame_equal(recons, df) + assert recons.index[0].value == 946706400000000000 + + def test_append_with_timedelta(self, setup_path): + # GH 3577 + # append timedelta + + df = DataFrame( + dict( + A=Timestamp("20130101"), + B=[ + Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) + ], + ) + ) + df["C"] = df["A"] - df["B"] + df.loc[3:5, "C"] = np.nan + + with ensure_clean_store(setup_path) as store: + + # table + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = 
store.select("df") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C<100000") + tm.assert_frame_equal(result, df) + + result = store.select("df", where="C") + + # from the docs + with ensure_clean_path(setup_path) as path: + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table", data_columns=True) + + # check ok + read_hdf( + path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']" + ) + read_hdf(path, "dfq", where="A>0 or C>0") + + # catch the invalid reference + with ensure_clean_path(setup_path) as path: + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table") + + with pytest.raises(ValueError): + read_hdf(path, "dfq", where="A>0 or C>0") + + def test_same_name_scoping(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + import pandas as pd + + df = DataFrame( + np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) + ) + store.put("df", df, format="table") + expected = df[df.index > pd.Timestamp("20130105")] + + import datetime # noqa + + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + from datetime import datetime # noqa + + # technically an error, but allow it + result = store.select("df", "index>datetime.datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + result = store.select("df", "index>datetime(2013,1,5)") + tm.assert_frame_equal(result, expected) + + def test_series(self, setup_path): + + s = tm.makeStringSeries() + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + ts = tm.makeTimeSeries() + self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) + + ts2 = Series(ts.index, Index(ts.index, dtype=object)) + self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) + + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + self._check_roundtrip( + ts3, tm.assert_series_equal, path=setup_path, check_index_type=False + ) + + def test_float_index(self, setup_path): + + # GH #454 + index = np.random.randn(10) + s = Series(np.random.randn(10), index=index) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + @td.xfail_non_writeable + def test_tuple_index(self, setup_path): + + # GH #492 + col = np.arange(10) + idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] + data = np.random.randn(30).reshape((3, 10)) + DF = DataFrame(data, index=idx, columns=col) + + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) + + @td.xfail_non_writeable + @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") + def test_index_types(self, setup_path): + + with catch_warnings(record=True): + values = np.random.randn(2) + + func = lambda l, r: tm.assert_series_equal( + l, r, check_dtype=True, check_index_type=True, check_series_type=True + ) + + with catch_warnings(record=True): + ser = Series(values, [0, "y"]) + self._check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, [datetime.datetime.today(), 0]) + self._check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + ser = Series(values, ["y", 0]) + self._check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): 
+ ser = Series(values, [datetime.date.today(), "a"]) + self._check_roundtrip(ser, func, path=setup_path) + + with catch_warnings(record=True): + + ser = Series(values, [0, "y"]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [datetime.datetime.today(), 0]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, ["y", 0]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [datetime.date.today(), "a"]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1.23, "b"]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1, 1.53]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series(values, [1, 5]) + self._check_roundtrip(ser, func, path=setup_path) + + ser = Series( + values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] + ) + self._check_roundtrip(ser, func, path=setup_path) + + def test_timeseries_preepoch(self, setup_path): + + dr = bdate_range("1/1/1940", "1/1/1960") + ts = Series(np.random.randn(len(dr)), index=dr) + try: + self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) + except OverflowError: + pytest.skip("known failer on some windows platforms") + + @td.xfail_non_writeable + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) + def test_frame(self, compression, setup_path): + + df = tm.makeDataFrame() + + # put in some random NAs + df.values[0, 0] = np.nan + df.values[5, 3] = np.nan + + self._check_roundtrip_table( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) + self._check_roundtrip( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) + + tdf = tm.makeTimeDataFrame() + self._check_roundtrip( + tdf, tm.assert_frame_equal, path=setup_path, compression=compression + ) + + with ensure_clean_store(setup_path) as store: + # not consolidated + df["foo"] = np.random.randn(len(df)) + store["df"] = df + recons = store["df"] + assert recons._data.is_consolidated() + + # empty + self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + + @td.xfail_non_writeable + def test_empty_series_frame(self, setup_path): + s0 = Series(dtype=object) + s1 = Series(name="myseries", dtype=object) + df0 = DataFrame() + df1 = DataFrame(index=["a", "b", "c"]) + df2 = DataFrame(columns=["d", "e", "f"]) + + self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + @td.xfail_non_writeable + @pytest.mark.parametrize( + "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] + ) + def test_empty_series(self, dtype, setup_path): + s = Series(dtype=dtype) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + def test_can_serialize_dates(self, setup_path): + + rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + + def test_store_hierarchical(self, setup_path): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + frame = 
DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) + + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) + + # check that the names are stored + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_frame_equal(recons, frame) + + def test_store_index_name(self, setup_path): + df = tm.makeDataFrame() + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store["frame"] = df + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + def test_store_index_name_with_tz(self, setup_path): + # GH 13884 + df = pd.DataFrame({"A": [1, 2]}) + df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) + df.index = df.index.tz_localize("UTC") + df.index.name = "foo" + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + recons = store["frame"] + tm.assert_frame_equal(recons, df) + + @pytest.mark.parametrize("table_format", ["table", "fixed"]) + def test_store_index_name_numpy_str(self, table_format, setup_path): + # GH #13492 + idx = pd.Index( + pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = pd.Index( + pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), + name="rows\u05d0", + ) + df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + + # This used to fail, returning numpy strings instead of python strings. + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") + + tm.assert_frame_equal(df, df2, check_names=True) + + assert type(df2.index.name) == str + assert type(df2.columns.name) == str + + def test_store_series_name(self, setup_path): + df = tm.makeDataFrame() + series = df["A"] + + with ensure_clean_store(setup_path) as store: + store["series"] = series + recons = store["series"] + tm.assert_series_equal(recons, series) + + @td.xfail_non_writeable + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) + def test_store_mixed(self, compression, setup_path): + def _make_one(): + df = tm.makeDataFrame() + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["int1"] = 1 + df["int2"] = 2 + return df._consolidate() + + df1 = _make_one() + df2 = _make_one() + + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + + with ensure_clean_store(setup_path) as store: + store["obj"] = df1 + tm.assert_frame_equal(store["obj"], df1) + store["obj"] = df2 + tm.assert_frame_equal(store["obj"], df2) + + # check that can store Series of all of these types + self._check_roundtrip( + df1["obj1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + self._check_roundtrip( + df1["bool1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + self._check_roundtrip( + df1["int1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, + ) + + @pytest.mark.filterwarnings( + "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" + ) + def test_select_with_dups(self, setup_path): + + # single dtypes + df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df.index = 
date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=["A"]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # dups across dtypes + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + df.index = date_range("20130101 9:30", periods=10, freq="T") + + with ensure_clean_store(setup_path) as store: + store.append("df", df) + + result = store.select("df") + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + result = store.select("df", columns=df.columns) + expected = df + tm.assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["A"]] + result = store.select("df", columns=["A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + expected = df.loc[:, ["B", "A"]] + result = store.select("df", columns=["B", "A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + # duplicates on both index and columns + with ensure_clean_store(setup_path) as store: + store.append("df", df) + store.append("df", df) + + expected = df.loc[:, ["B", "A"]] + expected = concat([expected, expected]) + result = store.select("df", columns=["B", "A"]) + tm.assert_frame_equal(result, expected, by_blocks=True) + + def test_overwrite_node(self, setup_path): + + with ensure_clean_store(setup_path) as store: + store["a"] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store["a"] = ts + + tm.assert_series_equal(store["a"], ts) + + def test_select(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + # select with columns= + df = tm.makeTimeDataFrame() + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # equivalently + result = store.select("df", [("columns=['A', 'B']")]) + expected = df.reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # all a data columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["C", "D"]) + expected = df[df.A > 0].reindex(columns=["C", "D"]) + tm.assert_frame_equal(expected, result) + + def test_select_dtypes(self, setup_path): + + with ensure_clean_store(setup_path) as store: + # with a Timestamp data column (GH #2637) + df = DataFrame( + dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A"]) + + result = 
store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # bool columns (GH #2849) + df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + df["object"] = "foo" + df.loc[4:5, "object"] = "bar" + df["boolv"] = df["A"] > 0 + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa + for v in [True, "true", 1]: + result = store.select( + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] + ) + tm.assert_frame_equal(expected, result) + + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + for v in [False, "false", 0]: + result = store.select( + "df", "boolv == {v!s}".format(v=v), columns=["A", "boolv"] + ) + tm.assert_frame_equal(expected, result) + + # integer index + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + _maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + # float index + df = DataFrame( + dict( + A=np.random.rand(20), + B=np.random.rand(20), + index=np.arange(20, dtype="f8"), + ) + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) + tm.assert_frame_equal(expected, result) + + with ensure_clean_store(setup_path) as store: + + # floats w/o NaN + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] + tm.assert_frame_equal(expected, result) + + # floats with NaN + df.iloc[0] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df2", df, data_columns=True, index=False) + result = store.select("df2", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # https://github.com/PyTables/PyTables/issues/282 + # bug in selection when 0th row has a np.nan and an index + # store.append('df3',df,data_columns=True) + # result = store.select( + # 'df3', where='values>2.0') + # tm.assert_frame_equal(expected, result) + + # not in first position float with NaN ok too + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + + df.iloc[1] = np.nan + expected = df[df["values"] > 2.0] + + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") + tm.assert_frame_equal(expected, result) + + # test selection with comparison against numpy scalar + # GH 11283 + with ensure_clean_store(setup_path) as store: + df = tm.makeDataFrame() + + expected = df[df["A"] > 0] + + store.append("df", df, data_columns=True) + np_zero = np.float64(0) # noqa + result = store.select("df", where=["A>np_zero"]) + tm.assert_frame_equal(expected, result) + + def test_select_with_many_inputs(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + dict( + ts=bdate_range("2012-01-01", periods=300), + A=np.random.randn(300), + B=range(300), + users=["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + ["a{i:03d}".format(i=i) for i in range(100)], + ) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", 
"users"]) + + # regular select + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] + tm.assert_frame_equal(expected, result) + + # small selector + result = store.select( + "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" + ) + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] + tm.assert_frame_equal(expected, result) + + # big selector along the columns + selector = ["a", "b", "c"] + ["a{i:03d}".format(i=i) for i in range(60)] + result = store.select( + "df", "ts>=Timestamp('2012-02-01') and users=selector" + ) + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] + tm.assert_frame_equal(expected, result) + + selector = range(100, 200) + result = store.select("df", "B=selector") + expected = df[df.B.isin(selector)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + # big selector along the index + selector = Index(df.ts[0:100].values) + result = store.select("df", "ts=selector") + expected = df[df.ts.isin(selector.values)] + tm.assert_frame_equal(expected, result) + assert len(result) == 100 + + def test_select_iterator(self, setup_path): + + # single table + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame(500) + _maybe_remove(store, "df") + store.append("df", df) + + expected = store.select("df") + + results = list(store.select("df", iterator=True)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=100)) + assert len(results) == 5 + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = list(store.select("df", chunksize=150)) + result = concat(results) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df_non_table") + + with pytest.raises(TypeError): + read_hdf(path, "df_non_table", chunksize=100) + + with pytest.raises(TypeError): + read_hdf(path, "df_non_table", iterator=True) + + with ensure_clean_path(setup_path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path, "df", format="table") + + results = list(read_hdf(path, "df", chunksize=100)) + result = concat(results) + + assert len(results) == 5 + tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, read_hdf(path, "df")) + + # multiple + + with ensure_clean_store(setup_path) as store: + + df1 = tm.makeTimeDataFrame(500) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) + df2["foo"] = "bar" + store.append("df2", df2) + + df = concat([df1, df2], axis=1) + + # full selection + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) + result = concat(results) + tm.assert_frame_equal(expected, result) + + def test_select_iterator_complete_8014(self, setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # no iterator + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/o iteration and no where clause works + result = store.select("df") + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, begin + # of range, works + where = "index >= 
'{beg_dt}'".format(beg_dt=beg_dt) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, single term, end + # of range, works + where = "index <= '{end_dt}'".format(end_dt=end_dt) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # select w/o iterator and where clause, inclusive range, + # works + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + result = store.select("df", where=where) + tm.assert_frame_equal(expected, result) + + # with iterator, full range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[-1] + + # select w/iterator and no where clause works + results = list(store.select("df", chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + tm.assert_frame_equal(expected, result) + + def test_select_iterator_non_complete_8014(self, setup_path): + + # GH 8014 + # using iterator and where clause + chunksize = 1e4 + + # with iterator, non complete range + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[1] + end_dt = expected.index[-2] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] + tm.assert_frame_equal(rexpected, result) + + # with iterator, empty where + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + end_dt = expected.index[-1] + + # select w/iterator and where clause, single term, begin of range + where = "index > 
'{end_dt}'".format(end_dt=end_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + assert 0 == len(results) + + def test_select_iterator_many_empty_frames(self, setup_path): + + # GH 8014 + # using iterator and where clause can return many empty + # frames. + chunksize = int(1e4) + + # with iterator, range limited to the first chunk + with ensure_clean_store(setup_path) as store: + + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) + + beg_dt = expected.index[0] + end_dt = expected.index[chunksize - 1] + + # select w/iterator and where clause, single term, begin of range + where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + result = concat(results) + rexpected = expected[expected.index >= beg_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, single term, end of range + where = "index <= '{end_dt}'".format(end_dt=end_dt) + results = list(store.select("df", where=where, chunksize=chunksize)) + + assert len(results) == 1 + result = concat(results) + rexpected = expected[expected.index <= end_dt] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause, inclusive range + where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be 1, is 10 + assert len(results) == 1 + result = concat(results) + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] + tm.assert_frame_equal(rexpected, result) + + # select w/iterator and where clause which selects + # *nothing*. + # + # To be consistent with Python idiom I suggest this should + # return [] e.g. `for e in []: print True` never prints + # True. 
+ + where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( + beg_dt=beg_dt, end_dt=end_dt + ) + results = list(store.select("df", where=where, chunksize=chunksize)) + + # should be [] + assert len(results) == 0 + + @pytest.mark.filterwarnings( + "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" + ) + def test_retain_index_attributes(self, setup_path): + + # GH 3499, losing frequency info on index recreation + df = DataFrame( + dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) + ) + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "data") + store.put("data", df, format="table") + + result = store.get("data") + tm.assert_frame_equal(df, result) + + for attr in ["freq", "tz", "name"]: + for idx in ["index", "columns"]: + assert getattr(getattr(df, idx), attr, None) == getattr( + getattr(result, idx), attr, None + ) + + # try to append a table with a different frequency + with catch_warnings(record=True): + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("data", df2) + + assert store.get_storer("data").info["index"]["freq"] is None + + # this is ok + _maybe_remove(store, "df2") + df2 = DataFrame( + dict( + A=Series( + range(3), + index=[ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20020101"), + ], + ) + ) + ) + store.append("df2", df2) + df3 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("df2", df3) + + @pytest.mark.filterwarnings( + "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" + ) + def test_retain_index_attributes2(self, setup_path): + with ensure_clean_path(setup_path) as path: + + with catch_warnings(record=True): + + df = DataFrame( + dict( + A=Series( + range(3), index=date_range("2000-1-1", periods=3, freq="H") + ) + ) + ) + df.to_hdf(path, "data", mode="w", append=True) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + df2.to_hdf(path, "data", append=True) + + idx = date_range("2000-1-1", periods=3, freq="H") + idx.name = "foo" + df = DataFrame(dict(A=Series(range(3), index=idx))) + df.to_hdf(path, "data", mode="w", append=True) + + assert read_hdf(path, "data").index.name == "foo" + + with catch_warnings(record=True): + + idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2.name = "bar" + df2 = DataFrame(dict(A=Series(range(3), index=idx2))) + df2.to_hdf(path, "data", append=True) + + assert read_hdf(path, "data").index.name is None + + def test_frame_select(self, setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + date = df.index[len(df) // 2] + + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date + + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" + + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] + tm.assert_frame_equal(result, expected) + + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] + tm.assert_frame_equal(result, expected) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append("df_time", df) + with pytest.raises(ValueError): + store.select("df_time", "index>0") + + # can't select if not written as table + # store['frame'] = df + # with pytest.raises(ValueError): + # store.select('frame', [crit1, crit2]) + + def test_frame_select_complex(self, setup_path): + # select via complex 
criteria + + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", data_columns=["string"]) + + # empty + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] + tm.assert_frame_equal(result, expected) + + # or + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] + tm.assert_frame_equal(result, expected) + + result = store.select( + "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' + ) + expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select("df", 'string!="bar"') + expected = df.loc[df.string != "bar"] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + with pytest.raises(NotImplementedError): + store.select("df", '~(string="bar")') + + # invert ok for filters + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] + tm.assert_frame_equal(result, expected) + + # in + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + def test_frame_select_complex2(self, setup_path): + + with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: + + pp, hh = paths + + # use non-trivial selection criteria + parms = DataFrame({"A": [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) + + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) + + hist.to_hdf(hh, "df", mode="w", format="table") + + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") + + # scope with list like + l = selection.index.tolist() # noqa + store = HDFStore(hh) + result = store.select("df", where="l1=l") + tm.assert_frame_equal(result, expected) + store.close() + + result = read_hdf(hh, "df", where="l1=l") + tm.assert_frame_equal(result, expected) + + # index + index = selection.index # noqa + result = read_hdf(hh, "df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = read_hdf(hh, "df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + # scope with index + store = HDFStore(hh) + + result = store.select("df", where="l1=index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=selection.index.tolist()") + tm.assert_frame_equal(result, expected) + + result = store.select("df", where="l1=list(selection.index)") + tm.assert_frame_equal(result, expected) + + store.close() + + def test_invalid_filtering(self, setup_path): 
+ + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table") + + # not implemented + with pytest.raises(NotImplementedError): + store.select("df", "columns=['A'] | columns=['B']") + + # in theory we could deal with this + with pytest.raises(NotImplementedError): + store.select("df", "columns=['A','B'] & columns=['C']") + + def test_string_select(self, setup_path): + # GH 2973 + with ensure_clean_store(setup_path) as store: + + df = tm.makeTimeDataFrame() + + # test string ==/!= + df["x"] = "none" + df.loc[2:7, "x"] = "" + + store.append("df", df, data_columns=["x"]) + + result = store.select("df", "x=none") + expected = df[df.x == "none"] + tm.assert_frame_equal(result, expected) + + result = store.select("df", "x!=none") + expected = df[df.x != "none"] + tm.assert_frame_equal(result, expected) + + df2 = df.copy() + df2.loc[df2.x == "", "x"] = np.nan + + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") + expected = df2[isna(df2.x)] + tm.assert_frame_equal(result, expected) + + # int ==/!= + df["int"] = 1 + df.loc[2:7, "int"] = 2 + + store.append("df3", df, data_columns=["int"]) + + result = store.select("df3", "int=2") + expected = df[df.int == 2] + tm.assert_frame_equal(result, expected) + + result = store.select("df3", "int!=2") + expected = df[df.int != 2] + tm.assert_frame_equal(result, expected) + + def test_read_column(self, setup_path): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + _maybe_remove(store, "df") + + # GH 17912 + # HDFStore.select_column should raise a KeyError + # exception if the key is not a valid store + with pytest.raises(KeyError, match="No object named df in the file"): + store.select_column("df", "index") + + store.append("df", df) + # error + with pytest.raises( + KeyError, match=re.escape("'column [foo] not found in the table'") + ): + store.select_column("df", "foo") + + with pytest.raises(Exception): + store.select_column("df", "index", where=["index>5"]) + + # valid + result = store.select_column("df", "index") + tm.assert_almost_equal(result.values, Series(df.index).values) + assert isinstance(result, Series) + + # not a data indexable column + with pytest.raises(ValueError): + store.select_column("df", "values_block_0") + + # a data column + df2 = df.copy() + df2["string"] = "foo" + store.append("df2", df2, data_columns=["string"]) + result = store.select_column("df2", "string") + tm.assert_almost_equal(result.values, df2["string"].values) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3["string"] = "foo" + df3.loc[4:6, "string"] = np.nan + store.append("df3", df3, data_columns=["string"]) + result = store.select_column("df3", "string") + tm.assert_almost_equal(result.values, df3["string"].values) + + # start/stop + result = store.select_column("df3", "string", start=2) + tm.assert_almost_equal(result.values, df3["string"].values[2:]) + + result = store.select_column("df3", "string", start=-2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:]) + + result = store.select_column("df3", "string", stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[:2]) + + result = store.select_column("df3", "string", stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[:-2]) + + result = store.select_column("df3", "string", start=2, stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) + + result = 
store.select_column("df3", "string", start=-2, stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) + + # GH 10392 - make sure column name is preserved + df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) + store.append("df4", df4, data_columns=True) + expected = df4["B"] + result = store.select_column("df4", "B") + tm.assert_series_equal(result, expected) + + def test_coordinates(self, setup_path): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df") + store.append("df", df) + + # all + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() + + # get coordinates back & test vs frame + _maybe_remove(store, "df") + + df = DataFrame(dict(A=range(5), B=range(5))) + store.append("df", df) + c = store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) + expected = df.loc[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) + expected = df.loc[3:4, :] + tm.assert_frame_equal(result, expected) + assert isinstance(c, Index) + + # multiple tables + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = store.select("df1", c) + df2_result = store.select("df2", c) + result = concat([df1_result, df2_result], axis=1) + + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # pass array/mask as the coordinates + with ensure_clean_store(setup_path) as store: + + df = DataFrame( + np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") + where = c[DatetimeIndex(c).month == 5].index + expected = df.iloc[where] + + # locations + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # boolean + result = store.select("df", where=where) + tm.assert_frame_equal(result, expected) + + # invalid + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df), dtype="float64")) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df) + 1)) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df)), start=5) + + with pytest.raises(ValueError): + store.select("df", where=np.arange(len(df)), start=5, stop=10) + + # selection with filter + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result, expected) + + # list + df = DataFrame(np.random.randn(10, 2)) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) + expected = df.iloc[[0, 3, 5]] + tm.assert_frame_equal(result, expected) + + # boolean + where = [True] * 10 + where[-2] = False + result = store.select("df2", where=where) + expected = df.loc[where] + tm.assert_frame_equal(result, expected) + + # start/stop + result = store.select("df2", start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result, expected) + + def test_append_to_multiple(self, setup_path): + df1 = 
tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # exceptions + with pytest.raises(ValueError): + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df3" + ) + + with pytest.raises(ValueError): + store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") + + with pytest.raises(ValueError): + store.append_to_multiple("df1", df, "df1") + + # regular operation + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1" + ) + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + def test_append_to_multiple_dropna(self, setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=True should guarantee rows are synchronized + store.append_to_multiple( + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) + expected = df.dropna() + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) + + @pytest.mark.xfail( + run=False, reason="append_to_multiple_dropna_false is not raising as failed" + ) + def test_append_to_multiple_dropna_false(self, setup_path): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(setup_path) as store: + + # dropna=False shouldn't synchronize row indexes + store.append_to_multiple( + {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False + ) + + with pytest.raises(ValueError): + store.select_as_multiple(["df1a", "df2a"]) + + assert not store.select("df1a").index.equals(store.select("df2a").index) + + def test_select_as_multiple(self, setup_path): + + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df2["foo"] = "bar" + + with ensure_clean_store(setup_path) as store: + + # no tables stored + with pytest.raises(Exception): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) + + # exceptions + with pytest.raises(Exception): + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") + + with pytest.raises(Exception): + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") + + msg = "'No object named df3 in the file'" + with pytest.raises(KeyError, match=msg): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + with pytest.raises(KeyError, match=msg): + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") + + with pytest.raises(KeyError, match="'No object named df4 in the file'"): + store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) + + # default select + result = store.select("df1", ["A>0", "B>0"]) + expected = store.select_as_multiple( + ["df1"], where=["A>0", "B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple( + "df1", where=["A>0", 
"B>0"], selector="df1" + ) + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = store.select_as_multiple( + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test exception for diff rows + store.append("df3", tm.makeTimeDataFrame(nper=50)) + with pytest.raises(ValueError): + store.select_as_multiple( + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) + + @pytest.mark.skipif( + LooseVersion(tables.__version__) < LooseVersion("3.1.0"), + reason=("tables version does not support fix for nan selection bug: GH 4858"), + ) + def test_nan_selection_bug_4858(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) + df.iloc[0] = np.nan + + expected = DataFrame( + dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), + index=[3, 4, 5], + ) + + # write w/o the index on that particular column + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") + tm.assert_frame_equal(result, expected) + + def test_start_stop_table(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # table + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + store.append("df", df) + + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", "columns=['A']", start=30, stop=40) + assert len(result) == 0 + expected = df.loc[30:40, ["A"]] + tm.assert_frame_equal(result, expected) + + def test_start_stop_multiple(self, setup_path): + + # GH 16209 + with ensure_clean_store(setup_path) as store: + + df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) + + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" + ) + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 + ) + expected = df.loc[[0], ["foo", "bar"]] + tm.assert_frame_equal(result, expected) + + def test_start_stop_fixed(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # fixed, GH 8287 + df = DataFrame( + dict(A=np.random.rand(20), B=np.random.rand(20)), + index=pd.date_range("20130101", periods=20), + ) + store.put("df", df) + + result = store.select("df", start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select("df", start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select("df", start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put("s", s) + result = store.select("s", start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select("s", start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.iloc[3:5, 1:3] = np.nan + df.iloc[8:10, -2] = np.nan + + def test_select_filter_corner(self, 
setup_path): + + df = DataFrame(np.random.randn(50, 100)) + df.index = ["{c:3d}".format(c=c) for c in df.index] + df.columns = ["{c:3d}".format(c=c) for c in df.columns] + + with ensure_clean_store(setup_path) as store: + store.put("frame", df, format="table") + + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) + + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) + tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) + + def test_path_pathlib(self, setup_path): + df = tm.makeDataFrame() + + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) + def test_contiguous_mixed_data_table(self, start, stop, setup_path): + # GH 17021 + # ValueError when reading a contiguous mixed-data table ft. VLArray + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) + + with ensure_clean_store(setup_path) as store: + store.append("test_dataset", df) + + result = store.select("test_dataset", start=start, stop=stop) + tm.assert_frame_equal(df[start:stop], result) + + def test_path_pathlib_hdfstore(self, setup_path): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with pd.HDFStore(path) as store: + return pd.read_hdf(store, "df") + + result = tm.round_trip_pathlib(writer, reader) + tm.assert_frame_equal(df, result) + + def test_pickle_path_localpath(self, setup_path): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) + tm.assert_frame_equal(df, result) + + def test_path_localpath_hdfstore(self, setup_path): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, "df") + + def reader(path): + with pd.HDFStore(path) as store: + return pd.read_hdf(store, "df") + + result = tm.round_trip_localpath(writer, reader) + tm.assert_frame_equal(df, result) + + def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs): + + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] + comparator(retrieved, obj, **kwargs) + + def _check_double_roundtrip( + self, obj, comparator, path, compression=False, **kwargs + ): + options = {} + if compression: + options["complib"] = compression or _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] + comparator(retrieved, obj, **kwargs) + store["obj"] = retrieved + again = store["obj"] + comparator(again, obj, **kwargs) + + def _check_roundtrip_table(self, obj, comparator, path, compression=False): + options = {} + if compression: + options["complib"] = _default_compressor + + with ensure_clean_store(path, "w", **options) as store: + store.put("obj", obj, format="table") + retrieved = store["obj"] + + comparator(retrieved, obj) + + def test_multiple_open_close(self, setup_path): + # gh-4409: open & close multiple times + + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + # single + store = HDFStore(path) + assert "CLOSED" not in store.info() + assert store.is_open 
+ + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + with ensure_clean_path(setup_path) as path: + + if pytables._table_file_open_policy_is_strict: + + # multiples + store1 = HDFStore(path) + + with pytest.raises(ValueError): + HDFStore(path) + + store1.close() + else: + + # multiples + store1 = HDFStore(path) + store2 = HDFStore(path) + + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() + assert store1.is_open + assert store2.is_open + + store1.close() + assert "CLOSED" in store1.info() + assert not store1.is_open + assert "CLOSED" not in store2.info() + assert store2.is_open + + store2.close() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() + assert not store1.is_open + assert not store2.is_open + + # nested close + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store2.append("df2", df) + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + # double closing + store = HDFStore(path, mode="w") + store.append("df", df) + + store2 = HDFStore(path) + store.close() + assert "CLOSED" in store.info() + assert not store.is_open + + store2.close() + assert "CLOSED" in store2.info() + assert not store2.is_open + + # ops on a closed store + with ensure_clean_path(setup_path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path, "df", mode="w", format="table") + + store = HDFStore(path) + store.close() + + with pytest.raises(ClosedFileError): + store.keys() + + with pytest.raises(ClosedFileError): + "df" in store + + with pytest.raises(ClosedFileError): + len(store) + + with pytest.raises(ClosedFileError): + store["df"] + + with pytest.raises(AttributeError): + store.df + + with pytest.raises(ClosedFileError): + store.select("df") + + with pytest.raises(ClosedFileError): + store.get("df") + + with pytest.raises(ClosedFileError): + store.append("df2", df) + + with pytest.raises(ClosedFileError): + store.put("df3", df) + + with pytest.raises(ClosedFileError): + store.get_storer("df2") + + with pytest.raises(ClosedFileError): + store.remove("df2") + + with pytest.raises(ClosedFileError, match="file is not open"): + store.select("df") + + def test_pytables_native_read(self, datapath, setup_path): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" + ) as store: + d2 = store["detector/readout"] + assert isinstance(d2, DataFrame) + + @pytest.mark.skipif( + is_platform_windows(), reason="native2 read fails oddly on windows" + ) + def test_pytables_native2_read(self, datapath, setup_path): + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" + ) as store: + str(store) + d1 = store["detector"] + assert isinstance(d1, DataFrame) + + @td.xfail_non_writeable + def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): + # GH 24510 + # legacy table with fixed format written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" + ) as store: + result = store.select("df") + expected = pd.DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=pd.Index(["ABC"], name="INDEX_NAME"), + ) + tm.assert_frame_equal(expected, result) + + def test_legacy_table_read_py2(self, datapath, setup_path): + # issue: 24925 + # legacy table written in Python 2 + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", 
"legacy_table_py2.h5"), mode="r" + ) as store: + result = store.select("table") + + expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) + tm.assert_frame_equal(expected, result) + + def test_copy(self, setup_path): + + with catch_warnings(record=True): + + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): + try: + store = HDFStore(f, "r") + + if new_f is None: + import tempfile + + fd, new_f = tempfile.mkstemp() + + tstore = store.copy( + new_f, keys=keys, propindexes=propindexes, **kwargs + ) + + # check keys + if keys is None: + keys = store.keys() + assert set(keys) == set(tstore.keys()) + + # check indices & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + assert orig_t.nrows == new_t.nrows + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + assert new_t[a.name].is_indexed + + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except (OSError, ValueError): + pass + safe_remove(new_f) + + # new table + df = tm.makeDataFrame() + + try: + path = create_tempfile(setup_path) + st = HDFStore(path) + st.append("df", df, data_columns=["A"]) + st.close() + do_copy(f=path) + do_copy(f=path, propindexes=False) + finally: + safe_remove(path) + + def test_store_datetime_fractional_secs(self, setup_path): + + with ensure_clean_store(setup_path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store["a"] = series + assert store["a"].index[0] == dt + + def test_tseries_indices_series(self, setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store["a"] = ser + result = store["a"] + + tm.assert_series_equal(result, ser) + assert result.index.freq == ser.index.freq + tm.assert_class_equal(result.index, ser.index, obj="series index") + + def test_tseries_indices_frame(self, setup_path): + + with ensure_clean_store(setup_path) as store: + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + store["a"] = df + result = store["a"] + + tm.assert_frame_equal(result, df) + assert result.index.freq == df.index.freq + tm.assert_class_equal(result.index, df.index, obj="dataframe index") + + def test_unicode_index(self, setup_path): + + unicode_values = ["\u03c3", "\u03c3\u03c3"] + + # PerformanceWarning + with catch_warnings(record=True): + simplefilter("ignore", pd.errors.PerformanceWarning) + s = Series(np.random.randn(len(unicode_values)), unicode_values) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) + + def test_unicode_longer_encoded(self, setup_path): + # GH 11234 + char = "\u0394" + df = pd.DataFrame({"A": [char]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + df = pd.DataFrame({"A": ["a", 
char], "B": ["b", "b"]}) + with ensure_clean_store(setup_path) as store: + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") + tm.assert_frame_equal(result, df) + + @td.xfail_non_writeable + def test_store_datetime_mixed(self, setup_path): + + df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) + ts = tm.makeTimeSeries() + df["d"] = ts.index[:3] + self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path) + + # FIXME: don't leave commented-out code + # def test_cant_write_multiindex_table(self): + # # for now, #1848 + # df = DataFrame(np.random.randn(10, 4), + # index=[np.arange(5).repeat(2), + # np.tile(np.arange(2), 5)]) + # + # with pytest.raises(Exception): + # store.put('foo', df, format='table') + + def test_append_with_diff_col_name_types_raises_value_error(self, setup_path): + df = DataFrame(np.random.randn(10, 1)) + df2 = DataFrame({"a": np.random.randn(10)}) + df3 = DataFrame({(1, 2): np.random.randn(10)}) + df4 = DataFrame({("1", 2): np.random.randn(10)}) + df5 = DataFrame({("1", 2, object): np.random.randn(10)}) + + with ensure_clean_store(setup_path) as store: + name = "df_{}".format(tm.rands(10)) + store.append(name, df) + + for d in (df2, df3, df4, df5): + with pytest.raises(ValueError): + store.append(name, d) + + def test_query_with_nested_special_character(self, setup_path): + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') + tm.assert_frame_equal(expected, result) + + def test_categorical(self, setup_path): + + with ensure_clean_store(setup_path) as store: + + # Basic + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") + tm.assert_series_equal(s, result) + + _maybe_remove(store, "df") + df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) + store.append("df", df, format="table") + result = store.select("df") + tm.assert_frame_equal(result, df) + + # Dtypes + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") + tm.assert_series_equal(result, s) + + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") + tm.assert_series_equal(result, s) + + # Multiple + _maybe_remove(store, "df2") + df2 = df.copy() + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") + tm.assert_frame_equal(result, df2) + + # Make sure the metadata is OK + info = store.info() + assert "/df2 " in info + # assert '/df2/meta/values_block_0/meta' in info + assert "/df2/meta/values_block_1/meta" in info + + # unordered + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + 
result = store.select("s2") + tm.assert_series_equal(result, s) + + # Query + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s = ["b","c"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) + tm.assert_frame_equal(result, expected) + + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) + tm.assert_frame_equal(result, expected) + + # Appending with same categories is ok + store.append("df3", df) + + df = concat([df, df]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) + tm.assert_frame_equal(result, expected) + + # Appending must have the same categories + df3 = df.copy() + df3["s"].cat.remove_unused_categories(inplace=True) + + with pytest.raises(ValueError): + store.append("df3", df3) + + # Remove, and make sure meta data is removed (its a recursive + # removal so should be). + result = store.select("df3/meta/s/meta") + assert result is not None + store.remove("df3") + + with pytest.raises( + KeyError, match="'No object named df3/meta/s/meta in the file'" + ): + store.select("df3/meta/s/meta") + + def test_categorical_conversion(self, setup_path): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") + tm.assert_frame_equal(result, expected) + + def test_categorical_nan_only_columns(self, setup_path): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. 
+ df = pd.DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": pd.Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") + expected = df + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) + + def test_duplicate_column_name(self, setup_path): + df = DataFrame(columns=["a", "a"], data=[[0, 0]]) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(ValueError): + df.to_hdf(path, "df", format="fixed") + + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) + + def test_round_trip_equals(self, setup_path): + # GH 9330 + df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") + tm.assert_frame_equal(df, other) + assert df.equals(other) + assert other.equals(df) + + def test_preserve_timedeltaindex_type(self, setup_path): + # GH9635 + # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve + # the type of the index. + df = DataFrame(np.random.normal(size=(10, 5))) + df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") + + with ensure_clean_store(setup_path) as store: + + store["df"] = df + tm.assert_frame_equal(store["df"], df) + + def test_columns_multiindex_modified(self, setup_path): + # BUG: 7212 + # read_hdf store.select modified the passed columns parameters + # when multi-indexed. + + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + data_columns = df.index.names + df.columns.tolist() + with ensure_clean_path(setup_path) as path: + df.to_hdf( + path, + "df", + mode="a", + append=True, + data_columns=data_columns, + index=False, + ) + cols2load = list("BCD") + cols2load_original = list(cols2load) + df_loaded = read_hdf(path, "df", columns=cols2load) # noqa + assert cols2load_original == cols2load + + @ignore_natural_naming_warning + def test_to_hdf_with_object_column_names(self, setup_path): + # GH9057 + # Writing HDF5 table format should only work for string-like + # column types + + types_should_fail = [ + tm.makeIntIndex, + tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + tm.makeUnicodeIndex, + ] + + for index in types_should_fail: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + with ensure_clean_path(setup_path) as path: + with catch_warnings(record=True): + msg = "cannot have non-object label DataIndexableCol" + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, "df", format="table", data_columns=True) + + for index in types_should_run: + df = DataFrame(np.random.randn(10, 2), columns=index(2)) + with ensure_clean_path(setup_path) as path: + with catch_warnings(record=True): + df.to_hdf(path, "df", format="table", data_columns=True) + result = pd.read_hdf( + path, "df", where="index = [{0}]".format(df.index[0]) + ) + assert len(result) + + def test_read_hdf_open_store(self, setup_path): + # GH10330 + # No check for non-string path_or-buf, and no test of open store + df = DataFrame(np.random.rand(4, 
5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w") + direct = read_hdf(path, "df") + store = HDFStore(path, mode="r") + indirect = read_hdf(store, "df") + tm.assert_frame_equal(direct, indirect) + assert store.is_open + store.close() + + def test_read_hdf_iterator(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="w", format="t") + direct = read_hdf(path, "df") + iterator = read_hdf(path, "df", iterator=True) + assert isinstance(iterator, TableIterator) + indirect = next(iterator.__iter__()) + tm.assert_frame_equal(direct, indirect) + iterator.store.close() + + def test_read_hdf_errors(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + + with ensure_clean_path(setup_path) as path: + with pytest.raises(IOError): + read_hdf(path, "key") + + df.to_hdf(path, "df") + store = HDFStore(path, mode="r") + store.close() + + with pytest.raises(IOError): + read_hdf(store, "df") + + def test_read_hdf_generic_buffer_errors(self): + with pytest.raises(NotImplementedError): + read_hdf(BytesIO(b""), "df") + + def test_invalid_complib(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + with ensure_clean_path(setup_path) as path: + with pytest.raises(ValueError): + df.to_hdf(path, "df", complib="foolib") + + # GH10443 + + def test_read_nokey(self, setup_path): + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. 
+ with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="a") + reread = read_hdf(path) + tm.assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a") + + with pytest.raises(ValueError): + read_hdf(path) + + def test_read_nokey_table(self, setup_path): + # GH13231 + df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) + + with ensure_clean_path(setup_path) as path: + df.to_hdf(path, "df", mode="a", format="table") + reread = read_hdf(path) + tm.assert_frame_equal(df, reread) + df.to_hdf(path, "df2", mode="a", format="table") + + with pytest.raises(ValueError): + read_hdf(path) + + def test_read_nokey_empty(self, setup_path): + with ensure_clean_path(setup_path) as path: + store = HDFStore(path) + store.close() + + with pytest.raises(ValueError): + read_hdf(path) + + def test_read_from_pathlib_path(self, setup_path): + + # GH11773 + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + with ensure_clean_path(setup_path) as filename: + path_obj = Path(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no("py.path") + def test_read_from_py_localpath(self, setup_path): + + # GH11773 + from py.path import local as LocalPath + + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) + with ensure_clean_path(setup_path) as filename: + path_obj = LocalPath(filename) + + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") + + tm.assert_frame_equal(expected, actual) + + def test_query_long_float_literal(self, setup_path): + # GH 14241 + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + cutoff = 1000000000.0006 + result = store.select("test", "A < {cutoff:.4f}".format(cutoff=cutoff)) + assert result.empty + + cutoff = 1000000000.0010 + result = store.select("test", "A > {cutoff:.4f}".format(cutoff=cutoff)) + expected = df.loc[[1, 2], :] + tm.assert_frame_equal(expected, result) + + exact = 1000000000.0011 + result = store.select("test", "A == {exact:.4f}".format(exact=exact)) + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + def test_query_compare_column_type(self, setup_path): + # GH 15492 + df = pd.DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) + + with ensure_clean_store(setup_path) as store: + store.append("test", df, format="table", data_columns=True) + + ts = pd.Timestamp("2014-01-01") # noqa + result = store.select("test", where="real_date > ts") + expected = df.loc[[1], :] + tm.assert_frame_equal(expected, result) + + for op in ["<", ">", "=="]: + # non strings to string column always fail + for v in [2.1, True, pd.Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = "date {op} v".format(op=op) + with pytest.raises(TypeError): + store.select("test", where=query) + + # strings to other columns must be convertible to type + v = "a" + for col in ["int", "float", "real_date"]: + query = "{col} {op} v".format(op=op, col=col) + with pytest.raises(ValueError): + store.select("test", where=query) + + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = "{col} {op} v".format(op=op, 
col=col) + result = store.select("test", where=query) + + if op == "==": + expected = df.loc[[0], :] + elif op == ">": + expected = df.loc[[1], :] + else: + expected = df.loc[[], :] + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize("format", ["fixed", "table"]) + def test_read_hdf_series_mode_r(self, format, setup_path): + # GH 16583 + # Tests that reading a Series saved to an HDF file + # still works if a mode='r' argument is supplied + series = tm.makeFloatSeries() + with ensure_clean_path(setup_path) as path: + series.to_hdf(path, key="data", format=format) + result = pd.read_hdf(path, key="data", mode="r") + tm.assert_series_equal(result, series) + + def test_fspath(self): + with tm.ensure_clean("foo.h5") as path: + with pd.HDFStore(path) as store: + assert os.fspath(store) == str(path) + + def test_read_py2_hdf_file_in_py3(self, datapath): + # GH 16781 + + # tests reading a PeriodIndex DataFrame written in Python2 in Python3 + + # the file was generated in Python 2.7 like so: + # + # df = pd.DataFrame([1.,2,3], index=pd.PeriodIndex( + # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) + # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') + + expected = pd.DataFrame( + [1.0, 2, 3], + index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), + ) + + with ensure_clean_store( + datapath( + "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" + ), + mode="r", + ) as store: + result = store["p"] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) + def test_select_empty_where(self, where): + # GH26610 + + # Using keyword `where` as '' or (), or [None], etc + # while reading from HDF store raises + # "SyntaxError: only a single expression is allowed" + + df = pd.DataFrame([1, 2, 3]) + with ensure_clean_path("empty_where.h5") as path: + with pd.HDFStore(path) as store: + store.put("df", df, "t") + result = pd.read_hdf(store, "df", where=where) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "idx", + [ + date_range("2019", freq="D", periods=3, tz="UTC"), + CategoricalIndex(list("abc")), + ], + ) + def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path): + # GH 7775 + mi = MultiIndex.from_arrays([idx, idx]) + df = pd.DataFrame(0, index=mi, columns=["a"]) + with ensure_clean_path(setup_path) as path: + with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): + df.to_hdf(path, "df") diff --git a/venv/Lib/site-packages/pandas/tests/io/pytables/test_timezones.py b/venv/Lib/site-packages/pandas/tests/io/pytables/test_timezones.py new file mode 100644 index 0000000..2bf22d9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/pytables/test_timezones.py @@ -0,0 +1,386 @@ +import datetime + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range +import pandas._testing as tm +from pandas.tests.io.pytables.common import ( + _maybe_remove, + ensure_clean_path, + ensure_clean_store, +) + + +def _compare_with_tz(a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a.loc[i, c] + b_e = b.loc[i, c] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError( + "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e) + ) + + +def test_append_with_timezones_dateutil(setup_path): + + from datetime import timedelta + + # use 
maybe_get_tz instead of dateutil.tz.gettz to handle the windows + # filename issues. + from pandas._libs.tslibs.timezones import maybe_get_tz + + gettz = lambda x: maybe_get_tz("dateutil/" + x) + + # as columns + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + # select with tz aware + expected = df[df.A >= df.A[3]] + result = store.select("df_tz", where="A>=df.A[3]") + _compare_with_tz(result, expected) + + # ensure we include dates in DST and STD time here. + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130603", tz=gettz("US/Eastern")), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("EET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("CET")), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + +def test_append_with_timezones_pytz(setup_path): + + from datetime import timedelta + + # as columns + with ensure_clean_store(setup_path) as store: + + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz="US/Eastern") + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + # select with tz aware + _compare_with_tz(store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]]) + + _maybe_remove(store, "df_tz") + # ensure we include dates in DST and STD time here. 
+ df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="US/Eastern"), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="EET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # this is ok + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] + _compare_with_tz(result, df) + tm.assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="CET"), + ), + index=range(5), + ) + with pytest.raises(ValueError): + store.append("df_tz", df) + + # as index + with ensure_clean_store(setup_path) as store: + + # GH 4098 example + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range("2000-1-1", periods=3, freq="H", tz="US/Eastern"), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + +def test_tseries_select_index_column(setup_path): + # GH7777 + # selecting a UTC datetimeindex column did + # not preserve UTC tzinfo set before storing + + # check that no tz still works + rng = date_range("1/1/2000", "1/30/2000") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == DatetimeIndex(result.values).tz + + # check utc + rng = date_range("1/1/2000", "1/30/2000", tz="UTC") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + # double check non-utc + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store.append("frame", frame) + result = store.select_column("frame", "index") + assert rng.tz == result.dt.tz + + +def test_timezones_fixed(setup_path): + with ensure_clean_store(setup_path) as store: + + # index + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + store["df"] = df + result = store["df"] + tm.assert_frame_equal(result, df) + + # as data + # GH11411 + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] + tm.assert_frame_equal(result, df) + + +def test_fixed_offset_tz(setup_path): + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(setup_path) as store: + store["frame"] = frame + recons = store["frame"] + tm.assert_index_equal(recons.index, rng) + assert rng.tz == recons.index.tz + + +@td.skip_if_windows +def test_store_timezone(setup_path): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read + # back in a new timezone + + 
# original method + with ensure_clean_store(setup_path) as store: + + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + result = store["obj1"] + tm.assert_frame_equal(result, df) + + # with tz setting + with ensure_clean_store(setup_path) as store: + + with tm.set_timezone("EST5EDT"): + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) + store["obj1"] = df + + with tm.set_timezone("CST6CDT"): + result = store["obj1"] + + tm.assert_frame_equal(result, df) + + +def test_legacy_datetimetz_object(datapath, setup_path): + # legacy from < 0.17.0 + # 8260 + expected = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), B=Timestamp("20130603", tz="CET") + ), + index=range(5), + ) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] + tm.assert_frame_equal(result, expected) + + +def test_dst_transitions(setup_path): + # make sure we are not failing on transitions + with ensure_clean_store(setup_path) as store: + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") + tm.assert_frame_equal(result, df) + + +def test_read_with_where_tz_aware_index(setup_path): + # GH 11926 + periods = 10 + dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = pd.DataFrame({"MYCOL": 0}, index=mi) + + key = "mykey" + with ensure_clean_path(setup_path) as path: + with pd.HDFStore(path) as store: + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") + tm.assert_frame_equal(result, expected) + + +def test_py2_created_with_datetimez(datapath, setup_path): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. 
+ # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/sas/__init__.py b/venv/Lib/site-packages/pandas/tests/io/sas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/io/sas/test_sas.py b/venv/Lib/site-packages/pandas/tests/io/sas/test_sas.py new file mode 100644 index 0000000..5d2643c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/sas/test_sas.py @@ -0,0 +1,26 @@ +from io import StringIO + +import pytest + +from pandas import read_sas +import pandas._testing as tm + + +class TestSas: + def test_sas_buffer_format(self): + # see gh-14947 + b = StringIO("") + + msg = ( + "If this is a buffer object rather than a string " + "name, you must specify a format string" + ) + with pytest.raises(ValueError, match=msg): + read_sas(b) + + def test_sas_read_no_format_or_extension(self): + # see gh-24548 + msg = "unable to infer format of SAS file" + with tm.ensure_clean("test_file_no_extension") as path: + with pytest.raises(ValueError, match=msg): + read_sas(path) diff --git a/venv/Lib/site-packages/pandas/tests/io/sas/test_sas7bdat.py b/venv/Lib/site-packages/pandas/tests/io/sas/test_sas7bdat.py new file mode 100644 index 0000000..62e9ac6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/sas/test_sas7bdat.py @@ -0,0 +1,216 @@ +from datetime import datetime +import io +import os +from pathlib import Path + +import numpy as np +import pytest + +from pandas.errors import EmptyDataError +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +# https://github.com/cython/cython/issues/1720 +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestSAS7BDAT: + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") + self.data = [] + self.test_ix = [list(range(1, 16)), [16]] + for j in 1, 2: + fname = os.path.join(self.dirpath, f"test_sas7bdat_{j}.csv") + df = pd.read_csv(fname) + epoch = datetime(1960, 1, 1) + t1 = pd.to_timedelta(df["Column4"], unit="d") + df["Column4"] = epoch + t1 + t2 = pd.to_timedelta(df["Column12"], unit="d") + df["Column12"] = epoch + t2 + for k in range(df.shape[1]): + col = df.iloc[:, k] + if col.dtype == np.int64: + df.iloc[:, k] = df.iloc[:, k].astype(np.float64) + self.data.append(df) + + def test_from_file(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") + df = pd.read_sas(fname, encoding="utf-8") + tm.assert_frame_equal(df, df0) + + def test_from_buffer(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") + with open(fname, "rb") as f: + byts = f.read() + buf = io.BytesIO(byts) + rdr = pd.read_sas( + buf, format="sas7bdat", iterator=True, encoding="utf-8" + ) + df = rdr.read() + tm.assert_frame_equal(df, df0, check_exact=False) + rdr.close() + + def test_from_iterator(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + df = rdr.read(2) + tm.assert_frame_equal(df, 
df0.iloc[0:2, :]) + df = rdr.read(3) + tm.assert_frame_equal(df, df0.iloc[2:5, :]) + rdr.close() + + def test_path_pathlib(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = Path(os.path.join(self.dirpath, f"test{k}.sas7bdat")) + df = pd.read_sas(fname, encoding="utf-8") + tm.assert_frame_equal(df, df0) + + @td.skip_if_no("py.path") + def test_path_localpath(self): + from py.path import local as LocalPath + + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = LocalPath(os.path.join(self.dirpath, f"test{k}.sas7bdat")) + df = pd.read_sas(fname, encoding="utf-8") + tm.assert_frame_equal(df, df0) + + def test_iterator_loop(self): + # github #13654 + for j in 0, 1: + for k in self.test_ix[j]: + for chunksize in 3, 5, 10, 11: + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") + rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") + y = 0 + for x in rdr: + y += x.shape[0] + assert y == rdr.row_count + rdr.close() + + def test_iterator_read_too_much(self): + # github #14734 + k = self.test_ix[0][0] + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") + rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") + d1 = rdr.read(rdr.row_count + 20) + rdr.close() + + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") + d2 = rdr.read(rdr.row_count + 20) + tm.assert_frame_equal(d1, d2) + rdr.close() + + +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") + df1 = pd.read_sas(fname) + df2 = pd.read_sas(fname, encoding="utf-8") + for col in df1.columns: + try: + df1[col] = df1[col].str.decode("utf-8") + except AttributeError: + pass + tm.assert_frame_equal(df1, df2) + + from pandas.io.sas.sas7bdat import SAS7BDATReader + + rdr = SAS7BDATReader(fname, convert_header_text=False) + df3 = rdr.read() + rdr.close() + for x, y in zip(df1.columns, df3.columns): + assert x == y.decode() + + +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") + df = pd.read_sas(fname, encoding="utf-8") + fname = datapath("io", "sas", "data", "productsales.csv") + df0 = pd.read_csv(fname, parse_dates=["MONTH"]) + vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] + df0[vn] = df0[vn].astype(np.float64) + tm.assert_frame_equal(df, df0) + + +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") + df = pd.read_sas(fname) + fname = datapath("io", "sas", "data", "test_12659.csv") + df0 = pd.read_csv(fname) + df0 = df0.astype(np.float64) + tm.assert_frame_equal(df, df0) + + +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") + df = pd.read_sas(fname) + fname = datapath("io", "sas", "data", "airline.csv") + df0 = pd.read_csv(fname) + df0 = df0.astype(np.float64) + tm.assert_frame_equal(df, df0, check_exact=False) + + +def test_date_time(datapath): + # Support of different SAS date/datetime formats (PR #15871) + fname = datapath("io", "sas", "data", "datetime.sas7bdat") + df = pd.read_sas(fname) + fname = datapath("io", "sas", "data", "datetime.csv") + df0 = pd.read_csv( + fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] + ) + # GH 19732: Timestamps imported from sas will incur floating point errors + df.iloc[:, 3] = df.iloc[:, 3].dt.round("us") + tm.assert_frame_equal(df, df0) + + +def test_compact_numerical_values(datapath): + # Regression test for #21616 + fname = datapath("io", "sas", "data", "cars.sas7bdat") + df = pd.read_sas(fname, encoding="latin-1") + # The two 
columns CYL and WGT in cars.sas7bdat have column + # width < 8 and only contain integral values. + # Test that pandas doesn't corrupt the numbers by adding + # decimals. + result = df["WGT"] + expected = df["WGT"].round() + tm.assert_series_equal(result, expected, check_exact=True) + result = df["CYL"] + expected = df["CYL"].round() + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_many_columns(datapath): + # Test for looking for column information in more places (PR #22628) + fname = datapath("io", "sas", "data", "many_columns.sas7bdat") + df = pd.read_sas(fname, encoding="latin-1") + fname = datapath("io", "sas", "data", "many_columns.csv") + df0 = pd.read_csv(fname, encoding="latin-1") + tm.assert_frame_equal(df, df0) + + +def test_inconsistent_number_of_rows(datapath): + # Regression test for issue #16615. (PR #22628) + fname = datapath("io", "sas", "data", "load_log.sas7bdat") + df = pd.read_sas(fname, encoding="latin-1") + assert len(df) == 2097 + + +def test_zero_variables(datapath): + # Check if the SAS file has zero variables (PR #18184) + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") + with pytest.raises(EmptyDataError): + pd.read_sas(fname) diff --git a/venv/Lib/site-packages/pandas/tests/io/sas/test_xport.py b/venv/Lib/site-packages/pandas/tests/io/sas/test_xport.py new file mode 100644 index 0000000..ee97f08 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/sas/test_xport.py @@ -0,0 +1,141 @@ +import os + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.io.sas.sasreader import read_sas + +# CSV versions of test xpt files were obtained using the R foreign library + +# Numbers in a SAS xport file are always float64, so need to convert +# before making comparisons. + + +def numeric_as_float(data): + for v in data.columns: + if data[v].dtype is np.dtype("int64"): + data[v] = data[v].astype(np.float64) + + +class TestXport: + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") + self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") + self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") + self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") + self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") + + def test1_basic(self): + # Tests with DEMO_G.xpt (all numeric file) + + # Compare to this + data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + numeric_as_float(data_csv) + + # Read full file + data = read_sas(self.file01, format="xport") + tm.assert_frame_equal(data, data_csv) + num_rows = data.shape[0] + + # Test reading beyond end of file + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(num_rows + 100) + assert data.shape[0] == num_rows + reader.close() + + # Test incremental read with `read` method. + reader = read_sas(self.file01, format="xport", iterator=True) + data = reader.read(10) + reader.close() + tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) + + # Test incremental read with `get_chunk` method. 
+ reader = read_sas(self.file01, format="xport", chunksize=10) + data = reader.get_chunk() + reader.close() + tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) + + # Test read in loop + m = 0 + reader = read_sas(self.file01, format="xport", chunksize=100) + for x in reader: + m += x.shape[0] + reader.close() + assert m == num_rows + + # Read full file with `read_sas` method + data = read_sas(self.file01) + tm.assert_frame_equal(data, data_csv) + + def test1_index(self): + # Tests with DEMO_G.xpt using index (all numeric file) + + # Compare to this + data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + data_csv = data_csv.set_index("SEQN") + numeric_as_float(data_csv) + + # Read full file + data = read_sas(self.file01, index="SEQN", format="xport") + tm.assert_frame_equal(data, data_csv, check_index_type=False) + + # Test incremental read with `read` method. + reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) + data = reader.read(10) + reader.close() + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) + + # Test incremental read with `get_chunk` method. + reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) + data = reader.get_chunk() + reader.close() + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) + + def test1_incremental(self): + # Test with DEMO_G.xpt, reading full file incrementally + + data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + data_csv = data_csv.set_index("SEQN") + numeric_as_float(data_csv) + + reader = read_sas(self.file01, index="SEQN", chunksize=1000) + + all_data = list(reader) + data = pd.concat(all_data, axis=0) + + tm.assert_frame_equal(data, data_csv, check_index_type=False) + + def test2(self): + # Test with SSHSV1_A.xpt + + # Compare to this + data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) + numeric_as_float(data_csv) + + data = read_sas(self.file02) + tm.assert_frame_equal(data, data_csv) + + def test_multiple_types(self): + # Test with DRXFCD_G.xpt (contains text and numeric variables) + + # Compare to this + data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv")) + + data = read_sas(self.file03, encoding="utf-8") + tm.assert_frame_equal(data, data_csv) + + def test_truncated_float_support(self): + # Test with paxraw_d_short.xpt, a shortened version of: + # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP + # This file has truncated floats (5 bytes in this case). 
+ + # GH 11713 + + data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) + + data = read_sas(self.file04, format="xport") + tm.assert_frame_equal(data.astype("int64"), data_csv) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_clipboard.py b/venv/Lib/site-packages/pandas/tests/io/test_clipboard.py new file mode 100644 index 0000000..652caca --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_clipboard.py @@ -0,0 +1,256 @@ +from textwrap import dedent + +import numpy as np +from numpy.random import randint +import pytest + +import pandas as pd +from pandas import DataFrame, get_option, read_clipboard +import pandas._testing as tm + +from pandas.io.clipboard import clipboard_get, clipboard_set + + +def build_kwargs(sep, excel): + kwargs = {} + if excel != "default": + kwargs["excel"] = excel + if sep != "default": + kwargs["sep"] = sep + return kwargs + + +@pytest.fixture( + params=[ + "delims", + "utf8", + "utf16", + "string", + "long", + "nonascii", + "colwidth", + "mixed", + "float", + "int", + ] +) +def df(request): + data_type = request.param + + if data_type == "delims": + return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]}) + elif data_type == "utf8": + return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) + elif data_type == "utf16": + return pd.DataFrame( + {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]} + ) + elif data_type == "string": + return tm.makeCustomDataframe( + 5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None] + ) + elif data_type == "long": + max_rows = get_option("display.max_rows") + return tm.makeCustomDataframe( + max_rows + 1, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "nonascii": + return pd.DataFrame({"en": "in English".split(), "es": "en español".split()}) + elif data_type == "colwidth": + _cw = get_option("display.max_colwidth") + 1 + return tm.makeCustomDataframe( + 5, + 3, + data_gen_f=lambda *args: "x" * _cw, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "mixed": + return DataFrame( + {"a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde")} + ) + elif data_type == "float": + return tm.makeCustomDataframe( + 5, + 3, + data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "int": + return tm.makeCustomDataframe( + 5, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + else: + raise ValueError + + +@pytest.fixture +def mock_clipboard(monkeypatch, request): + """Fixture mocking clipboard IO. + + This mocks pandas.io.clipboard.clipboard_get and + pandas.io.clipboard.clipboard_set. + + This uses a local dict for storing data. The dictionary + key used is the test ID, available with ``request.node.name``. + + This returns the local dictionary, for direct manipulation by + tests. 
+ """ + + # our local clipboard for tests + _mock_data = {} + + def _mock_set(data): + _mock_data[request.node.name] = data + + def _mock_get(): + return _mock_data[request.node.name] + + monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set) + monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get) + + yield _mock_data + + +@pytest.mark.clipboard +def test_mock_clipboard(mock_clipboard): + import pandas.io.clipboard + + pandas.io.clipboard.clipboard_set("abc") + assert "abc" in set(mock_clipboard.values()) + result = pandas.io.clipboard.clipboard_get() + assert result == "abc" + + +@pytest.mark.single +@pytest.mark.clipboard +@pytest.mark.usefixtures("mock_clipboard") +class TestClipboard: + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): + data.to_clipboard(excel=excel, sep=sep, encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) + tm.assert_frame_equal(data, result, check_dtype=False) + + # Test that default arguments copy as tab delimited + def test_round_trip_frame(self, df): + self.check_round_trip_frame(df) + + # Test that explicit delimiters are respected + @pytest.mark.parametrize("sep", ["\t", ",", "|"]) + def test_round_trip_frame_sep(self, df, sep): + self.check_round_trip_frame(df, sep=sep) + + # Test white space separator + def test_round_trip_frame_string(self, df): + df.to_clipboard(excel=False, sep=None) + result = read_clipboard() + assert df.to_string() == result.to_string() + assert df.shape == result.shape + + # Two character separator is not supported in to_clipboard + # Test that multi-character separators are not silently passed + def test_excel_sep_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=True, sep=r"\t") + + # Separator is ignored when excel=False and should produce a warning + def test_copy_delim_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=False, sep="\t") + + # Tests that the default behavior of to_clipboard is tab + # delimited and excel="True" + @pytest.mark.parametrize("sep", ["\t", None, "default"]) + @pytest.mark.parametrize("excel", [True, None, "default"]) + def test_clipboard_copy_tabs_default(self, sep, excel, df, request, mock_clipboard): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") + + # Tests reading of white space separated tables + @pytest.mark.parametrize("sep", [None, "default"]) + @pytest.mark.parametrize("excel", [False]) + def test_clipboard_copy_strings(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + result = read_clipboard(sep=r"\s+") + assert result.to_string() == df.to_string() + assert df.shape == result.shape + + def test_read_clipboard_infer_excel(self, request, mock_clipboard): + # gh-19010: avoid warnings + clip_kwargs = dict(engine="python") + + text = dedent( + """ + John James Charlie Mingus + 1 2 + 4 Harry Carney + """.strip() + ) + mock_clipboard[request.node.name] = text + df = pd.read_clipboard(**clip_kwargs) + + # excel data is parsed correctly + assert df.iloc[1][1] == "Harry Carney" + + # having diff tab counts doesn't trigger it + text = dedent( + """ + a\t b + 1 2 + 3 4 + """.strip() + ) + mock_clipboard[request.node.name] = text + res = pd.read_clipboard(**clip_kwargs) + + text = dedent( + """ + a b + 1 2 + 3 4 + """.strip() + ) + mock_clipboard[request.node.name] = text + exp = pd.read_clipboard(**clip_kwargs) + + 
tm.assert_frame_equal(res, exp) + + def test_invalid_encoding(self, df): + # test case for testing invalid encoding + with pytest.raises(ValueError): + df.to_clipboard(encoding="ascii") + with pytest.raises(NotImplementedError): + pd.read_clipboard(encoding="ascii") + + @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) + def test_round_trip_valid_encodings(self, enc, df): + self.check_round_trip_frame(df, encoding=enc) + + +@pytest.mark.single +@pytest.mark.clipboard +@pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) +def test_raw_roundtrip(data): + # PR #25040 wide unicode wasn't copied correctly on PY3 on windows + clipboard_set(data) + assert data == clipboard_get() diff --git a/venv/Lib/site-packages/pandas/tests/io/test_common.py b/venv/Lib/site-packages/pandas/tests/io/test_common.py new file mode 100644 index 0000000..a126f83 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_common.py @@ -0,0 +1,359 @@ +""" +Tests for the pandas.io.common functionalities +""" +from io import StringIO +import mmap +import os +from pathlib import Path + +import pytest + +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +import pandas.io.common as icom + + +class CustomFSPath: + """For testing fspath on unknown objects""" + + def __init__(self, path): + self.path = path + + def __fspath__(self): + return self.path + + +# Functions that consume a string path and return a string or path-like object +path_types = [str, CustomFSPath, Path] + +try: + from py.path import local as LocalPath + + path_types.append(LocalPath) +except ImportError: + pass + +HERE = os.path.abspath(os.path.dirname(__file__)) + + +# https://github.com/cython/cython/issues/1720 +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestCommonIOCapabilities: + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + def test_expand_user(self): + filename = "~/sometest" + expanded_name = icom._expand_user(filename) + + assert expanded_name != filename + assert os.path.isabs(expanded_name) + assert os.path.expanduser(filename) == expanded_name + + def test_expand_user_normal_path(self): + filename = "/somefolder/sometest" + expanded_name = icom._expand_user(filename) + + assert expanded_name == filename + assert os.path.expanduser(filename) == expanded_name + + def test_stringify_path_pathlib(self): + rel_path = icom.stringify_path(Path(".")) + assert rel_path == "." 
+ redundant_path = icom.stringify_path(Path("foo//bar")) + assert redundant_path == os.path.join("foo", "bar") + + @td.skip_if_no("py.path") + def test_stringify_path_localpath(self): + path = os.path.join("foo", "bar") + abs_path = os.path.abspath(path) + lpath = LocalPath(path) + assert icom.stringify_path(lpath) == abs_path + + def test_stringify_path_fspath(self): + p = CustomFSPath("foo/bar.csv") + result = icom.stringify_path(p) + assert result == "foo/bar.csv" + + @pytest.mark.parametrize( + "extension,expected", + [("", None), (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz")], + ) + @pytest.mark.parametrize("path_type", path_types) + def test_infer_compression_from_path(self, extension, expected, path_type): + path = path_type("foo/bar.csv" + extension) + compression = icom.infer_compression(path, compression="infer") + assert compression == expected + + def test_get_filepath_or_buffer_with_path(self): + filename = "~/sometest" + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename) + assert filepath_or_buffer != filename + assert os.path.isabs(filepath_or_buffer) + assert os.path.expanduser(filename) == filepath_or_buffer + assert not should_close + + def test_get_filepath_or_buffer_with_buffer(self): + input_buffer = StringIO() + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( + input_buffer + ) + assert filepath_or_buffer == input_buffer + assert not should_close + + def test_iterator(self): + reader = pd.read_csv(StringIO(self.data1), chunksize=1) + result = pd.concat(reader, ignore_index=True) + expected = pd.read_csv(StringIO(self.data1)) + tm.assert_frame_equal(result, expected) + + # GH12153 + it = pd.read_csv(StringIO(self.data1), chunksize=1) + first = next(it) + tm.assert_frame_equal(first, expected.iloc[[0]]) + tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) + + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", FileNotFoundError, "csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) + def test_read_non_existant(self, reader, module, error_class, fn_ext): + pytest.importorskip(module) + + path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) + msg1 = r"File (b')?.+does_not_exist\.{}'? 
does not exist".format(fn_ext) + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" + msg3 = "Expected object or value" + msg4 = "path_or_buf needs to be a string file path or file-like" + msg5 = ( + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) + with pytest.raises( + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" + ): + reader(path) + + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", FileNotFoundError, "csv"), + (pd.read_table, "os", FileNotFoundError, "csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) + def test_read_expands_user_home_dir( + self, reader, module, error_class, fn_ext, monkeypatch + ): + pytest.importorskip(module) + + path = os.path.join("~", "does_not_exist." + fn_ext) + monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) + + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" + msg3 = "Unexpected character found when decoding 'false'" + msg4 = "path_or_buf needs to be a string file path or file-like" + msg5 = ( + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) + + with pytest.raises( + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" + ): + reader(path) + + @pytest.mark.parametrize( + "reader, module, path", + [ + (pd.read_csv, "os", ("data", "iris.csv")), + (pd.read_table, "os", ("data", "iris.csv")), + ( + pd.read_fwf, + "os", + ("io", "data", "fixed_width", "fixed_width_format.txt"), + ), + (pd.read_excel, "xlrd", ("io", "data", "excel", "test1.xlsx")), + ( + pd.read_feather, + "feather", + ("io", "data", "feather", "feather-0_3_1.feather"), + ), + ( + pd.read_hdf, + "tables", + ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + ), + (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), + (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), + (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), + ( + pd.read_pickle, + "os", + ("io", "data", "pickle", "categorical.0.25.0.pickle"), + ), + ], + ) + def test_read_fspath_all(self, reader, module, path, datapath): + pytest.importorskip(module) + path = datapath(*path) + + mypath = CustomFSPath(path) + result = reader(mypath) + expected = reader(path) + + if path.endswith(".pickle"): + # categorical + tm.assert_categorical_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "writer_name, writer_kwargs, module", + [ + ("to_csv", {}, "os"), + ("to_excel", {"engine": "xlwt"}, "xlwt"), + ("to_feather", {}, "feather"), + ("to_html", {}, "os"), + ("to_json", {}, "os"), + ("to_latex", {}, "os"), + 
("to_pickle", {}, "os"), + ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), + ], + ) + def test_write_fspath_all(self, writer_name, writer_kwargs, module): + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") + df = pd.DataFrame({"A": [1, 2]}) + + with p1 as string, p2 as fspath: + pytest.importorskip(module) + mypath = CustomFSPath(fspath) + writer = getattr(df, writer_name) + + writer(string, **writer_kwargs) + with open(string, "rb") as f: + expected = f.read() + + writer(mypath, **writer_kwargs) + with open(fspath, "rb") as f: + result = f.read() + + assert result == expected + + def test_write_fspath_hdf5(self): + # Same test as write_fspath_all, except HDF5 files aren't + # necessarily byte-for-byte identical for a given dataframe, so we'll + # have to read and compare equality + pytest.importorskip("tables") + + df = pd.DataFrame({"A": [1, 2]}) + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") + + with p1 as string, p2 as fspath: + mypath = CustomFSPath(fspath) + df.to_hdf(mypath, key="bar") + df.to_hdf(string, key="bar") + + result = pd.read_hdf(fspath, key="bar") + expected = pd.read_hdf(string, key="bar") + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def mmap_file(datapath): + return datapath("io", "data", "csv", "test_mmap.csv") + + +class TestMMapWrapper: + def test_constructor_bad_file(self, mmap_file): + non_file = StringIO("I am not a file") + non_file.fileno = lambda: -1 + + # the error raised is different on Windows + if is_platform_windows(): + msg = "The parameter is incorrect" + err = OSError + else: + msg = "[Errno 22]" + err = mmap.error + + with pytest.raises(err, match=msg): + icom._MMapWrapper(non_file) + + target = open(mmap_file, "r") + target.close() + + msg = "I/O operation on closed file" + with pytest.raises(ValueError, match=msg): + icom._MMapWrapper(target) + + def test_get_attr(self, mmap_file): + with open(mmap_file, "r") as target: + wrapper = icom._MMapWrapper(target) + + attrs = dir(wrapper.mmap) + attrs = [attr for attr in attrs if not attr.startswith("__")] + attrs.append("__next__") + + for attr in attrs: + assert hasattr(wrapper, attr) + + assert not hasattr(wrapper, "foo") + + def test_next(self, mmap_file): + with open(mmap_file, "r") as target: + wrapper = icom._MMapWrapper(target) + lines = target.readlines() + + for line in lines: + next_line = next(wrapper) + assert next_line.strip() == line.strip() + + with pytest.raises(StopIteration, match=r"^$"): + next(wrapper) + + def test_unknown_engine(self): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + df.to_csv(path) + with pytest.raises(ValueError, match="Unknown engine"): + pd.read_csv(path, engine="pyt") diff --git a/venv/Lib/site-packages/pandas/tests/io/test_compression.py b/venv/Lib/site-packages/pandas/tests/io/test_compression.py new file mode 100644 index 0000000..fb81e57 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_compression.py @@ -0,0 +1,144 @@ +import os +import subprocess +import sys +import textwrap + +import pytest + +import pandas as pd +import pandas._testing as tm + +import pandas.io.common as icom + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_compression_size(obj, method, compression_only): + with 
tm.ensure_clean() as path: + getattr(obj, method)(path, compression=compression_only) + compressed_size = os.path.getsize(path) + getattr(obj, method)(path, compression=None) + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_csv", "to_json"]) +def test_compression_size_fh(obj, method, compression_only): + with tm.ensure_clean() as path: + f, handles = icom.get_handle(path, "w", compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + compressed_size = os.path.getsize(path) + with tm.ensure_clean() as path: + f, handles = icom.get_handle(path, "w", compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size + + +@pytest.mark.parametrize( + "write_method, write_kwargs, read_method", + [ + ("to_csv", {"index": False}, pd.read_csv), + ("to_json", {}, pd.read_json), + ("to_pickle", {}, pd.read_pickle), + ], +) +def test_dataframe_compression_defaults_to_infer( + write_method, write_kwargs, read_method, compression_only +): + # GH22004 + input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) + extension = icom._compression_to_extension[compression_only] + with tm.ensure_clean("compressed" + extension) as path: + getattr(input, write_method)(path, **write_kwargs) + output = read_method(path, compression=compression_only) + tm.assert_frame_equal(output, input) + + +@pytest.mark.parametrize( + "write_method,write_kwargs,read_method,read_kwargs", + [ + ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}), + ("to_json", {}, pd.read_json, {"typ": "series"}), + ("to_pickle", {}, pd.read_pickle, {}), + ], +) +def test_series_compression_defaults_to_infer( + write_method, write_kwargs, read_method, read_kwargs, compression_only +): + # GH22004 + input = pd.Series([0, 5, -2, 10], name="X") + extension = icom._compression_to_extension[compression_only] + with tm.ensure_clean("compressed" + extension) as path: + getattr(input, write_method)(path, **write_kwargs) + output = read_method(path, compression=compression_only, **read_kwargs) + tm.assert_series_equal(output, input, check_names=False) + + +def test_compression_warning(compression_only): + # Assert that passing a file object to to_csv while explicitly specifying a + # compression protocol triggers a RuntimeWarning, as per GH21227. 
+ df = pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ) + with tm.ensure_clean() as path: + f, handles = icom.get_handle(path, "w", compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): + with f: + df.to_csv(f, compression=compression_only) + + +def test_with_missing_lzma(): + """Tests if import pandas works when lzma is not present.""" + # https://github.com/pandas-dev/pandas/issues/27575 + code = textwrap.dedent( + """\ + import sys + sys.modules['lzma'] = None + import pandas + """ + ) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) + + +def test_with_missing_lzma_runtime(): + """Tests if RuntimeError is hit when calling lzma without + having the module available.""" + code = textwrap.dedent( + """ + import sys + import pytest + sys.modules['lzma'] = None + import pandas + df = pandas.DataFrame() + with pytest.raises(RuntimeError, match='lzma module'): + df.to_csv('foo.csv', compression='xz') + """ + ) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_date_converters.py b/venv/Lib/site-packages/pandas/tests/io/test_date_converters.py new file mode 100644 index 0000000..cdb8eca --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_date_converters.py @@ -0,0 +1,40 @@ +from datetime import datetime + +import numpy as np + +import pandas._testing as tm + +import pandas.io.date_converters as conv + + +def test_parse_date_time(): + dates = np.array(["2007/1/3", "2008/2/4"], dtype=object) + times = np.array(["05:07:09", "06:08:00"], dtype=object) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) + + result = conv.parse_date_time(dates, times) + tm.assert_numpy_array_equal(result, expected) + + +def test_parse_date_fields(): + days = np.array([3, 4]) + months = np.array([1, 2]) + years = np.array([2007, 2008]) + result = conv.parse_date_fields(years, months, days) + + expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + tm.assert_numpy_array_equal(result, expected) + + +def test_parse_all_fields(): + hours = np.array([5, 6]) + minutes = np.array([7, 8]) + seconds = np.array([9, 0]) + + days = np.array([3, 4]) + years = np.array([2007, 2008]) + months = np.array([1, 2]) + + result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_feather.py b/venv/Lib/site-packages/pandas/tests/io/test_feather.py new file mode 100644 index 0000000..0038df7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_feather.py @@ -0,0 +1,150 @@ +""" test feather-format compat """ +from distutils.version import LooseVersion + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip + +pyarrow = pytest.importorskip("pyarrow") + + +pyarrow_version = LooseVersion(pyarrow.__version__) +filter_sparse = pytest.mark.filterwarnings("ignore:The Sparse") + + +@filter_sparse +@pytest.mark.single +class TestFeather: + def check_error_on_write(self, df, exc): + # check that we are raising the exception + # on writing + + with pytest.raises(exc): + with tm.ensure_clean() as path: + to_feather(df, path) + 
+ def check_round_trip(self, df, expected=None, **kwargs): + + if expected is None: + expected = df + + with tm.ensure_clean() as path: + to_feather(df, path) + + result = read_feather(path, **kwargs) + tm.assert_frame_equal(result, expected) + + def test_error(self): + + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: + self.check_error_on_write(obj, ValueError) + + def test_basic(self): + + df = pd.DataFrame( + { + "string": list("abc"), + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_null": [1.0, np.nan, 3], + "bool": [True, False, True], + "bool_with_null": [True, np.nan, False], + "cat": pd.Categorical(list("abc")), + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + + assert df.dttz.dtype.tz.zone == "US/Eastern" + self.check_round_trip(df) + + def test_duplicate_columns(self): + + # https://github.com/wesm/feather/issues/53 + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() + self.check_error_on_write(df, ValueError) + + def test_stringify_columns(self): + + df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() + self.check_error_on_write(df, ValueError) + + def test_read_columns(self): + # GH 24025 + df = pd.DataFrame( + { + "col1": list("abc"), + "col2": list(range(1, 4)), + "col3": list("xyz"), + "col4": list(range(4, 7)), + } + ) + columns = ["col1", "col3"] + self.check_round_trip(df, expected=df[columns], columns=columns) + + def test_unsupported_other(self): + + # period + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # Some versions raise ValueError, others raise ArrowInvalid. 
+ self.check_error_on_write(df, Exception) + + def test_rw_use_threads(self): + df = pd.DataFrame({"A": np.arange(100000)}) + self.check_round_trip(df, use_threads=True) + self.check_round_trip(df, use_threads=False) + + def test_write_with_index(self): + + df = pd.DataFrame({"A": [1, 2, 3]}) + self.check_round_trip(df) + + # non-default index + for index in [ + [2, 3, 4], + pd.date_range("20130101", periods=3), + list("abc"), + [1, 3, 4], + pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), + ]: + + df.index = index + self.check_error_on_write(df, ValueError) + + # index with meta-data + df.index = [0, 1, 2] + df.index.name = "foo" + self.check_error_on_write(df, ValueError) + + # column multi-index + df.index = [0, 1, 2] + df.columns = pd.MultiIndex.from_tuples([("a", 1)]) + self.check_error_on_write(df, ValueError) + + def test_path_pathlib(self): + df = tm.makeDataFrame().reset_index() + result = tm.round_trip_pathlib(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) + + def test_path_localpath(self): + df = tm.makeDataFrame().reset_index() + result = tm.round_trip_localpath(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_gbq.py b/venv/Lib/site-packages/pandas/tests/io/test_gbq.py new file mode 100644 index 0000000..7a5eba5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_gbq.py @@ -0,0 +1,235 @@ +from contextlib import ExitStack as does_not_raise +from datetime import datetime +import os +import platform +import random +import string + +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import DataFrame + +api_exceptions = pytest.importorskip("google.api_core.exceptions") +bigquery = pytest.importorskip("google.cloud.bigquery") +service_account = pytest.importorskip("google.oauth2.service_account") +pandas_gbq = pytest.importorskip("pandas_gbq") + +PROJECT_ID = None +PRIVATE_KEY_JSON_PATH = None +PRIVATE_KEY_JSON_CONTENTS = None + +VERSION = platform.python_version() + + +def _skip_if_no_project_id(): + if not _get_project_id(): + pytest.skip("Cannot run integration tests without a project id") + + +def _skip_if_no_private_key_path(): + if not _get_private_key_path(): + pytest.skip("Cannot run integration tests without a private key json file path") + + +def _in_travis_environment(): + return "TRAVIS_BUILD_DIR" in os.environ and "GBQ_PROJECT_ID" in os.environ + + +def _get_project_id(): + if _in_travis_environment(): + return os.environ.get("GBQ_PROJECT_ID") + return PROJECT_ID or os.environ.get("GBQ_PROJECT_ID") + + +def _get_private_key_path(): + if _in_travis_environment(): + return os.path.join( + *[os.environ.get("TRAVIS_BUILD_DIR"), "ci", "travis_gbq.json"] + ) + + private_key_path = PRIVATE_KEY_JSON_PATH + if not private_key_path: + private_key_path = os.environ.get("GBQ_GOOGLE_APPLICATION_CREDENTIALS") + return private_key_path + + +def _get_credentials(): + private_key_path = _get_private_key_path() + if private_key_path: + return service_account.Credentials.from_service_account_file(private_key_path) + + +def _get_client(): + project_id = _get_project_id() + credentials = _get_credentials() + return bigquery.Client(project=project_id, credentials=credentials) + + +def generate_rand_str(length: int = 10) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) + + +def make_mixed_dataframe_v2(test_size): + # create df to test for all BQ datatypes except RECORD + bools = np.random.randint(2, size=(1, 
test_size)).astype(bool) + flts = np.random.randn(1, test_size) + ints = np.random.randint(1, 10, size=(1, test_size)) + strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) + times = [datetime.now(pytz.timezone("US/Arizona")) for t in range(test_size)] + return DataFrame( + { + "bools": bools[0], + "flts": flts[0], + "ints": ints[0], + "strs": strs[0], + "times": times[0], + }, + index=range(test_size), + ) + + +def test_read_gbq_without_deprecated_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1") + + assert "verbose" not in captured_kwargs + assert "private_key" not in captured_kwargs + + +def test_read_gbq_with_new_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1", use_bqstorage_api=True) + + assert captured_kwargs["use_bqstorage_api"] + + +def test_read_gbq_without_new_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1") + + assert "use_bqstorage_api" not in captured_kwargs + + +@pytest.mark.parametrize("progress_bar", [None, "foo"]) +def test_read_gbq_progress_bar_type_kwarg(monkeypatch, progress_bar): + # GH 29857 + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1", progress_bar_type=progress_bar) + + if progress_bar: + assert "progress_bar_type" in captured_kwargs + else: + assert "progress_bar_type" not in captured_kwargs + + +@pytest.mark.single +class TestToGBQIntegrationWithServiceAccountKeyPath: + @pytest.fixture() + def gbq_dataset(self): + # Setup Dataset + _skip_if_no_project_id() + _skip_if_no_private_key_path() + + dataset_id = "pydata_pandas_bq_testing_" + generate_rand_str() + + self.client = _get_client() + self.dataset = self.client.dataset(dataset_id) + + # Create the dataset + self.client.create_dataset(bigquery.Dataset(self.dataset)) + + table_name = generate_rand_str() + destination_table = f"{dataset_id}.{table_name}" + yield destination_table + + # Teardown Dataset + self.client.delete_dataset(self.dataset, delete_contents=True) + + def test_roundtrip(self, gbq_dataset): + destination_table = gbq_dataset + + test_size = 20001 + df = make_mixed_dataframe_v2(test_size) + + df.to_gbq( + destination_table, + _get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == test_size + + @pytest.mark.parametrize( + "if_exists, expected_num_rows, expectation", + [ + ("append", 300, does_not_raise()), + ("fail", 200, pytest.raises(pandas_gbq.gbq.TableCreationError)), + ("replace", 100, does_not_raise()), + ], + ) + def test_gbq_if_exists( + self, if_exists, expected_num_rows, expectation, gbq_dataset + ): + # GH 29598 + destination_table = gbq_dataset + + test_size = 200 + df = make_mixed_dataframe_v2(test_size) + + df.to_gbq( + destination_table, + 
_get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + with expectation: + df.iloc[:100].to_gbq( + destination_table, + _get_project_id(), + if_exists=if_exists, + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == expected_num_rows diff --git a/venv/Lib/site-packages/pandas/tests/io/test_gcs.py b/venv/Lib/site-packages/pandas/tests/io/test_gcs.py new file mode 100644 index 0000000..557a9d5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_gcs.py @@ -0,0 +1,115 @@ +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm +from pandas.util import _test_decorators as td + +from pandas.io.common import is_gcs_url + + +def test_is_gcs_url(): + assert is_gcs_url("gcs://pandas/somethingelse.com") + assert is_gcs_url("gs://pandas/somethingelse.com") + assert not is_gcs_url("s3://pandas/somethingelse.com") + + +@td.skip_if_no("gcsfs") +def test_read_csv_gcs(monkeypatch): + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + + class MockGCSFileSystem: + def open(*args): + return StringIO(df1.to_csv(index=False)) + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("gcsfs") +def test_to_csv_gcs(monkeypatch): + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + s = StringIO() + + class MockGCSFileSystem: + def open(*args): + return s + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_csv("gs://test/test.csv", index=True) + df2 = read_csv(StringIO(s.getvalue()), parse_dates=["dt"], index_col=0) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("fastparquet") +@td.skip_if_no("gcsfs") +def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + + class MockGCSFileSystem: + def open(self, path, mode="r", *args): + if "w" not in mode: + raise FileNotFoundError + return open(os.path.join(tmpdir, "test.parquet"), mode) + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_parquet( + "gs://test/test.csv", index=True, engine="fastparquet", compression=None + ) + + +@td.skip_if_no("gcsfs") +def test_gcs_get_filepath_or_buffer(monkeypatch): + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + + def mock_get_filepath_or_buffer(*args, **kwargs): + return (StringIO(df1.to_csv(index=False)), None, None, False) + + monkeypatch.setattr( + "pandas.io.gcs.get_filepath_or_buffer", mock_get_filepath_or_buffer + ) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_installed("gcsfs") +def test_gcs_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv("gs://test/test.csv") + assert "gcsfs library is required" in str(e.value) diff --git 
a/venv/Lib/site-packages/pandas/tests/io/test_html.py b/venv/Lib/site-packages/pandas/tests/io/test_html.py new file mode 100644 index 0000000..7a814ce --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_html.py @@ -0,0 +1,1258 @@ +from functools import partial +from importlib import reload +from io import BytesIO, StringIO +import os +import re +import threading +from urllib.error import URLError + +import numpy as np +from numpy.random import rand +import pytest + +from pandas.compat import is_platform_windows +from pandas.errors import ParserError +import pandas.util._test_decorators as td + +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +import pandas._testing as tm + +from pandas.io.common import file_path_to_url +import pandas.io.html +from pandas.io.html import read_html + +HERE = os.path.dirname(__file__) + + +@pytest.fixture( + params=[ + "chinese_utf-16.html", + "chinese_utf-32.html", + "chinese_utf-8.html", + "letz_latin1.html", + ] +) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath("io", "data", "html_encoding", request.param) + + +def assert_framelist_equal(list1, list2, *args, **kwargs): + assert len(list1) == len(list2), ( + "lists are not of equal size " + "len(list1) == {0}, " + "len(list2) == {1}".format(len(list1), len(list2)) + ) + msg = "not all list elements are DataFrames" + both_frames = all( + map( + lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame), + list1, + list2, + ) + ) + assert both_frames, msg + for frame_i, frame_j in zip(list1, list2): + tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) + assert not frame_i.empty, "frames are both empty" + + +@td.skip_if_no("bs4") +def test_bs4_version_fails(monkeypatch, datapath): + import bs4 + + monkeypatch.setattr(bs4, "__version__", "4.2") + with pytest.raises(ImportError, match="Pandas requires version"): + read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4") + + +def test_invalid_flavor(): + url = "google.com" + flavor = "invalid flavor" + msg = r"\{" + flavor + r"\} is not a valid set of flavors" + + with pytest.raises(ValueError, match=msg): + read_html(url, "google", flavor=flavor) + + +@td.skip_if_no("bs4") +@td.skip_if_no("lxml") +def test_same_ordering(datapath): + filename = datapath("io", "data", "html", "valid_markup.html") + dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) + dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) + assert_framelist_equal(dfs_lxml, dfs_bs4) + + +@pytest.mark.parametrize( + "flavor", + [ + pytest.param("bs4", marks=td.skip_if_no("bs4")), + pytest.param("lxml", marks=td.skip_if_no("lxml")), + ], + scope="class", +) +class TestReadHtml: + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath("io", "data", "html", "spam.html") + self.spam_data_kwargs = {} + self.spam_data_kwargs["encoding"] = "UTF-8" + self.banklist_data = datapath("io", "data", "html", "banklist.html") + + @pytest.fixture(autouse=True, scope="function") + def set_defaults(self, flavor, request): + self.read_html = partial(read_html, flavor=flavor) + yield + + def test_to_html_compat(self): + df = ( + tm.makeCustomDataframe( + 4, + 3, + data_gen_f=lambda *args: rand(), + c_idx_names=False, + r_idx_names=False, + ) + .applymap("{0:.3f}".format) + .astype(float) + ) + out = df.to_html() + res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] + tm.assert_frame_equal(res, df) 
+ + @tm.network + def test_banklist_url(self): + url = "http://www.fdic.gov/bank/individual/failed/banklist.html" + df1 = self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "table"} + ) + df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) + + assert_framelist_equal(df1, df2) + + @tm.network + def test_spam_url(self): + url = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/html/spam.html" + ) + df1 = self.read_html(url, ".*Water.*") + df2 = self.read_html(url, "Unit") + + assert_framelist_equal(df1, df2) + + @pytest.mark.slow + def test_banklist(self): + df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"}) + df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"}) + + assert_framelist_equal(df1, df2) + + def test_spam(self): + df1 = self.read_html(self.spam_data, ".*Water.*") + df2 = self.read_html(self.spam_data, "Unit") + assert_framelist_equal(df1, df2) + + assert df1[0].iloc[0, 0] == "Proximates" + assert df1[0].columns[0] == "Nutrient" + + def test_spam_no_match(self): + dfs = self.read_html(self.spam_data) + for df in dfs: + assert isinstance(df, DataFrame) + + def test_banklist_no_match(self): + dfs = self.read_html(self.banklist_data, attrs={"id": "table"}) + for df in dfs: + assert isinstance(df, DataFrame) + + def test_spam_header(self): + df = self.read_html(self.spam_data, ".*Water.*", header=2)[0] + assert df.columns[0] == "Proximates" + assert not df.empty + + def test_skiprows_int(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) + + assert_framelist_equal(df1, df2) + + def test_skiprows_range(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0] + df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0] + tm.assert_frame_equal(df1, df2) + + def test_skiprows_list(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1]) + + assert_framelist_equal(df1, df2) + + def test_skiprows_set(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2}) + df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1}) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice_short(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice_long(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_ndarray(self): + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_invalid(self): + with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): + self.read_html(self.spam_data, ".*Water.*", skiprows="asdf") + + def test_index(self): + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) + assert_framelist_equal(df1, df2) + + 
def test_header_and_index_no_types(self): + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) + assert_framelist_equal(df1, df2) + + def test_header_and_index_with_types(self): + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) + assert_framelist_equal(df1, df2) + + def test_infer_types(self): + + # 10892 infer_types removed + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) + assert_framelist_equal(df1, df2) + + def test_string_io(self): + with open(self.spam_data, **self.spam_data_kwargs) as f: + data1 = StringIO(f.read()) + + with open(self.spam_data, **self.spam_data_kwargs) as f: + data2 = StringIO(f.read()) + + df1 = self.read_html(data1, ".*Water.*") + df2 = self.read_html(data2, "Unit") + assert_framelist_equal(df1, df2) + + def test_string(self): + with open(self.spam_data, **self.spam_data_kwargs) as f: + data = f.read() + + df1 = self.read_html(data, ".*Water.*") + df2 = self.read_html(data, "Unit") + + assert_framelist_equal(df1, df2) + + def test_file_like(self): + with open(self.spam_data, **self.spam_data_kwargs) as f: + df1 = self.read_html(f, ".*Water.*") + + with open(self.spam_data, **self.spam_data_kwargs) as f: + df2 = self.read_html(f, "Unit") + + assert_framelist_equal(df1, df2) + + @tm.network + def test_bad_url_protocol(self): + with pytest.raises(URLError): + self.read_html("git://github.com", match=".*Water.*") + + @tm.network + @pytest.mark.slow + def test_invalid_url(self): + try: + with pytest.raises(URLError): + self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") + except ValueError as e: + assert "No tables found" in str(e) + + @pytest.mark.slow + def test_file_url(self): + url = self.banklist_data + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"} + ) + assert isinstance(dfs, list) + for df in dfs: + assert isinstance(df, DataFrame) + + @pytest.mark.slow + def test_invalid_table_attrs(self): + url = self.banklist_data + with pytest.raises(ValueError, match="No tables found"): + self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "tasdfable"} + ) + + def _bank_data(self, *args, **kwargs): + return self.read_html( + self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs + ) + + @pytest.mark.slow + def test_multiindex_header(self): + df = self._bank_data(header=[0, 1])[0] + assert isinstance(df.columns, MultiIndex) + + @pytest.mark.slow + def test_multiindex_index(self): + df = self._bank_data(index_col=[0, 1])[0] + assert isinstance(df.index, MultiIndex) + + @pytest.mark.slow + def test_multiindex_header_index(self): + df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] + assert isinstance(df.columns, MultiIndex) + assert isinstance(df.index, MultiIndex) + + @pytest.mark.slow + def test_multiindex_header_skiprows_tuples(self): + df = self._bank_data(header=[0, 1], skiprows=1)[0] + assert isinstance(df.columns, MultiIndex) + + @pytest.mark.slow + def test_multiindex_header_skiprows(self): + df = self._bank_data(header=[0, 1], skiprows=1)[0] + assert isinstance(df.columns, MultiIndex) + + @pytest.mark.slow + def test_multiindex_header_index_skiprows(self): + df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] + assert isinstance(df.index, MultiIndex) + assert isinstance(df.columns, 
MultiIndex) + + @pytest.mark.slow + def test_regex_idempotency(self): + url = self.banklist_data + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), + match=re.compile(re.compile("Florida")), + attrs={"id": "table"}, + ) + assert isinstance(dfs, list) + for df in dfs: + assert isinstance(df, DataFrame) + + def test_negative_skiprows(self): + msg = r"\(you passed a negative value\)" + with pytest.raises(ValueError, match=msg): + self.read_html(self.spam_data, "Water", skiprows=-1) + + @tm.network + def test_multiple_matches(self): + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") + assert len(dfs) > 1 + + @tm.network + def test_python_docs_table(self): + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") + zz = [df.iloc[0, 0][0:4] for df in dfs] + assert sorted(zz) == sorted(["Repo", "What"]) + + @pytest.mark.slow + def test_thousands_macau_stats(self, datapath): + all_non_nan_table_index = -2 + macau_data = datapath("io", "data", "html", "macau.html") + dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) + df = dfs[all_non_nan_table_index] + + assert not any(s.isna().any() for _, s in df.items()) + + @pytest.mark.slow + def test_thousands_macau_index_col(self, datapath, request): + # https://github.com/pandas-dev/pandas/issues/29622 + # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly + if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( + "bs4", "4.8.0" + ): + reason = "fails for bs4 version >= 4.8.0" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + + all_non_nan_table_index = -2 + macau_data = datapath("io", "data", "html", "macau.html") + dfs = self.read_html(macau_data, index_col=0, header=0) + df = dfs[all_non_nan_table_index] + + assert not any(s.isna().any() for _, s in df.items()) + + def test_empty_tables(self): + """ + Make sure that read_html ignores empty tables. + """ + html = """ + + + + + + + + + + + + + +
+             <table>
+                 <thead>
+                     <tr><th>A</th><th>B</th></tr>
+                 </thead>
+                 <tbody>
+                     <tr><td>1</td><td>2</td></tr>
+                 </tbody>
+             </table>
+             <table>
+                 <tbody>
+                 </tbody>
+             </table>
+ """ + result = self.read_html(html) + assert len(result) == 1 + + def test_multiple_tbody(self): + # GH-20690 + # Read all tbody tags within a single table. + result = self.read_html( + """ + + + + + + + + + + + + + + + + + + +
+             <table>
+                 <thead>
+                     <tr><th>A</th><th>B</th></tr>
+                 </thead>
+                 <tbody>
+                     <tr><td>1</td><td>2</td></tr>
+                 </tbody>
+                 <tbody>
+                     <tr><td>3</td><td>4</td></tr>
+                 </tbody>
+             </table>
""" + )[0] + + expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + + def test_header_and_one_column(self): + """ + Don't fail with bs4 when there is a header and only one column + as described in issue #9178 + """ + result = self.read_html( + """ + + + + + + + + + + +
+             <table>
+                 <thead>
+                     <tr><th>Header</th></tr>
+                 </thead>
+                 <tbody>
+                     <tr><td>first</td></tr>
+                 </tbody>
+             </table>
""" + )[0] + + expected = DataFrame(data={"Header": "first"}, index=[0]) + + tm.assert_frame_equal(result, expected) + + def test_thead_without_tr(self): + """ + Ensure parser adds
+ + + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
""" + )[0] + + expected = DataFrame( + data=[["Ukraine", "Odessa", 1944]], + columns=["Country", "Municipality", "Year"], + ) + + tm.assert_frame_equal(result, expected) + + def test_tfoot_read(self): + """ + Make sure that read_html reads tfoot, containing td or th. + Ignores empty tfoot + """ + data_template = """ + + + + + + + + + + + + + + {footer} + +
AB
bodyAbodyB
""" + + expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"]) + + expected2 = DataFrame( + data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"] + ) + + data1 = data_template.format(footer="") + data2 = data_template.format(footer="
<tr><td>footA</td><th>footB</th></tr>")
+
+         result1 = self.read_html(data1)[0]
+         result2 = self.read_html(data2)[0]
+
+         tm.assert_frame_equal(result1, expected1)
+         tm.assert_frame_equal(result2, expected2)
+
+     def test_parse_header_of_non_string_column(self):
+         # GH5048: if header is specified explicitly, the int column should be
+         # parsed as int while its header is parsed as str
+         result = self.read_html(
+             """
+             <table>
+                 <tr><td>S</td><td>I</td></tr>
+                 <tr><td>text</td><td>1944</td></tr>
+             </table>
+ """, + header=0, + )[0] + + expected = DataFrame([["text", 1944]], columns=("S", "I")) + + tm.assert_frame_equal(result, expected) + + def test_nyse_wsj_commas_table(self, datapath): + data = datapath("io", "data", "html", "nyse_wsj.html") + df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] + + expected = Index( + [ + "Issue(Roll over for charts and headlines)", + "Volume", + "Price", + "Chg", + "% Chg", + ] + ) + nrows = 100 + assert df.shape[0] == nrows + tm.assert_index_equal(df.columns, expected) + + @pytest.mark.slow + def test_banklist_header(self, datapath): + from pandas.io.html import _remove_whitespace + + def try_remove_ws(x): + try: + return _remove_whitespace(x) + except AttributeError: + return x + + df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] + ground_truth = read_csv( + datapath("io", "data", "csv", "banklist.csv"), + converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, + ) + assert df.shape == ground_truth.shape + old = [ + "First Vietnamese American BankIn Vietnamese", + "Westernbank Puerto RicoEn Espanol", + "R-G Premier Bank of Puerto RicoEn Espanol", + "EurobankEn Espanol", + "Sanderson State BankEn Espanol", + "Washington Mutual Bank(Including its subsidiary Washington " + "Mutual Bank FSB)", + "Silver State BankEn Espanol", + "AmTrade International BankEn Espanol", + "Hamilton Bank, NAEn Espanol", + "The Citizens Savings BankPioneer Community Bank, Inc.", + ] + new = [ + "First Vietnamese American Bank", + "Westernbank Puerto Rico", + "R-G Premier Bank of Puerto Rico", + "Eurobank", + "Sanderson State Bank", + "Washington Mutual Bank", + "Silver State Bank", + "AmTrade International Bank", + "Hamilton Bank, NA", + "The Citizens Savings Bank", + ] + dfnew = df.applymap(try_remove_ws).replace(old, new) + gtnew = ground_truth.applymap(try_remove_ws) + converted = dfnew._convert(datetime=True, numeric=True) + date_cols = ["Closing Date", "Updated Date"] + converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True) + tm.assert_frame_equal(converted, gtnew) + + @pytest.mark.slow + def test_gold_canyon(self): + gc = "Gold Canyon" + with open(self.banklist_data, "r") as f: + raw_text = f.read() + + assert gc in raw_text + df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0] + assert gc in df.to_string() + + def test_different_number_of_cols(self): + expected = self.read_html( + """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
<th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th>
<th>R_l0_g0</th><td> 0.763</td><td> 0.233</td><td> nan</td><td> nan</td><td> nan</td>
<th>R_l0_g1</th><td> 0.244</td><td> 0.285</td><td> 0.392</td><td> 0.137</td><td> 0.222</td>
""", + index_col=0, + )[0] + + result = self.read_html( + """ + + + + + + + + + + + + + + + + + + + + + + + + + +
<th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th>
<th>R_l0_g0</th><td> 0.763</td><td> 0.233</td>
<th>R_l0_g1</th><td> 0.244</td><td> 0.285</td><td> 0.392</td><td> 0.137</td><td> 0.222</td>
""", + index_col=0, + )[0] + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_1(self): + # GH17054 + result = self.read_html( + """ + + + + + + + + + + + +
<th>A</th><th colspan="1">B</th><th rowspan="1">C</th>
<td>a</td><td>b</td><td>c</td>
+ """ + )[0] + + expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_copy_values(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # X x Y Z W + # A B b z C + + result = self.read_html( + """ + + + + + + + + + + + + +
<td colspan="2">X</td><td>Y</td><td rowspan="2">Z</td><td>W</td>
<td>A</td><td colspan="2">B</td><td>C</td>
+ """, + header=0, + )[0] + + expected = DataFrame( + data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"] + ) + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_both_not_1(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # A B b b C + # a b b b D + + result = self.read_html( + """ + + + + + + + + + +
<td rowspan="2">A</td><td rowspan="2" colspan="3">B</td><td>C</td>
<td>D</td>
+ """, + header=0, + )[0] + + expected = DataFrame( + data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"] + ) + + tm.assert_frame_equal(result, expected) + + def test_rowspan_at_end_of_row(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # A B + # C b + + result = self.read_html( + """ + + + + + + + + +
<td>A</td><td rowspan="2">B</td>
<td>C</td>
+ """, + header=0, + )[0] + + expected = DataFrame(data=[["C", "B"]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + + def test_rowspan_only_rows(self): + # GH17054 + + result = self.read_html( + """ + + + + + +
<td rowspan="3">A</td><td rowspan="3">B</td>
+ """, + header=0, + )[0] + + expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + + def test_header_inferred_from_rows_with_only_th(self): + # GH17054 + result = self.read_html( + """ + + + + + + + + + + + + + +
<th>A</th><th>B</th>
<th>a</th><th>b</th>
<td>1</td><td>2</td>
+ """ + )[0] + + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) + + tm.assert_frame_equal(result, expected) + + def test_parse_dates_list(self): + df = DataFrame({"date": date_range("1/1/2001", periods=10)}) + expected = df.to_html() + res = self.read_html(expected, parse_dates=[1], index_col=0) + tm.assert_frame_equal(df, res[0]) + res = self.read_html(expected, parse_dates=["date"], index_col=0) + tm.assert_frame_equal(df, res[0]) + + def test_parse_dates_combine(self): + raw_dates = Series(date_range("1/1/2001", periods=10)) + df = DataFrame( + { + "date": raw_dates.map(lambda x: str(x.date())), + "time": raw_dates.map(lambda x: str(x.time())), + } + ) + res = self.read_html( + df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1 + ) + newdf = DataFrame({"datetime": raw_dates}) + tm.assert_frame_equal(newdf, res[0]) + + def test_computer_sales_page(self, datapath): + data = datapath("io", "data", "html", "computer_sales_page.html") + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) + with pytest.raises(ParserError, match=msg): + self.read_html(data, header=[0, 1]) + + data = datapath("io", "data", "html", "computer_sales_page.html") + assert self.read_html(data, header=[1, 2]) + + def test_wikipedia_states_table(self, datapath): + data = datapath("io", "data", "html", "wikipedia_states.html") + assert os.path.isfile(data), f"{repr(data)} is not a file" + assert os.path.getsize(data), f"{repr(data)} is an empty file" + result = self.read_html(data, "Arizona", header=1)[0] + assert result["sq mi"].dtype == np.dtype("float64") + + def test_parser_error_on_empty_header_row(self): + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) + with pytest.raises(ParserError, match=msg): + self.read_html( + """ + + + + + + + + +
<th>A</th><th>B</th>
<td>a</td><td>b</td>
+ """, + header=[0, 1], + ) + + def test_decimal_rows(self): + # GH 12907 + result = self.read_html( + """ + + + + + + + + + + + + +
<th>Header</th>
<td>1100#101</td>
+ + """, + decimal="#", + )[0] + + expected = DataFrame(data={"Header": 1100.101}, index=[0]) + + assert result["Header"].dtype == np.dtype("float64") + tm.assert_frame_equal(result, expected) + + def test_bool_header_arg(self): + # GH 6114 + for arg in [True, False]: + with pytest.raises(TypeError): + self.read_html(self.spam_data, header=arg) + + def test_converters(self): + # GH 13461 + result = self.read_html( + """ + + + + + + + + + + + + + +
<th>a</th>
<td>0.763</td>
<td>0.244</td>
""", + converters={"a": str}, + )[0] + + expected = DataFrame({"a": ["0.763", "0.244"]}) + + tm.assert_frame_equal(result, expected) + + def test_na_values(self): + # GH 13461 + result = self.read_html( + """ + + + + + + + + + + + + + +
<th>a</th>
<td>0.763</td>
<td>0.244</td>
""", + na_values=[0.244], + )[0] + + expected = DataFrame({"a": [0.763, np.nan]}) + + tm.assert_frame_equal(result, expected) + + def test_keep_default_na(self): + html_data = """ + + + + + + + + + + + + + +
<th>a</th>
<td>N/A</td>
<td>NA</td>
""" + + expected_df = DataFrame({"a": ["N/A", "NA"]}) + html_df = self.read_html(html_data, keep_default_na=False)[0] + tm.assert_frame_equal(expected_df, html_df) + + expected_df = DataFrame({"a": [np.nan, np.nan]}) + html_df = self.read_html(html_data, keep_default_na=True)[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_preserve_empty_rows(self): + result = self.read_html( + """ + + + + + + + + + + + + + +
<th>A</th><th>B</th>
<td>a</td><td>b</td>
+ """ + )[0] + + expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + + def test_ignore_empty_rows_when_inferring_header(self): + result = self.read_html( + """ + + + + + + + + + +
<th>A</th><th>B</th>
<th>a</th><th>b</th>
<td>1</td><td>2</td>
+ """ + )[0] + + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) + + tm.assert_frame_equal(result, expected) + + def test_multiple_header_rows(self): + # Issue #13434 + expected_df = DataFrame( + data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] + ) + expected_df.columns = [ + ["Unnamed: 0_level_0", "Age", "Party"], + ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], + ] + html = expected_df.to_html(index=False) + html_df = self.read_html(html)[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_works_on_valid_markup(self, datapath): + filename = datapath("io", "data", "html", "valid_markup.html") + dfs = self.read_html(filename, index_col=0) + assert isinstance(dfs, list) + assert isinstance(dfs[0], DataFrame) + + @pytest.mark.slow + def test_fallback_success(self, datapath): + banklist_data = datapath("io", "data", "html", "banklist.html") + self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"]) + + def test_to_html_timestamp(self): + rng = date_range("2000-01-01", periods=10) + df = DataFrame(np.random.randn(10, 4), index=rng) + + result = df.to_html() + assert "2000-01-01" in result + + @pytest.mark.parametrize( + "displayed_only,exp0,exp1", + [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), + ], + ) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO( + """ + + + + + +
+ foo
+ <span style="display:none;text-align:center">bar</span>
+ <span style="display:none">baz</span>
+ <span style="display: none">qux</span>
+ + + + +
<td>foo</td>
+ + """ + ) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + + def test_encode(self, html_encoding_file): + base_path = os.path.basename(html_encoding_file) + root = os.path.splitext(base_path)[0] + _, encoding = root.split("_") + + try: + with open(html_encoding_file, "rb") as fobj: + from_string = self.read_html( + fobj.read(), encoding=encoding, index_col=0 + ).pop() + + with open(html_encoding_file, "rb") as fobj: + from_file_like = self.read_html( + BytesIO(fobj.read()), encoding=encoding, index_col=0 + ).pop() + + from_filename = self.read_html( + html_encoding_file, encoding=encoding, index_col=0 + ).pop() + tm.assert_frame_equal(from_string, from_file_like) + tm.assert_frame_equal(from_string, from_filename) + except Exception: + # seems utf-16/32 fail on windows + if is_platform_windows(): + if "16" in encoding or "32" in encoding: + pytest.skip() + raise + + def test_parse_failure_unseekable(self): + # Issue #17975 + + if self.read_html.keywords.get("flavor") == "lxml": + pytest.skip("Not applicable for lxml") + + class UnseekableStringIO(StringIO): + def seekable(self): + return False + + bad = UnseekableStringIO( + """ +
<table><tr><td>spam<foo>eggs</foo></td></tr></table>
""" + ) + + assert self.read_html(bad) + + with pytest.raises(ValueError, match="passed a non-rewindable file object"): + self.read_html(bad) + + def test_parse_failure_rewinds(self): + # Issue #17975 + + class MockFile: + def __init__(self, data): + self.data = data + self.at_end = False + + def read(self, size=None): + data = "" if self.at_end else self.data + self.at_end = True + return data + + def seek(self, offset): + self.at_end = False + + def seekable(self): + return True + + good = MockFile("
<table><tr><td>spam<br />eggs</td></tr></table>
") + bad = MockFile("
<table><tr><td>spam<foo>eggs</foo></td></tr></table>
") + + assert self.read_html(good) + assert self.read_html(bad) + + @pytest.mark.slow + def test_importcheck_thread_safety(self, datapath): + # see gh-16928 + + class ErrorThread(threading.Thread): + def run(self): + try: + super().run() + except Exception as err: + self.err = err + else: + self.err = None + + # force import check by reinitalising global vars in html.py + reload(pandas.io.html) + + filename = datapath("io", "data", "html", "valid_markup.html") + helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) + + helper_thread1.start() + helper_thread2.start() + + while helper_thread1.is_alive() or helper_thread2.is_alive(): + pass + assert None is helper_thread1.err is helper_thread2.err diff --git a/venv/Lib/site-packages/pandas/tests/io/test_orc.py b/venv/Lib/site-packages/pandas/tests/io/test_orc.py new file mode 100644 index 0000000..a1f9c6f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_orc.py @@ -0,0 +1,227 @@ +""" test orc compat """ +import datetime +import os + +import numpy as np +import pytest + +import pandas as pd +from pandas import read_orc +import pandas._testing as tm + +pytest.importorskip("pyarrow", minversion="0.13.0") +pytest.importorskip("pyarrow.orc") + +pytestmark = pytest.mark.filterwarnings( + "ignore:RangeIndex.* is deprecated:DeprecationWarning" +) + + +@pytest.fixture +def dirpath(datapath): + return datapath("io", "data", "orc") + + +def test_orc_reader_empty(dirpath): + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + "bytes1", + "string1", + ] + dtypes = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "object", + "object", + ] + expected = pd.DataFrame(index=pd.RangeIndex(0)) + for colname, dtype in zip(columns, dtypes): + expected[colname] = pd.Series(dtype=dtype) + + inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") + got = read_orc(inputfile, columns=columns) + + tm.assert_equal(expected, got) + + +def test_orc_reader_basic(dirpath): + data = { + "boolean1": np.array([False, True], dtype="bool"), + "byte1": np.array([1, 100], dtype="int8"), + "short1": np.array([1024, 2048], dtype="int16"), + "int1": np.array([65536, 65536], dtype="int32"), + "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), + "float1": np.array([1.0, 2.0], dtype="float32"), + "double1": np.array([-15.0, -5.0], dtype="float64"), + "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + "string1": np.array(["hi", "bye"], dtype="object"), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc") + got = read_orc(inputfile, columns=data.keys()) + + tm.assert_equal(expected, got) + + +def test_orc_reader_decimal(dirpath): + from decimal import Decimal + + # Only testing the first 10 rows of data + data = { + "_col0": np.array( + [ + Decimal("-1000.50000"), + Decimal("-999.60000"), + Decimal("-998.70000"), + Decimal("-997.80000"), + Decimal("-996.90000"), + Decimal("-995.10000"), + Decimal("-994.11000"), + Decimal("-993.12000"), + Decimal("-992.13000"), + Decimal("-991.14000"), + ], + dtype="object", + ) + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_low(dirpath): + data = { + "time": np.array( + [ + "1900-05-05 12:34:56.100000", + 
"1900-05-05 12:34:56.100100", + "1900-05-05 12:34:56.100200", + "1900-05-05 12:34:56.100300", + "1900-05-05 12:34:56.100400", + "1900-05-05 12:34:56.100500", + "1900-05-05 12:34:56.100600", + "1900-05-05 12:34:56.100700", + "1900-05-05 12:34:56.100800", + "1900-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), + "date": np.array( + [ + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_high(dirpath): + data = { + "time": np.array( + [ + "2038-05-05 12:34:56.100000", + "2038-05-05 12:34:56.100100", + "2038-05-05 12:34:56.100200", + "2038-05-05 12:34:56.100300", + "2038-05-05 12:34:56.100400", + "2038-05-05 12:34:56.100500", + "2038-05-05 12:34:56.100600", + "2038-05-05 12:34:56.100700", + "2038-05-05 12:34:56.100800", + "2038-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), + "date": np.array( + [ + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_snappy_compressed(dirpath): + data = { + "int1": np.array( + [ + -1160101563, + 1181413113, + 2065821249, + -267157795, + 172111193, + 1752363137, + 1406072123, + 1911809390, + -1308542224, + -467100286, + ], + dtype="int32", + ), + "string1": np.array( + [ + "f50dcb8", + "382fdaaa", + "90758c6", + "9e8caf3f", + "ee97332b", + "d634da1", + "2bea4396", + "d67d89e8", + "ad71007e", + "e8c82066", + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_parquet.py b/venv/Lib/site-packages/pandas/tests/io/test_parquet.py new file mode 100644 index 0000000..d51c712 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_parquet.py @@ -0,0 +1,685 @@ +""" test parquet compat """ +import datetime +from distutils.version import LooseVersion +import os +from warnings import catch_warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +from pandas.io.parquet import ( + FastParquetImpl, + PyArrowImpl, + get_engine, + read_parquet, + to_parquet, +) + +try: + import pyarrow # noqa + + _HAVE_PYARROW = True +except ImportError: + _HAVE_PYARROW = False + +try: + import fastparquet # noqa + + _HAVE_FASTPARQUET = True +except ImportError: + _HAVE_FASTPARQUET = False + +pytestmark = pytest.mark.filterwarnings( + "ignore:RangeIndex.* is deprecated:DeprecationWarning" +) + + +# setup engines & skips +@pytest.fixture( + params=[ + pytest.param( + 
"fastparquet", + marks=pytest.mark.skipif( + not _HAVE_FASTPARQUET, reason="fastparquet is not installed" + ), + ), + pytest.param( + "pyarrow", + marks=pytest.mark.skipif( + not _HAVE_PYARROW, reason="pyarrow is not installed" + ), + ), + ] +) +def engine(request): + return request.param + + +@pytest.fixture +def pa(): + if not _HAVE_PYARROW: + pytest.skip("pyarrow is not installed") + return "pyarrow" + + +@pytest.fixture +def fp(): + if not _HAVE_FASTPARQUET: + pytest.skip("fastparquet is not installed") + return "fastparquet" + + +@pytest.fixture +def df_compat(): + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + + +@pytest.fixture +def df_cross_compat(): + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + # 'c': np.arange(3, 6).astype('u1'), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + # 'g': pd.date_range('20130101', periods=3, + # tz='US/Eastern'), + # 'h': pd.date_range('20130101', periods=3, freq='ns') + } + ) + return df + + +@pytest.fixture +def df_full(): + return pd.DataFrame( + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "bytes": [b"foo", b"bar", b"baz"], + "unicode": ["foo", "bar", "baz"], + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "datetime": pd.date_range("20130101", periods=3), + "datetime_with_nat": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + } + ) + + +def check_round_trip( + df, + engine=None, + path=None, + write_kwargs=None, + read_kwargs=None, + expected=None, + check_names=True, + repeat=2, +): + """Verify parquet serializer and deserializer produce the same results. + + Performs a pandas to disk and disk to pandas round trip, + then compares the 2 resulting DataFrames to verify equality. 
+ + Parameters + ---------- + df: Dataframe + engine: str, optional + 'pyarrow' or 'fastparquet' + path: str, optional + write_kwargs: dict of str:str, optional + read_kwargs: dict of str:str, optional + expected: DataFrame, optional + Expected deserialization result, otherwise will be equal to `df` + check_names: list of str, optional + Closed set of column names to be compared + repeat: int, optional + How many times to repeat the test + """ + + write_kwargs = write_kwargs or {"compression": None} + read_kwargs = read_kwargs or {} + + if expected is None: + expected = df + + if engine: + write_kwargs["engine"] = engine + read_kwargs["engine"] = engine + + def compare(repeat): + for _ in range(repeat): + df.to_parquet(path, **write_kwargs) + with catch_warnings(record=True): + actual = read_parquet(path, **read_kwargs) + + tm.assert_frame_equal(expected, actual, check_names=check_names) + + if path is None: + with tm.ensure_clean() as path: + compare(repeat) + else: + compare(repeat) + + +def test_invalid_engine(df_compat): + with pytest.raises(ValueError): + check_round_trip(df_compat, "foo", "bar") + + +def test_options_py(df_compat, pa): + # use the set option + + with pd.option_context("io.parquet.engine", "pyarrow"): + check_round_trip(df_compat) + + +def test_options_fp(df_compat, fp): + # use the set option + + with pd.option_context("io.parquet.engine", "fastparquet"): + check_round_trip(df_compat) + + +def test_options_auto(df_compat, fp, pa): + # use the set option + + with pd.option_context("io.parquet.engine", "auto"): + check_round_trip(df_compat) + + +def test_options_get_engine(fp, pa): + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) + + with pd.option_context("io.parquet.engine", "pyarrow"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) + + with pd.option_context("io.parquet.engine", "fastparquet"): + assert isinstance(get_engine("auto"), FastParquetImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) + + with pd.option_context("io.parquet.engine", "auto"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) + + +def test_cross_engine_pa_fp(df_cross_compat, pa, fp): + # cross-compat with differing reading/writing engines + + df = df_cross_compat + with tm.ensure_clean() as path: + df.to_parquet(path, engine=pa, compression=None) + + result = read_parquet(path, engine=fp) + tm.assert_frame_equal(result, df) + + result = read_parquet(path, engine=fp, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) + + +def test_cross_engine_fp_pa(df_cross_compat, pa, fp): + # cross-compat with differing reading/writing engines + + if ( + LooseVersion(pyarrow.__version__) < "0.15" + and LooseVersion(pyarrow.__version__) >= "0.13" + ): + pytest.xfail( + "Reading fastparquet with pyarrow in 0.14 fails: " + "https://issues.apache.org/jira/browse/ARROW-6492" + ) + + df = df_cross_compat + with tm.ensure_clean() as path: + df.to_parquet(path, engine=fp, compression=None) + + with catch_warnings(record=True): + result = read_parquet(path, engine=pa) + tm.assert_frame_equal(result, df) + + result = read_parquet(path, engine=pa, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) 
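# Illustrative sketch (not part of the original test module): the bare round trip
# that check_round_trip() above automates -- write a frame with DataFrame.to_parquet,
# read it back with read_parquet, and compare the two. The engine choice and the
# temporary file name used here are assumptions; any installed engine ("pyarrow" or
# "fastparquet") could be substituted.
def _example_parquet_round_trip_sketch():
    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    with tm.ensure_clean("__example__.parquet") as path:
        # write with an explicit engine and no compression, then read back
        df.to_parquet(path, engine="pyarrow", compression=None)
        result = read_parquet(path, engine="pyarrow")
        # the deserialized frame should match the original exactly
        tm.assert_frame_equal(result, df)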
+ + +class Base: + def check_error_on_write(self, df, engine, exc): + # check that we are raising the exception on writing + with tm.ensure_clean() as path: + with pytest.raises(exc): + to_parquet(df, path, engine, compression=None) + + +class TestBasic(Base): + def test_error(self, engine): + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: + self.check_error_on_write(obj, engine, ValueError) + + def test_columns_dtypes(self, engine): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + # unicode + df.columns = ["foo", "bar"] + check_round_trip(df, engine) + + def test_columns_dtypes_invalid(self, engine): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + # numeric + df.columns = [0, 1] + self.check_error_on_write(df, engine, ValueError) + + # bytes + df.columns = [b"foo", b"bar"] + self.check_error_on_write(df, engine, ValueError) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + self.check_error_on_write(df, engine, ValueError) + + @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) + def test_compression(self, engine, compression): + + if compression == "snappy": + pytest.importorskip("snappy") + + elif compression == "brotli": + pytest.importorskip("brotli") + + df = pd.DataFrame({"A": [1, 2, 3]}) + check_round_trip(df, engine, write_kwargs={"compression": compression}) + + def test_read_columns(self, engine): + # GH18154 + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + expected = pd.DataFrame({"string": list("abc")}) + check_round_trip( + df, engine, expected=expected, read_kwargs={"columns": ["string"]} + ) + + def test_write_index(self, engine): + check_names = engine != "fastparquet" + + df = pd.DataFrame({"A": [1, 2, 3]}) + check_round_trip(df, engine) + + indexes = [ + [2, 3, 4], + pd.date_range("20130101", periods=3), + list("abc"), + [1, 3, 4], + ] + # non-default index + for index in indexes: + df.index = index + check_round_trip(df, engine, check_names=check_names) + + # index with meta-data + df.index = [0, 1, 2] + df.index.name = "foo" + check_round_trip(df, engine) + + def test_write_multiindex(self, pa): + # Not supported in fastparquet as of 0.1.3 or older pyarrow version + engine = pa + + df = pd.DataFrame({"A": [1, 2, 3]}) + index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + df.index = index + check_round_trip(df, engine) + + def test_write_column_multiindex(self, engine): + # column multi-index + mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) + self.check_error_on_write(df, engine, ValueError) + + def test_multiindex_with_columns(self, pa): + engine = pa + dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") + df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC")) + index1 = pd.MultiIndex.from_product( + [["Level1", "Level2"], dates], names=["level", "date"] + ) + index2 = index1.copy(names=None) + for index in [index1, index2]: + df.index = index + + check_round_trip(df, engine) + check_round_trip( + df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]] + ) + + def test_write_ignoring_index(self, engine): + # ENH 20768 + # Ensure index=False omits the index from the written Parquet file. 
+ df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]}) + + write_kwargs = {"compression": None, "index": False} + + # Because we're dropping the index, we expect the loaded dataframe to + # have the default integer index. + expected = df.reset_index(drop=True) + + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + + # Ignore custom index + df = pd.DataFrame( + {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"] + ) + + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + + # Ignore multi-indexes as well. + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame( + {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays + ) + + expected = df.reset_index(drop=True) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + + +class TestParquetPyArrow(Base): + def test_basic(self, pa, df_full): + + df = df_full + + # additional supported types for pyarrow + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") + df["bool_with_none"] = [True, None, True] + + check_round_trip(df, pa) + + def test_basic_subset_columns(self, pa, df_full): + # GH18628 + + df = df_full + # additional supported types for pyarrow + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") + + check_round_trip( + df, + pa, + expected=df[["string", "int"]], + read_kwargs={"columns": ["string", "int"]}, + ) + + def test_duplicate_columns(self, pa): + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() + self.check_error_on_write(df, pa, ValueError) + + def test_unsupported(self, pa): + if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"): + # period - will be supported using an extension type with pyarrow 1.0 + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + self.check_error_on_write(df, pa, Exception) + + # timedelta + df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) + self.check_error_on_write(df, pa, NotImplementedError) + + # mixed python objects + df = pd.DataFrame({"a": ["a", 1, 2.0]}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + self.check_error_on_write(df, pa, Exception) + + def test_categorical(self, pa): + + # supported in >= 0.7.0 + df = pd.DataFrame() + df["a"] = pd.Categorical(list("abcdef")) + + # test for null, out-of-order values, and unobserved category + df["b"] = pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ) + + # test for ordered flag + df["c"] = pd.Categorical( + ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + ) + + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): + check_round_trip(df, pa) + else: + # de-serialized as object for pyarrow < 0.15 + expected = df.astype(object) + check_round_trip(df, pa, expected=expected) + + def test_s3_roundtrip(self, df_compat, s3_resource, pa): + # GH #19134 + check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") + + def test_partition_cols_supported(self, pa, df_full): + # GH #23283 + partition_cols = ["bool", "int"] + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet(path, partition_cols=partition_cols, compression=None) + import 
pyarrow.parquet as pq + + dataset = pq.ParquetDataset(path, validate_schema=False) + assert len(dataset.partitions.partition_names) == 2 + assert dataset.partitions.partition_names == set(partition_cols) + + def test_partition_cols_string(self, pa, df_full): + # GH #27117 + partition_cols = "bool" + partition_cols_list = [partition_cols] + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet(path, partition_cols=partition_cols, compression=None) + import pyarrow.parquet as pq + + dataset = pq.ParquetDataset(path, validate_schema=False) + assert len(dataset.partitions.partition_names) == 1 + assert dataset.partitions.partition_names == set(partition_cols_list) + + def test_empty_dataframe(self, pa): + # GH #27339 + df = pd.DataFrame() + check_round_trip(df, pa) + + def test_write_with_schema(self, pa): + import pyarrow + + df = pd.DataFrame({"x": [0, 1]}) + schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())]) + out_df = df.astype(bool) + check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) + + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_additional_extension_arrays(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype="Int64"), + "b": pd.Series(["a", None, "c"], dtype="string"), + } + ) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) + check_round_trip(df, pa, expected=expected) + + df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # if missing values in integer, currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) + check_round_trip(df, pa, expected=expected) + + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType + df = pd.DataFrame( + { + # Arrow does not yet support struct in writing to Parquet (ARROW-1644) + # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), + "d": pd.period_range("2012-01-01", periods=3, freq="D"), + } + ) + check_round_trip(df, pa) + + +class TestParquetFastParquet(Base): + @td.skip_if_no("fastparquet", min_version="0.3.2") + def test_basic(self, fp, df_full): + df = df_full + + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="US/Eastern") + df["timedelta"] = pd.timedelta_range("1 day", periods=3) + check_round_trip(df, fp) + + @pytest.mark.skip(reason="not supported") + def test_duplicate_columns(self, fp): + + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() + self.check_error_on_write(df, fp, ValueError) + + def test_bool_with_none(self, fp): + df = pd.DataFrame({"a": [True, None, False]}) + expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") + check_round_trip(df, fp, expected=expected) + + def test_unsupported(self, fp): + + # period + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + self.check_error_on_write(df, fp, ValueError) + + # mixed + df = pd.DataFrame({"a": ["a", 1, 2.0]}) + self.check_error_on_write(df, fp, ValueError) + + def test_categorical(self, fp): + df 
= pd.DataFrame({"a": pd.Categorical(list("abc"))}) + check_round_trip(df, fp) + + def test_filter_row_groups(self, fp): + d = {"a": list(range(0, 3))} + df = pd.DataFrame(d) + with tm.ensure_clean() as path: + df.to_parquet(path, fp, compression=None, row_group_offsets=1) + result = read_parquet(path, fp, filters=[("a", "==", 0)]) + assert len(result) == 1 + + def test_s3_roundtrip(self, df_compat, s3_resource, fp): + # GH #19134 + check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet") + + def test_partition_cols_supported(self, fp, df_full): + # GH #23283 + partition_cols = ["bool", "int"] + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet( + path, + engine="fastparquet", + partition_cols=partition_cols, + compression=None, + ) + assert os.path.exists(path) + import fastparquet # noqa: F811 + + actual_partition_cols = fastparquet.ParquetFile(path, False).cats + assert len(actual_partition_cols) == 2 + + def test_partition_cols_string(self, fp, df_full): + # GH #27117 + partition_cols = "bool" + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet( + path, + engine="fastparquet", + partition_cols=partition_cols, + compression=None, + ) + assert os.path.exists(path) + import fastparquet # noqa: F811 + + actual_partition_cols = fastparquet.ParquetFile(path, False).cats + assert len(actual_partition_cols) == 1 + + def test_partition_on_supported(self, fp, df_full): + # GH #23283 + partition_cols = ["bool", "int"] + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet( + path, + engine="fastparquet", + compression=None, + partition_on=partition_cols, + ) + assert os.path.exists(path) + import fastparquet # noqa: F811 + + actual_partition_cols = fastparquet.ParquetFile(path, False).cats + assert len(actual_partition_cols) == 2 + + def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full): + # GH #23283 + partition_cols = ["bool", "int"] + df = df_full + with pytest.raises(ValueError): + with tm.ensure_clean_dir() as path: + df.to_parquet( + path, + engine="fastparquet", + compression=None, + partition_on=partition_cols, + partition_cols=partition_cols, + ) + + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_pickle.py b/venv/Lib/site-packages/pandas/tests/io/test_pickle.py new file mode 100644 index 0000000..3d427dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_pickle.py @@ -0,0 +1,489 @@ +""" +manage legacy pickle tests + +How to add pickle tests: + +1. Install pandas version intended to output the pickle. + +2. Execute "generate_legacy_storage_files.py" to create the pickle. +$ python generate_legacy_storage_files.py pickle + +3. Move the created pickle to "data/legacy_pickle/" directory. 
+""" +import bz2 +import glob +import gzip +import os +import pickle +import shutil +from warnings import catch_warnings, simplefilter +import zipfile + +import pytest + +from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import Index +import pandas._testing as tm + +from pandas.tseries.offsets import Day, MonthEnd + +lzma = _import_lzma() + + +@pytest.fixture(scope="module") +def current_pickle_data(): + # our current version pickle data + from pandas.tests.io.generate_legacy_storage_files import create_pickle_data + + return create_pickle_data() + + +# --------------------- +# comparison functions +# --------------------- +def compare_element(result, expected, typ, version=None): + if isinstance(expected, Index): + tm.assert_index_equal(expected, result) + return + + if typ.startswith("sp_"): + comparator = tm.assert_equal + comparator(result, expected) + elif typ == "timestamp": + if expected is pd.NaT: + assert result is pd.NaT + else: + assert result == expected + assert result.freq == expected.freq + else: + comparator = getattr( + tm, "assert_{typ}_equal".format(typ=typ), tm.assert_almost_equal + ) + comparator(result, expected) + + +def compare(data, vf, version): + + data = pd.read_pickle(vf) + + m = globals() + for typ, dv in data.items(): + for dt, result in dv.items(): + expected = data[typ][dt] + + # use a specific comparator + # if available + comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) + + comparator = m.get(comparator, m["compare_element"]) + comparator(result, expected, typ, version) + return data + + +def compare_series_ts(result, expected, typ, version): + # GH 7748 + tm.assert_series_equal(result, expected) + assert result.index.freq == expected.index.freq + assert not result.index.freq.normalize + tm.assert_series_equal(result > 0, expected > 0) + + # GH 9291 + freq = result.index.freq + assert freq + Day(1) == Day(2) + + res = freq + pd.Timedelta(hours=1) + assert isinstance(res, pd.Timedelta) + assert res == pd.Timedelta(days=1, hours=1) + + res = freq + pd.Timedelta(nanoseconds=1) + assert isinstance(res, pd.Timedelta) + assert res == pd.Timedelta(days=1, nanoseconds=1) + + +def compare_series_dt_tz(result, expected, typ, version): + tm.assert_series_equal(result, expected) + + +def compare_series_cat(result, expected, typ, version): + tm.assert_series_equal(result, expected) + + +def compare_frame_dt_mixed_tzs(result, expected, typ, version): + tm.assert_frame_equal(result, expected) + + +def compare_frame_cat_onecol(result, expected, typ, version): + tm.assert_frame_equal(result, expected) + + +def compare_frame_cat_and_float(result, expected, typ, version): + compare_frame_cat_onecol(result, expected, typ, version) + + +def compare_index_period(result, expected, typ, version): + tm.assert_index_equal(result, expected) + assert isinstance(result.freq, MonthEnd) + assert result.freq == MonthEnd() + assert result.freqstr == "M" + tm.assert_index_equal(result.shift(2), expected.shift(2)) + + +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") +) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + +# --------------------- +# tests +# --------------------- +def test_pickles(current_pickle_data, legacy_pickle): + if not is_platform_little_endian(): + pytest.skip("known failure on non-little endian") + + version = 
os.path.basename(os.path.dirname(legacy_pickle)) + with catch_warnings(record=True): + simplefilter("ignore") + compare(current_pickle_data, legacy_pickle, version) + + +def test_round_trip_current(current_pickle_data): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + def python_unpickler(path): + with open(path, "rb") as fh: + fh.seek(0) + return pickle.load(fh) + + data = current_pickle_data + for typ, dv in data.items(): + for dt, expected in dv.items(): + + for writer in [pd.to_pickle, python_pickler]: + if writer is None: + continue + + with tm.ensure_clean() as path: + + # test writing with each pickler + writer(expected, path) + + # test reading with each unpickler + result = pd.read_pickle(path) + compare_element(result, expected, typ) + + result = python_unpickler(path) + compare_element(result, expected, typ) + + +def test_pickle_path_pathlib(): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + +def test_pickle_path_localpath(): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + +def test_legacy_sparse_warning(datapath): + """ + + Generated with + + >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse() + >>> df.to_pickle("pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz", + ... compression="gzip") + + >>> s = df['B'] + >>> s.to_pickle("pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz", + ... compression="gzip") + """ + with tm.assert_produces_warning(FutureWarning): + simplefilter("ignore", DeprecationWarning) # from boto + pd.read_pickle( + datapath("io", "data", "pickle", "sparseseries-0.20.3.pickle.gz"), + compression="gzip", + ) + + with tm.assert_produces_warning(FutureWarning): + simplefilter("ignore", DeprecationWarning) # from boto + pd.read_pickle( + datapath("io", "data", "pickle", "sparseframe-0.20.3.pickle.gz"), + compression="gzip", + ) + + +# --------------------- +# test pickle compression +# --------------------- + + +@pytest.fixture +def get_random_path(): + return "__{}__.pickle".format(tm.rands(10)) + + +class TestCompression: + + _compression_to_extension = { + None: ".none", + "gzip": ".gz", + "bz2": ".bz2", + "zip": ".zip", + "xz": ".xz", + } + + def compress_file(self, src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return + + if compression == "gzip": + f = gzip.open(dest_path, "w") + elif compression == "bz2": + f = bz2.BZ2File(dest_path, "w") + elif compression == "zip": + with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: + f.write(src_path, os.path.basename(src_path)) + elif compression == "xz": + f = _get_lzma_file(lzma)(dest_path, "w") + else: + msg = "Unrecognized compression type: {}".format(compression) + raise ValueError(msg) + + if compression != "zip": + with open(src_path, "rb") as fh, f: + f.write(fh.read()) + + def test_write_explicit(self, compression, get_random_path): + base = get_random_path + path1 = base + ".compressed" + path2 = base + ".raw" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to compressed file + df.to_pickle(p1, compression=compression) + + # decompress + with tm.decompress_file(p1, compression=compression) as f: + with open(p2, "wb") as fh: + fh.write(f.read()) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + 
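            # the frame read back from the manually decompressed copy should be
            # identical to the frame that was written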
+ tm.assert_frame_equal(df, df2) + + @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"]) + def test_write_explicit_bad(self, compression, get_random_path): + with pytest.raises(ValueError, match="Unrecognized compression type"): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".no_compress", ".xz"]) + def test_write_infer(self, ext, get_random_path): + base = get_random_path + path1 = base + ext + path2 = base + ".raw" + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to compressed file by inferred compression method + df.to_pickle(p1) + + # decompress + with tm.decompress_file(p1, compression=compression) as f: + with open(p2, "wb") as fh: + fh.write(f.read()) + + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + + tm.assert_frame_equal(df, df2) + + def test_read_explicit(self, compression, get_random_path): + base = get_random_path + path1 = base + ".raw" + path2 = base + ".compressed" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + self.compress_file(p1, p2, compression=compression) + + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + + tm.assert_frame_equal(df, df2) + + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".zip", ".no_compress", ".xz"]) + def test_read_infer(self, ext, get_random_path): + base = get_random_path + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in self._compression_to_extension: + if self._compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + + # write to uncompressed file + df.to_pickle(p1, compression=None) + + # compress + self.compress_file(p1, p2, compression=compression) + + # read compressed file by inferred compression method + df2 = pd.read_pickle(p2) + + tm.assert_frame_equal(df, df2) + + +# --------------------- +# test pickle compression +# --------------------- + + +class TestProtocol: + @pytest.mark.parametrize("protocol", [-1, 0, 1, 2]) + def test_read(self, protocol, get_random_path): + with tm.ensure_clean(get_random_path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, protocol=protocol) + df2 = pd.read_pickle(path) + tm.assert_frame_equal(df, df2) + + +def test_unicode_decode_error(datapath): + # pickle file written with py27, should be readable without raising + # UnicodeDecodeError, see GH#28645 + path = datapath("io", "data", "pickle", "test_py27.pkl") + df = pd.read_pickle(path) + + # just test the columns are correct since the values are random + excols = pd.Index(["a", "b", "c"]) + tm.assert_index_equal(df.columns, excols) + + +# --------------------- +# tests for buffer I/O +# --------------------- + + +def test_pickle_buffer_roundtrip(): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + with open(path, "wb") as fh: + df.to_pickle(fh) + with open(path, "rb") as fh: + result = pd.read_pickle(fh) + tm.assert_frame_equal(df, result) + + +# --------------------- +# tests for URL I/O +# --------------------- + + +@pytest.mark.parametrize( + "mockurl", ["http://url.com", 
"ftp://test.com", "http://gzip.com"] +) +def test_pickle_generalurl_read(monkeypatch, mockurl): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + class MockReadResponse: + def __init__(self, path): + self.file = open(path, "rb") + if "gzip" in path: + self.headers = {"Content-Encoding": "gzip"} + else: + self.headers = {"Content-Encoding": None} + + def read(self): + return self.file.read() + + def close(self): + return self.file.close() + + with tm.ensure_clean() as path: + + def mock_urlopen_read(*args, **kwargs): + return MockReadResponse(path) + + df = tm.makeDataFrame() + python_pickler(df, path) + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) +def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockGCSFileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("s3fs") +@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) +def test_pickle_s3url_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockS3FileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_s3.py b/venv/Lib/site-packages/pandas/tests/io/test_s3.py new file mode 100644 index 0000000..04c6979 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_s3.py @@ -0,0 +1,25 @@ +from io import BytesIO + +import pytest + +from pandas import read_csv + +from pandas.io.common import is_s3_url + + +class TestS3URL: + def test_is_s3_url(self): + assert is_s3_url("s3://pandas/somethingelse.com") + assert not is_s3_url("s4://pandas/somethingelse.com") + + +def test_streaming_s3_objects(): + # GH17135 + # botocore gained iteration support in 1.10.47, can now be used in read_* + pytest.importorskip("botocore", minversion="1.10.47") + from botocore.response import StreamingBody + + data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] + for el in data: + body = StreamingBody(BytesIO(el), content_length=len(el)) + read_csv(body) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_spss.py b/venv/Lib/site-packages/pandas/tests/io/test_spss.py new file mode 100644 index 0000000..013f56f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_spss.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +pyreadstat = pytest.importorskip("pyreadstat") + + +def test_spss_labelled_num(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "spss", "labelled-num.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + 
tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_num_na(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "spss", "labelled-num-na.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": ["This is one", None]}) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_str(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "spss", "labelled-str.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"gender": ["Male", "Female"]}) + expected["gender"] = pd.Categorical(expected["gender"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"gender": ["M", "F"]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_umlauts(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "spss", "umlauts.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame( + {"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", "the ö umlaut"]} + ) + expected["var1"] = pd.Categorical(expected["var1"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_usecols(datapath): + # usecols must be list-like + fname = datapath("io", "data", "spss", "labelled-num.sav") + + with pytest.raises(TypeError, match="usecols must be list-like."): + pd.read_spss(fname, usecols="VAR00002") diff --git a/venv/Lib/site-packages/pandas/tests/io/test_sql.py b/venv/Lib/site-packages/pandas/tests/io/test_sql.py new file mode 100644 index 0000000..45b3e83 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_sql.py @@ -0,0 +1,2851 @@ +"""SQL io tests + +The SQL tests are broken down in different classes: + +- `PandasSQLTest`: base class with common methods for all test classes +- Tests for the public API (only tests with sqlite3) + - `_TestSQLApi` base class + - `TestSQLApi`: test the public API with sqlalchemy engine + - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI + connection +- Tests for the different SQL flavors (flavor specific type conversions) + - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with + common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy + Connection object. 
The different tested flavors (sqlite3, MySQL, + PostgreSQL) derive from the base class + - Tests for the fallback mode (`TestSQLiteFallback`) + +""" + +import csv +from datetime import date, datetime, time +from io import StringIO +import sqlite3 +import warnings + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_datetime64_dtype, is_datetime64tz_dtype + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + to_datetime, + to_timedelta, +) +import pandas._testing as tm + +import pandas.io.sql as sql +from pandas.io.sql import read_sql_query, read_sql_table + +try: + import sqlalchemy + import sqlalchemy.schema + import sqlalchemy.sql.sqltypes as sqltypes + from sqlalchemy.ext import declarative + from sqlalchemy.orm import session as sa_session + + SQLALCHEMY_INSTALLED = True +except ImportError: + SQLALCHEMY_INSTALLED = False + +SQL_STRINGS = { + "create_iris": { + "sqlite": """CREATE TABLE iris ( + "SepalLength" REAL, + "SepalWidth" REAL, + "PetalLength" REAL, + "PetalWidth" REAL, + "Name" TEXT + )""", + "mysql": """CREATE TABLE iris ( + `SepalLength` DOUBLE, + `SepalWidth` DOUBLE, + `PetalLength` DOUBLE, + `PetalWidth` DOUBLE, + `Name` VARCHAR(200) + )""", + "postgresql": """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" VARCHAR(200) + )""", + }, + "insert_iris": { + "sqlite": """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", + "mysql": """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", + "postgresql": """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);""", + }, + "create_test_types": { + "sqlite": """CREATE TABLE types_test_data ( + "TextCol" TEXT, + "DateCol" TEXT, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" REAL, + "IntCol" INTEGER, + "BoolCol" INTEGER, + "IntColWithNull" INTEGER, + "BoolColWithNull" INTEGER + )""", + "mysql": """CREATE TABLE types_test_data ( + `TextCol` TEXT, + `DateCol` DATETIME, + `IntDateCol` INTEGER, + `IntDateOnlyCol` INTEGER, + `FloatCol` DOUBLE, + `IntCol` INTEGER, + `BoolCol` BOOLEAN, + `IntColWithNull` INTEGER, + `BoolColWithNull` BOOLEAN + )""", + "postgresql": """CREATE TABLE types_test_data ( + "TextCol" TEXT, + "DateCol" TIMESTAMP, + "DateColWithTz" TIMESTAMP WITH TIME ZONE, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" DOUBLE PRECISION, + "IntCol" INTEGER, + "BoolCol" BOOLEAN, + "IntColWithNull" INTEGER, + "BoolColWithNull" BOOLEAN + )""", + }, + "insert_test_types": { + "sqlite": { + "query": """ + INSERT INTO types_test_data + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), + }, + "mysql": { + "query": """ + INSERT INTO types_test_data + VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s) + """, + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), + }, + "postgresql": { + "query": """ + INSERT INTO types_test_data + VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """, + "fields": ( + "TextCol", + "DateCol", + "DateColWithTz", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), + }, + }, + "read_parameters": { + "sqlite": "SELECT * FROM iris WHERE Name=? 
AND SepalLength=?", + "mysql": 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', + "postgresql": 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s', + }, + "read_named_parameters": { + "sqlite": """ + SELECT * FROM iris WHERE Name=:name AND SepalLength=:length + """, + "mysql": """ + SELECT * FROM iris WHERE + `Name`="%(name)s" AND `SepalLength`=%(length)s + """, + "postgresql": """ + SELECT * FROM iris WHERE + "Name"=%(name)s AND "SepalLength"=%(length)s + """, + }, + "create_view": { + "sqlite": """ + CREATE VIEW iris_view AS + SELECT * FROM iris + """ + }, +} + + +class MixInBase: + def teardown_method(self, method): + # if setup fails, there may not be a connection to close. + if hasattr(self, "conn"): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() + + +class MySQLMixIn(MixInBase): + def drop_table(self, table_name): + cur = self.conn.cursor() + cur.execute(f"DROP TABLE IF EXISTS {sql._get_valid_mysql_name(table_name)}") + self.conn.commit() + + def _get_all_tables(self): + cur = self.conn.cursor() + cur.execute("SHOW TABLES") + return [table[0] for table in cur.fetchall()] + + def _close_conn(self): + from pymysql.err import Error + + try: + self.conn.close() + except Error: + pass + + +class SQLiteMixIn(MixInBase): + def drop_table(self, table_name): + self.conn.execute( + f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}" + ) + self.conn.commit() + + def _get_all_tables(self): + c = self.conn.execute("SELECT name FROM sqlite_master WHERE type='table'") + return [table[0] for table in c.fetchall()] + + def _close_conn(self): + self.conn.close() + + +class SQLAlchemyMixIn(MixInBase): + def drop_table(self, table_name): + sql.SQLDatabase(self.conn).drop_table(table_name) + + def _get_all_tables(self): + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + table_list = meta.tables.keys() + return table_list + + def _close_conn(self): + pass + + +class PandasSQLTest: + """ + Base class with common private methods for SQLAlchemy and fallback cases. 
+ + """ + + def _get_exec(self): + if hasattr(self.conn, "execute"): + return self.conn + else: + return self.conn.cursor() + + @pytest.fixture(params=[("data", "iris.csv")]) + def load_iris_data(self, datapath, request): + import io + + iris_csv_file = datapath(*request.param) + + if not hasattr(self, "conn"): + self.setup_connect() + + self.drop_table("iris") + self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor]) + + with io.open(iris_csv_file, mode="r", newline=None) as iris_csv: + r = csv.reader(iris_csv) + next(r) # skip header row + ins = SQL_STRINGS["insert_iris"][self.flavor] + + for row in r: + self._get_exec().execute(ins, row) + + def _load_iris_view(self): + self.drop_table("iris_view") + self._get_exec().execute(SQL_STRINGS["create_view"][self.flavor]) + + def _check_iris_loaded_frame(self, iris_frame): + pytype = iris_frame.dtypes[0].type + row = iris_frame.iloc[0] + + assert issubclass(pytype, np.floating) + tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + + def _load_test1_data(self): + columns = ["index", "A", "B", "C", "D"] + data = [ + ( + "2000-01-03 00:00:00", + 0.980268513777, + 3.68573087906, + -0.364216805298, + -1.15973806169, + ), + ( + "2000-01-04 00:00:00", + 1.04791624281, + -0.0412318367011, + -0.16181208307, + 0.212549316967, + ), + ( + "2000-01-05 00:00:00", + 0.498580885705, + 0.731167677815, + -0.537677223318, + 1.34627041952, + ), + ( + "2000-01-06 00:00:00", + 1.12020151869, + 1.56762092543, + 0.00364077397681, + 0.67525259227, + ), + ] + + self.test_frame1 = DataFrame(data, columns=columns) + + def _load_test2_data(self): + df = DataFrame( + dict( + A=[4, 1, 3, 6], + B=["asd", "gsq", "ylt", "jkl"], + C=[1.1, 3.1, 6.9, 5.3], + D=[False, True, True, False], + E=["1990-11-22", "1991-10-26", "1993-11-26", "1995-12-12"], + ) + ) + df["E"] = to_datetime(df["E"]) + + self.test_frame2 = df + + def _load_test3_data(self): + columns = ["index", "A", "B"] + data = [ + ("2000-01-03 00:00:00", 2 ** 31 - 1, -1.987670), + ("2000-01-04 00:00:00", -29, -0.0412318367011), + ("2000-01-05 00:00:00", 20000, 0.731167677815), + ("2000-01-06 00:00:00", -290867, 1.56762092543), + ] + + self.test_frame3 = DataFrame(data, columns=columns) + + def _load_raw_sql(self): + self.drop_table("types_test_data") + self._get_exec().execute(SQL_STRINGS["create_test_types"][self.flavor]) + ins = SQL_STRINGS["insert_test_types"][self.flavor] + data = [ + { + "TextCol": "first", + "DateCol": "2000-01-03 00:00:00", + "DateColWithTz": "2000-01-01 00:00:00-08:00", + "IntDateCol": 535852800, + "IntDateOnlyCol": 20101010, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": 1, + "BoolColWithNull": False, + }, + { + "TextCol": "first", + "DateCol": "2000-01-04 00:00:00", + "DateColWithTz": "2000-06-01 00:00:00-07:00", + "IntDateCol": 1356998400, + "IntDateOnlyCol": 20101212, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": None, + "BoolColWithNull": None, + }, + ] + + for d in data: + self._get_exec().execute( + ins["query"], [d[field] for field in ins["fields"]] + ) + + def _count_rows(self, table_name): + result = ( + self._get_exec() + .execute(f"SELECT count(*) AS count_1 FROM {table_name}") + .fetchone() + ) + return result[0] + + def _read_sql_iris(self): + iris_frame = self.pandasSQL.read_query("SELECT * FROM iris") + self._check_iris_loaded_frame(iris_frame) + + def _read_sql_iris_parameter(self): + query = SQL_STRINGS["read_parameters"][self.flavor] + params = ["Iris-setosa", 5.1] + iris_frame = 
self.pandasSQL.read_query(query, params=params) + self._check_iris_loaded_frame(iris_frame) + + def _read_sql_iris_named_parameter(self): + query = SQL_STRINGS["read_named_parameters"][self.flavor] + params = {"name": "Iris-setosa", "length": 5.1} + iris_frame = self.pandasSQL.read_query(query, params=params) + self._check_iris_loaded_frame(iris_frame) + + def _to_sql(self, method=None): + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=method) + assert self.pandasSQL.has_table("test_frame1") + + num_entries = len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + assert num_rows == num_entries + + # Nuke table + self.drop_table("test_frame1") + + def _to_sql_empty(self): + self.drop_table("test_frame1") + self.pandasSQL.to_sql(self.test_frame1.iloc[:0], "test_frame1") + + def _to_sql_fail(self): + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + assert self.pandasSQL.has_table("test_frame1") + + msg = "Table 'test_frame1' already exists" + with pytest.raises(ValueError, match=msg): + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + + self.drop_table("test_frame1") + + def _to_sql_replace(self): + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + # Add to table again + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="replace") + assert self.pandasSQL.has_table("test_frame1") + + num_entries = len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + + assert num_rows == num_entries + self.drop_table("test_frame1") + + def _to_sql_append(self): + # Nuke table just in case + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + + # Add to table again + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="append") + assert self.pandasSQL.has_table("test_frame1") + + num_entries = 2 * len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + + assert num_rows == num_entries + self.drop_table("test_frame1") + + def _to_sql_method_callable(self): + check = [] # used to double check function below is really being used + + def sample(pd_table, conn, keys, data_iter): + check.append(1) + data = [dict(zip(keys, row)) for row in data_iter] + conn.execute(pd_table.table.insert(), data) + + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=sample) + assert self.pandasSQL.has_table("test_frame1") + + assert check == [1] + num_entries = len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + assert num_rows == num_entries + # Nuke table + self.drop_table("test_frame1") + + def _roundtrip(self): + self.drop_table("test_frame_roundtrip") + self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + + result.set_index("level_0", inplace=True) + # result.index.astype(int) + + result.index.name = None + + tm.assert_frame_equal(result, self.test_frame1) + + def _execute_sql(self): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + iris_results = self.pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + + def _to_sql_save_index(self): + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] + ) + 
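+        # to_sql is expected to persist the frame index "A" as a column and
+        # create a database index on it; the flavor-specific
+        # _get_index_columns hook defined by each test class reads that
+        # index back for the assertion below.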
self.pandasSQL.to_sql(df, "test_to_sql_saves_index") + ix_cols = self._get_index_columns("test_to_sql_saves_index") + assert ix_cols == [["A"]] + + def _transaction_test(self): + with self.pandasSQL.run_transaction() as trans: + trans.execute("CREATE TABLE test_trans (A INT, B TEXT)") + + class DummyException(Exception): + pass + + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + try: + with self.pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + res = self.pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 + + # Make sure when transaction is committed, rows do get inserted + with self.pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 + + +# ----------------------------------------------------------------------------- +# -- Testing the public API + + +class _TestSQLApi(PandasSQLTest): + """ + Base class to test the public API. + + From this two classes are derived to run these tests for both the + sqlalchemy mode (`TestSQLApi`) and the fallback mode + (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific + tests for the different sql flavours are included in `_TestSQLAlchemy`. + + Notes: + flavor can always be passed even in SQLAlchemy mode, + should be correctly ignored. + + we don't use drop_table because that isn't part of the public api + + """ + + flavor = "sqlite" + mode: str + + def setup_connect(self): + self.conn = self.connect() + + @pytest.fixture(autouse=True) + def setup_method(self, load_iris_data): + self.load_test_data_and_sql() + + def load_test_data_and_sql(self): + self._load_iris_view() + self._load_test1_data() + self._load_test2_data() + self._load_test3_data() + self._load_raw_sql() + + def test_read_sql_iris(self): + iris_frame = sql.read_sql_query("SELECT * FROM iris", self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_read_sql_view(self): + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_to_sql(self): + sql.to_sql(self.test_frame1, "test_frame1", self.conn) + assert sql.has_table("test_frame1", self.conn) + + def test_to_sql_fail(self): + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + assert sql.has_table("test_frame2", self.conn) + + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + + def test_to_sql_replace(self): + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="fail") + # Add to table again + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="replace") + assert sql.has_table("test_frame3", self.conn) + + num_entries = len(self.test_frame1) + num_rows = self._count_rows("test_frame3") + + assert num_rows == num_entries + + def test_to_sql_append(self): + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="fail") + + # Add to table again + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="append") + assert sql.has_table("test_frame4", self.conn) + + num_entries = 2 * len(self.test_frame1) + num_rows = self._count_rows("test_frame4") + + assert num_rows == num_entries + + def test_to_sql_type_mapping(self): + sql.to_sql(self.test_frame3, 
"test_frame5", self.conn, index=False) + result = sql.read_sql("SELECT * FROM test_frame5", self.conn) + + tm.assert_frame_equal(self.test_frame3, result) + + def test_to_sql_series(self): + s = Series(np.arange(5, dtype="int64"), name="series") + sql.to_sql(s, "test_series", self.conn, index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) + tm.assert_frame_equal(s.to_frame(), s2) + + def test_roundtrip(self): + sql.to_sql(self.test_frame1, "test_frame_roundtrip", con=self.conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) + + # HACK! + result.index = self.test_frame1.index + result.set_index("level_0", inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, self.test_frame1) + + def test_roundtrip_chunksize(self): + sql.to_sql( + self.test_frame1, + "test_frame_roundtrip", + con=self.conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) + tm.assert_frame_equal(result, self.test_frame1) + + def test_execute_sql(self): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + iris_results = sql.execute("SELECT * FROM iris", con=self.conn) + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + + def test_date_parsing(self): + # Test date parsing in read_sql + # No Parsing + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn) + assert not issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["DateCol"] + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["IntDateCol"] + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0), + ] + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + pd.Timestamp("2010-10-10"), + pd.Timestamp("2010-12-12"), + ] + + def test_date_and_index(self): + # Test case where same column appears in parse_date and index_col + + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) + + assert issubclass(df.index.dtype.type, np.datetime64) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + def test_timedelta(self): + + # see #6921 + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + with tm.assert_produces_warning(UserWarning): + 
df.to_sql("test_timedelta", self.conn) + result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) + tm.assert_series_equal(result["foo"], df["foo"].astype("int64")) + + def test_complex_raises(self): + df = DataFrame({"a": [1 + 1j, 2j]}) + msg = "Complex datatypes not supported" + with pytest.raises(ValueError, match=msg): + df.to_sql("test_complex", self.conn) + + @pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), + # using the index name + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], + ) + def test_to_sql_index_label(self, index_name, index_label, expected): + temp_frame = DataFrame({"col1": range(4)}) + temp_frame.index.name = index_name + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) + frame = sql.read_sql_query(query, self.conn) + assert frame.columns[0] == expected + + def test_to_sql_index_label_multiindex(self): + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) + + # no index name, defaults to 'level_0' and 'level_1' + sql.to_sql(temp_frame, "test_index_label", self.conn) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" + + # specifying index_label + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["A", "B"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # using the index name + temp_frame.index.names = ["A", "B"] + sql.to_sql(temp_frame, "test_index_label", self.conn, if_exists="replace") + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # has index name, but specifying index_label + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["C", "D"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["C", "D"] + + msg = "Length of 'index_label' should match number of levels, which is 2" + with pytest.raises(ValueError, match=msg): + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label="C", + ) + + def test_multiindex_roundtrip(self): + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) + + df.to_sql("test_multiindex_roundtrip", self.conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] + ) + tm.assert_frame_equal(df, result, check_index_type=True) + + def test_integer_col_names(self): + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") + + def test_get_schema(self): + create_sql = sql.get_schema(self.test_frame1, "test", con=self.conn) + assert "CREATE" in create_sql + + def test_get_schema_dtypes(self): + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) + dtype = sqlalchemy.Integer if self.mode == 
"sqlalchemy" else "INTEGER" + create_sql = sql.get_schema( + float_frame, "test", con=self.conn, dtype={"b": dtype} + ) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql + + def test_get_schema_keys(self): + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") + constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' + assert constraint_sentence in create_sql + + # multiple columns as key (GH10385) + create_sql = sql.get_schema( + self.test_frame1, "test", con=self.conn, keys=["A", "B"] + ) + constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' + assert constraint_sentence in create_sql + + def test_chunksize_read(self): + df = DataFrame(np.random.randn(22, 5), columns=list("abcde")) + df.to_sql("test_chunksize", self.conn, index=False) + + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", self.conn) + + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_query( + "select * from test_chunksize", self.conn, chunksize=5 + ): + res2 = concat([res2, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 + + tm.assert_frame_equal(res1, res2) + + # reading the query in chunks with read_sql_query + if self.mode == "sqlalchemy": + res3 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 + + tm.assert_frame_equal(res1, res3) + + def test_categorical(self): + # GH8624 + # test that categorical gets written correctly as dense column + df = DataFrame( + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"], + } + ) + df2 = df.copy() + df2["person_name"] = df2["person_name"].astype("category") + + df2.to_sql("test_categorical", self.conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) + + tm.assert_frame_equal(res, df) + + def test_unicode_column_name(self): + # GH 11431 + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql("test_unicode", self.conn, index=False) + + def test_escaped_table_name(self): + # GH 13206 + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("d1187b08-4943-4c8d-a7f6", self.conn, index=False) + + res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) + + tm.assert_frame_equal(res, df) + + +@pytest.mark.single +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") +class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): + """ + Test the public API as it would be used directly + + Tests for `read_sql_table` are included here, as this is specific for the + sqlalchemy mode. 
+ + """ + + flavor = "sqlite" + mode = "sqlalchemy" + + def connect(self): + return sqlalchemy.create_engine("sqlite:///:memory:") + + def test_read_table_columns(self): + # test columns argument in read_table + sql.to_sql(self.test_frame1, "test_frame", self.conn) + + cols = ["A", "B"] + result = sql.read_sql_table("test_frame", self.conn, columns=cols) + assert result.columns.tolist() == cols + + def test_read_table_index_col(self): + # test columns argument in read_table + sql.to_sql(self.test_frame1, "test_frame", self.conn) + + result = sql.read_sql_table("test_frame", self.conn, index_col="index") + assert result.index.names == ["index"] + + result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) + assert result.index.names == ["A", "B"] + + result = sql.read_sql_table( + "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] + ) + assert result.index.names == ["A", "B"] + assert result.columns.tolist() == ["C", "D"] + + def test_read_sql_delegate(self): + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) + + iris_frame1 = sql.read_sql_table("iris", self.conn) + iris_frame2 = sql.read_sql("iris", self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) + + def test_not_reflect_all_tables(self): + # create invalid table + qry = """CREATE TABLE invalid (x INTEGER, y UNKNOWN);""" + self.conn.execute(qry) + qry = """CREATE TABLE other_table (x INTEGER, y INTEGER);""" + self.conn.execute(qry) + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + # Trigger a warning. + sql.read_sql_table("other_table", self.conn) + sql.read_sql_query("SELECT * FROM other_table", self.conn) + # Verify some things + assert len(w) == 0 + + def test_warning_case_insensitive_table_name(self): + # see gh-7815 + # + # We can't test that this warning is triggered, a the database + # configuration would have to be altered. But here we test that + # the warning is certainly NOT triggered in a normal case. + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. 
+ warnings.simplefilter("always") + # This should not trigger a Warning + self.test_frame1.to_sql("CaseSensitive", self.conn) + # Verify some things + assert len(w) == 0 + + def _get_index_columns(self, tbl_name): + from sqlalchemy.engine import reflection + + insp = reflection.Inspector.from_engine(self.conn) + ixs = insp.get_indexes("test_index_saved") + ixs = [i["column_names"] for i in ixs] + return ixs + + def test_sqlalchemy_type_mapping(self): + + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) + db = sql.SQLDatabase(self.conn) + table = sql.SQLTable("test_type", db, frame=df) + # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones + assert isinstance(table.table.c["time"].type, sqltypes.TIMESTAMP) + + def test_database_uri_string(self): + + # Test read_sql and .to_sql method with a database URI (GH10654) + test_frame1 = self.test_frame1 + # db_uri = 'sqlite:///:memory:' # raises + # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near + # "iris": syntax error [SQL: 'iris'] + with tm.ensure_clean() as name: + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(table, db_uri, if_exists="replace", index=False) + test_frame2 = sql.read_sql(table, db_uri) + test_frame3 = sql.read_sql_table(table, db_uri) + query = "SELECT * FROM iris" + test_frame4 = sql.read_sql_query(query, db_uri) + tm.assert_frame_equal(test_frame1, test_frame2) + tm.assert_frame_equal(test_frame1, test_frame3) + tm.assert_frame_equal(test_frame1, test_frame4) + + # using driver that will not be installed on Travis to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + try: + # the rest of this test depends on pg8000's being absent + import pg8000 # noqa + + pytest.skip("pg8000 is installed") + except ImportError: + pass + + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with pytest.raises(ImportError, match="pg8000"): + sql.read_sql("select * from table", db_uri) + + def _make_iris_table_metadata(self): + sa = sqlalchemy + metadata = sa.MetaData() + iris = sa.Table( + "iris", + metadata, + sa.Column("SepalLength", sa.REAL), + sa.Column("SepalWidth", sa.REAL), + sa.Column("PetalLength", sa.REAL), + sa.Column("PetalWidth", sa.REAL), + sa.Column("Name", sa.TEXT), + ) + + return iris + + def test_query_by_text_obj(self): + # WIP : GH10846 + name_text = sqlalchemy.text("select * from iris where name=:name") + iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} + + def test_query_by_select_obj(self): + # WIP : GH10846 + iris = self._make_iris_table_metadata() + + name_select = sqlalchemy.select([iris]).where( + iris.c.Name == sqlalchemy.bindparam("name") + ) + iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} + + +class _EngineToConnMixin: + """ + A mixin that causes setup_connect to create a conn rather than an engine. 
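+    The engine created by the parent test class is wrapped in a single
+    ``engine.connect()`` connection inside an explicit transaction that is
+    rolled back on teardown, so the same suite also covers passing a
+    Connection rather than an Engine to the pandas SQL functions.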
+ """ + + @pytest.fixture(autouse=True) + def setup_method(self, load_iris_data): + super().load_test_data_and_sql() + engine = self.conn + conn = engine.connect() + self.__tx = conn.begin() + self.pandasSQL = sql.SQLDatabase(conn) + self.__engine = engine + self.conn = conn + + yield + + self.__tx.rollback() + self.conn.close() + self.conn = self.__engine + self.pandasSQL = sql.SQLDatabase(self.__engine) + # XXX: + # super().teardown_method(method) + + +@pytest.mark.single +class TestSQLApiConn(_EngineToConnMixin, TestSQLApi): + pass + + +@pytest.mark.single +class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): + """ + Test the public sqlite connection fallback API + + """ + + flavor = "sqlite" + mode = "fallback" + + def connect(self, database=":memory:"): + return sqlite3.connect(database) + + def test_sql_open_close(self): + # Test if the IO in the database still work if the connection closed + # between the writing and reading (as in many real situations). + + with tm.ensure_clean() as name: + + conn = self.connect(name) + sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, index=False) + conn.close() + + conn = self.connect(name) + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) + conn.close() + + tm.assert_frame_equal(self.test_frame3, result) + + @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") + def test_con_string_import_error(self): + conn = "mysql://root@localhost/pandas_nosetest" + msg = "Using URI string without sqlalchemy installed" + with pytest.raises(ImportError, match=msg): + sql.read_sql("SELECT * FROM iris", conn) + + def test_read_sql_delegate(self): + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) + + msg = "Execution failed on sql 'iris': near \"iris\": syntax error" + with pytest.raises(sql.DatabaseError, match=msg): + sql.read_sql("iris", self.conn) + + def test_safe_names_warning(self): + # GH 6798 + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b "]) # has a space + # warns on create table with spaces in names + with tm.assert_produces_warning(): + sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) + + def test_get_schema2(self): + # without providing a connection object (available for backwards comp) + create_sql = sql.get_schema(self.test_frame1, "test") + assert "CREATE" in create_sql + + def _get_sqlite_column_type(self, schema, column): + + for col in schema.split("\n"): + if col.split()[0].strip('""') == column: + return col.split()[1] + raise ValueError(f"Column {column} not found") + + def test_sqlite_type_mapping(self): + + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) + db = sql.SQLiteDatabase(self.conn) + table = sql.SQLiteTable("test_type", db, frame=df) + schema = table.sql_schema() + assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" + + +# ----------------------------------------------------------------------------- +# -- Database flavor specific tests + + +class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): + """ + Base class for testing the sqlalchemy backend. + + Subclasses for specific database types are created below. Tests that + deviate for each flavor are overwritten there. 
+ + """ + + flavor: str + + @pytest.fixture(autouse=True, scope="class") + def setup_class(cls): + cls.setup_import() + cls.setup_driver() + conn = cls.connect() + conn.connect() + + def load_test_data_and_sql(self): + self._load_raw_sql() + self._load_test1_data() + + @pytest.fixture(autouse=True) + def setup_method(self, load_iris_data): + self.load_test_data_and_sql() + + @classmethod + def setup_import(cls): + # Skip this test if SQLAlchemy not available + if not SQLALCHEMY_INSTALLED: + pytest.skip("SQLAlchemy not installed") + + @classmethod + def setup_driver(cls): + raise NotImplementedError() + + @classmethod + def connect(cls): + raise NotImplementedError() + + def setup_connect(self): + try: + self.conn = self.connect() + self.pandasSQL = sql.SQLDatabase(self.conn) + # to test if connection can be made: + self.conn.connect() + except sqlalchemy.exc.OperationalError: + pytest.skip(f"Can't connect to {self.flavor} server") + + def test_read_sql(self): + self._read_sql_iris() + + def test_read_sql_parameter(self): + self._read_sql_iris_parameter() + + def test_read_sql_named_parameter(self): + self._read_sql_iris_named_parameter() + + def test_to_sql(self): + self._to_sql() + + def test_to_sql_empty(self): + self._to_sql_empty() + + def test_to_sql_fail(self): + self._to_sql_fail() + + def test_to_sql_replace(self): + self._to_sql_replace() + + def test_to_sql_append(self): + self._to_sql_append() + + def test_to_sql_method_multi(self): + self._to_sql(method="multi") + + def test_to_sql_method_callable(self): + self._to_sql_method_callable() + + def test_create_table(self): + temp_conn = self.connect() + temp_frame = DataFrame( + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) + + pandasSQL = sql.SQLDatabase(temp_conn) + pandasSQL.to_sql(temp_frame, "temp_frame") + + assert temp_conn.has_table("temp_frame") + + def test_drop_table(self): + temp_conn = self.connect() + + temp_frame = DataFrame( + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) + + pandasSQL = sql.SQLDatabase(temp_conn) + pandasSQL.to_sql(temp_frame, "temp_frame") + + assert temp_conn.has_table("temp_frame") + + pandasSQL.drop_table("temp_frame") + + assert not temp_conn.has_table("temp_frame") + + def test_roundtrip(self): + self._roundtrip() + + def test_execute_sql(self): + self._execute_sql() + + def test_read_table(self): + iris_frame = sql.read_sql_table("iris", con=self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_read_table_columns(self): + iris_frame = sql.read_sql_table( + "iris", con=self.conn, columns=["SepalLength", "SepalLength"] + ) + tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + + def test_read_table_absent_raises(self): + msg = "Table this_doesnt_exist not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("this_doesnt_exist", con=self.conn) + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + assert issubclass(df.BoolCol.dtype.type, np.bool_) + + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + # Bool column with NA values becomes object + assert issubclass(df.BoolColWithNull.dtype.type, np.object) + + def test_bigint(self): + # int64 should be converted to BigInteger, GH7433 + df = DataFrame(data={"i64": [2 ** 62]}) + df.to_sql("test_bigint", self.conn, index=False) + result = 
sql.read_sql_table("test_bigint", self.conn) + + tm.assert_frame_equal(df, result) + + def test_default_date_load(self): + df = sql.read_sql_table("types_test_data", self.conn) + + # IMPORTANT - sqlite has no native date type, so shouldn't parse, but + # MySQL SHOULD be converted. + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + def test_datetime_with_timezone(self): + # edge case that converts postgresql datetime with time zone types + # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok + # but should be more natural, so coerce to datetime64[ns] for now + + def check(col): + # check that a column is either datetime64[ns] + # or datetime64[ns, UTC] + if is_datetime64_dtype(col.dtype): + + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + assert col[0] == Timestamp("2000-01-01 08:00:00") + + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + assert col[1] == Timestamp("2000-06-01 07:00:00") + + elif is_datetime64tz_dtype(col.dtype): + assert str(col.dt.tz) == "UTC" + + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + # GH 6415 + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] + expected = Series(expected_data, name=col.name) + tm.assert_series_equal(col, expected) + + else: + raise AssertionError( + f"DateCol loaded with incorrect type -> {col.dtype}" + ) + + # GH11216 + df = pd.read_sql_query("select * from types_test_data", self.conn) + if not hasattr(df, "DateColWithTz"): + pytest.skip("no column with datetime with time zone") + + # this is parsed on Travis (linux), but not on macosx for some reason + # even with the same versions of psycopg2 & sqlalchemy, possibly a + # Postgresql server version difference + col = df.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + + df = pd.read_sql_query( + "select * from types_test_data", self.conn, parse_dates=["DateColWithTz"] + ) + if not hasattr(df, "DateColWithTz"): + pytest.skip("no column with datetime with time zone") + col = df.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + assert str(col.dt.tz) == "UTC" + check(df.DateColWithTz) + + df = pd.concat( + list( + pd.read_sql_query( + "select * from types_test_data", self.conn, chunksize=1 + ) + ), + ignore_index=True, + ) + col = df.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + assert str(col.dt.tz) == "UTC" + expected = sql.read_sql_table("types_test_data", self.conn) + col = expected.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) + + # xref #7139 + # this might or might not be converted depending on the postgres driver + df = sql.read_sql_table("types_test_data", self.conn) + check(df.DateColWithTz) + + def test_datetime_with_timezone_roundtrip(self): + # GH 9086 + # Write datetimetz data to a db and read it back + # For dbs that support timestamps with timezones, should get back UTC + # otherwise naive data should be returned + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + expected.to_sql("test_datetime_tz", self.conn, index=False) + + if self.flavor == "postgresql": + # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC + expected["A"] = expected["A"].dt.tz_convert("UTC") + else: + # Otherwise, timestamps are returned as local, naive + expected["A"] = expected["A"].dt.tz_localize(None) + + result = sql.read_sql_table("test_datetime_tz", self.conn) + tm.assert_frame_equal(result, expected) + + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) + if self.flavor == "sqlite": + # read_sql_query does not return datetime type like read_sql_table + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, expected) + + def test_naive_datetimeindex_roundtrip(self): + # GH 23510 + # Ensure that a naive DatetimeIndex isn't converted to UTC + dates = date_range("2018-01-01", periods=5, freq="6H") + expected = DataFrame({"nums": range(5)}, index=dates) + expected.to_sql("foo_table", self.conn, index_label="info_date") + result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") + # result index with gain a name from a set_index operation; expected + tm.assert_frame_equal(result, expected, check_names=False) + + def test_date_parsing(self): + # No Parsing + df = sql.read_sql_table("types_test_data", self.conn) + expected_type = object if self.flavor == "sqlite" else np.datetime64 + assert issubclass(df.DateCol.dtype.type, expected_type) + + df = sql.read_sql_table("types_test_data", self.conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types_test_data", + self.conn, + parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates=["IntDateCol"] + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + def test_datetime(self): + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.to_sql("test_datetime", self.conn) + + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", self.conn) + result = result.drop("index", axis=1) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) + result = result.drop("index", axis=1) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + def test_datetime_NaT(self): + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + df.to_sql("test_datetime", self.conn, index=False) + + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", 
self.conn) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + def test_datetime_date(self): + # test support for datetime.date + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + df.to_sql("test_date", self.conn, index=False) + res = read_sql_table("test_date", self.conn) + result = res["a"] + expected = to_datetime(df["a"]) + # comes back as datetime64 + tm.assert_series_equal(result, expected) + + def test_datetime_time(self): + # test support for datetime.time + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + df.to_sql("test_time", self.conn, index=False) + res = read_sql_table("test_time", self.conn) + tm.assert_frame_equal(res, df) + + # GH8341 + # first, use the fallback to have the sqlite adapter put in place + sqlite_conn = TestSQLiteFallback.connect() + sql.to_sql(df, "test_time2", sqlite_conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) + ref = df.applymap(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) # check if adapter is in place + # then test if sqlalchemy is unaffected by the sqlite adapter + sql.to_sql(df, "test_time3", self.conn, index=False) + if self.flavor == "sqlite": + res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) + ref = df.applymap(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) + res = sql.read_sql_table("test_time3", self.conn) + tm.assert_frame_equal(df, res) + + def test_mixed_dtype_insert(self): + # see GH6509 + s1 = Series(2 ** 25 + 1, dtype=np.int32) + s2 = Series(0.0, dtype=np.float32) + df = DataFrame({"s1": s1, "s2": s2}) + + # write and read again + df.to_sql("test_read_write", self.conn, index=False) + df2 = sql.read_sql_table("test_read_write", self.conn) + + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) + + def test_nan_numeric(self): + # NaNs in numeric float column + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("test_nan", self.conn, index=False) + + # with read_table + result = sql.read_sql_table("test_nan", self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) + tm.assert_frame_equal(result, df) + + def test_nan_fullcolumn(self): + # full NaN column (numeric float column) + df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) + df.to_sql("test_nan", self.conn, index=False) + + # with read_table + result = sql.read_sql_table("test_nan", self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> not type info from table -> stays None + df["B"] = df["B"].astype("object") + df["B"] = None + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) + tm.assert_frame_equal(result, df) + + def test_nan_string(self): + # NaNs in string column + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) + df.to_sql("test_nan", self.conn, index=False) + + # NaNs are coming back as None + df.loc[2, "B"] = None + + # with read_table + result = sql.read_sql_table("test_nan", self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) + tm.assert_frame_equal(result, df) + + def _get_index_columns(self, tbl_name): + from sqlalchemy.engine import reflection + + insp = reflection.Inspector.from_engine(self.conn) + ixs = 
insp.get_indexes(tbl_name) + ixs = [i["column_names"] for i in ixs] + return ixs + + def test_to_sql_save_index(self): + self._to_sql_save_index() + + def test_transactions(self): + self._transaction_test() + + def test_get_schema_create_table(self): + # Use a dataframe without a bool column, since MySQL converts bool to + # TINYINT (which read_sql_table returns as an int and causes a dtype + # mismatch) + + self._load_test3_data() + tbl = "test_get_schema_create_table" + create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn) + blank_test_df = self.test_frame3.iloc[:0] + + self.drop_table(tbl) + self.conn.execute(create_sql) + returned_df = sql.read_sql_table(tbl, self.conn) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) + self.drop_table(tbl) + + def test_dtype(self): + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", self.conn, dtype={"B": sqlalchemy.TEXT}) + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + sqltype = meta.tables["dtype_test2"].columns["B"].type + assert isinstance(sqltype, sqlalchemy.TEXT) + msg = "The type of B is not a SQLAlchemy type" + with pytest.raises(ValueError, match=msg): + df.to_sql("error", self.conn, dtype={"B": str}) + + # GH9083 + df.to_sql("dtype_test3", self.conn, dtype={"B": sqlalchemy.String(10)}) + meta.reflect() + sqltype = meta.tables["dtype_test3"].columns["B"].type + assert isinstance(sqltype, sqlalchemy.String) + assert sqltype.length == 10 + + # single dtype + df.to_sql("single_dtype_test", self.conn, dtype=sqlalchemy.TEXT) + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type + assert isinstance(sqltypea, sqlalchemy.TEXT) + assert isinstance(sqltypeb, sqlalchemy.TEXT) + + def test_notna_dtype(self): + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) + + tbl = "notna_dtype_test" + df.to_sql(tbl, self.conn) + returned_df = sql.read_sql_table(tbl, self.conn) # noqa + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + if self.flavor == "mysql": + my_type = sqltypes.Integer + else: + my_type = sqltypes.Boolean + + col_dict = meta.tables[tbl].columns + + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, sqltypes.DateTime) + assert isinstance(col_dict["Int"].type, sqltypes.Integer) + assert isinstance(col_dict["Float"].type, sqltypes.Float) + + def test_double_precision(self): + V = 1.23456789101112131415 + + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), + } + ) + + df.to_sql( + "test_dtypes", + self.conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": sqlalchemy.Float(precision=23)}, + ) + res = sql.read_sql_table("test_dtypes", self.conn) + + # check precision of float64 + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) + + # check sql types + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert 
isinstance(col_dict["f32"].type, sqltypes.Float) + assert isinstance(col_dict["f64"].type, sqltypes.Float) + assert isinstance(col_dict["i32"].type, sqltypes.Integer) + assert isinstance(col_dict["i64"].type, sqltypes.BigInteger) + + def test_connectable_issue_example(self): + # This tests the example raised in issue + # https://github.com/pandas-dev/pandas/issues/10104 + + def foo(connection): + query = "SELECT test_foo_data FROM test_foo_data" + return sql.read_sql_query(query, con=connection) + + def bar(connection, data): + data.to_sql(name="test_foo_data", con=connection, if_exists="append") + + def main(connectable): + with connectable.connect() as conn: + with conn.begin(): + foo_data = conn.run_callable(foo) + conn.run_callable(bar, foo_data) + + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql("test_foo_data", self.conn) + main(self.conn) + + def test_temporary_table(self): + test_data = "Hello, World!" + expected = DataFrame({"spam": [test_data]}) + Base = declarative.declarative_base() + + class Temporary(Base): + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} + id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) + spam = sqlalchemy.Column(sqlalchemy.Unicode(30), nullable=False) + + Session = sa_session.sessionmaker(bind=self.conn) + session = Session() + with session.transaction: + conn = session.connection() + Temporary.__table__.create(conn) + session.add(Temporary(spam=test_data)) + session.flush() + df = sql.read_sql_query(sql=sqlalchemy.select([Temporary.spam]), con=conn) + + tm.assert_frame_equal(df, expected) + + +class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): + def test_transactions(self): + pytest.skip("Nested transactions rollbacks don't work with Pandas") + + +class _TestSQLiteAlchemy: + """ + Test the sqlalchemy backend against an in-memory sqlite database. + + """ + + flavor = "sqlite" + + @classmethod + def connect(cls): + return sqlalchemy.create_engine("sqlite:///:memory:") + + @classmethod + def setup_driver(cls): + # sqlite3 is built-in + cls.driver = None + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + + # sqlite has no boolean type, so integer type is returned + assert issubclass(df.BoolCol.dtype.type, np.integer) + + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + + # Non-native Bool column with NA values stays as float + assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + + def test_default_date_load(self): + df = sql.read_sql_table("types_test_data", self.conn) + + # IMPORTANT - sqlite has no native date type, so shouldn't parse, but + assert not issubclass(df.DateCol.dtype.type, np.datetime64) + + def test_bigint_warning(self): + # test no warning for BIGINT (to support int64) is raised (GH7433) + df = DataFrame({"a": [1, 2]}, dtype="int64") + df.to_sql("test_bigintwarning", self.conn, index=False) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + sql.read_sql_table("test_bigintwarning", self.conn) + assert len(w) == 0 + + +class _TestMySQLAlchemy: + """ + Test the sqlalchemy backend against an MySQL database. 
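+    Requires a reachable MySQL server with a ``pandas_nosetest`` database
+    for root@localhost and the pymysql driver; otherwise these tests are
+    skipped.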
+ + """ + + flavor = "mysql" + + @classmethod + def connect(cls): + url = "mysql+{driver}://root@localhost/pandas_nosetest" + return sqlalchemy.create_engine( + url.format(driver=cls.driver), connect_args=cls.connect_args + ) + + @classmethod + def setup_driver(cls): + pymysql = pytest.importorskip("pymysql") + cls.driver = "pymysql" + cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + + # MySQL has no real BOOL type (it's an alias for TINYINT) + assert issubclass(df.BoolCol.dtype.type, np.integer) + + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + + # Bool column with NA = int column with NA values => becomes float + assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + + def test_read_procedure(self): + import pymysql + + # see GH7324. Although it is more an api test, it is added to the + # mysql tests as sqlite does not have stored procedures + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df.to_sql("test_procedure", self.conn, index=False) + + proc = """DROP PROCEDURE IF EXISTS get_testdb; + + CREATE PROCEDURE get_testdb () + + BEGIN + SELECT * FROM test_procedure; + END""" + + connection = self.conn.connect() + trans = connection.begin() + try: + r1 = connection.execute(proc) # noqa + trans.commit() + except pymysql.Error: + trans.rollback() + raise + + res1 = sql.read_sql_query("CALL get_testdb();", self.conn) + tm.assert_frame_equal(df, res1) + + # test delegation to read_sql_query + res2 = sql.read_sql("CALL get_testdb();", self.conn) + tm.assert_frame_equal(df, res2) + + +class _TestPostgreSQLAlchemy: + """ + Test the sqlalchemy backend against an PostgreSQL database. 
+ + """ + + flavor = "postgresql" + + @classmethod + def connect(cls): + url = "postgresql+{driver}://postgres@localhost/pandas_nosetest" + return sqlalchemy.create_engine(url.format(driver=cls.driver)) + + @classmethod + def setup_driver(cls): + pytest.importorskip("psycopg2") + cls.driver = "psycopg2" + + def test_schema_support(self): + # only test this for postgresql (schema's not supported in + # mysql/sqlite) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + + # create a schema + self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") + self.conn.execute("CREATE SCHEMA other;") + + # write dataframe to different schema's + df.to_sql("test_schema_public", self.conn, index=False) + df.to_sql( + "test_schema_public_explicit", self.conn, index=False, schema="public" + ) + df.to_sql("test_schema_other", self.conn, index=False, schema="other") + + # read dataframes back in + res1 = sql.read_sql_table("test_schema_public", self.conn) + tm.assert_frame_equal(df, res1) + res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) + tm.assert_frame_equal(df, res2) + res3 = sql.read_sql_table( + "test_schema_public_explicit", self.conn, schema="public" + ) + tm.assert_frame_equal(df, res3) + res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") + tm.assert_frame_equal(df, res4) + msg = "Table test_schema_other not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("test_schema_other", self.conn, schema="public") + + # different if_exists options + + # create a schema + self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") + self.conn.execute("CREATE SCHEMA other;") + + # write dataframe with different if_exists options + df.to_sql("test_schema_other", self.conn, schema="other", index=False) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="replace", + ) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="append", + ) + res = sql.read_sql_table("test_schema_other", self.conn, schema="other") + tm.assert_frame_equal(concat([df, df], ignore_index=True), res) + + # specifying schema in user-provided meta + + # The schema won't be applied on another Connection + # because of transactional schemas + if isinstance(self.conn, sqlalchemy.engine.Engine): + engine2 = self.connect() + meta = sqlalchemy.MetaData(engine2, schema="other") + pdsql = sql.SQLDatabase(engine2, meta=meta) + pdsql.to_sql(df, "test_schema_other2", index=False) + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") + res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") + res2 = pdsql.read_table("test_schema_other2") + tm.assert_frame_equal(res1, res2) + + def test_copy_from_callable_insertion_method(self): + # GH 8953 + # Example in io.rst found under _io.sql.method + # not available in sqlite, mysql + def psql_insert_copy(table, conn, keys, data_iter): + # gets a DBAPI connection that can provide a cursor + dbapi_conn = conn.connection + with dbapi_conn.cursor() as cur: + s_buf = StringIO() + writer = csv.writer(s_buf) + writer.writerows(data_iter) + s_buf.seek(0) + + columns = ", ".join(f'"{k}"' for k in keys) + if table.schema: + table_name = f"{table.schema}.{table.name}" + else: + table_name = table.name + + sql_query = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV" + cur.copy_expert(sql=sql_query, file=s_buf) + + expected = 
DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + expected.to_sql( + "test_copy_insert", self.conn, index=False, method=psql_insert_copy + ) + result = sql.read_sql_table("test_copy_insert", self.conn) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.single +@pytest.mark.db +class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy): + pass + + +@pytest.mark.single +@pytest.mark.db +class TestMySQLAlchemyConn(_TestMySQLAlchemy, _TestSQLAlchemyConn): + pass + + +@pytest.mark.single +@pytest.mark.db +class TestPostgreSQLAlchemy(_TestPostgreSQLAlchemy, _TestSQLAlchemy): + pass + + +@pytest.mark.single +@pytest.mark.db +class TestPostgreSQLAlchemyConn(_TestPostgreSQLAlchemy, _TestSQLAlchemyConn): + pass + + +@pytest.mark.single +class TestSQLiteAlchemy(_TestSQLiteAlchemy, _TestSQLAlchemy): + pass + + +@pytest.mark.single +class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn): + pass + + +# ----------------------------------------------------------------------------- +# -- Test Sqlite / MySQL fallback + + +@pytest.mark.single +class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): + """ + Test the fallback mode against an in-memory sqlite database. + + """ + + flavor = "sqlite" + + @classmethod + def connect(cls): + return sqlite3.connect(":memory:") + + def setup_connect(self): + self.conn = self.connect() + + def load_test_data_and_sql(self): + self.pandasSQL = sql.SQLiteDatabase(self.conn) + self._load_test1_data() + + @pytest.fixture(autouse=True) + def setup_method(self, load_iris_data): + self.load_test_data_and_sql() + + def test_read_sql(self): + self._read_sql_iris() + + def test_read_sql_parameter(self): + self._read_sql_iris_parameter() + + def test_read_sql_named_parameter(self): + self._read_sql_iris_named_parameter() + + def test_to_sql(self): + self._to_sql() + + def test_to_sql_empty(self): + self._to_sql_empty() + + def test_to_sql_fail(self): + self._to_sql_fail() + + def test_to_sql_replace(self): + self._to_sql_replace() + + def test_to_sql_append(self): + self._to_sql_append() + + def test_create_and_drop_table(self): + temp_frame = DataFrame( + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) + + self.pandasSQL.to_sql(temp_frame, "drop_test_frame") + + assert self.pandasSQL.has_table("drop_test_frame") + + self.pandasSQL.drop_table("drop_test_frame") + + assert not self.pandasSQL.has_table("drop_test_frame") + + def test_roundtrip(self): + self._roundtrip() + + def test_execute_sql(self): + self._execute_sql() + + def test_datetime_date(self): + # test support for datetime.date + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + df.to_sql("test_date", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_date", self.conn) + if self.flavor == "sqlite": + # comes back as strings + tm.assert_frame_equal(res, df.astype(str)) + elif self.flavor == "mysql": + tm.assert_frame_equal(res, df) + + def test_datetime_time(self): + # test support for datetime.time, GH #8341 + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + df.to_sql("test_time", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_time", self.conn) + if self.flavor == "sqlite": + # comes back as strings + expected = df.applymap(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(res, expected) + + def _get_index_columns(self, tbl_name): + ixs = sql.read_sql_query( + "SELECT * FROM sqlite_master WHERE type = 'index' " + + f"AND tbl_name = '{tbl_name}'", + self.conn, + ) + ix_cols = [] + for 
ix_name in ixs.name: + ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) + ix_cols.append(ix_info.name.tolist()) + return ix_cols + + def test_to_sql_save_index(self): + self._to_sql_save_index() + + def test_transactions(self): + self._transaction_test() + + def _get_sqlite_column_type(self, table, column): + recs = self.conn.execute(f"PRAGMA table_info({table})") + for cid, name, ctype, not_null, default, pk in recs: + if name == column: + return ctype + raise ValueError(f"Table {table}, column {column} not found") + + def test_dtype(self): + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", self.conn, dtype={"B": "STRING"}) + + # sqlite stores Boolean values as INTEGER + assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" + + assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" + msg = r"B \(<class 'bool'>\) not a string" + with pytest.raises(ValueError, match=msg): + df.to_sql("error", self.conn, dtype={"B": bool}) + + # single dtype + df.to_sql("single_dtype_test", self.conn, dtype="STRING") + assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" + assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" + + def test_notna_dtype(self): + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) + + tbl = "notna_dtype_test" + df.to_sql(tbl, self.conn) + + assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" + assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Float") == "REAL" + + def test_illegal_names(self): + # For sqlite, these should work fine + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + + msg = "Empty table or column name specified" + with pytest.raises(ValueError, match=msg): + df.to_sql("", self.conn) + + for ndx, weird_name in enumerate( + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): + df.to_sql(weird_name, self.conn) + sql.table_exists(weird_name, self.conn) + + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = f"test_weird_col_name{ndx:d}" + df2.to_sql(c_tbl, self.conn) + sql.table_exists(c_tbl, self.conn) + + +# ----------------------------------------------------------------------------- +# -- Old tests from 0.13.1 (before refactor using sqlalchemy) + + +def date_format(dt): + """Returns date in YYYYMMDD format.""" + return dt.strftime("%Y%m%d") + + +_formatters = { + datetime: "'{}'".format, + str: "'{}'".format, + np.str_: "'{}'".format, + bytes: "'{}'".format, + float: "{:.8f}".format, + int: "{:d}".format, + type(None): lambda x: "NULL", + np.float64: "{:.10f}".format, + bool: "'{!s}'".format, +} + + +def format_query(sql, *args): + """ + + """ + processed_args = [] + for arg in args: + if isinstance(arg, float) and isna(arg): + arg = None + + formatter = _formatters[type(arg)] + processed_args.append(formatter(arg)) + + return sql % tuple(processed_args) + + +def
tquery(query, con=None, cur=None): + """Replace removed sql.tquery function""" + res = sql.execute(query, con=con, cur=cur).fetchall() + if res is None: + return None + else: + return list(res) + + +@pytest.mark.single +class TestXSQLite(SQLiteMixIn): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function + self.conn = sqlite3.connect(":memory:") + + # In some test cases we may close db connection + # Re-open conn here so we can perform cleanup in teardown + yield + self.method = request.function + self.conn = sqlite3.connect(":memory:") + + def test_basic(self): + frame = tm.makeTimeDataFrame() + self._check_roundtrip(frame) + + def test_write_row_by_row(self): + + frame = tm.makeTimeDataFrame() + frame.iloc[0, 0] = np.nan + create_sql = sql.get_schema(frame, "test") + cur = self.conn.cursor() + cur.execute(create_sql) + + cur = self.conn.cursor() + + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for idx, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + tquery(fmt_sql, cur=cur) + + self.conn.commit() + + result = sql.read_sql("select * from test", con=self.conn) + result.index = frame.index + tm.assert_frame_equal(result, frame, check_less_precise=True) + + def test_execute(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + cur = self.conn.cursor() + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (?, ?, ?, ?)" + + row = frame.iloc[0] + sql.execute(ins, self.conn, params=tuple(row)) + self.conn.commit() + + result = sql.read_sql("select * from test", self.conn) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + def test_schema(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + lines = create_sql.splitlines() + for l in lines: + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" + + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) + lines = create_sql.splitlines() + assert 'PRIMARY KEY ("A", "B")' in create_sql + cur = self.conn.cursor() + cur.execute(create_sql) + + def test_execute_fail(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = self.conn.cursor() + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn) + sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.conn) + + with pytest.raises(Exception): + sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) + + def test_execute_closed_connection(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = self.conn.cursor() + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn) + self.conn.close() + + with pytest.raises(Exception): + tquery("select * from test", con=self.conn) + + def test_na_roundtrip(self): + pass + + def _check_roundtrip(self, frame): + sql.to_sql(frame, name="test_table", con=self.conn, index=False) + result = sql.read_sql("select * from test_table", self.conn) + + # HACK! Change this once indexes are handled properly. 
+ result.index = frame.index + + expected = frame + tm.assert_frame_equal(result, expected) + + frame["txt"] = ["a"] * len(frame) + frame2 = frame.copy() + new_idx = Index(np.arange(len(frame2))) + 10 + frame2["Idx"] = new_idx.copy() + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") + expected = frame.copy() + expected.index = new_idx + expected.index.name = "Idx" + tm.assert_frame_equal(expected, result) + + def test_keyword_as_column_names(self): + df = DataFrame({"From": np.ones(5)}) + sql.to_sql(df, con=self.conn, name="testkeywords", index=False) + + def test_onecolumn_of_integer(self): + # GH 3628 + # a column_of_integers dataframe should transfer well to sql + + mono_df = DataFrame([1, 2], columns=["c0"]) + sql.to_sql(mono_df, con=self.conn, name="mono_df", index=False) + # computing the sum via sql + con_x = self.conn + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) + # it should not fail, and gives 3 ( Issue #3628 ) + assert the_sum == 3 + + result = sql.read_sql("select * from mono_df", con_x) + tm.assert_frame_equal(result, mono_df) + + def test_if_exists(self): + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" + sql_select = f"SELECT * FROM {table_name}" + + def clean_up(test_table_to_drop): + """ + Drops tables created from individual tests + so no dependencies arise from sequential tests + """ + self.drop_table(test_table_to_drop) + + msg = "'notvalidvalue' is not valid for if_exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) + clean_up(table_name) + + # test if_exists='fail' + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) + msg = "Table 'table_if_exists' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) + # test if_exists='replace' + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] + clean_up(table_name) + + # test if_exists='append' + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] + clean_up(table_name) + + +@pytest.mark.single +@pytest.mark.db +@pytest.mark.skip( + reason="gh-13611: there is no support for MySQL if SQLAlchemy is not installed" +) +class TestXMySQL(MySQLMixIn): + @pytest.fixture(autouse=True, scope="class") + def setup_class(cls): + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + try: + pymysql.connect(read_default_group="pandas") + except pymysql.ProgrammingError: + raise RuntimeError( + "Create a group of 
connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) + except pymysql.Error: + raise RuntimeError( + "Cannot connect to database. " + "Create a group of connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) + + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + try: + pymysql.connect(read_default_group="pandas") + except pymysql.ProgrammingError: + raise RuntimeError( + "Create a group of connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) + except pymysql.Error: + raise RuntimeError( + "Cannot connect to database. " + "Create a group of connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) + + self.method = request.function + + def test_basic(self): + frame = tm.makeTimeDataFrame() + self._check_roundtrip(frame) + + def test_write_row_by_row(self): + frame = tm.makeTimeDataFrame() + frame.iloc[0, 0] = np.nan + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, "test") + cur = self.conn.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for idx, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + tquery(fmt_sql, cur=cur) + + self.conn.commit() + + result = sql.read_sql("select * from test", con=self.conn) + result.index = frame.index + tm.assert_frame_equal(result, frame, check_less_precise=True) + + def test_chunksize_read_type(self): + frame = tm.makeTimeDataFrame() + frame.index.name = "index" + drop_sql = "DROP TABLE IF EXISTS test" + cur = self.conn.cursor() + cur.execute(drop_sql) + sql.to_sql(frame, name="test", con=self.conn) + query = "select * from test" + chunksize = 5 + chunk_gen = pd.read_sql_query( + sql=query, con=self.conn, chunksize=chunksize, index_col="index" + ) + chunk_df = next(chunk_gen) + tm.assert_frame_equal(frame[:chunksize], chunk_df) + + def test_execute(self): + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, "test") + cur = self.conn.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + + row = frame.iloc[0].values.tolist() + sql.execute(ins, self.conn, params=tuple(row)) + self.conn.commit() + + result = sql.read_sql("select * from test", self.conn) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + def test_schema(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + lines = create_sql.splitlines() + for l in lines: + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" + + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) + lines = create_sql.splitlines() + assert "PRIMARY KEY (`A`, `B`)" in create_sql + cur = self.conn.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + def test_execute_fail(self): + drop_sql = "DROP TABLE IF EXISTS 
test" + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a(5), b(5)) + ); + """ + cur = self.conn.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn) + sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.conn) + + with pytest.raises(Exception): + sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) + + def test_execute_closed_connection(self, request, datapath): + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a(5), b(5)) + ); + """ + cur = self.conn.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.conn) + self.conn.close() + + with pytest.raises(Exception): + tquery("select * from test", con=self.conn) + + # Initialize connection again (needed for tearDown) + self.setup_method(request, datapath) + + def test_na_roundtrip(self): + pass + + def _check_roundtrip(self, frame): + drop_sql = "DROP TABLE IF EXISTS test_table" + cur = self.conn.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + sql.to_sql(frame, name="test_table", con=self.conn, index=False) + result = sql.read_sql("select * from test_table", self.conn) + + # HACK! Change this once indexes are handled properly. + result.index = frame.index + result.index.name = frame.index.name + + expected = frame + tm.assert_frame_equal(result, expected) + + frame["txt"] = ["a"] * len(frame) + frame2 = frame.copy() + index = Index(np.arange(len(frame2))) + 10 + frame2["Idx"] = index + drop_sql = "DROP TABLE IF EXISTS test_table2" + cur = self.conn.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") + expected = frame.copy() + + # HACK! Change this once indexes are handled properly. 
+ expected.index = index + expected.index.names = result.index.names + tm.assert_frame_equal(expected, result) + + def test_keyword_as_column_names(self): + df = DataFrame({"From": np.ones(5)}) + sql.to_sql( + df, con=self.conn, name="testkeywords", if_exists="replace", index=False + ) + + def test_if_exists(self): + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" + sql_select = f"SELECT * FROM {table_name}" + + def clean_up(test_table_to_drop): + """ + Drops tables created from individual tests + so no dependencies arise from sequential tests + """ + self.drop_table(test_table_to_drop) + + # test if invalid value for if_exists raises appropriate error + with pytest.raises(ValueError, match=""): + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) + clean_up(table_name) + + # test if_exists='fail' + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + with pytest.raises(ValueError, match=""): + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) + + # test if_exists='replace' + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] + clean_up(table_name) + + # test if_exists='append' + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] + clean_up(table_name) diff --git a/venv/Lib/site-packages/pandas/tests/io/test_stata.py b/venv/Lib/site-packages/pandas/tests/io/test_stata.py new file mode 100644 index 0000000..8e459f0 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/io/test_stata.py @@ -0,0 +1,1818 @@ +import datetime as dt +from datetime import datetime +import gzip +import io +import os +import struct +import warnings + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_categorical_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.frame import DataFrame, Series + +from pandas.io.parsers import read_csv +from pandas.io.stata import ( + InvalidColumnName, + PossiblePrecisionLoss, + StataMissingValue, + StataReader, + StataWriterUTF8, + read_stata, +) + + +@pytest.fixture() +def mixed_frame(): + return pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [1.0, 3.0, 27.0, 81.0], + "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], + } + ) + + +@pytest.fixture +def dirpath(datapath): + return datapath("io", "data", "stata") + + +@pytest.fixture +def parsed_114(dirpath): + dta14_114 = os.path.join(dirpath, "stata5_114.dta") + parsed_114 = read_stata(dta14_114, convert_dates=True) + parsed_114.index.name = "index" + return parsed_114 + + +class TestStata: + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data", "stata") + self.dta1_114 = 
os.path.join(self.dirpath, "stata1_114.dta") + self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta") + + self.dta2_113 = os.path.join(self.dirpath, "stata2_113.dta") + self.dta2_114 = os.path.join(self.dirpath, "stata2_114.dta") + self.dta2_115 = os.path.join(self.dirpath, "stata2_115.dta") + self.dta2_117 = os.path.join(self.dirpath, "stata2_117.dta") + + self.dta3_113 = os.path.join(self.dirpath, "stata3_113.dta") + self.dta3_114 = os.path.join(self.dirpath, "stata3_114.dta") + self.dta3_115 = os.path.join(self.dirpath, "stata3_115.dta") + self.dta3_117 = os.path.join(self.dirpath, "stata3_117.dta") + self.csv3 = os.path.join(self.dirpath, "stata3.csv") + + self.dta4_113 = os.path.join(self.dirpath, "stata4_113.dta") + self.dta4_114 = os.path.join(self.dirpath, "stata4_114.dta") + self.dta4_115 = os.path.join(self.dirpath, "stata4_115.dta") + self.dta4_117 = os.path.join(self.dirpath, "stata4_117.dta") + + self.dta_encoding = os.path.join(self.dirpath, "stata1_encoding.dta") + self.dta_encoding_118 = os.path.join(self.dirpath, "stata1_encoding_118.dta") + + self.csv14 = os.path.join(self.dirpath, "stata5.csv") + self.dta14_113 = os.path.join(self.dirpath, "stata5_113.dta") + self.dta14_114 = os.path.join(self.dirpath, "stata5_114.dta") + self.dta14_115 = os.path.join(self.dirpath, "stata5_115.dta") + self.dta14_117 = os.path.join(self.dirpath, "stata5_117.dta") + + self.csv15 = os.path.join(self.dirpath, "stata6.csv") + self.dta15_113 = os.path.join(self.dirpath, "stata6_113.dta") + self.dta15_114 = os.path.join(self.dirpath, "stata6_114.dta") + self.dta15_115 = os.path.join(self.dirpath, "stata6_115.dta") + self.dta15_117 = os.path.join(self.dirpath, "stata6_117.dta") + + self.dta16_115 = os.path.join(self.dirpath, "stata7_115.dta") + self.dta16_117 = os.path.join(self.dirpath, "stata7_117.dta") + + self.dta17_113 = os.path.join(self.dirpath, "stata8_113.dta") + self.dta17_115 = os.path.join(self.dirpath, "stata8_115.dta") + self.dta17_117 = os.path.join(self.dirpath, "stata8_117.dta") + + self.dta18_115 = os.path.join(self.dirpath, "stata9_115.dta") + self.dta18_117 = os.path.join(self.dirpath, "stata9_117.dta") + + self.dta19_115 = os.path.join(self.dirpath, "stata10_115.dta") + self.dta19_117 = os.path.join(self.dirpath, "stata10_117.dta") + + self.dta20_115 = os.path.join(self.dirpath, "stata11_115.dta") + self.dta20_117 = os.path.join(self.dirpath, "stata11_117.dta") + + self.dta21_117 = os.path.join(self.dirpath, "stata12_117.dta") + + self.dta22_118 = os.path.join(self.dirpath, "stata14_118.dta") + self.dta23 = os.path.join(self.dirpath, "stata15.dta") + + self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta") + self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta") + + self.dta26_119 = os.path.join(self.dirpath, "stata1_119.dta.gz") + + self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta") + + def read_dta(self, file): + # Legacy default reader configuration + return read_stata(file, convert_dates=True) + + def read_csv(self, file): + return read_csv(file, parse_dates=True) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_read_empty_dta(self, version): + empty_ds = DataFrame(columns=["unit"]) + # GH 7369, make sure can read a 0-obs dta file + with tm.ensure_clean() as path: + empty_ds.to_stata(path, write_index=False, version=version) + empty_ds2 = read_stata(path) + tm.assert_frame_equal(empty_ds, empty_ds2) + + @pytest.mark.parametrize("file", ["dta1_114", "dta1_117"]) + def test_read_dta1(self, file): + + file = 
getattr(self, file) + parsed = self.read_dta(file) + + # Pandas uses np.nan as missing value. + # Thus, all columns will be of type float, regardless of their name. + expected = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) + + # this is an oddity as really the nan should be float64, but + # the casting doesn't fail so need to match stata here + expected["float_miss"] = expected["float_miss"].astype(np.float32) + + tm.assert_frame_equal(parsed, expected) + + def test_read_dta2(self): + + expected = DataFrame.from_records( + [ + ( + datetime(2006, 11, 19, 23, 13, 20), + 1479596223000, + datetime(2010, 1, 20), + datetime(2010, 1, 8), + datetime(2010, 1, 1), + datetime(1974, 7, 1), + datetime(2010, 1, 1), + datetime(2010, 1, 1), + ), + ( + datetime(1959, 12, 31, 20, 3, 20), + -1479590, + datetime(1953, 10, 2), + datetime(1948, 6, 10), + datetime(1955, 1, 1), + datetime(1955, 7, 1), + datetime(1955, 1, 1), + datetime(2, 1, 1), + ), + (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT), + ], + columns=[ + "datetime_c", + "datetime_big_c", + "date", + "weekly_date", + "monthly_date", + "quarterly_date", + "half_yearly_date", + "yearly_date", + ], + ) + expected["yearly_date"] = expected["yearly_date"].astype("O") + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + parsed_114 = self.read_dta(self.dta2_114) + parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due to limits of date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + + # Remove resource warnings + w = [x for x in w if x.category is UserWarning] + + # should get warning for each call to read_dta + assert len(w) == 3 + + # buggy test because of the NaT comparison on certain platforms + # Format 113 test fails since it does not support tc and tC formats + # tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + + @pytest.mark.parametrize("file", ["dta3_113", "dta3_114", "dta3_115", "dta3_117"]) + def test_read_dta3(self, file): + + file = getattr(self, file) + parsed = self.read_dta(file) + + # match stata here + expected = self.read_csv(self.csv3) + expected = expected.astype(np.float32) + expected["year"] = expected["year"].astype(np.int16) + expected["quarter"] = expected["quarter"].astype(np.int8) + + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize("file", ["dta4_113", "dta4_114", "dta4_115", "dta4_117"]) + def test_read_dta4(self, file): + + file = getattr(self, file) + parsed = self.read_dta(file) + + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"], + ], + columns=[ + "fully_labeled", + "fully_labeled2", + "incompletely_labeled", + "labeled_with_missings", + "float_labelled", + ], + ) + + # these are all categoricals + expected = pd.concat( + [expected[col].astype("category") for col in 
expected], axis=1 + ) + + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed, expected, check_categorical=False) + + # File containing strls + def test_read_dta12(self): + parsed_117 = self.read_dta(self.dta21_117) + expected = DataFrame.from_records( + [ + [1, "abc", "abcdefghi"], + [3, "cba", "qwertywertyqwerty"], + [93, "", "strl"], + ], + columns=["x", "y", "z"], + ) + + tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + + def test_read_dta18(self): + parsed_118 = self.read_dta(self.dta22_118) + parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") + expected = DataFrame.from_records( + [ + ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0], + ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan], + ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0], + ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4], + ["", "", "", 0, 0.3332999, "option a", 1 / 3.0], + ], + columns=[ + "Things", + "Cities", + "Unicode_Cities_Strl", + "Ints", + "Floats", + "Bytes", + "Longs", + ], + ) + expected["Floats"] = expected["Floats"].astype(np.float32) + for col in parsed_118.columns: + tm.assert_almost_equal(parsed_118[col], expected[col]) + + with StataReader(self.dta22_118) as rdr: + vl = rdr.variable_labels() + vl_expected = { + "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", + "Longs": "long data", + "Things": "Here are some things", + "Bytes": "byte data", + "Ints": "int data", + "Cities": "Here are some cities", + "Floats": "float data", + } + tm.assert_dict_equal(vl, vl_expected) + + assert rdr.data_label == "This is a Ünicode data label" + + def test_read_write_dta5(self): + original = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) + original.index.name = "index" + + with tm.ensure_clean() as path: + original.to_stata(path, None) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) + + def test_write_dta6(self): + original = self.read_csv(self.csv3) + original.index.name = "index" + original.index = original.index.astype(np.int32) + original["year"] = original["year"].astype(np.int32) + original["quarter"] = original["quarter"].astype(np.int32) + + with tm.ensure_clean() as path: + original.to_stata(path, None) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_read_write_dta10(self, version): + original = DataFrame( + data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], + columns=["string", "object", "integer", "floating", "datetime"], + ) + original["object"] = Series(original["object"], dtype=object) + original.index.name = "index" + original.index = original.index.astype(np.int32) + original["integer"] = original["integer"].astype(np.int32) + + with tm.ensure_clean() as path: + original.to_stata(path, {"datetime": "tc"}, version=version) + written_and_read_again = self.read_dta(path) + # original.index is np.int32, read index is np.int64 + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) + + def test_stata_doc_examples(self): + with tm.ensure_clean() as path: + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.to_stata(path) + + def test_write_preserves_original(self): + # 9795 + np.random.seed(423) 
+ df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + df.loc[2, "a":"c"] = np.nan + df_copy = df.copy() + with tm.ensure_clean() as path: + df.to_stata(path, write_index=False) + tm.assert_frame_equal(df, df_copy) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_encoding(self, version): + + # GH 4626, proper encoding handling + raw = read_stata(self.dta_encoding) + encoded = read_stata(self.dta_encoding) + result = encoded.kreis1849[0] + + expected = raw.kreis1849[0] + assert result == expected + assert isinstance(result, str) + + with tm.ensure_clean() as path: + encoded.to_stata(path, write_index=False, version=version) + reread_encoded = read_stata(path) + tm.assert_frame_equal(encoded, reread_encoded) + + def test_read_write_dta11(self): + original = DataFrame( + [(1, 2, 3, 4)], + columns=[ + "good", + "b\u00E4d", + "8number", + "astringwithmorethan32characters______", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4)], + columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"], + ) + formatted.index.name = "index" + formatted = formatted.astype(np.int32) + + with tm.ensure_clean() as path: + with tm.assert_produces_warning(pd.io.stata.InvalidColumnName): + original.to_stata(path, None) + + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_read_write_dta12(self, version): + original = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_1", + "astringwithmorethan32characters_2", + "+", + "-", + "short", + "delete", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_", + "_0astringwithmorethan32character", + "_", + "_1_", + "_short", + "_delete", + ], + ) + formatted.index.name = "index" + formatted = formatted.astype(np.int32) + + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always", InvalidColumnName) + original.to_stata(path, None, version=version) + # should get a warning for that format. 
+ assert len(w) == 1 + + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) + + def test_read_write_dta13(self): + s1 = Series(2 ** 9, dtype=np.int16) + s2 = Series(2 ** 17, dtype=np.int32) + s3 = Series(2 ** 33, dtype=np.int64) + original = DataFrame({"int16": s1, "int32": s2, "int64": s3}) + original.index.name = "index" + + formatted = original + formatted["int64"] = formatted["int64"].astype(np.float64) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + @pytest.mark.parametrize( + "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] + ) + def test_read_write_reread_dta14(self, file, parsed_114, version): + file = getattr(self, file) + parsed = self.read_dta(file) + parsed.index.name = "index" + + expected = self.read_csv(self.csv14) + cols = ["byte_", "int_", "long_", "float_", "double_"] + for col in cols: + expected[col] = expected[col]._convert(datetime=True, numeric=True) + expected["float_"] = expected["float_"].astype(np.float32) + expected["date_td"] = pd.to_datetime(expected["date_td"], errors="coerce") + + tm.assert_frame_equal(parsed_114, parsed) + + with tm.ensure_clean() as path: + parsed_114.to_stata(path, {"date_td": "td"}, version=version) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), parsed_114) + + @pytest.mark.parametrize( + "file", ["dta15_113", "dta15_114", "dta15_115", "dta15_117"] + ) + def test_read_write_reread_dta15(self, file): + + expected = self.read_csv(self.csv15) + expected["byte_"] = expected["byte_"].astype(np.int8) + expected["int_"] = expected["int_"].astype(np.int16) + expected["long_"] = expected["long_"].astype(np.int32) + expected["float_"] = expected["float_"].astype(np.float32) + expected["double_"] = expected["double_"].astype(np.float64) + expected["date_td"] = expected["date_td"].apply( + datetime.strptime, args=("%Y-%m-%d",) + ) + + file = getattr(self, file) + parsed = self.read_dta(file) + + tm.assert_frame_equal(expected, parsed) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_timestamp_and_label(self, version): + original = DataFrame([(1,)], columns=["variable"]) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = "This is a data file." + with tm.ensure_clean() as path: + original.to_stata( + path, time_stamp=time_stamp, data_label=data_label, version=version + ) + + with StataReader(path) as reader: + assert reader.time_stamp == "29 Feb 2000 14:21" + assert reader.data_label == data_label + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_invalid_timestamp(self, version): + original = DataFrame([(1,)], columns=["variable"]) + time_stamp = "01 Jan 2000, 00:00:00" + with tm.ensure_clean() as path: + msg = "time_stamp should be datetime type" + with pytest.raises(ValueError, match=msg): + original.to_stata(path, time_stamp=time_stamp, version=version) + + def test_numeric_column_names(self): + original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) + original.index.name = "index" + with tm.ensure_clean() as path: + # should get a warning for that format. 
+ with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path) + + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index("index") + columns = list(written_and_read_again.columns) + convert_col_name = lambda x: int(x[1]) + written_and_read_again.columns = map(convert_col_name, columns) + tm.assert_frame_equal(original, written_and_read_again) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_nan_to_missing_value(self, version): + s1 = Series(np.arange(4.0), dtype=np.float32) + s2 = Series(np.arange(4.0), dtype=np.float64) + s1[::2] = np.nan + s2[1::2] = np.nan + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name = "index" + with tm.ensure_clean() as path: + original.to_stata(path, version=version) + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index("index") + tm.assert_frame_equal(written_and_read_again, original) + + def test_no_index(self): + columns = ["x", "y"] + original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns) + original.index.name = "index_not_written" + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + written_and_read_again = self.read_dta(path) + with pytest.raises(KeyError, match=original.index.name): + written_and_read_again["index_not_written"] + + def test_string_no_dates(self): + s1 = Series(["a", "A longer string"]) + s2 = Series([1.0, 2.0], dtype=np.float64) + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name = "index" + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) + + def test_large_value_conversion(self): + s0 = Series([1, 99], dtype=np.int8) + s1 = Series([1, 127], dtype=np.int8) + s2 = Series([1, 2 ** 15 - 1], dtype=np.int16) + s3 = Series([1, 2 ** 63 - 1], dtype=np.int64) + original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) + original.index.name = "index" + with tm.ensure_clean() as path: + with tm.assert_produces_warning(PossiblePrecisionLoss): + original.to_stata(path) + + written_and_read_again = self.read_dta(path) + modified = original.copy() + modified["s1"] = Series(modified["s1"], dtype=np.int16) + modified["s2"] = Series(modified["s2"], dtype=np.int32) + modified["s3"] = Series(modified["s3"], dtype=np.float64) + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) + + def test_dates_invalid_column(self): + original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) + original.index.name = "index" + with tm.ensure_clean() as path: + with tm.assert_produces_warning(InvalidColumnName): + original.to_stata(path, {0: "tc"}) + + written_and_read_again = self.read_dta(path) + modified = original.copy() + modified.columns = ["_0"] + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) + + def test_105(self): + # Data obtained from: + # http://go.worldbank.org/ZXY29PVJ21 + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") + df = pd.read_stata(dpath) + df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]] + df0 = pd.DataFrame(df0) + df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"] + df0["clustnum"] = df0["clustnum"].astype(np.int16) + df0["pri_schl"] = df0["pri_schl"].astype(np.int8) + df0["psch_num"] = df0["psch_num"].astype(np.int8) + df0["psch_dis"] = df0["psch_dis"].astype(np.float32) + tm.assert_frame_equal(df.head(3), df0) + + def 
test_value_labels_old_format(self): + # GH 19417 + # + # Test that value_labels() returns an empty dict if the file format + # predates supporting value labels. + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") + reader = StataReader(dpath) + assert reader.value_labels() == {} + reader.close() + + def test_date_export_formats(self): + columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"] + conversions = {c: c for c in columns} + data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) + original = DataFrame([data], columns=columns) + original.index.name = "index" + expected_values = [ + datetime(2006, 11, 20, 23, 13, 20), # Time + datetime(2006, 11, 20), # Day + datetime(2006, 11, 19), # Week + datetime(2006, 11, 1), # Month + datetime(2006, 10, 1), # Quarter year + datetime(2006, 7, 1), # Half year + datetime(2006, 1, 1), + ] # Year + + expected = DataFrame([expected_values], columns=columns) + expected.index.name = "index" + with tm.ensure_clean() as path: + original.to_stata(path, conversions) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + + def test_write_missing_strings(self): + original = DataFrame([["1"], [None]], columns=["foo"]) + expected = DataFrame([["1"], [""]], columns=["foo"]) + expected.index.name = "index" + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + @pytest.mark.parametrize("byteorder", [">", "<"]) + def test_bool_uint(self, byteorder, version): + s0 = Series([0, 1, True], dtype=np.bool) + s1 = Series([0, 1, 100], dtype=np.uint8) + s2 = Series([0, 1, 255], dtype=np.uint8) + s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16) + s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16) + s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32) + s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32) + + original = DataFrame( + {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6} + ) + original.index.name = "index" + expected = original.copy() + expected_types = ( + np.int8, + np.int8, + np.int16, + np.int16, + np.int32, + np.int32, + np.float64, + ) + for c, t in zip(expected.columns, expected_types): + expected[c] = expected[c].astype(t) + + with tm.ensure_clean() as path: + original.to_stata(path, byteorder=byteorder, version=version) + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index("index") + tm.assert_frame_equal(written_and_read_again, expected) + + def test_variable_labels(self): + with StataReader(self.dta16_115) as rdr: + sr_115 = rdr.variable_labels() + with StataReader(self.dta16_117) as rdr: + sr_117 = rdr.variable_labels() + keys = ("var1", "var2", "var3") + labels = ("label1", "label2", "label3") + for k, v in sr_115.items(): + assert k in sr_117 + assert v == sr_117[k] + assert k in keys + assert v in labels + + def test_minimal_size_col(self): + str_lens = (1, 100, 244) + s = {} + for str_len in str_lens: + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) + original = DataFrame(s) + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + + with StataReader(path) as sr: + typlist = sr.typlist + variables = sr.varlist + formats = sr.fmtlist + for variable, fmt, typ in zip(variables, formats, typlist): + assert int(variable[1:]) == int(fmt[1:-1]) + assert 
int(variable[1:]) == typ + + def test_excessively_long_string(self): + str_lens = (1, 244, 500) + s = {} + for str_len in str_lens: + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) + original = DataFrame(s) + msg = ( + r"Fixed width strings in Stata \.dta files are limited to 244" + r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" + r" this restriction\. Use the\n'version=117' parameter to write" + r" the newer \(Stata 13 and later\) format\." + ) + with pytest.raises(ValueError, match=msg): + with tm.ensure_clean() as path: + original.to_stata(path) + + def test_missing_value_generator(self): + types = ("b", "h", "l") + df = DataFrame([[0.0]], columns=["float_"]) + with tm.ensure_clean() as path: + df.to_stata(path) + with StataReader(path) as rdr: + valid_range = rdr.VALID_RANGE + expected_values = ["." + chr(97 + i) for i in range(26)] + expected_values.insert(0, ".") + for t in types: + offset = valid_range[t][1] + for i in range(0, 27): + val = StataMissingValue(offset + 1 + i) + assert val.string == expected_values[i] + + # Test extremes for floats + val = StataMissingValue(struct.unpack(" 0 + + if layout is not None: + result = self._get_axes_layout(_flatten(axes)) + assert result == layout + + tm.assert_numpy_array_equal( + visible_axes[0].figure.get_size_inches(), + np.array(figsize, dtype=np.float64), + ) + + def _get_axes_layout(self, axes): + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + def _flatten_visible(self, axes): + """ + Flatten axes, and filter only visible + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + + """ + from pandas.plotting._matplotlib.tools import _flatten + + axes = _flatten(axes) + axes = [ax for ax in axes if ax.get_visible()] + return axes + + def _check_has_errorbars(self, axes, xerr=0, yerr=0): + """ + Check axes has expected number of errorbars + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xerr : number + expected number of x errorbar + yerr : number + expected number of y errorbar + """ + axes = self._flatten_visible(axes) + for ax in axes: + containers = ax.containers + xerr_count = 0 + yerr_count = 0 + for c in containers: + has_xerr = getattr(c, "has_xerr", False) + has_yerr = getattr(c, "has_yerr", False) + if has_xerr: + xerr_count += 1 + if has_yerr: + yerr_count += 1 + assert xerr == xerr_count + assert yerr == yerr_count + + def _check_box_return_type( + self, returned, return_type, expected_keys=None, check_ax_title=True + ): + """ + Check box returned type is correct + + Parameters + ---------- + returned : object to be tested, returned from boxplot + return_type : str + return_type passed to boxplot + expected_keys : list-like, optional + group labels in subplot case. If not passed, + the function checks assuming boxplot uses single ax + check_ax_title : bool + Whether to check the ax.title is the same as expected_key + Intended to be checked by calling from ``boxplot``. + Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
+ """ + from matplotlib.axes import Axes + + types = {"dict": dict, "axes": Axes, "both": tuple} + if expected_keys is None: + # should be fixed when the returning default is changed + if return_type is None: + return_type = "dict" + + assert isinstance(returned, types[return_type]) + if return_type == "both": + assert isinstance(returned.ax, Axes) + assert isinstance(returned.lines, dict) + else: + # should be fixed when the returning default is changed + if return_type is None: + for r in self._flatten_visible(returned): + assert isinstance(r, Axes) + return + + assert isinstance(returned, Series) + + assert sorted(returned.keys()) == sorted(expected_keys) + for key, value in returned.items(): + assert isinstance(value, types[return_type]) + # check returned dict has correct mapping + if return_type == "axes": + if check_ax_title: + assert value.get_title() == key + elif return_type == "both": + if check_ax_title: + assert value.ax.get_title() == key + assert isinstance(value.ax, Axes) + assert isinstance(value.lines, dict) + elif return_type == "dict": + line = value["medians"][0] + axes = line.axes + if check_ax_title: + assert axes.get_title() == key + else: + raise AssertionError + + def _check_grid_settings(self, obj, kinds, kws={}): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + + import matplotlib as mpl + + def is_grid_on(): + xticks = self.plt.gca().xaxis.get_major_ticks() + yticks = self.plt.gca().yaxis.get_major_ticks() + # for mpl 2.2.2, gridOn and gridline.get_visible disagree. + # for new MPL, they are the same. + + if self.mpl_ge_3_1_0: + xoff = all(not g.gridline.get_visible() for g in xticks) + yoff = all(not g.gridline.get_visible() for g in yticks) + else: + xoff = all(not g.gridOn for g in xticks) + yoff = all(not g.gridOn for g in yticks) + + return not (xoff and yoff) + + spndx = 1 + for kind in kinds: + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc("axes", grid=False) + obj.plot(kind=kind, **kws) + assert not is_grid_on() + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc("axes", grid=True) + obj.plot(kind=kind, grid=False, **kws) + assert not is_grid_on() + + if kind != "pie": + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc("axes", grid=True) + obj.plot(kind=kind, **kws) + assert is_grid_on() + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc("axes", grid=False) + obj.plot(kind=kind, grid=True, **kws) + assert is_grid_on() + + def _unpack_cycler(self, rcParams, field="color"): + """ + Auxiliary function for correctly unpacking cycler after MPL >= 1.5 + """ + return [v[field] for v in rcParams["axes.prop_cycle"]] + + +def _check_plot_works(f, filterwarnings="always", **kwargs): + import matplotlib.pyplot as plt + + ret = None + with warnings.catch_warnings(): + warnings.simplefilter(filterwarnings) + try: + try: + fig = kwargs["figure"] + except KeyError: + fig = plt.gcf() + + plt.clf() + + kwargs.get("ax", fig.add_subplot(211)) + ret = f(**kwargs) + + tm.assert_is_valid_plot_return_object(ret) + + if f is pd.plotting.bootstrap_plot: + assert "ax" not in kwargs + else: + kwargs["ax"] = fig.add_subplot(212) + + ret = f(**kwargs) + tm.assert_is_valid_plot_return_object(ret) + + with tm.ensure_clean(return_filelike=True) as path: + plt.savefig(path) + finally: + tm.close(fig) + + return ret + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_backend.py 
b/venv/Lib/site-packages/pandas/tests/plotting/test_backend.py new file mode 100644 index 0000000..9025f8c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_backend.py @@ -0,0 +1,104 @@ +import sys +import types + +import pkg_resources +import pytest + +import pandas.util._test_decorators as td + +import pandas + +dummy_backend = types.ModuleType("pandas_dummy_backend") +setattr(dummy_backend, "plot", lambda *args, **kwargs: "used_dummy") + + +@pytest.fixture +def restore_backend(): + """Restore the plotting backend to matplotlib""" + pandas.set_option("plotting.backend", "matplotlib") + yield + pandas.set_option("plotting.backend", "matplotlib") + + +def test_backend_is_not_module(): + msg = "Could not find plotting backend 'not_an_existing_module'." + with pytest.raises(ValueError, match=msg): + pandas.set_option("plotting.backend", "not_an_existing_module") + + assert pandas.options.plotting.backend == "matplotlib" + + +def test_backend_is_correct(monkeypatch, restore_backend): + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + + pandas.set_option("plotting.backend", "pandas_dummy_backend") + assert pandas.get_option("plotting.backend") == "pandas_dummy_backend" + assert ( + pandas.plotting._core._get_plot_backend("pandas_dummy_backend") is dummy_backend + ) + + +def test_backend_can_be_set_in_plot_call(monkeypatch, restore_backend): + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + df = pandas.DataFrame([1, 2, 3]) + + assert pandas.get_option("plotting.backend") == "matplotlib" + assert df.plot(backend="pandas_dummy_backend") == "used_dummy" + + +@td.skip_if_no_mpl +def test_register_entrypoint(restore_backend): + + dist = pkg_resources.get_distribution("pandas") + if dist.module_path not in pandas.__file__: + # We are running from a non-installed pandas, and this test is invalid + pytest.skip("Testing a non-installed pandas") + + mod = types.ModuleType("my_backend") + mod.plot = lambda *args, **kwargs: 1 + + backends = pkg_resources.get_entry_map("pandas") + my_entrypoint = pkg_resources.EntryPoint( + "pandas_plotting_backend", mod.__name__, dist=dist + ) + backends["pandas_plotting_backends"]["my_backend"] = my_entrypoint + # TODO: the docs recommend importlib.util.module_from_spec. But this works for now. + sys.modules["my_backend"] = mod + + result = pandas.plotting._core._get_plot_backend("my_backend") + assert result is mod + + # TODO: https://github.com/pandas-dev/pandas/issues/27517 + # Remove the td.skip_if_no_mpl + with pandas.option_context("plotting.backend", "my_backend"): + result = pandas.plotting._core._get_plot_backend() + + assert result is mod + + +def test_setting_backend_without_plot_raises(): + # GH-28163 + module = types.ModuleType("pandas_plot_backend") + sys.modules["pandas_plot_backend"] = module + + assert pandas.options.plotting.backend == "matplotlib" + with pytest.raises( + ValueError, match="Could not find plotting backend 'pandas_plot_backend'." 
+ ): + pandas.set_option("plotting.backend", "pandas_plot_backend") + + assert pandas.options.plotting.backend == "matplotlib" + + +@td.skip_if_mpl +def test_no_matplotlib_ok(): + with pytest.raises(ImportError): + pandas.plotting._core._get_plot_backend("matplotlib") + + +def test_extra_kinds_ok(monkeypatch, restore_backend): + # https://github.com/pandas-dev/pandas/pull/28647 + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + pandas.set_option("plotting.backend", "pandas_dummy_backend") + df = pandas.DataFrame({"A": [1, 2, 3]}) + df.plot(kind="not a real kind") diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_boxplot_method.py b/venv/Lib/site-packages/pandas/tests/plotting/test_boxplot_method.py new file mode 100644 index 0000000..8ee279f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_boxplot_method.py @@ -0,0 +1,442 @@ +# coding: utf-8 + +import itertools +import string + +import numpy as np +from numpy import random +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, MultiIndex, Series, date_range, timedelta_range +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + +import pandas.plotting as plotting + +""" Test cases for .boxplot method """ + + +@td.skip_if_no_mpl +class TestDataFramePlots(TestPlotBase): + @pytest.mark.slow + def test_boxplot_legacy1(self): + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + df["indic"] = ["foo", "bar"] * 3 + df["indic2"] = ["foo", "bar", "foo"] * 2 + + _check_plot_works(df.boxplot, return_type="dict") + _check_plot_works(df.boxplot, column=["one", "two"], return_type="dict") + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, column=["one", "two"], by="indic") + _check_plot_works(df.boxplot, column="one", by=["indic", "indic2"]) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by="indic") + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by=["indic", "indic2"]) + _check_plot_works(plotting._core.boxplot, data=df["one"], return_type="dict") + _check_plot_works(df.boxplot, notch=1, return_type="dict") + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by="indic", notch=1) + + @pytest.mark.slow + def test_boxplot_legacy2(self): + df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = Series(["A"] * 10) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by="X") + + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.boxplot("Col1", by="X", ax=ax) + ax_axes = ax.axes + assert ax_axes is axes + + fig, ax = self.plt.subplots() + axes = df.groupby("Y").boxplot(ax=ax, return_type="axes") + ax_axes = ax.axes + assert ax_axes is axes["A"] + + # Multiple columns with an ax argument should use same figure + fig, ax = self.plt.subplots() + with tm.assert_produces_warning(UserWarning): + axes = df.boxplot( + column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" + ) + assert axes["Col1"].get_figure() is fig + + # When by is None, check that all relevant lines are present in the + # dict + fig, ax = self.plt.subplots() + d = df.boxplot(ax=ax, return_type="dict") + lines = list(itertools.chain.from_iterable(d.values())) + assert len(ax.get_lines()) == len(lines) + + @pytest.mark.slow + def test_boxplot_return_type_none(self): + # GH 12216; return_type=None & by=None -> axes + result = self.hist_df.boxplot() + assert isinstance(result, self.plt.Axes) + + @pytest.mark.slow + def test_boxplot_return_type_legacy(self): + # API change in https://github.com/pandas-dev/pandas/pull/7096 + import matplotlib as mpl # noqa + + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + with pytest.raises(ValueError): + df.boxplot(return_type="NOTATYPE") + + result = df.boxplot() + self._check_box_return_type(result, "axes") + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type="dict") + self._check_box_return_type(result, "dict") + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type="axes") + self._check_box_return_type(result, "axes") + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type="both") + self._check_box_return_type(result, "both") + + @pytest.mark.slow + def test_boxplot_axis_limits(self): + def _check_ax_limits(col, ax): + y_min, y_max = ax.get_ylim() + assert y_min <= col.min() + assert y_max >= col.max() + + df = self.hist_df.copy() + df["age"] = np.random.randint(1, 20, df.shape[0]) + # One full row + height_ax, weight_ax = df.boxplot(["height", "weight"], by="category") + _check_ax_limits(df["height"], height_ax) + _check_ax_limits(df["weight"], weight_ax) + assert weight_ax._sharey == height_ax + + # Two rows, one partial + p = df.boxplot(["height", "weight", "age"], by="category") + height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] + dummy_ax = p[1, 1] + + _check_ax_limits(df["height"], 
height_ax) + _check_ax_limits(df["weight"], weight_ax) + _check_ax_limits(df["age"], age_ax) + assert weight_ax._sharey == height_ax + assert age_ax._sharey == height_ax + assert dummy_ax._sharey is None + + @pytest.mark.slow + def test_boxplot_empty_column(self): + df = DataFrame(np.random.randn(20, 4)) + df.loc[:, 0] = np.nan + _check_plot_works(df.boxplot, return_type="axes") + + @pytest.mark.slow + def test_figsize(self): + df = DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) + result = df.boxplot(return_type="axes", figsize=(12, 8)) + assert result.figure.bbox_inches.width == 12 + assert result.figure.bbox_inches.height == 8 + + def test_fontsize(self): + df = DataFrame({"a": [1, 2, 3, 4, 5, 6]}) + self._check_ticks_props( + df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16 + ) + + def test_boxplot_numeric_data(self): + # GH 22799 + df = DataFrame( + { + "a": date_range("2012-01-01", periods=100), + "b": np.random.randn(100), + "c": np.random.randn(100) + 2, + "d": date_range("2012-01-01", periods=100).astype(str), + "e": date_range("2012-01-01", periods=100, tz="UTC"), + "f": timedelta_range("1 days", periods=100), + } + ) + ax = df.plot(kind="box") + assert [x.get_text() for x in ax.get_xticklabels()] == ["b", "c"] + + @pytest.mark.parametrize( + "colors_kwd, expected", + [ + ( + dict(boxes="r", whiskers="b", medians="g", caps="c"), + dict(boxes="r", whiskers="b", medians="g", caps="c"), + ), + (dict(boxes="r"), dict(boxes="r")), + ("r", dict(boxes="r", whiskers="r", medians="r", caps="r")), + ], + ) + def test_color_kwd(self, colors_kwd, expected): + # GH: 26214 + df = DataFrame(random.rand(10, 2)) + result = df.boxplot(color=colors_kwd, return_type="dict") + for k, v in expected.items(): + assert result[k][0].get_color() == v + + @pytest.mark.parametrize( + "dict_colors, msg", + [(dict(boxes="r", invalid_key="r"), "invalid key 'invalid_key'")], + ) + def test_color_kwd_errors(self, dict_colors, msg): + # GH: 26214 + df = DataFrame(random.rand(10, 2)) + with pytest.raises(ValueError, match=msg): + df.boxplot(color=dict_colors, return_type="dict") + + +@td.skip_if_no_mpl +class TestDataFrameGroupByPlots(TestPlotBase): + @pytest.mark.slow + def test_boxplot_legacy1(self): + grouped = self.hist_df.groupby(by="gender") + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type="axes") + self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2)) + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @pytest.mark.slow + def test_boxplot_legacy2(self): + tuples = zip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) + grouped = df.groupby(level=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type="axes") + self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @pytest.mark.slow + def test_boxplot_legacy3(self): + tuples = zip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) + grouped = df.unstack(level=1).groupby(level=0, axis=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type="axes") + 
self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2)) + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @pytest.mark.slow + def test_grouped_plot_fignums(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(["male", "female"], size=n) + df = DataFrame({"height": height, "weight": weight, "gender": gender}) + gb = df.groupby("gender") + + res = gb.plot() + assert len(self.plt.get_fignums()) == 2 + assert len(res) == 2 + tm.close() + + res = gb.boxplot(return_type="axes") + assert len(self.plt.get_fignums()) == 1 + assert len(res) == 2 + tm.close() + + # now works with GH 5610 as gender is excluded + res = df.groupby("gender").hist() + tm.close() + + @pytest.mark.slow + def test_grouped_box_return_type(self): + df = self.hist_df + + # old style: return_type=None + result = df.boxplot(by="gender") + assert isinstance(result, np.ndarray) + self._check_box_return_type( + result, None, expected_keys=["height", "weight", "category"] + ) + + # now for groupby + result = df.groupby("gender").boxplot(return_type="dict") + self._check_box_return_type(result, "dict", expected_keys=["Male", "Female"]) + + columns2 = "X B C D A G Y N Q O".split() + df2 = DataFrame(random.randn(50, 10), columns=columns2) + categories2 = "A B C D E F G H I J".split() + df2["category"] = categories2 * 5 + + for t in ["dict", "axes", "both"]: + returned = df.groupby("classroom").boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=["A", "B", "C"]) + + returned = df.boxplot(by="classroom", return_type=t) + self._check_box_return_type( + returned, t, expected_keys=["height", "weight", "category"] + ) + + returned = df2.groupby("category").boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=categories2) + + returned = df2.boxplot(by="category", return_type=t) + self._check_box_return_type(returned, t, expected_keys=columns2) + + @pytest.mark.slow + def test_grouped_box_layout(self): + df = self.hist_df + + msg = "Layout of 1x1 must be larger than required size 2" + with pytest.raises(ValueError, match=msg): + df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1)) + + msg = "The 'layout' keyword is not supported when 'by' is None" + with pytest.raises(ValueError, match=msg): + df.boxplot( + column=["height", "weight", "category"], + layout=(2, 1), + return_type="dict", + ) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1)) + + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works( + df.groupby("gender").boxplot, column="height", return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works( + df.groupby("category").boxplot, column="height", return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works( + df.groupby("classroom").boxplot, column="height", return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + # GH 5897 + axes = df.boxplot( + column=["height", "weight", "category"], by="gender", return_type="axes" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + for ax in [axes["height"]]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in [axes["weight"], axes["category"]]: + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, 2), + return_type="dict", + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, -1), + return_type="dict", + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(4, 1) + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) + + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(-1, 1) + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) + + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], layout=(1, 4), return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) + + box = df.groupby("classroom").boxplot( # noqa + column=["height", "weight", "category"], layout=(1, -1), return_type="dict" + ) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) + + @pytest.mark.slow + def test_grouped_box_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + # check warning to ignore sharex / sharey + # this check should be done in the first function which + # passes multiple axes to plot, hist or boxplot + # location should be changed if other test is added + # which has earlier alphabetical order + with tm.assert_produces_warning(UserWarning): + fig, axes = self.plt.subplots(2, 2) + df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + fig, axes = self.plt.subplots(2, 3) + with tm.assert_produces_warning(UserWarning): + returned = df.boxplot( + column=["height", "weight", "category"], + by="gender", + return_type="axes", + ax=axes[0], + ) + returned = np.array(list(returned.values)) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + tm.assert_numpy_array_equal(returned, axes[0]) + assert 
returned[0].figure is fig + + # draw on second row + with tm.assert_produces_warning(UserWarning): + returned = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="axes", ax=axes[1] + ) + returned = np.array(list(returned.values)) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + tm.assert_numpy_array_equal(returned, axes[1]) + assert returned[0].figure is fig + + with pytest.raises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + with tm.assert_produces_warning(UserWarning): + axes = df.groupby("classroom").boxplot(ax=axes) + + def test_fontsize(self): + df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}) + self._check_ticks_props( + df.boxplot("a", by="b", fontsize=16), xlabelsize=16, ylabelsize=16 + ) diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_converter.py b/venv/Lib/site-packages/pandas/tests/plotting/test_converter.py new file mode 100644 index 0000000..9cd3ccb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_converter.py @@ -0,0 +1,367 @@ +from datetime import date, datetime +import subprocess +import sys + +import numpy as np +import pytest + +import pandas._config.config as cf + +from pandas.compat.numpy import np_datetime64_compat + +from pandas import Index, Period, Series, Timestamp, date_range +import pandas._testing as tm + +from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, +) +from pandas.tseries.offsets import Day, Micro, Milli, Second + +try: + from pandas.plotting._matplotlib import converter +except ImportError: + # try / except, rather than skip, to avoid internal refactoring + # causing an improper skip + pass + +pytest.importorskip("matplotlib.pyplot") + + +def test_registry_mpl_resets(): + # Check that Matplotlib converters are properly reset (see issue #27481) + code = ( + "import matplotlib.units as units; " + "import matplotlib.dates as mdates; " + "n_conv = len(units.registry); " + "import pandas as pd; " + "pd.plotting.register_matplotlib_converters(); " + "pd.plotting.deregister_matplotlib_converters(); " + "assert len(units.registry) == n_conv" + ) + call = [sys.executable, "-c", code] + subprocess.check_output(call) + + +def test_timtetonum_accepts_unicode(): + assert converter.time2num("00:01") == converter.time2num("00:01") + + +class TestRegistration: + def test_register_by_default(self): + # Run in subprocess to ensure a clean state + code = ( + "'import matplotlib.units; " + "import pandas as pd; " + "units = dict(matplotlib.units.registry); " + "assert pd.Timestamp in units)'" + ) + call = [sys.executable, "-c", code] + assert subprocess.check_call(call) == 0 + + def test_registering_no_warning(self): + plt = pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + _, ax = plt.subplots() + + # Set to the "warn" state, in case this isn't the first test run + register_matplotlib_converters() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 DeprecationWarning from 2D indexing + ax.plot(s.index, s.values) + + def test_pandas_plots_register(self): + pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + # Set to the "warn" state, in case this isn't the first test run + with tm.assert_produces_warning(None) as w: + s.plot() + + assert len(w) == 0 + + def test_matplotlib_formatters(self): + units = 
pytest.importorskip("matplotlib.units") + + # Can't make any assertion about the start state. + # We we check that toggling converters off removes it, and toggling it + # on restores it. + + with cf.option_context("plotting.matplotlib.register_converters", True): + with cf.option_context("plotting.matplotlib.register_converters", False): + assert Timestamp not in units.registry + assert Timestamp in units.registry + + def test_option_no_warning(self): + pytest.importorskip("matplotlib.pyplot") + ctx = cf.option_context("plotting.matplotlib.register_converters", False) + plt = pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + _, ax = plt.subplots() + + # Test without registering first, no warning + with ctx: + # GH#30588 DeprecationWarning from 2D indexing on Index + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + ax.plot(s.index, s.values) + + # Now test with registering + register_matplotlib_converters() + with ctx: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + ax.plot(s.index, s.values) + + def test_registry_resets(self): + units = pytest.importorskip("matplotlib.units") + dates = pytest.importorskip("matplotlib.dates") + + # make a copy, to reset to + original = dict(units.registry) + + try: + # get to a known state + units.registry.clear() + date_converter = dates.DateConverter() + units.registry[datetime] = date_converter + units.registry[date] = date_converter + + register_matplotlib_converters() + assert units.registry[date] is not date_converter + deregister_matplotlib_converters() + assert units.registry[date] is date_converter + + finally: + # restore original stater + units.registry.clear() + for k, v in original.items(): + units.registry[k] = v + + +class TestDateTimeConverter: + def setup_method(self, method): + self.dtc = converter.DatetimeConverter() + self.tc = converter.TimeFormatter(None) + + def test_convert_accepts_unicode(self): + r1 = self.dtc.convert("12:22", None, None) + r2 = self.dtc.convert("12:22", None, None) + assert r1 == r2, "DatetimeConverter.convert should accept unicode" + + def test_conversion(self): + rs = self.dtc.convert(["2012-1-1"], None, None)[0] + xp = datetime(2012, 1, 1).toordinal() + assert rs == xp + + rs = self.dtc.convert("2012-1-1", None, None) + assert rs == xp + + rs = self.dtc.convert(date(2012, 1, 1), None, None) + assert rs == xp + + rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None) + assert rs == xp + + rs = self.dtc.convert("2012-1-1", None, None) + assert rs == xp + + rs = self.dtc.convert(Timestamp("2012-1-1"), None, None) + assert rs == xp + + # also testing datetime64 dtype (GH8614) + rs = self.dtc.convert(np_datetime64_compat("2012-01-01"), None, None) + assert rs == xp + + rs = self.dtc.convert( + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, None + ) + assert rs == xp + + rs = self.dtc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + None, + ) + assert rs[0] == xp + + # we have a tz-aware date (constructed to that when we turn to utc it + # is the same as our sample) + ts = Timestamp("2012-01-01").tz_localize("UTC").tz_convert("US/Eastern") + rs = self.dtc.convert(ts, None, None) + assert rs == xp + + rs = self.dtc.convert(ts.to_pydatetime(), None, None) + assert rs == xp + + rs = self.dtc.convert(Index([ts - Day(1), ts]), None, None) + assert rs[1] == xp + + rs = 
self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), None, None) + assert rs[1] == xp + + def test_conversion_float(self): + decimals = 9 + + rs = self.dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) + xp = converter.dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) + tm.assert_almost_equal(rs, xp, decimals) + + rs = self.dtc.convert( + Timestamp("2012-1-1 09:02:03", tz="Asia/Hong_Kong"), None, None + ) + tm.assert_almost_equal(rs, xp, decimals) + + rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) + tm.assert_almost_equal(rs, xp, decimals) + + def test_conversion_outofbounds_datetime(self): + # 2579 + values = [date(1677, 1, 1), date(1677, 1, 2)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + assert rs == xp + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + assert rs == xp + + @pytest.mark.parametrize( + "time,format_expected", + [ + (0, "00:00"), # time2num(datetime.time.min) + (86399.999999, "23:59:59.999999"), # time2num(datetime.time.max) + (90000, "01:00"), + (3723, "01:02:03"), + (39723.2, "11:02:03.200"), + ], + ) + def test_time_formatter(self, time, format_expected): + # issue 18478 + result = self.tc(time) + assert result == format_expected + + def test_dateindex_conversion(self): + decimals = 9 + + for freq in ("B", "L", "S"): + dateindex = tm.makeDateIndex(k=10, freq=freq) + rs = self.dtc.convert(dateindex, None, None) + xp = converter.dates.date2num(dateindex._mpl_repr()) + tm.assert_almost_equal(rs, xp, decimals) + + def test_resolution(self): + def _assert_less(ts1, ts2): + val1 = self.dtc.convert(ts1, None, None) + val2 = self.dtc.convert(ts2, None, None) + if not val1 < val2: + raise AssertionError(f"{val1} is not less than {val2}.") + + # Matplotlib's time representation using floats cannot distinguish + # intervals smaller than ~10 microsecond in the common range of years. 
+ ts = Timestamp("2012-1-1") + _assert_less(ts, ts + Second()) + _assert_less(ts, ts + Milli()) + _assert_less(ts, ts + Micro(50)) + + def test_convert_nested(self): + inner = [Timestamp("2017-01-01"), Timestamp("2017-01-02")] + data = [inner, inner] + result = self.dtc.convert(data, None, None) + expected = [self.dtc.convert(x, None, None) for x in data] + assert (np.array(result) == expected).all() + + +class TestPeriodConverter: + def setup_method(self, method): + self.pc = converter.PeriodConverter() + + class Axis: + pass + + self.axis = Axis() + self.axis.freq = "D" + + def test_convert_accepts_unicode(self): + r1 = self.pc.convert("2012-1-1", None, self.axis) + r2 = self.pc.convert("2012-1-1", None, self.axis) + assert r1 == r2 + + def test_conversion(self): + rs = self.pc.convert(["2012-1-1"], None, self.axis)[0] + xp = Period("2012-1-1").ordinal + assert rs == xp + + rs = self.pc.convert("2012-1-1", None, self.axis) + assert rs == xp + + rs = self.pc.convert([date(2012, 1, 1)], None, self.axis)[0] + assert rs == xp + + rs = self.pc.convert(date(2012, 1, 1), None, self.axis) + assert rs == xp + + rs = self.pc.convert([Timestamp("2012-1-1")], None, self.axis)[0] + assert rs == xp + + rs = self.pc.convert(Timestamp("2012-1-1"), None, self.axis) + assert rs == xp + + rs = self.pc.convert(np_datetime64_compat("2012-01-01"), None, self.axis) + assert rs == xp + + rs = self.pc.convert( + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, self.axis + ) + assert rs == xp + + rs = self.pc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + self.axis, + ) + assert rs[0] == xp + + def test_integer_passthrough(self): + # GH9012 + rs = self.pc.convert([0, 1], None, self.axis) + xp = [0, 1] + assert rs == xp + + def test_convert_nested(self): + data = ["2012-1-1", "2012-1-2"] + r1 = self.pc.convert([data, data], None, self.axis) + r2 = [self.pc.convert(data, None, self.axis) for _ in range(2)] + assert r1 == r2 + + +class TestTimeDeltaConverter: + """Test timedelta converter""" + + @pytest.mark.parametrize( + "x, decimal, format_expected", + [ + (0.0, 0, "00:00:00"), + (3972320000000, 1, "01:06:12.3"), + (713233432000000, 2, "8 days 06:07:13.43"), + (32423432000000, 4, "09:00:23.4320"), + ], + ) + def test_format_timedelta_ticks(self, x, decimal, format_expected): + tdc = converter.TimeSeries_TimedeltaFormatter + result = tdc.format_timedelta_ticks(x, pos=None, n_decimals=decimal) + assert result == format_expected diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_datetimelike.py b/venv/Lib/site-packages/pandas/tests/plotting/test_datetimelike.py new file mode 100644 index 0000000..8f855fd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_datetimelike.py @@ -0,0 +1,1515 @@ +""" Test cases for time series specific (freq conversion, etc) """ +from datetime import date, datetime, time, timedelta +import pickle +import sys + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Index, NaT, Series, isna +import pandas._testing as tm +from pandas.core.indexes.datetimes import bdate_range, date_range +from pandas.core.indexes.period import Period, PeriodIndex, period_range +from pandas.core.indexes.timedeltas import timedelta_range +from pandas.core.resample import DatetimeIndex +from pandas.tests.plotting.common import TestPlotBase + +from pandas.tseries.offsets import DateOffset + + +@td.skip_if_no_mpl +class 
TestTSPlot(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + + self.freq = ["S", "T", "H", "D", "W", "M", "Q", "A"] + idx = [period_range("12/31/1999", freq=x, periods=100) for x in self.freq] + self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.period_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] + + freq = ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + idx = [date_range("12/31/1999", freq=x, periods=100) for x in freq] + self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.datetime_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] + + def teardown_method(self, method): + tm.close() + + @pytest.mark.slow + def test_ts_plot_with_tz(self, tz_aware_fixture): + # GH2877, GH17173 + tz = tz_aware_fixture + index = date_range("1/1/2011", periods=2, freq="H", tz=tz) + ts = Series([188.5, 328.25], index=index) + _check_plot_works(ts.plot) + + def test_fontsize_set_correctly(self): + # For issue #8765 + df = DataFrame(np.random.randn(10, 9), index=range(10)) + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) + for label in ax.get_xticklabels() + ax.get_yticklabels(): + assert label.get_fontsize() == 2 + + @pytest.mark.slow + def test_frame_inferred(self): + # inferred freq + idx = date_range("1/1/1987", freq="MS", periods=100) + idx = DatetimeIndex(idx.values, freq=None) + + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df.plot) + + # axes freq + idx = idx[0:40].union(idx[45:99]) + df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df2.plot) + + # N > 1 + idx = date_range("2008-1-1 00:15:00", freq="15T", periods=10) + idx = DatetimeIndex(idx.values, freq=None) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df.plot) + + def test_is_error_nozeroindex(self): + # GH11858 + i = np.array([1, 2, 3]) + a = DataFrame(i, index=i) + _check_plot_works(a.plot, xerr=a) + _check_plot_works(a.plot, yerr=a) + + def test_nonnumeric_exclude(self): + idx = date_range("1/1/1987", freq="A", periods=3) + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) + + fig, ax = self.plt.subplots() + df.plot(ax=ax) # it works + assert len(ax.get_lines()) == 1 # B was plotted + self.plt.close(fig) + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + df["A"].plot() + + @pytest.mark.slow + def test_tsplot(self): + + _, ax = self.plt.subplots() + ts = tm.makeTimeSeries() + + for s in self.period_ser: + _check_plot_works(s.plot, ax=ax) + + for s in self.datetime_ser: + _check_plot_works(s.plot, ax=ax) + + _, ax = self.plt.subplots() + ts.plot(style="k", ax=ax) + color = (0.0, 0.0, 0.0, 1) + assert color == ax.get_lines()[0].get_color() + + def test_both_style_and_color(self): + + ts = tm.makeTimeSeries() + msg = ( + "Cannot pass 'style' string with a color symbol and 'color' " + "keyword argument. 
Please use one or the other or pass 'style'" + " without a color symbol" + ) + with pytest.raises(ValueError, match=msg): + ts.plot(style="b-", color="#000099") + + s = ts.reset_index(drop=True) + with pytest.raises(ValueError, match=msg): + s.plot(style="b-", color="#000099") + + @pytest.mark.slow + def test_high_freq(self): + freaks = ["ms", "us"] + for freq in freaks: + _, ax = self.plt.subplots() + rng = date_range("1/1/2012", periods=100, freq=freq) + ser = Series(np.random.randn(len(rng)), rng) + _check_plot_works(ser.plot, ax=ax) + + def test_get_datevalue(self): + from pandas.plotting._matplotlib.converter import get_datevalue + + assert get_datevalue(None, "D") is None + assert get_datevalue(1987, "A") == 1987 + assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal + + @pytest.mark.slow + def test_ts_plot_format_coord(self): + def check_format_of_first_point(ax, expected_string): + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + try: + assert expected_string == ax.format_coord(first_x, first_y) + except (ValueError): + pytest.skip( + "skipping test because issue forming test comparison GH7664" + ) + + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) + _, ax = self.plt.subplots() + annual.plot(ax=ax) + check_format_of_first_point(ax, "t = 2014 y = 1.000000") + + # note this is added to the annual plot already in existence, and + # changes its freq field + daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) + daily.plot(ax=ax) + check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") + tm.close() + + @pytest.mark.slow + def test_line_plot_period_series(self): + for s in self.period_ser: + _check_plot_works(s.plot, s.index.freq) + + @pytest.mark.slow + @pytest.mark.parametrize( + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) + def test_line_plot_period_mlt_series(self, frqncy): + # test period index line plot for series with multiples (`mlt`) of the + # frequency (`frqncy`) rule code. tests resolution of issue #14763 + idx = period_range("12/31/1999", freq=frqncy, periods=100) + s = Series(np.random.randn(len(idx)), idx) + _check_plot_works(s.plot, s.index.freq.rule_code) + + @pytest.mark.slow + def test_line_plot_datetime_series(self): + for s in self.datetime_ser: + _check_plot_works(s.plot, s.index.freq.rule_code) + + @pytest.mark.slow + def test_line_plot_period_frame(self): + for df in self.period_df: + _check_plot_works(df.plot, df.index.freq) + + @pytest.mark.slow + @pytest.mark.parametrize( + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) + def test_line_plot_period_mlt_frame(self, frqncy): + # test period index line plot for DataFrames with multiples (`mlt`) + # of the frequency (`frqncy`) rule code. 
tests resolution of issue + # #14763 + idx = period_range("12/31/1999", freq=frqncy, periods=100) + df = DataFrame(np.random.randn(len(idx), 3), index=idx, columns=["A", "B", "C"]) + freq = df.index.asfreq(df.index.freq.rule_code).freq + _check_plot_works(df.plot, freq) + + @pytest.mark.slow + def test_line_plot_datetime_frame(self): + for df in self.datetime_df: + freq = df.index.to_period(df.index.freq.rule_code).freq + _check_plot_works(df.plot, freq) + + @pytest.mark.slow + def test_line_plot_inferred_freq(self): + for ser in self.datetime_ser: + ser = Series(ser.values, Index(np.asarray(ser.index))) + _check_plot_works(ser.plot, ser.index.inferred_freq) + + ser = ser[[0, 3, 5, 6]] + _check_plot_works(ser.plot) + + def test_fake_inferred_business(self): + _, ax = self.plt.subplots() + rng = date_range("2001-1-1", "2001-1-10") + ts = Series(range(len(rng)), index=rng) + ts = ts[:3].append(ts[5:]) + ts.plot(ax=ax) + assert not hasattr(ax, "freq") + + @pytest.mark.slow + def test_plot_offset_freq(self): + ser = tm.makeTimeSeries() + _check_plot_works(ser.plot) + + dr = date_range(ser.index[0], freq="BQS", periods=10) + ser = Series(np.random.randn(len(dr)), index=dr) + _check_plot_works(ser.plot) + + @pytest.mark.slow + def test_plot_multiple_inferred_freq(self): + dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)]) + ser = Series(np.random.randn(len(dr)), index=dr) + _check_plot_works(ser.plot) + + @pytest.mark.slow + def test_uhf(self): + import pandas.plotting._matplotlib.converter as conv + + idx = date_range("2012-6-22 21:59:51.960928", freq="L", periods=500) + df = DataFrame(np.random.randn(len(idx), 2), index=idx) + + _, ax = self.plt.subplots() + df.plot(ax=ax) + axis = ax.get_xaxis() + + tlocs = axis.get_ticklocs() + tlabels = axis.get_ticklabels() + for loc, label in zip(tlocs, tlabels): + xp = conv._from_ordinal(loc).strftime("%H:%M:%S.%f") + rs = str(label.get_text()) + if len(rs): + assert xp == rs + + @pytest.mark.slow + def test_irreg_hf(self): + idx = date_range("2012-6-22 21:59:51", freq="S", periods=100) + df = DataFrame(np.random.randn(len(idx), 2), index=idx) + + irreg = df.iloc[[0, 1, 3, 4]] + _, ax = self.plt.subplots() + irreg.plot(ax=ax) + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + + sec = 1.0 / 24 / 60 / 60 + assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() + + _, ax = self.plt.subplots() + df2 = df.copy() + df2.index = df.index.astype(object) + df2.plot(ax=ax) + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + assert (np.fabs(diffs[1:] - sec) < 1e-8).all() + + def test_irregular_datetime64_repr_bug(self): + ser = tm.makeTimeSeries() + ser = ser[[0, 1, 2, 7]] + + _, ax = self.plt.subplots() + + ret = ser.plot(ax=ax) + assert ret is not None + + for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): + assert rs == xp + + def test_business_freq(self): + bts = tm.makePeriodSeries() + _, ax = self.plt.subplots() + bts.plot(ax=ax) + assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal + idx = ax.get_lines()[0].get_xdata() + assert PeriodIndex(data=idx).freqstr == "B" + + @pytest.mark.slow + def test_business_freq_convert(self): + bts = tm.makeTimeSeries(300).asfreq("BM") + ts = bts.to_period("M") + _, ax = self.plt.subplots() + bts.plot(ax=ax) + assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal + idx = ax.get_lines()[0].get_xdata() + assert PeriodIndex(data=idx).freqstr == "M" + + def test_nonzero_base(self): + # GH2571 + idx = date_range("2012-12-20", 
periods=24, freq="H") + timedelta(minutes=30) + df = DataFrame(np.arange(24), index=idx) + _, ax = self.plt.subplots() + df.plot(ax=ax) + rs = ax.get_lines()[0].get_xdata() + assert not Index(rs).is_normalized + + def test_dataframe(self): + bts = DataFrame({"a": tm.makeTimeSeries()}) + _, ax = self.plt.subplots() + bts.plot(ax=ax) + idx = ax.get_lines()[0].get_xdata() + tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) + + @pytest.mark.slow + def test_axis_limits(self): + def _test(ax): + xlim = ax.get_xlim() + ax.set_xlim(xlim[0] - 5, xlim[1] + 10) + result = ax.get_xlim() + assert result[0] == xlim[0] - 5 + assert result[1] == xlim[1] + 10 + + # string + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) + ax.set_xlim("1/1/2000", "4/1/2000") + result = ax.get_xlim() + assert int(result[0]) == expected[0].ordinal + assert int(result[1]) == expected[1].ordinal + + # datetime + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) + ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) + result = ax.get_xlim() + assert int(result[0]) == expected[0].ordinal + assert int(result[1]) == expected[1].ordinal + fig = ax.get_figure() + self.plt.close(fig) + + ser = tm.makeTimeSeries() + _, ax = self.plt.subplots() + ser.plot(ax=ax) + _test(ax) + + _, ax = self.plt.subplots() + df = DataFrame({"a": ser, "b": ser + 1}) + df.plot(ax=ax) + _test(ax) + + df = DataFrame({"a": ser, "b": ser + 1}) + axes = df.plot(subplots=True) + + for ax in axes: + _test(ax) + + def test_get_finder(self): + import pandas.plotting._matplotlib.converter as conv + + assert conv.get_finder("B") == conv._daily_finder + assert conv.get_finder("D") == conv._daily_finder + assert conv.get_finder("M") == conv._monthly_finder + assert conv.get_finder("Q") == conv._quarterly_finder + assert conv.get_finder("A") == conv._annual_finder + assert conv.get_finder("W") == conv._daily_finder + + @pytest.mark.slow + def test_finder_daily(self): + day_lst = [10, 40, 252, 400, 950, 2750, 10000] + + xpl1 = xpl2 = [Period("1999-1-1", freq="B").ordinal] * len(day_lst) + rs1 = [] + rs2 = [] + for i, n in enumerate(day_lst): + rng = bdate_range("1999-1-1", periods=n) + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs1.append(xaxis.get_majorticklocs()[0]) + + vmin, vmax = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs2.append(xaxis.get_majorticklocs()[0]) + self.plt.close(ax.get_figure()) + + assert rs1 == xpl1 + assert rs2 == xpl2 + + @pytest.mark.slow + def test_finder_quarterly(self): + yrs = [3.5, 11] + + xpl1 = xpl2 = [Period("1988Q1").ordinal] * len(yrs) + rs1 = [] + rs2 = [] + for i, n in enumerate(yrs): + rng = period_range("1987Q2", periods=int(n * 4), freq="Q") + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs1.append(xaxis.get_majorticklocs()[0]) + + (vmin, vmax) = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs2.append(xaxis.get_majorticklocs()[0]) + self.plt.close(ax.get_figure()) + + assert rs1 == xpl1 + assert rs2 == xpl2 + + @pytest.mark.slow + def test_finder_monthly(self): + yrs = [1.15, 2.5, 4, 11] + + xpl1 = xpl2 = [Period("Jan 1988").ordinal] * len(yrs) + rs1 = [] + rs2 = [] + for i, n in enumerate(yrs): + rng = period_range("1987Q2", periods=int(n * 12), freq="M") + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + 
rs1.append(xaxis.get_majorticklocs()[0]) + + vmin, vmax = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs2.append(xaxis.get_majorticklocs()[0]) + self.plt.close(ax.get_figure()) + + assert rs1 == xpl1 + assert rs2 == xpl2 + + def test_finder_monthly_long(self): + rng = period_range("1988Q1", periods=24 * 12, freq="M") + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period("1989Q1", "M").ordinal + assert rs == xp + + @pytest.mark.slow + def test_finder_annual(self): + xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] + xp = [Period(x, freq="A").ordinal for x in xp] + rs = [] + for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): + rng = period_range("1987", periods=nyears, freq="A") + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs.append(xaxis.get_majorticklocs()[0]) + self.plt.close(ax.get_figure()) + + assert rs == xp + + @pytest.mark.slow + def test_finder_minutely(self): + nminutes = 50 * 24 * 60 + rng = date_range("1/1/1999", freq="Min", periods=nminutes) + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period("1/1/1999", freq="Min").ordinal + + assert rs == xp + + def test_finder_hourly(self): + nhours = 23 + rng = date_range("1/1/1999", freq="H", periods=nhours) + ser = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + ser.plot(ax=ax) + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period("1/1/1999", freq="H").ordinal + + assert rs == xp + + @pytest.mark.slow + def test_gaps(self): + ts = tm.makeTimeSeries() + ts[5:25] = np.nan + _, ax = self.plt.subplots() + ts.plot(ax=ax) + lines = ax.get_lines() + assert len(lines) == 1 + line = lines[0] + data = line.get_xydata() + + if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: + data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan) + + assert isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + assert mask[5:25, 1].all() + self.plt.close(ax.get_figure()) + + # irregular + ts = tm.makeTimeSeries() + ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] + ts[2:5] = np.nan + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) + lines = ax.get_lines() + assert len(lines) == 1 + line = lines[0] + data = line.get_xydata() + + if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: + data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan) + + assert isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + assert mask[2:5, 1].all() + self.plt.close(ax.get_figure()) + + # non-ts + idx = [0, 1, 2, 5, 7, 9, 12, 15, 20] + ser = Series(np.random.randn(len(idx)), idx) + ser[2:5] = np.nan + _, ax = self.plt.subplots() + ser.plot(ax=ax) + lines = ax.get_lines() + assert len(lines) == 1 + line = lines[0] + data = line.get_xydata() + if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: + data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan) + + assert isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + assert mask[2:5, 1].all() + + @pytest.mark.slow + def test_gap_upsample(self): + low = tm.makeTimeSeries() + low[5:25] = np.nan + _, ax = self.plt.subplots() + low.plot(ax=ax) + + idxh = date_range(low.index[0], low.index[-1], freq="12h") + s = Series(np.random.randn(len(idxh)), idxh) + s.plot(secondary_y=True) + lines = ax.get_lines() + assert len(lines) == 1 + assert 
len(ax.right_ax.get_lines()) == 1 + + line = lines[0] + data = line.get_xydata() + if self.mpl_ge_3_0_0 or not self.mpl_ge_2_2_3: + data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan) + + assert isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + assert mask[5:25, 1].all() + + @pytest.mark.slow + def test_secondary_y(self): + ser = Series(np.random.randn(10)) + ser2 = Series(np.random.randn(10)) + fig, _ = self.plt.subplots() + ax = ser.plot(secondary_y=True) + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") + axes = fig.get_axes() + line = ax.get_lines()[0] + xp = Series(line.get_ydata(), line.get_xdata()) + tm.assert_series_equal(ser, xp) + assert ax.get_yaxis().get_ticks_position() == "right" + assert not axes[0].get_yaxis().get_visible() + self.plt.close(fig) + + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position + self.plt.close(ax2.get_figure()) + + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True) + assert ax.get_yaxis().get_visible() + assert not hasattr(ax, "left_ax") + assert hasattr(ax, "right_ax") + assert hasattr(ax2, "left_ax") + assert not hasattr(ax2, "right_ax") + + @pytest.mark.slow + def test_secondary_y_ts(self): + idx = date_range("1/1/2000", periods=10) + ser = Series(np.random.randn(10), idx) + ser2 = Series(np.random.randn(10), idx) + fig, _ = self.plt.subplots() + ax = ser.plot(secondary_y=True) + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") + axes = fig.get_axes() + line = ax.get_lines()[0] + xp = Series(line.get_ydata(), line.get_xdata()).to_timestamp() + tm.assert_series_equal(ser, xp) + assert ax.get_yaxis().get_ticks_position() == "right" + assert not axes[0].get_yaxis().get_visible() + self.plt.close(fig) + + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position + self.plt.close(ax2.get_figure()) + + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True) + assert ax.get_yaxis().get_visible() + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_secondary_kde(self): + + ser = Series(np.random.randn(10)) + fig, ax = self.plt.subplots() + ax = ser.plot(secondary_y=True, kind="density", ax=ax) + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") + axes = fig.get_axes() + assert axes[1].get_yaxis().get_ticks_position() == "right" + + @pytest.mark.slow + def test_secondary_bar(self): + ser = Series(np.random.randn(10)) + fig, ax = self.plt.subplots() + ser.plot(secondary_y=True, kind="bar", ax=ax) + axes = fig.get_axes() + assert axes[1].get_yaxis().get_ticks_position() == "right" + + @pytest.mark.slow + def test_secondary_frame(self): + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" + + @pytest.mark.slow + def test_secondary_bar_frame(self): + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" + + def test_mixed_freq_regular_first(self): + # TODO + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 
11, 12, 13, 14, 15]] + + # it works! + _, ax = self.plt.subplots() + s1.plot(ax=ax) + + ax2 = s2.plot(style="g", ax=ax) + lines = ax2.get_lines() + idx1 = PeriodIndex(lines[0].get_xdata()) + idx2 = PeriodIndex(lines[1].get_xdata()) + + tm.assert_index_equal(idx1, s1.index.to_period("B")) + tm.assert_index_equal(idx2, s2.index.to_period("B")) + + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + assert left <= pidx[0].ordinal + assert right >= pidx[-1].ordinal + + @pytest.mark.slow + def test_mixed_freq_irregular_first(self): + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + _, ax = self.plt.subplots() + s2.plot(style="g", ax=ax) + s1.plot(ax=ax) + assert not hasattr(ax, "freq") + lines = ax.get_lines() + x1 = lines[0].get_xdata() + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) + x2 = lines[1].get_xdata() + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) + + def test_mixed_freq_regular_first_df(self): + # GH 9852 + s1 = tm.makeTimeSeries().to_frame() + s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + _, ax = self.plt.subplots() + s1.plot(ax=ax) + ax2 = s2.plot(style="g", ax=ax) + lines = ax2.get_lines() + idx1 = PeriodIndex(lines[0].get_xdata()) + idx2 = PeriodIndex(lines[1].get_xdata()) + assert idx1.equals(s1.index.to_period("B")) + assert idx2.equals(s2.index.to_period("B")) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + assert left <= pidx[0].ordinal + assert right >= pidx[-1].ordinal + + @pytest.mark.slow + def test_mixed_freq_irregular_first_df(self): + # GH 9852 + s1 = tm.makeTimeSeries().to_frame() + s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + _, ax = self.plt.subplots() + s2.plot(style="g", ax=ax) + s1.plot(ax=ax) + assert not hasattr(ax, "freq") + lines = ax.get_lines() + x1 = lines[0].get_xdata() + tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) + x2 = lines[1].get_xdata() + tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) + + def test_mixed_freq_hf_first(self): + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == "D" + + @pytest.mark.slow + def test_mixed_freq_alignment(self): + ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H") + ts_data = np.random.randn(12) + + ts = Series(ts_data, index=ts_ind) + ts2 = ts.asfreq("T").interpolate() + + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) + ts2.plot(style="r", ax=ax) + + assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] + + @pytest.mark.slow + def test_mixed_freq_lf_first(self): + + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + low.plot(legend=True, ax=ax) + high.plot(legend=True, ax=ax) + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == "D" + leg = ax.get_legend() + assert len(leg.texts) == 2 + self.plt.close(ax.get_figure()) + + idxh = date_range("1/1/1999", periods=240, freq="T") + idxl = date_range("1/1/1999", periods=4, freq="H") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + low.plot(ax=ax) + 
high.plot(ax=ax) + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == "T" + + def test_mixed_freq_irreg_period(self): + ts = tm.makeTimeSeries() + irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] + rng = period_range("1/3/2000", periods=30, freq="B") + ps = Series(np.random.randn(len(rng)), rng) + _, ax = self.plt.subplots() + irreg.plot(ax=ax) + ps.plot(ax=ax) + + def test_mixed_freq_shared_ax(self): + + # GH13341, using sharex=True + idx1 = date_range("2015-01-01", periods=3, freq="M") + idx2 = idx1[:1].union(idx1[2:]) + s1 = Series(range(len(idx1)), idx1) + s2 = Series(range(len(idx2)), idx2) + + fig, (ax1, ax2) = self.plt.subplots(nrows=2, sharex=True) + s1.plot(ax=ax1) + s2.plot(ax=ax2) + + assert ax1.freq == "M" + assert ax2.freq == "M" + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] + + # using twinx + fig, ax1 = self.plt.subplots() + ax2 = ax1.twinx() + s1.plot(ax=ax1) + s2.plot(ax=ax2) + + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] + + # TODO (GH14330, GH14322) + # plotting the irregular first does not yet work + # fig, ax1 = plt.subplots() + # ax2 = ax1.twinx() + # s2.plot(ax=ax1) + # s1.plot(ax=ax2) + # assert (ax1.lines[0].get_xydata()[0, 0] == + # ax2.lines[0].get_xydata()[0, 0]) + + def test_nat_handling(self): + + _, ax = self.plt.subplots() + + dti = DatetimeIndex(["2015-01-01", NaT, "2015-01-03"]) + s = Series(range(len(dti)), dti) + s.plot(ax=ax) + xdata = ax.get_lines()[0].get_xdata() + # plot x data is bounded by index values + assert s.index.min() <= Series(xdata).min() + assert Series(xdata).max() <= s.index.max() + + @pytest.mark.slow + def test_to_weekly_resampling(self): + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq + + @pytest.mark.slow + def test_from_weekly_resampling(self): + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) + + expected_h = idxh.to_period().asi8.astype(np.float64) + expected_l = np.array( + [1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, 1549, 1553, 1558, 1562], + dtype=np.float64, + ) + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq + xdata = l.get_xdata(orig=False) + if len(xdata) == 12: # idxl lines + tm.assert_numpy_array_equal(xdata, expected_l) + else: + tm.assert_numpy_array_equal(xdata, expected_h) + tm.close() + + @pytest.mark.slow + def test_from_resampling_area_line_mixed(self): + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = DataFrame(np.random.rand(len(idxh), 3), index=idxh, columns=[0, 1, 2]) + low = DataFrame(np.random.rand(len(idxl), 3), index=idxl, columns=[0, 1, 2]) + + # low to high + for kind1, kind2 in [("line", "area"), ("area", "line")]: + _, ax = self.plt.subplots() + low.plot(kind=kind1, stacked=True, ax=ax) + high.plot(kind=kind2, stacked=True, ax=ax) + + # check low dataframe result + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 
1553, + 1558, + 1562, + ], + dtype=np.float64, + ) + expected_y = np.zeros(len(expected_x), dtype=np.float64) + for i in range(3): + line = ax.lines[i] + assert PeriodIndex(line.get_xdata()).freq == idxh.freq + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) + # check stacked values are correct + expected_y += low[i].values + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) + + # check high dataframe result + expected_x = idxh.to_period().asi8.astype(np.float64) + expected_y = np.zeros(len(expected_x), dtype=np.float64) + for i in range(3): + line = ax.lines[3 + i] + assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) + expected_y += high[i].values + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) + + # high to low + for kind1, kind2 in [("line", "area"), ("area", "line")]: + _, ax = self.plt.subplots() + high.plot(kind=kind1, stacked=True, ax=ax) + low.plot(kind=kind2, stacked=True, ax=ax) + + # check high dataframe result + expected_x = idxh.to_period().asi8.astype(np.float64) + expected_y = np.zeros(len(expected_x), dtype=np.float64) + for i in range(3): + line = ax.lines[i] + assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) + expected_y += high[i].values + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) + + # check low dataframe result + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 1553, + 1558, + 1562, + ], + dtype=np.float64, + ) + expected_y = np.zeros(len(expected_x), dtype=np.float64) + for i in range(3): + lines = ax.lines[3 + i] + assert PeriodIndex(data=lines.get_xdata()).freq == idxh.freq + tm.assert_numpy_array_equal(lines.get_xdata(orig=False), expected_x) + expected_y += low[i].values + tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y) + + @pytest.mark.slow + def test_mixed_freq_second_millisecond(self): + # GH 7772, GH 7760 + idxh = date_range("2014-07-01 09:00", freq="S", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + # high to low + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) + assert len(ax.get_lines()) == 2 + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == "L" + tm.close() + + # low to high + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) + assert len(ax.get_lines()) == 2 + for l in ax.get_lines(): + assert PeriodIndex(data=l.get_xdata()).freq == "L" + + @pytest.mark.slow + def test_irreg_dtypes(self): + # date + idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] + df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) + _check_plot_works(df.plot) + + # np.datetime64 + idx = date_range("1/1/2000", periods=10) + idx = idx[[0, 2, 5, 9]].astype(object) + df = DataFrame(np.random.randn(len(idx), 3), idx) + _, ax = self.plt.subplots() + _check_plot_works(df.plot, ax=ax) + + @pytest.mark.slow + def test_time(self): + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) + fig, ax = self.plt.subplots() + df.plot(ax=ax) + + # verify tick labels + ticks = 
ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + rs = l.get_text() + if len(rs) > 0: + if s != 0: + xp = time(h, m, s).strftime("%H:%M:%S") + else: + xp = time(h, m, s).strftime("%H:%M") + assert xp == rs + + @pytest.mark.slow + def test_time_change_xlim(self): + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) + fig, ax = self.plt.subplots() + df.plot(ax=ax) + + # verify tick labels + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + rs = l.get_text() + if len(rs) > 0: + if s != 0: + xp = time(h, m, s).strftime("%H:%M:%S") + else: + xp = time(h, m, s).strftime("%H:%M") + assert xp == rs + + # change xlim + ax.set_xlim("1:30", "5:00") + + # check tick labels again + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + rs = l.get_text() + if len(rs) > 0: + if s != 0: + xp = time(h, m, s).strftime("%H:%M:%S") + else: + xp = time(h, m, s).strftime("%H:%M") + assert xp == rs + + @pytest.mark.slow + def test_time_musec(self): + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(microseconds=int(x))).time() for x in deltas]) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) + fig, ax = self.plt.subplots() + ax = df.plot(ax=ax) + + # verify tick labels + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + + us = int(round((t - int(t)) * 1e6)) + + h, m = divmod(m, 60) + rs = l.get_text() + if len(rs) > 0: + if (us % 1000) != 0: + xp = time(h, m, s, us).strftime("%H:%M:%S.%f") + elif (us // 1000) != 0: + xp = time(h, m, s, us).strftime("%H:%M:%S.%f")[:-3] + elif s != 0: + xp = time(h, m, s, us).strftime("%H:%M:%S") + else: + xp = time(h, m, s, us).strftime("%H:%M") + assert xp == rs + + @pytest.mark.slow + def test_secondary_upsample(self): + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + _, ax = self.plt.subplots() + low.plot(ax=ax) + ax = high.plot(secondary_y=True, ax=ax) + for l in ax.get_lines(): + assert PeriodIndex(l.get_xdata()).freq == "D" + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") + for l in ax.left_ax.get_lines(): + assert PeriodIndex(l.get_xdata()).freq == "D" + + @pytest.mark.slow + def test_secondary_legend(self): + fig = self.plt.figure() + ax = fig.add_subplot(211) + + # ts + df = tm.makeTimeDataFrame() + df.plot(secondary_y=["A", "B"], ax=ax) + leg = ax.get_legend() + assert len(leg.get_lines()) == 4 + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B (right)" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" + assert ax.right_ax.get_legend() is None + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + assert len(colors) == 4 + self.plt.close(fig) + + fig = self.plt.figure() + ax = fig.add_subplot(211) + 
df.plot(secondary_y=["A", "C"], mark_right=False, ax=ax) + leg = ax.get_legend() + assert len(leg.get_lines()) == 4 + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" + self.plt.close(fig) + + fig, ax = self.plt.subplots() + df.plot(kind="bar", secondary_y=["A"], ax=ax) + leg = ax.get_legend() + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B" + self.plt.close(fig) + + fig, ax = self.plt.subplots() + df.plot(kind="bar", secondary_y=["A"], mark_right=False, ax=ax) + leg = ax.get_legend() + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" + self.plt.close(fig) + + fig = self.plt.figure() + ax = fig.add_subplot(211) + df = tm.makeTimeDataFrame() + ax = df.plot(secondary_y=["C", "D"], ax=ax) + leg = ax.get_legend() + assert len(leg.get_lines()) == 4 + assert ax.right_ax.get_legend() is None + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + assert len(colors) == 4 + self.plt.close(fig) + + # non-ts + df = tm.makeDataFrame() + fig = self.plt.figure() + ax = fig.add_subplot(211) + ax = df.plot(secondary_y=["A", "B"], ax=ax) + leg = ax.get_legend() + assert len(leg.get_lines()) == 4 + assert ax.right_ax.get_legend() is None + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + assert len(colors) == 4 + self.plt.close() + + fig = self.plt.figure() + ax = fig.add_subplot(211) + ax = df.plot(secondary_y=["C", "D"], ax=ax) + leg = ax.get_legend() + assert len(leg.get_lines()) == 4 + assert ax.right_ax.get_legend() is None + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + assert len(colors) == 4 + + def test_format_date_axis(self): + rng = date_range("1/1/2012", periods=12, freq="M") + df = DataFrame(np.random.randn(len(rng), 3), rng) + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) + xaxis = ax.get_xaxis() + for l in xaxis.get_ticklabels(): + if len(l.get_text()) > 0: + assert l.get_rotation() == 30 + + @pytest.mark.slow + def test_ax_plot(self): + x = date_range(start="2012-01-02", periods=10, freq="D") + y = list(range(len(x))) + _, ax = self.plt.subplots() + lines = ax.plot(x, y, label="Y") + tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) + + @pytest.mark.slow + def test_mpl_nopandas(self): + dates = [date(2008, 12, 31), date(2009, 1, 31)] + values1 = np.arange(10.0, 11.0, 0.5) + values2 = np.arange(11.0, 12.0, 0.5) + + kw = dict(fmt="-", lw=4) + + _, ax = self.plt.subplots() + ax.plot_date([x.toordinal() for x in dates], values1, **kw) + ax.plot_date([x.toordinal() for x in dates], values2, **kw) + + line1, line2 = ax.get_lines() + + exp = np.array([x.toordinal() for x in dates], dtype=np.float64) + tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) + exp = np.array([x.toordinal() for x in dates], dtype=np.float64) + tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) + + @pytest.mark.slow + def test_irregular_ts_shared_ax_xlim(self): + # GH 2960 + ts = tm.makeTimeSeries()[:20] + ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] + + # plot the left section of the irregular series, then the right section + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) + ts_irregular[5:].plot(ax=ax) + + # check that axis limits are correct + left, 
right = ax.get_xlim() + assert left <= ts_irregular.index.min().toordinal() + assert right >= ts_irregular.index.max().toordinal() + + @pytest.mark.slow + def test_secondary_y_non_ts_xlim(self): + # GH 3490 - non-timeseries with secondary y + index_1 = [1, 2, 3, 4] + index_2 = [5, 6, 7, 8] + s1 = Series(1, index=index_1) + s2 = Series(2, index=index_2) + + _, ax = self.plt.subplots() + s1.plot(ax=ax) + left_before, right_before = ax.get_xlim() + s2.plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + assert left_before >= left_after + assert right_before < right_after + + @pytest.mark.slow + def test_secondary_y_regular_ts_xlim(self): + # GH 3490 - regular-timeseries with secondary y + index_1 = date_range(start="2000-01-01", periods=4, freq="D") + index_2 = date_range(start="2000-01-05", periods=4, freq="D") + s1 = Series(1, index=index_1) + s2 = Series(2, index=index_2) + + _, ax = self.plt.subplots() + s1.plot(ax=ax) + left_before, right_before = ax.get_xlim() + s2.plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + assert left_before >= left_after + assert right_before < right_after + + @pytest.mark.slow + def test_secondary_y_mixed_freq_ts_xlim(self): + # GH 3490 - mixed frequency timeseries with secondary y + rng = date_range("2000-01-01", periods=10000, freq="min") + ts = Series(1, index=rng) + + _, ax = self.plt.subplots() + ts.plot(ax=ax) + left_before, right_before = ax.get_xlim() + ts.resample("D").mean().plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + # a downsample should not have changed either limit + assert left_before == left_after + assert right_before == right_after + + @pytest.mark.slow + def test_secondary_y_irregular_ts_xlim(self): + # GH 3490 - irregular-timeseries with secondary y + ts = tm.makeTimeSeries()[:20] + ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] + + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) + # plot higher-x values on secondary axis + ts_irregular[5:].plot(secondary_y=True, ax=ax) + # ensure secondary limits aren't overwritten by plot on primary + ts_irregular[:5].plot(ax=ax) + + left, right = ax.get_xlim() + assert left <= ts_irregular.index.min().toordinal() + assert right >= ts_irregular.index.max().toordinal() + + def test_plot_outofbounds_datetime(self): + # 2579 - checking this does not raise + values = [date(1677, 1, 1), date(1677, 1, 2)] + _, ax = self.plt.subplots() + ax.plot(values) + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + ax.plot(values) + + def test_format_timedelta_ticks_narrow(self): + + expected_labels = [f"00:00:00.0000000{i:0>2d}" for i in np.arange(10)] + + rng = timedelta_range("0", periods=10, freq="ns") + df = DataFrame(np.random.randn(len(rng), 3), rng) + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) + self.plt.draw() + labels = ax.get_xticklabels() + + result_labels = [x.get_text() for x in labels] + assert len(result_labels) == len(expected_labels) + assert result_labels == expected_labels + + def test_format_timedelta_ticks_wide(self): + expected_labels = [ + "00:00:00", + "1 days 03:46:40", + "2 days 07:33:20", + "3 days 11:20:00", + "4 days 15:06:40", + "5 days 18:53:20", + "6 days 22:40:00", + "8 days 02:26:40", + "9 days 06:13:20", + ] + + rng = timedelta_range("0", periods=10, freq="1 d") + df = DataFrame(np.random.randn(len(rng), 3), rng) + fig, ax = self.plt.subplots() + ax = df.plot(fontsize=2, ax=ax) + self.plt.draw() + labels = ax.get_xticklabels() + + result_labels = 
[x.get_text() for x in labels] + assert len(result_labels) == len(expected_labels) + assert result_labels == expected_labels + + def test_timedelta_plot(self): + # test issue #8711 + s = Series(range(5), timedelta_range("1day", periods=5)) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) + + # test long period + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") + s = Series(np.random.randn(len(index)), index) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) + + # test short period + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 ns") + s = Series(np.random.randn(len(index)), index) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) + + def test_hist(self): + # https://github.com/matplotlib/matplotlib/issues/8459 + rng = date_range("1/1/2011", periods=10, freq="H") + x = rng + w1 = np.arange(0, 1, 0.1) + w2 = np.arange(0, 1, 0.1)[::-1] + _, ax = self.plt.subplots() + ax.hist([x, x], weights=[w1, w2]) + + @pytest.mark.slow + def test_overlapping_datetime(self): + # GB 6608 + s1 = Series( + [1, 2, 3], + index=[ + datetime(1995, 12, 31), + datetime(2000, 12, 31), + datetime(2005, 12, 31), + ], + ) + s2 = Series( + [1, 2, 3], + index=[ + datetime(1997, 12, 31), + datetime(2003, 12, 31), + datetime(2008, 12, 31), + ], + ) + + # plot first series, then add the second series to those axes, + # then try adding the first series again + _, ax = self.plt.subplots() + s1.plot(ax=ax) + s2.plot(ax=ax) + s1.plot(ax=ax) + + @pytest.mark.xfail(reason="GH9053 matplotlib does not use ax.xaxis.converter") + def test_add_matplotlib_datetime64(self): + # GH9053 - ensure that a plot with PeriodConverter still understands + # datetime64 data. This still fails because matplotlib overrides the + # ax.xaxis.converter with a DatetimeConverter + s = Series(np.random.randn(10), index=date_range("1970-01-02", periods=10)) + ax = s.plot() + ax.plot(s.index, s.values, color="g") + l1, l2 = ax.lines + tm.assert_numpy_array_equal(l1.get_xydata(), l2.get_xydata()) + + def test_matplotlib_scatter_datetime64(self): + # https://github.com/matplotlib/matplotlib/issues/11391 + df = DataFrame(np.random.RandomState(0).rand(10, 2), columns=["x", "y"]) + df["time"] = date_range("2018-01-01", periods=10, freq="D") + fig, ax = self.plt.subplots() + ax.scatter(x="time", y="y", data=df) + self.plt.draw() + label = ax.get_xticklabels()[0] + if self.mpl_ge_3_0_0: + expected = "2017-12-08" + else: + expected = "2017-12-12" + assert label.get_text() == expected + + +def _check_plot_works(f, freq=None, series=None, *args, **kwargs): + import matplotlib.pyplot as plt + + fig = plt.gcf() + + try: + plt.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop("ax", plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, DateOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert ax.freq == freq + + ax = fig.add_subplot(212) + kwargs["ax"] = ax + ret = f(*args, **kwargs) + assert ret is not None # TODO: do something more intelligent + + with tm.ensure_clean(return_filelike=True) as path: + plt.savefig(path) + + # GH18439 + # this is supported only in Python 3 pickle since + # pickle in Python2 doesn't support instancemethod pickling + # TODO(statsmodels 0.10.0): Remove 
the statsmodels check + # https://github.com/pandas-dev/pandas/issues/24088 + # https://github.com/statsmodels/statsmodels/issues/4772 + if "statsmodels" not in sys.modules: + with tm.ensure_clean(return_filelike=True) as path: + pickle.dump(fig, path) + finally: + plt.close(fig) diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_frame.py b/venv/Lib/site-packages/pandas/tests/plotting/test_frame.py new file mode 100644 index 0000000..1c429ba --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_frame.py @@ -0,0 +1,3314 @@ +# coding: utf-8 + +""" Test cases for DataFrame.plot """ + +from datetime import date, datetime +import itertools +import string +import warnings + +import numpy as np +from numpy.random import rand, randn +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.api import is_list_like + +import pandas as pd +from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range +import pandas._testing as tm +from pandas.core.arrays import integer_array +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + +from pandas.io.formats.printing import pprint_thing +import pandas.plotting as plotting + + +@td.skip_if_no_mpl +class TestDataFramePlots(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) + + def _assert_ytickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_yticklabels(), visible=exp) + + def _assert_xtickslabels_visibility(self, axes, expected): + for ax, exp in zip(axes, expected): + self._check_visible(ax.get_xticklabels(), visible=exp) + + @pytest.mark.slow + def test_plot(self): + from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0 + + df = self.tdf + _check_plot_works(df.plot, grid=False) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, subplots=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, subplots=True, layout=(-1, 2)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, subplots=True, use_index=False) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + if _mpl_ge_3_1_0(): + msg = "'Line2D' object has no property 'blarg'" + else: + msg = "Unknown property blarg" + with pytest.raises(AttributeError, match=msg): + df.plot.line(blarg=True) + + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + _check_plot_works(df.plot, use_index=True) + _check_plot_works(df.plot, sort_columns=False) + _check_plot_works(df.plot, yticks=[1, 5, 10]) + _check_plot_works(df.plot, xticks=[1, 5, 10]) + _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) + + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, subplots=True, title="blah") + + # We have to redo it here because _check_plot_works does two plots, + # once without an ax kwarg and once with an ax kwarg and the new sharex + # behaviour does not remove the visibility of the latter axis (as ax is + # present). see: https://github.com/pandas-dev/pandas/issues/9737 + + axes = df.plot(subplots=True, title="blah") + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + # axes[0].figure.savefig("test.png") + for ax in axes[:2]: + self._check_visible(ax.xaxis) # xaxis must be visible for grid + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in [axes[2]]: + self._check_visible(ax.xaxis) + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + self._check_ticks_props(ax, xrot=0) + + _check_plot_works(df.plot, title="blah") + + tuples = zip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) + _check_plot_works(df.plot, use_index=True) + + # unicode + index = MultiIndex.from_tuples( + [ + ("\u03b1", 0), + ("\u03b1", 1), + ("\u03b2", 2), + ("\u03b2", 3), + ("\u03b3", 4), + ("\u03b3", 5), + ("\u03b4", 6), + ("\u03b4", 7), + ], + names=["i0", "i1"], + ) + columns = MultiIndex.from_tuples( + [("bar", "\u0394"), ("bar", "\u0395")], names=["c0", "c1"] + ) + df = DataFrame(np.random.randint(0, 10, (8, 2)), columns=columns, index=index) + _check_plot_works(df.plot, title="\u03A3") + + # GH 6951 + # Test with single column + df = DataFrame({"x": np.random.rand(10)}) + axes = _check_plot_works(df.plot.bar, subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + axes = _check_plot_works(df.plot.bar, subplots=True, layout=(-1, 1)) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.plot.bar(subplots=True, ax=ax) + assert len(axes) == 1 + result = ax.axes + assert result is axes[0] + + def test_integer_array_plot(self): + # GH 25587 + arr = integer_array([1, 2, 3, 4], dtype="UInt32") + + s = Series(arr) + _check_plot_works(s.plot.line) + _check_plot_works(s.plot.bar) + _check_plot_works(s.plot.hist) + 
_check_plot_works(s.plot.pie) + + df = DataFrame({"x": arr, "y": arr}) + _check_plot_works(df.plot.line) + _check_plot_works(df.plot.bar) + _check_plot_works(df.plot.hist) + _check_plot_works(df.plot.pie, y="y") + _check_plot_works(df.plot.scatter, x="x", y="y") + _check_plot_works(df.plot.hexbin, x="x", y="y") + + def test_mpl2_color_cycle_str(self): + # GH 15516 + colors = ["C" + str(x) for x in range(10)] + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) + for c in colors: + _check_plot_works(df.plot, color=c) + + def test_color_single_series_list(self): + # GH 3486 + df = DataFrame({"A": [1, 2, 3]}) + _check_plot_works(df.plot, color=["red"]) + + def test_rgb_tuple_color(self): + # GH 16695 + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) + + def test_color_empty_string(self): + df = DataFrame(randn(10, 2)) + with pytest.raises(ValueError): + df.plot(color="") + + def test_color_and_style_arguments(self): + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + # passing both 'color' and 'style' arguments should be allowed + # if there is no color symbol in the style strings: + ax = df.plot(color=["red", "black"], style=["-", "--"]) + # check that the linestyles are correctly set: + linestyle = [line.get_linestyle() for line in ax.lines] + assert linestyle == ["-", "--"] + # check that the colors are correctly set: + color = [line.get_color() for line in ax.lines] + assert color == ["red", "black"] + # passing both 'color' and 'style' arguments should not be allowed + # if there is a color symbol in the style strings: + with pytest.raises(ValueError): + df.plot(color=["red", "black"], style=["k-", "r--"]) + + def test_nonnumeric_exclude(self): + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) + ax = df.plot() + assert len(ax.get_lines()) == 1 # B was plotted + + @pytest.mark.slow + def test_implicit_label(self): + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) + ax = df.plot(x="a", y="b") + self._check_text_labels(ax.xaxis.get_label(), "a") + + @pytest.mark.slow + def test_donot_overwrite_index_name(self): + # GH 8494 + df = DataFrame(randn(2, 2), columns=["a", "b"]) + df.index.name = "NAME" + df.plot(y="b", label="LABEL") + assert df.index.name == "NAME" + + @pytest.mark.slow + def test_plot_xy(self): + # columns.inferred_type == 'string' + df = self.tdf + self._check_data(df.plot(x=0, y=1), df.set_index("A")["B"].plot()) + self._check_data(df.plot(x=0), df.set_index("A").plot()) + self._check_data(df.plot(y=0), df.B.plot()) + self._check_data(df.plot(x="A", y="B"), df.set_index("A").B.plot()) + self._check_data(df.plot(x="A"), df.set_index("A").plot()) + self._check_data(df.plot(y="B"), df.B.plot()) + + # columns.inferred_type == 'integer' + df.columns = np.arange(1, len(df.columns) + 1) + self._check_data(df.plot(x=1, y=2), df.set_index(1)[2].plot()) + self._check_data(df.plot(x=1), df.set_index(1).plot()) + self._check_data(df.plot(y=1), df[1].plot()) + + # figsize and title + ax = df.plot(x=1, y=2, title="Test", figsize=(16, 8)) + self._check_text_labels(ax.title, "Test") + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16.0, 8.0)) + + # columns.inferred_type == 'mixed' + # TODO add MultiIndex test + + @pytest.mark.slow + @pytest.mark.parametrize( + "input_log, expected_log", [(True, "log"), ("sym", "symlog")] + ) + def test_logscales(self, input_log, expected_log): + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) + + ax = 
df.plot(logy=input_log) + self._check_ax_scales(ax, yaxis=expected_log) + assert ax.get_yscale() == expected_log + + ax = df.plot(logx=input_log) + self._check_ax_scales(ax, xaxis=expected_log) + assert ax.get_xscale() == expected_log + + ax = df.plot(loglog=input_log) + self._check_ax_scales(ax, xaxis=expected_log, yaxis=expected_log) + assert ax.get_xscale() == expected_log + assert ax.get_yscale() == expected_log + + @pytest.mark.parametrize("input_param", ["logx", "logy", "loglog"]) + def test_invalid_logscale(self, input_param): + # GH: 24867 + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) + + msg = "Boolean, None and 'sym' are valid options, 'sm' is given." + with pytest.raises(ValueError, match=msg): + df.plot(**{input_param: "sm"}) + + @pytest.mark.slow + def test_xcompat(self): + import pandas as pd + + df = self.tdf + ax = df.plot(x_compat=True) + lines = ax.get_lines() + assert not isinstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + pd.plotting.plot_params["xaxis.compat"] = True + ax = df.plot() + lines = ax.get_lines() + assert not isinstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + pd.plotting.plot_params["x_compat"] = False + + ax = df.plot() + lines = ax.get_lines() + assert not isinstance(lines[0].get_xdata(), PeriodIndex) + assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) + + tm.close() + # useful if you're plotting a bunch together + with pd.plotting.plot_params.use("x_compat", True): + ax = df.plot() + lines = ax.get_lines() + assert not isinstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + ax = df.plot() + lines = ax.get_lines() + assert not isinstance(lines[0].get_xdata(), PeriodIndex) + assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) + + def test_period_compat(self): + # GH 9012 + # period-array conversions + df = DataFrame( + np.random.rand(21, 2), + index=bdate_range(datetime(2000, 1, 1), datetime(2000, 1, 31)), + columns=["a", "b"], + ) + + df.plot() + self.plt.axhline(y=0) + tm.close() + + def test_unsorted_index(self): + df = DataFrame( + {"y": np.arange(100)}, index=np.arange(99, -1, -1), dtype=np.int64 + ) + ax = df.plot() + lines = ax.get_lines()[0] + rs = lines.get_xydata() + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") + tm.assert_series_equal(rs, df.y, check_index_type=False) + tm.close() + + df.index = pd.Index(np.arange(99, -1, -1), dtype=np.float64) + ax = df.plot() + lines = ax.get_lines()[0] + rs = lines.get_xydata() + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") + tm.assert_series_equal(rs, df.y) + + def test_unsorted_index_lims(self): + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0]}, index=[1.0, 0.0, 3.0, 2.0]) + ax = df.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= np.nanmin(lines[0].get_data()[0]) + assert xmax >= np.nanmax(lines[0].get_data()[0]) + + df = DataFrame( + {"y": [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0]}, + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) + ax = df.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= np.nanmin(lines[0].get_data()[0]) + assert xmax >= np.nanmax(lines[0].get_data()[0]) + + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0], "z": [91.0, 90.0, 93.0, 92.0]}) + ax = df.plot(x="z", y="y") + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= np.nanmin(lines[0].get_data()[0]) + assert xmax >= np.nanmax(lines[0].get_data()[0]) + + @pytest.mark.slow + def test_subplots(self): + df = DataFrame(np.random.rand(10, 3), 
index=list(string.ascii_letters[:10])) + + for kind in ["bar", "barh", "line", "area"]: + axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + assert axes.shape == (3,) + + for ax, column in zip(axes, df.columns): + self._check_legend_labels(ax, labels=[pprint_thing(column)]) + + for ax in axes[:-2]: + self._check_visible(ax.xaxis) # xaxis must be visible for grid + self._check_visible(ax.get_xticklabels(), visible=False) + if not (kind == "bar" and self.mpl_ge_3_1_0): + # change https://github.com/pandas-dev/pandas/issues/26714 + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.xaxis.get_label(), visible=False) + self._check_visible(ax.get_yticklabels()) + + self._check_visible(axes[-1].xaxis) + self._check_visible(axes[-1].get_xticklabels()) + self._check_visible(axes[-1].get_xticklabels(minor=True)) + self._check_visible(axes[-1].xaxis.get_label()) + self._check_visible(axes[-1].get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, sharex=False) + for ax in axes: + self._check_visible(ax.xaxis) + self._check_visible(ax.get_xticklabels()) + self._check_visible(ax.get_xticklabels(minor=True)) + self._check_visible(ax.xaxis.get_label()) + self._check_visible(ax.get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, legend=False) + for ax in axes: + assert ax.get_legend() is None + + def test_groupby_boxplot_sharey(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharey can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) + + # behavior without keyword + axes = df.groupby("c").boxplot() + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # set sharey=True should be identical + axes = df.groupby("c").boxplot(sharey=True) + expected = [True, False, True, False] + self._assert_ytickslabels_visibility(axes, expected) + + # sharey=False, all yticklabels should be visible + axes = df.groupby("c").boxplot(sharey=False) + expected = [True, True, True, True] + self._assert_ytickslabels_visibility(axes, expected) + + def test_groupby_boxplot_sharex(self): + # https://github.com/pandas-dev/pandas/issues/20968 + # sharex can now be switched check whether the right + # pair of axes is turned on or off + + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) + + # behavior without keyword + axes = df.groupby("c").boxplot() + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # set sharex=False should be identical + axes = df.groupby("c").boxplot(sharex=False) + expected = [True, True, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + # sharex=True, yticklabels should be visible + # only for bottom plots + axes = df.groupby("c").boxplot(sharex=True) + expected = [False, False, True, True] + self._assert_xtickslabels_visibility(axes, expected) + + @pytest.mark.slow + def test_subplots_timeseries(self): + idx = date_range(start="2014-07-01", freq="M", periods=10) + df = DataFrame(np.random.rand(10, 3), index=idx) + + for kind in ["line", "area"]: + axes = df.plot(kind=kind, subplots=True, sharex=True) + self._check_axes_shape(axes, axes_num=3, 
layout=(3, 1)) + + for ax in axes[:-2]: + # GH 7801 + self._check_visible(ax.xaxis) # xaxis must be visible for grid + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.xaxis.get_label(), visible=False) + self._check_visible(ax.get_yticklabels()) + + self._check_visible(axes[-1].xaxis) + self._check_visible(axes[-1].get_xticklabels()) + self._check_visible(axes[-1].get_xticklabels(minor=True)) + self._check_visible(axes[-1].xaxis.get_label()) + self._check_visible(axes[-1].get_yticklabels()) + self._check_ticks_props(axes, xrot=0) + + axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) + for ax in axes: + self._check_visible(ax.xaxis) + self._check_visible(ax.get_xticklabels()) + self._check_visible(ax.get_xticklabels(minor=True)) + self._check_visible(ax.xaxis.get_label()) + self._check_visible(ax.get_yticklabels()) + self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) + + def test_subplots_timeseries_y_axis(self): + # GH16953 + data = { + "numeric": np.array([1, 2, 5]), + "timedelta": [ + pd.Timedelta(-10, unit="s"), + pd.Timedelta(10, unit="m"), + pd.Timedelta(10, unit="h"), + ], + "datetime_no_tz": [ + pd.to_datetime("2017-08-01 00:00:00"), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + "datetime_all_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00", utc=True), + pd.to_datetime("2017-08-02 00:00:00", utc=True), + ], + "text": ["This", "should", "fail"], + } + testdata = DataFrame(data) + + ax_numeric = testdata.plot(y="numeric") + assert ( + ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values + ).all() + ax_timedelta = testdata.plot(y="timedelta") + assert ( + ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values + ).all() + ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") + assert ( + ax_datetime_no_tz.get_lines()[0].get_data()[1] + == testdata["datetime_no_tz"].values + ).all() + ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") + assert ( + ax_datetime_all_tz.get_lines()[0].get_data()[1] + == testdata["datetime_all_tz"].values + ).all() + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + testdata.plot(y="text") + + @pytest.mark.xfail(reason="not support for period, categorical, datetime_mixed_tz") + def test_subplots_timeseries_y_axis_not_supported(self): + """ + This test will fail for: + period: + since period isn't yet implemented in ``select_dtypes`` + and because it will need a custom value converter + + tick formatter (as was done for x-axis plots) + + categorical: + because it will need a custom value converter + + tick formatter (also doesn't work for x-axis, as of now) + + datetime_mixed_tz: + because of the way how pandas handles ``Series`` of + ``datetime`` objects with different timezone, + generally converting ``datetime`` objects in a tz-aware + form could help with this problem + """ + data = { + "numeric": np.array([1, 2, 5]), + "period": [ + pd.Period("2017-08-01 00:00:00", freq="H"), + pd.Period("2017-08-01 02:00", freq="H"), + pd.Period("2017-08-02 00:00:00", freq="H"), + ], + "categorical": pd.Categorical( + ["c", "b", "a"], categories=["a", "b", "c"], ordered=False + ), + "datetime_mixed_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + } + testdata = 
pd.DataFrame(data) + ax_period = testdata.plot(x="numeric", y="period") + assert ( + ax_period.get_lines()[0].get_data()[1] == testdata["period"].values + ).all() + ax_categorical = testdata.plot(x="numeric", y="categorical") + assert ( + ax_categorical.get_lines()[0].get_data()[1] + == testdata["categorical"].values + ).all() + ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") + assert ( + ax_datetime_mixed_tz.get_lines()[0].get_data()[1] + == testdata["datetime_mixed_tz"].values + ).all() + + @pytest.mark.slow + def test_subplots_layout(self): + # GH 6667 + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(-1, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert axes.shape == (2, 2) + + axes = df.plot(subplots=True, layout=(1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + assert axes.shape == (1, 4) + + axes = df.plot(subplots=True, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=3, layout=(1, 4)) + assert axes.shape == (1, 4) + + axes = df.plot(subplots=True, layout=(4, -1)) + self._check_axes_shape(axes, axes_num=3, layout=(4, 1)) + assert axes.shape == (4, 1) + + with pytest.raises(ValueError): + df.plot(subplots=True, layout=(1, 1)) + with pytest.raises(ValueError): + df.plot(subplots=True, layout=(-1, -1)) + + # single column + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) + axes = df.plot(subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + assert axes.shape == (1,) + + axes = df.plot(subplots=True, layout=(3, 3)) + self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) + assert axes.shape == (3, 3) + + @pytest.mark.slow + def test_subplots_warnings(self): + # GH 9464 + with tm.assert_produces_warning(None): + df = DataFrame(np.random.randn(100, 4)) + df.plot(subplots=True, layout=(3, 2)) + + df = DataFrame( + np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) + ) + df.plot(subplots=True, layout=(3, 2)) + + @pytest.mark.slow + def test_subplots_multiple_axes(self): + # GH 5353, 6970, GH 7069 + fig, axes = self.plt.subplots(2, 3) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + + returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + assert returned.shape == (3,) + assert returned[0].figure is fig + # draw on second row + returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + assert returned.shape == (3,) + assert returned[0].figure is fig + self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) + tm.close() + + with pytest.raises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + df.plot(subplots=True, ax=axes) + + # pass 2-dim axes and invalid layout + # invalid lauout should not affect to input and return value + # (show warning is tested in + # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes + fig, axes = self.plt.subplots(2, 2) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + df = DataFrame(np.random.rand(10, 4), 
index=list(string.ascii_letters[:10])) + + returned = df.plot( + subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + returned = df.plot( + subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + returned = df.plot( + subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False + ) + self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) + assert returned.shape == (4,) + + # single column + fig, axes = self.plt.subplots(1, 1) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + assert axes.shape == (1,) + + def test_subplots_ts_share_axes(self): + # GH 3964 + fig, axes = self.plt.subplots(3, 3, sharex=True, sharey=True) + self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) + df = DataFrame( + np.random.randn(10, 9), + index=date_range(start="2014-07-01", freq="M", periods=10), + ) + for i, ax in enumerate(axes.ravel()): + df[i].plot(ax=ax, fontsize=5) + + # Rows other than bottom should not be visible + for ax in axes[0:-1].ravel(): + self._check_visible(ax.get_xticklabels(), visible=False) + + # Bottom row should be visible + for ax in axes[-1].ravel(): + self._check_visible(ax.get_xticklabels(), visible=True) + + # First column should be visible + for ax in axes[[0, 1, 2], [0]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + + # Other columns should not be visible + for ax in axes[[0, 1, 2], [1]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=False) + for ax in axes[[0, 1, 2], [2]].ravel(): + self._check_visible(ax.get_yticklabels(), visible=False) + + def test_subplots_sharex_axes_existing_axes(self): + # GH 9158 + d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 1.0], "C": [5, 1, 3, 4]} + df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) + + axes = df[["A", "B"]].plot(subplots=True) + df["C"].plot(ax=axes[0], secondary_y=True) + + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + for ax in axes.ravel(): + self._check_visible(ax.get_yticklabels(), visible=True) + + @pytest.mark.slow + def test_subplots_dup_columns(self): + # GH 10962 + df = DataFrame(np.random.rand(5, 5), columns=list("aaaaa")) + axes = df.plot(subplots=True) + for ax in axes: + self._check_legend_labels(ax, labels=["a"]) + assert len(ax.lines) == 1 + tm.close() + + axes = df.plot(subplots=True, secondary_y="a") + for ax in axes: + # (right) is only attached when subplots=False + self._check_legend_labels(ax, labels=["a"]) + assert len(ax.lines) == 1 + tm.close() + + ax = df.plot(secondary_y="a") + self._check_legend_labels(ax, labels=["a (right)"] * 5) + assert len(ax.lines) == 0 + assert len(ax.right_ax.lines) == 5 + + def test_negative_log(self): + df = -DataFrame( + rand(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) + + with pytest.raises(ValueError): + df.plot.area(logy=True) + with pytest.raises(ValueError): + df.plot.area(loglog=True) + + def _compare_stacked_y_cood(self, normal_lines, stacked_lines): + base = np.zeros(len(normal_lines[0].get_data()[1])) + for nl, sl in zip(normal_lines, stacked_lines): + base += nl.get_data()[1] # get 
y coordinates + sy = sl.get_data()[1] + tm.assert_numpy_array_equal(base, sy) + + def test_line_area_stacked(self): + with tm.RNGContext(42): + df = DataFrame(rand(6, 4), columns=["w", "x", "y", "z"]) + neg_df = -df + # each column has either positive or negative value + sep_df = DataFrame( + {"w": rand(6), "x": rand(6), "y": -rand(6), "z": -rand(6)} + ) + # each column has positive-negative mixed value + mixed_df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["w", "x", "y", "z"], + ) + + for kind in ["line", "area"]: + ax1 = _check_plot_works(df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines, ax2.lines) + + ax1 = _check_plot_works(neg_df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(neg_df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines, ax2.lines) + + ax1 = _check_plot_works(sep_df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(sep_df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines[:2], ax2.lines[:2]) + self._compare_stacked_y_cood(ax1.lines[2:], ax2.lines[2:]) + + _check_plot_works(mixed_df.plot, stacked=False) + with pytest.raises(ValueError): + mixed_df.plot(stacked=True) + + # Use an index with strictly positive values, preventing + # matplotlib from warning about ignoring xlim + df2 = df.set_index(df.index + 1) + _check_plot_works(df2.plot, kind=kind, logx=True, stacked=True) + + def test_line_area_nan_df(self): + values1 = [1, 2, np.nan, 3] + values2 = [3, np.nan, 2, 1] + df = DataFrame({"a": values1, "b": values2}) + tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4)) + + for d in [df, tdf]: + ax = _check_plot_works(d.plot) + masked1 = ax.lines[0].get_ydata() + masked2 = ax.lines[1].get_ydata() + # remove nan for comparison purpose + + exp = np.array([1, 2, 3], dtype=np.float64) + tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp) + + exp = np.array([3, 2, 1], dtype=np.float64) + tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) + tm.assert_numpy_array_equal( + masked1.mask, np.array([False, False, True, False]) + ) + tm.assert_numpy_array_equal( + masked2.mask, np.array([False, True, False, False]) + ) + + expected1 = np.array([1, 2, 0, 3], dtype=np.float64) + expected2 = np.array([3, 0, 2, 1], dtype=np.float64) + + ax = _check_plot_works(d.plot, stacked=True) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + + ax = _check_plot_works(d.plot.area) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + + ax = _check_plot_works(d.plot.area, stacked=False) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) + + def test_line_lim(self): + df = DataFrame(rand(6, 3), columns=["x", "y", "z"]) + ax = df.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data()[0][0] + assert xmax >= lines[0].get_data()[0][-1] + + ax = df.plot(secondary_y=True) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data()[0][0] + assert xmax >= lines[0].get_data()[0][-1] + + axes = df.plot(secondary_y=True, subplots=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + for ax in axes: + assert hasattr(ax, "left_ax") + assert not 
hasattr(ax, "right_ax") + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data()[0][0] + assert xmax >= lines[0].get_data()[0][-1] + + def test_area_lim(self): + df = DataFrame(rand(6, 4), columns=["x", "y", "z", "four"]) + + neg_df = -df + for stacked in [True, False]: + ax = _check_plot_works(df.plot.area, stacked=stacked) + xmin, xmax = ax.get_xlim() + ymin, ymax = ax.get_ylim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data()[0][0] + assert xmax >= lines[0].get_data()[0][-1] + assert ymin == 0 + + ax = _check_plot_works(neg_df.plot.area, stacked=stacked) + ymin, ymax = ax.get_ylim() + assert ymax == 0 + + @pytest.mark.slow + def test_bar_colors(self): + import matplotlib.pyplot as plt + + default_colors = self._unpack_cycler(plt.rcParams) + + df = DataFrame(randn(5, 5)) + ax = df.plot.bar() + self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) + tm.close() + + custom_colors = "rgcby" + ax = df.plot.bar(color=custom_colors) + self._check_colors(ax.patches[::5], facecolors=custom_colors) + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + ax = df.plot.bar(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot.bar(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) + tm.close() + + ax = df.plot(kind="bar", color="green") + self._check_colors(ax.patches[::5], facecolors=["green"] * 5) + tm.close() + + def test_bar_user_colors(self): + df = pd.DataFrame( + {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} + ) + # This should *only* work when `y` is specified, else + # we use one color per column + ax = df.plot.bar(y="A", color=df["color"]) + result = [p.get_facecolor() for p in ax.patches] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] + assert result == expected + + @pytest.mark.slow + def test_bar_linewidth(self): + df = DataFrame(randn(5, 5)) + + # regular + ax = df.plot.bar(linewidth=2) + for r in ax.patches: + assert r.get_linewidth() == 2 + + # stacked + ax = df.plot.bar(stacked=True, linewidth=2) + for r in ax.patches: + assert r.get_linewidth() == 2 + + # subplots + axes = df.plot.bar(linewidth=2, subplots=True) + self._check_axes_shape(axes, axes_num=5, layout=(5, 1)) + for ax in axes: + for r in ax.patches: + assert r.get_linewidth() == 2 + + @pytest.mark.slow + def test_bar_barwidth(self): + df = DataFrame(randn(5, 5)) + + width = 0.9 + + # regular + ax = df.plot.bar(width=width) + for r in ax.patches: + assert r.get_width() == width / len(df.columns) + + # stacked + ax = df.plot.bar(stacked=True, width=width) + for r in ax.patches: + assert r.get_width() == width + + # horizontal regular + ax = df.plot.barh(width=width) + for r in ax.patches: + assert r.get_height() == width / len(df.columns) + + # horizontal stacked + ax = df.plot.barh(stacked=True, width=width) + for r in ax.patches: + assert r.get_height() == width + + # subplots + axes = df.plot.bar(width=width, subplots=True) + for ax in axes: + for r in ax.patches: + assert r.get_width() == width + + # horizontal subplots + axes = df.plot.barh(width=width, subplots=True) + for ax in 
axes: + for r in ax.patches: + assert r.get_height() == width + + @pytest.mark.slow + def test_bar_barwidth_position(self): + df = DataFrame(randn(5, 5)) + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, position=0.2 + ) + + @pytest.mark.slow + def test_bar_barwidth_position_int(self): + # GH 12979 + df = DataFrame(randn(5, 5)) + + for w in [1, 1.0]: + ax = df.plot.bar(stacked=True, width=w) + ticks = ax.xaxis.get_ticklocs() + tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) + assert ax.get_xlim() == (-0.75, 4.75) + # check left-edge of bars + assert ax.patches[0].get_x() == -0.5 + assert ax.patches[-1].get_x() == 3.5 + + self._check_bar_alignment(df, kind="bar", stacked=True, width=1) + self._check_bar_alignment(df, kind="barh", stacked=False, width=1) + self._check_bar_alignment(df, kind="barh", stacked=True, width=1) + self._check_bar_alignment(df, kind="bar", subplots=True, width=1) + self._check_bar_alignment(df, kind="barh", subplots=True, width=1) + + @pytest.mark.slow + def test_bar_bottom_left(self): + df = DataFrame(rand(5, 5)) + ax = df.plot.bar(stacked=False, bottom=1) + result = [p.get_y() for p in ax.patches] + assert result == [1] * 25 + + ax = df.plot.bar(stacked=True, bottom=[-1, -2, -3, -4, -5]) + result = [p.get_y() for p in ax.patches[:5]] + assert result == [-1, -2, -3, -4, -5] + + ax = df.plot.barh(stacked=False, left=np.array([1, 1, 1, 1, 1])) + result = [p.get_x() for p in ax.patches] + assert result == [1] * 25 + + ax = df.plot.barh(stacked=True, left=[1, 2, 3, 4, 5]) + result = [p.get_x() for p in ax.patches[:5]] + assert result == [1, 2, 3, 4, 5] + + axes = df.plot.bar(subplots=True, bottom=-1) + for ax in axes: + result = [p.get_y() for p in ax.patches] + assert result == [-1] * 5 + + axes = df.plot.barh(subplots=True, left=np.array([1, 1, 1, 1, 1])) + for ax in axes: + result = [p.get_x() for p in ax.patches] + assert result == [1] * 5 + + @pytest.mark.slow + def test_bar_nan(self): + df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]}) + ax = df.plot.bar() + expected = [10, 0, 20, 5, 10, 20, 1, 2, 3] + result = [p.get_height() for p in ax.patches] + assert result == expected + + ax = df.plot.bar(stacked=True) + result = [p.get_height() for p in ax.patches] + assert result == expected + + result = [p.get_y() for p in ax.patches] + expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] + assert result == expected + + @pytest.mark.slow + def test_bar_categorical(self): + # GH 13019 + df1 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.Index(list("ABCDEF")), + columns=pd.Index(list("abcde")), + ) + # categorical index must behave the same + df2 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.CategoricalIndex(list("ABCDEF")), + columns=pd.CategoricalIndex(list("abcde")), + ) + + for df in [df1, df2]: + ax = df.plot.bar() + ticks = ax.xaxis.get_ticklocs() + tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5])) + assert ax.get_xlim() == (-0.5, 5.5) + # check left-edge of bars + assert ax.patches[0].get_x() == -0.25 + assert ax.patches[-1].get_x() == 
5.15 + + ax = df.plot.bar(stacked=True) + tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4, 5])) + assert ax.get_xlim() == (-0.5, 5.5) + assert ax.patches[0].get_x() == -0.25 + assert ax.patches[-1].get_x() == 4.75 + + @pytest.mark.slow + def test_plot_scatter(self): + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) + + _check_plot_works(df.plot.scatter, x="x", y="y") + _check_plot_works(df.plot.scatter, x=1, y=2) + + with pytest.raises(TypeError): + df.plot.scatter(x="x") + with pytest.raises(TypeError): + df.plot.scatter(y="y") + + # GH 6951 + axes = df.plot(x="x", y="y", kind="scatter", subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + def test_raise_error_on_datetime_time_data(self): + # GH 8113, datetime.time type is not supported by matplotlib in scatter + df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a number, not 'datetime.time'" + + with pytest.raises(TypeError, match=msg): + df.plot(kind="scatter", x="dtime", y="a") + + def test_scatterplot_datetime_data(self): + # GH 30391 + dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") + vals = np.random.normal(0, 1, len(dates)) + df = pd.DataFrame({"dates": dates, "vals": vals}) + + _check_plot_works(df.plot.scatter, x="dates", y="vals") + _check_plot_works(df.plot.scatter, x=0, y=1) + + def test_scatterplot_object_data(self): + # GH 18755 + df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + @pytest.mark.slow + def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): + # addressing issue #10611, to ensure colobar does not + # interfere with x-axis label and ticklabels with + # ipython inline backend. + random_array = np.random.random((1000, 3)) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + + ax1 = df.plot.scatter(x="A label", y="B label") + ax2 = df.plot.scatter(x="A label", y="B label", c="C label") + + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] + assert vis1 == vis2 + + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] + assert vis1 == vis2 + + assert ( + ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() + ) + + @pytest.mark.slow + def test_if_hexbin_xaxis_label_is_visible(self): + # addressing issue #10678, to ensure colobar does not + # interfere with x-axis label and ticklabels with + # ipython inline backend. 
+ random_array = np.random.random((1000, 3)) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + + ax = df.plot.hexbin("A label", "B label", gridsize=12) + assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) + assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) + assert ax.xaxis.get_label().get_visible() + + @pytest.mark.slow + def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): + import matplotlib.pyplot as plt + + random_array = np.random.random((1000, 3)) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + + fig, axes = plt.subplots(1, 2) + df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) + plt.tight_layout() + + points = np.array([ax.get_position().get_points() for ax in fig.axes]) + axes_x_coords = points[:, :, 0] + parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] + colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] + assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + + @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) + @pytest.mark.slow + def test_plot_scatter_with_categorical_data(self, x, y): + # after fixing GH 18755, should be able to plot categorical data + df = pd.DataFrame( + {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} + ) + + _check_plot_works(df.plot.scatter, x=x, y=y) + + @pytest.mark.slow + def test_plot_scatter_with_c(self): + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) + + axes = [df.plot.scatter(x="x", y="y", c="z"), df.plot.scatter(x=0, y=1, c=2)] + for ax in axes: + # default to Greys + assert ax.collections[0].cmap.name == "Greys" + + # n.b. there appears to be no public method + # to get the colorbar label + assert ax.collections[0].colorbar._label == "z" + + cm = "cubehelix" + ax = df.plot.scatter(x="x", y="y", c="z", colormap=cm) + assert ax.collections[0].cmap.name == cm + + # verify turning off colorbar works + ax = df.plot.scatter(x="x", y="y", c="z", colorbar=False) + assert ax.collections[0].colorbar is None + + # verify that we can still plot a solid color + ax = df.plot.scatter(x=0, y=1, c="red") + assert ax.collections[0].colorbar is None + self._check_colors(ax.collections, facecolors=["r"]) + + # Ensure that we can pass an np.array straight through to matplotlib, + # this functionality was accidentally removed previously. + # See https://github.com/pandas-dev/pandas/issues/8852 for bug report + # + # Exercise colormap path and non-colormap path as they are independent + # + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + red_rgba = [1.0, 0.0, 0.0, 1.0] + green_rgba = [0.0, 1.0, 0.0, 1.0] + rgba_array = np.array([red_rgba, green_rgba]) + ax = df.plot.scatter(x="A", y="B", c=rgba_array) + # expect the face colors of the points in the non-colormap path to be + # identical to the values we supplied, normally we'd be on shaky ground + # comparing floats for equality but here we expect them to be + # identical. + tm.assert_numpy_array_equal(ax.collections[0].get_facecolor(), rgba_array) + # we don't test the colors of the faces in this next plot because they + # are dependent on the spring colormap, which may change its colors + # later. 
+ float_array = np.array([0.0, 1.0]) + df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") + + def test_scatter_colors(self): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) + with pytest.raises(TypeError): + df.plot.scatter(x="a", y="b", c="c", color="green") + + default_colors = self._unpack_cycler(self.plt.rcParams) + + ax = df.plot.scatter(x="a", y="b", c="c") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array(self.colorconverter.to_rgba(default_colors[0])), + ) + + ax = df.plot.scatter(x="a", y="b", color="white") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array([1, 1, 1, 1], dtype=np.float64), + ) + + @pytest.mark.slow + def test_plot_bar(self): + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + + _check_plot_works(df.plot.bar) + _check_plot_works(df.plot.bar, legend=False) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot.bar, subplots=True) + _check_plot_works(df.plot.bar, stacked=True) + + df = DataFrame( + randn(10, 15), index=list(string.ascii_letters[:10]), columns=range(15) + ) + _check_plot_works(df.plot.bar) + + df = DataFrame({"a": [0, 1], "b": [1, 0]}) + ax = _check_plot_works(df.plot.bar) + self._check_ticks_props(ax, xrot=90) + + ax = df.plot.bar(rot=35, fontsize=10) + self._check_ticks_props(ax, xrot=35, xlabelsize=10, ylabelsize=10) + + ax = _check_plot_works(df.plot.barh) + self._check_ticks_props(ax, yrot=0) + + ax = df.plot.barh(rot=55, fontsize=11) + self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) + + def _check_bar_alignment( + self, + df, + kind="bar", + stacked=False, + subplots=False, + align="center", + width=0.5, + position=0.5, + ): + + axes = df.plot( + kind=kind, + stacked=stacked, + subplots=subplots, + align=align, + width=width, + position=position, + grid=True, + ) + + axes = self._flatten_visible(axes) + + for ax in axes: + if kind == "bar": + axis = ax.xaxis + ax_min, ax_max = ax.get_xlim() + min_edge = min(p.get_x() for p in ax.patches) + max_edge = max(p.get_x() + p.get_width() for p in ax.patches) + elif kind == "barh": + axis = ax.yaxis + ax_min, ax_max = ax.get_ylim() + min_edge = min(p.get_y() for p in ax.patches) + max_edge = max(p.get_y() + p.get_height() for p in ax.patches) + else: + raise ValueError + + # GH 7498 + # compare margins between lim and bar edges + tm.assert_almost_equal(ax_min, min_edge - 0.25) + tm.assert_almost_equal(ax_max, max_edge + 0.25) + + p = ax.patches[0] + if kind == "bar" and (stacked is True or subplots is True): + edge = p.get_x() + center = edge + p.get_width() * position + elif kind == "bar" and stacked is False: + center = p.get_x() + p.get_width() * len(df.columns) * position + edge = p.get_x() + elif kind == "barh" and (stacked is True or subplots is True): + center = p.get_y() + p.get_height() * position + edge = p.get_y() + elif kind == "barh" and stacked is False: + center = p.get_y() + p.get_height() * len(df.columns) * position + edge = p.get_y() + else: + raise ValueError + + # Check the ticks locates on integer + assert (axis.get_ticklocs() == np.arange(len(df))).all() + + if align == "center": + # Check whether the bar locates on center + tm.assert_almost_equal(axis.get_ticklocs()[0], center) + elif align == "edge": + # Check whether the bar's edge starts from the tick + tm.assert_almost_equal(axis.get_ticklocs()[0], edge) + else: + 
raise ValueError + + return axes + + @pytest.mark.slow + def test_bar_stacked_center(self): + # GH2157 + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) + + @pytest.mark.slow + def test_bar_center(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) + + @pytest.mark.slow + def test_bar_subplots_center(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) + self._check_bar_alignment(df, kind="barh", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) + + @pytest.mark.slow + def test_bar_align_single_column(self): + df = DataFrame(randn(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True) + + @pytest.mark.slow + def test_bar_edge(self): + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + + self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") + self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, align="edge" + ) + + @pytest.mark.slow + def test_bar_log_no_subplots(self): + # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 + # regressions in 1.2.1 + expected = np.array([0.1, 1.0, 10.0, 100]) + + # no subplots + df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) + ax = df.plot.bar(grid=True, log=True) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + + @pytest.mark.slow + def test_bar_log_subplots(self): + expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) + + ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( + log=True, subplots=True + ) + + tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) + tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) + + @pytest.mark.slow + def test_boxplot(self): + df = self.hist_df + 
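+ # The checks below exercise the default DataFrame.plot.box output: x tick labels
+ # come from the numeric columns and each column adds bp_n_objects line artists.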
series = df["height"] + numeric_cols = df._get_numeric_data().columns + labels = [pprint_thing(c) for c in numeric_cols] + + ax = _check_plot_works(df.plot.box) + self._check_text_labels(ax.get_xticklabels(), labels) + tm.assert_numpy_array_equal( + ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1) + ) + assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + + axes = series.plot.box(rot=40) + self._check_ticks_props(axes, xrot=40, yrot=0) + tm.close() + + ax = _check_plot_works(series.plot.box) + + positions = np.array([1, 6, 7]) + ax = df.plot.box(positions=positions) + numeric_cols = df._get_numeric_data().columns + labels = [pprint_thing(c) for c in numeric_cols] + self._check_text_labels(ax.get_xticklabels(), labels) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) + assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + + @pytest.mark.slow + def test_boxplot_vertical(self): + df = self.hist_df + numeric_cols = df._get_numeric_data().columns + labels = [pprint_thing(c) for c in numeric_cols] + + # if horizontal, yticklabels are rotated + ax = df.plot.box(rot=50, fontsize=8, vert=False) + self._check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) + self._check_text_labels(ax.get_yticklabels(), labels) + assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.box, subplots=True, vert=False, logx=True) + self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) + self._check_ax_scales(axes, xaxis="log") + for ax, label in zip(axes, labels): + self._check_text_labels(ax.get_yticklabels(), [label]) + assert len(ax.lines) == self.bp_n_objects + + positions = np.array([3, 2, 8]) + ax = df.plot.box(positions=positions, vert=False) + self._check_text_labels(ax.get_yticklabels(), labels) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) + assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + + @pytest.mark.slow + def test_boxplot_return_type(self): + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + with pytest.raises(ValueError): + df.plot.box(return_type="NOTATYPE") + + result = df.plot.box(return_type="dict") + self._check_box_return_type(result, "dict") + + result = df.plot.box(return_type="axes") + self._check_box_return_type(result, "axes") + + result = df.plot.box() # default axes + self._check_box_return_type(result, "axes") + + result = df.plot.box(return_type="both") + self._check_box_return_type(result, "both") + + @pytest.mark.slow + def test_boxplot_subplots_return_type(self): + df = self.hist_df + + # normal style: return_type=None + result = df.plot.box(subplots=True) + assert isinstance(result, Series) + self._check_box_return_type( + result, None, expected_keys=["height", "weight", "category"] + ) + + for t in ["dict", "axes", "both"]: + returned = df.plot.box(return_type=t, subplots=True) + self._check_box_return_type( + returned, + t, + expected_keys=["height", "weight", "category"], + check_ax_title=False, + ) + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_df(self): + df = DataFrame(randn(100, 4)) + ax = _check_plot_works(df.plot, kind="kde") + expected = [pprint_thing(c) for c in df.columns] + self._check_legend_labels(ax, labels=expected) + self._check_ticks_props(ax, xrot=0) + + ax = df.plot(kind="kde", rot=20, fontsize=5) + self._check_ticks_props(ax, xrot=20, 
xlabelsize=5, ylabelsize=5) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, kind="kde", subplots=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.plot(kind="kde", logy=True, subplots=True) + self._check_ax_scales(axes, yaxis="log") + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_missing_vals(self): + df = DataFrame(np.random.uniform(size=(100, 4))) + df.loc[0, 0] = np.nan + _check_plot_works(df.plot, kind="kde") + + @pytest.mark.slow + def test_hist_df(self): + from matplotlib.patches import Rectangle + + df = DataFrame(randn(100, 4)) + series = df[0] + + ax = _check_plot_works(df.plot.hist) + expected = [pprint_thing(c) for c in df.columns] + self._check_legend_labels(ax, labels=expected) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.hist, subplots=True, logy=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + self._check_ax_scales(axes, yaxis="log") + + axes = series.plot.hist(rot=40) + self._check_ticks_props(axes, xrot=40, yrot=0) + tm.close() + + ax = series.plot.hist(cumulative=True, bins=4, density=True) + # height of last bin (index 5) must be 1.0 + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + tm.assert_almost_equal(rects[-1].get_height(), 1.0) + tm.close() + + ax = series.plot.hist(cumulative=True, bins=4) + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + + tm.assert_almost_equal(rects[-2].get_height(), 100.0) + tm.close() + + # if horizontal, yticklabels are rotated + axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") + self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) + + def _check_box_coord( + self, + patches, + expected_y=None, + expected_h=None, + expected_x=None, + expected_w=None, + ): + result_y = np.array([p.get_y() for p in patches]) + result_height = np.array([p.get_height() for p in patches]) + result_x = np.array([p.get_x() for p in patches]) + result_width = np.array([p.get_width() for p in patches]) + # dtype is depending on above values, no need to check + + if expected_y is not None: + tm.assert_numpy_array_equal(result_y, expected_y, check_dtype=False) + if expected_h is not None: + tm.assert_numpy_array_equal(result_height, expected_h, check_dtype=False) + if expected_x is not None: + tm.assert_numpy_array_equal(result_x, expected_x, check_dtype=False) + if expected_w is not None: + tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) + + @pytest.mark.slow + def test_hist_df_coord(self): + normal_df = DataFrame( + { + "A": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([10, 9, 8, 7, 6])), + "B": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([8, 8, 8, 8, 8])), + "C": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])), + }, + columns=["A", "B", "C"], + ) + + nan_df = DataFrame( + { + "A": np.repeat( + np.array([np.nan, 1, 2, 3, 4, 5]), np.array([3, 10, 9, 8, 7, 6]) + ), + "B": np.repeat( + np.array([1, np.nan, 2, 3, 4, 5]), np.array([8, 3, 8, 8, 8, 8]) + ), + "C": np.repeat( + np.array([1, 2, 3, np.nan, 4, 5]), np.array([6, 7, 8, 3, 9, 10]) + ), + }, + columns=["A", "B", "C"], + ) + + for df in [normal_df, nan_df]: + ax = df.plot.hist(bins=5) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + 
ax.patches[10:], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) + + ax = df.plot.hist(bins=5, stacked=True) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([10, 9, 8, 7, 6]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_y=np.array([18, 17, 16, 15, 14]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) + + axes = df.plot.hist(bins=5, stacked=True, subplots=True) + self._check_box_coord( + axes[0].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) + + # horizontal + ax = df.plot.hist(bins=5, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) + + ax = df.plot.hist(bins=5, stacked=True, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([10, 9, 8, 7, 6]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_x=np.array([18, 17, 16, 15, 14]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) + + axes = df.plot.hist( + bins=5, stacked=True, subplots=True, orientation="horizontal" + ) + self._check_box_coord( + axes[0].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) + + @pytest.mark.slow + def test_plot_int_columns(self): + df = DataFrame(randn(100, 4)).cumsum() + _check_plot_works(df.plot, legend=True) + + @pytest.mark.slow + def test_df_legend_labels(self): + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) + df2 = DataFrame(rand(3, 3), columns=["d", "e", "f"]) + df3 = DataFrame(rand(3, 3), columns=["g", "h", "i"]) + df4 = DataFrame(rand(3, 3), columns=["j", "k", "l"]) + + for kind in kinds: + + ax = df.plot(kind=kind, legend=True) + self._check_legend_labels(ax, labels=df.columns) + + ax = df2.plot(kind=kind, legend=False, ax=ax) + self._check_legend_labels(ax, labels=df.columns) + + ax = df3.plot(kind=kind, legend=True, ax=ax) + self._check_legend_labels(ax, labels=df.columns.union(df3.columns)) + + ax = df4.plot(kind=kind, legend="reverse", ax=ax) + expected = list(df.columns.union(df3.columns)) + list(reversed(df4.columns)) + self._check_legend_labels(ax, labels=expected) + + # Secondary Y + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) + ax = df2.plot(legend=False, ax=ax) + 
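+ # Plotting df2 with legend=False on the same axes must leave the existing
+ # legend entries (including the "(right)" secondary-y suffix) untouched.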
self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) + ax = df3.plot(kind="bar", legend=True, secondary_y="h", ax=ax) + self._check_legend_labels( + ax, labels=["a", "b (right)", "c", "g", "h (right)", "i"] + ) + + # Time Series + ind = date_range("1/1/2014", periods=3) + df = DataFrame(randn(3, 3), columns=["a", "b", "c"], index=ind) + df2 = DataFrame(randn(3, 3), columns=["d", "e", "f"], index=ind) + df3 = DataFrame(randn(3, 3), columns=["g", "h", "i"], index=ind) + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) + ax = df2.plot(legend=False, ax=ax) + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) + ax = df3.plot(legend=True, ax=ax) + self._check_legend_labels(ax, labels=["a", "b (right)", "c", "g", "h", "i"]) + + # scatter + ax = df.plot.scatter(x="a", y="b", label="data1") + self._check_legend_labels(ax, labels=["data1"]) + ax = df2.plot.scatter(x="d", y="e", legend=False, label="data2", ax=ax) + self._check_legend_labels(ax, labels=["data1"]) + ax = df3.plot.scatter(x="g", y="h", label="data3", ax=ax) + self._check_legend_labels(ax, labels=["data1", "data3"]) + + # ensure label args pass through and + # index name does not mutate + # column names don't mutate + df5 = df.set_index("a") + ax = df5.plot(y="b") + self._check_legend_labels(ax, labels=["b"]) + ax = df5.plot(y="b", label="LABEL_b") + self._check_legend_labels(ax, labels=["LABEL_b"]) + self._check_text_labels(ax.xaxis.get_label(), "a") + ax = df5.plot(y="c", label="LABEL_c", ax=ax) + self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) + assert df5.columns.tolist() == ["b", "c"] + + def test_missing_marker_multi_plots_on_same_ax(self): + # GH 18222 + df = pd.DataFrame( + data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] + ) + fig, ax = self.plt.subplots(nrows=1, ncols=3) + # Left plot + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[0]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[0]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[0]) + self._check_legend_labels(ax[0], labels=["r", "g", "b"]) + self._check_legend_marker(ax[0], expected_markers=["o", "x", "o"]) + # Center plot + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[1]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[1]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[1]) + self._check_legend_labels(ax[1], labels=["b", "r", "g"]) + self._check_legend_marker(ax[1], expected_markers=["o", "o", "x"]) + # Right plot + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[2]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[2]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[2]) + self._check_legend_labels(ax[2], labels=["g", "b", "r"]) + self._check_legend_marker(ax[2], expected_markers=["x", "o", "o"]) + + def test_legend_name(self): + multi = DataFrame( + randn(4, 4), + columns=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], + ) + multi.columns.names = ["group", "individual"] + + ax = multi.plot() + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, "group,individual") + + df = DataFrame(randn(5, 5)) + ax = df.plot(legend=True, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, "group,individual") + + df.columns.name = "new" + ax = df.plot(legend=False, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, 
"group,individual") + + ax = df.plot(legend=True, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, "new") + + @pytest.mark.slow + def test_no_legend(self): + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) + + for kind in kinds: + + ax = df.plot(kind=kind, legend=False) + self._check_legend_labels(ax, visible=False) + + @pytest.mark.slow + def test_style_by_column(self): + import matplotlib.pyplot as plt + + fig = plt.gcf() + + df = DataFrame(randn(100, 3)) + for markers in [ + {0: "^", 1: "+", 2: "o"}, + {0: "^", 1: "+"}, + ["^", "+", "o"], + ["^", "+"], + ]: + fig.clf() + fig.add_subplot(111) + ax = df.plot(style=markers) + for i, l in enumerate(ax.get_lines()[: len(markers)]): + assert l.get_marker() == markers[i] + + @pytest.mark.slow + def test_line_label_none(self): + s = Series([1, 2]) + ax = s.plot() + assert ax.get_legend() is None + + ax = s.plot(legend=True) + assert ax.get_legend().get_texts()[0].get_text() == "None" + + @pytest.mark.slow + def test_line_colors(self): + from matplotlib import cm + + custom_colors = "rgcby" + df = DataFrame(randn(5, 5)) + + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + + tm.close() + + ax2 = df.plot(color=custom_colors) + lines2 = ax2.get_lines() + + for l1, l2 in zip(ax.get_lines(), lines2): + assert l1.get_color() == l2.get_color() + + tm.close() + + ax = df.plot(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + ax = df.loc[:, [0]].plot(color="DodgerBlue") + self._check_colors(ax.lines, linecolors=["DodgerBlue"]) + + ax = df.plot(color="red") + self._check_colors(ax.get_lines(), linecolors=["red"] * 5) + tm.close() + + # GH 10299 + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + with pytest.raises(ValueError): + # Color contains shorthand hex value results in ValueError + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] + # Forced show plot + _check_plot_works(df.plot, color=custom_colors) + + @pytest.mark.slow + def test_dont_modify_colors(self): + colors = ["r", "g", "b"] + pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) + assert len(colors) == 3 + + @pytest.mark.slow + def test_line_colors_and_styles_subplots(self): + # GH 9894 + from matplotlib import cm + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(randn(5, 5)) + + axes = df.plot(subplots=True) + for ax, c in zip(axes, list(default_colors)): + c = [c] + self._check_colors(ax.get_lines(), linecolors=c) + tm.close() + + # single color char + axes = df.plot(subplots=True, color="k") + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["k"]) + tm.close() + + # single color str + axes = df.plot(subplots=True, color="green") + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["green"]) + tm.close() + + custom_colors = "rgcby" + axes = df.plot(color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), 
linecolors=[c]) + tm.close() + + axes = df.plot(color=list(custom_colors), subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # GH 10299 + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] + axes = df.plot(color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + with pytest.raises(ValueError): + # Color contains shorthand hex value results in ValueError + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] + # Forced show plot + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, color=custom_colors, subplots=True) + + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + for cmap in ["jet", cm.jet]: + axes = df.plot(colormap=cmap, subplots=True) + for ax, c in zip(axes, rgba_colors): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) + + # single character style + axes = df.plot(style="r", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["r"]) + tm.close() + + # list of styles + styles = list("rgcby") + axes = df.plot(style=styles, subplots=True) + for ax, c in zip(axes, styles): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + @pytest.mark.slow + def test_area_colors(self): + from matplotlib import cm + from matplotlib.collections import PolyCollection + + custom_colors = "rgcby" + df = DataFrame(rand(5, 5)) + + ax = df.plot.area(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=custom_colors) + + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=custom_colors) + + for h in handles: + assert h.get_alpha() is None + tm.close() + + ax = df.plot.area(colormap="jet") + jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=jet_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=jet_colors) + + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=jet_colors) + for h in handles: + assert h.get_alpha() is None + tm.close() + + # When stacked=False, alpha is set to 0.5 + ax = df.plot.area(colormap=cm.jet, stacked=False) + self._check_colors(ax.get_lines(), linecolors=jet_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + jet_with_alpha = [(c[0], c[1], c[2], 0.5) for c in jet_colors] + self._check_colors(poly, facecolors=jet_with_alpha) + + handles, labels = ax.get_legend_handles_labels() + linecolors = jet_with_alpha + self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) + for h in handles: + assert h.get_alpha() == 0.5 + + @pytest.mark.slow + def test_hist_colors(self): + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(randn(5, 5)) + ax = df.plot.hist() + self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) + tm.close() + + custom_colors = "rgcby" + ax = 
df.plot.hist(color=custom_colors) + self._check_colors(ax.patches[::10], facecolors=custom_colors) + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + ax = df.plot.hist(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot.hist(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) + + ax = df.plot(kind="hist", color="green") + self._check_colors(ax.patches[::10], facecolors=["green"] * 5) + tm.close() + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_colors(self): + from matplotlib import cm + + custom_colors = "rgcby" + df = DataFrame(rand(5, 5)) + + ax = df.plot.kde(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + ax = df.plot.kde(colormap="jet") + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot.kde(colormap=cm.jet) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_colors_and_styles_subplots(self): + from matplotlib import cm + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(randn(5, 5)) + + axes = df.plot(kind="kde", subplots=True) + for ax, c in zip(axes, list(default_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # single color char + axes = df.plot(kind="kde", color="k", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["k"]) + tm.close() + + # single color str + axes = df.plot(kind="kde", color="red", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["red"]) + tm.close() + + custom_colors = "rgcby" + axes = df.plot(kind="kde", color=custom_colors, subplots=True) + for ax, c in zip(axes, list(custom_colors)): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + for cmap in ["jet", cm.jet]: + axes = df.plot(kind="kde", colormap=cmap, subplots=True) + for ax, c in zip(axes, rgba_colors): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) + + # single character style + axes = df.plot(kind="kde", style="r", subplots=True) + for ax in axes: + self._check_colors(ax.get_lines(), linecolors=["r"]) + tm.close() + + # list of styles + styles = list("rgcby") + axes = df.plot(kind="kde", style=styles, subplots=True) + for ax, c in zip(axes, styles): + self._check_colors(ax.get_lines(), linecolors=[c]) + tm.close() + + @pytest.mark.slow + def test_boxplot_colors(self): + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): + # TODO: outside this func? 
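+ # Helper: verify that each group of boxplot artists (boxes, whiskers,
+ # medians, fliers, caps) was drawn with the expected line color.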
+ if fliers_c is None: + fliers_c = "k" + self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) + self._check_colors( + bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) + ) + self._check_colors( + bp["medians"], linecolors=[medians_c] * len(bp["medians"]) + ) + self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) + self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) + + default_colors = self._unpack_cycler(self.plt.rcParams) + + df = DataFrame(randn(5, 5)) + bp = df.plot.box(return_type="dict") + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) + tm.close() + + dict_colors = dict( + boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" + ) + bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") + _check_colors( + bp, + dict_colors["boxes"], + dict_colors["whiskers"], + dict_colors["medians"], + dict_colors["caps"], + "r", + ) + tm.close() + + # partial colors + dict_colors = dict(whiskers="c", medians="m") + bp = df.plot.box(color=dict_colors, return_type="dict") + _check_colors(bp, default_colors[0], "c", "m") + tm.close() + + from matplotlib import cm + + # Test str -> colormap functionality + bp = df.plot.box(colormap="jet", return_type="dict") + jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # Test colormap functionality + bp = df.plot.box(colormap=cm.jet, return_type="dict") + _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) + tm.close() + + # string color is applied to all artists except fliers + bp = df.plot.box(color="DodgerBlue", return_type="dict") + _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") + + # tuple is also applied to all artists except fliers + bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") + + with pytest.raises(ValueError): + # Color contains invalid key results in ValueError + df.plot.box(color=dict(boxes="red", xxxx="blue")) + + def test_default_color_cycle(self): + import matplotlib.pyplot as plt + import cycler + + colors = list("rgbk") + plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) + + df = DataFrame(randn(5, 3)) + ax = df.plot() + + expected = self._unpack_cycler(plt.rcParams)[:3] + self._check_colors(ax.get_lines(), linecolors=expected) + + def test_unordered_ts(self): + df = DataFrame( + np.array([3.0, 2.0, 1.0]), + index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + columns=["test"], + ) + ax = df.plot() + xticks = ax.lines[0].get_xdata() + assert xticks[0] < xticks[1] + ydata = ax.lines[0].get_ydata() + tm.assert_numpy_array_equal(ydata, np.array([1.0, 2.0, 3.0])) + + @td.skip_if_no_scipy + def test_kind_both_ways(self): + df = DataFrame({"x": [1, 2, 3]}) + for kind in plotting.PlotAccessor._common_kinds: + + df.plot(kind=kind) + getattr(df.plot, kind)() + for kind in ["scatter", "hexbin"]: + df.plot("x", "x", kind=kind) + getattr(df.plot, kind)("x", "x") + + def test_all_invalid_plot_data(self): + df = DataFrame(list("abcd")) + for kind in plotting.PlotAccessor._common_kinds: + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + df.plot(kind=kind) + + @pytest.mark.slow + def test_partially_invalid_plot_data(self): + with tm.RNGContext(42): + df = DataFrame(randn(10, 2), dtype=object) + df[np.random.rand(df.shape[0]) > 0.5] = "a" + for kind in 
plotting.PlotAccessor._common_kinds: + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + df.plot(kind=kind) + + with tm.RNGContext(42): + # area plot doesn't support positive/negative mixed data + kinds = ["area"] + df = DataFrame(rand(10, 2), dtype=object) + df[np.random.rand(df.shape[0]) > 0.5] = "a" + for kind in kinds: + with pytest.raises(TypeError): + df.plot(kind=kind) + + def test_invalid_kind(self): + df = DataFrame(randn(10, 2)) + with pytest.raises(ValueError): + df.plot(kind="aasdf") + + @pytest.mark.parametrize( + "x,y,lbl", + [ + (["B", "C"], "A", "a"), + (["A"], ["B", "C"], ["b", "c"]), + ("A", ["B", "C"], "badlabel"), + ], + ) + def test_invalid_xy_args(self, x, y, lbl): + # GH 18671, 19699 allows y to be list-like but not x + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + with pytest.raises(ValueError): + df.plot(x=x, y=y, label=lbl) + + @pytest.mark.parametrize("x,y", [("A", "B"), (["A"], "B")]) + def test_invalid_xy_args_dup_cols(self, x, y): + # GH 18671, 19699 allows y to be list-like but not x + df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list("AAB")) + with pytest.raises(ValueError): + df.plot(x=x, y=y) + + @pytest.mark.parametrize( + "x,y,lbl,colors", + [ + ("A", ["B"], ["b"], ["red"]), + ("A", ["B", "C"], ["b", "c"], ["red", "blue"]), + (0, [1, 2], ["bokeh", "cython"], ["green", "yellow"]), + ], + ) + def test_y_listlike(self, x, y, lbl, colors): + # GH 19699: tests list-like y and verifies lbls & colors + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + _check_plot_works(df.plot, x="A", y=y, label=lbl) + + ax = df.plot(x=x, y=y, label=lbl, color=colors) + assert len(ax.lines) == len(y) + self._check_colors(ax.get_lines(), linecolors=colors) + + @pytest.mark.parametrize("x,y,colnames", [(0, 1, ["A", "B"]), (1, 0, [0, 1])]) + def test_xy_args_integer(self, x, y, colnames): + # GH 20056: tests integer args for xy and checks col names + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + df.columns = colnames + _check_plot_works(df.plot, x=x, y=y) + + @pytest.mark.slow + def test_hexbin_basic(self): + df = self.hexbin_df + + ax = df.plot.hexbin(x="A", y="B", gridsize=10) + # TODO: need better way to test. This just does existence. 
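+ # hexbin adds a single PolyCollection to the axes, so only its presence
+ # is asserted here.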
+ assert len(ax.collections) == 1 + + # GH 6951 + axes = df.plot.hexbin(x="A", y="B", subplots=True) + # hexbin should have 2 axes in the figure, 1 for plotting and another + # is colorbar + assert len(axes[0].figure.axes) == 2 + # return value is single axes + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @pytest.mark.slow + def test_hexbin_with_c(self): + df = self.hexbin_df + + ax = df.plot.hexbin(x="A", y="B", C="C") + assert len(ax.collections) == 1 + + ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std) + assert len(ax.collections) == 1 + + @pytest.mark.slow + def test_hexbin_cmap(self): + df = self.hexbin_df + + # Default to BuGn + ax = df.plot.hexbin(x="A", y="B") + assert ax.collections[0].cmap.name == "BuGn" + + cm = "cubehelix" + ax = df.plot.hexbin(x="A", y="B", colormap=cm) + assert ax.collections[0].cmap.name == cm + + @pytest.mark.slow + def test_no_color_bar(self): + df = self.hexbin_df + + ax = df.plot.hexbin(x="A", y="B", colorbar=None) + assert ax.collections[0].colorbar is None + + @pytest.mark.slow + def test_allow_cmap(self): + df = self.hexbin_df + + ax = df.plot.hexbin(x="A", y="B", cmap="YlGn") + assert ax.collections[0].cmap.name == "YlGn" + + with pytest.raises(TypeError): + df.plot.hexbin(x="A", y="B", cmap="YlGn", colormap="BuGn") + + @pytest.mark.slow + def test_pie_df(self): + df = DataFrame( + np.random.rand(5, 3), + columns=["X", "Y", "Z"], + index=["a", "b", "c", "d", "e"], + ) + with pytest.raises(ValueError): + df.plot.pie() + + ax = _check_plot_works(df.plot.pie, y="Y") + self._check_text_labels(ax.texts, df.index) + + ax = _check_plot_works(df.plot.pie, y=2) + self._check_text_labels(ax.texts, df.index) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.pie, subplots=True) + assert len(axes) == len(df.columns) + for ax in axes: + self._check_text_labels(ax.texts, df.index) + for ax, ylabel in zip(axes, df.columns): + assert ax.get_ylabel() == ylabel + + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.plot.pie, subplots=True, labels=labels, colors=color_args + ) + assert len(axes) == len(df.columns) + + for ax in axes: + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + def test_pie_df_nan(self): + df = DataFrame(np.random.rand(4, 4)) + for i in range(4): + df.iloc[i, i] = np.nan + fig, axes = self.plt.subplots(ncols=4) + df.plot.pie(subplots=True, ax=axes, legend=True) + + base_expected = ["0", "1", "2", "3"] + for i, ax in enumerate(axes): + expected = list(base_expected) # force copy + expected[i] = "" + result = [x.get_text() for x in ax.texts] + assert result == expected + # legend labels + # NaN's not included in legend with subplots + # see https://github.com/pandas-dev/pandas/issues/8390 + assert [x.get_text() for x in ax.get_legend().get_texts()] == base_expected[ + :i + ] + base_expected[i + 1 :] + + @pytest.mark.slow + def test_errorbar_plot(self): + with warnings.catch_warnings(): + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + df = DataFrame(d) + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} + df_err = DataFrame(d_err) + + # check line plots + ax = _check_plot_works(df.plot, yerr=df_err, logy=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True) + 
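+ # df_err supplies y errors for both columns, so two y error-bar containers
+ # (and no x error bars) are expected even with log-scaled axes.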
self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + kinds = ["line", "bar", "barh"] + for kind in kinds: + ax = _check_plot_works(df.plot, yerr=df_err["x"], kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works( + df.plot, yerr=df_err["x"], xerr=df_err["x"], kind=kind + ) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + axes = _check_plot_works( + df.plot, yerr=df_err, xerr=df_err, subplots=True, kind=kind + ) + self._check_has_errorbars(axes, xerr=1, yerr=1) + + ax = _check_plot_works( + (df + 1).plot, yerr=df_err, xerr=df_err, kind="bar", log=True + ) + self._check_has_errorbars(ax, xerr=2, yerr=2) + + # yerr is raw error values + ax = _check_plot_works(df["y"].plot, yerr=np.ones(12) * 0.4) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + # yerr is column name + for yerr in ["yerr", "誤差"]: + s_df = df.copy() + s_df[yerr] = np.ones(12) * 0.2 + ax = _check_plot_works(s_df.plot, yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(s_df.plot, y="y", x="x", yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + with pytest.raises(ValueError): + df.plot(yerr=np.random.randn(11)) + + df_err = DataFrame({"x": ["zzz"] * 12, "y": ["zzz"] * 12}) + with pytest.raises((ValueError, TypeError)): + df.plot(yerr=df_err) + + @pytest.mark.xfail(reason="Iterator is consumed", raises=ValueError) + @pytest.mark.slow + def test_errorbar_plot_iterator(self): + with warnings.catch_warnings(): + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + df = DataFrame(d) + + # yerr is iterator + ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + @pytest.mark.slow + def test_errorbar_with_integer_column_names(self): + # test with integer column names + df = DataFrame(np.random.randn(10, 2)) + df_err = DataFrame(np.random.randn(10, 2)) + ax = _check_plot_works(df.plot, yerr=df_err) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, y=0, yerr=1) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + @pytest.mark.slow + def test_errorbar_with_partial_columns(self): + df = DataFrame(np.random.randn(10, 3)) + df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) + kinds = ["line", "bar"] + for kind in kinds: + ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + ix = date_range("1/1/2000", periods=10, freq="M") + df.set_index(ix, inplace=True) + df_err.set_index(ix, inplace=True) + ax = _check_plot_works(df.plot, yerr=df_err, kind="line") + self._check_has_errorbars(ax, xerr=0, yerr=2) + + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + df = DataFrame(d) + d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4} + df_err = DataFrame(d_err) + for err in [d_err, df_err]: + ax = _check_plot_works(df.plot, yerr=err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + @pytest.mark.slow + 
def test_errorbar_timeseries(self): + + with warnings.catch_warnings(): + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} + + # check time-series plots + ix = date_range("1/1/2000", "1/1/2001", freq="M") + tdf = DataFrame(d, index=ix) + tdf_err = DataFrame(d_err, index=ix) + + kinds = ["line", "bar", "barh"] + for kind in kinds: + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, y="y", yerr=tdf_err["x"], kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, y="y", yerr="x", kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + axes = _check_plot_works( + tdf.plot, kind=kind, yerr=tdf_err, subplots=True + ) + self._check_has_errorbars(axes, xerr=0, yerr=1) + + def test_errorbar_asymmetrical(self): + + np.random.seed(0) + err = np.random.rand(3, 2, 5) + + # each column is [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]... + df = DataFrame(np.arange(15).reshape(3, 5)).T + + ax = df.plot(yerr=err, xerr=err / 2) + + yerr_0_0 = ax.collections[1].get_paths()[0].vertices[:, 1] + expected_0_0 = err[0, :, 0] * np.array([-1, 1]) + tm.assert_almost_equal(yerr_0_0, expected_0_0) + + with pytest.raises(ValueError): + df.plot(yerr=err.T) + + tm.close() + + def test_table(self): + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) + _check_plot_works(df.plot, table=True) + _check_plot_works(df.plot, table=df) + + ax = df.plot() + assert len(ax.tables) == 0 + plotting.table(ax, df.T) + assert len(ax.tables) == 1 + + def test_errorbar_scatter(self): + df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) + df_err = DataFrame( + np.random.randn(5, 2) / 5, index=range(5), columns=["x", "y"] + ) + + ax = _check_plot_works(df.plot.scatter, x="x", y="y") + self._check_has_errorbars(ax, xerr=0, yerr=0) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + + ax = _check_plot_works(df.plot.scatter, x="x", y="y", yerr=df_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err, yerr=df_err) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + def _check_errorbar_color(containers, expected, has_err="has_xerr"): + lines = [] + errs = [c.lines for c in ax.containers if getattr(c, has_err, False)][0] + for el in errs: + if is_list_like(el): + lines.extend(el) + else: + lines.append(el) + err_lines = [x for x in lines if x in ax.collections] + self._check_colors( + err_lines, linecolors=np.array([expected] * len(err_lines)) + ) + + # GH 8081 + df = DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"]) + ax = df.plot.scatter(x="a", y="b", xerr="d", yerr="e", c="red") + self._check_has_errorbars(ax, xerr=1, yerr=1) + _check_errorbar_color(ax.containers, "red", has_err="has_xerr") + _check_errorbar_color(ax.containers, "red", has_err="has_yerr") + + ax = df.plot.scatter(x="a", y="b", yerr="e", color="green") + self._check_has_errorbars(ax, xerr=0, yerr=1) + _check_errorbar_color(ax.containers, "green", has_err="has_yerr") + + @pytest.mark.slow + def test_sharex_and_ax(self): + # 
https://github.com/pandas-dev/pandas/issues/9737 using gridspec, + # the axis in fig.get_axis() are sorted differently than pandas + # expected them, so make sure that only the right ones are removed + import matplotlib.pyplot as plt + + plt.close("all") + gs, axes = _generate_4_axes_via_gridspec() + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) + + def _check(axes): + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + for ax in [axes[0], axes[2]]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + for ax in [axes[1], axes[3]]: + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + + for ax in axes: + df.plot(x="a", y="b", title="title", ax=ax, sharex=True) + gs.tight_layout(plt.gcf()) + _check(axes) + tm.close() + + gs, axes = _generate_4_axes_via_gridspec() + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=axes, sharex=True) + _check(axes) + tm.close() + + gs, axes = _generate_4_axes_via_gridspec() + # without sharex, no labels should be touched! + for ax in axes: + df.plot(x="a", y="b", title="title", ax=ax) + + gs.tight_layout(plt.gcf()) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + @pytest.mark.slow + def test_sharey_and_ax(self): + # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, + # the axis in fig.get_axis() are sorted differently than pandas + # expected them, so make sure that only the right ones are removed + import matplotlib.pyplot as plt + + gs, axes = _generate_4_axes_via_gridspec() + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) + + def _check(axes): + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + for ax in [axes[0], axes[1]]: + self._check_visible(ax.get_yticklabels(), visible=True) + for ax in [axes[2], axes[3]]: + self._check_visible(ax.get_yticklabels(), visible=False) + + for ax in axes: + df.plot(x="a", y="b", title="title", ax=ax, sharey=True) + gs.tight_layout(plt.gcf()) + _check(axes) + tm.close() + + gs, axes = _generate_4_axes_via_gridspec() + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=axes, sharey=True) + + gs.tight_layout(plt.gcf()) + _check(axes) + tm.close() + + gs, axes = _generate_4_axes_via_gridspec() + # without sharex, no labels should be touched! + for ax in axes: + df.plot(x="a", y="b", title="title", ax=ax) + + gs.tight_layout(plt.gcf()) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + + @td.skip_if_no_scipy + def test_memory_leak(self): + """ Check that every plot type gets properly collected. 
""" + import weakref + import gc + + results = {} + for kind in plotting.PlotAccessor._all_kinds: + + args = {} + if kind in ["hexbin", "scatter", "pie"]: + df = self.hexbin_df + args = {"x": "A", "y": "B"} + elif kind == "area": + df = self.tdf.abs() + else: + df = self.tdf + + # Use a weakref so we can see if the object gets collected without + # also preventing it from being collected + results[kind] = weakref.proxy(df.plot(kind=kind, **args)) + + # have matplotlib delete all the figures + tm.close() + # force a garbage collection + gc.collect() + for key in results: + # check that every plot was collected + with pytest.raises(ReferenceError): + # need to actually access something to get an error + results[key].lines + + @pytest.mark.slow + def test_df_subplots_patterns_minorticks(self): + # GH 10657 + import matplotlib.pyplot as plt + + df = DataFrame( + np.random.randn(10, 2), + index=date_range("1/1/2000", periods=10), + columns=list("AB"), + ) + + # shared subplots + fig, axes = plt.subplots(2, 1, sharex=True) + axes = df.plot(subplots=True, ax=axes) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + # xaxis of 1st ax must be hidden + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) + tm.close() + + fig, axes = plt.subplots(2, 1) + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=axes, sharex=True) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + # xaxis of 1st ax must be hidden + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) + tm.close() + + # not shared + fig, axes = plt.subplots(2, 1) + axes = df.plot(subplots=True, ax=axes) + for ax in axes: + assert len(ax.lines) == 1 + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + @pytest.mark.slow + def test_df_gridspec_patterns(self): + # GH 10819 + import matplotlib.pyplot as plt + import matplotlib.gridspec as gridspec + + ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) + + df = DataFrame(np.random.randn(10, 2), index=ts.index, columns=list("AB")) + + def _get_vertical_grid(): + gs = gridspec.GridSpec(3, 1) + fig = plt.figure() + ax1 = fig.add_subplot(gs[:2, :]) + ax2 = fig.add_subplot(gs[2, :]) + return ax1, ax2 + + def _get_horizontal_grid(): + gs = gridspec.GridSpec(1, 3) + fig = plt.figure() + ax1 = fig.add_subplot(gs[:, :2]) + ax2 = fig.add_subplot(gs[:, 2]) + return ax1, ax2 + + for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]: + ax1 = ts.plot(ax=ax1) + assert len(ax1.lines) == 1 + ax2 = df.plot(ax=ax2) + assert len(ax2.lines) == 2 + for ax in [ax1, ax2]: + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + # subplots=True + for ax1, ax2 in [_get_vertical_grid(), _get_horizontal_grid()]: + axes = 
df.plot(subplots=True, ax=[ax1, ax2]) + assert len(ax1.lines) == 1 + assert len(ax2.lines) == 1 + for ax in axes: + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + # vertical / subplots / sharex=True / sharey=True + ax1, ax2 = _get_vertical_grid() + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) + assert len(axes[0].lines) == 1 + assert len(axes[1].lines) == 1 + for ax in [ax1, ax2]: + # yaxis are visible because there is only one column + self._check_visible(ax.get_yticklabels(), visible=True) + # xaxis of axes0 (top) are hidden + self._check_visible(axes[0].get_xticklabels(), visible=False) + self._check_visible(axes[0].get_xticklabels(minor=True), visible=False) + self._check_visible(axes[1].get_xticklabels(), visible=True) + self._check_visible(axes[1].get_xticklabels(minor=True), visible=True) + tm.close() + + # horizontal / subplots / sharex=True / sharey=True + ax1, ax2 = _get_horizontal_grid() + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) + assert len(axes[0].lines) == 1 + assert len(axes[1].lines) == 1 + self._check_visible(axes[0].get_yticklabels(), visible=True) + # yaxis of axes1 (right) are hidden + self._check_visible(axes[1].get_yticklabels(), visible=False) + for ax in [ax1, ax2]: + # xaxis are visible because there is only one column + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + # boxed + def _get_boxed_grid(): + gs = gridspec.GridSpec(3, 3) + fig = plt.figure() + ax1 = fig.add_subplot(gs[:2, :2]) + ax2 = fig.add_subplot(gs[:2, 2]) + ax3 = fig.add_subplot(gs[2, :2]) + ax4 = fig.add_subplot(gs[2, 2]) + return ax1, ax2, ax3, ax4 + + axes = _get_boxed_grid() + df = DataFrame(np.random.randn(10, 4), index=ts.index, columns=list("ABCD")) + axes = df.plot(subplots=True, ax=axes) + for ax in axes: + assert len(ax.lines) == 1 + # axis are visible because these are not shared + self._check_visible(ax.get_yticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + # subplots / sharex=True / sharey=True + axes = _get_boxed_grid() + with tm.assert_produces_warning(UserWarning): + axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True) + for ax in axes: + assert len(ax.lines) == 1 + for ax in [axes[0], axes[2]]: # left column + self._check_visible(ax.get_yticklabels(), visible=True) + for ax in [axes[1], axes[3]]: # right column + self._check_visible(ax.get_yticklabels(), visible=False) + for ax in [axes[0], axes[1]]: # top row + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) + for ax in [axes[2], axes[3]]: # bottom row + self._check_visible(ax.get_xticklabels(), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) + tm.close() + + @pytest.mark.slow + def test_df_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings( + DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}), + plotting.PlotAccessor._dataframe_kinds, + kws={"x": "a", "y": "b"}, + ) + + def test_invalid_colormap(self): + df = DataFrame(randn(3, 2), 
columns=["A", "B"]) + + with pytest.raises(ValueError): + df.plot(colormap="invalid_colormap") + + def test_plain_axes(self): + + # supplied ax itself is a SubplotAxes, but figure contains also + # a plain Axes object (GH11556) + fig, ax = self.plt.subplots() + fig.add_axes([0.2, 0.2, 0.2, 0.2]) + Series(rand(10)).plot(ax=ax) + + # supplied ax itself is a plain Axes, but because the cmap keyword + # a new ax is created for the colorbar -> also multiples axes (GH11520) + df = DataFrame({"a": randn(8), "b": randn(8)}) + fig = self.plt.figure() + ax = fig.add_axes((0, 0, 1, 1)) + df.plot(kind="scatter", ax=ax, x="a", y="b", c="a", cmap="hsv") + + # other examples + fig, ax = self.plt.subplots() + from mpl_toolkits.axes_grid1 import make_axes_locatable + + divider = make_axes_locatable(ax) + cax = divider.append_axes("right", size="5%", pad=0.05) + Series(rand(10)).plot(ax=ax) + Series(rand(10)).plot(ax=cax) + + fig, ax = self.plt.subplots() + from mpl_toolkits.axes_grid1.inset_locator import inset_axes + + iax = inset_axes(ax, width="30%", height=1.0, loc=3) + Series(rand(10)).plot(ax=ax) + Series(rand(10)).plot(ax=iax) + + def test_passed_bar_colors(self): + import matplotlib as mpl + + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] + colormap = mpl.colors.ListedColormap(color_tuples) + barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) + assert color_tuples == [c.get_facecolor() for c in barplot.patches] + + def test_rcParams_bar_colors(self): + import matplotlib as mpl + + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] + with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): + barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") + assert color_tuples == [c.get_facecolor() for c in barplot.patches] + + @pytest.mark.parametrize("method", ["line", "barh", "bar"]) + def test_secondary_axis_font_size(self, method): + # GH: 12565 + df = ( + pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) + .assign(C=lambda df: df.B.cumsum()) + .assign(D=lambda df: df.C * 1.1) + ) + + fontsize = 20 + sy = ["C", "D"] + + kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True) + ax = getattr(df.plot, method)(**kwargs) + self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) + + @pytest.mark.slow + def test_x_string_values_ticks(self): + # Test if string plot index have a fixed xtick position + # GH: 7612, GH: 22334 + df = pd.DataFrame( + { + "sales": [3, 2, 3], + "visits": [20, 42, 28], + "day": ["Monday", "Tuesday", "Wednesday"], + } + ) + ax = df.plot.area(x="day") + ax.set_xlim(-1, 3) + xticklabels = [t.get_text() for t in ax.get_xticklabels()] + labels_position = dict(zip(xticklabels, ax.get_xticks())) + # Testing if the label stayed at the right position + assert labels_position["Monday"] == 0.0 + assert labels_position["Tuesday"] == 1.0 + assert labels_position["Wednesday"] == 2.0 + + @pytest.mark.slow + def test_x_multiindex_values_ticks(self): + # Test if multiindex plot index have a fixed xtick position + # GH: 15912 + index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) + df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) + ax = df.plot() + ax.set_xlim(-1, 4) + xticklabels = [t.get_text() for t in ax.get_xticklabels()] + labels_position = dict(zip(xticklabels, ax.get_xticks())) + # Testing if the label stayed at the right position + assert labels_position["(2012, 1)"] == 0.0 + assert labels_position["(2012, 2)"] == 1.0 + assert labels_position["(2013, 1)"] == 2.0 + assert 
labels_position["(2013, 2)"] == 3.0 + + @pytest.mark.parametrize("kind", ["line", "area"]) + def test_xlim_plot_line(self, kind): + # test if xlim is set correctly in plot.line and plot.area + # GH 27686 + df = pd.DataFrame([2, 4], index=[1, 2]) + ax = df.plot(kind=kind) + xlims = ax.get_xlim() + assert xlims[0] < 1 + assert xlims[1] > 2 + + def test_xlim_plot_line_correctly_in_mixed_plot_type(self): + # test if xlim is set correctly when ax contains multiple different kinds + # of plots, GH 27686 + fig, ax = self.plt.subplots() + + indexes = ["k1", "k2", "k3", "k4"] + df = pd.DataFrame( + { + "s1": [1000, 2000, 1500, 2000], + "s2": [900, 1400, 2000, 3000], + "s3": [1500, 1500, 1600, 1200], + "secondary_y": [1, 3, 4, 3], + }, + index=indexes, + ) + df[["s1", "s2", "s3"]].plot.bar(ax=ax, stacked=False) + df[["secondary_y"]].plot(ax=ax, secondary_y=True) + + xlims = ax.get_xlim() + assert xlims[0] < 0 + assert xlims[1] > 3 + + # make sure axis labels are plotted correctly as well + xticklabels = [t.get_text() for t in ax.get_xticklabels()] + assert xticklabels == indexes + + def test_subplots_sharex_false(self): + # test when sharex is set to False, two plots should have different + # labels, GH 25160 + df = pd.DataFrame(np.random.rand(10, 2)) + df.iloc[5:, 1] = np.nan + df.iloc[:5, 0] = np.nan + + figs, axs = self.plt.subplots(2, 1) + df.plot.line(ax=axs, subplots=True, sharex=False) + + expected_ax1 = np.arange(4.5, 10, 0.5) + expected_ax2 = np.arange(-0.5, 5, 0.5) + + tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) + tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) + + def test_plot_no_rows(self): + # GH 27758 + df = pd.DataFrame(columns=["foo"], dtype=int) + assert df.empty + ax = df.plot() + assert len(ax.get_lines()) == 1 + line = ax.get_lines()[0] + assert len(line.get_xdata()) == 0 + assert len(line.get_ydata()) == 0 + + def test_plot_no_numeric_data(self): + df = pd.DataFrame(["a", "b", "c"]) + with pytest.raises(TypeError): + df.plot() + + def test_missing_markers_legend(self): + # 14958 + df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) + ax = df.plot(y=["A"], marker="x", linestyle="solid") + df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) + df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=["x", "o", "<"]) + + def test_missing_markers_legend_using_style(self): + # 14563 + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6], + "B": [2, 4, 1, 3, 2, 4], + "C": [3, 3, 2, 6, 4, 2], + "X": [1, 2, 3, 4, 5, 6], + } + ) + + fig, ax = self.plt.subplots() + for kind in "ABC": + df.plot("X", kind, label=kind, ax=ax, style=".") + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + + +def _generate_4_axes_via_gridspec(): + import matplotlib.pyplot as plt + import matplotlib as mpl + import matplotlib.gridspec # noqa + + gs = mpl.gridspec.GridSpec(2, 2) + ax_tl = plt.subplot(gs[0, 0]) + ax_ll = plt.subplot(gs[1, 0]) + ax_tr = plt.subplot(gs[0, 1]) + ax_lr = plt.subplot(gs[1, 1]) + + return gs, [ax_tl, ax_ll, ax_tr, ax_lr] diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_groupby.py b/venv/Lib/site-packages/pandas/tests/plotting/test_groupby.py new file mode 100644 index 0000000..8fec4bb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_groupby.py @@ -0,0 +1,69 @@ +# coding: utf-8 + +""" Test cases for GroupBy.plot """ + + +import 
numpy as np + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase + + +@td.skip_if_no_mpl +class TestDataFrameGroupByPlots(TestPlotBase): + def test_series_groupby_plotting_nominally_works(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(["male", "female"], size=n) + + weight.groupby(gender).plot() + tm.close() + height.groupby(gender).hist() + tm.close() + # Regression test for GH8733 + height.groupby(gender).plot(alpha=0.5) + tm.close() + + def test_plotting_with_float_index_works(self): + # GH 7025 + df = DataFrame( + {"def": [1, 1, 1, 2, 2, 2, 3, 3, 3], "val": np.random.randn(9)}, + index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0], + ) + + df.groupby("def")["val"].plot() + tm.close() + df.groupby("def")["val"].apply(lambda x: x.plot()) + tm.close() + + def test_hist_single_row(self): + # GH10214 + bins = np.arange(80, 100 + 2, 1) + df = DataFrame({"Name": ["AAA", "BBB"], "ByCol": [1, 2], "Mark": [85, 89]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + + def test_plot_submethod_works(self): + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) + df.groupby("z").plot.scatter("x", "y") + tm.close() + df.groupby("z")["x"].plot.line() + tm.close() + + def test_plot_kwargs(self): + + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) + + res = df.groupby("z").plot(kind="scatter", x="x", y="y") + # check that a scatter plot is effectively plotted: the axes should + # contain a PathCollection from the scatter plot (GH11805) + assert len(res["a"].collections) == 1 + + res = df.groupby("z").plot.scatter(x="x", y="y") + assert len(res["a"].collections) == 1 diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_hist_method.py b/venv/Lib/site-packages/pandas/tests/plotting/test_hist_method.py new file mode 100644 index 0000000..50ebbc2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_hist_method.py @@ -0,0 +1,464 @@ +# coding: utf-8 + +""" Test cases for .hist method """ + +import numpy as np +from numpy.random import randn +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + + +@td.skip_if_no_mpl +class TestSeriesPlots(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = "ts" + + @pytest.mark.slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with pytest.raises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @pytest.mark.slow + def test_hist_bins_legacy(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + assert len(ax.patches) == 2 + + @pytest.mark.slow + def test_hist_layout(self): + df = self.hist_df + with pytest.raises(ValueError): + df.height.hist(layout=(1, 1)) + + with pytest.raises(ValueError): + df.height.hist(layout=[1, 1]) + + @pytest.mark.slow + def test_hist_layout_with_by(self): + df = self.hist_df + + # _check_plot_works adds an `ax` kwarg to the method call + # so we get a warning about an axis being cleared, even + # though we don't explicing pass one, see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + + @pytest.mark.slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + + x = Series(randn(2)) + y = Series(randn(2)) + subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.axes + assert len(axes) == 2 + + @pytest.mark.slow + def test_hist_by_no_extra_plots(self): + df = self.hist_df + axes = df.height.hist(by=df.gender) # noqa + assert len(self.plt.get_fignums()) == 1 + + @pytest.mark.slow + def test_plot_fails_when_ax_differs_from_figure(self): + from pylab import figure + + fig1 = figure() + fig2 = figure() + ax1 = fig1.add_subplot(111) + with pytest.raises(AssertionError): + self.ts.hist(ax=ax1, figure=fig2) + + +@td.skip_if_no_mpl +class TestDataFramePlots(TestPlotBase): + @pytest.mark.slow + def test_hist_df_legacy(self): + from matplotlib.patches import Rectangle + + with 
tm.assert_produces_warning(UserWarning): + _check_plot_works(self.hist_df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 3)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, grid=False) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + assert not axes[1, 1].get_visible() + + df = DataFrame(randn(100, 1)) + _check_plot_works(df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 6)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, layout=(4, 2)) + self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) + + # make sure sharex, sharey is handled + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, sharex=True, sharey=True) + + # handle figsize arg + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, figsize=(8, 10)) + + # check bins argument + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, bins=5) + + # make sure xlabelsize and xrot are handled + ser = df[0] + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) + + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) + + tm.close() + + ax = ser.hist(cumulative=True, bins=4, density=True) + # height of last bin (index 5) must be 1.0 + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + tm.assert_almost_equal(rects[-1].get_height(), 1.0) + + tm.close() + ax = ser.hist(log=True) + # scale of y must be 'log' + self._check_ax_scales(ax, yaxis="log") + + tm.close() + + # propagate attr exception from matplotlib.Axes.hist + with pytest.raises(AttributeError): + ser.hist(foo="bar") + + @pytest.mark.slow + def test_hist_non_numerical_raises(self): + # gh-10444 + df = DataFrame(np.random.rand(10, 2)) + df_o = df.astype(np.object) + + msg = "hist method requires numerical columns, nothing to plot." 
+ with pytest.raises(ValueError, match=msg): + df_o.hist() + + @pytest.mark.slow + def test_hist_layout(self): + df = DataFrame(randn(100, 3)) + + layout_to_expected_size = ( + {"layout": None, "expected_size": (2, 2)}, # default is 2x2 + {"layout": (2, 2), "expected_size": (2, 2)}, + {"layout": (4, 1), "expected_size": (4, 1)}, + {"layout": (1, 4), "expected_size": (1, 4)}, + {"layout": (3, 3), "expected_size": (3, 3)}, + {"layout": (-1, 4), "expected_size": (1, 4)}, + {"layout": (4, -1), "expected_size": (4, 1)}, + {"layout": (-1, 2), "expected_size": (2, 2)}, + {"layout": (2, -1), "expected_size": (2, 2)}, + ) + + for layout_test in layout_to_expected_size: + axes = df.hist(layout=layout_test["layout"]) + expected = layout_test["expected_size"] + self._check_axes_shape(axes, axes_num=3, layout=expected) + + # layout too small for all 4 plots + with pytest.raises(ValueError): + df.hist(layout=(1, 1)) + + # invalid format for layout + with pytest.raises(ValueError): + df.hist(layout=(1,)) + with pytest.raises(ValueError): + df.hist(layout=(-1, -1)) + + @pytest.mark.slow + # GH 9351 + def test_tight_layout(self): + df = DataFrame(randn(100, 3)) + _check_plot_works(df.hist) + self.plt.tight_layout() + + tm.close() + + def test_hist_subplot_xrot(self): + # GH 30288 + df = DataFrame( + { + "length": [1.5, 0.5, 1.2, 0.9, 3], + "animal": ["pig", "rabbit", "pig", "pig", "rabbit"], + } + ) + axes = _check_plot_works( + df.hist, + filterwarnings="always", + column="length", + by="animal", + bins=5, + xrot=0, + ) + self._check_ticks_props(axes, xrot=0) + + +@td.skip_if_no_mpl +class TestDataFrameGroupByPlots(TestPlotBase): + @pytest.mark.slow + def test_grouped_hist_legacy(self): + from matplotlib.patches import Rectangle + from pandas.plotting._matplotlib.hist import _grouped_hist + + df = DataFrame(randn(500, 2), columns=["A", "B"]) + df["C"] = np.random.randint(0, 4, 500) + df["D"] = ["X"] * 500 + + axes = _grouped_hist(df.A, by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + axes = df.hist(by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + # group by a key with single value + axes = df.hist(by="D", rot=30) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self._check_ticks_props(axes, xrot=30) + + tm.close() + # make sure kwargs to hist are handled + xf, yf = 20, 18 + xrot, yrot = 30, 40 + + axes = _grouped_hist( + df.A, + by=df.C, + cumulative=True, + bins=4, + xlabelsize=xf, + xrot=xrot, + ylabelsize=yf, + yrot=yrot, + density=True, + ) + # height of last bin (index 5) must be 1.0 + for ax in axes.ravel(): + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + height = rects[-1].get_height() + tm.assert_almost_equal(height, 1.0) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) + + tm.close() + axes = _grouped_hist(df.A, by=df.C, log=True) + # scale of y must be 'log' + self._check_ax_scales(axes, yaxis="log") + + tm.close() + # propagate attr exception from matplotlib.Axes.hist + with pytest.raises(AttributeError): + _grouped_hist(df.A, by=df.C, foo="bar") + + msg = "Specify figure size by tuple instead" + with pytest.raises(ValueError, match=msg): + df.hist(by="C", figsize="default") + + @pytest.mark.slow + def test_grouped_hist_legacy2(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender_int = np.random.choice([0, 1], size=n) + df_int = DataFrame({"height": 
height, "weight": weight, "gender": gender_int}) + gb = df_int.groupby("gender") + axes = gb.hist() + assert len(axes) == 2 + assert len(self.plt.get_fignums()) == 2 + tm.close() + + @pytest.mark.slow + def test_grouped_hist_layout(self): + df = self.hist_df + msg = "Layout of 1x1 must be larger than required size 2" + with pytest.raises(ValueError, match=msg): + df.hist(column="weight", by=df.gender, layout=(1, 1)) + + msg = "Layout of 1x3 must be larger than required size 4" + with pytest.raises(ValueError, match=msg): + df.hist(column="height", by=df.category, layout=(1, 3)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + df.hist(column="height", by=df.category, layout=(-1, -1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, 1) + ) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, -1) + ) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + axes = df.hist(column="height", by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column="height", by=df.category, layout=(-1, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column="height", by=df.category, layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + tm.close() + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column="height", by="classroom", layout=(2, 2) + ) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + # without column + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, by="classroom") + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.hist(by="gender", layout=(3, 5)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) + + axes = df.hist(column=["height", "weight", "category"]) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + @pytest.mark.slow + def test_grouped_hist_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + fig, axes = self.plt.subplots(2, 3) + returned = df.hist(column=["height", "weight", "category"], ax=axes[0]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + tm.assert_numpy_array_equal(returned, axes[0]) + assert returned[0].figure is fig + returned = df.hist(by="classroom", ax=axes[1]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + tm.assert_numpy_array_equal(returned, axes[1]) + assert returned[0].figure is fig + + with pytest.raises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + axes = df.hist(column="height", ax=axes) + + @pytest.mark.slow + def test_axis_share_x(self): + df = self.hist_df + # GH4089 + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True) + + # share x + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + + # don't share y + assert not ax1._shared_y_axes.joined(ax1, ax2) + assert not ax2._shared_y_axes.joined(ax1, ax2) + + @pytest.mark.slow + def test_axis_share_y(self): + df = self.hist_df + ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True) + + # share y + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) + + # don't share x + assert not 
ax1._shared_x_axes.joined(ax1, ax2) + assert not ax2._shared_x_axes.joined(ax1, ax2) + + @pytest.mark.slow + def test_axis_share_xy(self): + df = self.hist_df + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True) + + # share both x and y + assert ax1._shared_x_axes.joined(ax1, ax2) + assert ax2._shared_x_axes.joined(ax1, ax2) + + assert ax1._shared_y_axes.joined(ax1, ax2) + assert ax2._shared_y_axes.joined(ax1, ax2) diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_misc.py b/venv/Lib/site-packages/pandas/tests/plotting/test_misc.py new file mode 100644 index 0000000..c8aa1f2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_misc.py @@ -0,0 +1,408 @@ +# coding: utf-8 + +""" Test cases for misc plot functions """ + +import numpy as np +from numpy import random +from numpy.random import randn +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + +import pandas.plotting as plotting + + +@td.skip_if_mpl +def test_import_error_message(): + # GH-19810 + df = DataFrame({"A": [1, 2]}) + + with pytest.raises(ImportError, match="matplotlib is required for plotting"): + df.plot() + + +def test_get_accessor_args(): + func = plotting._core.PlotAccessor._get_call_args + + msg = "Called plot accessor for type list, expected Series or DataFrame" + with pytest.raises(TypeError, match=msg): + func(backend_name="", data=[], args=[], kwargs={}) + + msg = "should not be called with positional arguments" + with pytest.raises(TypeError, match=msg): + func(backend_name="", data=Series(dtype=object), args=["line", None], kwargs={}) + + x, y, kind, kwargs = func( + backend_name="", + data=DataFrame(), + args=["x"], + kwargs={"y": "y", "kind": "bar", "grid": False}, + ) + assert x == "x" + assert y == "y" + assert kind == "bar" + assert kwargs == {"grid": False} + + x, y, kind, kwargs = func( + backend_name="pandas.plotting._matplotlib", + data=Series(dtype=object), + args=[], + kwargs={}, + ) + assert x is None + assert y is None + assert kind == "line" + assert len(kwargs) == 22 + + +@td.skip_if_no_mpl +class TestSeriesPlots(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = "ts" + + @pytest.mark.slow + def test_autocorrelation_plot(self): + from pandas.plotting import autocorrelation_plot + + _check_plot_works(autocorrelation_plot, series=self.ts) + _check_plot_works(autocorrelation_plot, series=self.ts.values) + + ax = autocorrelation_plot(self.ts, label="Test") + self._check_legend_labels(ax, labels=["Test"]) + + @pytest.mark.slow + def test_lag_plot(self): + from pandas.plotting import lag_plot + + _check_plot_works(lag_plot, series=self.ts) + _check_plot_works(lag_plot, series=self.ts, lag=5) + + @pytest.mark.slow + def test_bootstrap_plot(self): + from pandas.plotting import bootstrap_plot + + _check_plot_works(bootstrap_plot, series=self.ts, size=10) + + +@td.skip_if_no_mpl +class TestDataFramePlots(TestPlotBase): + @td.skip_if_no_scipy + def test_scatter_matrix_axis(self): + scatter_matrix = plotting.scatter_matrix + + with tm.RNGContext(42): + df = DataFrame(randn(100, 3)) + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + 
) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + + # GH 5662 + expected = ["-2", "0", "2"] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + df[0] = (df[0] - 2) / 3 + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + ) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + expected = ["-1.0", "-0.5", "0.0"] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + @pytest.mark.slow + def test_andrews_curves(self, iris): + from pandas.plotting import andrews_curves + from matplotlib import cm + + df = iris + + _check_plot_works(andrews_curves, frame=df, class_column="Name") + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) + + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) + + length = 10 + df = DataFrame( + { + "A": random.rand(length), + "B": random.rand(length), + "C": random.rand(length), + "Name": ["A"] * length, + } + ) + + _check_plot_works(andrews_curves, frame=df, class_column="Name") + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) + + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) + + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = andrews_curves(df, "Name", color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + @pytest.mark.slow + def test_parallel_coordinates(self, iris): + from pandas.plotting import parallel_coordinates + from matplotlib import cm + + df = iris + + ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + nlines = len(ax.get_lines()) + nxticks = len(ax.xaxis.get_ticklabels()) + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=rgba + ) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=cnames + ) + self._check_colors( + 
ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) + + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) + + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", axvlines=False + ) + assert len(ax.get_lines()) == (nlines - nxticks) + + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = parallel_coordinates(df, "Name", color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + # not sure if this is indicative of a problem + @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") + def test_parallel_coordinates_with_sorted_labels(self): + """ For #15908 """ + from pandas.plotting import parallel_coordinates + + df = DataFrame( + { + "feat": list(range(30)), + "class": [2 for _ in range(10)] + + [3 for _ in range(10)] + + [1 for _ in range(10)], + } + ) + ax = parallel_coordinates(df, "class", sort_labels=True) + polylines, labels = ax.get_legend_handles_labels() + color_label_tuples = zip( + [polyline.get_color() for polyline in polylines], labels + ) + ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) + prev_next_tupels = zip( + list(ordered_color_label_tuples[0:-1]), list(ordered_color_label_tuples[1:]) + ) + for prev, nxt in prev_next_tupels: + # labels and colors are ordered strictly increasing + assert prev[1] < nxt[1] and prev[0] < nxt[0] + + @pytest.mark.slow + def test_radviz(self, iris): + from pandas.plotting import radviz + from matplotlib import cm + + df = iris + _check_plot_works(radviz, frame=df, class_column="Name") + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works(radviz, frame=df, class_column="Name", color=rgba) + # skip Circle drawn as ticks + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches[:10], facecolors=rgba, mapping=df["Name"][:10]) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + _check_plot_works(radviz, frame=df, class_column="Name", color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cnames, mapping=df["Name"][:10]) + + _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) + + colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] + df = DataFrame( + {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} + ) + ax = radviz(df, "Name", color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=colors) + + @pytest.mark.slow + def test_subplot_titles(self, iris): + df = iris.drop("Name", axis=1).head() + # Use the column names as the subplot titles + title = list(df.columns) + + # Case len(title) == len(df) + plot = df.plot(subplots=True, title=title) + assert [p.get_title() for p in plot] == title + + # Case len(title) > len(df) + msg = ( + "The length of `title` must equal the number of columns if" + " using `title` of type `list` and `subplots=True`" + ) + with pytest.raises(ValueError, match=msg): + 
df.plot(subplots=True, title=title + ["kittens > puppies"]) + + # Case len(title) < len(df) + with pytest.raises(ValueError, match=msg): + df.plot(subplots=True, title=title[:2]) + + # Case subplots=False and title is of type list + msg = ( + "Using `title` of type `list` is not supported unless" + " `subplots=True` is passed" + ) + with pytest.raises(ValueError, match=msg): + df.plot(subplots=False, title=title) + + # Case df with 3 numeric columns but layout of (2,2) + plot = df.drop("SepalWidth", axis=1).plot( + subplots=True, layout=(2, 2), title=title[:-1] + ) + title_list = [ax.get_title() for sublist in plot for ax in sublist] + assert title_list == title[:3] + [""] + + def test_get_standard_colors_random_seed(self): + # GH17525 + df = DataFrame(np.zeros((10, 10))) + + # Make sure that the random seed isn't reset by _get_standard_colors + plotting.parallel_coordinates(df, 0) + rand1 = random.random() + plotting.parallel_coordinates(df, 0) + rand2 = random.random() + assert rand1 != rand2 + + # Make sure it produces the same colors every time it's called + from pandas.plotting._matplotlib.style import _get_standard_colors + + color1 = _get_standard_colors(1, color_type="random") + color2 = _get_standard_colors(1, color_type="random") + assert color1 == color2 + + def test_get_standard_colors_default_num_colors(self): + from pandas.plotting._matplotlib.style import _get_standard_colors + + # Make sure the default color_types returns the specified amount + color1 = _get_standard_colors(1, color_type="default") + color2 = _get_standard_colors(9, color_type="default") + color3 = _get_standard_colors(20, color_type="default") + assert len(color1) == 1 + assert len(color2) == 9 + assert len(color3) == 20 + + def test_plot_single_color(self): + # Example from #20585. All 3 bars should have the same color + df = DataFrame( + { + "account-start": ["2017-02-03", "2017-03-03", "2017-01-01"], + "client": ["Alice Anders", "Bob Baker", "Charlie Chaplin"], + "balance": [-1432.32, 10.43, 30000.00], + "db-id": [1234, 2424, 251], + "proxy-id": [525, 1525, 2542], + "rank": [52, 525, 32], + } + ) + ax = df.client.value_counts().plot.bar() + colors = [rect.get_facecolor() for rect in ax.get_children()[0:3]] + assert all(color == colors[0] for color in colors) + + def test_get_standard_colors_no_appending(self): + # GH20726 + + # Make sure not to add more colors so that matplotlib can cycle + # correctly. 
+ from matplotlib import cm + from pandas.plotting._matplotlib.style import _get_standard_colors + + color_before = cm.gnuplot(range(5)) + color_after = _get_standard_colors(1, color=color_before) + assert len(color_after) == len(color_before) + + df = DataFrame(np.random.randn(48, 4), columns=list("ABCD")) + + color_list = cm.gnuplot(np.linspace(0, 1, 16)) + p = df.A.plot.bar(figsize=(16, 7), color=color_list) + assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() diff --git a/venv/Lib/site-packages/pandas/tests/plotting/test_series.py b/venv/Lib/site-packages/pandas/tests/plotting/test_series.py new file mode 100644 index 0000000..8463f30 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/plotting/test_series.py @@ -0,0 +1,938 @@ +# coding: utf-8 + +""" Test cases for Series.plot """ + + +from datetime import datetime +from itertools import chain + +import numpy as np +from numpy.random import randn +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, date_range +import pandas._testing as tm +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + +import pandas.plotting as plotting + + +@td.skip_if_no_mpl +class TestSeriesPlots(TestPlotBase): + def setup_method(self, method): + TestPlotBase.setup_method(self, method) + import matplotlib as mpl + + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = "ts" + + self.series = tm.makeStringSeries() + self.series.name = "series" + + self.iseries = tm.makePeriodSeries() + self.iseries.name = "iseries" + + @pytest.mark.slow + def test_plot(self): + _check_plot_works(self.ts.plot, label="foo") + _check_plot_works(self.ts.plot, use_index=False) + axes = _check_plot_works(self.ts.plot, rot=0) + self._check_ticks_props(axes, xrot=0) + + ax = _check_plot_works(self.ts.plot, style=".", logy=True) + self._check_ax_scales(ax, yaxis="log") + + ax = _check_plot_works(self.ts.plot, style=".", logx=True) + self._check_ax_scales(ax, xaxis="log") + + ax = _check_plot_works(self.ts.plot, style=".", loglog=True) + self._check_ax_scales(ax, xaxis="log", yaxis="log") + + _check_plot_works(self.ts[:10].plot.bar) + _check_plot_works(self.ts.plot.area, stacked=False) + _check_plot_works(self.iseries.plot) + + for kind in ["line", "bar", "barh", "kde", "hist", "box"]: + _check_plot_works(self.series[:5].plot, kind=kind) + + _check_plot_works(self.series[:10].plot.barh) + ax = _check_plot_works(Series(randn(10)).plot.bar, color="black") + self._check_colors([ax.patches[0]], facecolors=["black"]) + + # GH 6951 + ax = _check_plot_works(self.ts.plot, subplots=True) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + + @pytest.mark.slow + def test_plot_figsize_and_title(self): + # figsize and title + _, ax = self.plt.subplots() + ax = self.series.plot(title="Test", figsize=(16, 8), ax=ax) + self._check_text_labels(ax.title, "Test") + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) + + def test_dont_modify_rcParams(self): + # GH 8242 + key = "axes.prop_cycle" + colors = self.plt.rcParams[key] + _, ax = self.plt.subplots() + Series([1, 2, 3]).plot(ax=ax) + assert colors == self.plt.rcParams[key] + + def test_ts_line_lim(self): + fig, ax = self.plt.subplots() + ax = 
self.ts.plot(ax=ax) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data(orig=False)[0][0] + assert xmax >= lines[0].get_data(orig=False)[0][-1] + tm.close() + + ax = self.ts.plot(secondary_y=True, ax=ax) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= lines[0].get_data(orig=False)[0][0] + assert xmax >= lines[0].get_data(orig=False)[0][-1] + + def test_ts_area_lim(self): + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, ax=ax) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + assert xmin <= line[0] + assert xmax >= line[-1] + tm.close() + + # GH 7471 + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, x_compat=True, ax=ax) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + assert xmin <= line[0] + assert xmax >= line[-1] + tm.close() + + tz_ts = self.ts.copy() + tz_ts.index = tz_ts.tz_localize("GMT").tz_convert("CET") + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + assert xmin <= line[0] + assert xmax >= line[-1] + tm.close() + + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, secondary_y=True, ax=ax) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + assert xmin <= line[0] + assert xmax >= line[-1] + + def test_label(self): + s = Series([1, 2]) + _, ax = self.plt.subplots() + ax = s.plot(label="LABEL", legend=True, ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) + self.plt.close() + _, ax = self.plt.subplots() + ax = s.plot(legend=True, ax=ax) + self._check_legend_labels(ax, labels=["None"]) + self.plt.close() + # get name from index + s.name = "NAME" + _, ax = self.plt.subplots() + ax = s.plot(legend=True, ax=ax) + self._check_legend_labels(ax, labels=["NAME"]) + self.plt.close() + # override the default + _, ax = self.plt.subplots() + ax = s.plot(legend=True, label="LABEL", ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) + self.plt.close() + # Add lebel info, but don't draw + _, ax = self.plt.subplots() + ax = s.plot(legend=False, label="LABEL", ax=ax) + assert ax.get_legend() is None # Hasn't been drawn + ax.legend() # draw it + self._check_legend_labels(ax, labels=["LABEL"]) + + def test_boolean(self): + # GH 23719 + s = Series([False, False, True]) + _check_plot_works(s.plot, include_bool=True) + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + _check_plot_works(s.plot) + + def test_line_area_nan_series(self): + values = [1, 2, np.nan, 3] + s = Series(values) + ts = Series(values, index=tm.makeDateIndex(k=4)) + + for d in [s, ts]: + ax = _check_plot_works(d.plot) + masked = ax.lines[0].get_ydata() + # remove nan for comparison purpose + exp = np.array([1, 2, 3], dtype=np.float64) + tm.assert_numpy_array_equal(np.delete(masked.data, 2), exp) + tm.assert_numpy_array_equal( + masked.mask, np.array([False, False, True, False]) + ) + + expected = np.array([1, 2, 0, 3], dtype=np.float64) + ax = _check_plot_works(d.plot, stacked=True) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot.area) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot.area, stacked=False) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + + def test_line_use_index_false(self): + s = Series([1, 2, 3], index=["a", "b", "c"]) + 
s.index.name = "The Index" + _, ax = self.plt.subplots() + ax = s.plot(use_index=False, ax=ax) + label = ax.get_xlabel() + assert label == "" + _, ax = self.plt.subplots() + ax2 = s.plot.bar(use_index=False, ax=ax) + label2 = ax2.get_xlabel() + assert label2 == "" + + @pytest.mark.slow + def test_bar_log(self): + expected = np.array([1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]) + + _, ax = self.plt.subplots() + ax = Series([200, 500]).plot.bar(log=True, ax=ax) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + _, ax = self.plt.subplots() + ax = Series([200, 500]).plot.barh(log=True, ax=ax) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + tm.close() + + # GH 9905 + expected = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]) + + _, ax = self.plt.subplots() + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="bar", ax=ax) + ymin = 0.0007943282347242822 + ymax = 0.12589254117941673 + res = ax.get_ylim() + tm.assert_almost_equal(res[0], ymin) + tm.assert_almost_equal(res[1], ymax) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + _, ax = self.plt.subplots() + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="barh", ax=ax) + res = ax.get_xlim() + tm.assert_almost_equal(res[0], ymin) + tm.assert_almost_equal(res[1], ymax) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + + @pytest.mark.slow + def test_bar_ignore_index(self): + df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) + _, ax = self.plt.subplots() + ax = df.plot.bar(use_index=False, ax=ax) + self._check_text_labels(ax.get_xticklabels(), ["0", "1", "2", "3"]) + + def test_bar_user_colors(self): + s = Series([1, 2, 3, 4]) + ax = s.plot.bar(color=["red", "blue", "blue", "red"]) + result = [p.get_facecolor() for p in ax.patches] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] + assert result == expected + + def test_rotation(self): + df = DataFrame(randn(5, 5)) + # Default rot 0 + _, ax = self.plt.subplots() + axes = df.plot(ax=ax) + self._check_ticks_props(axes, xrot=0) + + _, ax = self.plt.subplots() + axes = df.plot(rot=30, ax=ax) + self._check_ticks_props(axes, xrot=30) + + def test_irregular_datetime(self): + rng = date_range("1/1/2000", "3/1/2000") + rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] + ser = Series(randn(len(rng)), rng) + _, ax = self.plt.subplots() + ax = ser.plot(ax=ax) + xp = datetime(1999, 1, 1).toordinal() + ax.set_xlim("1/1/1999", "1/1/2001") + assert xp == ax.get_xlim()[0] + + def test_unsorted_index_xlim(self): + ser = Series( + [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0], + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) + _, ax = self.plt.subplots() + ax = ser.plot(ax=ax) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + assert xmin <= np.nanmin(lines[0].get_data(orig=False)[0]) + assert xmax >= np.nanmax(lines[0].get_data(orig=False)[0]) + + @pytest.mark.slow + def test_pie_series(self): + # if sum of values is less than 1.0, pie handle them as rate and draw + # semicircle. 
+ series = Series( + np.random.randint(1, 5), index=["a", "b", "c", "d", "e"], name="YLABEL" + ) + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, series.index) + assert ax.get_ylabel() == "YLABEL" + + # without wedge labels + ax = _check_plot_works(series.plot.pie, labels=None) + self._check_text_labels(ax.texts, [""] * 5) + + # with less colors than elements + color_args = ["r", "g", "b"] + ax = _check_plot_works(series.plot.pie, colors=color_args) + + color_expected = ["r", "g", "b", "r", "g"] + self._check_colors(ax.patches, facecolors=color_expected) + + # with labels and colors + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] + ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args) + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + # with autopct and fontsize + ax = _check_plot_works( + series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 + ) + pcts = [f"{s*100:.2f}" for s in series.values / float(series.sum())] + expected_texts = list(chain.from_iterable(zip(series.index, pcts))) + self._check_text_labels(ax.texts, expected_texts) + for t in ax.texts: + assert t.get_fontsize() == 7 + + # includes negative value + with pytest.raises(ValueError): + series = Series([1, 2, 0, 4, -1], index=["a", "b", "c", "d", "e"]) + series.plot.pie() + + # includes nan + series = Series([1, 2, np.nan, 4], index=["a", "b", "c", "d"], name="YLABEL") + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, ["a", "b", "", "d"]) + + def test_pie_nan(self): + s = Series([1, np.nan, 1, 1]) + _, ax = self.plt.subplots() + ax = s.plot.pie(legend=True, ax=ax) + expected = ["0", "", "2", "3"] + result = [x.get_text() for x in ax.texts] + assert result == expected + + @pytest.mark.slow + def test_hist_df_kwargs(self): + df = DataFrame(np.random.randn(10, 2)) + _, ax = self.plt.subplots() + ax = df.plot.hist(bins=5, ax=ax) + assert len(ax.patches) == 10 + + @pytest.mark.slow + def test_hist_df_with_nonnumerics(self): + # GH 9853 + with tm.RNGContext(1): + df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df["E"] = ["x", "y"] * 5 + _, ax = self.plt.subplots() + ax = df.plot.hist(bins=5, ax=ax) + assert len(ax.patches) == 20 + + _, ax = self.plt.subplots() + ax = df.plot.hist(ax=ax) # bins=10 + assert len(ax.patches) == 40 + + @pytest.mark.slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with pytest.raises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @pytest.mark.slow + def test_hist_bins_legacy(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + assert len(ax.patches) == 2 + + @pytest.mark.slow + def test_hist_layout(self): + df = self.hist_df + with pytest.raises(ValueError): + df.height.hist(layout=(1, 1)) + + with pytest.raises(ValueError): + df.height.hist(layout=[1, 1]) + + @pytest.mark.slow + def test_hist_layout_with_by(self): + df = self.hist_df + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + + @pytest.mark.slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + + x = Series(randn(2)) + y = Series(randn(2)) + subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.axes + assert len(axes) == 2 + + @pytest.mark.slow + def test_hist_secondary_legend(self): + # GH 9610 + df = DataFrame(np.random.randn(30, 4), columns=list("abcd")) + + # primary -> secondary + _, ax = self.plt.subplots() + ax = df["a"].plot.hist(legend=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, secondary_y=True) + # both legends are dran on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=["a", "b (right)"]) + assert ax.get_yaxis().get_visible() + assert ax.right_ax.get_yaxis().get_visible() + tm.close() + + # secondary -> secondary + _, ax = self.plt.subplots() + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, 
secondary_y=True) + # both legends are draw on left ax + # left axis must be invisible, right axis must be visible + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b (right)"]) + assert not ax.left_ax.get_yaxis().get_visible() + assert ax.get_yaxis().get_visible() + tm.close() + + # secondary -> primary + _, ax = self.plt.subplots() + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) + # right axes is returned + df["b"].plot.hist(ax=ax, legend=True) + # both legends are draw on left ax + # left and right axis must be visible + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b"]) + assert ax.left_ax.get_yaxis().get_visible() + assert ax.get_yaxis().get_visible() + tm.close() + + @pytest.mark.slow + def test_df_series_secondary_legend(self): + # GH 9779 + df = DataFrame(np.random.randn(30, 3), columns=list("abc")) + s = Series(np.random.randn(30), name="x") + + # primary -> secondary (without passing ax) + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) + s.plot(legend=True, secondary_y=True, ax=ax) + # both legends are dran on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) + assert ax.get_yaxis().get_visible() + assert ax.right_ax.get_yaxis().get_visible() + tm.close() + + # primary -> secondary (with passing ax) + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are dran on left ax + # left and right axis must be visible + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) + assert ax.get_yaxis().get_visible() + assert ax.right_ax.get_yaxis().get_visible() + tm.close() + + # secondary -> secondary (without passing ax) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, ax=ax) + s.plot(legend=True, secondary_y=True, ax=ax) + # both legends are dran on left ax + # left axis must be invisible and right axis must be visible + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] + self._check_legend_labels(ax.left_ax, labels=expected) + assert not ax.left_ax.get_yaxis().get_visible() + assert ax.get_yaxis().get_visible() + tm.close() + + # secondary -> secondary (with passing ax) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, ax=ax) + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are dran on left ax + # left axis must be invisible and right axis must be visible + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] + self._check_legend_labels(ax.left_ax, expected) + assert not ax.left_ax.get_yaxis().get_visible() + assert ax.get_yaxis().get_visible() + tm.close() + + # secondary -> secondary (with passing ax) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, mark_right=False, ax=ax) + s.plot(ax=ax, legend=True, secondary_y=True) + # both legends are dran on left ax + # left axis must be invisible and right axis must be visible + expected = ["a", "b", "c", "x (right)"] + self._check_legend_labels(ax.left_ax, expected) + assert not ax.left_ax.get_yaxis().get_visible() + assert ax.get_yaxis().get_visible() + tm.close() + + @pytest.mark.slow + @pytest.mark.parametrize( + "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] + ) + def test_secondary_logy(self, input_logy, expected_scale): + # GH 25545 + s1 = Series(np.random.randn(30)) + s2 = Series(np.random.randn(30)) + + # GH 24980 + ax1 = s1.plot(logy=input_logy) + ax2 = s2.plot(secondary_y=True, logy=input_logy) + + assert ax1.get_yscale() == expected_scale + assert 
ax2.get_yscale() == expected_scale + + @pytest.mark.slow + def test_plot_fails_with_dupe_color_and_style(self): + x = Series(randn(2)) + with pytest.raises(ValueError): + _, ax = self.plt.subplots() + x.plot(style="k--", color="k", ax=ax) + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_hist_kde(self): + + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(logy=True, ax=ax) + self._check_ax_scales(ax, yaxis="log") + xlabels = ax.get_xticklabels() + # ticks are values, thus ticklabels are blank + self._check_text_labels(xlabels, [""] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [""] * len(ylabels)) + + _check_plot_works(self.ts.plot.kde) + _check_plot_works(self.ts.plot.density) + _, ax = self.plt.subplots() + ax = self.ts.plot.kde(logy=True, ax=ax) + self._check_ax_scales(ax, yaxis="log") + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [""] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [""] * len(ylabels)) + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_kwargs(self): + sample_points = np.linspace(-100, 100, 20) + _check_plot_works(self.ts.plot.kde, bw_method="scott", ind=20) + _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20) + _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20)) + _check_plot_works(self.ts.plot.kde, bw_method=0.5, ind=sample_points) + _check_plot_works(self.ts.plot.density, bw_method=0.5, ind=sample_points) + _, ax = self.plt.subplots() + ax = self.ts.plot.kde(logy=True, bw_method=0.5, ind=sample_points, ax=ax) + self._check_ax_scales(ax, yaxis="log") + self._check_text_labels(ax.yaxis.get_label(), "Density") + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_kde_missing_vals(self): + s = Series(np.random.uniform(size=50)) + s[0] = np.nan + axes = _check_plot_works(s.plot.kde) + + # gh-14821: check if the values have any missing values + assert any(~np.isnan(axes.lines[0].get_xdata())) + + @pytest.mark.slow + def test_hist_kwargs(self): + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(bins=5, ax=ax) + assert len(ax.patches) == 5 + self._check_text_labels(ax.yaxis.get_label(), "Frequency") + tm.close() + + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(orientation="horizontal", ax=ax) + self._check_text_labels(ax.xaxis.get_label(), "Frequency") + tm.close() + + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(align="left", stacked=True, ax=ax) + tm.close() + + @pytest.mark.slow + @td.skip_if_no_scipy + def test_hist_kde_color(self): + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(logy=True, bins=10, color="b", ax=ax) + self._check_ax_scales(ax, yaxis="log") + assert len(ax.patches) == 10 + self._check_colors(ax.patches, facecolors=["b"] * 10) + + _, ax = self.plt.subplots() + ax = self.ts.plot.kde(logy=True, color="r", ax=ax) + self._check_ax_scales(ax, yaxis="log") + lines = ax.get_lines() + assert len(lines) == 1 + self._check_colors(lines, ["r"]) + + @pytest.mark.slow + def test_boxplot_series(self): + _, ax = self.plt.subplots() + ax = self.ts.plot.box(logy=True, ax=ax) + self._check_ax_scales(ax, yaxis="log") + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [self.ts.name]) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [""] * len(ylabels)) + + @pytest.mark.slow + def test_kind_both_ways(self): + s = Series(range(3)) + kinds = ( + plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds + ) + _, ax = self.plt.subplots() + for kind in kinds: + + 
s.plot(kind=kind, ax=ax) + getattr(s.plot, kind)() + + @pytest.mark.slow + def test_invalid_plot_data(self): + s = Series(list("abcd")) + _, ax = self.plt.subplots() + for kind in plotting.PlotAccessor._common_kinds: + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + s.plot(kind=kind, ax=ax) + + @pytest.mark.slow + def test_valid_object_plot(self): + s = Series(range(10), dtype=object) + for kind in plotting.PlotAccessor._common_kinds: + _check_plot_works(s.plot, kind=kind) + + def test_partially_invalid_plot_data(self): + s = Series(["a", "b", 1.0, 2]) + _, ax = self.plt.subplots() + for kind in plotting.PlotAccessor._common_kinds: + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + s.plot(kind=kind, ax=ax) + + def test_invalid_kind(self): + s = Series([1, 2]) + with pytest.raises(ValueError): + s.plot(kind="aasdf") + + @pytest.mark.slow + def test_dup_datetime_index_plot(self): + dr1 = date_range("1/1/2009", periods=4) + dr2 = date_range("1/2/2009", periods=4) + index = dr1.append(dr2) + values = randn(index.size) + s = Series(values, index=index) + _check_plot_works(s.plot) + + @pytest.mark.slow + def test_errorbar_plot(self): + + s = Series(np.arange(10), name="x") + s_err = np.random.randn(10) + d_err = DataFrame(randn(10, 2), index=s.index, columns=["x", "y"]) + # test line and bar plots + kinds = ["line", "bar"] + for kind in kinds: + ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + ax = _check_plot_works(s.plot, xerr=s_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + + # test time series plotting + ix = date_range("1/1/2000", "1/1/2001", freq="M") + ts = Series(np.arange(12), index=ix, name="x") + ts_err = Series(np.random.randn(12), index=ix) + td_err = DataFrame(randn(12, 2), index=ix, columns=["x", "y"]) + + ax = _check_plot_works(ts.plot, yerr=ts_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(ts.plot, yerr=td_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + # check incorrect lengths and types + with pytest.raises(ValueError): + s.plot(yerr=np.arange(11)) + + s_err = ["zzz"] * 10 + with pytest.raises(TypeError): + s.plot(yerr=s_err) + + def test_table(self): + _check_plot_works(self.series.plot, table=True) + _check_plot_works(self.series.plot, table=self.series) + + @pytest.mark.slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings( + Series([1, 2, 3]), + plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds, + ) + + @pytest.mark.slow + def test_standard_colors(self): + from pandas.plotting._matplotlib.style import _get_standard_colors + + for c in ["r", "red", "green", "#FF0000"]: + result = _get_standard_colors(1, color=c) + assert result == [c] + + result = _get_standard_colors(1, color=[c]) + assert result == [c] + + result = _get_standard_colors(3, color=c) + assert result == [c] * 3 + + result = _get_standard_colors(3, color=[c]) + assert result == [c] * 3 + + 
@pytest.mark.slow + def test_standard_colors_all(self): + import matplotlib.colors as colors + from pandas.plotting._matplotlib.style import _get_standard_colors + + # multiple colors like mediumaquamarine + for c in colors.cnames: + result = _get_standard_colors(num_colors=1, color=c) + assert result == [c] + + result = _get_standard_colors(num_colors=1, color=[c]) + assert result == [c] + + result = _get_standard_colors(num_colors=3, color=c) + assert result == [c] * 3 + + result = _get_standard_colors(num_colors=3, color=[c]) + assert result == [c] * 3 + + # single letter colors like k + for c in colors.ColorConverter.colors: + result = _get_standard_colors(num_colors=1, color=c) + assert result == [c] + + result = _get_standard_colors(num_colors=1, color=[c]) + assert result == [c] + + result = _get_standard_colors(num_colors=3, color=c) + assert result == [c] * 3 + + result = _get_standard_colors(num_colors=3, color=[c]) + assert result == [c] * 3 + + def test_series_plot_color_kwargs(self): + # GH1890 + _, ax = self.plt.subplots() + ax = Series(np.arange(12) + 1).plot(color="green", ax=ax) + self._check_colors(ax.get_lines(), linecolors=["green"]) + + def test_time_series_plot_color_kwargs(self): + # #1890 + _, ax = self.plt.subplots() + ax = Series(np.arange(12) + 1, index=date_range("1/1/2000", periods=12)).plot( + color="green", ax=ax + ) + self._check_colors(ax.get_lines(), linecolors=["green"]) + + def test_time_series_plot_color_with_empty_kwargs(self): + import matplotlib as mpl + + def_colors = self._unpack_cycler(mpl.rcParams) + index = date_range("1/1/2000", periods=12) + s = Series(np.arange(1, 13), index=index) + + ncolors = 3 + + _, ax = self.plt.subplots() + for i in range(ncolors): + ax = s.plot(ax=ax) + self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) + + def test_xticklabels(self): + # GH11529 + s = Series(np.arange(10), index=[f"P{i:02d}" for i in range(10)]) + _, ax = self.plt.subplots() + ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) + exp = [f"P{i:02d}" for i in [0, 3, 5, 9]] + self._check_text_labels(ax.get_xticklabels(), exp) + + def test_xtick_barPlot(self): + # GH28172 + s = pd.Series(range(10), index=[f"P{i:02d}" for i in range(10)]) + ax = s.plot.bar(xticks=range(0, 11, 2)) + exp = np.array(list(range(0, 11, 2))) + tm.assert_numpy_array_equal(exp, ax.get_xticks()) + + def test_custom_business_day_freq(self): + # GH7222 + from pandas.tseries.offsets import CustomBusinessDay + + s = Series( + range(100, 121), + index=pd.bdate_range( + start="2014-05-01", + end="2014-06-01", + freq=CustomBusinessDay(holidays=["2014-05-26"]), + ), + ) + + _check_plot_works(s.plot) + + @pytest.mark.xfail + def test_plot_accessor_updates_on_inplace(self): + s = Series([1, 2, 3, 4]) + _, ax = self.plt.subplots() + ax = s.plot(ax=ax) + before = ax.xaxis.get_ticklocs() + + s.drop([0, 1], inplace=True) + _, ax = self.plt.subplots() + after = ax.xaxis.get_ticklocs() + tm.assert_numpy_array_equal(before, after) + + @pytest.mark.parametrize("kind", ["line", "area"]) + def test_plot_xlim_for_series(self, kind): + # test if xlim is also correctly plotted in Series for line and area + # GH 27686 + s = Series([2, 3]) + _, ax = self.plt.subplots() + s.plot(kind=kind, ax=ax) + xlims = ax.get_xlim() + + assert xlims[0] < 0 + assert xlims[1] > 1 + + def test_plot_no_rows(self): + # GH 27758 + df = pd.Series(dtype=int) + assert df.empty + ax = df.plot() + assert len(ax.get_lines()) == 1 + line = ax.get_lines()[0] + assert len(line.get_xdata()) == 0 + assert len(line.get_ydata()) == 
0 + + def test_plot_no_numeric_data(self): + df = pd.Series(["a", "b", "c"]) + with pytest.raises(TypeError): + df.plot() + + def test_style_single_ok(self): + s = pd.Series([1, 2]) + ax = s.plot(style="s", color="C3") + assert ax.lines[0].get_color() == ["C3"] diff --git a/venv/Lib/site-packages/pandas/tests/reductions/__init__.py b/venv/Lib/site-packages/pandas/tests/reductions/__init__.py new file mode 100644 index 0000000..e385175 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reductions/__init__.py @@ -0,0 +1,4 @@ +""" +Tests for reductions where we want to test for matching behavior across +Array, Index, Series, and DataFrame methods. +""" diff --git a/venv/Lib/site-packages/pandas/tests/reductions/test_reductions.py b/venv/Lib/site-packages/pandas/tests/reductions/test_reductions.py new file mode 100644 index 0000000..7400b04 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reductions/test_reductions.py @@ -0,0 +1,1278 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, + timedelta_range, + to_timedelta, +) +import pandas._testing as tm +from pandas.core import nanops + + +def get_objs(): + indexes = [ + tm.makeBoolIndex(10, name="a"), + tm.makeIntIndex(10, name="a"), + tm.makeFloatIndex(10, name="a"), + tm.makeDateIndex(10, name="a"), + tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), + tm.makePeriodIndex(10, name="a"), + tm.makeStringIndex(10, name="a"), + tm.makeUnicodeIndex(10, name="a"), + ] + + arr = np.random.randn(10) + series = [Series(arr, index=idx, name="a") for idx in indexes] + + objs = indexes + series + return objs + + +objs = get_objs() + + +class TestReductions: + @pytest.mark.parametrize("opname", ["max", "min"]) + @pytest.mark.parametrize("obj", objs) + def test_ops(self, opname, obj): + result = getattr(obj, opname)() + if not isinstance(obj, PeriodIndex): + expected = getattr(obj.values, opname)() + else: + expected = pd.Period( + ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq + ) + try: + assert result == expected + except TypeError: + # comparing tz-aware series with np.array results in + # TypeError + expected = expected.astype("M8[ns]").astype("int64") + assert result.value == expected + + def test_nanops(self): + # GH#7261 + for opname in ["max", "min"]: + for klass in [Index, Series]: + arg_op = "arg" + opname if klass is Index else "idx" + opname + + obj = klass([np.nan, 2.0]) + assert getattr(obj, opname)() == 2.0 + + obj = klass([np.nan]) + assert pd.isna(getattr(obj, opname)()) + assert pd.isna(getattr(obj, opname)(skipna=False)) + + obj = klass([], dtype=object) + assert pd.isna(getattr(obj, opname)()) + assert pd.isna(getattr(obj, opname)(skipna=False)) + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) + # check DatetimeIndex monotonic path + assert getattr(obj, opname)() == datetime(2011, 11, 1) + assert getattr(obj, opname)(skipna=False) is pd.NaT + + assert getattr(obj, arg_op)() == 1 + result = getattr(obj, arg_op)(skipna=False) + if klass is Series: + assert np.isnan(result) + else: + assert result == -1 + + obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) + # check DatetimeIndex non-monotonic path + assert getattr(obj, opname)(), datetime(2011, 11, 1) + assert getattr(obj, opname)(skipna=False) is pd.NaT + + assert getattr(obj, arg_op)() == 1 + result = getattr(obj, 
arg_op)(skipna=False) + if klass is Series: + assert np.isnan(result) + else: + assert result == -1 + + for dtype in ["M8[ns]", "datetime64[ns, UTC]"]: + # cases with empty Series/DatetimeIndex + obj = klass([], dtype=dtype) + + assert getattr(obj, opname)() is pd.NaT + assert getattr(obj, opname)(skipna=False) is pd.NaT + + with pytest.raises(ValueError, match="empty sequence"): + getattr(obj, arg_op)() + with pytest.raises(ValueError, match="empty sequence"): + getattr(obj, arg_op)(skipna=False) + + # argmin/max + obj = Index(np.arange(5, dtype="int64")) + assert obj.argmin() == 0 + assert obj.argmax() == 4 + + obj = Index([np.nan, 1, np.nan, 2]) + assert obj.argmin() == 1 + assert obj.argmax() == 3 + assert obj.argmin(skipna=False) == -1 + assert obj.argmax(skipna=False) == -1 + + obj = Index([np.nan]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + assert obj.argmin(skipna=False) == -1 + assert obj.argmax(skipna=False) == -1 + + obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT]) + assert obj.argmin() == 1 + assert obj.argmax() == 2 + assert obj.argmin(skipna=False) == -1 + assert obj.argmax(skipna=False) == -1 + + obj = Index([pd.NaT]) + assert obj.argmin() == -1 + assert obj.argmax() == -1 + assert obj.argmin(skipna=False) == -1 + assert obj.argmax(skipna=False) == -1 + + @pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]]) + def test_same_tz_min_max_axis_1(self, op, expected_col): + # GH 10390 + df = DataFrame( + pd.date_range("2016-01-01 00:00:00", periods=3, tz="UTC"), columns=["a"] + ) + df["b"] = df.a.subtract(pd.Timedelta(seconds=3600)) + result = getattr(df, op)(axis=1) + expected = df[expected_col].rename(None) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["maximum", "minimum"]) + def test_numpy_reduction_with_tz_aware_dtype(self, tz_aware_fixture, func): + # GH 15552 + tz = tz_aware_fixture + arg = pd.to_datetime(["2019"]).tz_localize(tz) + expected = Series(arg) + result = getattr(np, func)(expected, expected) + tm.assert_series_equal(result, expected) + + +class TestIndexReductions: + # Note: the name TestIndexReductions indicates these tests + # were moved from a Index-specific test file, _not_ that these tests are + # intended long-term to be Index-specific + + @pytest.mark.parametrize( + "start,stop,step", + [ + (0, 400, 3), + (500, 0, -6), + (-(10 ** 6), 10 ** 6, 4), + (10 ** 6, -(10 ** 6), -4), + (0, 10, 20), + ], + ) + def test_max_min_range(self, start, stop, step): + # GH#17607 + idx = RangeIndex(start, stop, step) + expected = idx._int64index.max() + result = idx.max() + assert result == expected + + # skipna should be irrelevant since RangeIndex should never have NAs + result2 = idx.max(skipna=False) + assert result2 == expected + + expected = idx._int64index.min() + result = idx.min() + assert result == expected + + # skipna should be irrelevant since RangeIndex should never have NAs + result2 = idx.min(skipna=False) + assert result2 == expected + + # empty + idx = RangeIndex(start, stop, -step) + assert isna(idx.max()) + assert isna(idx.min()) + + def test_minmax_timedelta64(self): + + # monotonic + idx1 = TimedeltaIndex(["1 days", "2 days", "3 days"]) + assert idx1.is_monotonic + + # non-monotonic + idx2 = TimedeltaIndex(["1 days", np.nan, "3 days", "NaT"]) + assert not idx2.is_monotonic + + for idx in [idx1, idx2]: + assert idx.min() == Timedelta("1 days") + assert idx.max() == Timedelta("3 days") + assert idx.argmin() == 0 + assert idx.argmax() == 2 + + for op in 
["min", "max"]: + # Return NaT + obj = TimedeltaIndex([]) + assert pd.isna(getattr(obj, op)()) + + obj = TimedeltaIndex([pd.NaT]) + assert pd.isna(getattr(obj, op)()) + + obj = TimedeltaIndex([pd.NaT, pd.NaT, pd.NaT]) + assert pd.isna(getattr(obj, op)()) + + def test_numpy_minmax_timedelta64(self): + td = timedelta_range("16815 days", "16820 days", freq="D") + + assert np.min(td) == Timedelta("16815 days") + assert np.max(td) == Timedelta("16820 days") + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.min(td, out=0) + with pytest.raises(ValueError, match=errmsg): + np.max(td, out=0) + + assert np.argmin(td) == 0 + assert np.argmax(td) == 5 + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.argmin(td, out=0) + with pytest.raises(ValueError, match=errmsg): + np.argmax(td, out=0) + + def test_timedelta_ops(self): + # GH#4984 + # make sure ops return Timedelta + s = Series( + [Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)] + ) + td = s.diff() + + result = td.mean() + expected = to_timedelta(timedelta(seconds=9)) + assert result == expected + + result = td.to_frame().mean() + assert result[0] == expected + + result = td.quantile(0.1) + expected = Timedelta(np.timedelta64(2600, "ms")) + assert result == expected + + result = td.median() + expected = to_timedelta("00:00:09") + assert result == expected + + result = td.to_frame().median() + assert result[0] == expected + + # GH#6462 + # consistency in returned values for sum + result = td.sum() + expected = to_timedelta("00:01:21") + assert result == expected + + result = td.to_frame().sum() + assert result[0] == expected + + # std + result = td.std() + expected = to_timedelta(Series(td.dropna().values).std()) + assert result == expected + + result = td.to_frame().std() + assert result[0] == expected + + # GH#10040 + # make sure NaT is properly handled by median() + s = Series([Timestamp("2015-02-03"), Timestamp("2015-02-07")]) + assert s.diff().median() == timedelta(days=4) + + s = Series( + [Timestamp("2015-02-03"), Timestamp("2015-02-07"), Timestamp("2015-02-15")] + ) + assert s.diff().median() == timedelta(days=6) + + @pytest.mark.parametrize("opname", ["skew", "kurt", "sem", "prod", "var"]) + def test_invalid_td64_reductions(self, opname): + s = Series( + [Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)] + ) + td = s.diff() + + msg = "reduction operation '{op}' not allowed for this dtype" + msg = msg.format(op=opname) + + with pytest.raises(TypeError, match=msg): + getattr(td, opname)() + + with pytest.raises(TypeError, match=msg): + getattr(td.to_frame(), opname)(numeric_only=False) + + def test_minmax_tz(self, tz_naive_fixture): + tz = tz_naive_fixture + # monotonic + idx1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz=tz) + assert idx1.is_monotonic + + # non-monotonic + idx2 = pd.DatetimeIndex( + ["2011-01-01", pd.NaT, "2011-01-03", "2011-01-02", pd.NaT], tz=tz + ) + assert not idx2.is_monotonic + + for idx in [idx1, idx2]: + assert idx.min() == Timestamp("2011-01-01", tz=tz) + assert idx.max() == Timestamp("2011-01-03", tz=tz) + assert idx.argmin() == 0 + assert idx.argmax() == 2 + + @pytest.mark.parametrize("op", ["min", "max"]) + def test_minmax_nat_datetime64(self, op): + # Return NaT + obj = DatetimeIndex([]) + assert pd.isna(getattr(obj, op)()) + + obj = DatetimeIndex([pd.NaT]) + assert pd.isna(getattr(obj, op)()) + + obj = DatetimeIndex([pd.NaT, pd.NaT, pd.NaT]) + 
assert pd.isna(getattr(obj, op)()) + + def test_numpy_minmax_integer(self): + # GH#26125 + idx = Index([1, 2, 3]) + + expected = idx.values.max() + result = np.max(idx) + assert result == expected + + expected = idx.values.min() + result = np.min(idx) + assert result == expected + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.min(idx, out=0) + with pytest.raises(ValueError, match=errmsg): + np.max(idx, out=0) + + expected = idx.values.argmax() + result = np.argmax(idx) + assert result == expected + + expected = idx.values.argmin() + result = np.argmin(idx) + assert result == expected + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.argmin(idx, out=0) + with pytest.raises(ValueError, match=errmsg): + np.argmax(idx, out=0) + + def test_numpy_minmax_range(self): + # GH#26125 + idx = RangeIndex(0, 10, 3) + + expected = idx._int64index.max() + result = np.max(idx) + assert result == expected + + expected = idx._int64index.min() + result = np.min(idx) + assert result == expected + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.min(idx, out=0) + with pytest.raises(ValueError, match=errmsg): + np.max(idx, out=0) + + # No need to test again argmax/argmin compat since the implementation + # is the same as basic integer index + + def test_numpy_minmax_datetime64(self): + dr = pd.date_range(start="2016-01-15", end="2016-01-20") + + assert np.min(dr) == Timestamp("2016-01-15 00:00:00", freq="D") + assert np.max(dr) == Timestamp("2016-01-20 00:00:00", freq="D") + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.min(dr, out=0) + + with pytest.raises(ValueError, match=errmsg): + np.max(dr, out=0) + + assert np.argmin(dr) == 0 + assert np.argmax(dr) == 5 + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.argmin(dr, out=0) + + with pytest.raises(ValueError, match=errmsg): + np.argmax(dr, out=0) + + def test_minmax_period(self): + + # monotonic + idx1 = pd.PeriodIndex([NaT, "2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + assert idx1.is_monotonic + + # non-monotonic + idx2 = pd.PeriodIndex( + ["2011-01-01", NaT, "2011-01-03", "2011-01-02", NaT], freq="D" + ) + assert not idx2.is_monotonic + + for idx in [idx1, idx2]: + assert idx.min() == pd.Period("2011-01-01", freq="D") + assert idx.max() == pd.Period("2011-01-03", freq="D") + assert idx1.argmin() == 1 + assert idx2.argmin() == 0 + assert idx1.argmax() == 3 + assert idx2.argmax() == 2 + + for op in ["min", "max"]: + # Return NaT + obj = PeriodIndex([], freq="M") + result = getattr(obj, op)() + assert result is NaT + + obj = PeriodIndex([NaT], freq="M") + result = getattr(obj, op)() + assert result is NaT + + obj = PeriodIndex([NaT, NaT, NaT], freq="M") + result = getattr(obj, op)() + assert result is NaT + + def test_numpy_minmax_period(self): + pr = pd.period_range(start="2016-01-15", end="2016-01-20") + + assert np.min(pr) == Period("2016-01-15", freq="D") + assert np.max(pr) == Period("2016-01-20", freq="D") + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.min(pr, out=0) + with pytest.raises(ValueError, match=errmsg): + np.max(pr, out=0) + + assert np.argmin(pr) == 0 + assert np.argmax(pr) == 5 + + errmsg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=errmsg): + np.argmin(pr, out=0) + 
with pytest.raises(ValueError, match=errmsg): + np.argmax(pr, out=0) + + def test_min_max_categorical(self): + + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + with pytest.raises(TypeError): + ci.min() + with pytest.raises(TypeError): + ci.max() + + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=True) + assert ci.min() == "c" + assert ci.max() == "b" + + +class TestSeriesReductions: + # Note: the name TestSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_sum_inf(self): + s = Series(np.random.randn(10)) + s2 = s.copy() + + s[5:8] = np.inf + s2[5:8] = np.nan + + assert np.isinf(s.sum()) + + arr = np.random.randn(100, 100).astype("f4") + arr[:, 2] = np.inf + + with pd.option_context("mode.use_inf_as_na", True): + tm.assert_almost_equal(s.sum(), s2.sum()) + + res = nanops.nansum(arr, axis=1) + assert np.isinf(res).all() + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) + def test_empty(self, method, unit, use_bottleneck): + with pd.option_context("use_bottleneck", use_bottleneck): + # GH#9422 / GH#18921 + # Entirely empty + s = Series([], dtype=object) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + # Skipna, default + result = getattr(s, method)(skipna=True) + result == unit + + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # All-NA + s = Series([np.nan]) + # NA by default + result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) + assert pd.isna(result) + + # Skipna, default + result = getattr(s, method)(skipna=True) + result == unit + + # skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) + assert pd.isna(result) + + # Mix of valid, empty + s = Series([np.nan, 1]) + # Default + result = getattr(s, method)() + assert result == 1.0 + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == 1.0 + + result = getattr(s, method)(min_count=1) + assert result == 1.0 + + # Skipna + result = getattr(s, method)(skipna=True) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=0) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=1) + assert result == 1.0 + + # GH#844 (changed in GH#9422) + df = DataFrame(np.empty((10, 0))) + assert (getattr(df, method)(1) == unit).all() + + s = pd.Series([1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan, 1]) + result = getattr(s, method)(min_count=2) + assert pd.isna(result) + + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) + def test_empty_multi(self, method, unit): + s = pd.Series( + [1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]), + ) + # 1 / 0 by default + result = getattr(s, 
method)(level=0) + expected = pd.Series([1, unit], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(s, method)(level=0, min_count=0) + expected = pd.Series([1, unit], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(s, method)(level=0, min_count=1) + expected = pd.Series([1, np.nan], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("method", ["mean", "median", "std", "var"]) + def test_ops_consistency_on_empty(self, method): + + # GH#7869 + # consistency on empty + + # float + result = getattr(Series(dtype=float), method)() + assert pd.isna(result) + + # timedelta64[ns] + tdser = Series([], dtype="m8[ns]") + if method == "var": + with pytest.raises(TypeError, match="operation 'var' not allowed"): + getattr(tdser, method)() + else: + result = getattr(tdser, method)() + assert result is pd.NaT + + def test_nansum_buglet(self): + ser = Series([1.0, np.nan], index=[0, 1]) + result = np.nansum(ser) + tm.assert_almost_equal(result, 1) + + @pytest.mark.parametrize("use_bottleneck", [True, False]) + def test_sum_overflow(self, use_bottleneck): + + with pd.option_context("use_bottleneck", use_bottleneck): + # GH#6915 + # overflowing on the smaller int dtypes + for dtype in ["int32", "int64"]: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert int(result) == v.sum(dtype="int64") + result = s.min(skipna=False) + assert int(result) == 0 + result = s.max(skipna=False) + assert int(result) == v[-1] + + for dtype in ["float32", "float64"]: + v = np.arange(5000000, dtype=dtype) + s = Series(v) + + result = s.sum(skipna=False) + assert result == v.sum(dtype=dtype) + result = s.min(skipna=False) + assert np.allclose(float(result), 0.0) + result = s.max(skipna=False) + assert np.allclose(float(result), v[-1]) + + def test_empty_timeseries_reductions_return_nat(self): + # covers GH#11245 + for dtype in ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"): + assert Series([], dtype=dtype).min() is pd.NaT + assert Series([], dtype=dtype).max() is pd.NaT + assert Series([], dtype=dtype).min(skipna=False) is pd.NaT + assert Series([], dtype=dtype).max(skipna=False) is pd.NaT + + def test_numpy_argmin(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + result = np.argmin(s) + + expected = np.argmin(data) + assert result == expected + + result = s.argmin() + + assert result == expected + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmin(s, out=data) + + def test_numpy_argmax(self): + # See GH#16830 + data = np.arange(1, 11) + + s = Series(data, index=data) + result = np.argmax(s) + expected = np.argmax(data) + assert result == expected + + result = s.argmax() + + assert result == expected + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmax(s, out=data) + + def test_idxmin(self): + # test idxmin + # _check_stat_op approach can not be used here because of isna check. 
+ string_series = tm.makeStringSeries().rename("series") + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmin()] == string_series.min() + assert pd.isna(string_series.idxmin(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmin()] == nona.min() + assert nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin() + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmin()) + + # datetime64[ns] + s = Series(pd.date_range("20130102", periods=6)) + result = s.idxmin() + assert result == 0 + + s[0] = np.nan + result = s.idxmin() + assert result == 1 + + def test_idxmax(self): + # test idxmax + # _check_stat_op approach can not be used here because of isna check. + string_series = tm.makeStringSeries().rename("series") + + # add some NaNs + string_series[5:15] = np.NaN + + # skipna or no + assert string_series[string_series.idxmax()] == string_series.max() + assert pd.isna(string_series.idxmax(skipna=False)) + + # no NaNs + nona = string_series.dropna() + assert nona[nona.idxmax()] == nona.max() + assert nona.index.values.tolist().index(nona.idxmax()) == nona.values.argmax() + + # all NaNs + allna = string_series * np.nan + assert pd.isna(allna.idxmax()) + + from pandas import date_range + + s = Series(date_range("20130102", periods=6)) + result = s.idxmax() + assert result == 5 + + s[5] = np.nan + result = s.idxmax() + assert result == 4 + + # Float64Index + # GH#5914 + s = pd.Series([1, 2, 3], [1.1, 2.1, 3.1]) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + s = pd.Series(s.index, s.index) + result = s.idxmax() + assert result == 3.1 + result = s.idxmin() + assert result == 1.1 + + def test_all_any(self): + ts = tm.makeTimeSeries() + bool_series = ts > 0 + assert not bool_series.all() + assert bool_series.any() + + # Alternative types, with implicit 'object' dtype. + s = Series(["abc", True]) + assert "abc" == s.any() # 'abc' || True => 'abc' + + def test_all_any_params(self): + # Check skipna, with implicit 'object' dtype. + s1 = Series([np.nan, True]) + s2 = Series([np.nan, False]) + assert s1.all(skipna=False) # nan && True => True + assert s1.all(skipna=True) + assert np.isnan(s2.any(skipna=False)) # nan || False => nan + assert not s2.any(skipna=True) + + # Check level. + s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) + tm.assert_series_equal(s.all(level=0), Series([False, True, False])) + tm.assert_series_equal(s.any(level=0), Series([False, True, True])) + + # bool_only is not implemented with level option. + with pytest.raises(NotImplementedError): + s.any(bool_only=True, level=0) + with pytest.raises(NotImplementedError): + s.all(bool_only=True, level=0) + + # bool_only is not implemented alone. 
+ with pytest.raises(NotImplementedError): + s.any(bool_only=True) + with pytest.raises(NotImplementedError): + s.all(bool_only=True) + + def test_timedelta64_analytics(self): + + # index min/max + dti = pd.date_range("2012-1-1", periods=3, freq="D") + td = Series(dti) - pd.Timestamp("20120101") + + result = td.idxmin() + assert result == 0 + + result = td.idxmax() + assert result == 2 + + # GH#2982 + # with NaT + td[0] = np.nan + + result = td.idxmin() + assert result == 1 + + result = td.idxmax() + assert result == 2 + + # abs + s1 = Series(pd.date_range("20120101", periods=3)) + s2 = Series(pd.date_range("20120102", periods=3)) + expected = Series(s2 - s1) + + # FIXME: don't leave commented-out code + # this fails as numpy returns timedelta64[us] + # result = np.abs(s1-s2) + # assert_frame_equal(result,expected) + + result = (s1 - s2).abs() + tm.assert_series_equal(result, expected) + + # max/min + result = td.max() + expected = pd.Timedelta("2 days") + assert result == expected + + result = td.min() + expected = pd.Timedelta("1 days") + assert result == expected + + @pytest.mark.parametrize( + "test_input,error_type", + [ + (pd.Series([], dtype="float64"), ValueError), + # For strings, or any Series with dtype 'O' + (pd.Series(["foo", "bar", "baz"]), TypeError), + (pd.Series([(1,), (2,)]), TypeError), + # For mixed data types + (pd.Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), + ], + ) + def test_assert_idxminmax_raises(self, test_input, error_type): + """ + Cases where ``Series.argmax`` and related should raise an exception + """ + with pytest.raises(error_type): + test_input.idxmin() + with pytest.raises(error_type): + test_input.idxmin(skipna=False) + with pytest.raises(error_type): + test_input.idxmax() + with pytest.raises(error_type): + test_input.idxmax(skipna=False) + + def test_idxminmax_with_inf(self): + # For numeric data with NA and Inf (GH #13595) + s = pd.Series([0, -np.inf, np.inf, np.nan]) + + assert s.idxmin() == 1 + assert np.isnan(s.idxmin(skipna=False)) + + assert s.idxmax() == 2 + assert np.isnan(s.idxmax(skipna=False)) + + # Using old-style behavior that treats floating point nan, -inf, and + # +inf as missing + with pd.option_context("mode.use_inf_as_na", True): + assert s.idxmin() == 0 + assert np.isnan(s.idxmin(skipna=False)) + assert s.idxmax() == 0 + np.isnan(s.idxmax(skipna=False)) + + +class TestDatetime64SeriesReductions: + # Note: the name TestDatetime64SeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize( + "nat_ser", + [ + Series([pd.NaT, pd.NaT]), + Series([pd.NaT, pd.Timedelta("nat")]), + Series([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) + def test_minmax_nat_series(self, nat_ser): + # GH#23282 + assert nat_ser.min() is pd.NaT + assert nat_ser.max() is pd.NaT + assert nat_ser.min(skipna=False) is pd.NaT + assert nat_ser.max(skipna=False) is pd.NaT + + @pytest.mark.parametrize( + "nat_df", + [ + pd.DataFrame([pd.NaT, pd.NaT]), + pd.DataFrame([pd.NaT, pd.Timedelta("nat")]), + pd.DataFrame([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) + def test_minmax_nat_dataframe(self, nat_df): + # GH#23282 + assert nat_df.min()[0] is pd.NaT + assert nat_df.max()[0] is pd.NaT + assert nat_df.min(skipna=False)[0] is pd.NaT + assert nat_df.max(skipna=False)[0] is pd.NaT + + def test_min_max(self): + rng = pd.date_range("1/1/2000", "12/31/2000") + rng2 = 
rng.take(np.random.permutation(len(rng))) + + the_min = rng2.min() + the_max = rng2.max() + assert isinstance(the_min, pd.Timestamp) + assert isinstance(the_max, pd.Timestamp) + assert the_min == rng[0] + assert the_max == rng[-1] + + assert rng.min() == rng[0] + assert rng.max() == rng[-1] + + def test_min_max_series(self): + rng = pd.date_range("1/1/2000", periods=10, freq="4h") + lvls = ["A", "A", "A", "B", "B", "B", "C", "C", "C", "C"] + df = DataFrame({"TS": rng, "V": np.random.randn(len(rng)), "L": lvls}) + + result = df.TS.max() + exp = pd.Timestamp(df.TS.iat[-1]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + result = df.TS.min() + exp = pd.Timestamp(df.TS.iat[0]) + assert isinstance(result, pd.Timestamp) + assert result == exp + + +class TestCategoricalSeriesReductions: + # Note: the name TestCategoricalSeriesReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def test_min_max(self): + # unordered cats have no min/max + cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) + with pytest.raises(TypeError): + cat.min() + with pytest.raises(TypeError): + cat.max() + + cat = Series(Categorical(["a", "b", "c", "d"], ordered=True)) + _min = cat.min() + _max = cat.max() + assert _min == "a" + assert _max == "d" + + cat = Series( + Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + ) + _min = cat.min() + _max = cat.max() + assert _min == "d" + assert _max == "a" + + cat = Series( + Categorical( + [np.nan, "b", "c", np.nan], + categories=["d", "c", "b", "a"], + ordered=True, + ) + ) + _min = cat.min() + _max = cat.max() + assert _min == "c" + assert _max == "b" + + cat = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) + _min = cat.min() + _max = cat.max() + assert _min == 2 + assert _max == 1 + + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_skipna(self, skipna): + # GH 25303 + cat = Series( + Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) + ) + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) + + if skipna is True: + assert _min == "b" + assert _max == "a" + else: + assert np.isnan(_min) + assert np.isnan(_max) + + +class TestSeriesMode: + # Note: the name TestSeriesMode indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + @pytest.mark.parametrize( + "dropna, expected", + [(True, Series([], dtype=np.float64)), (False, Series([], dtype=np.float64))], + ) + def test_mode_empty(self, dropna, expected): + s = Series([], dtype=np.float64) + result = s.mode(dropna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dropna, data, expected", + [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ], + ) + @pytest.mark.parametrize( + "dt", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) + ) + def test_mode_numerical(self, dropna, data, expected, dt): + s = Series(data, dtype=dt) + result = s.mode(dropna) + expected = Series(expected, dtype=dt) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dropna, expected", [(True, [1.0]), (False, [1, np.nan])]) + def test_mode_numerical_nan(self, dropna, expected): + s = Series([1, 1, 2, np.nan, np.nan]) + result = s.mode(dropna) + 
expected = Series(expected) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + ) + def test_mode_str_obj(self, dropna, expected1, expected2, expected3): + # Test string and object types. + data = ["a"] * 2 + ["b"] * 3 + + s = Series(data, dtype="c") + result = s.mode(dropna) + expected1 = Series(expected1, dtype="c") + tm.assert_series_equal(result, expected1) + + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] + + s = Series(data, dtype=object) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected2) + + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] + + s = Series(data, dtype=object).astype(str) + result = s.mode(dropna) + expected3 = Series(expected3, dtype=str) + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], + ) + def test_mode_mixeddtype(self, dropna, expected1, expected2): + s = Series([1, "foo", "foo"]) + result = s.mode(dropna) + expected = Series(expected1) + tm.assert_series_equal(result, expected) + + s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) + result = s.mode(dropna) + expected = Series(expected2, dtype=object) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + ( + True, + ["1900-05-03", "2011-01-03", "2013-01-02"], + ["2011-01-03", "2013-01-02"], + ), + (False, [np.nan], [np.nan, "2011-01-03", "2013-01-02"]), + ], + ) + def test_mode_datetime(self, dropna, expected1, expected2): + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"], dtype="M8[ns]" + ) + result = s.mode(dropna) + expected1 = Series(expected1, dtype="M8[ns]") + tm.assert_series_equal(result, expected1) + + s = Series( + [ + "2011-01-03", + "2013-01-02", + "1900-05-03", + "2011-01-03", + "2013-01-02", + "nan", + "nan", + ], + dtype="M8[ns]", + ) + result = s.mode(dropna) + expected2 = Series(expected2, dtype="M8[ns]") + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["-1 days", "0 days", "1 days"], ["2 min", "1 day"]), + (False, [np.nan], [np.nan, "2 min", "1 day"]), + ], + ) + def test_mode_timedelta(self, dropna, expected1, expected2): + # gh-5986: Test timedelta types. 
+ + s = Series( + ["1 days", "-1 days", "0 days", "nan", "nan"], dtype="timedelta64[ns]" + ) + result = s.mode(dropna) + expected1 = Series(expected1, dtype="timedelta64[ns]") + tm.assert_series_equal(result, expected1) + + s = Series( + [ + "1 day", + "1 day", + "-1 day", + "-1 day 2 min", + "2 min", + "2 min", + "nan", + "nan", + ], + dtype="timedelta64[ns]", + ) + result = s.mode(dropna) + expected2 = Series(expected2, dtype="timedelta64[ns]") + tm.assert_series_equal(result, expected2) + + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [ + ( + True, + Categorical([1, 2], categories=[1, 2]), + Categorical(["a"], categories=[1, "a"]), + Categorical([3, 1], categories=[3, 2, 1], ordered=True), + ), + ( + False, + Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, "a"], categories=[1, "a"]), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True), + ), + ], + ) + def test_mode_category(self, dropna, expected1, expected2, expected3): + s = Series(Categorical([1, 2, np.nan, np.nan])) + result = s.mode(dropna) + expected1 = Series(expected1, dtype="category") + tm.assert_series_equal(result, expected1) + + s = Series(Categorical([1, "a", "a", np.nan, np.nan])) + result = s.mode(dropna) + expected2 = Series(expected2, dtype="category") + tm.assert_series_equal(result, expected2) + + s = Series( + Categorical( + [1, 1, 2, 3, 3, np.nan, np.nan], categories=[3, 2, 1], ordered=True + ) + ) + result = s.mode(dropna) + expected3 = Series(expected3, dtype="category") + tm.assert_series_equal(result, expected3) + + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, [2 ** 63], [1, 2 ** 63]), (False, [2 ** 63], [1, 2 ** 63])], + ) + def test_mode_intoverflow(self, dropna, expected1, expected2): + # Test for uint64 overflow. + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) + result = s.mode(dropna) + expected1 = Series(expected1, dtype=np.uint64) + tm.assert_series_equal(result, expected1) + + s = Series([1, 2 ** 63], dtype=np.uint64) + result = s.mode(dropna) + expected2 = Series(expected2, dtype=np.uint64) + tm.assert_series_equal(result, expected2) + + def test_mode_sortwarning(self): + # Check for the warning that is raised when the mode + # results cannot be sorted + + expected = Series(["foo", np.nan]) + s = Series([1, "foo", "foo", np.nan, np.nan]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = s.mode(dropna=False) + result = result.sort_values().reset_index(drop=True) + + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reductions/test_stat_reductions.py b/venv/Lib/site-packages/pandas/tests/reductions/test_stat_reductions.py new file mode 100644 index 0000000..59dbcb9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reductions/test_stat_reductions.py @@ -0,0 +1,270 @@ +""" +Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ... 
+""" +import inspect + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray + + +class TestDatetimeLikeStatReductions: + @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray]) + def test_dt64_mean(self, tz_naive_fixture, box): + tz = tz_naive_fixture + + dti = pd.date_range("2001-01-01", periods=11, tz=tz) + # shuffle so that we are not just working with monotone-increasing + dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) + dtarr = dti._data + + obj = box(dtarr) + assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz) + assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz) + + # dtarr[-2] will be the first date 2001-01-1 + dtarr[-2] = pd.NaT + + obj = box(dtarr) + assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) + assert obj.mean(skipna=False) is pd.NaT + + @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) + def test_period_mean(self, box): + # GH#24757 + dti = pd.date_range("2001-01-01", periods=11) + # shuffle so that we are not just working with monotone-increasing + dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) + + # use hourly frequency to avoid rounding errors in expected results + # TODO: flesh this out with different frequencies + parr = dti._data.to_period("H") + obj = box(parr) + with pytest.raises(TypeError, match="ambiguous"): + obj.mean() + with pytest.raises(TypeError, match="ambiguous"): + obj.mean(skipna=True) + + # parr[-2] will be the first date 2001-01-1 + parr[-2] = pd.NaT + + with pytest.raises(TypeError, match="ambiguous"): + obj.mean() + with pytest.raises(TypeError, match="ambiguous"): + obj.mean(skipna=True) + + @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray]) + def test_td64_mean(self, box): + tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") + + tdarr = tdi._data + obj = box(tdarr) + + result = obj.mean() + expected = np.array(tdarr).mean() + assert result == expected + + tdarr[0] = pd.NaT + assert obj.mean(skipna=False) is pd.NaT + + result2 = obj.mean(skipna=True) + assert result2 == tdi[1:].mean() + + # exact equality fails by 1 nanosecond + assert result2.round("us") == (result * 11.0 / 10).round("us") + + +class TestSeriesStatReductions: + # Note: the name TestSeriesStatReductions indicates these tests + # were moved from a series-specific test file, _not_ that these tests are + # intended long-term to be series-specific + + def _check_stat_op( + self, name, alternate, string_series_, check_objects=False, check_allna=False + ): + + with pd.option_context("use_bottleneck", False): + f = getattr(Series, name) + + # add some NaNs + string_series_[5:15] = np.NaN + + # mean, idxmax, idxmin, min, and max are valid for dates + if name not in ["max", "min", "mean"]: + ds = Series(pd.date_range("1/1/2001", periods=10)) + with pytest.raises(TypeError): + f(ds) + + # skipna or no + assert pd.notna(f(string_series_)) + assert pd.isna(f(string_series_, skipna=False)) + + # check the result is correct + nona = string_series_.dropna() + tm.assert_almost_equal(f(nona), alternate(nona.values)) + tm.assert_almost_equal(f(string_series_), alternate(nona.values)) + + allna = string_series_ * np.nan + + if check_allna: + assert np.isnan(f(allna)) + + # dtype=object with None, it works! 
+ s = Series([1, 2, 3, None, 5]) + f(s) + + # GH#2888 + items = [0] + items.extend(range(2 ** 40, 2 ** 40 + 1000)) + s = Series(items, dtype="int64") + tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) + + # check date range + if check_objects: + s = Series(pd.bdate_range("1/1/2000", periods=10)) + res = f(s) + exp = alternate(s) + assert res == exp + + # check on string data + if name not in ["sum", "min", "max"]: + with pytest.raises(TypeError): + f(Series(list("abc"))) + + # Invalid axis. + with pytest.raises(ValueError): + f(string_series_, axis=1) + + # Unimplemented numeric_only parameter. + if "numeric_only" in inspect.getfullargspec(f).args: + with pytest.raises(NotImplementedError, match=name): + f(string_series_, numeric_only=True) + + def test_sum(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("sum", np.sum, string_series, check_allna=False) + + def test_mean(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("mean", np.mean, string_series) + + def test_median(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("median", np.median, string_series) + + # test with integers, test failure + int_ts = Series(np.ones(10, dtype=int), index=range(10)) + tm.assert_almost_equal(np.median(int_ts), int_ts.median()) + + def test_prod(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("prod", np.prod, string_series) + + def test_min(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("min", np.min, string_series, check_objects=True) + + def test_max(self): + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("max", np.max, string_series, check_objects=True) + + def test_var_std(self): + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") + + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op("std", alt, string_series) + + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op("var", alt, string_series) + + result = datetime_series.std(ddof=4) + expected = np.std(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + result = datetime_series.var(ddof=4) + expected = np.var(datetime_series.values, ddof=4) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.var(ddof=1) + assert pd.isna(result) + + result = s.std(ddof=1) + assert pd.isna(result) + + def test_sem(self): + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") + + alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) + self._check_stat_op("sem", alt, string_series) + + result = datetime_series.sem(ddof=4) + expected = np.std(datetime_series.values, ddof=4) / np.sqrt( + len(datetime_series.values) + ) + tm.assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = datetime_series.iloc[[0]] + result = s.sem(ddof=1) + assert pd.isna(result) + + @td.skip_if_no_scipy + def test_skew(self): + from scipy.stats import skew + + string_series = tm.makeStringSeries().rename("series") + + alt = lambda x: skew(x, bias=False) + self._check_stat_op("skew", alt, string_series) + + # test corner cases, skew() returns NaN unless there's at least 3 + # values + min_N = 3 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.skew()) + 
assert np.isnan(df.skew()).all() + else: + assert 0 == s.skew() + assert (df.skew() == 0).all() + + @td.skip_if_no_scipy + def test_kurt(self): + from scipy.stats import kurtosis + + string_series = tm.makeStringSeries().rename("series") + + alt = lambda x: kurtosis(x, bias=False) + self._check_stat_op("kurt", alt, string_series) + + index = pd.MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.randn(6), index=index) + tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"]) + + # test corner cases, kurt() returns NaN unless there's at least 4 + # values + min_N = 4 + for i in range(1, min_N + 1): + s = Series(np.ones(i)) + df = DataFrame(np.ones((i, i))) + if i < min_N: + assert np.isnan(s.kurt()) + assert np.isnan(df.kurt()).all() + else: + assert 0 == s.kurt() + assert (df.kurt() == 0).all() diff --git a/venv/Lib/site-packages/pandas/tests/resample/__init__.py b/venv/Lib/site-packages/pandas/tests/resample/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/resample/conftest.py b/venv/Lib/site-packages/pandas/tests/resample/conftest.py new file mode 100644 index 0000000..bb4f7ce --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/conftest.py @@ -0,0 +1,158 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import DataFrame, Series +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import period_range + +# The various methods we support +downsample_methods = [ + "min", + "max", + "first", + "last", + "sum", + "mean", + "sem", + "median", + "prod", + "var", + "std", + "ohlc", + "quantile", +] +upsample_methods = ["count", "size"] +series_methods = ["nunique"] +resample_methods = downsample_methods + upsample_methods + series_methods + + +@pytest.fixture(params=downsample_methods) +def downsample_method(request): + """Fixture for parametrization of Grouper downsample methods.""" + return request.param + + +@pytest.fixture(params=upsample_methods) +def upsample_method(request): + """Fixture for parametrization of Grouper upsample methods.""" + return request.param + + +@pytest.fixture(params=resample_methods) +def resample_method(request): + """Fixture for parametrization of Grouper resample methods.""" + return request.param + + +@pytest.fixture +def simple_date_range_series(): + """ + Series with date range index and random data for test purposes. + """ + + def _simple_date_range_series(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + return _simple_date_range_series + + +@pytest.fixture +def simple_period_range_series(): + """ + Series with period range index and random data for test purposes. 
+ """ + + def _simple_period_range_series(start, end, freq="D"): + rng = period_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + return _simple_period_range_series + + +@pytest.fixture +def _index_start(): + """Fixture for parametrization of index, series and frame.""" + return datetime(2005, 1, 1) + + +@pytest.fixture +def _index_end(): + """Fixture for parametrization of index, series and frame.""" + return datetime(2005, 1, 10) + + +@pytest.fixture +def _index_freq(): + """Fixture for parametrization of index, series and frame.""" + return "D" + + +@pytest.fixture +def _index_name(): + """Fixture for parametrization of index, series and frame.""" + return None + + +@pytest.fixture +def index(_index_factory, _index_start, _index_end, _index_freq, _index_name): + """Fixture for parametrization of date_range, period_range and + timedelta_range indexes""" + return _index_factory(_index_start, _index_end, freq=_index_freq, name=_index_name) + + +@pytest.fixture +def _static_values(index): + """Fixture for parametrization of values used in parametrization of + Series and DataFrames with date_range, period_range and + timedelta_range indexes""" + return np.arange(len(index)) + + +@pytest.fixture +def _series_name(): + """Fixture for parametrization of Series name for Series used with + date_range, period_range and timedelta_range indexes""" + return None + + +@pytest.fixture +def series(index, _series_name, _static_values): + """Fixture for parametrization of Series with date_range, period_range and + timedelta_range indexes""" + return Series(_static_values, index=index, name=_series_name) + + +@pytest.fixture +def empty_series(series): + """Fixture for parametrization of empty Series with date_range, + period_range and timedelta_range indexes""" + return series[:0] + + +@pytest.fixture +def frame(index, _series_name, _static_values): + """Fixture for parametrization of DataFrame with date_range, period_range + and timedelta_range indexes""" + # _series_name is intentionally unused + return DataFrame({"value": _static_values}, index=index) + + +@pytest.fixture +def empty_frame(series): + """Fixture for parametrization of empty DataFrame with date_range, + period_range and timedelta_range indexes""" + index = series.index[:0] + return DataFrame(index=index) + + +@pytest.fixture(params=[Series, DataFrame]) +def series_and_frame(request, series, frame): + """Fixture for parametrization of Series and DataFrame with date_range, + period_range and timedelta_range indexes""" + if request.param == Series: + return series + if request.param == DataFrame: + return frame diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_base.py b/venv/Lib/site-packages/pandas/tests/resample/test_base.py new file mode 100644 index 0000000..f8a1810 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_base.py @@ -0,0 +1,269 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.groupby.groupby import DataError +from pandas.core.groupby.grouper import Grouper +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range + +# a fixture value can be overridden by the test parameter value. 
Note that the +# value of the fixture can be overridden this way even if the test doesn't use +# it directly (doesn't mention it in the function prototype). +# see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa +# in this module we override the fixture values defined in conftest.py +# tuples of '_index_factory,_series_name,_index_start,_index_end' +DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") + +all_ts = pytest.mark.parametrize( + "_index_factory,_series_name,_index_start,_index_end", + [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], +) + + +@pytest.fixture +def create_index(_index_factory): + def _create_index(*args, **kwargs): + """ return the _index_factory created using the args, kwargs """ + return _index_factory(*args, **kwargs) + + return _create_index + + +@pytest.mark.parametrize("freq", ["2D", "1H"]) +@pytest.mark.parametrize( + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] +) +def test_asfreq(series_and_frame, freq, create_index): + obj = series_and_frame + + result = obj.resample(freq).asfreq() + new_index = create_index(obj.index[0], obj.index[-1], freq=freq) + expected = obj.reindex(new_index) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] +) +def test_asfreq_fill_value(series, create_index): + # test for fill value during resampling, issue 3715 + + s = series + + result = s.resample("1H").asfreq() + new_index = create_index(s.index[0], s.index[-1], freq="1H") + expected = s.reindex(new_index) + tm.assert_series_equal(result, expected) + + frame = s.to_frame("value") + frame.iloc[1] = None + result = frame.resample("1H").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1H") + expected = frame.reindex(new_index, fill_value=4.0) + tm.assert_frame_equal(result, expected) + + +@all_ts +def test_resample_interpolate(frame): + # # 12925 + df = frame + tm.assert_frame_equal( + df.resample("1T").asfreq().interpolate(), df.resample("1T").interpolate() + ) + + +def test_raises_on_non_datetimelike_index(): + # this is a non datetimelike index + xp = DataFrame() + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Index'" + ) + with pytest.raises(TypeError, match=msg): + xp.resample("A").mean() + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_empty_series(freq, empty_series, resample_method): + # GH12771 & GH12868 + + if resample_method == "ohlc": + pytest.skip("need to test for ohlc from GH13083") + + s = empty_series + result = getattr(s.resample(freq), resample_method)() + + expected = s.copy() + if isinstance(s.index, PeriodIndex): + expected.index = s.index.asfreq(freq=freq) + else: + expected.index = s.index._shallow_copy(freq=freq) + tm.assert_index_equal(result.index, expected.index) + assert result.index.freq == expected.index.freq + tm.assert_series_equal(result, expected, check_dtype=False) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("resample_method", ["count", "size"]) +def test_resample_count_empty_series(freq, empty_series, resample_method): + # GH28427 + result = getattr(empty_series.resample(freq), resample_method)() + + if 
isinstance(empty_series.index, PeriodIndex): + index = empty_series.index.asfreq(freq=freq) + else: + index = empty_series.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + + tm.assert_series_equal(result, expected) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_empty_dataframe(empty_frame, freq, resample_method): + # GH13212 + df = empty_frame + # count retains dimensions too + result = getattr(df.resample(freq), resample_method)() + if resample_method != "size": + expected = df.copy() + else: + # GH14962 + expected = Series([], dtype=object) + + if isinstance(df.index, PeriodIndex): + expected.index = df.index.asfreq(freq=freq) + else: + expected.index = df.index._shallow_copy(freq=freq) + tm.assert_index_equal(result.index, expected.index) + assert result.index.freq == expected.index.freq + tm.assert_almost_equal(result, expected, check_dtype=False) + + # test size for GH13212 (currently stays as df) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_count_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).count() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.DataFrame({"a": []}, dtype="int64", index=index) + + tm.assert_frame_equal(result, expected) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_size_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).size() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) +@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) +def test_resample_empty_dtypes(index, dtype, resample_method): + + # Empty series were sometimes causing a segfault (for the functions + # with Cython bounds-checking disabled) or an IndexError. We just run + # them to ensure they no longer do. 
(GH #10228) + empty_series = Series([], index, dtype) + try: + getattr(empty_series.resample("d"), resample_method)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass + + +@all_ts +@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_resample_loffset_arg_type(frame, create_index, arg): + # GH 13218, 15002 + df = frame + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + if isinstance(expected_index, PeriodIndex): + expected_index = expected_index.to_timestamp() + + expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + result_agg = df.resample("2D", loffset="2H").agg(arg) + + if isinstance(arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + + # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex + if isinstance(expected.index, TimedeltaIndex): + msg = "DataFrame are different" + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(result_agg, expected) + else: + tm.assert_frame_equal(result_agg, expected) + + +@all_ts +def test_apply_to_empty_series(empty_series): + # GH 14313 + s = empty_series + for freq in ["M", "D", "H"]: + result = s.resample(freq).apply(lambda x: 1) + expected = s.resample(freq).apply(np.sum) + + tm.assert_series_equal(result, expected, check_dtype=False) + + +@all_ts +def test_resampler_is_iterable(series): + # GH 15314 + freq = "H" + tg = Grouper(freq=freq, convention="start") + grouped = series.groupby(tg) + resampled = series.resample(freq) + for (rk, rv), (gk, gv) in zip(resampled, grouped): + assert rk == gk + tm.assert_series_equal(rv, gv) + + +@all_ts +def test_resample_quantile(series): + # GH 15023 + s = series + q = 0.75 + freq = "H" + result = s.resample(freq).quantile(q) + expected = s.resample(freq).agg(lambda x: x.quantile(q)).rename(s.name) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_datetime_index.py b/venv/Lib/site-packages/pandas/tests/resample/test_datetime_index.py new file mode 100644 index 0000000..3ad82b9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_datetime_index.py @@ -0,0 +1,1585 @@ +from datetime import datetime, timedelta +from functools import partial +from io import StringIO + +import numpy as np +import pytest +import pytz + +from pandas.errors import UnsupportedFunctionCall + +import pandas as pd +from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +import pandas._testing as tm +from pandas.core.groupby.grouper import Grouper +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import Period, period_range +from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges + +import pandas.tseries.offsets as offsets +from pandas.tseries.offsets import BDay, Minute + + +@pytest.fixture() +def _index_factory(): + return date_range + + +@pytest.fixture +def _index_freq(): + return "Min" + + +@pytest.fixture +def _static_values(index): + return np.random.rand(len(index)) + + +def test_custom_grouper(index): + + dti = index + s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") + + b = Grouper(freq=Minute(5)) + g = s.groupby(b) + + # check all cython functions work + funcs = ["add", "mean", "prod", "ohlc", 
"min", "max", "var"] + for f in funcs: + g._cython_agg_general(f) + + b = Grouper(freq=Minute(5), closed="right", label="right") + g = s.groupby(b) + # check all cython functions work + funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] + for f in funcs: + g._cython_agg_general(f) + + assert g.ngroups == 2593 + assert notna(g.mean()).all() + + # construct expected val + arr = [1] + [5] * 2592 + idx = dti[0:-1:5] + idx = idx.append(dti[-1:]) + expect = Series(arr, index=idx) + + # GH2763 - return in put dtype if we can + result = g.agg(np.sum) + tm.assert_series_equal(result, expect) + + df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64") + r = df.groupby(b).agg(np.sum) + + assert len(r.columns) == 10 + assert len(r.index) == 2593 + + +@pytest.mark.parametrize( + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "closed, expected", + [ + ( + "right", + lambda s: Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + ), + ), + ( + "left", + lambda s: Series( + [s[:5].mean(), s[5:10].mean(), s[10:].mean()], + index=date_range( + "1/1/2000 00:05", periods=3, freq="5min", name="index" + ), + ), + ), + ], +) +def test_resample_basic(series, closed, expected): + s = series + expected = expected(s) + result = s.resample("5min", closed=closed, label="right").mean() + tm.assert_series_equal(result, expected) + + +def test_resample_integerarray(): + # GH 25580, resample on IntegerArray + ts = pd.Series( + range(9), index=pd.date_range("1/1/2000", periods=9, freq="T"), dtype="Int64" + ) + result = ts.resample("3T").sum() + expected = Series( + [3, 12, 21], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="Int64", + ) + tm.assert_series_equal(result, expected) + + result = ts.resample("3T").mean() + expected = Series( + [1, 4, 7], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="float64", + ) + tm.assert_series_equal(result, expected) + + +def test_resample_basic_grouper(series): + s = series + result = s.resample("5Min").last() + grouper = Grouper(freq=Minute(5), closed="left", label="left") + expected = s.groupby(grouper).agg(lambda x: x[-1]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "keyword,value", + [("label", "righttt"), ("closed", "righttt"), ("convention", "starttt")], +) +def test_resample_string_kwargs(series, keyword, value): + # see gh-19303 + # Check that wrong keyword argument strings raise an error + msg = f"Unsupported value {value} for `{keyword}`" + with pytest.raises(ValueError, match=msg): + series.resample("5min", **({keyword: value})) + + +@pytest.mark.parametrize( + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +def test_resample_how(series, downsample_method): + if downsample_method == "ohlc": + pytest.skip("covered by test_resample_how_ohlc") + + s = series + grouplist = np.ones_like(s) + grouplist[0] = 0 + grouplist[1:6] = 1 + grouplist[6:11] = 2 + grouplist[11:] = 3 + expected = s.groupby(grouplist).agg(downsample_method) + expected.index = date_range("1/1/2000", periods=4, freq="5min", name="index") + + result = getattr( + s.resample("5min", closed="right", label="right"), downsample_method + )() + tm.assert_series_equal(result, expected) + + 
+@pytest.mark.parametrize( + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +def test_resample_how_ohlc(series): + s = series + grouplist = np.ones_like(s) + grouplist[0] = 0 + grouplist[1:6] = 1 + grouplist[6:11] = 2 + grouplist[11:] = 3 + + def _ohlc(group): + if isna(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = DataFrame( + s.groupby(grouplist).agg(_ohlc).values.tolist(), + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + columns=["open", "high", "low", "close"], + ) + + result = s.resample("5min", closed="right", label="right").ohlc() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max", "sum", "prod", "mean", "var", "std"]) +def test_numpy_compat(func): + # see gh-12811 + s = Series([1, 2, 3, 4, 5], index=date_range("20130101", periods=5, freq="s")) + r = s.resample("2s") + + msg = "numpy operations are not valid with resample" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, func)(func, 1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, func)(axis=1) + + +def test_resample_how_callables(): + # GH#7929 + data = np.arange(5, dtype=np.int64) + ind = date_range(start="2014-01-01", periods=len(data), freq="d") + df = DataFrame({"A": data, "B": data}, index=ind) + + def fn(x, a=1): + return str(type(x)) + + class FnClass: + def __call__(self, x): + return str(type(x)) + + df_standard = df.resample("M").apply(fn) + df_lambda = df.resample("M").apply(lambda x: str(type(x))) + df_partial = df.resample("M").apply(partial(fn)) + df_partial2 = df.resample("M").apply(partial(fn, a=2)) + df_class = df.resample("M").apply(FnClass()) + + tm.assert_frame_equal(df_standard, df_lambda) + tm.assert_frame_equal(df_standard, df_partial) + tm.assert_frame_equal(df_standard, df_partial2) + tm.assert_frame_equal(df_standard, df_class) + + +def test_resample_rounding(): + # GH 8371 + # odd results when rounding is needed + + data = """date,time,value +11-08-2014,00:00:01.093,1 +11-08-2014,00:00:02.159,1 +11-08-2014,00:00:02.667,1 +11-08-2014,00:00:03.175,1 +11-08-2014,00:00:07.058,1 +11-08-2014,00:00:07.362,1 +11-08-2014,00:00:08.324,1 +11-08-2014,00:00:08.830,1 +11-08-2014,00:00:08.982,1 +11-08-2014,00:00:09.815,1 +11-08-2014,00:00:10.540,1 +11-08-2014,00:00:11.061,1 +11-08-2014,00:00:11.617,1 +11-08-2014,00:00:13.607,1 +11-08-2014,00:00:14.535,1 +11-08-2014,00:00:15.525,1 +11-08-2014,00:00:17.960,1 +11-08-2014,00:00:20.674,1 +11-08-2014,00:00:21.191,1""" + + df = pd.read_csv( + StringIO(data), + parse_dates={"timestamp": ["date", "time"]}, + index_col="timestamp", + ) + df.index.name = None + result = df.resample("6s").sum() + expected = DataFrame( + {"value": [4, 9, 4, 2]}, index=date_range("2014-11-08", freq="6s", periods=4) + ) + tm.assert_frame_equal(result, expected) + + result = df.resample("7s").sum() + expected = DataFrame( + {"value": [4, 10, 4, 1]}, index=date_range("2014-11-08", freq="7s", periods=4) + ) + tm.assert_frame_equal(result, expected) + + result = df.resample("11s").sum() + expected = DataFrame( + {"value": [11, 8]}, index=date_range("2014-11-08", freq="11s", periods=2) + ) + tm.assert_frame_equal(result, expected) + + result = df.resample("13s").sum() + expected = DataFrame( + {"value": [13, 6]}, index=date_range("2014-11-08", freq="13s", periods=2) + ) + tm.assert_frame_equal(result, expected) + + result = df.resample("17s").sum() + expected = 
DataFrame( + {"value": [16, 3]}, index=date_range("2014-11-08", freq="17s", periods=2) + ) + tm.assert_frame_equal(result, expected) + + +def test_resample_basic_from_daily(): + # from daily + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) + + s = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = s.resample("w-sun").last() + + assert len(result) == 3 + assert (result.index.dayofweek == [6, 6, 6]).all() + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/9/2005"] + assert result.iloc[2] == s.iloc[-1] + + result = s.resample("W-MON").last() + assert len(result) == 2 + assert (result.index.dayofweek == [0, 0]).all() + assert result.iloc[0] == s["1/3/2005"] + assert result.iloc[1] == s["1/10/2005"] + + result = s.resample("W-TUE").last() + assert len(result) == 2 + assert (result.index.dayofweek == [1, 1]).all() + assert result.iloc[0] == s["1/4/2005"] + assert result.iloc[1] == s["1/10/2005"] + + result = s.resample("W-WED").last() + assert len(result) == 2 + assert (result.index.dayofweek == [2, 2]).all() + assert result.iloc[0] == s["1/5/2005"] + assert result.iloc[1] == s["1/10/2005"] + + result = s.resample("W-THU").last() + assert len(result) == 2 + assert (result.index.dayofweek == [3, 3]).all() + assert result.iloc[0] == s["1/6/2005"] + assert result.iloc[1] == s["1/10/2005"] + + result = s.resample("W-FRI").last() + assert len(result) == 2 + assert (result.index.dayofweek == [4, 4]).all() + assert result.iloc[0] == s["1/7/2005"] + assert result.iloc[1] == s["1/10/2005"] + + # to biz day + result = s.resample("B").last() + assert len(result) == 7 + assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all() + + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/3/2005"] + assert result.iloc[5] == s["1/9/2005"] + assert result.index.name == "index" + + +def test_resample_upsampling_picked_but_not_correct(): + + # Test for issue #3020 + dates = date_range("01-Jan-2014", "05-Jan-2014", freq="D") + series = Series(1, index=dates) + + result = series.resample("D").mean() + assert result.index[0] == dates[0] + + # GH 5955 + # incorrect deciding to upsample when the axis frequency matches the + # resample frequency + + s = Series( + np.arange(1.0, 6), index=[datetime(1975, 1, i, 12, 0) for i in range(1, 6)] + ) + expected = Series( + np.arange(1.0, 6), index=date_range("19750101", periods=5, freq="D") + ) + + result = s.resample("D").count() + tm.assert_series_equal(result, Series(1, index=expected.index)) + + result1 = s.resample("D").sum() + result2 = s.resample("D").mean() + tm.assert_series_equal(result1, expected) + tm.assert_series_equal(result2, expected) + + +def test_resample_frame_basic(): + df = tm.makeTimeDataFrame() + + b = Grouper(freq="M") + g = df.groupby(b) + + # check all cython functions work + funcs = ["add", "mean", "prod", "min", "max", "var"] + for f in funcs: + g._cython_agg_general(f) + + result = df.resample("A").mean() + tm.assert_series_equal(result["A"], df["A"].resample("A").mean()) + + result = df.resample("M").mean() + tm.assert_series_equal(result["A"], df["A"].resample("M").mean()) + + df.resample("M", kind="period").mean() + df.resample("W-WED", kind="period").mean() + + +@pytest.mark.parametrize( + "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] +) +def test_resample_loffset(loffset): + # GH 7687 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + 
result = s.resample("5min", closed="right", label="right", loffset=loffset).mean() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1), + ) + tm.assert_series_equal(result, expected) + assert result.index.freq == Minute(5) + + # from daily + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") + ser = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = ser.resample("w-sun").last() + business_day_offset = BDay() + expected = ser.resample("w-sun", loffset=-business_day_offset).last() + assert result.index[0] - business_day_offset == expected.index[0] + + +def test_resample_loffset_upsample(): + # GH 20744 + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") + s = Series(np.random.randn(14), index=rng) + + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) + + tm.assert_series_equal(result, expected) + + +def test_resample_loffset_count(): + # GH 12725 + start_time = "1/1/2000 00:00:00" + rng = date_range(start_time, periods=100, freq="S") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample("10S", loffset="1s").count() + + expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( + seconds=1 + ) + expected = Series(10, index=expected_index) + + tm.assert_series_equal(result, expected) + + # Same issue should apply to .size() since it goes through + # same code path + result = ts.resample("10S", loffset="1s").size() + + tm.assert_series_equal(result, expected) + + +def test_resample_upsample(): + # from daily + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) + + s = Series(np.random.rand(len(dti)), dti) + + # to minutely, by padding + result = s.resample("Min").pad() + assert len(result) == 12961 + assert result[0] == s[0] + assert result[-1] == s[-1] + + assert result.index.name == "index" + + +def test_resample_how_method(): + # GH9915 + s = Series( + [11, 22], + index=[ + Timestamp("2015-03-31 21:48:52.672000"), + Timestamp("2015-03-31 21:49:52.739000"), + ], + ) + expected = Series( + [11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], + index=[ + Timestamp("2015-03-31 21:48:50"), + Timestamp("2015-03-31 21:49:00"), + Timestamp("2015-03-31 21:49:10"), + Timestamp("2015-03-31 21:49:20"), + Timestamp("2015-03-31 21:49:30"), + Timestamp("2015-03-31 21:49:40"), + Timestamp("2015-03-31 21:49:50"), + ], + ) + tm.assert_series_equal(s.resample("10S").mean(), expected) + + +def test_resample_extra_index_point(): + # GH#9756 + index = date_range(start="20150101", end="20150331", freq="BM") + expected = DataFrame({"A": Series([21, 41, 63], index=index)}) + + index = date_range(start="20150101", end="20150331", freq="B") + df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64") + result = df.resample("BM").last() + tm.assert_frame_equal(result, expected) + + +def test_upsample_with_limit(): + rng = date_range("1/1/2000", periods=3, freq="5t") + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample("t").ffill(limit=2) + expected = ts.reindex(result.index, method="ffill", limit=2) + tm.assert_series_equal(result, expected) + + +def test_nearest_upsample_with_limit(): + rng = date_range("1/1/2000", periods=3, 
freq="5t") + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample("t").nearest(limit=2) + expected = ts.reindex(result.index, method="nearest", limit=2) + tm.assert_series_equal(result, expected) + + +def test_resample_ohlc(series): + s = series + + grouper = Grouper(freq=Minute(5)) + expect = s.groupby(grouper).agg(lambda x: x[-1]) + result = s.resample("5Min").ohlc() + + assert len(result) == len(expect) + assert len(result.columns) == 4 + + xs = result.iloc[-2] + assert xs["open"] == s[-6] + assert xs["high"] == s[-6:-1].max() + assert xs["low"] == s[-6:-1].min() + assert xs["close"] == s[-2] + + xs = result.iloc[0] + assert xs["open"] == s[0] + assert xs["high"] == s[:5].max() + assert xs["low"] == s[:5].min() + assert xs["close"] == s[4] + + +def test_resample_ohlc_result(): + + # GH 12332 + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index = index.union(pd.date_range("4-15-2000", "5-15-2000", freq="h")) + s = Series(range(len(index)), index=index) + + a = s.loc[:"4-15-2000"].resample("30T").ohlc() + assert isinstance(a, DataFrame) + + b = s.loc[:"4-14-2000"].resample("30T").ohlc() + assert isinstance(b, DataFrame) + + # GH12348 + # raising on odd period + rng = date_range("2013-12-30", "2014-01-07") + index = rng.drop( + [ + Timestamp("2014-01-01"), + Timestamp("2013-12-31"), + Timestamp("2014-01-04"), + Timestamp("2014-01-05"), + ] + ) + df = DataFrame(data=np.arange(len(index)), index=index) + result = df.resample("B").mean() + expected = df.reindex(index=date_range(rng[0], rng[-1], freq="B")) + tm.assert_frame_equal(result, expected) + + +def test_resample_ohlc_dataframe(): + df = ( + DataFrame( + { + "PRICE": { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + }, + "VOLUME": { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + }, + } + ) + ).reindex(["VOLUME", "PRICE"], axis=1) + res = df.resample("H").ohlc() + exp = pd.concat( + [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + axis=1, + keys=["VOLUME", "PRICE"], + ) + tm.assert_frame_equal(exp, res) + + df.columns = [["a", "b"], ["c", "d"]] + res = df.resample("H").ohlc() + exp.columns = pd.MultiIndex.from_tuples( + [ + ("a", "c", "open"), + ("a", "c", "high"), + ("a", "c", "low"), + ("a", "c", "close"), + ("b", "d", "open"), + ("b", "d", "high"), + ("b", "d", "low"), + ("b", "d", "close"), + ] + ) + tm.assert_frame_equal(exp, res) + + # dupe columns fail atm + # df.columns = ['PRICE', 'PRICE'] + + +def test_resample_dup_index(): + + # GH 4812 + # dup columns with resample raising + df = DataFrame( + np.random.randn(4, 12), + index=[2000, 2000, 2000, 2000], + columns=[Period(year=2000, month=i + 1, freq="M") for i in range(12)], + ) + df.iloc[3, :] = np.nan + result = df.resample("Q", axis=1).mean() + expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean() + expected.columns = [Period(year=2000, quarter=i + 1, freq="Q") for i in range(4)] + tm.assert_frame_equal(result, expected) + + +def test_resample_reresample(): + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") + s = Series(np.random.rand(len(dti)), dti) + bs = s.resample("B", closed="right", label="right").mean() + result = bs.resample("8H").mean() + assert len(result) == 22 + assert isinstance(result.index.freq, offsets.DateOffset) + assert 
result.index.freq == offsets.Hour(8) + + +def test_resample_timestamp_to_period(simple_date_range_series): + ts = simple_date_range_series("1/1/1990", "1/1/2000") + + result = ts.resample("A-DEC", kind="period").mean() + expected = ts.resample("A-DEC").mean() + expected.index = period_range("1990", "2000", freq="a-dec") + tm.assert_series_equal(result, expected) + + result = ts.resample("A-JUN", kind="period").mean() + expected = ts.resample("A-JUN").mean() + expected.index = period_range("1990", "2000", freq="a-jun") + tm.assert_series_equal(result, expected) + + result = ts.resample("M", kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") + tm.assert_series_equal(result, expected) + + result = ts.resample("M", kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") + tm.assert_series_equal(result, expected) + + +def test_ohlc_5min(): + def _ohlc(group): + if isna(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + rng = date_range("1/1/2000 00:00:00", "1/1/2000 5:59:50", freq="10s") + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample("5min", closed="right", label="right").ohlc() + + assert (resampled.loc["1/1/2000 00:00"] == ts[0]).all() + + exp = _ohlc(ts[1:31]) + assert (resampled.loc["1/1/2000 00:05"] == exp).all() + + exp = _ohlc(ts["1/1/2000 5:55:01":]) + assert (resampled.loc["1/1/2000 6:00:00"] == exp).all() + + +def test_downsample_non_unique(): + rng = date_range("1/1/2000", "2/29/2000") + rng2 = rng.repeat(5).values + ts = Series(np.random.randn(len(rng2)), index=rng2) + + result = ts.resample("M").mean() + + expected = ts.groupby(lambda x: x.month).mean() + assert len(result) == 2 + tm.assert_almost_equal(result[0], expected[1]) + tm.assert_almost_equal(result[1], expected[2]) + + +def test_asfreq_non_unique(): + # GH #1077 + rng = date_range("1/1/2000", "2/29/2000") + rng2 = rng.repeat(2).values + ts = Series(np.random.randn(len(rng2)), index=rng2) + + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + ts.asfreq("B") + + +def test_resample_axis1(): + rng = date_range("1/1/2000", "2/29/2000") + df = DataFrame(np.random.randn(3, len(rng)), columns=rng, index=["a", "b", "c"]) + + result = df.resample("M", axis=1).mean() + expected = df.T.resample("M").mean().T + tm.assert_frame_equal(result, expected) + + +def test_resample_anchored_ticks(): + # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should + # "anchor" the origin at midnight so we get regular intervals rather + # than starting from the first timestamp which might start in the + # middle of a desired interval + + rng = date_range("1/1/2000 04:00:00", periods=86400, freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + freqs = ["t", "5t", "15t", "30t", "4h", "12h"] + for freq in freqs: + result = ts[2:].resample(freq, closed="left", label="left").mean() + expected = ts.resample(freq, closed="left", label="left").mean() + tm.assert_series_equal(result, expected) + + +def test_resample_single_group(): + mysum = lambda x: x.sum() + + rng = date_range("2000-1-1", "2000-2-10", freq="D") + ts = Series(np.random.randn(len(rng)), index=rng) + tm.assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) + + rng = date_range("2000-1-1", "2000-1-10", freq="D") + ts = 
Series(np.random.randn(len(rng)), index=rng) + tm.assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) + + # GH 3849 + s = Series( + [30.1, 31.6], + index=[Timestamp("20070915 15:30:00"), Timestamp("20070915 15:40:00")], + ) + expected = Series([0.75], index=[Timestamp("20070915")]) + result = s.resample("D").apply(lambda x: np.std(x)) + tm.assert_series_equal(result, expected) + + +def test_resample_base(): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample("5min", base=2).mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") + tm.assert_index_equal(resampled.index, exp_rng) + + +def test_resample_float_base(): + # GH25161 + dt = pd.to_datetime( + ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] + ) + s = Series(np.arange(3), index=dt) + + base = 17 + 43.51 / 60 + result = s.resample("3min", base=base).size() + expected = Series(3, index=pd.DatetimeIndex(["2018-11-26 16:17:43.51"])) + tm.assert_series_equal(result, expected) + + +def test_resample_daily_anchored(): + rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + result = ts[2:].resample("D", closed="left", label="left").mean() + expected = ts.resample("D", closed="left", label="left").mean() + tm.assert_series_equal(result, expected) + + +def test_resample_to_period_monthly_buglet(): + # GH #1259 + + rng = date_range("1/1/2000", "12/31/2000") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample("M", kind="period").mean() + exp_index = period_range("Jan-2000", "Dec-2000", freq="M") + tm.assert_index_equal(result.index, exp_index) + + +def test_period_with_agg(): + + # aggregate a period resampler with a lambda + s2 = Series( + np.random.randint(0, 5, 50), + index=pd.period_range("2012-01-01", freq="H", periods=50), + dtype="float64", + ) + + expected = s2.to_timestamp().resample("D").mean().to_period() + result = s2.resample("D").agg(lambda x: x.mean()) + tm.assert_series_equal(result, expected) + + +def test_resample_segfault(): + # GH 8573 + # segfaulting in older versions + all_wins_and_wagers = [ + (1, datetime(2013, 10, 1, 16, 20), 1, 0), + (2, datetime(2013, 10, 1, 16, 10), 1, 0), + (2, datetime(2013, 10, 1, 18, 15), 1, 0), + (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0), + ] + + df = DataFrame.from_records( + all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") + ).set_index("timestamp") + result = df.groupby("ID").resample("5min").sum() + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + tm.assert_frame_equal(result, expected) + + +def test_resample_dtype_preservation(): + + # GH 12202 + # validation tests for dtype preservation + + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": Series([5, 6, 7, 8], dtype="int32"), + } + ).set_index("date") + + result = df.resample("1D").ffill() + assert result.val.dtype == np.int32 + + result = df.groupby("group").resample("1D").ffill() + assert result.val.dtype == np.int32 + + +def test_resample_dtype_coercion(): + + pytest.importorskip("scipy.interpolate") + + # GH 16361 + df = {"a": [1, 3, 1, 4]} + df = DataFrame(df, index=pd.date_range("2017-01-01", "2017-01-04")) + + expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") + + result = 
df.resample("H")["a"].mean().interpolate("cubic") + tm.assert_series_equal(result, expected) + + result = df.resample("H").mean()["a"].interpolate("cubic") + tm.assert_series_equal(result, expected) + + +def test_weekly_resample_buglet(): + # #1327 + rng = date_range("1/1/2000", freq="B", periods=20) + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample("W").mean() + expected = ts.resample("W-SUN").mean() + tm.assert_series_equal(resampled, expected) + + +def test_monthly_resample_error(): + # #1451 + dates = date_range("4/16/2012 20:00", periods=5000, freq="h") + ts = Series(np.random.randn(len(dates)), index=dates) + # it works! + ts.resample("M") + + +def test_nanosecond_resample_error(): + # GH 12307 - Values falls after last bin when + # Resampling using pd.tseries.offsets.Nano as period + start = 1443707890427 + exp_start = 1443707890400 + indx = pd.date_range(start=pd.to_datetime(start), periods=10, freq="100n") + ts = Series(range(len(indx)), index=indx) + r = ts.resample(pd.tseries.offsets.Nano(100)) + result = r.agg("mean") + + exp_indx = pd.date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") + exp = Series(range(len(exp_indx)), index=exp_indx) + + tm.assert_series_equal(result, exp) + + +def test_resample_anchored_intraday(simple_date_range_series): + # #1471, #1458 + + rng = date_range("1/1/2012", "4/1/2012", freq="100min") + df = DataFrame(rng.month, index=rng) + + result = df.resample("M").mean() + expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") + tm.assert_frame_equal(result, expected) + + result = df.resample("M", closed="left").mean() + exp = df.tshift(1, freq="D").resample("M", kind="period").mean() + exp = exp.to_timestamp(how="end") + + exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") + tm.assert_frame_equal(result, exp) + + rng = date_range("1/1/2012", "4/1/2012", freq="100min") + df = DataFrame(rng.month, index=rng) + + result = df.resample("Q").mean() + expected = df.resample("Q", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") + tm.assert_frame_equal(result, expected) + + result = df.resample("Q", closed="left").mean() + expected = df.tshift(1, freq="D").resample("Q", kind="period", closed="left").mean() + expected = expected.to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") + tm.assert_frame_equal(result, expected) + + ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") + resampled = ts.resample("M").mean() + assert len(resampled) == 1 + + +def test_resample_anchored_monthstart(simple_date_range_series): + ts = simple_date_range_series("1/1/2000", "12/31/2002") + + freqs = ["MS", "BMS", "QS-MAR", "AS-DEC", "AS-JUN"] + + for freq in freqs: + ts.resample(freq).mean() + + +def test_resample_anchored_multiday(): + # When resampling a range spanning multiple days, ensure that the + # start date gets used to determine the offset. Fixes issue where + # a one day period is not a multiple of the frequency. 
+ # + # See: https://github.com/pandas-dev/pandas/issues/8683 + + index = pd.date_range( + "2014-10-14 23:06:23.206", periods=3, freq="400L" + ) | pd.date_range("2014-10-15 23:00:00", periods=2, freq="2200L") + + s = Series(np.random.randn(5), index=index) + + # Ensure left closing works + result = s.resample("2200L").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:02.000") + + # Ensure right closing works + result = s.resample("2200L", label="right").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:04.200") + + +def test_corner_cases(simple_period_range_series, simple_date_range_series): + # miscellaneous test coverage + + rng = date_range("1/1/2000", periods=12, freq="t") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample("5t", closed="right", label="left").mean() + ex_index = date_range("1999-12-31 23:55", periods=4, freq="5t") + tm.assert_index_equal(result.index, ex_index) + + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] + # it works + result = len0pts.resample("A-DEC").mean() + assert len(result) == 0 + + # resample to periods + ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") + result = ts.resample("M", kind="period").mean() + assert len(result) == 1 + assert result.index[0] == Period("2000-04", freq="M") + + +def test_anchored_lowercase_buglet(): + dates = date_range("4/16/2012 20:00", periods=50000, freq="s") + ts = Series(np.random.randn(len(dates)), index=dates) + # it works! + ts.resample("d").mean() + + +def test_upsample_apply_functions(): + # #1596 + rng = pd.date_range("2012-06-12", periods=4, freq="h") + + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample("20min").aggregate(["mean", "sum"]) + assert isinstance(result, DataFrame) + + +def test_resample_not_monotonic(): + rng = pd.date_range("2012-06-12", periods=200, freq="h") + ts = Series(np.random.randn(len(rng)), index=rng) + + ts = ts.take(np.random.permutation(len(ts))) + + result = ts.resample("D").sum() + exp = ts.sort_index().resample("D").sum() + tm.assert_series_equal(result, exp) + + +def test_resample_median_bug_1688(): + + for dtype in ["int64", "int32", "float64", "float32"]: + df = DataFrame( + [1, 2], + index=[datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)], + dtype=dtype, + ) + + result = df.resample("T").apply(lambda x: x.mean()) + exp = df.asfreq("T") + tm.assert_frame_equal(result, exp) + + result = df.resample("T").median() + exp = df.asfreq("T") + tm.assert_frame_equal(result, exp) + + +def test_how_lambda_functions(simple_date_range_series): + + ts = simple_date_range_series("1/1/2000", "4/1/2000") + + result = ts.resample("M").apply(lambda x: x.mean()) + exp = ts.resample("M").mean() + tm.assert_series_equal(result, exp) + + foo_exp = ts.resample("M").mean() + foo_exp.name = "foo" + bar_exp = ts.resample("M").std() + bar_exp.name = "bar" + + result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result.columns = ["foo", "bar"] + tm.assert_series_equal(result["foo"], foo_exp) + tm.assert_series_equal(result["bar"], bar_exp) + + # this is a MI Series, so comparing the names of the results + # doesn't make sense + result = ts.resample("M").aggregate( + {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} + ) + tm.assert_series_equal(result["foo"], foo_exp, check_names=False) + tm.assert_series_equal(result["bar"], bar_exp, check_names=False) + + +def test_resample_unequal_times(): + # #1772 + start = datetime(1999, 3, 1, 5) 
+ # end hour is less than start + end = datetime(2012, 7, 31, 4) + bad_ind = date_range(start, end, freq="30min") + df = DataFrame({"close": 1}, index=bad_ind) + + # it works! + df.resample("AS").sum() + + +def test_resample_consistency(): + + # GH 6418 + # resample with bfill / limit / reindex consistency + + i30 = pd.date_range("2002-02-02", periods=4, freq="30T") + s = Series(np.arange(4.0), index=i30) + s[2] = np.NaN + + # Upsample by factor 3 with reindex() and resample() methods: + i10 = pd.date_range(i30[0], i30[-1], freq="10T") + + s10 = s.reindex(index=i10, method="bfill") + s10_2 = s.reindex(index=i10, method="bfill", limit=2) + rl = s.reindex_like(s10, method="bfill", limit=2) + r10_2 = s.resample("10Min").bfill(limit=2) + r10 = s.resample("10Min").bfill() + + # s10_2, r10, r10_2, rl should all be equal + tm.assert_series_equal(s10_2, r10) + tm.assert_series_equal(s10_2, r10_2) + tm.assert_series_equal(s10_2, rl) + + +def test_resample_timegrouper(): + # GH 7227 + dates1 = [ + datetime(2014, 10, 1), + datetime(2014, 9, 3), + datetime(2014, 11, 5), + datetime(2014, 9, 5), + datetime(2014, 10, 8), + datetime(2014, 7, 15), + ] + + dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:] + dates3 = [pd.NaT] + dates1 + [pd.NaT] + + for dates in [dates1, dates2, dates3]: + df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) + result = df.set_index("A").resample("M").count() + exp_idx = pd.DatetimeIndex( + ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], + freq="M", + name="A", + ) + expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) + tm.assert_frame_equal(result, expected) + + result = df.groupby(pd.Grouper(freq="M", key="A")).count() + tm.assert_frame_equal(result, expected) + + df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates)))) + result = df.set_index("A").resample("M").count() + expected = DataFrame( + {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, + index=exp_idx, + columns=["B", "C"], + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby(pd.Grouper(freq="M", key="A")).count() + tm.assert_frame_equal(result, expected) + + +def test_resample_nunique(): + + # GH 12352 + df = DataFrame( + { + "ID": { + Timestamp("2015-06-05 00:00:00"): "0010100903", + Timestamp("2015-06-08 00:00:00"): "0010150847", + }, + "DATE": { + Timestamp("2015-06-05 00:00:00"): "2015-06-05", + Timestamp("2015-06-08 00:00:00"): "2015-06-08", + }, + } + ) + r = df.resample("D") + g = df.groupby(pd.Grouper(freq="D")) + expected = df.groupby(pd.Grouper(freq="D")).ID.apply(lambda x: x.nunique()) + assert expected.name == "ID" + + for t in [r, g]: + result = r.ID.nunique() + tm.assert_series_equal(result, expected) + + result = df.ID.resample("D").nunique() + tm.assert_series_equal(result, expected) + + result = df.ID.groupby(pd.Grouper(freq="D")).nunique() + tm.assert_series_equal(result, expected) + + +def test_resample_nunique_preserves_column_level_names(): + # see gh-23222 + df = tm.makeTimeDataFrame(freq="1D").abs() + df.columns = pd.MultiIndex.from_arrays( + [df.columns.tolist()] * 2, names=["lev0", "lev1"] + ) + result = df.resample("1h").nunique() + tm.assert_index_equal(df.columns, result.columns) + + +def test_resample_nunique_with_date_gap(): + # GH 13453 + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index2 = pd.date_range("4-15-2000", "5-15-2000", freq="h") + index3 = index.append(index2) + s = Series(range(len(index3)), index=index3, dtype="int64") + r = s.resample("M") + + # Since all elements are 
unique, these should all be the same + results = [r.count(), r.nunique(), r.agg(Series.nunique), r.agg("nunique")] + + tm.assert_series_equal(results[0], results[1]) + tm.assert_series_equal(results[0], results[2]) + tm.assert_series_equal(results[0], results[3]) + + +@pytest.mark.parametrize("n", [10000, 100000]) +@pytest.mark.parametrize("k", [10, 100, 1000]) +def test_resample_group_info(n, k): + # GH10914 + + # use a fixed seed to always have the same uniques + prng = np.random.RandomState(1234) + + dr = date_range(start="2015-08-27", periods=n // 10, freq="T") + ts = Series(prng.randint(0, n // k, n).astype("int64"), index=prng.choice(dr, n)) + + left = ts.resample("30T").nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30T") + + vals = ts.values + bins = np.searchsorted(ix.values, ts.index, side="right") + + sorter = np.lexsort((vals, bins)) + vals, bins = vals[sorter], bins[sorter] + + mask = np.r_[True, vals[1:] != vals[:-1]] + mask |= np.r_[True, bins[1:] != bins[:-1]] + + arr = np.bincount(bins[mask] - 1, minlength=len(ix)).astype("int64", copy=False) + right = Series(arr, index=ix) + + tm.assert_series_equal(left, right) + + +def test_resample_size(): + n = 10000 + dr = date_range("2015-09-19", periods=n, freq="T") + ts = Series(np.random.randn(n), index=np.random.choice(dr, n)) + + left = ts.resample("7T").size() + ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7T") + + bins = np.searchsorted(ix.values, ts.index.values, side="right") + val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False) + + right = Series(val, index=ix) + tm.assert_series_equal(left, right) + + +def test_resample_across_dst(): + # The test resamples a DatetimeIndex with values before and after a + # DST change + # Issue: 14682 + + # The DatetimeIndex we will start with + # (note that DST happens at 03:00+02:00 -> 02:00+01:00) + # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00 + df1 = DataFrame([1477786980, 1477790580], columns=["ts"]) + dti1 = DatetimeIndex( + pd.to_datetime(df1.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) + + # The expected DatetimeIndex after resampling. 
+ # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00 + df2 = DataFrame([1477785600, 1477789200], columns=["ts"]) + dti2 = DatetimeIndex( + pd.to_datetime(df2.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) + df = DataFrame([5, 5], index=dti1) + + result = df.resample(rule="H").sum() + expected = DataFrame([5, 5], index=dti2) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_with_dst_time_change(): + # GH 24972 + index = pd.DatetimeIndex( + [1478064900001000000, 1480037118776792000], tz="UTC" + ).tz_convert("America/Chicago") + + df = pd.DataFrame([1, 2], index=index) + result = df.groupby(pd.Grouper(freq="1d")).last() + expected_index_values = pd.date_range( + "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + ) + + index = pd.DatetimeIndex(expected_index_values) + expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + tm.assert_frame_equal(result, expected) + + +def test_resample_dst_anchor(): + # 5172 + dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz="US/Eastern") + df = DataFrame([5], index=dti) + tm.assert_frame_equal( + df.resample(rule="D").sum(), DataFrame([5], index=df.index.normalize()) + ) + df.resample(rule="MS").sum() + tm.assert_frame_equal( + df.resample(rule="MS").sum(), + DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], tz="US/Eastern")), + ) + + dti = date_range("2013-09-30", "2013-11-02", freq="30Min", tz="Europe/Paris") + values = range(dti.size) + df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype="int64") + how = {"a": "min", "b": "max", "c": "count"} + + tm.assert_frame_equal( + df.resample("W-MON").agg(how)[["a", "b", "c"]], + DataFrame( + { + "a": [0, 48, 384, 720, 1056, 1394], + "b": [47, 383, 719, 1055, 1393, 1586], + "c": [48, 336, 336, 336, 338, 193], + }, + index=date_range("9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris"), + ), + "W-MON Frequency", + ) + + tm.assert_frame_equal( + df.resample("2W-MON").agg(how)[["a", "b", "c"]], + DataFrame( + { + "a": [0, 48, 720, 1394], + "b": [47, 719, 1393, 1586], + "c": [48, 672, 674, 193], + }, + index=date_range( + "9/30/2013", "11/11/2013", freq="2W-MON", tz="Europe/Paris" + ), + ), + "2W-MON Frequency", + ) + + tm.assert_frame_equal( + df.resample("MS").agg(how)[["a", "b", "c"]], + DataFrame( + {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris"), + ), + "MS Frequency", + ) + + tm.assert_frame_equal( + df.resample("2MS").agg(how)[["a", "b", "c"]], + DataFrame( + {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris"), + ), + "2MS Frequency", + ) + + df_daily = df["10/26/2013":"10/29/2013"] + tm.assert_frame_equal( + df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})[ + ["a", "b", "c"] + ], + DataFrame( + { + "a": [1248, 1296, 1346, 1394], + "b": [1295, 1345, 1393, 1441], + "c": [48, 50, 48, 48], + }, + index=date_range("10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris"), + ), + "D Frequency", + ) + + +def test_downsample_across_dst(): + # GH 8531 + tz = pytz.timezone("Europe/Berlin") + dt = datetime(2014, 10, 26) + dates = date_range(tz.localize(dt), periods=4, freq="2H") + result = Series(5, index=dates).resample("H").mean() + expected = Series( + [5.0, np.nan] * 3 + [5.0], + index=date_range(tz.localize(dt), periods=7, freq="H"), + ) + tm.assert_series_equal(result, expected) + + +def 
test_downsample_across_dst_weekly(): + # GH 9119, GH 21459 + df = DataFrame( + index=DatetimeIndex( + ["2017-03-25", "2017-03-26", "2017-03-27", "2017-03-28", "2017-03-29"], + tz="Europe/Amsterdam", + ), + data=[11, 12, 13, 14, 15], + ) + result = df.resample("1W").sum() + expected = DataFrame( + [23, 42], + index=pd.DatetimeIndex(["2017-03-26", "2017-04-02"], tz="Europe/Amsterdam"), + ) + tm.assert_frame_equal(result, expected) + + idx = pd.date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H") + s = Series(index=idx, dtype=np.float64) + result = s.resample("W").mean() + expected = Series( + index=pd.date_range("2013-04-07", freq="W", periods=5, tz="Europe/London"), + dtype=np.float64, + ) + tm.assert_series_equal(result, expected) + + +def test_resample_with_nat(): + # GH 13020 + index = DatetimeIndex( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + ] + ) + frame = DataFrame([2, 3, 5, 7, 11], index=index) + + index_1s = DatetimeIndex( + ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] + ) + frame_1s = DataFrame([3, 7, 11], index=index_1s) + tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s) + + index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]) + frame_2s = DataFrame([5, 11], index=index_2s) + tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s) + + index_3s = DatetimeIndex(["1970-01-01 00:00:00"]) + frame_3s = DataFrame([7], index=index_3s) + tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s) + + tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s) + + +def test_resample_datetime_values(): + # GH 13119 + # check that datetime dtype is preserved when NaT values are + # introduced by the resampling + + dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)] + df = DataFrame({"timestamp": dates}, index=dates) + + exp = Series( + [datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], + index=date_range("2016-01-15", periods=3, freq="2D"), + name="timestamp", + ) + + res = df.resample("2D").first()["timestamp"] + tm.assert_series_equal(res, exp) + res = df["timestamp"].resample("2D").first() + tm.assert_series_equal(res, exp) + + +def test_resample_apply_with_additional_args(series): + # GH 14615 + def f(data, add_arg): + return np.mean(data) * add_arg + + multiplier = 10 + result = series.resample("D").apply(f, multiplier) + expected = series.resample("D").mean().multiply(multiplier) + tm.assert_series_equal(result, expected) + + # Testing as kwarg + result = series.resample("D").apply(f, add_arg=multiplier) + expected = series.resample("D").mean().multiply(multiplier) + tm.assert_series_equal(result, expected) + + # Testing dataframe + df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) + result = df.groupby("A").resample("D").agg(f, multiplier) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("k", [1, 2, 3]) +@pytest.mark.parametrize( + "n1, freq1, n2, freq2", + [ + (30, "S", 0.5, "Min"), + (60, "S", 1, "Min"), + (3600, "S", 1, "H"), + (60, "Min", 1, "H"), + (21600, "S", 0.25, "D"), + (86400, "S", 1, "D"), + (43200, "S", 0.5, "D"), + (1440, "Min", 1, "D"), + (12, "H", 0.5, "D"), + (24, "H", 1, "D"), + ], +) +def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): + # GH 24127 + n1_ = n1 * k + n2_ = n2 * k + s = pd.Series( + 0, index=pd.date_range("19910905 13:00", "19911005 07:00", freq=freq1) + ) + s = s + range(len(s)) 
+ + result1 = s.resample(str(n1_) + freq1).mean() + result2 = s.resample(str(n2_) + freq2).mean() + tm.assert_series_equal(result1, result2) + + +@pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920407"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), + ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), + ("19910906", "19920406", "M", "19910831", "19920430"), + ("19910831", "19920430", "M", "19910831", "19920531"), + ("1991-08", "1992-04", "M", "19910831", "19920531"), + ], +) +def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): + first = pd.Period(first) + first = first.to_timestamp(first.freq) + last = pd.Period(last) + last = last.to_timestamp(last.freq) + + exp_first = pd.Timestamp(exp_first, freq=offset) + exp_last = pd.Timestamp(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_timestamp_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected + + +def test_resample_apply_product(): + # GH 5586 + index = date_range(start="2012-01-31", freq="M", periods=12) + + ts = Series(range(12), index=index) + df = DataFrame(dict(A=ts, B=ts + 2)) + result = df.resample("Q").apply(np.product) + expected = DataFrame( + np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), + index=DatetimeIndex( + ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC" + ), + columns=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_period_index.py b/venv/Lib/site-packages/pandas/tests/resample/test_period_index.py new file mode 100644 index 0000000..955f8c7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_period_index.py @@ -0,0 +1,872 @@ +from datetime import datetime, timedelta + +import dateutil +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs.ccalendar import DAYS, MONTHS +from pandas._libs.tslibs.period import IncompatibleFrequency + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm +from pandas.core.indexes.base import InvalidIndexError +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import Period, PeriodIndex, period_range +from pandas.core.resample import _get_period_range_edges + +import pandas.tseries.offsets as offsets + + +@pytest.fixture() +def _index_factory(): + return period_range + + +@pytest.fixture +def _series_name(): + return "pi" + + +class TestPeriodIndex: + @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) + def test_asfreq(self, series_and_frame, freq, kind): + # GH 12884, 15944 + # make sure .asfreq() returns PeriodIndex (except kind='timestamp') + + obj = series_and_frame + if kind == "timestamp": + expected = obj.to_timestamp().resample(freq).asfreq() + else: + start = obj.index[0].to_timestamp(how="start") + end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") + new_index = date_range(start=start, end=end, freq=freq, closed="left") + expected = obj.to_timestamp().reindex(new_index).to_period(freq) + result = obj.resample(freq, kind=kind).asfreq() + tm.assert_almost_equal(result, expected) + + def test_asfreq_fill_value(self, series): + # test for fill value during resampling, issue 3715 + + s = series + new_index = date_range( + 
s.index[0].to_timestamp(how="start"), + (s.index[-1]).to_timestamp(how="start"), + freq="1H", + ) + expected = s.to_timestamp().reindex(new_index, fill_value=4.0) + result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) + tm.assert_series_equal(result, expected) + + frame = s.to_frame("value") + new_index = date_range( + frame.index[0].to_timestamp(how="start"), + (frame.index[-1]).to_timestamp(how="start"), + freq="1H", + ) + expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) + result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) + @pytest.mark.parametrize("kwargs", [dict(on="date"), dict(level="d")]) + def test_selection(self, index, freq, kind, kwargs): + # This is a bug, these should be implemented + # GH 14008 + rng = np.arange(len(index), dtype=np.int64) + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + msg = ( + "Resampling from level= or on= selection with a PeriodIndex is " + r"not currently supported, use \.set_index\(\.\.\.\) to " + "explicitly set index" + ) + with pytest.raises(NotImplementedError, match=msg): + df.resample(freq, kind=kind, **kwargs) + + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("meth", ["ffill", "bfill"]) + @pytest.mark.parametrize("conv", ["start", "end"]) + @pytest.mark.parametrize("targ", ["D", "B", "M"]) + def test_annual_upsample_cases( + self, targ, conv, meth, month, simple_period_range_series + ): + ts = simple_period_range_series( + "1/1/1990", "12/31/1991", freq="A-{month}".format(month=month) + ) + + result = getattr(ts.resample(targ, convention=conv), meth)() + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, meth).to_period() + tm.assert_series_equal(result, expected) + + def test_basic_downsample(self, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result = ts.resample("a-dec").mean() + + expected = ts.groupby(ts.index.year).mean() + expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") + tm.assert_series_equal(result, expected) + + # this is ok + tm.assert_series_equal(ts.resample("a-dec").mean(), result) + tm.assert_series_equal(ts.resample("a").mean(), result) + + @pytest.mark.parametrize( + "rule,expected_error_msg", + [ + ("a-dec", ""), + ("q-mar", ""), + ("M", ""), + ("w-thu", ""), + ], + ) + def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): + # These are incompatible period rules for resampling + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + msg = ( + "Frequency cannot be resampled to {}, as they " + "are not sub or super periods" + ).format(expected_error_msg) + with pytest.raises(IncompatibleFrequency, match=msg): + ts.resample(rule).mean() + + @pytest.mark.parametrize("freq", ["D", "2D"]) + def test_basic_upsample(self, freq, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result = ts.resample("a-dec").mean() + + resampled = result.resample(freq, convention="end").ffill() + expected = result.to_timestamp(freq, how="end") + expected = expected.asfreq(freq, "ffill").to_period(freq) + tm.assert_series_equal(resampled, expected) + + def test_upsample_with_limit(self): + rng = period_range("1/1/2000", periods=5, freq="A") + 
ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample("M", convention="end").ffill(limit=2) + expected = ts.asfreq("M").reindex(result.index, method="ffill", limit=2) + tm.assert_series_equal(result, expected) + + def test_annual_upsample(self, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + df = DataFrame({"a": ts}) + rdf = df.resample("D").ffill() + exp = df["a"].resample("D").ffill() + tm.assert_series_equal(rdf["a"], exp) + + rng = period_range("2000", "2003", freq="A-DEC") + ts = Series([1, 2, 3, 4], index=rng) + + result = ts.resample("M").ffill() + ex_index = period_range("2000-01", "2003-12", freq="M") + + expected = ts.asfreq("M", how="start").reindex(ex_index, method="ffill") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("target", ["D", "B", "M"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_quarterly_upsample( + self, month, target, convention, simple_period_range_series + ): + freq = "Q-{month}".format(month=month) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, "ffill").to_period() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_monthly_upsample(self, target, convention, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, "ffill").to_period() + tm.assert_series_equal(result, expected) + + def test_resample_basic(self): + # GH3609 + s = Series( + range(100), + index=date_range("20130101", freq="s", periods=100, name="idx"), + dtype="float", + ) + s[10:30] = np.nan + index = PeriodIndex( + [Period("2013-01-01 00:00", "T"), Period("2013-01-01 00:01", "T")], + name="idx", + ) + expected = Series([34.5, 79.5], index=index) + result = s.to_period().resample("T", kind="period").mean() + tm.assert_series_equal(result, expected) + result2 = s.resample("T", kind="period").mean() + tm.assert_series_equal(result2, expected) + + @pytest.mark.parametrize( + "freq,expected_vals", [("M", [31, 29, 31, 9]), ("2M", [31 + 29, 31 + 9])] + ) + def test_resample_count(self, freq, expected_vals): + # GH12774 + series = Series(1, index=pd.period_range(start="2000", periods=100)) + result = series.resample(freq).count() + expected_index = pd.period_range( + start="2000", freq=freq, periods=len(expected_vals) + ) + expected = Series(expected_vals, index=expected_index) + tm.assert_series_equal(result, expected) + + def test_resample_same_freq(self, resample_method): + + # GH12770 + series = Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ) + expected = series + + result = getattr(series.resample("M"), resample_method)() + tm.assert_series_equal(result, expected) + + def test_resample_incompat_freq(self): + msg = ( + "Frequency cannot be resampled to , " + "as they are not sub or super periods" + ) + with pytest.raises(IncompatibleFrequency, match=msg): + Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ).resample("W").mean() + + def test_with_local_timezone_pytz(self): + # see gh-5430 + 
local_timezone = pytz.timezone("America/Los_Angeles") + + start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + # 1 day later + end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + + index = pd.date_range(start, end, freq="H") + + series = Series(1, index=index) + series = series.tz_convert(local_timezone) + result = series.resample("D", kind="period").mean() + + # Create the expected series + # Index is moved back a day with the timezone conversion from UTC to + # Pacific + expected_index = pd.period_range(start=start, end=end, freq="D") - offsets.Day() + expected = Series(1, index=expected_index) + tm.assert_series_equal(result, expected) + + def test_resample_with_pytz(self): + # GH 13238 + s = Series( + 2, index=pd.date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") + ) + result = s.resample("D").mean() + expected = Series( + 2, index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz="US/Eastern") + ) + tm.assert_series_equal(result, expected) + # Especially assert that the timezone is LMT for pytz + assert result.index.tz == pytz.timezone("US/Eastern") + + def test_with_local_timezone_dateutil(self): + # see gh-5430 + local_timezone = "dateutil/America/Los_Angeles" + + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) + # 1 day later + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) + + index = pd.date_range(start, end, freq="H", name="idx") + + series = Series(1, index=index) + series = series.tz_convert(local_timezone) + result = series.resample("D", kind="period").mean() + + # Create the expected series + # Index is moved back a day with the timezone conversion from UTC to + # Pacific + expected_index = ( + pd.period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() + ) + expected = Series(1, index=expected_index) + tm.assert_series_equal(result, expected) + + def test_resample_nonexistent_time_bin_edge(self): + # GH 19375 + index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15T") + s = Series(np.zeros(len(index)), index=index) + expected = s.tz_localize("US/Pacific") + result = expected.resample("900S").mean() + tm.assert_series_equal(result, expected) + + # GH 23742 + index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") + df = DataFrame(data=list(range(len(index))), index=index) + result = df.groupby(pd.Grouper(freq="1D")).count() + expected = date_range( + start="2017-10-09", + end="2017-10-20", + freq="D", + tz="America/Sao_Paulo", + nonexistent="shift_forward", + closed="left", + ) + tm.assert_index_equal(result.index, expected) + + def test_resample_ambiguous_time_bin_edge(self): + # GH 10117 + idx = pd.date_range( + "2014-10-25 22:00:00", "2014-10-26 00:30:00", freq="30T", tz="Europe/London" + ) + expected = Series(np.zeros(len(idx)), index=idx) + result = expected.resample("30T").mean() + tm.assert_series_equal(result, expected) + + def test_fill_method_and_how_upsample(self): + # GH2073 + s = Series( + np.arange(9, dtype="int64"), + index=date_range("2010-01-01", periods=9, freq="Q"), + ) + last = s.resample("M").ffill() + both = s.resample("M").ffill().resample("M").last().astype("int64") + tm.assert_series_equal(last, both) + + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_weekly_upsample(self, 
day, target, convention, simple_period_range_series): + freq = "W-{day}".format(day=day) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) + result = ts.resample(target, convention=convention).ffill() + expected = result.to_timestamp(target, how=convention) + expected = expected.asfreq(target, "ffill").to_period() + tm.assert_series_equal(result, expected) + + def test_resample_to_timestamps(self, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") + + result = ts.resample("A-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("A-DEC").mean() + tm.assert_series_equal(result, expected) + + def test_resample_to_quarterly(self, simple_period_range_series): + for month in MONTHS: + ts = simple_period_range_series( + "1990", "1992", freq="A-{month}".format(month=month) + ) + quar_ts = ts.resample("Q-{month}".format(month=month)).ffill() + + stamps = ts.to_timestamp("D", how="start") + qdates = period_range( + ts.index[0].asfreq("D", "start"), + ts.index[-1].asfreq("D", "end"), + freq="Q-{month}".format(month=month), + ) + + expected = stamps.reindex(qdates.to_timestamp("D", "s"), method="ffill") + expected.index = qdates + + tm.assert_series_equal(quar_ts, expected) + + # conforms, but different month + ts = simple_period_range_series("1990", "1992", freq="A-JUN") + + for how in ["start", "end"]: + result = ts.resample("Q-MAR", convention=how).ffill() + expected = ts.asfreq("Q-MAR", how=how) + expected = expected.reindex(result.index, method="ffill") + + # .to_timestamp('D') + # expected = expected.resample('Q-MAR').ffill() + + tm.assert_series_equal(result, expected) + + def test_resample_fill_missing(self): + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") + + s = Series(np.random.randn(4), index=rng) + + stamps = s.to_timestamp() + filled = s.resample("A").ffill() + expected = stamps.resample("A").ffill().to_period("A") + tm.assert_series_equal(filled, expected) + + def test_cant_fill_missing_dups(self): + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") + s = Series(np.random.randn(5), index=rng) + msg = "Reindexing only valid with uniquely valued Index objects" + with pytest.raises(InvalidIndexError, match=msg): + s.resample("A").ffill() + + @pytest.mark.parametrize("freq", ["5min"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) + def test_resample_5minute(self, freq, kind): + rng = period_range("1/1/2000", "1/5/2000", freq="T") + ts = Series(np.random.randn(len(rng)), index=rng) + expected = ts.to_timestamp().resample(freq).mean() + if kind != "timestamp": + expected = expected.to_period(freq) + result = ts.resample(freq, kind=kind).mean() + tm.assert_series_equal(result, expected) + + def test_upsample_daily_business_daily(self, simple_period_range_series): + ts = simple_period_range_series("1/1/2000", "2/1/2000", freq="B") + + result = ts.resample("D").asfreq() + expected = ts.asfreq("D").reindex(period_range("1/3/2000", "2/1/2000")) + tm.assert_series_equal(result, expected) + + ts = simple_period_range_series("1/1/2000", "2/1/2000") + result = ts.resample("H", convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") + expected = ts.asfreq("H", how="s").reindex(exp_rng) + tm.assert_series_equal(result, expected) + + def test_resample_irregular_sparse(self): + dr = date_range(start="1/1/2012", freq="5min", periods=1000) + s = Series(np.array(100), index=dr) + # subset the data. 
+ subset = s[:"2012-01-04 06:55"] + + result = subset.resample("10min").apply(len) + expected = s.resample("10min").apply(len).loc[result.index] + tm.assert_series_equal(result, expected) + + def test_resample_weekly_all_na(self): + rng = date_range("1/1/2000", periods=10, freq="W-WED") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample("W-THU").asfreq() + + assert result.isna().all() + + result = ts.resample("W-THU").asfreq().ffill()[:-1] + expected = ts.asfreq("W-THU").ffill() + tm.assert_series_equal(result, expected) + + def test_resample_tz_localized(self): + dr = date_range(start="2012-4-13", end="2012-5-1") + ts = Series(range(len(dr)), index=dr) + + ts_utc = ts.tz_localize("UTC") + ts_local = ts_utc.tz_convert("America/Los_Angeles") + + result = ts_local.resample("W").mean() + + ts_local_naive = ts_local.copy() + ts_local_naive.index = [ + x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime() + ] + + exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles") + + tm.assert_series_equal(result, exp) + + # it works + result = ts_local.resample("D").mean() + + # #2245 + idx = date_range( + "2001-09-20 15:59", "2001-09-20 16:00", freq="T", tz="Australia/Sydney" + ) + s = Series([1, 2], index=idx) + + result = s.resample("D", closed="right", label="right").mean() + ex_index = date_range("2001-09-21", periods=1, freq="D", tz="Australia/Sydney") + expected = Series([1.5], index=ex_index) + + tm.assert_series_equal(result, expected) + + # for good measure + result = s.resample("D", kind="period").mean() + ex_index = period_range("2001-09-20", periods=1, freq="D") + expected = Series([1.5], index=ex_index) + tm.assert_series_equal(result, expected) + + # GH 6397 + # comparing an offset that doesn't propagate tz's + rng = date_range("1/1/2011", periods=20000, freq="H") + rng = rng.tz_localize("EST") + ts = DataFrame(index=rng) + ts["first"] = np.random.randn(len(rng)) + ts["second"] = np.cumsum(np.random.randn(len(rng))) + expected = DataFrame( + { + "first": ts.resample("A").sum()["first"], + "second": ts.resample("A").mean()["second"], + }, + columns=["first", "second"], + ) + result = ( + ts.resample("A") + .agg({"first": np.sum, "second": np.mean}) + .reindex(columns=["first", "second"]) + ) + tm.assert_frame_equal(result, expected) + + def test_closed_left_corner(self): + # #1465 + s = Series( + np.random.randn(21), + index=date_range(start="1/1/2012 9:30", freq="1min", periods=21), + ) + s[0] = np.nan + + result = s.resample("10min", closed="left", label="right").mean() + exp = s[1:].resample("10min", closed="left", label="right").mean() + tm.assert_series_equal(result, exp) + + result = s.resample("10min", closed="left", label="left").mean() + exp = s[1:].resample("10min", closed="left", label="left").mean() + + ex_index = date_range(start="1/1/2012 9:30", freq="10min", periods=3) + + tm.assert_index_equal(result.index, ex_index) + tm.assert_series_equal(result, exp) + + def test_quarterly_resampling(self): + rng = period_range("2000Q1", periods=10, freq="Q-DEC") + ts = Series(np.arange(10), index=rng) + + result = ts.resample("A").mean() + exp = ts.to_timestamp().resample("A").mean().to_period() + tm.assert_series_equal(result, exp) + + def test_resample_weekly_bug_1726(self): + # 8/6/12 is a Monday + ind = date_range(start="8/6/2012", end="8/26/2012", freq="D") + n = len(ind) + data = [[x] * 5 for x in range(n)] + df = DataFrame(data, columns=["open", "high", "low", "close", "vol"], index=ind) + + # it works! 
+ df.resample("W-MON", closed="left", label="left").first() + + def test_resample_with_dst_time_change(self): + # GH 15549 + index = ( + pd.DatetimeIndex([1457537600000000000, 1458059600000000000]) + .tz_localize("UTC") + .tz_convert("America/Chicago") + ) + df = pd.DataFrame([1, 2], index=index) + result = df.resample("12h", closed="right", label="right").last().ffill() + + expected_index_values = [ + "2016-03-09 12:00:00-06:00", + "2016-03-10 00:00:00-06:00", + "2016-03-10 12:00:00-06:00", + "2016-03-11 00:00:00-06:00", + "2016-03-11 12:00:00-06:00", + "2016-03-12 00:00:00-06:00", + "2016-03-12 12:00:00-06:00", + "2016-03-13 00:00:00-06:00", + "2016-03-13 13:00:00-05:00", + "2016-03-14 01:00:00-05:00", + "2016-03-14 13:00:00-05:00", + "2016-03-15 01:00:00-05:00", + "2016-03-15 13:00:00-05:00", + ] + index = pd.to_datetime(expected_index_values, utc=True).tz_convert( + "America/Chicago" + ) + expected = pd.DataFrame( + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + index=index, + ) + tm.assert_frame_equal(result, expected) + + def test_resample_bms_2752(self): + # GH2753 + foo = Series(index=pd.bdate_range("20000101", "20000201"), dtype=np.float64) + res1 = foo.resample("BMS").mean() + res2 = foo.resample("BMS").mean().resample("B").mean() + assert res1.index[0] == Timestamp("20000103") + assert res1.index[0] == res2.index[0] + + # def test_monthly_convention_span(self): + # rng = period_range('2000-01', periods=3, freq='M') + # ts = Series(np.arange(3), index=rng) + + # # hacky way to get same thing + # exp_index = period_range('2000-01-01', '2000-03-31', freq='D') + # expected = ts.asfreq('D', how='end').reindex(exp_index) + # expected = expected.fillna(method='bfill') + + # result = ts.resample('D', convention='span').mean() + + # tm.assert_series_equal(result, expected) + + def test_default_right_closed_label(self): + end_freq = ["D", "Q", "M", "D"] + end_types = ["M", "A", "Q", "W"] + + for from_freq, to_freq in zip(end_freq, end_types): + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + resampled = df.resample(to_freq).mean() + tm.assert_frame_equal( + resampled, df.resample(to_freq, closed="right", label="right").mean() + ) + + def test_default_left_closed_label(self): + others = ["MS", "AS", "QS", "D", "H"] + others_freq = ["D", "Q", "M", "H", "T"] + + for from_freq, to_freq in zip(others_freq, others): + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + resampled = df.resample(to_freq).mean() + tm.assert_frame_equal( + resampled, df.resample(to_freq, closed="left", label="left").mean() + ) + + def test_all_values_single_bin(self): + # 2070 + index = period_range(start="2012-01-01", end="2012-12-31", freq="M") + s = Series(np.random.randn(len(index)), index=index) + + result = s.resample("A").mean() + tm.assert_almost_equal(result[0], s.mean()) + + def test_evenly_divisible_with_no_extra_bins(self): + # 4076 + # when the frequency is evenly divisible, sometimes extra bins + + df = DataFrame(np.random.randn(9, 3), index=date_range("2000-1-1", periods=9)) + result = df.resample("5D").mean() + expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T + expected.index = [Timestamp("2000-1-1"), Timestamp("2000-1-6")] + tm.assert_frame_equal(result, expected) + + index = date_range(start="2001-5-4", periods=28) + df = DataFrame( + [ + { + "REST_KEY": 1, + "DLY_TRN_QT": 80, + "DLY_SLS_AMT": 90, + 
"COOP_DLY_TRN_QT": 30, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28 + + [ + { + "REST_KEY": 2, + "DLY_TRN_QT": 70, + "DLY_SLS_AMT": 10, + "COOP_DLY_TRN_QT": 50, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28, + index=index.append(index), + ).sort_index() + + index = date_range("2001-5-4", periods=4, freq="7D") + expected = DataFrame( + [ + { + "REST_KEY": 14, + "DLY_TRN_QT": 14, + "DLY_SLS_AMT": 14, + "COOP_DLY_TRN_QT": 14, + "COOP_DLY_SLS_AMT": 14, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").count() + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + [ + { + "REST_KEY": 21, + "DLY_TRN_QT": 1050, + "DLY_SLS_AMT": 700, + "COOP_DLY_TRN_QT": 560, + "COOP_DLY_SLS_AMT": 280, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").sum() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) + @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) + def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = frame + expected_means = [ + df.values[i : i + 2].mean() for i in range(0, len(df.values), 2) + ] + expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") + + # loffset coerces PeriodIndex to DateTimeIndex + expected_index = expected_index.to_timestamp() + expected_index += timedelta(hours=2) + expected = DataFrame({"value": expected_means}, index=expected_index) + + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + tm.assert_frame_equal(result_agg, expected) + + @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("kind", [None, "period"]) + def test_upsampling_ohlc(self, freq, period_mult, kind): + # GH 13083 + pi = period_range(start="2000", freq="D", periods=10) + s = Series(range(len(pi)), index=pi) + expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) + + # timestamp-based resampling doesn't include all sub-periods + # of the last original period, so extend accordingly: + new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi)) + expected = expected.reindex(new_index) + result = s.resample(freq, kind=kind).ohlc() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "periods, values", + [ + ( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + ], + [2, 3, 5, 7, 11], + ), + ( + [ + pd.NaT, + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + pd.NaT, + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + pd.NaT, + pd.NaT, + ], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13], + ), + ], + ) + @pytest.mark.parametrize( + "freq, expected_values", + [ + ("1s", [3, np.NaN, 7, 11]), + ("2s", [3, int((7 + 11) / 2)]), + ("3s", [int((3 + 7) / 2), 11]), + ], + ) + def test_resample_with_nat(self, periods, values, freq, expected_values): + # GH 13224 + index = PeriodIndex(periods, freq="S") + frame = DataFrame(values, index=index) + + expected_index = period_range( + "1970-01-01 00:00:00", periods=len(expected_values), freq=freq + ) + expected = DataFrame(expected_values, index=expected_index) + result = frame.resample(freq).mean() + tm.assert_frame_equal(result, expected) + + def test_resample_with_only_nat(self): + # GH 13224 + pi = PeriodIndex([pd.NaT] * 3, 
freq="S") + frame = DataFrame([2, 3, 5], index=pi) + expected_index = PeriodIndex(data=[], freq=pi.freq) + expected = DataFrame(index=expected_index) + result = frame.resample("1s").mean() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "start,end,start_freq,end_freq,base", + [ + ("19910905", "19910909 03:00", "H", "24H", 10), + ("19910905", "19910909 12:00", "H", "24H", 10), + ("19910905", "19910909 23:00", "H", "24H", 10), + ("19910905 10:00", "19910909", "H", "24H", 10), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10), + ("19910905", "19910909 10:00", "H", "24H", 10), + ("19910905 12:00", "19910909", "H", "24H", 10), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3), + ("19910905 12:00", "19910909 1:00", "H", "M", 3), + ("19910905", "19910913 06:00", "2H", "24H", 10), + ("19910905", "19910905 01:39", "Min", "5Min", 3), + ("19910905", "19910905 03:18", "2Min", "5Min", 3), + ], + ) + def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, base): + # GH 23882 + s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) + s = s + np.arange(len(s)) + result = s.resample(end_freq, base=base).mean() + result = result.to_timestamp(end_freq) + # to_timestamp casts 24H -> D + result = result.asfreq(end_freq) if end_freq == "24H" else result + expected = s.to_timestamp().resample(end_freq, base=base).mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ( + "19910905 06:00", + "19920406 06:00", + "H", + "19910905 06:00", + "19920406 06:00", + ), + ("19910906", "19920406", "M", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ], + ) + def test_get_period_range_edges(self, first, last, offset, exp_first, exp_last): + first = pd.Period(first) + last = pd.Period(last) + + exp_first = pd.Period(exp_first, freq=offset) + exp_last = pd.Period(exp_last, freq=offset) + + offset = pd.tseries.frequencies.to_offset(offset) + result = _get_period_range_edges(first, last, offset) + expected = (exp_first, exp_last) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_resample_api.py b/venv/Lib/site-packages/pandas/tests/resample/test_resample_api.py new file mode 100644 index 0000000..170201b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_resample_api.py @@ -0,0 +1,582 @@ +from collections import OrderedDict +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range + +dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") + +test_series = Series(np.random.rand(len(dti)), dti) +_test_frame = DataFrame({"A": test_series, "B": test_series, "C": np.arange(len(dti))}) + + +@pytest.fixture +def test_frame(): + return _test_frame.copy() + + +def test_str(): + + r = test_series.resample("H") + assert ( + "DatetimeIndexResampler [freq=, axis=0, closed=left, " + "label=left, convention=start, base=0]" in str(r) + ) + 
+ +def test_api(): + + r = test_series.resample("H") + result = r.mean() + assert isinstance(result, Series) + assert len(result) == 217 + + r = test_series.to_frame().resample("H") + result = r.mean() + assert isinstance(result, DataFrame) + assert len(result) == 217 + + +def test_groupby_resample_api(): + + # GH 12448 + # .groupby(...).resample(...) hitting warnings + # when appropriate + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") + + # replication step + i = ( + pd.date_range("2016-01-03", periods=8).tolist() + + pd.date_range("2016-01-17", periods=8).tolist() + ) + index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) + expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_resample_on_api(): + + # GH 15021 + # .groupby(...).resample(on=...) results in an unexpected + # keyword warning. + df = DataFrame( + { + "key": ["A", "B"] * 5, + "dates": pd.date_range("2016-01-01", periods=10), + "values": np.random.randn(10), + } + ) + + expected = df.set_index("dates").groupby("key").resample("D").mean() + + result = df.groupby("key").resample("D", on="dates").mean() + tm.assert_frame_equal(result, expected) + + +def test_pipe(test_frame): + # GH17905 + + # series + r = test_series.resample("H") + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_series_equal(result, expected) + + # dataframe + r = test_frame.resample("H") + expected = r.max() - r.mean() + result = r.pipe(lambda x: x.max() - x.mean()) + tm.assert_frame_equal(result, expected) + + +def test_getitem(test_frame): + + r = test_frame.resample("H") + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) + + r = test_frame.resample("H")["B"] + assert r._selected_obj.name == test_frame.columns[1] + + # technically this is allowed + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) + + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) + + +@pytest.mark.parametrize("key", [["D"], ["A", "D"]]) +def test_select_bad_cols(key, test_frame): + g = test_frame.resample("H") + # 'A' should not be referenced as a bad column... + # will have to rethink regex if you change message! 
+ msg = r"^\"Columns not found: 'D'\"$" + with pytest.raises(KeyError, match=msg): + g[key] + + +def test_attribute_access(test_frame): + + r = test_frame.resample("H") + tm.assert_series_equal(r.A.sum(), r["A"].sum()) + + +def test_api_compat_before_use(): + + # make sure that we are setting the binner + # on these attributes + for attr in ["groups", "ngroups", "indices"]: + rng = pd.date_range("1/1/2012", periods=100, freq="S") + ts = Series(np.arange(len(rng)), index=rng) + rs = ts.resample("30s") + + # before use + getattr(rs, attr) + + # after grouper is initialized is ok + rs.mean() + getattr(rs, attr) + + +def tests_skip_nuisance(test_frame): + + df = test_frame + df["D"] = "foo" + r = df.resample("H") + result = r[["A", "B"]].sum() + expected = pd.concat([r.A.sum(), r.B.sum()], axis=1) + tm.assert_frame_equal(result, expected) + + expected = r[["A", "B", "C"]].sum() + result = r.sum() + tm.assert_frame_equal(result, expected) + + +def test_downsample_but_actually_upsampling(): + + # this is reindex / asfreq + rng = pd.date_range("1/1/2012", periods=100, freq="S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + result = ts.resample("20s").asfreq() + expected = Series( + [0, 20, 40, 60, 80], + index=pd.date_range("2012-01-01 00:00:00", freq="20s", periods=5), + ) + tm.assert_series_equal(result, expected) + + +def test_combined_up_downsampling_of_irregular(): + + # since we are really doing an operation like this + # ts2.resample('2s').mean().ffill() + # preserve these semantics + + rng = pd.date_range("1/1/2012", periods=100, freq="S") + ts = Series(np.arange(len(rng)), index=rng) + ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] + + result = ts2.resample("2s").mean().ffill() + expected = Series( + [ + 0.5, + 2.5, + 5.0, + 7.0, + 7.0, + 11.0, + 11.0, + 15.0, + 16.0, + 16.0, + 16.0, + 16.0, + 25.0, + 25.0, + 25.0, + 30.0, + ], + index=pd.DatetimeIndex( + [ + "2012-01-01 00:00:00", + "2012-01-01 00:00:02", + "2012-01-01 00:00:04", + "2012-01-01 00:00:06", + "2012-01-01 00:00:08", + "2012-01-01 00:00:10", + "2012-01-01 00:00:12", + "2012-01-01 00:00:14", + "2012-01-01 00:00:16", + "2012-01-01 00:00:18", + "2012-01-01 00:00:20", + "2012-01-01 00:00:22", + "2012-01-01 00:00:24", + "2012-01-01 00:00:26", + "2012-01-01 00:00:28", + "2012-01-01 00:00:30", + ], + dtype="datetime64[ns]", + freq="2S", + ), + ) + tm.assert_series_equal(result, expected) + + +def test_transform(): + + r = test_series.resample("20min") + expected = test_series.groupby(pd.Grouper(freq="20min")).transform("mean") + result = r.transform("mean") + tm.assert_series_equal(result, expected) + + +def test_fillna(): + + # need to upsample here + rng = pd.date_range("1/1/2012", periods=10, freq="2S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + r = ts.resample("s") + + expected = r.ffill() + result = r.fillna(method="ffill") + tm.assert_series_equal(result, expected) + + expected = r.bfill() + result = r.fillna(method="bfill") + tm.assert_series_equal(result, expected) + + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. 
Got 0" + ) + with pytest.raises(ValueError, match=msg): + r.fillna(0) + + +def test_apply_without_aggregation(): + + # both resample and groupby should work w/o aggregation + r = test_series.resample("20min") + g = test_series.groupby(pd.Grouper(freq="20min")) + + for t in [g, r]: + result = t.apply(lambda x: x) + tm.assert_series_equal(result, test_series) + + +def test_agg_consistency(): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + + r = df.resample("3T") + + msg = "nested renamer is not supported" + with pytest.raises(pd.core.base.SpecificationError, match=msg): + r.agg({"r1": "mean", "r2": "sum"}) + + +# TODO: once GH 14008 is fixed, move these tests into +# `Base` test class + + +def test_agg(): + # test with all three Resampler apis and TimeGrouper + + np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") + cases = [ + r, + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), + ] + + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() + + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + for t in cases: + result = t.aggregate([np.mean, np.std]) + tm.assert_frame_equal(result, expected) + + expected = pd.concat([a_mean, b_std], axis=1) + for t in cases: + result = t.aggregate({"A": np.mean, "B": np.std}) + tm.assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) + for t in cases: + result = t.aggregate({"A": ["mean", "std"]}) + tm.assert_frame_equal(result, expected) + + expected = pd.concat([a_mean, a_sum], axis=1) + expected.columns = ["mean", "sum"] + for t in cases: + result = t["A"].aggregate(["mean", "sum"]) + tm.assert_frame_equal(result, expected) + + msg = "nested renamer is not supported" + for t in cases: + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) + + expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] + ) + for t in cases: + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) + + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) + for t in cases: + result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) + tm.assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("r1", "A", "mean"), + ("r1", "A", "sum"), + ("r2", "B", "mean"), + ("r2", "B", "sum"), + ] + ) + + 
+def test_agg_misc(): + # test with all three Resampler apis and TimeGrouper + + np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + + r = df.resample("2D") + cases = [ + r, + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), + ] + + # passed lambda + for t in cases: + result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([r["A"].sum(), rcustom], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + # agg with renamers + expected = pd.concat( + [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] + ) + + msg = "nested renamer is not supported" + for t in cases: + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) + + # agg with different hows + expected = pd.concat( + [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) + for t in cases: + result = t.agg(OrderedDict([("A", ["sum", "std"]), ("B", ["mean", "std"])])) + tm.assert_frame_equal(result, expected, check_like=True) + + # equivalent of using a selection list / or not + for t in cases: + result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + tm.assert_frame_equal(result, expected, check_like=True) + + # series like aggs + for t in cases: + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"]}) + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + + # errors + # invalid names in the agg specification + msg = "\"Column 'B' does not exist!\"" + for t in cases: + with pytest.raises(KeyError, match=msg): + t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + + +def test_agg_nested_dicts(): + + np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") + cases = [ + r, + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), + ] + + msg = "nested renamer is not supported" + for t in cases: + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) + + for t in cases: + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg( + {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} + ) + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) + + +def test_try_aggregate_non_existing_column(): + # GH 16766 + data = [ + {"dt": datetime(2017, 
6, 1, 0), "x": 1.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 1), "x": 2.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 2), "x": 3.0, "y": 1.5}, + ] + df = DataFrame(data).set_index("dt") + + # Error as we don't have 'z' column + msg = "\"Column 'z' does not exist!\"" + with pytest.raises(KeyError, match=msg): + df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) + + +def test_selection_api_validation(): + # GH 13500 + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + + rng = np.arange(len(index), dtype=np.int64) + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + df_exp = DataFrame({"a": rng}, index=index) + + # non DatetimeIndex + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Int64Index'" + ) + with pytest.raises(TypeError, match=msg): + df.resample("2D", level="v") + + msg = "The Grouper cannot specify both a key and a level!" + with pytest.raises(ValueError, match=msg): + df.resample("2D", on="date", level="d") + + msg = "unhashable type: 'list'" + with pytest.raises(TypeError, match=msg): + df.resample("2D", on=["a", "date"]) + + msg = r"\"Level \['a', 'date'\] not found\"" + with pytest.raises(KeyError, match=msg): + df.resample("2D", level=["a", "date"]) + + # upsampling not allowed + msg = ( + "Upsampling from level= or on= selection is not supported, use " + r"\.set_index\(\.\.\.\) to explicitly set index to datetime-like" + ) + with pytest.raises(ValueError, match=msg): + df.resample("2D", level="d").asfreq() + with pytest.raises(ValueError, match=msg): + df.resample("2D", on="date").asfreq() + + exp = df_exp.resample("2D").sum() + exp.index.name = "date" + tm.assert_frame_equal(exp, df.resample("2D", on="date").sum()) + + exp.index.name = "d" + tm.assert_frame_equal(exp, df.resample("2D", level="d").sum()) + + +@pytest.mark.parametrize( + "col_name", ["t2", "t2x", "t2q", "T_2M", "t2p", "t2m", "t2m1", "T2M"] +) +def test_agg_with_datetime_index_list_agg_func(col_name): + # GH 22660 + # The parametrized column names would get converted to dates by our + # date parser. Some would result in OutOfBoundsError (ValueError) while + # others would result in OverflowError when passed into Timestamp. + # We catch these errors and move on to the correct branch. 
+ df = pd.DataFrame( + list(range(200)), + index=pd.date_range( + start="2017-01-01", freq="15min", periods=200, tz="Europe/Berlin" + ), + columns=[col_name], + ) + result = df.resample("1d").aggregate(["mean"]) + expected = pd.DataFrame( + [47.5, 143.5, 195.5], + index=pd.date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_resampler_grouper.py b/venv/Lib/site-packages/pandas/tests/resample/test_resampler_grouper.py new file mode 100644 index 0000000..4e3585c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_resampler_grouper.py @@ -0,0 +1,280 @@ +from textwrap import dedent + +import numpy as np + +from pandas.util._test_decorators import async_mark + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range + +test_frame = DataFrame( + {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, + index=date_range("1/1/2000", freq="s", periods=40), +) + + +@async_mark() +async def test_tab_complete_ipython6_warning(ip): + from IPython.core.completer import provisionalcompleter + + code = dedent( + """\ + import pandas._testing as tm + s = tm.makeTimeSeries() + rs = s.resample("D") + """ + ) + await ip.run_code(code) + + with tm.assert_produces_warning(None): + with provisionalcompleter("ignore"): + list(ip.Completer.completions("rs.", 1)) + + +def test_deferred_with_groupby(): + + # GH 12486 + # support deferred resample ops with groupby + data = [ + ["2010-01-01", "A", 2], + ["2010-01-02", "A", 3], + ["2010-01-05", "A", 8], + ["2010-01-10", "A", 7], + ["2010-01-13", "A", 3], + ["2010-01-01", "B", 5], + ["2010-01-03", "B", 2], + ["2010-01-04", "B", 1], + ["2010-01-11", "B", 7], + ["2010-01-14", "B", 3], + ] + + df = DataFrame(data, columns=["date", "id", "score"]) + df.date = pd.to_datetime(df.date) + + def f(x): + return x.set_index("date").resample("D").asfreq() + + expected = df.groupby("id").apply(f) + result = df.set_index("date").groupby("id").resample("D").asfreq() + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") + + def f(x): + return x.resample("1D").ffill() + + expected = df.groupby("group").apply(f) + result = df.groupby("group").resample("1D").ffill() + tm.assert_frame_equal(result, expected) + + +def test_getitem(): + g = test_frame.groupby("A") + + expected = g.B.apply(lambda x: x.resample("2s").mean()) + + result = g.resample("2s").B.mean() + tm.assert_series_equal(result, expected) + + result = g.B.resample("2s").mean() + tm.assert_series_equal(result, expected) + + result = g.resample("2s").mean().B + tm.assert_series_equal(result, expected) + + +def test_getitem_multiple(): + + # GH 13174 + # multiple calls after selection causing an issue with aliasing + data = [{"id": 1, "buyer": "A"}, {"id": 2, "buyer": "B"}] + df = DataFrame(data, index=pd.date_range("2016-01-01", periods=2)) + r = df.groupby("id").resample("1D") + result = r["buyer"].count() + expected = Series( + [1, 1], + index=pd.MultiIndex.from_tuples( + [(1, Timestamp("2016-01-01")), (2, Timestamp("2016-01-02"))], + names=["id", None], + ), + name="buyer", + ) + tm.assert_series_equal(result, expected) + + result = r["buyer"].count() + 
tm.assert_series_equal(result, expected) + + +def test_groupby_resample_on_api_with_getitem(): + # GH 17813 + df = pd.DataFrame( + {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} + ) + exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() + result = df.groupby("id").resample("2D", on="date")["data"].sum() + tm.assert_series_equal(result, exp) + + +def test_nearest(): + + # GH 17496 + # Resample nearest + index = pd.date_range("1/1/2000", periods=3, freq="T") + result = Series(range(3), index=index).resample("20s").nearest() + + expected = Series( + [0, 0, 1, 1, 1, 2, 2], + index=pd.DatetimeIndex( + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:20", + "2000-01-01 00:00:40", + "2000-01-01 00:01:00", + "2000-01-01 00:01:20", + "2000-01-01 00:01:40", + "2000-01-01 00:02:00", + ], + dtype="datetime64[ns]", + freq="20S", + ), + ) + tm.assert_series_equal(result, expected) + + +def test_methods(): + g = test_frame.groupby("A") + r = g.resample("2s") + + for f in ["first", "last", "median", "sem", "sum", "mean", "min", "max"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + tm.assert_frame_equal(result, expected) + + for f in ["size"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + tm.assert_series_equal(result, expected) + + for f in ["count"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + tm.assert_frame_equal(result, expected) + + # series only + for f in ["nunique"]: + result = getattr(r.B, f)() + expected = g.B.apply(lambda x: getattr(x.resample("2s"), f)()) + tm.assert_series_equal(result, expected) + + for f in ["nearest", "backfill", "ffill", "asfreq"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + tm.assert_frame_equal(result, expected) + + result = r.ohlc() + expected = g.apply(lambda x: x.resample("2s").ohlc()) + tm.assert_frame_equal(result, expected) + + for f in ["std", "var"]: + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + tm.assert_frame_equal(result, expected) + + +def test_apply(): + + g = test_frame.groupby("A") + r = g.resample("2s") + + # reduction + expected = g.resample("2s").sum() + + def f(x): + return x.resample("2s").sum() + + result = r.apply(f) + tm.assert_frame_equal(result, expected) + + def f(x): + return x.resample("2s").apply(lambda y: y.sum()) + + result = g.apply(f) + tm.assert_frame_equal(result, expected) + + +def test_apply_with_mutated_index(): + # GH 15169 + index = pd.date_range("1-1-2015", "12-31-15", freq="D") + df = DataFrame(data={"col1": np.random.rand(len(index))}, index=index) + + def f(x): + s = Series([1, 2], index=["a", "b"]) + return s + + expected = df.groupby(pd.Grouper(freq="M")).apply(f) + + result = df.resample("M").apply(f) + tm.assert_frame_equal(result, expected) + + # A case for series + expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) + result = df["col1"].resample("M").apply(f) + tm.assert_series_equal(result, expected) + + +def test_resample_groupby_with_label(): + # GH 13235 + index = date_range("2000-01-01", freq="2D", periods=5) + df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) + result = df.groupby("col0").resample("1W", label="left").sum() + + mi = [ + np.array([0, 0, 1, 2]), + pd.to_datetime( + np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + ), + ] + mindex = 
pd.MultiIndex.from_arrays(mi, names=["col0", None]) + expected = DataFrame( + data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex + ) + + tm.assert_frame_equal(result, expected) + + +def test_consistency_with_window(): + + # consistent return values with window + df = test_frame + expected = pd.Int64Index([1, 2, 3], name="A") + result = df.groupby("A").resample("2s").mean() + assert result.index.nlevels == 2 + tm.assert_index_equal(result.index.levels[0], expected) + + result = df.groupby("A").rolling(20).mean() + assert result.index.nlevels == 2 + tm.assert_index_equal(result.index.levels[0], expected) + + +def test_median_duplicate_columns(): + # GH 14233 + + df = DataFrame( + np.random.randn(20, 3), + columns=list("aaa"), + index=pd.date_range("2012-01-01", periods=20, freq="s"), + ) + df2 = df.copy() + df2.columns = ["a", "b", "c"] + expected = df2.resample("5s").median() + result = df.resample("5s").median() + expected.columns = result.columns + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_time_grouper.py b/venv/Lib/site-packages/pandas/tests/resample/test_time_grouper.py new file mode 100644 index 0000000..3aa7765 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_time_grouper.py @@ -0,0 +1,278 @@ +from datetime import datetime +from operator import methodcaller + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.groupby.grouper import Grouper +from pandas.core.indexes.datetimes import date_range + +test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) + + +def test_apply(): + grouper = Grouper(freq="A", label="right", closed="right") + + grouped = test_series.groupby(grouper) + + def f(x): + return x.sort_values()[-3:] + + applied = grouped.apply(f) + expected = test_series.groupby(lambda x: x.year).apply(f) + + applied.index = applied.index.droplevel(0) + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(applied, expected) + + +def test_count(): + test_series[::3] = np.nan + + expected = test_series.groupby(lambda x: x.year).count() + + grouper = Grouper(freq="A", label="right", closed="right") + result = test_series.groupby(grouper).count() + expected.index = result.index + tm.assert_series_equal(result, expected) + + result = test_series.resample("A").count() + expected.index = result.index + tm.assert_series_equal(result, expected) + + +def test_numpy_reduction(): + result = test_series.resample("A", closed="right").prod() + + expected = test_series.groupby(lambda x: x.year).agg(np.prod) + expected.index = result.index + + tm.assert_series_equal(result, expected) + + +def test_apply_iteration(): + # #2300 + N = 1000 + ind = pd.date_range(start="2000-01-01", freq="D", periods=N) + df = DataFrame({"open": 1, "close": 2}, index=ind) + tg = Grouper(freq="M") + + _, grouper, _ = tg._get_grouper(df) + + # Errors + grouped = df.groupby(grouper, group_keys=False) + + def f(df): + return df["close"] / df["open"] + + # it works! 
+ result = grouped.apply(f) + tm.assert_index_equal(result.index, df.index) + + +@pytest.mark.parametrize( + "name, func", + [ + ("Int64Index", tm.makeIntIndex), + ("Index", tm.makeUnicodeIndex), + ("Float64Index", tm.makeFloatIndex), + ("MultiIndex", lambda m: tm.makeCustomIndex(m, 2)), + ], +) +def test_fails_on_no_datetime_index(name, func): + n = 2 + index = func(n) + df = DataFrame({"a": np.random.randn(n)}, index=index) + + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex " + f"or PeriodIndex, but got an instance of '{name}'" + ) + with pytest.raises(TypeError, match=msg): + df.groupby(Grouper(freq="D")) + + +def test_aaa_group_order(): + # GH 12840 + # check TimeGrouper perform stable sorts + n = 20 + data = np.random.randn(n, 4) + df = DataFrame(data, columns=["A", "B", "C", "D"]) + df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + grouped = df.groupby(Grouper(key="key", freq="D")) + + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), df[1::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), df[2::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), df[3::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5]) + + +def test_aggregate_normal(resample_method): + """Check TimeGrouper's aggregation is identical as normal groupby.""" + + if resample_method == "ohlc": + pytest.xfail(reason="DataError: No numeric types to aggregate") + + data = np.random.randn(20, 4) + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, 3, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) + + expected = getattr(normal_grouped, resample_method)() + dt_result = getattr(dt_grouped, resample_method)() + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") + tm.assert_equal(expected, dt_result) + + # if TimeGrouper is used included, 'nth' doesn't work yet + + """ + for func in ['nth']: + expected = getattr(normal_grouped, func)(3) + expected.index = date_range(start='2013-01-01', + freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)(3) + tm.assert_frame_equal(expected, dt_result) + """ + + +@pytest.mark.parametrize( + "method, method_args, unit", + [ + ("sum", dict(), 0), + ("sum", dict(min_count=0), 0), + ("sum", dict(min_count=1), np.nan), + ("prod", dict(), 1), + ("prod", dict(min_count=0), 1), + ("prod", dict(min_count=1), np.nan), + ], +) +def test_resample_entirly_nat_window(method, method_args, unit): + s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) + result = methodcaller(method, **method_args)(s.resample("2d")) + expected = pd.Series( + [0.0, unit], index=pd.to_datetime(["2017-01-01", "2017-01-03"]) + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, fill_value", + [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)], +) +def test_aggregate_with_nat(func, fill_value): + # check TimeGrouper's aggregation is identical as normal groupby + # if NaT is included, 'var', 'std', 'mean', 'first','last' + # and 'nth' doesn't work yet + + n 
= 20 + data = np.random.randn(n, 4).astype("int64") + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) + + normal_result = getattr(normal_grouped, func)() + dt_result = getattr(dt_grouped, func)() + + pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"]) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") + tm.assert_frame_equal(expected, dt_result) + assert dt_result.index.name == "key" + + +def test_aggregate_with_nat_size(): + # GH 9925 + n = 20 + data = np.random.randn(n, 4).astype("int64") + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) + + normal_result = normal_grouped.size() + dt_result = dt_grouped.size() + + pad = Series([0], index=[3]) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") + tm.assert_series_equal(expected, dt_result) + assert dt_result.index.name == "key" + + +def test_repr(): + # GH18203 + result = repr(Grouper(key="A", freq="H")) + expected = ( + "TimeGrouper(key='A', freq=, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', base=0)" + ) + assert result == expected + + +@pytest.mark.parametrize( + "method, method_args, expected_values", + [ + ("sum", dict(), [1, 0, 1]), + ("sum", dict(min_count=0), [1, 0, 1]), + ("sum", dict(min_count=1), [1, np.nan, 1]), + ("sum", dict(min_count=2), [np.nan, np.nan, np.nan]), + ("prod", dict(), [1, 1, 1]), + ("prod", dict(min_count=0), [1, 1, 1]), + ("prod", dict(min_count=1), [1, np.nan, 1]), + ("prod", dict(min_count=2), [np.nan, np.nan, np.nan]), + ], +) +def test_upsample_sum(method, method_args, expected_values): + s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H")) + resampled = s.resample("30T") + index = pd.to_datetime( + ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"] + ) + result = methodcaller(method, **method_args)(resampled) + expected = pd.Series(expected_values, index=index) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/resample/test_timedelta.py b/venv/Lib/site-packages/pandas/tests/resample/test_timedelta.py new file mode 100644 index 0000000..a4d14f1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/resample/test_timedelta.py @@ -0,0 +1,127 @@ +from datetime import timedelta + +import numpy as np + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.indexes.timedeltas import timedelta_range + + +def test_asfreq_bug(): + df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)]) + result = df.resample("1T").asfreq() + expected = DataFrame( + data=[1, np.nan, np.nan, 3], + index=timedelta_range("0 day", 
periods=4, freq="1T"), + ) + tm.assert_frame_equal(result, expected) + + +def test_resample_with_nat(): + # GH 13223 + index = pd.to_timedelta(["0s", pd.NaT, "2s"]) + result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean() + expected = DataFrame( + {"value": [2.5, np.nan, 5.0]}, + index=timedelta_range("0 day", periods=3, freq="1S"), + ) + tm.assert_frame_equal(result, expected) + + +def test_resample_as_freq_with_subperiod(): + # GH 13022 + index = timedelta_range("00:00:00", "00:10:00", freq="5T") + df = DataFrame(data={"value": [1, 5, 10]}, index=index) + result = df.resample("2T").asfreq() + expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]} + expected = DataFrame( + data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2T") + ) + tm.assert_frame_equal(result, expected) + + +def test_resample_with_timedeltas(): + + expected = DataFrame({"A": np.arange(1480)}) + expected = expected.groupby(expected.index // 30).sum() + expected.index = pd.timedelta_range("0 days", freq="30T", periods=50) + + df = DataFrame( + {"A": np.arange(1480)}, index=pd.to_timedelta(np.arange(1480), unit="T") + ) + result = df.resample("30T").sum() + + tm.assert_frame_equal(result, expected) + + s = df["A"] + result = s.resample("30T").sum() + tm.assert_series_equal(result, expected["A"]) + + +def test_resample_single_period_timedelta(): + + s = Series(list(range(5)), index=pd.timedelta_range("1 day", freq="s", periods=5)) + result = s.resample("2s").sum() + expected = Series( + [1, 5, 4], index=pd.timedelta_range("1 day", freq="2s", periods=3) + ) + tm.assert_series_equal(result, expected) + + +def test_resample_timedelta_idempotency(): + + # GH 12072 + index = pd.timedelta_range("0", periods=9, freq="10L") + series = Series(range(9), index=index) + result = series.resample("10L").mean() + expected = series + tm.assert_series_equal(result, expected) + + +def test_resample_base_with_timedeltaindex(): + + # GH 10530 + rng = timedelta_range(start="0s", periods=25, freq="s") + ts = Series(np.random.randn(len(rng)), index=rng) + + with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() + + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") + + tm.assert_index_equal(without_base.index, exp_without_base) + tm.assert_index_equal(with_base.index, exp_with_base) + + +def test_resample_categorical_data_with_timedeltaindex(): + # GH #12169 + df = DataFrame({"Group_obj": "A"}, index=pd.to_timedelta(list(range(20)), unit="s")) + df["Group"] = df["Group_obj"].astype("category") + result = df.resample("10s").agg(lambda x: (x.value_counts().index[0])) + expected = DataFrame( + {"Group_obj": ["A", "A"], "Group": ["A", "A"]}, + index=pd.to_timedelta([0, 10], unit="s"), + ) + expected = expected.reindex(["Group_obj", "Group"], axis=1) + expected["Group"] = expected["Group_obj"] + tm.assert_frame_equal(result, expected) + + +def test_resample_timedelta_values(): + # GH 13119 + # check that timedelta dtype is preserved when NaT values are + # introduced by the resampling + + times = timedelta_range("1 day", "4 day", freq="4D") + df = DataFrame({"time": times}, index=times) + + times2 = timedelta_range("1 day", "4 day", freq="2D") + exp = Series(times2, index=times2, name="time") + exp.iloc[1] = pd.NaT + + res = df.resample("2D").first()["time"] + tm.assert_series_equal(res, exp) + res = df["time"].resample("2D").first() + tm.assert_series_equal(res, exp) diff --git 
a/venv/Lib/site-packages/pandas/tests/reshape/__init__.py b/venv/Lib/site-packages/pandas/tests/reshape/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/__init__.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_join.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_join.py new file mode 100644 index 0000000..a660acb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_join.py @@ -0,0 +1,884 @@ +import numpy as np +from numpy.random import randn +import pytest + +from pandas._libs import join as libjoin + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, concat, merge +import pandas._testing as tm +from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data + +a_ = np.array + + +class TestJoin: + def setup_method(self, method): + # aggregate multiple columns + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) + + # exclude a couple keys for fun + self.df = self.df[self.df["key2"] > 1] + + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), + "value": np.random.randn(N // 5), + } + ) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame( + {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] + ) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = libjoin.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = libjoin.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + # 0 1 1 1 + exp_li = a_( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 3, + 4, + 5, + 3, + 4, + 5, + # 2 2 4 + 6, + 7, + 8, + 6, + 7, + 8, + -1, + ] + ) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = libjoin.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) + 
exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on="key2") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left") + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on="key2", how="right") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="right") + + joined_both = merge(self.df, self.df2, how="right") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right") + + def test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on="key2", how="outer") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer") + + joined_both = merge(self.df, self.df2, how="outer") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer") + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on="key2", how="inner") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner") + + joined_both = merge(self.df, self.df2, how="inner") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) + + assert "key1.foo" in joined + assert "key1.bar" in joined + + def test_handle_overlap_arbitrary_key(self): + joined = merge( + self.df, + self.df2, + left_on="key2", + right_on="key1", + suffixes=[".foo", ".bar"], + ) + assert "key1.foo" in joined + assert "key2.bar" in joined + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on="C") + tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False) + tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) + joined = df.join(df2, on="key") + expected = DataFrame( + {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]} + ) + tm.assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"]) + df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"]) + df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"]) + joined = df_a.join(df_b, on="one") + joined = joined.join(df_c, on="one") + assert np.isnan(joined["two"]["c"]) + assert np.isnan(joined["three"]["c"]) + + # merge column not p resent + with pytest.raises(KeyError, match="^'E'$"): + target.join(source, on="E") + + # overlap + source_copy = source.copy() + source_copy["A"] = 0 + msg = ( + "You are trying to merge on float64 and object columns. 
If" + " you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=msg): + target.join(source_copy, on="A") + + def test_join_on_fails_with_different_right_index(self): + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' + with pytest.raises(ValueError, match=msg): + merge(df, df2, left_on="a", right_index=True) + + def test_join_on_fails_with_different_left_index(self): + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}, + index=tm.makeCustomIndex(3, 2), + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} + ) + msg = r'len\(right_on\) must equal the number of levels in the index of "left"' + with pytest.raises(ValueError, match=msg): + merge(df, df2, right_on="b", left_index=True) + + def test_join_on_fails_with_different_column_counts(self): + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) + msg = r"len\(right_on\) must equal len\(left_on\)" + with pytest.raises(ValueError, match=msg): + merge(df, df2, right_on="a", left_on=["a", "b"]) + + @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])]) + def test_join_on_fails_with_wrong_object_type(self, wrong_type): + # GH12081 - original issue + + # GH21220 - merging of Series and DataFrame is now allowed + # Edited test to remove the Series object from test parameters + + df = DataFrame({"a": [1, 1]}) + msg = "Can only merge Series or DataFrame objects, a {} was passed".format( + str(type(wrong_type)) + ) + with pytest.raises(TypeError, match=msg): + merge(wrong_type, df, left_on="a", right_on="a") + with pytest.raises(TypeError, match=msg): + merge(df, wrong_type, left_on="a", right_on="a") + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on="C") + del expected["C"] + + join_col = self.target.pop("C") + result = self.target.join(self.source, on=join_col) + tm.assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on="C") + for col in self.source: + assert col in merged + assert merged[col].isna().all() + + merged2 = self.target.join(self.source.reindex([]), on="C", how="inner") + tm.assert_index_equal(merged2.columns, merged.columns) + assert len(merged2) == 0 + + def test_join_on_inner(self): + df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1]}, index=["a", "b"]) + + joined = df.join(df2, on="key", how="inner") + + expected = df.join(df2, on="key") + expected = expected[expected["value"].notna()] + tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False) + tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) + tm.assert_index_equal(joined.index, expected.index) + + def test_join_on_singlekey_list(self): + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) + + # corner cases + joined = df.join(df2, on=["key"]) + expected = df.join(df2, on="key") + + tm.assert_frame_equal(joined, expected) + + def test_join_on_series(self): 
+ result = self.target.join(self.source["MergedA"], on="C") + expected = self.target.join(self.source[["MergedA"]], on="C") + tm.assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({"a": [1, 1]}) + ds = Series([2], index=[1], name="b") + result = df.join(ds, on="a") + expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self, join_type): + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1["bool"] = True + df1["string"] = "foo" + + df2 = DataFrame(index=np.arange(5, 15)) + df2["int"] = 1 + df2["float"] = 1.0 + + joined = df1.join(df2, how=join_type) + expected = _join_by_hand(df1, df2, how=join_type) + tm.assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=join_type) + expected = _join_by_hand(df2, df1, how=join_type) + tm.assert_frame_equal(joined, expected) + + def test_join_index_mixed_overlap(self): + df1 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(10), + columns=["A", "B", "C", "D"], + ) + assert df1["B"].dtype == np.int64 + assert df1["D"].dtype == np.bool_ + + df2 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(0, 10, 2), + columns=["A", "B", "C", "D"], + ) + + # overlap + joined = df1.join(df2, lsuffix="_one", rsuffix="_two") + expected_columns = [ + "A_one", + "B_one", + "C_one", + "D_one", + "A_two", + "B_two", + "C_two", + "D_two", + ] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + tm.assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=["A"]), how="outer") + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30, 2), columns=["a", "b"]) + c = Series(randn(30)) + a["c"] = c + d = DataFrame(randn(30, 1), columns=["q"]) + + # it works! 
+ a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) + + index2 = MultiIndex.from_arrays( + [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) + + df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"]) + df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"]) + + df1 = df1.sort_index(level=0) + df2 = df2.sort_index(level=0) + + joined = df1.join(df2, how="outer") + ex_index = Index(index1.values).union(Index(index2.values)) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + tm.assert_frame_equal(joined, expected) + assert joined.index.names == index1.names + + df1 = df1.sort_index(level=1) + df2 = df2.sort_index(level=1) + + joined = df1.join(df2, how="outer").sort_index(level=0) + ex_index = Index(index1.values).union(Index(index2.values)) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + + tm.assert_frame_equal(joined, expected) + assert joined.index.names == index1.names + + def test_join_inner_multiindex(self): + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] + + data = np.random.randn(len(key1)) + data = DataFrame({"key1": key1, "key2": key2, "data": data}) + + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + to_join = DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) + + joined = data.join(to_join, on=["key1", "key2"], how="inner") + expected = merge( + data, + to_join.reset_index(), + left_on=["key1", "key2"], + right_on=["first", "second"], + how="inner", + sort=False, + ) + + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) + tm.assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) + + expected = expected.drop(["first", "second"], axis=1) + expected.index = joined.index + + assert joined.index.is_monotonic + tm.assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.loc[:, expected.columns]) + + def test_join_hierarchical_mixed(self): + # GH 2024 + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) + new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) + other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) + other_df.set_index("a", inplace=True) + # GH 9455, 12219 + with tm.assert_produces_warning(UserWarning): + result = merge(new_df, other_df, left_index=True, right_index=True) + assert ("b", "mean") in result + assert "b" in result + + def test_join_float64_float32(self): + + a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32) + joined = a.join(b) + assert joined.dtypes["a"] == "float64" + assert joined.dtypes["b"] == "float64" + assert joined.dtypes["c"] == "float32" + + a = np.random.randint(0, 5, 100).astype("int64") + b = np.random.random(100).astype("float64") + c = np.random.random(100).astype("float32") + df = 
DataFrame({"a": a, "b": b, "c": c}) + xpdf = DataFrame({"a": a, "b": b, "c": c}) + s = DataFrame(np.random.random(5).astype("float32"), columns=["md"]) + rs = df.merge(s, left_on="a", right_index=True) + assert rs.dtypes["a"] == "int64" + assert rs.dtypes["b"] == "float64" + assert rs.dtypes["c"] == "float32" + assert rs.dtypes["md"] == "float32" + + xp = xpdf.merge(s, left_on="a", right_index=True) + tm.assert_frame_equal(rs, xp) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) + df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) + df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how="outer") + + df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer") + + result = result.reset_index() + expected = expected[result.columns] + expected["a"] = expected.a.astype("int64") + expected["b"] = expected.b.astype("int64") + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) + df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how="inner") + + df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner") + + result = result.reset_index() + + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + # GH 11519 + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + s = Series( + np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST" + ) + inner = df.join(s, how="inner") + outer = df.join(s, how="outer") + left = df.join(s, how="left") + right = df.join(s, how="right") + tm.assert_frame_equal(inner, outer) + tm.assert_frame_equal(inner, left) + tm.assert_frame_equal(inner, right) + + def test_join_sort(self): + left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, pd.Index(list(range(4)))) + + def test_join_mixed_non_unique_index(self): + # GH 12814, unorderable types in py3 with a non-unique index + df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"]) + df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + result = df1.join(df2) + expected = DataFrame( + {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, "a"], + ) + tm.assert_frame_equal(result, expected) + + df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"]) + df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + result = df3.join(df4) + expected = DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 
6, np.nan]}, index=[1, 2, 2, "a"] + ) + tm.assert_frame_equal(result, expected) + + def test_join_non_unique_period_index(self): + # GH #16871 + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) + df2 = concat([df, df]) + result = df.join(df2, how="inner", rsuffix="_df2") + expected = DataFrame( + np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), + columns=["pnum", "pnum_df2"], + index=df2.sort_index().index, + ) + tm.assert_frame_equal(result, expected) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) + df.insert(0, "id", 0) + df.insert(5, "dt", "foo") + + grouped = df.groupby("id") + mn = grouped.mean() + cn = grouped.count() + + # it works! + mn.join(cn, rsuffix="_right") + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list("abcdef")) + df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how="outer") + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how="inner") + _check_diff_index(df_list, joined, df.index[2:8]) + + msg = "Joining multiple DataFrames only supported for joining on index" + with pytest.raises(ValueError, match=msg): + df_list[0].join(df_list[1:], on="a") + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"]) + df["key"] = ["foo", "bar"] * 4 + df1 = df.loc[:, ["A", "B"]] + df2 = df.loc[:, ["C", "D"]] + df3 = df.loc[:, ["key"]] + + result = df1.join([df2, df3]) + tm.assert_frame_equal(result, df) + + def test_join_dups(self): + + # joining dups + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix="_2") + result.columns = expected.columns + tm.assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer" + ) + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"] + tm.assert_frame_equal(dta, expected) + + def test_join_multi_to_multi(self, join_type): + # GH 20475 + leftindex = MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = DataFrame({"v1": range(12)}, index=leftindex) + + rightindex = MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) + right = DataFrame({"v2": [100 * i for i in range(1, 
7)]}, index=rightindex) + + result = left.join(right, on=["abc", "xy"], how=join_type) + expected = ( + left.reset_index() + .merge(right.reset_index(), on=["abc", "xy"], how=join_type) + .set_index(["abc", "xy", "num"]) + ) + tm.assert_frame_equal(expected, result) + + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' + with pytest.raises(ValueError, match=msg): + left.join(right, on="xy", how=join_type) + + with pytest.raises(ValueError, match=msg): + right.join(left, on=["abc", "xy"], how=join_type) + + def test_join_on_tz_aware_datetimeindex(self): + # GH 23931, 26335 + df1 = pd.DataFrame( + { + "date": pd.date_range( + start="2018-01-01", periods=5, tz="America/Chicago" + ), + "vals": list("abcde"), + } + ) + + df2 = pd.DataFrame( + { + "date": pd.date_range( + start="2018-01-03", periods=5, tz="America/Chicago" + ), + "vals_2": list("tuvwx"), + } + ) + result = df1.join(df2.set_index("date"), on="date") + expected = df1.copy() + expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) + tm.assert_frame_equal(result, expected) + + def test_join_datetime_string(self): + # GH 5647 + dfa = DataFrame( + [ + ["2012-08-02", "L", 10], + ["2012-08-02", "J", 15], + ["2013-04-06", "L", 20], + ["2013-04-06", "J", 25], + ], + columns=["x", "y", "a"], + ) + dfa["x"] = pd.to_datetime(dfa["x"]) + dfb = DataFrame( + [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], + columns=["x", "y", "z"], + index=[2, 4], + ) + dfb["x"] = pd.to_datetime(dfb["x"]) + result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) + expected = DataFrame( + [ + [pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15], + [pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20], + ], + index=[2, 4], + columns=["x", "y", "z", "a"], + ) + tm.assert_frame_equal(result, expected) + + +def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): + + # some smoke tests + for c in join_col: + assert result[c].notna().all() + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ("left", "inner"): + raise AssertionError( + "key {group_key!s} should not have been in the join".format( + group_key=group_key + ) + ) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ("right", "inner"): + raise AssertionError( + "key {group_key!s} should not have been in the join".format( + group_key=group_key + ) + ) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [ + c for c in group.columns if c in columns or c.replace(suffix, "") in columns + ] + + # filter + group = group.loc[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, "")) + + # put in the right order... + group = group.loc[:, columns] + + return group + + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = {tuple(row) for row in jvalues} + assert len(rows) == len(source) + assert all(tuple(row) in rows for row in svalues) + + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert join_chunk[c].isna().all() + + +def _join_by_hand(a, b, how="left"): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in b_re.items(): + a_re[col] = s + return a_re.reindex(columns=result_columns) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py new file mode 100644 index 0000000..8465e2c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge.py @@ -0,0 +1,2172 @@ +from collections import OrderedDict +from datetime import date, datetime, timedelta +import random +import re + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT +from pandas.core.reshape.concat import concat +from pandas.core.reshape.merge import MergeError, merge + +N = 50 +NGROUPS = 8 + + +def get_test_data(ngroups=NGROUPS, n=N): + unique_groups = list(range(ngroups)) + arr = np.asarray(np.tile(unique_groups, n // ngroups)) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[: n - len(arr)]) + + random.shuffle(arr) + return arr + + +def get_series(): + return [ + pd.Series([1], dtype="int64"), + pd.Series([1], dtype="Int64"), + pd.Series([1.23]), + pd.Series(["foo"]), + pd.Series([True]), + pd.Series([pd.Timestamp("2018-01-01")]), + pd.Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), + ] + + +def get_series_na(): + return [ + pd.Series([np.nan], dtype="Int64"), + pd.Series([np.nan], dtype="float"), + pd.Series([np.nan], dtype="object"), + pd.Series([pd.NaT]), + ] + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype(request): + """ + A parametrized fixture returning a variety of Series of different + dtypes + """ + return request.param + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype2(request): + """ + A duplicate of the series_of_dtype fixture, so that it can be used + twice by a single function + """ + return request.param + + +@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) +def series_of_dtype_all_na(request): + """ + A parametrized fixture returning a variety of Series with all NA + values + """ + return request.param + + +class TestMerge: + def setup_method(self, method): + # aggregate multiple columns + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) + + # exclude a couple keys for fun + self.df = self.df[self.df["key2"] > 1] + + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, 
n=N // 5), + "value": np.random.randn(N // 5), + } + ) + + self.left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + self.right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + def test_merge_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + result = pd.merge(df_empty, df_a, left_index=True, right_index=True) + expected = pd.DataFrame({"a": []}, index=[], dtype="int64") + tm.assert_frame_equal(result, expected) + + def test_merge_common(self): + joined = merge(self.df, self.df2) + exp = merge(self.df, self.df2, on=["key1", "key2"]) + tm.assert_frame_equal(joined, exp) + + def test_merge_non_string_columns(self): + # https://github.com/pandas-dev/pandas/issues/17962 + # Checks that method runs for non string column names + left = pd.DataFrame( + {0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]} + ) + + right = left.astype(float) + expected = left + result = pd.merge(left, right) + tm.assert_frame_equal(expected, result) + + def test_merge_index_as_on_arg(self): + # GH14355 + + left = self.df.set_index("key1") + right = self.df2.set_index("key1") + result = merge(left, right, on="key1") + expected = merge(self.df, self.df2, on="key1").set_index("key1") + tm.assert_frame_equal(result, expected) + + def test_merge_index_singlekey_right_vs_left(self): + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=False + ) + tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) + + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=True + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=True + ) + tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) + + def test_merge_index_singlekey_inner(self): + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + # inner join + result = merge(left, right, left_on="key", right_index=True, how="inner") + expected = left.join(right, on="key").loc[result.index] + tm.assert_frame_equal(result, expected) + + result = merge(right, left, right_on="key", left_index=True, how="inner") + expected = left.join(right, on="key").loc[result.index] + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + def test_merge_misspecified(self): + msg = "Must pass right_on or right_index=True" + with pytest.raises(pd.errors.MergeError, match=msg): + merge(self.left, self.right, left_index=True) + msg = "Must pass left_on or left_index=True" + with pytest.raises(pd.errors.MergeError, match=msg): + merge(self.left, self.right, right_index=True) + + msg = ( + 'Can only pass argument "on" OR "left_on" and "right_on", not' + " a combination of both" + ) + with pytest.raises(pd.errors.MergeError, match=msg): + merge(self.left, self.left, left_on="key", on="key") + + msg = r"len\(right_on\) must equal len\(left_on\)" + with pytest.raises(ValueError, match=msg): + merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"]) + + def test_index_and_on_parameters_confusion(self): + msg = "right_index parameter 
must be of type bool, not " + with pytest.raises(ValueError, match=msg): + merge( + self.df, + self.df2, + how="left", + left_index=False, + right_index=["key1", "key2"], + ) + msg = "left_index parameter must be of type bool, not " + with pytest.raises(ValueError, match=msg): + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=False, + ) + with pytest.raises(ValueError, match=msg): + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=["key1", "key2"], + ) + + def test_merge_overlap(self): + merged = merge(self.left, self.left, on="key") + exp_len = (self.left["key"].value_counts() ** 2).sum() + assert len(merged) == exp_len + assert "v1_x" in merged + assert "v1_y" in merged + + def test_merge_different_column_key_names(self): + left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]}) + + merged = left.merge( + right, left_on="lkey", right_on="rkey", how="outer", sort=True + ) + + exp = pd.Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey") + tm.assert_series_equal(merged["lkey"], exp) + + exp = pd.Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey") + tm.assert_series_equal(merged["rkey"], exp) + + exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x") + tm.assert_series_equal(merged["value_x"], exp) + + exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y") + tm.assert_series_equal(merged["value_y"], exp) + + def test_merge_copy(self): + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) + + merged = merge(left, right, left_index=True, right_index=True, copy=True) + + merged["a"] = 6 + assert (left["a"] == 0).all() + + merged["d"] = "peekaboo" + assert (right["d"] == "bar").all() + + def test_merge_nocopy(self): + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) + + merged = merge(left, right, left_index=True, right_index=True, copy=False) + + merged["a"] = 6 + assert (left["a"] == 6).all() + + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() + + def test_intelligently_handle_join_key(self): + # #733, be a bit more 1337 about not returning unconsolidated DataFrame + + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"] + ) + right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))}) + + joined = merge(left, right, on="key", how="outer") + expected = DataFrame( + { + "key": [1, 1, 1, 1, 2, 2, 3, 4, 5], + "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), + "rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5], + }, + columns=["value", "key", "rvalue"], + ) + tm.assert_frame_equal(joined, expected) + + def test_merge_join_key_dtype_cast(self): + # #8596 + + df1 = DataFrame({"key": [1], "v1": [10]}) + df2 = DataFrame({"key": [2], "v1": [20]}) + df = merge(df1, df2, how="outer") + assert df["key"].dtype == "int64" + + df1 = DataFrame({"key": [True], "v1": [1]}) + df2 = DataFrame({"key": [False], "v1": [0]}) + df = merge(df1, df2, how="outer") + + # GH13169 + # this really should be bool + assert df["key"].dtype == "object" + + df1 = DataFrame({"val": [1]}) + df2 = DataFrame({"val": [2]}) + lkey = np.array([1]) + rkey = np.array([2]) + df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer") + assert df["key_0"].dtype == "int64" + + def 
test_handle_join_key_pass_array(self): + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"] + ) + right = DataFrame({"rvalue": np.arange(6)}) + key = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on="key", right_on=key, how="outer") + merged2 = merge(right, left, left_on=key, right_on="key", how="outer") + + tm.assert_series_equal(merged["key"], merged2["key"]) + assert merged["key"].notna().all() + assert merged2["key"].notna().all() + + left = DataFrame({"value": np.arange(5)}, columns=["value"]) + right = DataFrame({"rvalue": np.arange(6)}) + lkey = np.array([1, 1, 2, 2, 3]) + rkey = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") + tm.assert_series_equal( + merged["key_0"], Series([1, 1, 1, 1, 2, 2, 3, 4, 5], name="key_0") + ) + + left = DataFrame({"value": np.arange(3)}) + right = DataFrame({"rvalue": np.arange(6)}) + + key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) + merged = merge(left, right, left_index=True, right_on=key, how="outer") + tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) + + def test_no_overlap_more_informative_error(self): + dt = datetime.now() + df1 = DataFrame({"x": ["a"]}, index=[dt]) + + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) + + msg = ( + "No common columns to perform merge on. " + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=None, ron=None, lidx=False, ridx=False + ) + ) + + with pytest.raises(MergeError, match=msg): + merge(df1, df2) + + def test_merge_non_unique_indexes(self): + + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + dt4 = datetime(2012, 5, 4) + + df1 = DataFrame({"x": ["a"]}, index=[dt]) + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) + _check_merge(df1, df2) + + # Not monotonic + df1 = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4]) + df2 = DataFrame( + {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt] + ) + _check_merge(df1, df2) + + df1 = DataFrame({"x": ["a", "b"]}, index=[dt, dt]) + df2 = DataFrame({"y": ["c", "d"]}, index=[dt, dt]) + _check_merge(df1, df2) + + def test_merge_non_unique_index_many_to_many(self): + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + df1 = DataFrame({"x": ["a", "b", "c", "d"]}, index=[dt2, dt2, dt, dt]) + df2 = DataFrame( + {"y": ["e", "f", "g", " h", "i"]}, index=[dt2, dt2, dt3, dt, dt] + ) + _check_merge(df1, df2) + + def test_left_merge_empty_dataframe(self): + left = DataFrame({"key": [1], "value": [2]}) + right = DataFrame({"key": []}) + + result = merge(left, right, on="key", how="left") + tm.assert_frame_equal(result, left) + + result = merge(right, left, on="key", how="right") + tm.assert_frame_equal(result, left) + + @pytest.mark.parametrize( + "kwarg", + [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ], + ) + def test_merge_left_empty_right_empty(self, join_type, kwarg): + # GH 10824 + left = pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) + + exp_in = pd.DataFrame( + columns=["a", "b", "c", "x", "y", "z"], + index=pd.Index([], dtype=object), + dtype=object, + ) + + result = pd.merge(left, right, how=join_type, **kwarg) + tm.assert_frame_equal(result, exp_in) + + def test_merge_left_empty_right_notempty(self): + # GH 10824 + left = 
pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": np.array([np.nan] * 3, dtype=object), + "b": np.array([np.nan] * 3, dtype=object), + "c": np.array([np.nan] * 3, dtype=object), + "x": [1, 4, 7], + "y": [2, 5, 8], + "z": [3, 6, 9], + }, + columns=["a", "b", "c", "x", "y", "z"], + ) + exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + # result will have object dtype + exp_in.index = exp_in.index.astype(object) + + def check1(exp, kwarg): + result = pd.merge(left, right, how="inner", **kwarg) + tm.assert_frame_equal(result, exp) + result = pd.merge(left, right, how="left", **kwarg) + tm.assert_frame_equal(result, exp) + + def check2(exp, kwarg): + result = pd.merge(left, right, how="right", **kwarg) + tm.assert_frame_equal(result, exp) + result = pd.merge(left, right, how="outer", **kwarg) + tm.assert_frame_equal(result, exp) + + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + ]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) + + kwarg = dict(left_on="a", right_index=True) + check1(exp_in, kwarg) + exp_out["a"] = [0, 1, 2] + check2(exp_out, kwarg) + + kwarg = dict(left_on="a", right_on="x") + check1(exp_in, kwarg) + exp_out["a"] = np.array([np.nan] * 3, dtype=object) + check2(exp_out, kwarg) + + def test_merge_left_notempty_right_empty(self): + # GH 10824 + left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": [1, 4, 7], + "b": [2, 5, 8], + "c": [3, 6, 9], + "x": np.array([np.nan] * 3, dtype=object), + "y": np.array([np.nan] * 3, dtype=object), + "z": np.array([np.nan] * 3, dtype=object), + }, + columns=["a", "b", "c", "x", "y", "z"], + ) + exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + # result will have object dtype + exp_in.index = exp_in.index.astype(object) + + def check1(exp, kwarg): + result = pd.merge(left, right, how="inner", **kwarg) + tm.assert_frame_equal(result, exp) + result = pd.merge(left, right, how="right", **kwarg) + tm.assert_frame_equal(result, exp) + + def check2(exp, kwarg): + result = pd.merge(left, right, how="left", **kwarg) + tm.assert_frame_equal(result, exp) + result = pd.merge(left, right, how="outer", **kwarg) + tm.assert_frame_equal(result, exp) + + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) + + def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): + # GH 25183 + df = pd.DataFrame( + {"key": series_of_dtype, "value": series_of_dtype2}, + columns=["key", "value"], + ) + df_empty = df[:0] + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=df.dtypes["value"]), + "key": pd.Series(dtype=df.dtypes["key"]), + "value_y": pd.Series(dtype=df.dtypes["value"]), + }, + columns=["value_x", "key", "value_y"], + ) + actual = df_empty.merge(df, on="key") + tm.assert_frame_equal(actual, expected) + + def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): + # GH 25183 + df_left = pd.DataFrame( + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) + df_right = pd.DataFrame( + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) + expected = pd.DataFrame( + { + "key": series_of_dtype, + 
"value_x": series_of_dtype_all_na, + "value_y": series_of_dtype_all_na, + }, + columns=["key", "value_x", "value_y"], + ) + actual = df_left.merge(df_right, on="key") + tm.assert_frame_equal(actual, expected) + + def test_merge_nosort(self): + # GH#2098, TODO: anything to do? + + d = { + "var1": np.random.randint(0, 10, size=10), + "var2": np.random.randint(0, 10, size=10), + "var3": [ + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2010, 2, 3), + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2012, 4, 3), + datetime(2012, 3, 4), + datetime(2008, 5, 1), + datetime(2010, 2, 3), + datetime(2012, 2, 3), + ], + } + df = DataFrame.from_dict(d) + var3 = df.var3.unique() + var3.sort() + new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)}) + + result = df.merge(new, on="var3", sort=False) + exp = merge(df, new, on="var3", sort=False) + tm.assert_frame_equal(result, exp) + + assert (df.var3.unique() == result.var3.unique()).all() + + def test_merge_nan_right(self): + df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]}) + df2 = DataFrame({"i1": [0], "i3": [0]}) + result = df1.join(df2, on="i1", rsuffix="_") + expected = ( + DataFrame( + { + "i1": {0: 0.0, 1: 1}, + "i2": {0: 0, 1: 1}, + "i1_": {0: 0, 1: np.nan}, + "i3": {0: 0.0, 1: np.nan}, + None: {0: 0, 1: 0}, + } + ) + .set_index(None) + .reset_index()[["i1", "i2", "i1_", "i3"]] + ) + tm.assert_frame_equal(result, expected, check_dtype=False) + + df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]}) + df2 = DataFrame({"i1": [0], "i3": [0.7]}) + result = df1.join(df2, rsuffix="_", on="i1") + expected = DataFrame( + { + "i1": {0: 0, 1: 1}, + "i1_": {0: 0.0, 1: np.nan}, + "i2": {0: 0.5, 1: 1.5}, + "i3": {0: 0.69999999999999996, 1: np.nan}, + } + )[["i1", "i2", "i1_", "i3"]] + tm.assert_frame_equal(result, expected) + + def test_merge_type(self): + class NotADataFrame(DataFrame): + @property + def _constructor(self): + return NotADataFrame + + nad = NotADataFrame(self.df) + result = nad.merge(self.df2, on="key1") + + assert isinstance(result, NotADataFrame) + + def test_join_append_timedeltas(self): + # timedelta64 issues with join/merge + # GH 5695 + + d = {"d": datetime(2013, 11, 5, 5, 56), "t": timedelta(0, 22500)} + df = DataFrame(columns=list("dt")) + df = df.append(d, ignore_index=True) + result = df.append(d, ignore_index=True) + expected = DataFrame( + { + "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], + "t": [timedelta(0, 22500), timedelta(0, 22500)], + } + ) + tm.assert_frame_equal(result, expected) + + td = np.timedelta64(300000000) + lhs = DataFrame(Series([td, td], index=["A", "B"])) + rhs = DataFrame(Series([td], index=["A"])) + + result = lhs.join(rhs, rsuffix="r", how="left") + expected = DataFrame( + { + "0": Series([td, td], index=list("AB")), + "0r": Series([td, pd.NaT], index=list("AB")), + } + ) + tm.assert_frame_equal(result, expected) + + def test_other_datetime_unit(self): + # GH 13389 + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") + + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: + + df2 = s.astype(dtype).to_frame("days") + # coerces to datetime64[ns], thus should not be affected + assert df2["days"].dtype == "datetime64[ns]" + + result = df1.merge(df2, left_on="entity_id", right_index=True) + + exp = pd.DataFrame( + { + "entity_id": [101, 102], + "days": np.array(["nat", "nat"], dtype="datetime64[ns]"), + }, 
+ columns=["entity_id", "days"], + ) + tm.assert_frame_equal(result, exp) + + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) + def test_other_timedelta_unit(self, unit): + # GH 13389 + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") + + dtype = "m8[{}]".format(unit) + df2 = s.astype(dtype).to_frame("days") + assert df2["days"].dtype == "m8[ns]" + + result = df1.merge(df2, left_on="entity_id", right_index=True) + + exp = pd.DataFrame( + {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, + columns=["entity_id", "days"], + ) + tm.assert_frame_equal(result, exp) + + def test_overlapping_columns_error_message(self): + df = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + df2 = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + + df.columns = ["key", "foo", "foo"] + df2.columns = ["key", "bar", "bar"] + expected = DataFrame( + { + "key": [1, 2, 3], + "v1": [4, 5, 6], + "v2": [7, 8, 9], + "v3": [4, 5, 6], + "v4": [7, 8, 9], + } + ) + expected.columns = ["key", "foo", "foo", "bar", "bar"] + tm.assert_frame_equal(merge(df, df2), expected) + + # #2649, #10639 + df2.columns = ["key1", "foo", "foo"] + msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" + with pytest.raises(MergeError, match=msg): + merge(df, df2) + + def test_merge_on_datetime64tz(self): + + # GH11405 + left = pd.DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1, 2], + } + ) + right = pd.DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=4, tz="US/Eastern"), + "value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") + tm.assert_frame_equal(result, expected) + + left = pd.DataFrame( + { + "key": [1, 2], + "value": pd.date_range("20151010", periods=2, tz="US/Eastern"), + } + ) + right = pd.DataFrame( + { + "key": [2, 3], + "value": pd.date_range("20151011", periods=2, tz="US/Eastern"), + } + ) + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern")) + + [pd.NaT], + "value_y": [pd.NaT] + + list(pd.date_range("20151011", periods=2, tz="US/Eastern")), + } + ) + result = pd.merge(left, right, on="key", how="outer") + tm.assert_frame_equal(result, expected) + assert result["value_x"].dtype == "datetime64[ns, US/Eastern]" + assert result["value_y"].dtype == "datetime64[ns, US/Eastern]" + + def test_merge_on_datetime64tz_empty(self): + # https://github.com/pandas-dev/pandas/issues/25014 + dtz = pd.DatetimeTZDtype(tz="UTC") + right = pd.DataFrame( + { + "date": [pd.Timestamp("2018", tz=dtz.tz)], + "value": [4.0], + "date2": [pd.Timestamp("2019", tz=dtz.tz)], + }, + columns=["date", "value", "date2"], + ) + left = right[:0] + result = left.merge(right, on="date") + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=float), + "date2_x": pd.Series(dtype=dtz), + "date": pd.Series(dtype=dtz), + "value_y": pd.Series(dtype=float), + "date2_y": pd.Series(dtype=dtz), + }, + columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + ) + tm.assert_frame_equal(result, expected) + + def test_merge_datetime64tz_with_dst_transition(self): + # GH 18885 + df1 = pd.DataFrame( + pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + columns=["date"], + ) + 
df1["value"] = 1 + df2 = pd.DataFrame( + { + "date": pd.to_datetime( + [ + "2017-10-29 03:00:00", + "2017-10-29 04:00:00", + "2017-10-29 05:00:00", + ] + ), + "value": 2, + } + ) + df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid") + result = pd.merge(df1, df2, how="outer", on="date") + expected = pd.DataFrame( + { + "date": pd.date_range( + "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + ), + "value_x": [1] * 4 + [np.nan] * 3, + "value_y": [np.nan] * 4 + [2] * 3, + } + ) + tm.assert_frame_equal(result, expected) + + def test_merge_non_unique_period_index(self): + # GH #16871 + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) + df2 = concat([df, df]) + result = df.merge(df2, left_index=True, right_index=True, how="inner") + expected = DataFrame( + np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), + columns=["pnum_x", "pnum_y"], + index=df2.sort_index().index, + ) + tm.assert_frame_equal(result, expected) + + def test_merge_on_periods(self): + left = pd.DataFrame( + {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]} + ) + right = pd.DataFrame( + { + "key": pd.period_range("20151011", periods=3, freq="D"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.period_range("20151010", periods=4, freq="D"), + "value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") + tm.assert_frame_equal(result, expected) + + left = pd.DataFrame( + {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")} + ) + right = pd.DataFrame( + {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")} + ) + + exp_x = pd.period_range("20151010", periods=2, freq="D") + exp_y = pd.period_range("20151011", periods=2, freq="D") + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(exp_x) + [pd.NaT], + "value_y": [pd.NaT] + list(exp_y), + } + ) + result = pd.merge(left, right, on="key", how="outer") + tm.assert_frame_equal(result, expected) + assert result["value_x"].dtype == "Period[D]" + assert result["value_y"].dtype == "Period[D]" + + def test_indicator(self): + # PR #10054. xref #7412 and closes #8790. 
+ df1 = DataFrame( + {"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]} + ) + df1_copy = df1.copy() + + df2 = DataFrame( + { + "col1": [1, 2, 3, 4, 5], + "col_conflict": [1, 2, 3, 4, 5], + "col_right": [2, 2, 2, 2, 2], + } + ) + df2_copy = df2.copy() + + df_result = DataFrame( + { + "col1": [0, 1, 2, 3, 4, 5], + "col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan], + "col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan], + "col_conflict_y": [np.nan, 1, 2, 3, 4, 5], + "col_right": [np.nan, 2, 2, 2, 2, 2], + } + ) + df_result["_merge"] = Categorical( + [ + "left_only", + "both", + "right_only", + "right_only", + "right_only", + "right_only", + ], + categories=["left_only", "right_only", "both"], + ) + + df_result = df_result[ + [ + "col1", + "col_conflict_x", + "col_left", + "col_conflict_y", + "col_right", + "_merge", + ] + ] + + test = merge(df1, df2, on="col1", how="outer", indicator=True) + tm.assert_frame_equal(test, df_result) + test = df1.merge(df2, on="col1", how="outer", indicator=True) + tm.assert_frame_equal(test, df_result) + + # No side effects + tm.assert_frame_equal(df1, df1_copy) + tm.assert_frame_equal(df2, df2_copy) + + # Check with custom name + df_result_custom_name = df_result + df_result_custom_name = df_result_custom_name.rename( + columns={"_merge": "custom_name"} + ) + + test_custom_name = merge( + df1, df2, on="col1", how="outer", indicator="custom_name" + ) + tm.assert_frame_equal(test_custom_name, df_result_custom_name) + test_custom_name = df1.merge( + df2, on="col1", how="outer", indicator="custom_name" + ) + tm.assert_frame_equal(test_custom_name, df_result_custom_name) + + # Check only accepts strings and booleans + msg = "indicator option can only accept boolean or string arguments" + with pytest.raises(ValueError, match=msg): + merge(df1, df2, on="col1", how="outer", indicator=5) + with pytest.raises(ValueError, match=msg): + df1.merge(df2, on="col1", how="outer", indicator=5) + + # Check result integrity + + test2 = merge(df1, df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() + test2 = df1.merge(df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() + + test3 = merge(df1, df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() + test3 = df1.merge(df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() + + test4 = merge(df1, df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() + test4 = df1.merge(df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() + + # Check if working name in df + for i in ["_right_indicator", "_left_indicator", "_merge"]: + df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) + + msg = ( + "Cannot use `indicator=True` option when data contains a" + " column named {}|" + "Cannot use name of an existing column for indicator" + " column" + ).format(i) + with pytest.raises(ValueError, match=msg): + merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) + with pytest.raises(ValueError, match=msg): + df1.merge(df_badcolumn, on="col1", how="outer", indicator=True) + + # Check for name conflict with custom name + df_badcolumn = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]}) + + msg = "Cannot use name of an existing column for indicator column" + with pytest.raises(ValueError, match=msg): + merge( + df1, + df_badcolumn, + on="col1", + how="outer", + indicator="custom_column_name", + ) + with 
pytest.raises(ValueError, match=msg): + df1.merge( + df_badcolumn, on="col1", how="outer", indicator="custom_column_name" + ) + + # Merge on multiple columns + df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]}) + + df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]}) + + hand_coded_result = DataFrame( + {"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]} + ) + hand_coded_result["_merge"] = Categorical( + ["left_only", "both", "right_only", "right_only"], + categories=["left_only", "right_only", "both"], + ) + + test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True) + tm.assert_frame_equal(test5, hand_coded_result) + test5 = df3.merge(df4, on=["col1", "col2"], how="outer", indicator=True) + tm.assert_frame_equal(test5, hand_coded_result) + + def test_validation(self): + left = DataFrame( + {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]}, + index=range(4), + ) + + right = DataFrame( + { + "a": ["a", "b", "c", "d", "e"], + "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"], + }, + index=range(5), + ) + + # Make sure no side effects. + left_copy = left.copy() + right_copy = right.copy() + + result = merge(left, right, left_index=True, right_index=True, validate="1:1") + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) + + # make sure merge still correct + expected = DataFrame( + { + "a_x": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "a_y": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + columns=["a_x", "b", "a_y", "c"], + ) + + result = merge( + left, right, left_index=True, right_index=True, validate="one_to_one" + ) + tm.assert_frame_equal(result, expected) + + expected_2 = DataFrame( + { + "a": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + ) + + result = merge(left, right, on="a", validate="1:1") + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) + tm.assert_frame_equal(result, expected_2) + + result = merge(left, right, on="a", validate="one_to_one") + tm.assert_frame_equal(result, expected_2) + + # One index, one column + expected_3 = DataFrame( + { + "b": ["cat", "dog", "weasel", "horse"], + "a": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... 
weasel noise?", "nay"], + }, + columns=["b", "a", "c"], + index=range(4), + ) + + left_index_reset = left.set_index("a") + result = merge( + left_index_reset, + right, + left_index=True, + right_on="a", + validate="one_to_one", + ) + tm.assert_frame_equal(result, expected_3) + + # Dups on right + right_w_dups = right.append(pd.DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])) + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_many", + ) + + msg = "Merge keys are not unique in right dataset; not a one-to-one merge" + with pytest.raises(MergeError, match=msg): + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_one", + ) + + with pytest.raises(MergeError, match=msg): + merge(left, right_w_dups, on="a", validate="one_to_one") + + # Dups on left + left_w_dups = left.append( + pd.DataFrame({"a": ["a"], "c": ["cow"]}, index=[3]), sort=True + ) + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="many_to_one", + ) + + msg = "Merge keys are not unique in left dataset; not a one-to-one merge" + with pytest.raises(MergeError, match=msg): + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="one_to_one", + ) + + with pytest.raises(MergeError, match=msg): + merge(left_w_dups, right, on="a", validate="one_to_one") + + # Dups on both + merge(left_w_dups, right_w_dups, on="a", validate="many_to_many") + + msg = "Merge keys are not unique in right dataset; not a many-to-one merge" + with pytest.raises(MergeError, match=msg): + merge( + left_w_dups, + right_w_dups, + left_index=True, + right_index=True, + validate="many_to_one", + ) + + msg = "Merge keys are not unique in left dataset; not a one-to-many merge" + with pytest.raises(MergeError, match=msg): + merge(left_w_dups, right_w_dups, on="a", validate="one_to_many") + + # Check invalid arguments + msg = "Not a valid argument for validate" + with pytest.raises(ValueError, match=msg): + merge(left, right, on="a", validate="jibberish") + + # Two column merge, dups in both, but jointly no dups. + left = DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": [0, 1, 0, 1], + "c": ["cat", "dog", "weasel", "horse"], + }, + index=range(4), + ) + + right = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ) + + expected_multi = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "c": ["cat", "dog", "weasel"], + "d": ["meow", "bark", "um... 
weasel noise?"], + }, + index=range(3), + ) + + msg = ( + "Merge keys are not unique in either left or right dataset;" + " not a one-to-one merge" + ) + with pytest.raises(MergeError, match=msg): + merge(left, right, on="a", validate="1:1") + + result = merge(left, right, on=["a", "b"], validate="1:1") + tm.assert_frame_equal(result, expected_multi) + + def test_merge_two_empty_df_no_division_error(self): + # GH17776, PR #17846 + a = pd.DataFrame({"a": [], "b": [], "c": []}) + with np.errstate(divide="raise"): + merge(a, a, on=("a", "b")) + + @pytest.mark.parametrize("how", ["right", "outer"]) + @pytest.mark.parametrize( + "index,expected_index", + [ + ( + CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None]), + ), + ( + DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]), + DatetimeIndex( + ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT] + ), + ), + (Float64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + ( + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + IntervalIndex.from_tuples( + [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan] + ), + ), + ( + PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), + PeriodIndex( + ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT], + freq="D", + ), + ), + ( + TimedeltaIndex(["1d", "2d", "3d"]), + TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + ), + ], + ) + def test_merge_on_index_with_more_values(self, how, index, expected_index): + # GH 24212 + # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that + # -1 is interpreted as a missing value instead of the last element + df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) + df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on="key", right_index=True, how=how) + expected = pd.DataFrame( + [ + [1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5], + ], + columns=["a", "key", "b"], + ) + expected.set_index(expected_index, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_merge_right_index_right(self): + # Note: the expected output here is probably incorrect. + # See https://github.com/pandas-dev/pandas/issues/17257 for more. + # We include this as a regression test for GH-24897. 
+ left = pd.DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) + right = pd.DataFrame({"b": [1, 2, 3]}) + + expected = pd.DataFrame( + {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + columns=["a", "key", "b"], + index=[0, 1, 2, np.nan], + ) + result = left.merge(right, left_on="key", right_index=True, how="right") + tm.assert_frame_equal(result, expected) + + def test_merge_take_missing_values_from_index_of_other_dtype(self): + # GH 24212 + left = pd.DataFrame( + { + "a": [1, 2, 3], + "key": pd.Categorical(["a", "a", "b"], categories=list("abc")), + } + ) + right = pd.DataFrame( + {"b": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"]) + ) + result = left.merge(right, left_on="key", right_index=True, how="right") + expected = pd.DataFrame( + { + "a": [1, 2, 3, None], + "key": pd.Categorical(["a", "a", "b", "c"]), + "b": [1, 1, 2, 3], + }, + index=[0, 1, 2, np.nan], + ) + expected = expected.reindex(columns=["a", "key", "b"]) + tm.assert_frame_equal(result, expected) + + def test_merge_readonly(self): + # https://github.com/pandas-dev/pandas/issues/27943 + data1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + data2 = pd.DataFrame( + np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] + ) + + data1._data.blocks[0].values.flags.writeable = False + data1.merge(data2) # no error + + +def _check_merge(x, y): + for how in ["inner", "left", "outer"]: + result = x.join(y, how=how) + + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) + expected = expected.set_index("index") + + # TODO check_names on merge? + tm.assert_frame_equal(result, expected, check_names=False) + + +class TestMergeDtypes: + @pytest.mark.parametrize( + "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")] + ) + def test_different(self, right_vals): + + left = DataFrame( + { + "A": ["foo", "bar"], + "B": Series(["foo", "bar"]).astype("category"), + "C": [1, 2], + "D": [1.0, 2.0], + "E": Series([1, 2], dtype="uint64"), + "F": Series([1, 2], dtype="int32"), + } + ) + right = DataFrame({"A": right_vals}) + + # GH 9780 + # We allow merging on object and categorical cols and cast + # categorical cols to object + result = pd.merge(left, right, on="A") + assert is_object_dtype(result.A.dtype) + + @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8]) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) + def test_join_multi_dtypes(self, d1, d2): + + dtype1 = np.dtype(d1) + dtype2 = np.dtype(d2) + + left = DataFrame( + { + "k1": np.array([0, 1, 2] * 8, dtype=dtype1), + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) + + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index) + + result = left.join(right, on=["k1", "k2"]) + + expected = left.copy() + + if dtype2.kind == "i": + dtype2 = np.dtype("float64") + expected["v2"] = np.array(np.nan, dtype=dtype2) + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=["k1", "k2"], sort=True) + expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "int_vals, float_vals, exp_vals", + [ + ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 
3.0], {"X": [1, 3], "Y": [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}), + ], + ) + def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): + # GH 16572 + # Check that float column is not cast to object if + # merging on float and int columns + A = DataFrame({"X": int_vals}) + B = DataFrame({"Y": float_vals}) + expected = DataFrame(exp_vals) + + result = A.merge(B, left_on="X", right_on="Y") + tm.assert_frame_equal(result, expected) + + result = B.merge(A, left_on="Y", right_on="X") + tm.assert_frame_equal(result, expected[["Y", "X"]]) + + def test_merge_key_dtype_cast(self): + # GH 17044 + df1 = DataFrame({"key": [1.0, 2.0], "v1": [10, 20]}, columns=["key", "v1"]) + df2 = DataFrame({"key": [2], "v2": [200]}, columns=["key", "v2"]) + result = df1.merge(df2, on="key", how="left") + expected = DataFrame( + {"key": [1.0, 2.0], "v1": [10, 20], "v2": [np.nan, 200.0]}, + columns=["key", "v1", "v2"], + ) + tm.assert_frame_equal(result, expected) + + def test_merge_on_ints_floats_warning(self): + # GH 16572 + # merge will produce a warning when merging on int and + # float columns where the float values are not exactly + # equal to their int representation + A = DataFrame({"X": [1, 2, 3]}) + B = DataFrame({"Y": [1.1, 2.5, 3.0]}) + expected = DataFrame({"X": [3], "Y": [3.0]}) + + with tm.assert_produces_warning(UserWarning): + result = A.merge(B, left_on="X", right_on="Y") + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = B.merge(A, left_on="Y", right_on="X") + tm.assert_frame_equal(result, expected[["Y", "X"]]) + + # test no warning if float has NaNs + B = DataFrame({"Y": [np.nan, np.nan, 3.0]}) + + with tm.assert_produces_warning(None): + result = B.merge(A, left_on="Y", right_on="X") + tm.assert_frame_equal(result, expected[["Y", "X"]]) + + def test_merge_incompat_infer_boolean_object(self): + # GH21119: bool + object bool merge OK + df1 = DataFrame({"key": Series([True, False], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) + + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") + tm.assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on="key") + tm.assert_frame_equal(result, expected) + + # with missing value + df1 = DataFrame({"key": Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) + + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") + tm.assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on="key") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # merge on category coerces to object + ([0, 1, 2], Series(["a", "b", "a"]).astype("category")), + ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")), + # no not infer + ([0, 1], pd.Series([False, True], dtype=object)), + ([0, 1], pd.Series([False, True], dtype=bool)), + ], + ) + def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): + # these are explicitly allowed incompat merges, that pass thru + # the result type is dependent on if the values on the rhs are + # inferred, otherwise these will be coerced to object + + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) + + result = pd.merge(df1, df2, on=["A"]) + assert is_object_dtype(result.A.dtype) + result = pd.merge(df2, df1, on=["A"]) + assert is_object_dtype(result.A.dtype) + + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # do 
not infer to numeric + (Series([1, 2], dtype="uint64"), ["a", "b", "c"]), + (Series([1, 2], dtype="int32"), ["a", "b", "c"]), + ([0, 1, 2], ["0", "1", "2"]), + ([0.0, 1.0, 2.0], ["0", "1", "2"]), + ([0, 1, 2], ["0", "1", "2"]), + ( + pd.date_range("1/1/2011", periods=2, freq="D"), + ["2011-01-01", "2011-01-02"], + ), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]), + ( + pd.date_range("20130101", periods=3), + pd.date_range("20130101", periods=3, tz="US/Eastern"), + ), + ], + ) + def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): + # GH 9780, GH 15800 + # Raise a ValueError when a user tries to merge on + # dtypes that are incompatible (e.g., obj and int/float) + + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) + + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df1["A"].dtype, rk_dtype=df2["A"].dtype + ) + ) + msg = re.escape(msg) + with pytest.raises(ValueError, match=msg): + pd.merge(df1, df2, on=["A"]) + + # Check that error still raised when swapping order of dataframes + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df2["A"].dtype, rk_dtype=df1["A"].dtype + ) + ) + msg = re.escape(msg) + with pytest.raises(ValueError, match=msg): + pd.merge(df2, df1, on=["A"]) + + +@pytest.fixture +def left(): + np.random.seed(1234) + return DataFrame( + { + "X": Series(np.random.choice(["foo", "bar"], size=(10,))).astype( + CDT(["foo", "bar"]) + ), + "Y": np.random.choice(["one", "two", "three"], size=(10,)), + } + ) + + +@pytest.fixture +def right(): + np.random.seed(1234) + return DataFrame( + {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + ) + + +class TestMergeCategorical: + def test_identical(self, left): + # merging on the same, should preserve dtypes + merged = pd.merge(left, left, on="X") + result = merged.dtypes.sort_index() + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("O")], + index=["X", "Y_x", "Y_y"], + ) + tm.assert_series_equal(result, expected) + + def test_basic(self, left, right): + # we have matching Categorical dtypes in X + # so should preserve the merged column + merged = pd.merge(left, right, on="X") + result = merged.dtypes.sort_index() + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("int64")], + index=["X", "Y", "Z"], + ) + tm.assert_series_equal(result, expected) + + def test_merge_categorical(self): + # GH 9426 + + right = DataFrame( + { + "c": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}, + "d": {0: "null", 1: "null", 2: "null", 3: "null", 4: "null"}, + } + ) + left = DataFrame( + { + "a": {0: "f", 1: "f", 2: "f", 3: "f", 4: "f"}, + "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"}, + } + ) + df = pd.merge(left, right, how="left", left_on="b", right_on="c") + + # object-object + expected = df.copy() + + # object-cat + # note that we propagate the category + # because we don't have any matching rows + cright = right.copy() + cright["d"] = cright["d"].astype("category") + result = pd.merge(left, cright, how="left", left_on="b", right_on="c") + expected["d"] = expected["d"].astype(CategoricalDtype(["null"])) + tm.assert_frame_equal(result, expected) + + # cat-object + cleft = left.copy() + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") + 
tm.assert_frame_equal(result, expected) + + # cat-cat + cright = right.copy() + cright["d"] = cright["d"].astype("category") + cleft = left.copy() + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") + tm.assert_frame_equal(result, expected) + + def tests_merge_categorical_unordered_equal(self): + # GH-19551 + df1 = DataFrame( + { + "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + } + ) + + df2 = DataFrame( + { + "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]), + "Right": ["C1", "B1", "A1"], + } + ) + result = pd.merge(df1, df2, on=["Foo"]) + expected = DataFrame( + { + "Foo": pd.Categorical(["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + "Right": ["A1", "B1", "C1"], + } + ) + tm.assert_frame_equal(result, expected) + + def test_other_columns(self, left, right): + # non-merge columns should preserve if possible + right = right.assign(Z=right.Z.astype("category")) + + merged = pd.merge(left, right, on="X") + result = merged.dtypes.sort_index() + expected = Series( + [CategoricalDtype(), np.dtype("O"), CategoricalDtype()], + index=["X", "Y", "Z"], + ) + tm.assert_series_equal(result, expected) + + # categories are preserved + assert left.X.values.is_dtype_equal(merged.X.values) + assert right.Z.values.is_dtype_equal(merged.Z.values) + + @pytest.mark.parametrize( + "change", + [ + lambda x: x, + lambda x: x.astype(CDT(["foo", "bar", "bah"])), + lambda x: x.astype(CDT(ordered=True)), + ], + ) + def test_dtype_on_merged_different(self, change, join_type, left, right): + # our merging columns, X now has 2 different dtypes + # so we must be object as a result + + X = change(right.X.astype("object")) + right = right.assign(X=X) + assert is_categorical_dtype(left.X.values) + # assert not left.X.values.is_dtype_equal(right.X.values) + + merged = pd.merge(left, right, on="X", how=join_type) + + result = merged.dtypes.sort_index() + expected = Series( + [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] + ) + tm.assert_series_equal(result, expected) + + def test_self_join_multiple_categories(self): + # GH 16767 + # non-duplicates should work with multiple categories + m = 5 + df = pd.DataFrame( + { + "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m, + "b": ["t", "w", "x", "y", "z"] * 2 * m, + "c": [ + letter + for each in ["m", "n", "u", "p", "o"] + for letter in [each] * 2 * m + ], + "d": [ + letter + for each in [ + "aa", + "bb", + "cc", + "dd", + "ee", + "ff", + "gg", + "hh", + "ii", + "jj", + ] + for letter in [each] * m + ], + } + ) + + # change them all to categorical variables + df = df.apply(lambda x: x.astype("category")) + + # self-join should equal ourselves + result = pd.merge(df, df, on=list(df.columns)) + + tm.assert_frame_equal(result, df) + + def test_dtype_on_categorical_dates(self): + # GH 16900 + # dates should not be coerced to ints + + df = pd.DataFrame( + [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"] + ) + df["date"] = df["date"].astype("category") + + df2 = pd.DataFrame( + [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"] + ) + df2["date"] = df2["date"].astype("category") + + expected_outer = pd.DataFrame( + [ + [pd.Timestamp("2001-01-01"), 1.1, 1.3], + [pd.Timestamp("2001-01-02"), 1.3, np.nan], + [pd.Timestamp("2001-01-03"), np.nan, 1.4], + ], + columns=["date", "num2", "num4"], + ) + result_outer = pd.merge(df, df2, how="outer", on=["date"]) + 
tm.assert_frame_equal(result_outer, expected_outer) + + expected_inner = pd.DataFrame( + [[pd.Timestamp("2001-01-01"), 1.1, 1.3]], columns=["date", "num2", "num4"] + ) + result_inner = pd.merge(df, df2, how="inner", on=["date"]) + tm.assert_frame_equal(result_inner, expected_inner) + + @pytest.mark.parametrize("ordered", [True, False]) + @pytest.mark.parametrize( + "category_column,categories,expected_categories", + [ + ([False, True, True, False], [True, False], [True, False]), + ([2, 1, 1, 2], [1, 2], [1, 2]), + (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]), + ], + ) + def test_merging_with_bool_or_int_cateorical_column( + self, category_column, categories, expected_categories, ordered + ): + # GH 17187 + # merging with a boolean/int categorical column + df1 = pd.DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) + df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df2 = pd.DataFrame({"id": [2, 4], "num": [1, 9]}) + result = df1.merge(df2) + expected = pd.DataFrame( + {"id": [2, 4], "cat": expected_categories, "num": [1, 9]} + ) + expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) + tm.assert_frame_equal(expected, result) + + def test_merge_on_int_array(self): + # GH 23020 + df = pd.DataFrame({"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B": 1}) + result = pd.merge(df, df, on="A") + expected = pd.DataFrame( + {"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def left_df(): + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def right_df(): + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) + + +class TestMergeOnIndexes: + @pytest.mark.parametrize( + "how, sort, expected", + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame( + {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2] + ), + ), + ( + "right", + True, + DataFrame( + {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3] + ), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], + ) + def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): + result = pd.merge( + left_df, right_df, left_index=True, right_index=True, how=how, sort=sort + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index", + [ + CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"), + Float64Index([1.0, 2.0], name="index_col"), + Int64Index([1, 2], name="index_col"), + UInt64Index([1, 2], name="index_col"), + RangeIndex(start=0, stop=2, name="index_col"), + DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"), + ], + ids=lambda x: type(x).__name__, +) +def test_merge_index_types(index): + # gh-20777 + # assert key access is consistent across index types + left = DataFrame({"left_data": [1, 2]}, index=index) + right = DataFrame({"right_data": [1.0, 2.0]}, index=index) + + result = left.merge(right, on=["index_col"]) + + expected 
= DataFrame( + OrderedDict([("left_data", [1, 2]), ("right_data", [1.0, 2.0])]), index=index + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "on,left_on,right_on,left_index,right_index,nm", + [ + (["outer", "inner"], None, None, False, False, "B"), + (None, None, None, True, True, "B"), + (None, ["outer", "inner"], None, False, True, "B"), + (None, None, ["outer", "inner"], True, False, "B"), + (["outer", "inner"], None, None, False, False, None), + (None, None, None, True, True, None), + (None, ["outer", "inner"], None, False, True, None), + (None, None, ["outer", "inner"], True, False, None), + ], +) +def test_merge_series(on, left_on, right_on, left_index, right_index, nm): + # GH 21220 + a = pd.DataFrame( + {"A": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product( + [["a", "b"], [0, 1]], names=["outer", "inner"] + ), + ) + b = pd.Series( + [1, 2, 3, 4], + index=pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=["outer", "inner"] + ), + name=nm, + ) + expected = pd.DataFrame( + {"A": [2, 4], "B": [1, 3]}, + index=pd.MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]), + ) + if nm is not None: + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + tm.assert_frame_equal(result, expected) + else: + msg = "Cannot merge a Series without a name" + with pytest.raises(ValueError, match=msg): + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + + +@pytest.mark.parametrize( + "col1, col2, kwargs, expected_cols", + [ + (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), + (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), + (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), + (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), + ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), + ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), + ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), + ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), + (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(), ["a_x", "a_y"]), + (0, 0, dict(), ["0_x", "0_y"]), + ], +) +def test_merge_suffix(col1, col2, kwargs, expected_cols): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [4, 5, 6]}) + + expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) + + result = a.merge(b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + result = pd.merge(a, b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "col1, col2, suffixes", + [ + ("a", "a", [None, None]), + ("a", "a", (None, None)), + ("a", "a", ("", None)), + (0, 0, [None, None]), + (0, 0, (None, "")), + ], +) +def test_merge_suffix_error(col1, col2, suffixes): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + # TODO: might reconsider current raise behaviour, see issue 24782 + msg = "columns overlap but no suffix specified" + with pytest.raises(ValueError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + +@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) +def test_merge_suffix_none_error(col1, col2, suffixes): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 
5]}) + + # TODO: might reconsider current raise behaviour, see GH24782 + msg = "iterable" + with pytest.raises(TypeError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + +@pytest.mark.parametrize("cat_dtype", ["one", "two"]) +@pytest.mark.parametrize("reverse", [True, False]) +def test_merge_equal_cat_dtypes(cat_dtype, reverse): + # see gh-22501 + cat_dtypes = { + "one": CategoricalDtype(categories=["a", "b", "c"], ordered=False), + "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False), + } + + df1 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), "left": [1, 2, 3]} + ).set_index("foo") + + data_foo = ["a", "b", "c"] + data_right = [1, 2, 3] + + if reverse: + data_foo.reverse() + data_right.reverse() + + df2 = DataFrame( + {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right} + ).set_index("foo") + + result = df1.merge(df2, left_index=True, right_index=True) + + expected = DataFrame( + { + "left": [1, 2, 3], + "right": [1, 2, 3], + "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), + } + ).set_index("foo") + + # Categorical is unordered, so don't check ordering. + tm.assert_frame_equal(result, expected, check_categorical=False) + + +def test_merge_equal_cat_dtypes2(): + # see gh-22501 + cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False) + + # Test Data + df1 = DataFrame( + {"foo": Series(["a", "b"]).astype(cat_dtype), "left": [1, 2]} + ).set_index("foo") + + df2 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtype), "right": [3, 2, 1]} + ).set_index("foo") + + result = df1.merge(df2, left_index=True, right_index=True) + + expected = DataFrame( + {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} + ).set_index("foo") + + # Categorical is unordered, so don't check ordering. 
+ tm.assert_frame_equal(result, expected, check_categorical=False) + + +def test_merge_on_cat_and_ext_array(): + # GH 28668 + right = DataFrame( + {"a": Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")} + ) + left = right.copy() + left["a"] = left["a"].astype("category") + + result = pd.merge(left, right, how="inner", on="a") + expected = right.copy() + + tm.assert_frame_equal(result, expected) + + +def test_merge_multiindex_columns(): + # Issue #28518 + # Verify that merging two dataframes give the expected labels + # The original cause of this issue come from a bug lexsort_depth and is tested in + # test_lexsort_depth + + letters = ["a", "b", "c", "d"] + numbers = ["1", "2", "3"] + index = pd.MultiIndex.from_product((letters, numbers), names=["outer", "inner"]) + + frame_x = pd.DataFrame(columns=index) + frame_x["id"] = "" + frame_y = pd.DataFrame(columns=index) + frame_y["id"] = "" + + l_suf = "_x" + r_suf = "_y" + result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) + + # Constructing the expected results + expected_labels = [l + l_suf for l in letters] + [l + r_suf for l in letters] + expected_index = pd.MultiIndex.from_product( + [expected_labels, numbers], names=["outer", "inner"] + ) + expected = pd.DataFrame(columns=expected_index) + expected["id"] = "" + + tm.assert_frame_equal(result, expected) + + +def test_merge_datetime_upcast_dtype(): + # https://github.com/pandas-dev/pandas/issues/31208 + df1 = pd.DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]}) + df2 = pd.DataFrame( + {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])} + ) + result = pd.merge(df1, df2, how="left", on="y") + expected = pd.DataFrame( + { + "x": ["a", "b", "c"], + "y": ["1", "2", "4"], + "z": pd.to_datetime(["2000", "2001", "NaT"]), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_asof.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_asof.py new file mode 100644 index 0000000..8037095 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_asof.py @@ -0,0 +1,1343 @@ +import datetime + +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import Timedelta, merge_asof, read_csv, to_datetime +import pandas._testing as tm +from pandas.core.reshape.merge import MergeError + + +class TestAsOfMerge: + def read_data(self, datapath, name, dedupe=False): + path = datapath("reshape", "merge", "data", name) + x = read_csv(path) + if dedupe: + x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index( + drop=True + ) + x.time = to_datetime(x.time) + return x + + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + + self.trades = self.read_data(datapath, "trades.csv") + self.quotes = self.read_data(datapath, "quotes.csv", dedupe=True) + self.asof = self.read_data(datapath, "asof.csv") + self.tolerance = self.read_data(datapath, "tolerance.csv") + self.allow_exact_matches = self.read_data(datapath, "allow_exact_matches.csv") + self.allow_exact_matches_and_tolerance = self.read_data( + datapath, "allow_exact_matches_and_tolerance.csv" + ) + + def test_examples1(self): + """ doc-string examples """ + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 3, 7]} + ) + + result = pd.merge_asof(left, right, on="a") + 
tm.assert_frame_equal(result, expected) + + def test_examples2(self): + """ doc-string examples """ + + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": [ + "GOOG", + "MSFT", + "MSFT", + "MSFT", + "GOOG", + "AAPL", + "GOOG", + "MSFT", + ], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) + + pd.merge_asof(trades, quotes, on="time", by="ticker") + + pd.merge_asof( + trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.97, np.nan, np.nan, np.nan], + "ask": [np.nan, 51.98, np.nan, np.nan, np.nan], + }, + columns=["time", "ticker", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) + tm.assert_frame_equal(result, expected) + + def test_examples3(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, np.nan]} + ) + + result = pd.merge_asof(left, right, on="a", direction="forward") + tm.assert_frame_equal(result, expected) + + def test_examples4(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, 7]} + ) + + result = pd.merge_asof(left, right, on="a", direction="nearest") + tm.assert_frame_equal(result, expected) + + def test_basic(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, on="time", by="ticker") + tm.assert_frame_equal(result, expected) + + def test_basic_categorical(self): + + expected = self.asof + trades = self.trades.copy() + trades.ticker = trades.ticker.astype("category") + quotes = self.quotes.copy() + quotes.ticker = quotes.ticker.astype("category") + expected.ticker = expected.ticker.astype("category") + + result = merge_asof(trades, quotes, on="time", by="ticker") + tm.assert_frame_equal(result, expected) + + def test_basic_left_index(self): + + # GH14253 + expected = self.asof + trades = self.trades.set_index("time") + quotes = 
self.quotes + + result = merge_asof( + trades, quotes, left_index=True, right_on="time", by="ticker" + ) + # left-only index uses right"s index, oddly + expected.index = result.index + # time column appears after left"s columns + expected = expected[result.columns] + tm.assert_frame_equal(result, expected) + + def test_basic_right_index(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes.set_index("time") + + result = merge_asof( + trades, quotes, left_on="time", right_index=True, by="ticker" + ) + tm.assert_frame_equal(result, expected) + + def test_basic_left_index_right_index(self): + + expected = self.asof.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + + result = merge_asof( + trades, quotes, left_index=True, right_index=True, by="ticker" + ) + tm.assert_frame_equal(result, expected) + + def test_multi_index(self): + + # MultiIndex is prohibited + trades = self.trades.set_index(["time", "price"]) + quotes = self.quotes.set_index("time") + with pytest.raises(MergeError): + merge_asof(trades, quotes, left_index=True, right_index=True) + + trades = self.trades.set_index("time") + quotes = self.quotes.set_index(["time", "bid"]) + with pytest.raises(MergeError): + merge_asof(trades, quotes, left_index=True, right_index=True) + + def test_on_and_index(self): + + # "on" parameter and index together is prohibited + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + with pytest.raises(MergeError): + merge_asof( + trades, quotes, left_on="price", left_index=True, right_index=True + ) + + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + with pytest.raises(MergeError): + merge_asof( + trades, quotes, right_on="bid", left_index=True, right_index=True + ) + + def test_basic_left_by_right_by(self): + + # GH14253 + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof( + trades, quotes, on="time", left_by="ticker", right_by="ticker" + ) + tm.assert_frame_equal(result, expected) + + def test_missing_right_by(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + q = quotes[quotes.ticker != "MSFT"] + result = merge_asof(trades, q, on="time", by="ticker") + expected.loc[expected.ticker == "MSFT", ["bid", "ask"]] = np.nan + tm.assert_frame_equal(result, expected) + + def test_multiby(self): + # GH13936 + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL"], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + 
"20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) + tm.assert_frame_equal(result, expected) + + def test_multiby_heterogeneous_types(self): + # GH13936 + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": [1, 0, 0, 0, 1, 2], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) + tm.assert_frame_equal(result, expected) + + def test_multiby_indexed(self): + # GH15676 + left = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a"], + [pd.to_datetime("20160602"), 2, "a"], + [pd.to_datetime("20160603"), 1, "b"], + [pd.to_datetime("20160603"), 2, "b"], + ], + columns=["time", "k1", "k2"], + ).set_index("time") + + right = pd.DataFrame( + [ + [pd.to_datetime("20160502"), 1, "a", 1.0], + [pd.to_datetime("20160502"), 2, "a", 2.0], + [pd.to_datetime("20160503"), 1, "b", 3.0], + [pd.to_datetime("20160503"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + expected = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a", 1.0], + [pd.to_datetime("20160602"), 2, "a", 2.0], + [pd.to_datetime("20160603"), 1, "b", 3.0], + [pd.to_datetime("20160603"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + result = pd.merge_asof( + left, right, left_index=True, right_index=True, by=["k1", "k2"] + ) + + tm.assert_frame_equal(expected, result) + + with pytest.raises(MergeError): + pd.merge_asof( + left, + right, + left_index=True, + right_index=True, + left_by=["k1", "k2"], + right_by=["k1"], + ) + + def test_basic2(self, datapath): + + expected = self.read_data(datapath, "asof2.csv") + trades = self.read_data(datapath, "trades2.csv") + 
quotes = self.read_data(datapath, "quotes2.csv", dedupe=True) + + result = merge_asof(trades, quotes, on="time", by="ticker") + tm.assert_frame_equal(result, expected) + + def test_basic_no_by(self): + f = ( + lambda x: x[x.ticker == "MSFT"] + .drop("ticker", axis=1) + .reset_index(drop=True) + ) + + # just use a single ticker + expected = f(self.asof) + trades = f(self.trades) + quotes = f(self.quotes) + + result = merge_asof(trades, quotes, on="time") + tm.assert_frame_equal(result, expected) + + def test_valid_join_keys(self): + + trades = self.trades + quotes = self.quotes + + with pytest.raises(MergeError): + merge_asof(trades, quotes, left_on="time", right_on="bid", by="ticker") + + with pytest.raises(MergeError): + merge_asof(trades, quotes, on=["time", "ticker"], by="ticker") + + with pytest.raises(MergeError): + merge_asof(trades, quotes, by="ticker") + + def test_with_duplicates(self, datapath): + + q = ( + pd.concat([self.quotes, self.quotes]) + .sort_values(["time", "ticker"]) + .reset_index(drop=True) + ) + result = merge_asof(self.trades, q, on="time", by="ticker") + expected = self.read_data(datapath, "asof.csv") + tm.assert_frame_equal(result, expected) + + def test_with_duplicates_no_on(self): + + df1 = pd.DataFrame({"key": [1, 1, 3], "left_val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": [1, 2, 2], "right_val": [1, 2, 3]}) + result = merge_asof(df1, df2, on="key") + expected = pd.DataFrame( + {"key": [1, 1, 3], "left_val": [1, 2, 3], "right_val": [1, 1, 3]} + ) + tm.assert_frame_equal(result, expected) + + def test_valid_allow_exact_matches(self): + + trades = self.trades + quotes = self.quotes + + with pytest.raises(MergeError): + merge_asof( + trades, quotes, on="time", by="ticker", allow_exact_matches="foo" + ) + + def test_valid_tolerance(self): + + trades = self.trades + quotes = self.quotes + + # dti + merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("1s")) + + # integer + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1, + ) + + # incompat + with pytest.raises(MergeError): + merge_asof(trades, quotes, on="time", by="ticker", tolerance=1) + + # invalid + with pytest.raises(MergeError): + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1.0, + ) + + # invalid negative + with pytest.raises(MergeError): + merge_asof( + trades, quotes, on="time", by="ticker", tolerance=-Timedelta("1s") + ) + + with pytest.raises(MergeError): + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=-1, + ) + + def test_non_sorted(self): + + trades = self.trades.sort_values("time", ascending=False) + quotes = self.quotes.sort_values("time", ascending=False) + + # we require that we are already sorted on time & quotes + assert not trades.time.is_monotonic + assert not quotes.time.is_monotonic + with pytest.raises(ValueError): + merge_asof(trades, quotes, on="time", by="ticker") + + trades = self.trades.sort_values("time") + assert trades.time.is_monotonic + assert not quotes.time.is_monotonic + with pytest.raises(ValueError): + merge_asof(trades, quotes, on="time", by="ticker") + + quotes = self.quotes.sort_values("time") + assert trades.time.is_monotonic + assert quotes.time.is_monotonic + + # ok, though has dupes + merge_asof(trades, self.quotes, on="time", by="ticker") + + @pytest.mark.parametrize( + "tolerance", + [Timedelta("1day"), datetime.timedelta(days=1)], + ids=["pd.Timedelta", "datetime.timedelta"], + ) + def 
test_tolerance(self, tolerance): + + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, on="time", by="ticker", tolerance=tolerance) + expected = self.tolerance + tm.assert_frame_equal(result, expected) + + def test_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) + + result = pd.merge_asof(left, right, on="a", direction="forward", tolerance=1) + tm.assert_frame_equal(result, expected) + + def test_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) + + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=1) + tm.assert_frame_equal(result, expected) + + def test_tolerance_tz(self): + # GH 14844 + left = pd.DataFrame( + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + } + ) + right = pd.DataFrame( + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-01"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value2": list("ABCDE"), + } + ) + result = pd.merge_asof(left, right, on="date", tolerance=pd.Timedelta("1 day")) + + expected = pd.DataFrame( + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + "value2": list("BCDEE"), + } + ) + tm.assert_frame_equal(result, expected) + + def test_tolerance_float(self): + # GH22981 + left = pd.DataFrame({"a": [1.1, 3.5, 10.9], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame( + {"a": [1.0, 2.5, 3.3, 7.5, 11.5], "right_val": [1.0, 2.5, 3.3, 7.5, 11.5]} + ) + + expected = pd.DataFrame( + { + "a": [1.1, 3.5, 10.9], + "left_val": ["a", "b", "c"], + "right_val": [1, 3.3, np.nan], + } + ) + + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=0.5) + tm.assert_frame_equal(result, expected) + + def test_index_tolerance(self): + # GH 15135 + expected = self.tolerance.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + + result = pd.merge_asof( + trades, + quotes, + left_index=True, + right_index=True, + by="ticker", + tolerance=pd.Timedelta("1day"), + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches(self): + + result = merge_asof( + self.trades, self.quotes, on="time", by="ticker", allow_exact_matches=False + ) + expected = self.allow_exact_matches + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_forward(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 7, 11]} + ) + + result = pd.merge_asof( + left, right, on="a", direction="forward", allow_exact_matches=False + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_nearest(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": 
[1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 3, 11]} + ) + + result = pd.merge_asof( + left, right, on="a", direction="nearest", allow_exact_matches=False + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance(self): + + result = merge_asof( + self.trades, + self.quotes, + on="time", + by="ticker", + tolerance=Timedelta("100ms"), + allow_exact_matches=False, + ) + expected = self.allow_exact_matches_and_tolerance + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance2(self): + # GH 13695 + df1 = pd.DataFrame( + {"time": pd.to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]} + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof(df1, df2, on="time") + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [2], + } + ) + tm.assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on="time", allow_exact_matches=False) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [1], + } + ) + tm.assert_frame_equal(result, expected) + + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [np.nan], + } + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance3(self): + # GH 13709 + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + } + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + "version": [np.nan, np.nan], + } + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 6, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 6, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="forward", + allow_exact_matches=False, + tolerance=1, + ) + tm.assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 7, 11]}) + + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 4, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="nearest", + allow_exact_matches=False, + tolerance=1, + ) + tm.assert_frame_equal(result, expected) + + def test_forward_by(self): + # GH14887 + + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", 
"Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Y", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) + + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, np.nan, 11, 15, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="forward") + tm.assert_frame_equal(result, expected) + + def test_nearest_by(self): + # GH14887 + + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Z", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) + + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, 1, 11, 11, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="nearest") + tm.assert_frame_equal(result, expected) + + def test_by_int(self): + # we specialize by type, so test that this is correct + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + }, + columns=["time", "key", "value1"], + ) + + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.015", + "20160525 13:30:00.020", + "20160525 13:30:00.025", + "20160525 13:30:00.035", + "20160525 13:30:00.040", + "20160525 13:30:00.055", + "20160525 13:30:00.060", + "20160525 13:30:00.065", + ] + ), + "key": [2, 1, 1, 3, 2, 1, 2, 3], + "value2": [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8], + }, + columns=["time", "key", "value2"], + ) + + result = pd.merge_asof(df1, df2, on="time", by="key") + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + "value2": [2.2, 2.1, 2.3, 2.4, 2.7], + }, + columns=["time", "key", "value1", "value2"], + ) + + tm.assert_frame_equal(result, expected) + + def test_on_float(self): + # mimics how to determine the minimum-price variation + df1 = pd.DataFrame( + { + "price": [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "price"], + ) + + df2 = pd.DataFrame( + {"price": [0.0, 1.0, 100.0], "mpv": [0.0001, 0.01, 0.05]}, + columns=["price", "mpv"], + ) + + df1 = df1.sort_values("price").reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on="price") + + expected = pd.DataFrame( + { + "symbol": list("BGACEDF"), + "price": [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], + "mpv": [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05], + }, + columns=["symbol", "price", "mpv"], + ) + + tm.assert_frame_equal(result, expected) + + def test_on_specialized_type(self, any_real_dtype): + # see gh-13936 + dtype = np.dtype(any_real_dtype).type + + df1 = pd.DataFrame( + {"value": [5, 2, 25, 100, 78, 120, 79], "symbol": list("ABCDEFG")}, + columns=["symbol", "value"], + ) + df1.value = dtype(df1.value) + + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "result": list("xyzw")}, + columns=["value", "result"], + ) + df2.value = dtype(df2.value) + + df1 
= df1.sort_values("value").reset_index(drop=True) + result = pd.merge_asof(df1, df2, on="value") + + expected = pd.DataFrame( + { + "symbol": list("BACEGDF"), + "value": [2, 5, 25, 78, 79, 100, 120], + "result": list("xxxxxyz"), + }, + columns=["symbol", "value", "result"], + ) + expected.value = dtype(expected.value) + + tm.assert_frame_equal(result, expected) + + def test_on_specialized_type_by_int(self, any_real_dtype): + # see gh-13936 + dtype = np.dtype(any_real_dtype).type + + df1 = pd.DataFrame( + { + "value": [5, 2, 25, 100, 78, 120, 79], + "key": [1, 2, 3, 2, 3, 1, 2], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "key", "value"], + ) + df1.value = dtype(df1.value) + + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "key": [1, 2, 2, 3], "result": list("xyzw")}, + columns=["value", "key", "result"], + ) + df2.value = dtype(df2.value) + + df1 = df1.sort_values("value").reset_index(drop=True) + result = pd.merge_asof(df1, df2, on="value", by="key") + + expected = pd.DataFrame( + { + "symbol": list("BACEGDF"), + "key": [2, 1, 3, 3, 2, 2, 1], + "value": [2, 5, 25, 78, 79, 100, 120], + "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"], + }, + columns=["symbol", "key", "value", "result"], + ) + expected.value = dtype(expected.value) + + tm.assert_frame_equal(result, expected) + + def test_on_float_by_int(self): + # type specialize both "by" and "on" parameters + df1 = pd.DataFrame( + { + "symbol": list("AAABBBCCC"), + "exch": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "price": [ + 3.26, + 3.2599, + 3.2598, + 12.58, + 12.59, + 12.5, + 378.15, + 378.2, + 378.25, + ], + }, + columns=["symbol", "exch", "price"], + ) + + df2 = pd.DataFrame( + { + "exch": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "price": [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], + "mpv": [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0], + }, + columns=["exch", "price", "mpv"], + ) + + df1 = df1.sort_values("price").reset_index(drop=True) + df2 = df2.sort_values("price").reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on="price", by="exch") + + expected = pd.DataFrame( + { + "symbol": list("AAABBBCCC"), + "exch": [3, 2, 1, 3, 1, 2, 1, 2, 3], + "price": [ + 3.2598, + 3.2599, + 3.26, + 12.5, + 12.58, + 12.59, + 378.15, + 378.2, + 378.25, + ], + "mpv": [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25], + }, + columns=["symbol", "exch", "price", "mpv"], + ) + + tm.assert_frame_equal(result, expected) + + def test_merge_datatype_error_raises(self): + msg = r"incompatible merge keys \[0\] .*, must be the same type" + + left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) + right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) + + with pytest.raises(MergeError, match=msg): + merge_asof(left, right, on="a") + + def test_merge_datatype_categorical_error_raises(self): + msg = ( + r"incompatible merge keys \[0\] .* both sides category, " + "but not equal ones" + ) + + left = pd.DataFrame( + {"left_val": [1, 5, 10], "a": pd.Categorical(["a", "b", "c"])} + ) + right = pd.DataFrame( + { + "right_val": [1, 2, 3, 6, 7], + "a": pd.Categorical(["a", "X", "c", "X", "b"]), + } + ) + + with pytest.raises(MergeError, match=msg): + merge_asof(left, right, on="a") + + def test_merge_groupby_multiple_column_with_categorical_column(self): + # GH 16454 + df = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + result = merge_asof(df, df, on="x", by=["y", "z"]) + expected = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + tm.assert_frame_equal(result, 
expected) + + @pytest.mark.parametrize( + "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] + ) + @pytest.mark.parametrize("side", ["left", "right"]) + def test_merge_on_nans(self, func, side): + # GH 23189 + msg = "Merge keys contain null values on {} side".format(side) + nulls = func([1.0, 5.0, np.nan]) + non_nulls = func([1.0, 5.0, 10.0]) + df_null = pd.DataFrame({"a": nulls, "left_val": ["a", "b", "c"]}) + df = pd.DataFrame({"a": non_nulls, "right_val": [1, 6, 11]}) + + with pytest.raises(ValueError, match=msg): + if side == "left": + merge_asof(df_null, df, on="a") + else: + merge_asof(df, df_null, on="a") + + def test_merge_by_col_tz_aware(self): + # GH 21184 + left = pd.DataFrame( + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [2], + "values": ["a"], + } + ) + right = pd.DataFrame( + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [1], + "values": ["b"], + } + ) + result = pd.merge_asof(left, right, by="by_col", on="on_col") + expected = pd.DataFrame( + [[pd.Timestamp("2018-01-01", tz="UTC"), 2, "a", "b"]], + columns=["by_col", "on_col", "values_x", "values_y"], + ) + tm.assert_frame_equal(result, expected) + + def test_by_mixed_tz_aware(self): + # GH 26649 + left = pd.DataFrame( + { + "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "by_col2": ["HELLO"], + "on_col": [2], + "value": ["a"], + } + ) + right = pd.DataFrame( + { + "by_col1": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "by_col2": ["WORLD"], + "on_col": [1], + "value": ["b"], + } + ) + result = pd.merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col") + expected = pd.DataFrame( + [[pd.Timestamp("2018-01-01", tz="UTC"), "HELLO", 2, "a"]], + columns=["by_col1", "by_col2", "on_col", "value_x"], + ) + expected["value_y"] = np.array([np.nan], dtype=object) + tm.assert_frame_equal(result, expected) + + def test_timedelta_tolerance_nearest(self): + # GH 27642 + + left = pd.DataFrame( + list(zip([0, 5, 10, 15, 20, 25], [0, 1, 2, 3, 4, 5])), + columns=["time", "left"], + ) + + left["time"] = pd.to_timedelta(left["time"], "ms") + + right = pd.DataFrame( + list(zip([0, 3, 9, 12, 15, 18], [0, 1, 2, 3, 4, 5])), + columns=["time", "right"], + ) + + right["time"] = pd.to_timedelta(right["time"], "ms") + + expected = pd.DataFrame( + list( + zip( + [0, 5, 10, 15, 20, 25], + [0, 1, 2, 3, 4, 5], + [0, np.nan, 2, 4, np.nan, np.nan], + ) + ), + columns=["time", "left", "right"], + ) + + expected["time"] = pd.to_timedelta(expected["time"], "ms") + + result = pd.merge_asof( + left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest" + ) + + tm.assert_frame_equal(result, expected) + + def test_int_type_tolerance(self, any_int_dtype): + # GH #28870 + + left = pd.DataFrame({"a": [0, 10, 20], "left_val": [1, 2, 3]}) + right = pd.DataFrame({"a": [5, 15, 25], "right_val": [1, 2, 3]}) + left["a"] = left["a"].astype(any_int_dtype) + right["a"] = right["a"].astype(any_int_dtype) + + expected = pd.DataFrame( + {"a": [0, 10, 20], "left_val": [1, 2, 3], "right_val": [np.nan, 1.0, 2.0]} + ) + expected["a"] = expected["a"].astype(any_int_dtype) + + result = pd.merge_asof(left, right, on="a", tolerance=10) + tm.assert_frame_equal(result, expected) + + def test_merge_index_column_tz(self): + # GH 29864 + index = pd.date_range("2019-10-01", freq="30min", periods=5, tz="UTC") + left = pd.DataFrame([0.9, 0.8, 0.7, 0.6], columns=["xyz"], index=index[1:]) + right = pd.DataFrame({"from_date": index, "abc": [2.46] * 4 + 
[2.19]}) + result = pd.merge_asof( + left=left, right=right, left_index=True, right_on=["from_date"] + ) + expected = pd.DataFrame( + { + "xyz": [0.9, 0.8, 0.7, 0.6], + "from_date": index[1:], + "abc": [2.46] * 3 + [2.19], + }, + index=pd.Index([1, 2, 3, 4]), + ) + tm.assert_frame_equal(result, expected) + + result = pd.merge_asof( + left=right, right=left, right_index=True, left_on=["from_date"] + ) + expected = pd.DataFrame( + { + "from_date": index, + "abc": [2.46] * 4 + [2.19], + "xyz": [np.nan, 0.9, 0.8, 0.7, 0.6], + }, + index=pd.Index([0, 1, 2, 3, 4]), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_index_as_string.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_index_as_string.py new file mode 100644 index 0000000..691f254 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -0,0 +1,188 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture +def df1(): + return DataFrame( + dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11), + ) + ) + + +@pytest.fixture +def df2(): + return DataFrame( + dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12), + ) + ) + + +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) +def left_df(request, df1): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + if levels: + df1 = df1.set_index(levels) + + return df1 + + +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) +def right_df(request, df2): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + + if levels: + df2 = df2.set_index(levels) + + return df2 + + +def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. 
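+
+    For example, if both frames carry "outer" as a named index level and the
+    merge is on=["outer"], the expected result is built by calling reset_index()
+    on both frames, merging on the "outer" column, and then restoring the index
+    with set_index(["outer"]).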
+ + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + +@pytest.mark.parametrize( + "on,how", + [ + (["outer"], "inner"), + (["inner"], "left"), + (["outer", "inner"], "right"), + (["inner", "outer"], "outer"), + ], +) +def test_merge_indexes_and_columns_on(left_df, right_df, on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = left_df.merge(right_df, on=on, how=how) + tm.assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize( + "left_on,right_on,how", + [ + (["outer"], ["outer"], "inner"), + (["inner"], ["inner"], "right"), + (["outer", "inner"], ["outer", "inner"], "left"), + (["inner", "outer"], ["inner", "outer"], "outer"), + ], +) +def test_merge_indexes_and_columns_lefton_righton( + left_df, right_df, left_on, right_on, how +): + + # Construct expected result + expected = compute_expected( + left_df, right_df, left_on=left_on, right_on=right_on, how=how + ) + + # Perform merge + result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how) + tm.assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]]) +def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): + + # Construct left_df + left_df = df1.set_index(left_index) + + # Construct right_df + right_df = df2.set_index(["outer", "inner"]) + + # Result + expected = ( + left_df.reset_index() + .join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) + .set_index(left_index) + ) + + # Perform join + result = left_df.join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) + + tm.assert_frame_equal(result, expected, check_like=True) diff --git 
a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_ordered.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_ordered.py new file mode 100644 index 0000000..e006392 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_merge_ordered.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, merge_ordered +import pandas._testing as tm + + +class TestMergeOrdered: + def setup_method(self, method): + self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]}) + + self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]}) + + def test_basic(self): + result = merge_ordered(self.left, self.right, on="key") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1, np.nan, 2, np.nan, 3, np.nan], + "rvalue": [np.nan, 1, 2, 3, np.nan, 4], + } + ) + + tm.assert_frame_equal(result, expected) + + def test_ffill(self): + result = merge_ordered(self.left, self.right, on="key", fill_method="ffill") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1.0, 1, 2, 2, 3, 3.0], + "rvalue": [np.nan, 1, 2, 3, 3, 4], + } + ) + tm.assert_frame_equal(result, expected) + + def test_multigroup(self): + left = pd.concat([self.left, self.left], ignore_index=True) + + left["group"] = ["a"] * 3 + ["b"] * 3 + + result = merge_ordered( + left, self.right, on="key", left_by="group", fill_method="ffill" + ) + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"] * 2, + "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2, + "rvalue": [np.nan, 1, 2, 3, 3, 4] * 2, + } + ) + expected["group"] = ["a"] * 6 + ["b"] * 6 + + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + result2 = merge_ordered( + self.right, left, on="key", right_by="group", fill_method="ffill" + ) + tm.assert_frame_equal(result, result2.loc[:, result.columns]) + + result = merge_ordered(left, self.right, on="key", left_by="group") + assert result["group"].notna().all() + + def test_merge_type(self): + class NotADataFrame(DataFrame): + @property + def _constructor(self): + return NotADataFrame + + nad = NotADataFrame(self.left) + result = nad.merge(self.right, on="key") + + assert isinstance(result, NotADataFrame) + + def test_empty_sequence_concat(self): + # GH 9157 + empty_pat = "[Nn]o objects" + none_pat = "objects.*None" + test_cases = [ + ((), empty_pat), + ([], empty_pat), + ({}, empty_pat), + ([None], none_pat), + ([None, None], none_pat), + ] + for df_seq, pattern in test_cases: + with pytest.raises(ValueError, match=pattern): + pd.concat(df_seq) + + pd.concat([pd.DataFrame()]) + pd.concat([None, pd.DataFrame()]) + pd.concat([pd.DataFrame(), None]) + + def test_doc_example(self): + left = DataFrame( + { + "group": list("aaabbb"), + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3] * 2, + } + ) + + right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + + result = merge_ordered(left, right, fill_method="ffill", left_by="group") + + expected = DataFrame( + { + "group": list("aaaaabbbbb"), + "key": ["a", "b", "c", "d", "e"] * 2, + "lvalue": [1, 1, 2, 2, 3] * 2, + "rvalue": [np.nan, 1, 2, 3, 3] * 2, + } + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_multi.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_multi.py new file mode 100644 index 0000000..1f78c19 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_multi.py @@ -0,0 +1,839 @@ 
+import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm +from pandas.core.reshape.concat import concat +from pandas.core.reshape.merge import merge + + +@pytest.fixture +def left(): + """left dataframe (not multi-indexed) for multi-index join tests""" + # a little relevant example with NAs + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"] + + data = np.random.randn(len(key1)) + return DataFrame({"key1": key1, "key2": key2, "data": data}) + + +@pytest.fixture +def right(): + """right dataframe (multi-indexed) for multi-index join tests""" + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["key1", "key2"], + ) + + return DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) + + +@pytest.fixture +def left_multi(): + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C"], + Destination=["A", "B", "A", "C", "A"], + Period=["AM", "AM", "IP", "AM", "OP"], + TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"], + Trips=[1987, 3647, 2470, 4296, 4444], + ), + columns=["Origin", "Destination", "Period", "TripPurp", "Trips"], + ).set_index(["Origin", "Destination", "Period", "TripPurp"]) + + +@pytest.fixture +def right_multi(): + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C", "C", "E"], + Destination=["A", "B", "A", "B", "A", "B", "F"], + Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"], + LinkType=["a", "b", "c", "b", "a", "b", "a"], + Distance=[100, 80, 90, 80, 75, 35, 55], + ), + columns=["Origin", "Destination", "Period", "LinkType", "Distance"], + ).set_index(["Origin", "Destination", "Period", "LinkType"]) + + +@pytest.fixture +def on_cols_multi(): + return ["Origin", "Destination", "Period"] + + +@pytest.fixture +def idx_cols_multi(): + return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] + + +class TestMergeMulti: + def setup_method(self): + self.index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + self.to_join = DataFrame( + np.random.randn(10, 3), + index=self.index, + columns=["j_one", "j_two", "j_three"], + ) + + # a little relevant example with NAs + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] + + data = np.random.randn(len(key1)) + self.data = DataFrame({"key1": key1, "key2": key2, "data": data}) + + def test_merge_on_multikey(self, left, right, join_type): + on_cols = ["key1", "key2"] + result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True) + + expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index( + drop=True + ) + + expected = pd.merge( + left, right.reset_index(), on=on_cols, how=join_type, sort=True + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("sort", [False, True]) + def test_left_join_multi_index(self, left, right, sort): + icols = ["1st", "2nd", "3rd"] + + def bind_cols(df): + iord = lambda a: 
0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4 + + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) + + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() + + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None + + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + + res.index = np.arange(len(res)) + tm.assert_frame_equal(out, res) + + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"]) + left.insert(1, "2nd", np.random.randint(0, 1000, len(left))) + + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + # inject some nulls + left.loc[1::23, "1st"] = np.nan + left.loc[2::37, "2nd"] = np.nan + left.loc[3::43, "3rd"] = np.nan + left["4th"] = bind_cols(left) + + i = np.random.permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + @pytest.mark.parametrize("sort", [False, True]) + def test_merge_right_vs_left(self, left, right, sort): + # compare left vs right merge with multikey + on_cols = ["key1", "key2"] + merged_left_right = left.merge( + right, left_on=on_cols, right_index=True, how="left", sort=sort + ) + + merge_right_left = right.merge( + left, right_on=on_cols, left_index=True, how="right", sort=sort + ) + + # Reorder columns + merge_right_left = merge_right_left[merged_left_right.columns] + + tm.assert_frame_equal(merged_left_right, merge_right_left) + + def test_merge_multiple_cols_with_mixed_cols_index(self): + # GH29522 + s = pd.Series( + range(6), + pd.MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]), + name="Amount", + ) + df = pd.DataFrame( + {"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0} + ) + result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"]) + expected = pd.DataFrame( + { + "lev1": list("AAABBB"), + "lev2": [1, 2, 3, 1, 2, 3], + "col": [0] * 6, + "Amount": range(6), + } + ) + tm.assert_frame_equal(result, expected) + + def test_compress_group_combinations(self): + + # ~ 40000000 possible unique groups + key1 = tm.rands_array(10, 10000) + key1 = np.tile(key1, 2) + key2 = key1[::-1] + + df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)}) + + df2 = DataFrame( + {"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)} + ) + + # just to hit the label compression code path + merge(df, df2, how="outer") + + def test_left_join_index_preserve_order(self): + + on_cols = ["k1", "k2"] + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) + + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & 
(expected.k2 == "foo"), "v2"] = 7 + + tm.assert_frame_equal(result, expected) + + result.sort_values(on_cols, kind="mergesort", inplace=True) + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + # test join with multi dtypes blocks + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "k3": np.array([0, 1, 2] * 8, dtype=np.float32), + "v": np.array(np.arange(24), dtype=np.int32), + } + ) + + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) + + result = left.join(right, on=on_cols) + + expected = left.copy() + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 + + tm.assert_frame_equal(result, expected) + + result = result.sort_values(on_cols, kind="mergesort") + expected = left.join(right, on=on_cols, sort=True) + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match_multiindex(self): + left = DataFrame( + [ + ["X", "Y", "C", "a"], + ["W", "Y", "C", "e"], + ["V", "Q", "A", "h"], + ["V", "R", "D", "i"], + ["X", "Y", "D", "b"], + ["X", "Y", "A", "c"], + ["W", "Q", "B", "f"], + ["W", "R", "C", "g"], + ["V", "Y", "C", "j"], + ["X", "Y", "B", "d"], + ], + columns=["cola", "colb", "colc", "tag"], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8], + ) + + right = DataFrame( + [ + ["W", "R", "C", 0], + ["W", "Q", "B", 3], + ["W", "Q", "B", 8], + ["X", "Y", "A", 1], + ["X", "Y", "A", 4], + ["X", "Y", "B", 5], + ["X", "Y", "C", 6], + ["X", "Y", "C", 9], + ["X", "Q", "C", -6], + ["X", "R", "C", -9], + ["V", "Y", "C", 7], + ["V", "R", "D", 2], + ["V", "R", "D", -1], + ["V", "Q", "A", -3], + ], + columns=["col1", "col2", "col3", "val"], + ).set_index(["col1", "col2", "col3"]) + + result = left.join(right, on=["cola", "colb", "colc"], how="left") + + expected = DataFrame( + [ + ["X", "Y", "C", "a", 6], + ["X", "Y", "C", "a", 9], + ["W", "Y", "C", "e", np.nan], + ["V", "Q", "A", "h", -3], + ["V", "R", "D", "i", 2], + ["V", "R", "D", "i", -1], + ["X", "Y", "D", "b", np.nan], + ["X", "Y", "A", "c", 1], + ["X", "Y", "A", "c", 4], + ["W", "Q", "B", "f", 3], + ["W", "Q", "B", "f", 8], + ["W", "R", "C", "g", 0], + ["V", "Y", "C", "j", 7], + ["X", "Y", "B", "d", 5], + ], + columns=["cola", "colb", "colc", "tag", "val"], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8], + ) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True) + + expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort") + + tm.assert_frame_equal(result, expected) + + def test_left_join_index_multi_match(self): + left = DataFrame( + [["c", 0], ["b", 1], ["a", 2], ["b", 3]], + columns=["tag", "val"], + index=[2, 0, 1, 3], + ) + + right = DataFrame( + [ + ["a", "v"], + ["c", "w"], + ["c", "x"], + ["d", "y"], + ["a", "z"], + ["c", "r"], + ["e", "q"], + ["c", "s"], + ], + columns=["tag", "char"], + ).set_index("tag") + + result = left.join(right, on="tag", how="left") + + expected = DataFrame( + [ + ["c", 0, "w"], + ["c", 0, "x"], + ["c", 0, "r"], + ["c", 0, "s"], + ["b", 1, np.nan], + ["a", 2, "v"], + ["a", 2, "z"], + ["b", 3, np.nan], + ], + columns=["tag", "val", "char"], + index=[2, 2, 2, 2, 0, 1, 1, 3], + ) + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on="tag", how="left", sort=True) + expected2 = expected.sort_values("tag", kind="mergesort") + + 
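+        # with sort=True, the joined frame should equal `expected` re-sorted by "tag"
+        # using a stable mergesort (expected2 above)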
tm.assert_frame_equal(result, expected2) + + # GH7331 - maintain left frame order in left merge + result = merge(left, right.reset_index(), how="left", on="tag") + expected.index = np.arange(len(expected)) + tm.assert_frame_equal(result, expected) + + def test_left_merge_na_buglet(self): + left = DataFrame( + { + "id": list("abcde"), + "v1": randn(5), + "v2": randn(5), + "dummy": list("abcde"), + "v3": randn(5), + }, + columns=["id", "v1", "v2", "dummy", "v3"], + ) + right = DataFrame( + { + "id": ["a", "b", np.nan, np.nan, np.nan], + "sv3": [1.234, 5.678, np.nan, np.nan, np.nan], + } + ) + + result = merge(left, right, on="id", how="left") + + rdf = right.drop(["id"], axis=1) + expected = left.join(rdf) + tm.assert_frame_equal(result, expected) + + def test_merge_na_keys(self): + data = [ + [1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.0], + [1950, "C", 4.0], + [1960, "C", np.nan], + [1965, "C", 3.0], + [1970, "C", 4.0], + ] + + frame = DataFrame(data, columns=["year", "panel", "data"]) + + other_data = [ + [1960, "A", np.nan], + [1970, "A", np.nan], + [1955, "A", np.nan], + [1965, "A", np.nan], + [1965, "B", np.nan], + [1955, "C", np.nan], + ] + other = DataFrame(other_data, columns=["year", "panel", "data"]) + + result = frame.merge(other, how="outer") + + expected = frame.fillna(-999).merge(other.fillna(-999), how="outer") + expected = expected.replace(-999, np.nan) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, klass): + # see gh-19038 + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if klass is not None: + on_vector = klass(on_vector) + + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} + ) + + result = df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_join_multi_levels(self): + + # GH 3662 + # merge multi-levels + household = DataFrame( + dict( + household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750], + ), + columns=["household_id", "male", "wealth"], + ).set_index("household_id") + portfolio = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "name", "share"], + ).set_index(["household_id", "asset_id"]) + result = household.join(portfolio, how="inner") + expected = ( + DataFrame( + dict( + male=[0, 1, 1, 0, 0, 0], + wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + ], + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + household_id=[1, 2, 2, 3, 3, 3], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + ], + ) + ) 
+ .set_index(["household_id", "asset_id"]) + .reindex(columns=["male", "wealth", "name", "share"]) + ) + tm.assert_frame_equal(result, expected) + + # equivalency + result = merge( + household.reset_index(), + portfolio.reset_index(), + on=["household_id"], + how="inner", + ).set_index(["household_id", "asset_id"]) + tm.assert_frame_equal(result, expected) + + result = household.join(portfolio, how="outer") + expected = concat( + [ + expected, + ( + DataFrame( + dict(share=[1.00]), + index=MultiIndex.from_tuples( + [(4, np.nan)], names=["household_id", "asset_id"] + ), + ) + ), + ], + axis=0, + sort=True, + ).reindex(columns=expected.columns) + tm.assert_frame_equal(result, expected) + + # invalid cases + household.index.name = "foo" + + with pytest.raises(ValueError): + household.join(portfolio, how="inner") + + portfolio2 = portfolio.copy() + portfolio2.index.set_names(["household_id", "foo"]) + + with pytest.raises(ValueError): + portfolio2.join(portfolio, how="inner") + + def test_join_multi_levels2(self): + + # some more advanced merges + # GH6360 + household = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "share"], + ).set_index(["household_id", "asset_id"]) + + log_return = DataFrame( + dict( + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 180, 181], + log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997], + ) + ).set_index(["asset_id", "t"]) + + expected = ( + DataFrame( + dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[ + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + ], + ) + ) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=["share", "log_return"]) + ) + + # this is the equivalency + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="inner", + ).set_index(["household_id", "asset_id", "t"]) + tm.assert_frame_equal(result, expected) + + expected = ( + DataFrame( + dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + "nl0000289965", + None, + ], + t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None], + share=[ + 1.0, + 0.4, + 0.6, + 0.6, + 0.6, + 0.15, + 0.15, + 0.15, + 0.6, + 0.6, + 0.25, + 1.0, + ], + log_return=[ + None, + None, + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + None, + None, + ], + ) + ) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=["share", "log_return"]) + ) + + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="outer", + ).set_index(["household_id", "asset_id", "t"]) + + tm.assert_frame_equal(result, expected) + + +class TestJoinMultiMulti: + def test_join_multi_multi( + self, left_multi, 
right_multi, join_type, on_cols_multi, idx_cols_multi + ): + # Multi-index join tests + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + def test_join_multi_empty_frames( + self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + ): + + left_multi = left_multi.drop(columns=left_multi.columns) + right_multi = right_multi.drop(columns=right_multi.columns) + + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [None, np.asarray, Series, Index]) + def test_merge_datetime_index(self, box): + # see gh-19038 + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) + df.index = pd.to_datetime(df.index) + on_vector = df.index.year + + if box is not None: + on_vector = box(on_vector) + + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) + + result = df.merge(df, on=["a", on_vector], how="inner") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} + ) + + result = df.merge(df, on=[df.index.year], how="inner") + tm.assert_frame_equal(result, expected) + + def test_single_common_level(self): + index_left = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) + + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left + ) + + index_right = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) + + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, + index=index_right, + ) + + result = left.join(right) + expected = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], how="inner" + ).set_index(["key", "X", "Y"]) + + tm.assert_frame_equal(result, expected) + + def test_join_multi_wrong_order(self): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"]) + + left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) + right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) + + result = left.join(right) + + expected = pd.DataFrame( + index=midx1, + data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]}, + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/merge/test_pivot_old.py b/venv/Lib/site-packages/pandas/tests/reshape/merge/test_pivot_old.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_concat.py b/venv/Lib/site-packages/pandas/tests/reshape/test_concat.py new file mode 100644 index 0000000..990669f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_concat.py @@ -0,0 +1,2752 @@ +from collections import OrderedDict, abc, deque +import datetime as dt +from datetime import datetime +from decimal import Decimal +from io import StringIO +from itertools import combinations +from warnings import 
catch_warnings + +import dateutil +import numpy as np +from numpy.random import randn +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + read_csv, +) +import pandas._testing as tm +from pandas.core.arrays import SparseArray +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.tests.extension.decimal import to_decimal + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for concat and DataFrame.append.""" + return request.param + + +class TestConcatAppendCommon: + """ + Test common dtype coercion rules between concat and append. + """ + + def setup_method(self, method): + + dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ] + tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ] + + td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ] + + period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + self.data = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": pd.Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, + } + + def _check_expected_dtype(self, obj, label): + """ + Check whether obj has expected dtype depending on label + considering not-supported dtypes + """ + if isinstance(obj, pd.Index): + if label == "bool": + assert obj.dtype == "object" + else: + assert obj.dtype == label + elif isinstance(obj, pd.Series): + if label.startswith("period"): + assert obj.dtype == "Period[M]" + else: + assert obj.dtype == label + else: + raise ValueError + + def test_dtypes(self): + # to confirm test case covers intended dtypes + for typ, vals in self.data.items(): + self._check_expected_dtype(pd.Index(vals), typ) + self._check_expected_dtype(pd.Series(vals), typ) + + def test_concatlike_same_dtypes(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + + vals2 = vals1 + vals3 = vals1 + + if typ1 == "category": + exp_data = pd.Categorical(list(vals1) + list(vals2)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) + exp = pd.Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="y") + res = i1.append(i2) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="x") + res = i1.append(i2) + exp = pd.Index(exp_data, name="x") + tm.assert_index_equal(res, exp) + + # cannot append non-index + with pytest.raises(TypeError, match="all inputs must be Index"): + pd.Index(vals1).append(vals2) + + with pytest.raises(TypeError, match="all inputs must be Index"): + 
pd.Index(vals1).append([pd.Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) + exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) + exp = pd.Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="y") + res = s1.append(s2, ignore_index=True) + exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="x") + res = s1.append(s2, ignore_index=True) + exp = pd.Series(exp_data, name="x") + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = ( + r"cannot concatenate object of type '.+';" + " only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + pd.Series(vals1).append(vals2) + + with pytest.raises(TypeError, match=msg): + pd.Series(vals1).append([pd.Series(vals2), vals3]) + + with pytest.raises(TypeError, match=msg): + pd.concat([pd.Series(vals1), vals2]) + + with pytest.raises(TypeError, match=msg): + pd.concat([pd.Series(vals1), pd.Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self): + # GH 13660 + for typ1, vals1 in self.data.items(): + for typ2, vals2 in self.data.items(): + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + continue + elif typ1 == "category" or typ2 == "category": + # ToDo: suspicious + continue + + # specify expected dtype + if typ1 == "bool" and typ2 in ("int64", "float64"): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == "bool" and typ1 in ("int64", "float64"): + exp_series_dtype = typ1 + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) + exp = pd.Index(exp_data3, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) + exp = pd.Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 
elements + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) + exp = pd.Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + def test_concatlike_common_coerce_to_pandas_object(self): + # GH 13626 + # result must be Timestamp/Timedelta, not datetime.datetime/timedelta + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ] + ) + + res = dti.append(tdi) + tm.assert_index_equal(res, exp) + assert isinstance(res[0], pd.Timestamp) + assert isinstance(res[-1], pd.Timedelta) + + dts = pd.Series(dti) + tds = pd.Series(tdi) + res = dts.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + res = pd.concat([dts, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + assert isinstance(res.iloc[0], pd.Timestamp) + assert isinstance(res.iloc[-1], pd.Timedelta) + + def test_concatlike_datetimetz(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 7795 + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz + ) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) + def test_concatlike_datetimetz_short(self, tz): + # GH#7795 + ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) + ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) + df1 = pd.DataFrame(0, index=ix1, columns=["A", "B"]) + df2 = pd.DataFrame(0, index=ix2, columns=["A", "B"]) + + exp_idx = pd.DatetimeIndex( + ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], + tz=tz, + ) + exp = pd.DataFrame(0, index=exp_idx, columns=["A", "B"]) + + tm.assert_frame_equal(df1.append(df2), exp) + tm.assert_frame_equal(pd.concat([df1, df2]), exp) + + def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH 13660 + + # different tz coerces to object + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + ], + dtype=object, + ) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # different tz + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + 
pd.Timestamp("2012-01-01", tz="US/Pacific"), + pd.Timestamp("2012-01-02", tz="US/Pacific"), + ], + dtype=object, + ) + + res = dti1.append(dti3) + # tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts3 = pd.Series(dti3) + res = dts1.append(dts3) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts3]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period(self): + # GH 13660 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") + + exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_diff_freq_to_object(self): + # GH 13221 + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") + + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + dtype=object, + ) + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_mixed_dt_to_object(self): + # GH 13221 + # different datetimelike + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ], + dtype=object, + ) + + res = pi1.append(tdi) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = ps1.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # inverse + exp = pd.Index( + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + ], + dtype=object, + ) + + res = tdi.append(pi1) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = tds.append(ps1) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([tds, ps1]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") + + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="category") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = pd.Series([3, 2], dtype="category") + s2 = pd.Series([2, 1], dtype="category") + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completely different categories (same dtype) => 
not-category + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series([np.nan, 1, 3, 2], dtype="category") + + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + a = pd.Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) + b = pd.Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) + result = pd.concat([a, b], ignore_index=True) + expected = pd.Series( + Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) + ) + tm.assert_series_equal(result, expected) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all values are not in category => not-category + s1 = pd.Series([3, 2], dtype="category") + s2 = pd.Series([2, 1]) + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completely different categories => not-category + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series([1, 3, 2]) + + exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # different dtype => not-category + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series(["a", "b", "c"]) + + exp = pd.Series([10, 11, np.nan, "a", "b", "c"]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(["a", "b", "c", 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = pd.Series([10, 11], dtype="category") + s2 = pd.Series([np.nan, np.nan, np.nan]) + + exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = pd.Series([1, 2, 
np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") + s3 = pd.Series([1, 2, 1, 2, np.nan]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object") + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") + s3 = pd.Series([1, 3, 4]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") + s3 = pd.Series([10, 11, 12]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = pd.Series([1, 3], dtype="category") + s2 = pd.Series([3, 4], dtype="category") + s3 = pd.Series([2, 3]) + s4 = pd.Series([2, 2], dtype="category") + s5 = pd.Series([1, np.nan]) + s6 = pd.Series([1, 3, 2], dtype="category") + + # mixed dtype, values are all in categories => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series( + pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) + ) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") + s2 = pd.Series([np.nan, 1]) + + exp = pd.Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], 
ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = pd.Series([1, np.nan], dtype="category") + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object") + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = pd.Series([np.nan, np.nan], dtype="category") + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all category nan-likes => category + s1 = pd.Series([np.nan, np.nan], dtype="category") + s2 = pd.Series([np.nan, np.nan], dtype="category") + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = pd.Series([], dtype="category") + s2 = pd.Series([1, 2], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="category") + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="object") + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype="category") + s2 = pd.Series([np.nan, np.nan]) + + # empty Series is ignored + exp = pd.Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + +class TestAppend: + def test_append(self, sort, float_frame): + mixed_frame = float_frame.copy() + mixed_frame["foo"] = "bar" + + begin_index = float_frame.index[:5] + end_index = float_frame.index[5:] + + begin_frame = float_frame.reindex(begin_index) + end_frame = float_frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + tm.assert_almost_equal(appended["A"], float_frame["A"]) + + del end_frame["A"] + partial_appended = begin_frame.append(end_frame, sort=sort) + assert "A" in partial_appended + + partial_appended = end_frame.append(begin_frame, sort=sort) + assert "A" in partial_appended + + # mixed type handling + appended = mixed_frame[:5].append(mixed_frame[5:]) + tm.assert_frame_equal(appended, mixed_frame) + + # what to test here + mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort) + mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort) + + # all equal except 'foo' column + 
tm.assert_frame_equal( + mixed_appended.reindex(columns=["A", "B", "C", "D"]), + mixed_appended2.reindex(columns=["A", "B", "C", "D"]), + ) + + def test_append_empty(self, float_frame): + empty = DataFrame() + + appended = float_frame.append(empty) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + appended = empty.append(float_frame) + tm.assert_frame_equal(float_frame, appended) + assert appended is not float_frame + + def test_append_overlap_raises(self, float_frame): + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + float_frame.append(float_frame, verify_integrity=True) + + def test_append_new_columns(self): + # see gh-6129: new columns + df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) + row = Series([5, 6, 7], index=["a", "b", "c"], name="z") + expected = DataFrame( + { + "a": {"x": 1, "y": 2, "z": 5}, + "b": {"x": 3, "y": 4, "z": 6}, + "c": {"z": 7}, + } + ) + result = df.append(row) + tm.assert_frame_equal(result, expected) + + def test_append_length0_frame(self, sort): + df = DataFrame(columns=["A", "B", "C"]) + df3 = DataFrame(index=[0, 1], columns=["A", "B"]) + df5 = df.append(df3, sort=sort) + + expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) + tm.assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] + + arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + tm.assert_frame_equal(result, expected) + + # rewrite sort fixture, since we also want to test default of None + def test_append_sorts(self, sort): + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) + + with tm.assert_produces_warning(None): + result = df1.append(df2, sort=sort) + + # for None / True + expected = pd.DataFrame( + {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, + columns=["a", "b", "c"], + ) + if sort is False: + expected = expected[["b", "a", "c"]] + tm.assert_frame_equal(result, expected) + + def test_append_different_columns(self, sort): + df = DataFrame( + { + "bools": np.random.randn(10) > 0, + "ints": np.random.randint(0, 10, 10), + "floats": np.random.randn(10), + "strings": ["foo", "bar"] * 5, + } + ) + + a = df[:5].loc[:, ["bools", "ints", "floats"]] + b = df[5:].loc[:, ["strings", "ints", "floats"]] + + appended = a.append(b, sort=sort) + assert isna(appended["strings"][0:4]).all() + assert isna(appended["bools"][5:]).all() + + def test_append_many(self, sort, float_frame): + chunks = [ + float_frame[:5], + float_frame[5:10], + float_frame[10:15], + float_frame[15:], + ] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, float_frame) + + chunks[-1] = chunks[-1].copy() + chunks[-1]["foo"] = "bar" + result = chunks[0].append(chunks[1:], sort=sort) + tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) + assert (result["foo"][15:] == "bar").all() + assert result["foo"][:15].isna().all() + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(columns=["A", "B", "C"]) + df1 = df1.set_index(["A"]) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) + df2 = df2.set_index(["A"]) + + result = 
df1.append(df2) + assert result.index.name == "A" + + indexes_can_append = [ + pd.RangeIndex(3), + pd.Index([4, 5, 6]), + pd.Index([4.5, 5.5, 6.5]), + pd.Index(list("abc")), + pd.CategoricalIndex("A B C".split()), + pd.CategoricalIndex("D E F".split(), ordered=True), + pd.IntervalIndex.from_breaks([7, 8, 9, 10]), + pd.DatetimeIndex( + [ + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12), + ] + ), + ] + + indexes_cannot_append_with_other = [ + pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) + ] + + all_indexes = indexes_can_append + indexes_cannot_append_with_other + + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) + def test_append_same_columns_type(self, index): + # GH18359 + + # df wider than ser + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + ser_index = index[:2] + ser = pd.Series([7, 8], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame( + [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + ) + tm.assert_frame_equal(result, expected) + + # ser wider than df + ser_index = index + index = index[:2] + df = pd.DataFrame([[1, 2], [4, 5]], columns=index) + ser = pd.Series([7, 8, 9], index=ser_index, name=2) + result = df.append(ser) + expected = pd.DataFrame( + [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types(self, df_columns, series_index): + # GH18359 + # See also test 'test_append_different_columns_types_raises' below + # for errors raised when appending + + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns) + ser = pd.Series([7, 8, 9], index=series_index, name=2) + + result = df.append(ser) + idx_diff = ser.index.difference(df_columns) + combined_columns = Index(df_columns.tolist()).append(idx_diff) + expected = pd.DataFrame( + [ + [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9], + ], + index=[0, 1, 2], + columns=combined_columns, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "index_cannot_append_with_other", + indexes_cannot_append_with_other, + ids=lambda x: type(x).__name__, + ) + def test_append_different_columns_types_raises( + self, index_can_append, index_cannot_append_with_other + ): + # GH18359 + # Dataframe.append will raise if MultiIndex appends + # or is appended to a different index type + # + # See also test 'test_append_different_columns_types' above for + # appending without raising. 
+ + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) + ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) + msg = ( + r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|" + ) + with pytest.raises(TypeError, match=msg): + df.append(ser) + + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other + ) + ser = pd.Series([7, 8, 9], index=index_can_append, name=2) + + with pytest.raises(TypeError, match=msg): + df.append(ser) + + def test_append_dtype_coerce(self, sort): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + + df1 = DataFrame( + index=[1, 2], + data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], + columns=["start_time"], + ) + df2 = DataFrame( + index=[4, 5], + data=[ + [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], + ], + columns=["start_time", "end_time"], + ) + + expected = concat( + [ + Series( + [ + pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10), + ], + name="end_time", + ), + Series( + [ + dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0), + ], + name="start_time", + ), + ], + axis=1, + sort=sort, + ) + result = df1.append(df2, ignore_index=True, sort=sort) + if sort: + expected = expected[["end_time", "start_time"]] + else: + expected = expected[["start_time", "end_time"]] + + tm.assert_frame_equal(result, expected) + + def test_append_missing_column_proper_upcast(self, sort): + df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) + df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) + + appended = df1.append(df2, ignore_index=True, sort=sort) + assert appended["A"].dtype == "f8" + assert appended["B"].dtype == "O" + + def test_append_empty_frame_to_series_with_dateutil_tz(self): + # GH 23682 + date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) + s = Series({"date": date, "a": 1.0, "b": 2.0}) + df = DataFrame(columns=["c", "d"]) + result = df.append(s, ignore_index=True) + # n.b. it's not clear to me that expected is correct here. + # It's possible that the `date` column should have + # datetime64[ns, tz] dtype for both result and expected. + # that would be more consistent with new columns having + # their own dtype (float for a and b, datetime64ns, tz for date). + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]], + columns=["c", "d", "a", "b", "date"], + dtype=object, + ) + # These columns get cast to object after append + expected["a"] = expected["a"].astype(float) + expected["b"] = expected["b"].astype(float) + tm.assert_frame_equal(result, expected) + + +class TestConcatenate: + def test_concat_copy(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) + df3 = DataFrame({5: "foo"}, index=range(4)) + + # These are actual copies. + result = concat([df, df2, df3], axis=1, copy=True) + + for b in result._data.blocks: + assert b.values.base is None + + # These are the same. 
+ result = concat([df, df2, df3], axis=1, copy=False) + + for b in result._data.blocks: + if b.is_float: + assert b.values.base is df._data.blocks[0].values.base + elif b.is_integer: + assert b.values.base is df2._data.blocks[0].values.base + elif b.is_object: + assert b.values.base is not None + + # Float block was consolidated. + df4 = DataFrame(np.random.randn(4, 1)) + result = concat([df, df2, df3, df4], axis=1, copy=False) + for b in result._data.blocks: + if b.is_float: + assert b.values.base is None + elif b.is_integer: + assert b.values.base is df2._data.blocks[0].values.base + elif b.is_object: + assert b.values.base is not None + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] + ) + expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] + level = ["three", "two", "one", "zero"] + result = concat( + pieces, + axis=1, + keys=["one", "two", "three"], + levels=[level], + names=["group_key"], + ) + + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) + tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) + + assert result.columns.names == ["group_key", None] + + def test_concat_dataframe_keys_bug(self, sort): + t1 = DataFrame( + {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} + ) + t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) + + # it works + result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) + assert list(result.columns) == [("t1", "value"), ("t2", "value")] + + def test_concat_series_partial_columns_names(self): + # GH10698 + foo = Series([1, 2], name="foo") + bar = Series([1, 2]) + baz = Series([4, 5]) + + result = concat([foo, bar, baz], axis=1) + expected = DataFrame( + {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] + ) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) + expected = DataFrame( + {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, + columns=["red", "blue", "yellow"], + ) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, ignore_index=True) + expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) + tm.assert_frame_equal(result, expected) + + def test_concat_dict(self): + frames = { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": 
DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } + + sorted_keys = list(frames.keys()) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) + tm.assert_frame_equal(result, expected) + + keys = ["baz", "foo", "bar"] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_ignore_index(self, sort): + frame1 = DataFrame( + {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} + ) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) + + nan = np.nan + expected = DataFrame( + [ + [nan, nan, nan, 4.3], + ["a", 1, 4.5, 5.2], + ["b", 2, 3.2, 2.2], + ["c", 3, 1.2, nan], + ], + index=Index(["q", "x", "y", "z"]), + ) + if not sort: + expected = expected.loc[["x", "y", "z", "q"]] + + tm.assert_frame_equal(v1, expected) + + def test_concat_multiindex_with_keys(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + result = concat([frame, frame], keys=[0, 1], names=["iteration"]) + + assert result.index.names == ("iteration",) + index.names + tm.assert_frame_equal(result.loc[0], frame) + tm.assert_frame_equal(result.loc[1], frame) + assert result.index.nlevels == 3 + + def test_concat_multiindex_with_tz(self): + # GH 6606 + df = DataFrame( + { + "dt": [ + datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3), + ], + "b": ["A", "B", "C"], + "c": [1, 2, 3], + "d": [4, 5, 6], + } + ) + df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + df = df.set_index(["dt", "b"]) + + exp_idx1 = DatetimeIndex( + ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" + ) + exp_idx2 = Index(["A", "B", "C"] * 2, name="b") + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame( + {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] + ) + + result = concat([df, df]) + tm.assert_frame_equal(result, expected) + + def test_concat_multiindex_with_none_in_index_names(self): + # GH 15787 + index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) + df = pd.DataFrame({"col": range(5)}, index=index, dtype=np.int32) + + result = concat([df, df], keys=[1, 2], names=["level2"]) + index = pd.MultiIndex.from_product( + [[1, 2], [1], range(5)], names=["level2", "level1", None] + ) + expected = pd.DataFrame( + {"col": list(range(5)) * 2}, index=index, dtype=np.int32 + ) + tm.assert_frame_equal(result, expected) + + result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) + level2 = [1] * 5 + [2] * 2 + level1 = [1] * 7 + no_name = list(range(5)) + list(range(2)) + tuples = list(zip(level2, level1, no_name)) + index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) + expected = pd.DataFrame({"col": no_name}, index=index, dtype=np.int32) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_and_levels(self): + df = 
DataFrame(np.random.randn(1, 3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [["foo", "baz"], ["one", "two"]] + names = ["first", "second"] + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + names=names, + ) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex( + levels=levels + [[0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], + names=names + [None], + ) + expected.index = exp_index + + tm.assert_frame_equal(result, expected) + + # no names + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + ) + assert result.index.names == (None,) * 3 + + # no levels + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + names=["first", "second"], + ) + assert result.index.names == ("first", "second", None) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + + msg = "Values not found in passed level" + with pytest.raises(ValueError, match=msg): + concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) + + msg = "Key one not in level" + with pytest.raises(ValueError, match=msg): + concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) + + def test_concat_rename_index(self): + a = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_a"), + ) + b = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_b"), + ) + + result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) + + exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) + names = list(exp.index.names) + names[1] = "lvl1" + exp.index.set_names(names, inplace=True) + + tm.assert_frame_equal(result, exp) + assert result.index.names == exp.index.names + + def test_crossed_dtypes_weird_corner(self): + columns = ["A", "B", "C", "D"] + df1 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="f8"), + "B": np.array([1, 2, 3, 4], dtype="i8"), + "C": np.array([1, 2, 3, 4], dtype="f8"), + "D": np.array([1, 2, 3, 4], dtype="i8"), + }, + columns=columns, + ) + + df2 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="i8"), + "B": np.array([1, 2, 3, 4], dtype="f8"), + "C": np.array([1, 2, 3, 4], dtype="i8"), + "D": np.array([1, 2, 3, 4], dtype="f8"), + }, + columns=columns, + ) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame( + np.concatenate([df1.values, df2.values], axis=0), columns=columns + ) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) + assert result.index.names == ("first", "second") + + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame( + np.random.randint(0, 10, size=40).reshape(10, 4), + columns=["A", "A", "C", "C"], + ) + + result = concat([df, df], axis=1) + tm.assert_frame_equal(result.iloc[:, :4], df) + tm.assert_frame_equal(result.iloc[:, 4:], df) + + result = concat([df, df], axis=0) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) + + # multi dtypes + df = concat( + [ + 
DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + + result = concat([df, df], axis=1) + tm.assert_frame_equal(result.iloc[:, :6], df) + tm.assert_frame_equal(result.iloc[:, 6:], df) + + result = concat([df, df], axis=0) + tm.assert_frame_equal(result.iloc[:10], df) + tm.assert_frame_equal(result.iloc[10:], df) + + # append + result = df.iloc[0:8, :].append(df.iloc[8:]) + tm.assert_frame_equal(result, df) + + result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) + tm.assert_frame_equal(result, df) + + expected = concat([df, df], axis=0) + result = df.append(df) + tm.assert_frame_equal(result, expected) + + def test_with_mixed_tuples(self, sort): + # 10697 + # columns have mixed tuples, so handle properly + df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) + df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) + + # it works + concat([df1, df2], sort=sort) + + def test_handle_empty_objects(self, sort): + df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) + + baz = df[:5].copy() + baz["foo"] = "bar" + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0, sort=sort) + + expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) + expected["foo"] = expected["foo"].astype("O") + expected.loc[0:4, "foo"] = "bar" + + tm.assert_frame_equal(concatted, expected) + + # empty as first element with time series + # GH3259 + df = DataFrame( + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") + ) + empty = DataFrame() + result = concat([df, empty], axis=1) + tm.assert_frame_equal(result, df) + result = concat([empty, df], axis=1) + tm.assert_frame_equal(result, df) + + result = concat([df, empty]) + tm.assert_frame_equal(result, df) + result = concat([empty, df]) + tm.assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index = date_range("01-Jan-2013", periods=10, freq="H") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] + ) + result = concat([df, df], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] + ) + result = concat([s1, s2], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) + result = concat([s1, s2, s1], axis=1) + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] + ) + result = concat([s1, df, s2, s2, s1], axis=1) + tm.assert_frame_equal(result, expected) + + # with names + s1.name = "foo" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] + ) + result = concat([s1, df, s2], axis=1) + tm.assert_frame_equal(result, expected) + + s2.name = "bar" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] + ) + result = concat([s1, df, s2], axis=1) + tm.assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) + result = concat([s1, df, s2], axis=1, 
ignore_index=True) + tm.assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame( + np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] + ) + result = concat([s1, df, s2]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) + result = concat([s1, df, s2], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_empty_dtype_coerce(self): + + # xref to #12411 + # xref to #12045 + # xref to #11594 + # see below + + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) + df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) + result = concat([df1, df2]) + expected = df1.dtypes + tm.assert_series_equal(result.dtypes, expected) + + def test_dtype_coerceion(self): + + # 12411 + df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) + + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + # 12045 + import datetime + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + # 11594 + df = DataFrame({"text": ["some words"] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) + + def test_concat_series(self): + + ts = tm.makeTimeSeries() + ts.name = "foo" + + pieces = [ts[:5], ts[5:15], ts[15:]] + + result = concat(pieces) + tm.assert_series_equal(result, ts) + assert result.name == ts.name + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) + + exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self, sort=sort): + ts = tm.makeTimeSeries() + + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + tm.assert_frame_equal(result, expected) + + result = concat(pieces, keys=["A", "B", "C"], axis=1) + expected = DataFrame(pieces, index=["A", "B", "C"]).T + tm.assert_frame_equal(result, expected) + + # preserve series names, #2489 + s = Series(randn(5), name="A") + s2 = Series(randn(5), name="B") + + result = concat([s, s2], axis=1) + expected = DataFrame({"A": s, "B": s2}) + tm.assert_frame_equal(result, expected) + + s2.name = None + result = concat([s, s2], axis=1) + tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) + + # must reindex, #2603 + s = Series(randn(3), index=["c", "a", "b"], name="A") + s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") + result = concat([s, s2], axis=1, sort=sort) + expected = DataFrame({"A": s, "B": s2}) + tm.assert_frame_equal(result, expected) + + def test_concat_series_axis1_names_applied(self): + # ensure names argument is not ignored on axis=1, #23490 + s = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + result = concat([s, s2], axis=1, keys=["a", "b"], names=["A"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], columns=pd.Index(["a", "b"], name="A") + ) + tm.assert_frame_equal(result, expected) + + result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], + columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], 
names=["A", "B"]), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=["foo"]) + expected = concat([df, df], keys=["foo", "bar"]) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + with pytest.raises(ValueError, match="All objects passed were None"): + concat([None, None]) + + def test_concat_datetime64_block(self): + from pandas.core.indexes.datetimes import date_range + + rng = date_range("1/1/2000", periods=10) + + df = DataFrame({"time": rng}) + + result = concat([df, df]) + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() + + def test_concat_timedelta64_block(self): + from pandas import to_timedelta + + rng = to_timedelta(np.arange(10), unit="s") + + df = DataFrame({"time": rng}) + + result = concat([df, df]) + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() + + def test_concat_keys_with_none(self): + # #1649 + df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) + + result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) + expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) + tm.assert_frame_equal(result, expected) + + result = concat( + [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] + ) + expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) + tm.assert_frame_equal(result, expected) + + def test_concat_bug_1719(self): + ts1 = tm.makeTimeSeries() + ts2 = tm.makeTimeSeries()[::2] + + # to join with union + # these two are of different length! 
+ left = concat([ts1, ts2], join="outer", axis=1) + right = concat([ts2, ts1], join="outer", axis=1) + + assert len(left) == len(right) + + def test_concat_bug_2972(self): + ts0 = Series(np.zeros(5)) + ts1 = Series(np.ones(5)) + ts0.name = ts1.name = "same name" + result = concat([ts0, ts1], axis=1) + + expected = DataFrame({0: ts0, 1: ts1}) + expected.columns = ["same name", "same name"] + tm.assert_frame_equal(result, expected) + + def test_concat_bug_3602(self): + + # GH 3602, duplicate columns + df1 = DataFrame( + { + "firmNo": [0, 0, 0, 0], + "prc": [6, 6, 6, 6], + "stringvar": ["rrr", "rrr", "rrr", "rrr"], + } + ) + df2 = DataFrame( + {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} + ) + expected = DataFrame( + [ + [0, 6, "rrr", 9, 1, 6], + [0, 6, "rrr", 10, 2, 6], + [0, 6, "rrr", 11, 3, 6], + [0, 6, "rrr", 12, 4, 6], + ] + ) + expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] + + result = concat([df1, df2], axis=1) + tm.assert_frame_equal(result, expected) + + def test_concat_inner_join_empty(self): + # GH 15328 + df_empty = pd.DataFrame() + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + df_expected = pd.DataFrame({"a": []}, index=[], dtype="int64") + + for how, expected in [("inner", df_expected), ("outer", df_a)]: + result = pd.concat([df_a, df_empty], axis=1, join=how) + tm.assert_frame_equal(result, expected) + + def test_concat_series_axis1_same_names_ignore_index(self): + dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] + s1 = Series(randn(len(dates)), index=dates, name="value") + s2 = Series(randn(len(dates)), index=dates, name="value") + + result = concat([s1, s2], axis=1, ignore_index=True) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) + + def test_concat_iterables(self): + # GH8645 check concat works with tuples, list, generators, and weird + # stuff like deque and custom iterables + df1 = DataFrame([1, 2, 3]) + df2 = DataFrame([4, 5, 6]) + expected = DataFrame([1, 2, 3, 4, 5, 6]) + tm.assert_frame_equal(concat((df1, df2), ignore_index=True), expected) + tm.assert_frame_equal(concat([df1, df2], ignore_index=True), expected) + tm.assert_frame_equal( + concat((df for df in (df1, df2)), ignore_index=True), expected + ) + tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) + + class CustomIterator1: + def __len__(self) -> int: + return 2 + + def __getitem__(self, index): + try: + return {0: df1, 1: df2}[index] + except KeyError: + raise IndexError + + tm.assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) + + class CustomIterator2(abc.Iterable): + def __iter__(self): + yield df1 + yield df2 + + tm.assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) + + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = tm.makeCustomDataframe(10, 2) + msg = ( + "cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid" + ) + for obj in [1, dict(), [1, 2], (1, 2)]: + with pytest.raises(TypeError, match=msg.format(type(obj))): + concat([df1, obj]) + + def test_concat_invalid_first_argument(self): + df1 = tm.makeCustomDataframe(10, 2) + df2 = tm.makeCustomDataframe(10, 2) + msg = ( + "first argument must be an iterable of pandas " + 'objects, you passed an object of type "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + concat(df1, df2) + + # generator ok though + concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) + + # text 
reader ok + # GH6583 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. + x = Series( + date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") + ) + y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = pd.Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) + y = pd.Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) + y[:] = pd.NaT + expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = pd.Series(pd.NaT, index=range(4), dtype="datetime64[ns]") + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_tz_frame(self): + df2 = DataFrame( + dict( + A=pd.Timestamp("20130102", tz="US/Eastern"), + B=pd.Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + tm.assert_frame_equal(df2, df3) + + def test_concat_tz_series(self): + # gh-11755: tz and no tz + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(date_range("2012-01-01", "2012-01-02")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # gh-11887: concat tz and object + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(["a", "b"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # see gh-12217 and gh-12306 + # Concatenating two UTC times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize("UTC") + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize("UTC") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, UTC]" + + # Concatenating two London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize("Europe/London") + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + # Concatenating 2+1 London times + first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first[0] = first[0].dt.tz_localize("Europe/London") + + second = pd.DataFrame([[datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + # Concat'ing 1+2 London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] 
= first[0].dt.tz_localize("Europe/London") + + second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize("Europe/London") + + result = pd.concat([first, second]) + assert result[0].dtype == "datetime64[ns, Europe/London]" + + def test_concat_tz_series_with_datetimelike(self): + # see gh-12620: tz and timedelta + x = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-02-01", tz="US/Eastern"), + ] + y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) + + # tz and period + y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) + + def test_concat_tz_series_tzlocal(self): + # see gh-13583 + x = [ + pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), + ] + y = [ + pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), + ] + + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y)) + assert result.dtype == "datetime64[ns, tzlocal()]" + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) + def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): + # GH 12396 + + # tz-naive + first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( + lambda x: x.dt.tz_localize(tz1) + ) + second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) + + result = pd.concat([first, second], axis=0) + expected = pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) + if tz1 != tz2: + expected = expected.astype(object) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): + # GH 12396 + + first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) + second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize(tz2), columns=[1]) + expected = pd.DataFrame( + { + 0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), + 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), + } + ) + result = pd.concat([first, second], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): + # GH 12396 + + # tz-naive + first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) + second = pd.DataFrame( + [ + [pd.Timestamp("2015/01/01", tz=tz2)], + [pd.Timestamp("2016/01/01", tz=tz2)], + ], + index=[2, 3], + ) + + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz2), + pd.Timestamp("2016/01/01", tz=tz2), + ] + ) + if tz1 != tz2: + expected = expected.astype(object) + + result = pd.concat([first, second]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_concat_NaT_dataframes(self, tz): + # GH 12396 + + first = pd.DataFrame([[pd.NaT], [pd.NaT]]) + first = first.apply(lambda x: x.dt.tz_localize(tz)) + second = pd.DataFrame( + 
[[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], + index=[2, 3], + ) + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz), + pd.Timestamp("2016/01/01", tz=tz), + ] + ) + + result = pd.concat([first, second], axis=0) + tm.assert_frame_equal(result, expected) + + def test_concat_period_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_period_multiple_freq_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + def test_concat_period_other_series(self): + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + # non-period + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(["A", "B"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + assert result.dtype == "object" + + def test_concat_empty_series(self): + # GH 11082 + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y", dtype="float64") + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = pd.Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name=None, dtype="float64") + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame( + {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=["x", 0], + index=pd.Index([0, 1, 2], dtype="O"), + ) + tm.assert_frame_equal(res, exp) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("values", [[], [1, 2, 3]]) + def test_concat_empty_series_timelike(self, tz, values): + # GH 18447 + + first = Series([], dtype="M8[ns]").dt.tz_localize(tz) + dtype = None if values else np.float64 + second = Series(values, dtype=dtype) + + expected = DataFrame( + { + 0: pd.Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), + 1: values, + } + ) + result = concat([first, second], axis=1) + tm.assert_frame_equal(result, expected) + + def test_default_index(self): + # is_series and ignore_index + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series([4, 5, 6], name="y") + res = pd.concat([s1, s2], 
axis=1, ignore_index=True) + assert isinstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result have + # RangeIndex (default index) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + # is_series and all inputs have no names + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + assert isinstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + # is_dataframe and ignore_index + df1 = pd.DataFrame({"A": [1, 2], "B": [5, 6]}) + df2 = pd.DataFrame({"A": [3, 4], "B": [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) + + def test_concat_multiindex_rangeindex(self): + # GH13542 + # when multi-index levels are RangeIndex objects + # there is a bug in concat with objects of len 1 + + df = DataFrame(np.random.randn(9, 2)) + df.index = MultiIndex( + levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], + ) + + res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) + exp = df.iloc[[2, 3, 4, 5], :] + tm.assert_frame_equal(res, exp) + + def test_concat_multiindex_dfs_with_deepcopy(self): + # GH 9967 + from copy import deepcopy + + example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) + example_dataframe1 = pd.DataFrame([0], index=example_multiindex1) + + example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) + example_dataframe2 = pd.DataFrame([1], index=example_multiindex2) + + example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} + expected_index = pd.MultiIndex( + levels=[["s1", "s2"], ["a"], ["b", "c"]], + codes=[[0, 1], [0, 0], [0, 1]], + names=["testname", None, None], + ) + expected = pd.DataFrame([[0], [1]], index=expected_index) + result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) + tm.assert_frame_equal(result_copy, expected) + result_no_copy = pd.concat(example_dict, names=["testname"]) + tm.assert_frame_equal(result_no_copy, expected) + + def test_categorical_concat_append(self): + cat = Categorical(["a", "b"], categories=["a", "b"]) + vals = [1, 2] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) + vals2 = [1, 2, 1, 2] + exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) + + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) + + # GH 13524 can concat different categories + cat3 = Categorical(["a", "b"], categories=["a", "b", "c"]) + vals3 = [1, 2] + df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) + + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) + + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) + + def test_categorical_concat_dtypes(self): + + # GH8143 + index = ["cat", "obj", "num"] + cat = Categorical(["a", "b", 
"c"]) + obj = Series(["a", "b", "c"]) + num = Series([1, 2, 3]) + df = pd.concat([Series(cat), obj, num], axis=1, keys=index) + + result = df.dtypes == "object" + expected = Series([False, True, False], index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == "int64" + expected = Series([False, False, True], index=index) + tm.assert_series_equal(result, expected) + + result = df.dtypes == "category" + expected = Series([True, False, False], index=index) + tm.assert_series_equal(result, expected) + + def test_categorical_concat(self, sort): + # See GH 10177 + df1 = DataFrame( + np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] + ) + + df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) + + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2["h"] = Series(Categorical(cat_values)) + + res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) + exp = DataFrame( + { + "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + "b": [ + 1, + 4, + 7, + 10, + 13, + 16, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + "h": [None] * 6 + cat_values, + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_concat_gh7864(self): + # GH 7864 + # make sure ordering is preserved + df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) + df["grade"] = Categorical(df["raw_grade"]) + df["grade"].cat.set_categories(["e", "a", "b"]) + + df1 = df[0:3] + df2 = df[3:] + + tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) + + dfx = pd.concat([df1, df2]) + tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) + + dfa = df1.append(df2) + tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) + + def test_categorical_concat_preserve(self): + + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories + s = Series(list("abc"), dtype="category") + s2 = Series(list("abd"), dtype="category") + + exp = Series(list("abcabd")) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), dtype="category") + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) + + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) + res = pd.concat([df2, df2]) + exp = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ) + tm.assert_frame_equal(res, exp) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) + + df2 = DataFrame( + {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} + ).set_index("B") + result = pd.concat([df2, df2]) + expected = DataFrame( + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + tm.assert_frame_equal(result, expected) + + # wrong categories + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") + msg = "categories must match existing categories when appending" + with pytest.raises(TypeError, 
match=msg): + pd.concat([df2, df3]) + + def test_concat_categoricalindex(self): + # GH 16111, categories that aren't lexsorted + categories = [9, 0, 1, 2, 3] + + a = pd.Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = pd.Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = pd.Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) + + result = pd.concat([a, b, c], axis=1) + + exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) + exp = pd.DataFrame( + { + 0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3], + }, + columns=[0, 1, 2], + index=exp_idx, + ) + tm.assert_frame_equal(result, exp) + + def test_concat_order(self): + # GH 17344 + dfs = [pd.DataFrame(index=range(3), columns=["a", 1, None])] + dfs += [ + pd.DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100) + ] + + result = pd.concat(dfs, sort=True).columns + expected = dfs[0].columns + tm.assert_index_equal(result, expected) + + def test_concat_datetime_timezone(self): + # GH 18523 + idx1 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1) + df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2) + result = pd.concat([df1, df2], axis=1) + + exp_idx = ( + DatetimeIndex( + [ + "2011-01-01 00:00:00+01:00", + "2011-01-01 01:00:00+01:00", + "2011-01-01 02:00:00+01:00", + ], + freq="H", + ) + .tz_convert("UTC") + .tz_convert("Europe/Paris") + ) + + expected = pd.DataFrame( + [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] + ) + + tm.assert_frame_equal(result, expected) + + idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + df3 = pd.DataFrame({"b": [1, 2, 3]}, index=idx3) + result = pd.concat([df1, df3], axis=1) + + exp_idx = DatetimeIndex( + [ + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", + ] + ) + + expected = pd.DataFrame( + [ + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], + ], + index=exp_idx, + columns=["a", "b"], + ) + + tm.assert_frame_equal(result, expected) + + # GH 13783: Concat after resample + result = pd.concat( + [df1.resample("H").mean(), df2.resample("H").mean()], sort=True + ) + expected = pd.DataFrame( + {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, + index=idx1.append(idx1), + ) + tm.assert_frame_equal(result, expected) + + def test_concat_different_extension_dtypes_upcasts(self): + a = pd.Series(pd.core.arrays.integer_array([1, 2])) + b = pd.Series(to_decimal([1, 2])) + + result = pd.concat([a, b], ignore_index=True) + expected = pd.Series([1, 2, Decimal(1), Decimal(2)], dtype=object) + tm.assert_series_equal(result, expected) + + def test_concat_odered_dict(self): + # GH 21510 + expected = pd.concat( + [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] + ) + result = pd.concat( + OrderedDict( + [("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))] + ) + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["float"]) +def test_concat_no_unnecessary_upcast(dt, pdt): + # GH 13247 + dims = pdt(dtype=object).ndim + + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + 
pdt(np.array([5], dtype=dt, ndmin=dims)), + ] + x = pd.concat(dfs) + assert x.values.dtype == dt + + +@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["int"]) +def test_concat_will_upcast(dt, pdt): + with catch_warnings(record=True): + dims = pdt().ndim + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] + x = pd.concat(dfs) + assert x.values.dtype == "float64" + + +def test_concat_empty_and_non_empty_frame_regression(): + # GH 18178 regression test + df1 = pd.DataFrame({"foo": [1]}) + df2 = pd.DataFrame({"foo": []}) + expected = pd.DataFrame({"foo": [1.0]}) + result = pd.concat([df1, df2]) + tm.assert_frame_equal(result, expected) + + +def test_concat_empty_and_non_empty_series_regression(): + # GH 18187 regression test + s1 = pd.Series([1]) + s2 = pd.Series([], dtype=object) + + expected = s1 + result = pd.concat([s1, s2]) + tm.assert_series_equal(result, expected) + + +def test_concat_sorts_columns(sort): + # GH-4588 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) + + # for sort=True/None + expected = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, + columns=["a", "b", "c"], + ) + + if sort is False: + expected = expected[["b", "a", "c"]] + + # default + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], ignore_index=True, sort=sort) + tm.assert_frame_equal(result, expected) + + +def test_concat_sorts_index(sort): + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) + df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) + + # For True/None + expected = pd.DataFrame( + {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] + ) + if sort is False: + expected = expected.loc[["c", "a", "b"]] + + # Warn and sort by default + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], axis=1, sort=sort) + tm.assert_frame_equal(result, expected) + + +def test_concat_inner_sort(sort): + # https://github.com/pandas-dev/pandas/pull/20613 + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) + df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) + + with tm.assert_produces_warning(None): + # unset sort should *not* warn for inner join + # since that never sorted + result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) + + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) + if sort is True: + expected = expected[["a", "b"]] + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort(): + # GH-4588 + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) + result = pd.concat([df, df], sort=True, ignore_index=True) + expected = pd.DataFrame( + {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) + expected = expected[["b", "c"]] + tm.assert_frame_equal(result, expected) + + +def test_concat_aligned_sort_does_not_raise(): + # GH-4588 + # We catch TypeErrors from sorting internally and do not re-raise. 
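# Descriptive note on the test below (not part of the upstream file): the frame's
# columns [1, "a"] mix int and str labels, so sorting them would need the comparison
# 1 < "a", which raises TypeError on Python 3; concat catches that error internally
# and keeps the original column order even though sort=True is passed.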
+ df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) + expected = pd.DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) + result = pd.concat([df, df], ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) +def test_concat_series_name_npscalar_tuple(s1name, s2name): + # GH21015 + s1 = pd.Series({"a": 1, "b": 2}, name=s1name) + s2 = pd.Series({"c": 5, "d": 6}, name=s2name) + result = pd.concat([s1, s2]) + expected = pd.Series({"a": 1, "b": 2, "c": 5, "d": 6}) + tm.assert_series_equal(result, expected) + + +def test_concat_categorical_tz(): + # GH-23816 + a = pd.Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) + b = pd.Series(["a", "b"], dtype="category") + result = pd.concat([a, b], ignore_index=True) + expected = pd.Series( + [ + pd.Timestamp("2017-01-01", tz="US/Pacific"), + pd.Timestamp("2017-01-02", tz="US/Pacific"), + "a", + "b", + ] + ) + tm.assert_series_equal(result, expected) + + +def test_concat_categorical_unchanged(): + # GH-12007 + # test fix for when concat on categorical and float + # coerces dtype categorical -> float + df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A")) + ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B") + result = pd.concat([df, ser], axis=1) + expected = pd.DataFrame( + { + "A": pd.Series(["a", "b", "c", np.nan], dtype="category"), + "B": pd.Series([0, 1, np.nan, 2], dtype="float"), + } + ) + tm.assert_equal(result, expected) + + +def test_concat_datetimeindex_freq(): + # GH 3232 + # Monotonic index result + dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") + data = list(range(100)) + expected = pd.DataFrame(data, index=dr) + result = pd.concat([expected[:50], expected[50:]]) + tm.assert_frame_equal(result, expected) + + # Non-monotonic index result + result = pd.concat([expected[50:], expected[:50]]) + expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) + expected.index._data.freq = None + tm.assert_frame_equal(result, expected) + + +def test_concat_empty_df_object_dtype(): + # GH 9149 + df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) + df_2 = pd.DataFrame(columns=df_1.columns) + result = pd.concat([df_1, df_2], axis=0) + expected = df_1.astype(object) + tm.assert_frame_equal(result, expected) + + +def test_concat_sparse(): + # GH 23557 + a = pd.Series(SparseArray([0, 1, 2])) + expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( + pd.SparseDtype(np.int64, 0) + ) + result = pd.concat([a, a], axis=1) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_cut.py b/venv/Lib/site-packages/pandas/tests/reshape/test_cut.py new file mode 100644 index 0000000..13b6f05 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_cut.py @@ -0,0 +1,614 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, + to_datetime, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT +import pandas.core.reshape.tile as tmod + + +def test_simple(): + data = np.ones(5, dtype="int64") + result = cut(data, 4, labels=False) + + expected = np.array([1, 1, 1, 1, 1]) + tm.assert_numpy_array_equal(result, expected, 
check_dtype=False) + + +def test_bins(): + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1]) + result, bins = cut(data, 3, retbins=True) + + intervals = IntervalIndex.from_breaks(bins.round(3)) + intervals = intervals.take([0, 0, 0, 1, 2, 0]) + expected = Categorical(intervals, ordered=True) + + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) + + +def test_right(): + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=True, retbins=True) + + intervals = IntervalIndex.from_breaks(bins.round(3)) + expected = Categorical(intervals, ordered=True) + expected = expected.take([0, 0, 0, 2, 3, 0, 0]) + + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7])) + + +def test_no_right(): + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=False, retbins=True) + + intervals = IntervalIndex.from_breaks(bins.round(3), closed="left") + intervals = intervals.take([0, 0, 0, 2, 3, 0, 1]) + expected = Categorical(intervals, ordered=True) + + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095])) + + +def test_array_like(): + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] + result, bins = cut(data, 3, retbins=True) + + intervals = IntervalIndex.from_breaks(bins.round(3)) + intervals = intervals.take([0, 0, 0, 1, 2, 0]) + expected = Categorical(intervals, ordered=True) + + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) + + +def test_bins_from_interval_index(): + c = cut(range(5), 3) + expected = c + result = cut(range(5), bins=expected.categories) + tm.assert_categorical_equal(result, expected) + + expected = Categorical.from_codes( + np.append(c.codes, -1), categories=c.categories, ordered=True + ) + result = cut(range(6), bins=expected.categories) + tm.assert_categorical_equal(result, expected) + + +def test_bins_from_interval_index_doc_example(): + # Make sure we preserve the bins. 
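# Descriptive note on the test below: the ages are first bucketed with explicit
# edges [0, 18, 35, 70]; feeding the resulting c.categories (an IntervalIndex)
# back into cut() reuses exactly those intervals, so 25, 20 and 50 fall into
# (18, 35], (18, 35] and (35, 70], i.e. codes [1, 1, 2] in the final assertion.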
+ ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) + c = cut(ages, bins=[0, 18, 35, 70]) + expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)]) + tm.assert_index_equal(c.categories, expected) + + result = cut([25, 20, 50], bins=c.categories) + tm.assert_index_equal(result.categories, expected) + tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8")) + + +def test_bins_not_overlapping_from_interval_index(): + # see gh-23980 + msg = "Overlapping IntervalIndex is not accepted" + ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)]) + + with pytest.raises(ValueError, match=msg): + cut([5, 6], bins=ii) + + +def test_bins_not_monotonic(): + msg = "bins must increase monotonically" + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] + + with pytest.raises(ValueError, match=msg): + cut(data, [0.1, 1.5, 1, 10]) + + +@pytest.mark.parametrize( + "x, bins, expected", + [ + ( + date_range("2017-12-31", periods=3), + [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max], + IntervalIndex.from_tuples( + [ + (Timestamp.min, Timestamp("2018-01-01")), + (Timestamp("2018-01-01"), Timestamp.max), + ] + ), + ), + ( + [-1, 0, 1], + np.array( + [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64" + ), + IntervalIndex.from_tuples( + [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)] + ), + ), + ( + [np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + np.array( + [ + np.timedelta64(-np.iinfo(np.int64).max), + np.timedelta64(0), + np.timedelta64(np.iinfo(np.int64).max), + ] + ), + IntervalIndex.from_tuples( + [ + (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), + (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)), + ] + ), + ), + ], +) +def test_bins_monotonic_not_overflowing(x, bins, expected): + # GH 26045 + result = cut(x, bins) + tm.assert_index_equal(result.categories, expected) + + +def test_wrong_num_labels(): + msg = "Bin labels must be one fewer than the number of bin edges" + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] + + with pytest.raises(ValueError, match=msg): + cut(data, [0, 1, 10], labels=["foo", "bar", "baz"]) + + +@pytest.mark.parametrize( + "x,bins,msg", + [ + ([], 2, "Cannot cut empty array"), + ([1, 2, 3], 0.5, "`bins` should be a positive integer"), + ], +) +def test_cut_corner(x, bins, msg): + with pytest.raises(ValueError, match=msg): + cut(x, bins) + + +@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))]) +@pytest.mark.parametrize("cut_func", [cut, qcut]) +def test_cut_not_1d_arg(arg, cut_func): + msg = "Input array must be 1 dimensional" + with pytest.raises(ValueError, match=msg): + cut_func(arg, 2) + + +@pytest.mark.parametrize( + "data", + [ + [0, 1, 2, 3, 4, np.inf], + [-np.inf, 0, 1, 2, 3, 4], + [-np.inf, 0, 1, 2, 3, 4, np.inf], + ], +) +def test_int_bins_with_inf(data): + # GH 24314 + msg = "cannot specify integer `bins` when input data contains infinity" + with pytest.raises(ValueError, match=msg): + cut(data, bins=3) + + +def test_cut_out_of_range_more(): + # see gh-1511 + name = "x" + + ser = Series([0, -1, 0, 1, -3], name=name) + ind = cut(ser, [0, 1], labels=False) + + exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name) + tm.assert_series_equal(ind, exp) + + +@pytest.mark.parametrize( + "right,breaks,closed", + [ + (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"), + (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"), + ], +) +def test_labels(right, breaks, closed): + arr = np.tile(np.arange(0, 1.01, 0.1), 4) + + result, bins = cut(arr, 4, retbins=True, right=right) + ex_levels 
= IntervalIndex.from_breaks(breaks, closed=closed) + tm.assert_index_equal(result.categories, ex_levels) + + +def test_cut_pass_series_name_to_factor(): + name = "foo" + ser = Series(np.random.randn(100), name=name) + + factor = cut(ser, 4) + assert factor.name == name + + +def test_label_precision(): + arr = np.arange(0, 0.73, 0.01) + result = cut(arr, 4, precision=2) + + ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72]) + tm.assert_index_equal(result.categories, ex_levels) + + +@pytest.mark.parametrize("labels", [None, False]) +def test_na_handling(labels): + arr = np.arange(0, 0.75, 0.01) + arr[::3] = np.nan + + result = cut(arr, 4, labels=labels) + result = np.asarray(result) + + expected = np.where(isna(arr), np.nan, result) + tm.assert_almost_equal(result, expected) + + +def test_inf_handling(): + data = np.arange(6) + data_ser = Series(data, dtype="int64") + + bins = [-np.inf, 2, 4, np.inf] + result = cut(data, bins) + result_ser = cut(data_ser, bins) + + ex_uniques = IntervalIndex.from_breaks(bins) + tm.assert_index_equal(result.categories, ex_uniques) + + assert result[5] == Interval(4, np.inf) + assert result[0] == Interval(-np.inf, 2) + assert result_ser[5] == Interval(4, np.inf) + assert result_ser[0] == Interval(-np.inf, 2) + + +def test_cut_out_of_bounds(): + arr = np.random.randn(100) + result = cut(arr, [-1, 0, 1]) + + mask = isna(result) + ex_mask = (arr < -1) | (arr > 1) + tm.assert_numpy_array_equal(mask, ex_mask) + + +@pytest.mark.parametrize( + "get_labels,get_expected", + [ + ( + lambda labels: labels, + lambda labels: Categorical( + ["Medium"] + 4 * ["Small"] + ["Medium", "Large"], + categories=labels, + ordered=True, + ), + ), + ( + lambda labels: Categorical.from_codes([0, 1, 2], labels), + lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels), + ), + ], +) +def test_cut_pass_labels(get_labels, get_expected): + bins = [0, 25, 50, 100] + arr = [50, 5, 10, 15, 20, 30, 70] + labels = ["Small", "Medium", "Large"] + + result = cut(arr, bins, labels=get_labels(labels)) + tm.assert_categorical_equal(result, get_expected(labels)) + + +def test_cut_pass_labels_compat(): + # see gh-16459 + arr = [50, 5, 10, 15, 20, 30, 70] + labels = ["Good", "Medium", "Bad"] + + result = cut(arr, 3, labels=labels) + exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True)) + tm.assert_categorical_equal(result, exp) + + +@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10]) +def test_round_frac_just_works(x): + # It works. 
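# Smoke test: cut() into two bins must not raise either for the plain 0..10 range
# or for the same values scaled down by 1e10, where formatting the interval labels
# has to round extremely small bin edges.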
+ cut(x, 2) + + +@pytest.mark.parametrize( + "val,precision,expected", + [ + (-117.9998, 3, -118), + (117.9998, 3, 118), + (117.9998, 2, 118), + (0.000123456, 2, 0.00012), + ], +) +def test_round_frac(val, precision, expected): + # see gh-1979 + result = tmod._round_frac(val, precision=precision) + assert result == expected + + +def test_cut_return_intervals(): + ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + result = cut(ser, 3) + + exp_bins = np.linspace(0, 8, num=4).round(3) + exp_bins[0] -= 0.008 + + expected = Series( + IntervalIndex.from_breaks(exp_bins, closed="right").take( + [0, 0, 0, 1, 1, 1, 2, 2, 2] + ) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(result, expected) + + +def test_series_ret_bins(): + # see gh-8589 + ser = Series(np.arange(4)) + result, bins = cut(ser, 2, retbins=True) + + expected = Series( + IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) +def test_cut_duplicates_bin(kwargs, msg): + # see gh-20947 + bins = [0, 2, 4, 6, 10, 10] + values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"]) + + if msg is not None: + with pytest.raises(ValueError, match=msg): + cut(values, bins, **kwargs) + else: + result = cut(values, bins, **kwargs) + expected = cut(values, pd.unique(bins)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("data", [9.0, -9.0, 0.0]) +@pytest.mark.parametrize("length", [1, 2]) +def test_single_bin(data, length): + # see gh-14652, gh-15428 + ser = Series([data] * length) + result = cut(ser, 1, labels=False) + + expected = Series([0] * length) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)] +) +def test_cut_read_only(array_1_writeable, array_2_writeable): + # issue 18773 + array_1 = np.arange(0, 100, 10) + array_1.flags.writeable = array_1_writeable + + array_2 = np.arange(0, 100, 10) + array_2.flags.writeable = array_2_writeable + + hundred_elements = np.arange(100) + tm.assert_categorical_equal( + cut(hundred_elements, array_1), cut(hundred_elements, array_2) + ) + + +@pytest.mark.parametrize( + "conv", + [ + lambda v: Timestamp(v), + lambda v: to_datetime(v), + lambda v: np.datetime64(v), + lambda v: Timestamp(v).to_pydatetime(), + ], +) +def test_datetime_bin(conv): + data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")] + bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"] + + expected = Series( + IntervalIndex( + [ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), + ] + ) + ).astype(CDT(ordered=True)) + + bins = [conv(v) for v in bin_data] + result = Series(cut(data, bins=bins)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ], + np.array( + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ] + ), + DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]), + ], +) +def test_datetime_cut(data): + # see 
gh-14714 + # + # Testing time data when it comes in various collection types. + result, _ = cut(data, 3, retbins=True) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000"), + Timestamp("2013-01-01 16:00:00"), + ), + Interval( + Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00") + ), + Interval( + Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00") + ), + ] + ) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(Series(result), expected) + + +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000"), + Timestamp("2013-01-01 21:00:00"), + Timestamp("2013-01-02 13:00:00"), + Timestamp("2013-01-03 05:00:00"), + ], + ], +) +@pytest.mark.parametrize("box", [list, np.array, Index, Series]) +def test_datetime_tz_cut(bins, box): + # see gh-19872 + tz = "US/Eastern" + s = Series(date_range("20130101", periods=3, tz=tz)) + + if not isinstance(bins, int): + bins = box(bins) + + result = cut(s, bins) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(result, expected) + + +def test_datetime_nan_error(): + msg = "bins must be of datetime64 dtype" + + with pytest.raises(ValueError, match=msg): + cut(date_range("20130101", periods=3), bins=[0, 2, 4]) + + +def test_datetime_nan_mask(): + result = cut( + date_range("20130102", periods=5), bins=date_range("20130101", periods=2) + ) + + mask = result.categories.isna() + tm.assert_numpy_array_equal(mask, np.array([False])) + + mask = result.isna() + tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True])) + + +@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) +def test_datetime_cut_roundtrip(tz): + # see gh-19891 + ser = Series(date_range("20180101", periods=3, tz=tz)) + result, result_bins = cut(ser, 2, retbins=True) + + expected = cut(ser, result_bins) + tm.assert_series_equal(result, expected) + + expected_bins = DatetimeIndex( + ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"] + ) + expected_bins = expected_bins.tz_localize(tz) + tm.assert_index_equal(result_bins, expected_bins) + + +def test_timedelta_cut_roundtrip(): + # see gh-19891 + ser = Series(timedelta_range("1day", periods=3)) + result, result_bins = cut(ser, 2, retbins=True) + + expected = cut(ser, result_bins) + tm.assert_series_equal(result, expected) + + expected_bins = TimedeltaIndex( + ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"] + ) + tm.assert_index_equal(result_bins, expected_bins) + + +@pytest.mark.parametrize("bins", [6, 7]) +@pytest.mark.parametrize( + "box, compare", + [ + (Series, tm.assert_series_equal), + (np.array, tm.assert_categorical_equal), + (list, tm.assert_equal), + ], +) +def test_cut_bool_coercion_to_int(bins, box, compare): + # issue 20303 + data_expected = box([0, 1, 1, 0, 1] * 10) + data_result = box([False, True, True, False, True] * 10) + expected = cut(data_expected, bins, duplicates="drop") + result = cut(data_result, bins, duplicates="drop") + compare(result, expected) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_cut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels 
must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + cut(values, 4, labels=labels) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_melt.py b/venv/Lib/site-packages/pandas/tests/reshape/test_melt.py new file mode 100644 index 0000000..8143258 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_melt.py @@ -0,0 +1,992 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, lreshape, melt, wide_to_long +import pandas._testing as tm + + +class TestMelt: + def setup_method(self, method): + self.df = tm.makeTimeDataFrame()[:10] + self.df["id1"] = (self.df["A"] > 0).astype(np.int64) + self.df["id2"] = (self.df["B"] > 0).astype(np.int64) + + self.var_name = "var" + self.value_name = "val" + + self.df1 = pd.DataFrame( + [ + [1.067683, -1.110463, 0.20867], + [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298, -0.873361], + ] + ) + self.df1.columns = [list("ABC"), list("abc")] + self.df1.columns.names = ["CAP", "low"] + + def test_top_level_method(self): + result = melt(self.df) + assert result.columns.tolist() == ["variable", "value"] + + def test_method_signatures(self): + tm.assert_frame_equal(self.df.melt(), melt(self.df)) + + tm.assert_frame_equal( + self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]), + melt(self.df, id_vars=["id1", "id2"], value_vars=["A", "B"]), + ) + + tm.assert_frame_equal( + self.df.melt(var_name=self.var_name, value_name=self.value_name), + melt(self.df, var_name=self.var_name, value_name=self.value_name), + ) + + tm.assert_frame_equal(self.df1.melt(col_level=0), melt(self.df1, col_level=0)) + + def test_default_col_names(self): + result = self.df.melt() + assert result.columns.tolist() == ["variable", "value"] + + result1 = self.df.melt(id_vars=["id1"]) + assert result1.columns.tolist() == ["id1", "variable", "value"] + + result2 = self.df.melt(id_vars=["id1", "id2"]) + assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] + + def test_value_vars(self): + result3 = self.df.melt(id_vars=["id1", "id2"], value_vars="A") + assert len(result3) == 10 + + result4 = self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]) + expected4 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) + tm.assert_frame_equal(result4, expected4) + + def test_value_vars_types(self): + # GH 15348 + expected = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) + + for type_ in (tuple, list, np.array): + result = self.df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B"))) + tm.assert_frame_equal(result, expected) + + def test_vars_work_with_multiindex(self): + expected = DataFrame( + { + ("A", "a"): self.df1[("A", "a")], + "CAP": ["B"] * len(self.df1), + "low": ["b"] * len(self.df1), + "value": self.df1[("B", "b")], + }, + columns=[("A", "a"), "CAP", "low", "value"], + ) + + result = self.df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")]) + tm.assert_frame_equal(result, expected) + + def test_single_vars_work_with_multiindex(self): + expected = DataFrame( + { + "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, + "CAP": {0: "B", 1: "B", 2: "B"}, + 
"value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, + } + ) + result = self.df1.melt(["A"], ["B"], col_level=0) + tm.assert_frame_equal(result, expected) + + def test_tuple_vars_fail_with_multiindex(self): + # melt should fail with an informative error message if + # the columns have a MultiIndex and a tuple is passed + # for id_vars or value_vars. + tuple_a = ("A", "a") + list_a = [tuple_a] + tuple_b = ("B", "b") + list_b = [tuple_b] + + msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex" + for id_vars, value_vars in ( + (tuple_a, list_b), + (list_a, tuple_b), + (tuple_a, tuple_b), + ): + with pytest.raises(ValueError, match=msg): + self.df1.melt(id_vars=id_vars, value_vars=value_vars) + + def test_custom_var_name(self): + result5 = self.df.melt(var_name=self.var_name) + assert result5.columns.tolist() == ["var", "value"] + + result6 = self.df.melt(id_vars=["id1"], var_name=self.var_name) + assert result6.columns.tolist() == ["id1", "var", "value"] + + result7 = self.df.melt(id_vars=["id1", "id2"], var_name=self.var_name) + assert result7.columns.tolist() == ["id1", "id2", "var", "value"] + + result8 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", var_name=self.var_name + ) + assert result8.columns.tolist() == ["id1", "id2", "var", "value"] + + result9 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=self.var_name + ) + expected9 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, "value"], + ) + tm.assert_frame_equal(result9, expected9) + + def test_custom_value_name(self): + result10 = self.df.melt(value_name=self.value_name) + assert result10.columns.tolist() == ["variable", "val"] + + result11 = self.df.melt(id_vars=["id1"], value_name=self.value_name) + assert result11.columns.tolist() == ["id1", "variable", "val"] + + result12 = self.df.melt(id_vars=["id1", "id2"], value_name=self.value_name) + assert result12.columns.tolist() == ["id1", "id2", "variable", "val"] + + result13 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", value_name=self.value_name + ) + assert result13.columns.tolist() == ["id1", "id2", "variable", "val"] + + result14 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=self.value_name + ) + expected14 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", self.value_name], + ) + tm.assert_frame_equal(result14, expected14) + + def test_custom_var_and_value_name(self): + + result15 = self.df.melt(var_name=self.var_name, value_name=self.value_name) + assert result15.columns.tolist() == ["var", "val"] + + result16 = self.df.melt( + id_vars=["id1"], var_name=self.var_name, value_name=self.value_name + ) + assert result16.columns.tolist() == ["id1", "var", "val"] + + result17 = self.df.melt( + id_vars=["id1", "id2"], var_name=self.var_name, value_name=self.value_name + ) + assert result17.columns.tolist() == ["id1", "id2", "var", "val"] + + result18 = self.df.melt( + id_vars=["id1", "id2"], + value_vars="A", + var_name=self.var_name, + value_name=self.value_name, + ) + assert result18.columns.tolist() == ["id1", "id2", "var", "val"] + + result19 = self.df.melt( + id_vars=["id1", "id2"], + value_vars=["A", 
"B"], + var_name=self.var_name, + value_name=self.value_name, + ) + expected19 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, self.value_name], + ) + tm.assert_frame_equal(result19, expected19) + + df20 = self.df.copy() + df20.columns.name = "foo" + result20 = df20.melt() + assert result20.columns.tolist() == ["foo", "value"] + + def test_col_level(self): + res1 = self.df1.melt(col_level=0) + res2 = self.df1.melt(col_level="CAP") + assert res1.columns.tolist() == ["CAP", "value"] + assert res2.columns.tolist() == ["CAP", "value"] + + def test_multiindex(self): + res = self.df1.melt() + assert res.columns.tolist() == ["CAP", "low", "value"] + + @pytest.mark.parametrize( + "col", + [ + pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")), + pd.Series(["a", "b", "c", "a", "d"], dtype="category"), + pd.Series([0, 1, 0, 0, 0]), + ], + ) + def test_pandas_dtypes(self, col): + # GH 15785 + df = DataFrame( + {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col} + ) + expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True) + result = melt( + df, id_vars=["klass", "col"], var_name="attribute", value_name="value" + ) + expected = DataFrame( + { + 0: list(range(5)) * 2, + 1: pd.concat([col] * 2, ignore_index=True), + 2: ["attr1"] * 5 + ["attr2"] * 5, + 3: expected_value, + } + ) + expected.columns = ["klass", "col", "attribute", "value"] + tm.assert_frame_equal(result, expected) + + def test_preserve_category(self): + # GH 15853 + data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])}) + result = pd.melt(data, ["B"], ["A"]) + expected = DataFrame( + {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]} + ) + + tm.assert_frame_equal(result, expected) + + def test_melt_missing_columns_raises(self): + # GH-23575 + # This test is to ensure that pandas raises an error if melting is + # attempted with column names absent from the dataframe + + # Generate data + df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + + # Try to melt with missing `value_vars` column name + msg = "The following '{Var}' are not present in the DataFrame: {Col}" + with pytest.raises( + KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]") + ): + df.melt(["a", "b"], ["C", "d"]) + + # Try to melt with missing `id_vars` column name + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")): + df.melt(["A", "b"], ["c", "d"]) + + # Multiple missing + with pytest.raises( + KeyError, + match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"), + ): + df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) + + # Multiindex melt fails if column is missing from multilevel melt + multi = df.copy() + multi.columns = [list("ABCD"), list("abcd")] + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")): + multi.melt([("E", "a")], [("B", "b")]) + # Multiindex fails if column is missing from single level melt + with pytest.raises( + KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]") + ): + multi.melt(["A"], ["F"], col_level=0) + + def test_melt_mixed_int_str_id_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) + result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"]) + expected = DataFrame( + {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": 
list("bd"), "value": [1, 2]} + ) + tm.assert_frame_equal(result, expected) + + def test_melt_mixed_int_str_value_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"]}) + result = melt(df, value_vars=[0, "a"]) + expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) + tm.assert_frame_equal(result, expected) + + +class TestLreshape: + def test_pairs(self): + data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [1766, 3301, 1454, 3139, 4133], + "id": [101, 102, 103, 104, 105], + "sex": ["Male", "Female", "Female", "Female", "Female"], + "visitdt1": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + ], + "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"], + "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"], + "wt1": [1823, 3338, 1549, 3298, 4306], + "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0], + "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0], + } + + df = DataFrame(data) + + spec = { + "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 4)], + "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)], + } + result = lreshape(df, spec) + + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 1454, + 3139, + 4133, + 1766, + 3139, + 4133, + ], + "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + 3377.0, + 4805.0, + ], + } + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + ], + "id": [ + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + ], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + np.nan, + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + np.nan, + np.nan, + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + np.nan, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + np.nan, + np.nan, + 3377.0, + 4805.0, + ], + } + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + with tm.assert_produces_warning(FutureWarning): + result = lreshape(df, spec, dropna=False, label="foo") + + 
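# The label keyword of lreshape is deprecated, so the call above only needs to
# emit a FutureWarning. The spec below maps "visitdt" to two columns but "wt" to
# three, which triggers the "All column lists must be same length" ValueError.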
spec = { + "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 3)], + "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)], + } + msg = "All column lists must be same length" + with pytest.raises(ValueError, match=msg): + lreshape(df, spec) + + +class TestWideToLong: + def test_simple(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) + df["id"] = df.index + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } + expected = DataFrame(exp_data) + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + result = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(result, expected) + + def test_stubs(self): + # GH9204 + df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) + df.columns = ["id", "inc1", "inc2", "edu1", "edu2"] + stubs = ["inc", "edu"] + + # TODO: unused? + df_long = pd.wide_to_long(df, stubs, i="id", j="age") # noqa + + assert stubs == ["inc", "edu"] + + def test_separating_character(self): + # GH14779 + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame( + { + "A.1970": {0: "a", 1: "b", 2: "c"}, + "A.1980": {0: "d", 1: "e", 2: "f"}, + "B.1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B.1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) + df["id"] = df.index + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } + expected = DataFrame(exp_data) + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") + tm.assert_frame_equal(result, expected) + + def test_escapable_characters(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame( + { + "A(quarterly)1970": {0: "a", 1: "b", 2: "c"}, + "A(quarterly)1980": {0: "d", 1: "e", 2: "f"}, + "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) + df["id"] = df.index + exp_data = { + "X": x.tolist() + x.tolist(), + "A(quarterly)": ["a", "b", "c", "d", "e", "f"], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } + expected = DataFrame(exp_data) + expected = expected.set_index(["id", "year"])[ + ["X", "A(quarterly)", "B(quarterly)"] + ] + result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year") + tm.assert_frame_equal(result, expected) + + def test_unbalanced(self): + # test that we can have a varying amount of time variables + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": ["X1", "X1", "X2", "X2"], + "A": [1.0, 3.0, 2.0, 4.0], + "B": [5.0, np.nan, 6.0, np.nan], + "id": [0, 0, 1, 1], + "year": [2010, 2011, 2010, 2011], + } + expected = pd.DataFrame(exp_data) + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + result = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(result, expected) + + def test_character_overlap(self): + # Test we handle overlapping characters 
in both id_vars and value_vars + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "BBBX": [91, 92, 93], + "BBBZ": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "BBBX": [91, 92, 93, 91, 92, 93], + "BBBZ": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) + + def test_invalid_separator(self): + # if an invalid separator is supplied a empty data frame is returned + sep = "nope!" + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "A2010": [], + "A2011": [], + "B2010": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + expected = expected.set_index(["id", "year"])[ + ["X", "A2010", "A2011", "B2010", "A", "B"] + ] + expected.index.set_levels([0, 1], level=0, inplace=True) + result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) + + def test_num_string_disambiguation(self): + # Test that we can disambiguate number value_vars from + # string value_vars + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "Arating": [91, 92, 93], + "Arating_old": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "Arating": [91, 92, 93, 91, 92, 93], + "Arating_old": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[ + ["Arating", "Arating_old", "A", "B", "BB"] + ] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) + + def test_invalid_suffixtype(self): + # If all stubs names end with a string, but a numeric suffix is + # assumed, an empty data frame is returned + df = pd.DataFrame( + { + "Aone": [1.0, 2.0], + "Atwo": [3.0, 4.0], + "Bone": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "Aone": [], + "Atwo": [], + "Bone": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + + expected = expected.set_index(["id", "year"]) + expected.index.set_levels([0, 1], level=0, inplace=True) + result = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) + + def test_multiple_id_columns(self): + # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm + df = pd.DataFrame( + { + "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + "ht2": 
[3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + } + ) + expected = pd.DataFrame( + { + "ht": [ + 2.8, + 3.4, + 2.9, + 3.8, + 2.2, + 2.9, + 2.0, + 3.2, + 1.8, + 2.8, + 1.9, + 2.4, + 2.2, + 3.3, + 2.3, + 3.4, + 2.1, + 2.9, + ], + "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], + } + ) + expected = expected.set_index(["famid", "birth", "age"])[["ht"]] + result = wide_to_long(df, "ht", i=["famid", "birth"], j="age") + tm.assert_frame_equal(result, expected) + + def test_non_unique_idvars(self): + # GH16382 + # Raise an error message if non unique id vars (i) are passed + df = pd.DataFrame( + {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]} + ) + msg = "the id variables need to uniquely identify each row" + with pytest.raises(ValueError, match=msg): + wide_to_long(df, ["A_A", "B_B"], i="x", j="colname") + + def test_cast_j_int(self): + df = pd.DataFrame( + { + "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"], + "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"], + "actor_fb_likes_1": [1000.0, 40000.0, 11000.0], + "actor_fb_likes_2": [936.0, 5000.0, 393.0], + "title": ["Avatar", "Pirates of the Caribbean", "Spectre"], + } + ) + + expected = pd.DataFrame( + { + "actor": [ + "CCH Pounder", + "Johnny Depp", + "Christoph Waltz", + "Joel David Moore", + "Orlando Bloom", + "Rory Kinnear", + ], + "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], + "num": [1, 1, 1, 2, 2, 2], + "title": [ + "Avatar", + "Pirates of the Caribbean", + "Spectre", + "Avatar", + "Pirates of the Caribbean", + "Spectre", + ], + } + ).set_index(["title", "num"]) + result = wide_to_long( + df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_" + ) + + tm.assert_frame_equal(result, expected) + + def test_identical_stubnames(self): + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) + msg = "stubname can't be identical to a column name" + with pytest.raises(ValueError, match=msg): + wide_to_long(df, ["A", "B"], i="A", j="colname") + + def test_nonnumeric_suffix(self): + df = pd.DataFrame( + { + "treatment_placebo": [1.0, 2.0], + "treatment_test": [3.0, 4.0], + "result_placebo": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X2", "X2"], + "colname": ["placebo", "test", "placebo", "test"], + "result": [5.0, np.nan, 6.0, np.nan], + "treatment": [1.0, 3.0, 2.0, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_" + ) + tm.assert_frame_equal(result, expected) + + def test_mixed_type_suffix(self): + df = pd.DataFrame( + { + "A": ["X1", "X2"], + "result_1": [0, 9], + "result_foo": [5.0, 6.0], + "treatment_1": [1.0, 2.0], + "treatment_foo": [3.0, 4.0], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X2", "X1", "X2"], + "colname": ["1", "1", "foo", "foo"], + "result": [0.0, 9.0, 5.0, 6.0], + "treatment": [1.0, 2.0, 3.0, 4.0], + } + ).set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_" + ) + tm.assert_frame_equal(result, expected) + + def test_float_suffix(self): + df = pd.DataFrame( + { + "treatment_1.1": [1.0, 2.0], + "treatment_2.1": [3.0, 4.0], + "result_1.2": [5.0, 6.0], + "result_1": [0, 9], + "A": ["X1", "X2"], + 
} + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], + "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], + "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], + "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_" + ) + tm.assert_frame_equal(result, expected) + + def test_col_substring_of_stubname(self): + # GH22468 + # Don't raise ValueError when a column name is a substring + # of a stubname that's been passed as a string + wide_data = { + "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, + "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, + "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, + "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}, + } + wide_df = pd.DataFrame.from_dict(wide_data) + expected = pd.wide_to_long( + wide_df, stubnames=["PA"], i=["node_id", "A"], j="time" + ) + result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_pivot.py b/venv/Lib/site-packages/pandas/tests/reshape/test_pivot.py new file mode 100644 index 0000000..743fc50 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_pivot.py @@ -0,0 +1,2643 @@ +from datetime import date, datetime, timedelta +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Grouper, + Index, + MultiIndex, + Series, + concat, + date_range, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT +from pandas.core.reshape.pivot import crosstab, pivot_table + + +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + +@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) +def interval_values(request, closed): + left, right = request.param + return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) + + +class TestPivotTable: + def setup_method(self, method): + self.data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + def test_pivot_table(self, observed): + index = ["A", "B"] + columns = "C" + table = pivot_table( + self.data, values="D", index=index, columns=columns, observed=observed + ) + + table2 = self.data.pivot_table( + values="D", index=index, columns=columns, observed=observed + ) + tm.assert_frame_equal(table, table2) + + # this works + pivot_table(self.data, values="D", index=index, observed=observed) + + if len(index) > 1: + assert table.index.names == tuple(index) + else: + assert table.index.name == index[0] + + if len(columns) > 1: + assert table.columns.names == columns + else: + assert table.columns.name == columns[0] + + expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pivot_table_categorical_observed_equal(self, observed): + # issue 
#24923 + df = pd.DataFrame( + {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} + ) + + expected = df.pivot_table( + index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + ) + + expected.index = expected.index.astype("category") + expected.columns = expected.columns.astype("category") + + df.col1 = df.col1.astype("category") + df.col2 = df.col2.astype("category") + + result = df.pivot_table( + index="col1", + values="col3", + columns="col2", + aggfunc=np.sum, + fill_value=0, + observed=observed, + ) + + tm.assert_frame_equal(result, expected) + + def test_pivot_table_nocols(self): + df = DataFrame( + {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} + ) + rs = df.pivot_table(columns="cols", aggfunc=np.sum) + xp = df.pivot_table(index="cols", aggfunc=np.sum).T + tm.assert_frame_equal(rs, xp) + + rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) + xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T + tm.assert_frame_equal(rs, xp) + + def test_pivot_table_dropna(self): + df = DataFrame( + { + "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000}, + "customer": {0: "A", 1: "A", 2: "B", 3: "C"}, + "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310}, + "product": {0: "a", 1: "b", 2: "c", 3: "d"}, + "quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}, + } + ) + pv_col = df.pivot_table( + "quantity", "month", ["customer", "product"], dropna=False + ) + pv_ind = df.pivot_table( + "quantity", ["customer", "product"], "month", dropna=False + ) + + m = MultiIndex.from_tuples( + [ + ("A", "a"), + ("A", "b"), + ("A", "c"), + ("A", "d"), + ("B", "a"), + ("B", "b"), + ("B", "c"), + ("B", "d"), + ("C", "a"), + ("C", "b"), + ("C", "c"), + ("C", "d"), + ], + names=["customer", "product"], + ) + tm.assert_index_equal(pv_col.columns, m) + tm.assert_index_equal(pv_ind.index, m) + + def test_pivot_table_categorical(self): + + cat1 = Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True + ) + cat2 = Categorical( + ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True + ) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) + + exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) + tm.assert_frame_equal(result, expected) + + def test_pivot_table_dropna_categoricals(self, dropna): + # GH 15193 + categories = ["a", "b", "c", "d"] + + df = DataFrame( + { + "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "C": range(0, 9), + } + ) + + df["A"] = df["A"].astype(CDT(categories, ordered=False)) + result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + expected_columns = Series(["a", "b", "c"], name="A") + expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_index = Series([1, 2, 3], name="B") + expected = DataFrame( + [[0, 3, 6], [1, 4, 7], [2, 5, 8]], + index=expected_index, + columns=expected_columns, + ) + if not dropna: + # add back the non observed to compare + expected = expected.reindex(columns=Categorical(categories)).astype("float") + + tm.assert_frame_equal(result, expected) + + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + { + "A": pd.Categorical( + [np.nan, "low", "high", "low", "high"], + categories=["low", "high"], + ordered=True, + ), + "B": range(5), + } + ) + + 
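# Descriptive note on the assertion below: row 0 carries a NaN grouping key and is
# ignored by the aggregation; the remaining rows give mean(B) == 2 for "low"
# (rows 1 and 3) and mean(B) == 3 for "high" (rows 2 and 4), and both categories
# are kept in the index regardless of the dropna flag.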
result = df.pivot_table(index="A", values="B", dropna=dropna) + expected = pd.DataFrame( + {"B": [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes( + [0, 1], categories=["low", "high"], ordered=True + ), + name="A", + ), + ) + + tm.assert_frame_equal(result, expected) + + # gh-21378 + df = pd.DataFrame( + { + "A": pd.Categorical( + ["left", "low", "high", "low", "high"], + categories=["low", "high", "left"], + ordered=True, + ), + "B": range(5), + } + ) + + result = df.pivot_table(index="A", values="B", dropna=dropna) + expected = pd.DataFrame( + {"B": [2, 3, 0]}, + index=pd.Index( + pd.Categorical.from_codes( + [0, 1, 2], categories=["low", "high", "left"], ordered=True + ), + name="A", + ), + ) + + tm.assert_frame_equal(result, expected) + + def test_pivot_with_interval_index(self, interval_values, dropna): + # GH 25814 + df = DataFrame({"A": interval_values, "B": 1}) + result = df.pivot_table(index="A", values="B", dropna=dropna) + expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) + tm.assert_frame_equal(result, expected) + + def test_pivot_with_interval_index_margins(self): + # GH 25815 + ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2]) + df = DataFrame( + { + "A": np.arange(4, 0, -1, dtype=np.intp), + "B": ["a", "b", "a", "b"], + "C": pd.Categorical(ordered_cat, ordered=True).sort_values( + ascending=False + ), + } + ) + + pivot_tab = pd.pivot_table( + df, index="C", columns="B", values="A", aggfunc="sum", margins=True + ) + + result = pivot_tab["All"] + expected = Series( + [3, 7, 10], + index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"), + name="All", + dtype=np.intp, + ) + tm.assert_series_equal(result, expected) + + def test_pass_array(self): + result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) + expected = self.data.pivot_table("D", index="A", columns="C") + tm.assert_frame_equal(result, expected) + + def test_pass_function(self): + result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C) + expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C") + tm.assert_frame_equal(result, expected) + + def test_pivot_table_multiple(self): + index = ["A", "B"] + columns = "C" + table = pivot_table(self.data, index=index, columns=columns) + expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pivot_dtypes(self): + + # can convert dtypes + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1, 2, 3, 4], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "int64" + + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum + ) + result = z.dtypes + expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i")) + tm.assert_series_equal(result, expected) + + # cannot convert dtypes + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1.5, 2.5, 3.5, 4.5], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "float64" + + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean + ) + result = z.dtypes + expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i")) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "columns,values", + [ + ("bool1", ["float1", "float2"]), + ("bool1", ["float1", "float2", "bool1"]), + ("bool2", ["float1", "float2", "bool1"]), + ], + ) + def test_pivot_preserve_dtypes(self, columns, 
values): + # GH 7142 regression test + v = np.arange(5, dtype=np.float64) + df = DataFrame( + {"float1": v, "float2": v + 2.0, "bool1": v <= 2, "bool2": v <= 3} + ) + + df_res = df.reset_index().pivot_table( + index="index", columns=columns, values=values + ) + + result = dict(df_res.dtypes) + expected = { + col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") + for col in df_res + } + assert result == expected + + def test_pivot_no_values(self): + # GH 14380 + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"] + ) + df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) + res = df.pivot_table(index=df.index.month, columns=df.index.day) + + exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)]) + exp = pd.DataFrame( + [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns + ) + tm.assert_frame_equal(res, exp) + + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "dt": pd.date_range("2011-01-01", freq="D", periods=5), + }, + index=idx, + ) + res = df.pivot_table( + index=df.index.month, columns=pd.Grouper(key="dt", freq="M") + ) + exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) + exp_columns.names = [None, "dt"] + exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) + tm.assert_frame_equal(res, exp) + + res = df.pivot_table( + index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M") + ) + exp = pd.DataFrame( + [3], index=pd.DatetimeIndex(["2011-12-31"]), columns=exp_columns + ) + tm.assert_frame_equal(res, exp) + + def test_pivot_multi_values(self): + result = pivot_table( + self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 + ) + expected = pivot_table( + self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 + ) + tm.assert_frame_equal(result, expected) + + def test_pivot_multi_functions(self): + f = lambda func: pivot_table( + self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func + ) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = concat([means, stds], keys=["mean", "std"], axis=1) + tm.assert_frame_equal(result, expected) + + # margins not supported?? 
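# A hedged gloss on the note above (editorial, not in the vendored file): margins=True is
# in fact combined with a list of aggfuncs below, and the expected frame is simply the
# per-aggfunc margin tables concatenated under 'mean'/'std' keys, which is exactly what
# the following assertion checks.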
+ f = lambda func: pivot_table( + self.data, + values=["D", "E"], + index=["A", "B"], + columns="C", + aggfunc=func, + margins=True, + ) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = concat([means, stds], keys=["mean", "std"], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_index_with_nan(self, method): + # GH 3588 + nan = np.nan + df = DataFrame( + { + "a": ["R1", "R2", nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, 17, 20], + } + ) + if method: + result = df.pivot("a", "b", "c") + else: + result = pd.pivot(df, "a", "b", "c") + expected = DataFrame( + [ + [nan, nan, 17, nan], + [10, nan, nan, nan], + [nan, 15, nan, nan], + [nan, nan, nan, 20], + ], + index=Index([nan, "R1", "R2", "R4"], name="a"), + columns=Index(["C1", "C2", "C3", "C4"], name="b"), + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) + + # GH9491 + df = DataFrame( + { + "a": pd.date_range("2014-02-01", periods=6, freq="D"), + "c": 100 + np.arange(6), + } + ) + df["b"] = df["a"] - pd.Timestamp("2014-02-02") + df.loc[1, "a"] = df.loc[3, "a"] = nan + df.loc[1, "b"] = df.loc[4, "b"] = nan + + if method: + pv = df.pivot("a", "b", "c") + else: + pv = pd.pivot(df, "a", "b", "c") + assert pv.notna().values.sum() == len(df) + + for _, row in df.iterrows(): + assert pv.loc[row["a"], row["b"]] == row["c"] + + if method: + result = df.pivot("b", "a", "c") + else: + result = pd.pivot(df, "b", "a", "c") + tm.assert_frame_equal(result, pv.T) + + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_with_tz(self, method): + # GH 5878 + df = DataFrame( + { + "dt1": [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + "dt2": [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ) + exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame( + [[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=exp_col, + ) + + if method: + pv = df.pivot(index="dt1", columns="dt2") + else: + pv = pd.pivot(df, index="dt1", columns="dt2") + tm.assert_frame_equal(pv, expected) + + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ), + ) + + if method: + pv = df.pivot(index="dt1", columns="dt2", values="data1") + else: + pv = pd.pivot(df, index="dt1", columns="dt2", values="data1") + tm.assert_frame_equal(pv, expected) + + def test_pivot_tz_in_values(self): + # GH 14948 + df = pd.DataFrame( + [ + { + "uid": "aa", + "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), + }, + { + "uid": "aa", + "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + }, + { + "uid": "aa", + "ts": 
pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), + }, + { + "uid": "aa", + "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + }, + { + "uid": "aa", + "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), + }, + ] + ) + + df = df.set_index("ts").reset_index() + mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)) + + result = pd.pivot_table( + df.set_index("ts").reset_index(), + values="ts", + index=["uid"], + columns=[mins], + aggfunc=np.min, + ) + expected = pd.DataFrame( + [ + [ + pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + ] + ], + index=pd.Index(["aa"], name="uid"), + columns=pd.DatetimeIndex( + [ + pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"), + pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"), + ], + name="ts", + ), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_periods(self, method): + df = DataFrame( + { + "p1": [ + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + ], + "p2": [ + pd.Period("2013-01", "M"), + pd.Period("2013-01", "M"), + pd.Period("2013-02", "M"), + pd.Period("2013-02", "M"), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M") + exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame( + [[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=exp_col, + ) + if method: + pv = df.pivot(index="p1", columns="p2") + else: + pv = pd.pivot(df, index="p1", columns="p2") + tm.assert_frame_equal(pv, expected) + + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"), + ) + if method: + pv = df.pivot(index="p1", columns="p2", values="data1") + else: + pv = pd.pivot(df, index="p1", columns="p2", values="data1") + tm.assert_frame_equal(pv, expected) + + def test_pivot_periods_with_margins(self): + # GH 28323 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [ + pd.Period("2019Q1"), + pd.Period("2019Q2"), + pd.Period("2019Q1"), + pd.Period("2019Q2"), + ], + "x": 1.0, + } + ) + + expected = DataFrame( + data=1.0, + index=pd.Index([1, 2, "All"], name="a"), + columns=pd.Index( + [pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b" + ), + ) + + result = df.pivot_table(index="a", columns="b", values="x", margins=True) + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( + "values", + [ + ["baz", "zoo"], + np.array(["baz", "zoo"]), + pd.Series(["baz", "zoo"]), + pd.Index(["baz", "zoo"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_with_list_like_values(self, values, method): + # issue #17160 + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + if method: + result = df.pivot(index="foo", columns="bar", values=values) + else: + result = pd.pivot(df, index="foo", columns="bar", values=values) + + data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]] + index = Index(data=["one", "two"], 
name="foo") + columns = MultiIndex( + levels=[["baz", "zoo"], ["A", "B", "C"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + names=[None, "bar"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + ["bar", "baz"], + np.array(["bar", "baz"]), + pd.Series(["bar", "baz"]), + pd.Index(["bar", "baz"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_with_list_like_values_nans(self, values, method): + # issue #17160 + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + if method: + result = df.pivot(index="zoo", columns="foo", values=values) + else: + result = pd.pivot(df, index="zoo", columns="foo", values=values) + + data = [ + [np.nan, "A", np.nan, 4], + [np.nan, "C", np.nan, 6], + [np.nan, "B", np.nan, 5], + ["A", np.nan, 1, np.nan], + ["B", np.nan, 2, np.nan], + ["C", np.nan, 3, np.nan], + ] + index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo") + columns = MultiIndex( + levels=[["bar", "baz"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=[None, "foo"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" + ) + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_with_multiindex(self, method): + # issue #17160 + index = Index(data=[0, 1, 2, 3, 4, 5]) + data = [ + ["one", "A", 1, "x"], + ["one", "B", 2, "y"], + ["one", "C", 3, "z"], + ["two", "A", 4, "q"], + ["two", "B", 5, "w"], + ["two", "C", 6, "t"], + ] + columns = MultiIndex( + levels=[["bar", "baz"], ["first", "second"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + ) + df = DataFrame(data=data, index=index, columns=columns, dtype="object") + if method: + result = df.pivot( + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) + else: + result = pd.pivot( + df, + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) + + data = { + "A": Series([1, 4], index=["one", "two"]), + "B": Series([2, 5], index=["one", "two"]), + "C": Series([3, 6], index=["one", "two"]), + } + expected = DataFrame(data) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", [True, False]) + def test_pivot_with_tuple_of_values(self, method): + # issue #17160 + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"): + # tuple is seen as a single column name + if method: + df.pivot(index="zoo", columns="foo", values=("bar", "baz")) + else: + pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz")) + + def test_margins(self): + def _check_output( + result, values_col, index=["A", "B"], columns=["C"], margins_col="All" + ): + col_margins = result.loc[result.index[:-1], margins_col] + expected_col_margins = self.data.groupby(index)[values_col].mean() + tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) + assert col_margins.name == margins_col + + result = result.sort_index() + index_margins = result.loc[(margins_col, "")].iloc[:-1] + + 
expected_ix_margins = self.data.groupby(columns)[values_col].mean() + tm.assert_series_equal( + index_margins, expected_ix_margins, check_names=False + ) + assert index_margins.name == (margins_col, "") + + grand_total_margins = result.loc[(margins_col, ""), margins_col] + expected_total_margins = self.data[values_col].mean() + assert grand_total_margins == expected_total_margins + + # column specified + result = self.data.pivot_table( + values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) + _check_output(result, "D") + + # Set a different margins_name (not 'All') + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.mean, + margins_name="Totals", + ) + _check_output(result, "D", margins_col="Totals") + + # no column specified + table = self.data.pivot_table( + index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) + for value_col in table.columns.levels[0]: + _check_output(table[value_col], value_col) + + # no col + + # to help with a buglet + self.data.columns = [k * 2 for k in self.data.columns] + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + for value_col in table.columns: + totals = table.loc[("All", ""), value_col] + assert totals == self.data[value_col].mean() + + # no rows + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) + assert isinstance(rtable, Series) + + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + for item in ["DD", "EE", "FF"]: + totals = table.loc[("All", ""), item] + assert totals == self.data[item].mean() + + def test_margins_dtype(self): + # GH 17013 + + df = self.data.copy() + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) + + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] + + result = df.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.sum, + fill_value=0, + ) + + tm.assert_frame_equal(expected, result) + + @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)") + def test_margins_dtype_len(self): + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] + + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=len, + fill_value=0, + ) + + tm.assert_frame_equal(expected, result) + + def test_pivot_integer_columns(self): + # caused by upstream bug in unstack + + d = date.min + data = list( + product( + ["foo", "bar"], + ["A", "B", "C"], + ["x1", "x2"], + [d + timedelta(i) for i in range(20)], + [1.0], + ) + ) + df = DataFrame(data) + table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) + + df2 = df.rename(columns=str) + table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) + + tm.assert_frame_equal(table, table2, check_names=False) + + def test_pivot_no_level_overlap(self): + # GH #1181 + + data = DataFrame( + { + "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2, + "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2, + 
"c": (["foo"] * 4 + ["bar"] * 4) * 2, + "value": np.random.randn(16), + } + ) + + table = data.pivot_table("value", index="a", columns=["b", "c"]) + + grouped = data.groupby(["a", "b", "c"])["value"].mean() + expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all") + tm.assert_frame_equal(table, expected) + + def test_pivot_columns_lexsorted(self): + + n = 10000 + + dtype = np.dtype( + [ + ("Index", object), + ("Symbol", object), + ("Year", int), + ("Month", int), + ("Day", int), + ("Quantity", int), + ("Price", float), + ] + ) + + products = np.array( + [ + ("SP500", "ADBE"), + ("SP500", "NVDA"), + ("SP500", "ORCL"), + ("NDQ100", "AAPL"), + ("NDQ100", "MSFT"), + ("NDQ100", "GOOG"), + ("FTSE", "DGE.L"), + ("FTSE", "TSCO.L"), + ("FTSE", "GSK.L"), + ], + dtype=[("Index", object), ("Symbol", object)], + ) + items = np.empty(n, dtype=dtype) + iproduct = np.random.randint(0, len(products), n) + items["Index"] = products["Index"][iproduct] + items["Symbol"] = products["Symbol"][iproduct] + dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31)) + dates = dr[np.random.randint(0, len(dr), n)] + items["Year"] = dates.year + items["Month"] = dates.month + items["Day"] = dates.day + items["Price"] = np.random.lognormal(4.0, 2.0, n) + + df = DataFrame(items) + + pivoted = df.pivot_table( + "Price", + index=["Month", "Day"], + columns=["Index", "Symbol", "Year"], + aggfunc="mean", + ) + + assert pivoted.columns.is_monotonic + + def test_pivot_complex_aggfunc(self): + f = {"D": ["std"], "E": ["sum"]} + expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") + result = self.data.pivot_table(index="A", columns="B", aggfunc=f) + + tm.assert_frame_equal(result, expected) + + def test_margins_no_values_no_cols(self): + # Regression test on pivot table: no values or cols passed. 
+ result = self.data[["A", "B"]].pivot_table( + index=["A", "B"], aggfunc=len, margins=True + ) + result_list = result.tolist() + assert sum(result_list[:-1]) == result_list[-1] + + def test_margins_no_values_two_rows(self): + # Regression test on pivot table: no values passed but rows are a + # multi-index + result = self.data[["A", "B", "C"]].pivot_table( + index=["A", "B"], columns="C", aggfunc=len, margins=True + ) + assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] + + def test_margins_no_values_one_row_one_col(self): + # Regression test on pivot table: no values passed but row and col + # defined + result = self.data[["A", "B"]].pivot_table( + index="A", columns="B", aggfunc=len, margins=True + ) + assert result.All.tolist() == [4.0, 7.0, 11.0] + + def test_margins_no_values_two_row_two_cols(self): + # Regression test on pivot table: no values passed but rows and cols + # are multi-indexed + self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] + result = self.data[["A", "B", "C", "D"]].pivot_table( + index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True + ) + assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] + + @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) + def test_pivot_table_with_margins_set_margin_name(self, margin_name): + # see gh-3335 + msg = ( + r'Conflicting name "{}" in margins|' + "margins_name argument must be a string" + ).format(margin_name) + with pytest.raises(ValueError, match=msg): + # multi-index index + pivot_table( + self.data, + values="D", + index=["A", "B"], + columns=["C"], + margins=True, + margins_name=margin_name, + ) + with pytest.raises(ValueError, match=msg): + # multi-index column + pivot_table( + self.data, + values="D", + index=["C"], + columns=["A", "B"], + margins=True, + margins_name=margin_name, + ) + with pytest.raises(ValueError, match=msg): + # non-multi-index index/column + pivot_table( + self.data, + values="D", + index=["A"], + columns=["B"], + margins=True, + margins_name=margin_name, + ) + + def test_pivot_timegrouper(self): + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1), + datetime(2013, 1, 1), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 12, 2), + datetime(2013, 12, 2), + ], + } + ).set_index("Date") + + expected = DataFrame( + np.array([10, 18, 3], dtype="int64").reshape(1, 3), + index=[datetime(2013, 12, 31)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" + + result = pivot_table( + df, + index=Grouper(freq="A"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="A"), + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + expected = DataFrame( + np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), + index=[datetime(2013, 1, 1), datetime(2013, 7, 1)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" + + result = pivot_table( + df, + index=Grouper(freq="6MS"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS"), + values="Quantity", + 
aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + # passing the name + df = df.reset_index() + result = pivot_table( + df, + index=Grouper(freq="6MS", key="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + msg = "'The grouper name foo is not found'" + with pytest.raises(KeyError, match=msg): + pivot_table( + df, + index=Grouper(freq="6MS", key="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + with pytest.raises(KeyError, match=msg): + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="foo"), + values="Quantity", + aggfunc=np.sum, + ) + + # passing the level + df = df.set_index("Date") + result = pivot_table( + df, + index=Grouper(freq="6MS", level="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="Date"), + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + msg = "The level foo is not valid" + with pytest.raises(ValueError, match=msg): + pivot_table( + df, + index=Grouper(freq="6MS", level="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) + with pytest.raises(ValueError, match=msg): + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="foo"), + values="Quantity", + aggfunc=np.sum, + ) + + # double grouper + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 11, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 11, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 12, 5, 14, 0), + ], + "PayDay": [ + datetime(2013, 10, 4, 0, 0), + datetime(2013, 10, 15, 13, 5), + datetime(2013, 9, 5, 20, 0), + datetime(2013, 11, 2, 10, 0), + datetime(2013, 10, 7, 20, 0), + datetime(2013, 9, 5, 10, 0), + datetime(2013, 12, 30, 12, 0), + datetime(2013, 11, 20, 14, 0), + ], + } + ) + + result = pivot_table( + df, + index=Grouper(freq="M", key="Date"), + columns=Grouper(freq="M", key="PayDay"), + values="Quantity", + aggfunc=np.sum, + ) + expected = DataFrame( + np.array( + [ + np.nan, + 3, + np.nan, + np.nan, + 6, + np.nan, + 1, + 9, + np.nan, + 9, + np.nan, + np.nan, + np.nan, + np.nan, + 3, + np.nan, + ] + ).reshape(4, 4), + index=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + columns=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + ) + expected.index.name = "Date" + expected.columns.name = "PayDay" + + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index=Grouper(freq="M", key="PayDay"), + columns=Grouper(freq="M", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + tuples = [ + (datetime(2013, 9, 30), datetime(2013, 10, 31)), + (datetime(2013, 10, 31), datetime(2013, 9, 30)), + (datetime(2013, 10, 31), datetime(2013, 11, 30)), + (datetime(2013, 10, 31), datetime(2013, 12, 31)), + (datetime(2013, 11, 30), datetime(2013, 10, 31)), + 
(datetime(2013, 12, 31), datetime(2013, 11, 30)), + ] + idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]) + expected = DataFrame( + np.array( + [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3] + ).reshape(6, 2), + index=idx, + columns=["A", "B"], + ) + expected.columns.name = "Branch" + + result = pivot_table( + df, + index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=["Branch"], + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index=["Branch"], + columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + values="Quantity", + aggfunc=np.sum, + ) + tm.assert_frame_equal(result, expected.T) + + def test_pivot_datetime_tz(self): + dates1 = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Pacific", + name="dt1", + ) + exp_col1 = Index(["value1", "value1"]) + exp_col2 = Index(["a", "b"], name="label") + exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) + result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) + tm.assert_frame_equal(result, expected) + + exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"]) + exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) + exp_col3 = pd.DatetimeIndex( + ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, + tz="Asia/Tokyo", + name="dt2", + ) + exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) + expected = DataFrame( + np.array( + [ + [0, 3, 1, 2, 0, 3, 1, 2], + [1, 4, 2, 1, 1, 4, 2, 1], + [2, 5, 1, 2, 2, 5, 1, 2], + ], + dtype="int64", + ), + index=exp_idx, + columns=exp_col, + ) + + result = pivot_table( + df, + index=["dt1"], + columns=["dt2"], + values=["value1", "value2"], + aggfunc=[np.sum, np.mean], + ) + tm.assert_frame_equal(result, expected) + + def test_pivot_dtaccessor(self): + # GH 8103 + dates1 = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) + + result = pivot_table( + df, index="label", columns=df["dt1"].dt.hour, values="value1" + ) + + exp_idx = Index(["a", "b"], name="label") + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=exp_idx, + columns=Index([7, 8, 9], name="dt1"), 
+ ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1" + ) + + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=Index([1, 2], name="dt2"), + columns=Index([7, 8, 9], name="dt1"), + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index=df["dt2"].dt.year.values, + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) + + exp_col = MultiIndex.from_arrays( + [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"] + ) + expected = DataFrame( + np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col + ) + tm.assert_frame_equal(result, expected) + + result = pivot_table( + df, + index=np.array(["X", "X", "X", "X", "Y", "Y"]), + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) + expected = DataFrame( + np.array( + [[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]] + ), + index=["X", "Y"], + columns=exp_col, + ) + tm.assert_frame_equal(result, expected) + + def test_daily(self): + rng = date_range("1/1/2000", "12/31/2004", freq="D") + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_table( + DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear + ) + annual.columns = annual.columns.droplevel(0) + + doy = np.asarray(ts.index.dayofyear) + + for i in range(1, 367): + subset = ts[doy == i] + subset.index = subset.index.year + + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + assert result.name == i + + def test_monthly(self): + rng = date_range("1/1/2000", "12/31/2004", freq="M") + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_table( + pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month + ) + annual.columns = annual.columns.droplevel(0) + + month = ts.index.month + for i in range(1, 13): + subset = ts[month == i] + subset.index = subset.index.year + result = annual[i].dropna() + tm.assert_series_equal(result, subset, check_names=False) + assert result.name == i + + def test_pivot_table_with_iterator_values(self): + # GH 12017 + aggs = {"D": "sum", "E": "mean"} + + pivot_values_list = pd.pivot_table( + self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs + ) + + pivot_values_keys = pd.pivot_table( + self.data, index=["A"], values=aggs.keys(), aggfunc=aggs + ) + tm.assert_frame_equal(pivot_values_keys, pivot_values_list) + + agg_values_gen = (value for value in aggs.keys()) + pivot_values_gen = pd.pivot_table( + self.data, index=["A"], values=agg_values_gen, aggfunc=aggs + ) + tm.assert_frame_equal(pivot_values_gen, pivot_values_list) + + def test_pivot_table_margins_name_with_aggfunc_list(self): + # GH 13354 + margins_name = "Weekly" + costs = pd.DataFrame( + { + "item": ["bacon", "cheese", "bacon", "cheese"], + "cost": [2.5, 4.5, 3.2, 3.3], + "day": ["M", "M", "T", "T"], + } + ) + table = costs.pivot_table( + index="item", + columns="day", + margins=True, + margins_name=margins_name, + aggfunc=[np.mean, max], + ) + ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item") + tups = [ + ("mean", "cost", "M"), + ("mean", "cost", "T"), + ("mean", "cost", margins_name), + ("max", "cost", "M"), + ("max", "cost", "T"), + ("max", "cost", margins_name), + ] + cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"]) + expected = pd.DataFrame(table.values, index=ix, columns=cols) + tm.assert_frame_equal(table, expected) + + 
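# ------------------------------------------------------------------------------
# Hedged illustrative sketch (editorial, not part of the vendored pandas test
# file): a minimal, runnable restatement of what the margins_name / aggfunc-list
# test above asserts, using only public pandas API. Data and labels mirror the
# test; "Weekly" is the custom margins label.
import numpy as np
import pandas as pd

costs = pd.DataFrame(
    {
        "item": ["bacon", "cheese", "bacon", "cheese"],
        "cost": [2.5, 4.5, 3.2, 3.3],
        "day": ["M", "M", "T", "T"],
    }
)
# A list of aggfuncs plus margins=True yields a three-level column MultiIndex
# (aggfunc, value, day) with a "Weekly" entry in the innermost level, and a
# "Weekly" row appended to the item index.
table = costs.pivot_table(
    index="item",
    columns="day",
    margins=True,
    margins_name="Weekly",
    aggfunc=[np.mean, max],
)
print(table.columns.tolist())
# Per the expectation built in the test above:
# [('mean', 'cost', 'M'), ('mean', 'cost', 'T'), ('mean', 'cost', 'Weekly'),
#  ('max', 'cost', 'M'), ('max', 'cost', 'T'), ('max', 'cost', 'Weekly')]
# ------------------------------------------------------------------------------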
@pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") + def test_categorical_margins(self, observed): + # GH 10989 + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) + + expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") + + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + tm.assert_frame_equal(table, expected) + + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to ints)") + def test_categorical_margins_category(self, observed): + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) + + expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") + + df.y = df.y.astype("category") + df.z = df.z.astype("category") + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + tm.assert_frame_equal(table, expected) + + def test_margins_casted_to_float(self, observed): + # GH 24893 + df = pd.DataFrame( + { + "A": [2, 4, 6, 8], + "B": [1, 4, 5, 8], + "C": [1, 3, 4, 6], + "D": ["X", "X", "Y", "Y"], + } + ) + + result = pd.pivot_table(df, index="D", margins=True) + expected = pd.DataFrame( + {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + index=pd.Index(["X", "Y", "All"], name="D"), + ) + tm.assert_frame_equal(result, expected) + + def test_pivot_with_categorical(self, observed, ordered_fixture): + # gh-21370 + idx = [np.nan, "low", "high", "low", np.nan] + col = [np.nan, "A", "B", np.nan, "A"] + df = pd.DataFrame( + { + "In": pd.Categorical( + idx, categories=["low", "high"], ordered=ordered_fixture + ), + "Col": pd.Categorical( + col, categories=["A", "B"], ordered=ordered_fixture + ), + "Val": range(1, 6), + } + ) + # case with index/columns/value + result = df.pivot_table( + index="In", columns="Col", values="Val", observed=observed + ) + + expected_cols = pd.CategoricalIndex( + ["A", "B"], ordered=ordered_fixture, name="Col" + ) + + expected = pd.DataFrame( + data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols + ) + expected.index = Index( + pd.Categorical( + ["low", "high"], categories=["low", "high"], ordered=ordered_fixture + ), + name="In", + ) + + tm.assert_frame_equal(result, expected) + + # case with columns/value + result = df.pivot_table(columns="Col", values="Val", observed=observed) + + expected = pd.DataFrame( + data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"]) + ) + + tm.assert_frame_equal(result, expected) + + def test_categorical_aggfunc(self, observed): + # GH 9534 + df = pd.DataFrame( + {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} + ) + df["C1"] = df["C1"].astype("category") + result = df.pivot_table( + "V", index="C1", columns="C2", dropna=observed, aggfunc="count" + ) + + expected_index = pd.CategoricalIndex( + ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" + ) + expected_columns = pd.Index(["a", "b"], name="C2") + expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) + tm.assert_frame_equal(result, expected) + + def test_categorical_pivot_index_ordering(self, observed): + # GH 8731 + df = pd.DataFrame( + { + "Sales": [100, 120, 220], + "Month": ["January", "January", "January"], + "Year": 
[2013, 2014, 2013], + } + ) + months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + df["Month"] = df["Month"].astype("category").cat.set_categories(months) + result = df.pivot_table( + values="Sales", + index="Month", + columns="Year", + dropna=observed, + aggfunc="sum", + ) + expected_columns = pd.Int64Index([2013, 2014], name="Year") + expected_index = pd.CategoricalIndex( + ["January"], categories=months, ordered=False, name="Month" + ) + expected = pd.DataFrame( + [[320, 120]], index=expected_index, columns=expected_columns + ) + if not observed: + result = result.dropna().astype(np.int64) + + tm.assert_frame_equal(result, expected) + + def test_pivot_table_not_series(self): + # GH 4386 + # pivot_table always returns a DataFrame + # when values is not list like and columns is None + # and aggfunc is not instance of list + df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]}) + + result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum) + m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"]) + expected = DataFrame([3, 4, 5], index=m, columns=["col1"]) + + tm.assert_frame_equal(result, expected) + + result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum) + expected = DataFrame( + [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], + index=Index([1, 3, 9], name="col3"), + columns=Index(["C", "D", "E"], name="col2"), + ) + + tm.assert_frame_equal(result, expected) + + result = df.pivot_table("col1", index="col3", aggfunc=[np.sum]) + m = MultiIndex.from_arrays([["sum"], ["col1"]]) + expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m) + + tm.assert_frame_equal(result, expected) + + def test_pivot_margins_name_unicode(self): + # issue #13292 + greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" + frame = pd.DataFrame({"foo": [1, 2, 3]}) + table = pd.pivot_table( + frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek + ) + index = pd.Index([1, 2, 3, greek], dtype="object", name="foo") + expected = pd.DataFrame(index=index) + tm.assert_frame_equal(table, expected) + + def test_pivot_string_as_func(self): + # GH #18713 + # for correctness purposes + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": range(11), + } + ) + + result = pivot_table(data, index="A", columns="B", aggfunc="sum") + mi = MultiIndex( + levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"] + ) + expected = DataFrame( + {("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}}, + columns=mi, + ).rename_axis("A") + tm.assert_frame_equal(result, expected) + + result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"]) + mi = MultiIndex( + levels=[["sum", "mean"], ["C"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], + names=[None, None, "B"], + ) + expected = DataFrame( + { + ("mean", "C", "one"): {"bar": 5.0, "foo": 3.25}, + ("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667}, + ("sum", "C", "one"): {"bar": 15, "foo": 13}, + ("sum", "C", "two"): {"bar": 7, "foo": 20}, + }, + columns=mi, + ).rename_axis("A") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "f, f_numpy", + [ + 
("sum", np.sum), + ("mean", np.mean), + ("std", np.std), + (["sum", "mean"], [np.sum, np.mean]), + (["sum", "std"], [np.sum, np.std]), + (["std", "mean"], [np.std, np.mean]), + ], + ) + def test_pivot_string_func_vs_func(self, f, f_numpy): + # GH #18713 + # for consistency purposes + result = pivot_table(self.data, index="A", columns="B", aggfunc=f) + expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + tm.assert_frame_equal(result, expected) + + @pytest.mark.slow + def test_pivot_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame( + {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} + ) + + msg = "Unstacked DataFrame is too big, causing int32 overflow" + with pytest.raises(ValueError, match=msg): + df.pivot_table( + index="ind1", columns="ind2", values="count", aggfunc="count" + ) + + def test_pivot_table_aggfunc_dropna(self, dropna): + # GH 22159 + df = pd.DataFrame( + { + "fruit": ["apple", "peach", "apple"], + "size": [1, 1, 2], + "taste": [7, 6, 6], + } + ) + + def ret_one(x): + return 1 + + def ret_sum(x): + return sum(x) + + def ret_none(x): + return np.nan + + result = pd.pivot_table( + df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna + ) + + data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]] + col = pd.MultiIndex.from_product( + [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]], + names=[None, "fruit"], + ) + expected = pd.DataFrame(data, index=["size", "taste"], columns=col) + + if dropna: + expected = expected.dropna(axis="columns") + + tm.assert_frame_equal(result, expected) + + def test_pivot_table_aggfunc_scalar_dropna(self, dropna): + # GH 22159 + df = pd.DataFrame( + {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} + ) + + result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) + + data = [[2.5, np.nan], [1, np.nan]] + col = pd.Index(["one", "two"], name="A") + expected = pd.DataFrame(data, index=["x", "y"], columns=col) + + if dropna: + expected = expected.dropna(axis="columns") + + tm.assert_frame_equal(result, expected) + + def test_pivot_table_empty_aggfunc(self): + # GH 9186 + df = pd.DataFrame( + { + "A": [2, 2, 3, 3, 2], + "id": [5, 6, 7, 8, 9], + "C": ["p", "q", "q", "p", "q"], + "D": [None, None, None, None, None], + } + ) + result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_pivot_table_no_column_raises(self): + # GH 10326 + def agg(l): + return np.mean(l) + + foo = pd.DataFrame( + {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} + ) + with pytest.raises(KeyError, match="notpresent"): + foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + + +class TestCrosstab: + def setup_method(self, method): + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() + tm.assert_frame_equal(result, 
expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" + + def test_crosstab_non_aligned(self): + # GH 17005 + a = pd.Series([0, 1, 1], index=["a", "b", "c"]) + b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) + c = np.array([3, 4, 3]) + + expected = pd.DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) + + result = crosstab(a, b) + tm.assert_frame_equal(result, expected) + + result = crosstab(a, c) + tm.assert_frame_equal(result, expected) + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["All"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("All", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("TOTAL", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + 
exp_rows.name = "TOTAL" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + msg = "margins_name argument must be a string" + for margins_name in [666, None, ["a", "b"]]: + with pytest.raises(ValueError, match=msg): + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) + + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) + + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) + tm.assert_frame_equal(table, expected) + + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) + tm.assert_index_equal(res.columns, m) + + def test_crosstab_no_overlap(self): + # GS 10291 + + s1 = pd.Series([1, 2, 3], index=[1, 2, 3]) + s2 = pd.Series([4, 5, 6], index=[4, 5, 6]) + + actual = crosstab(s1, s2) + expected = pd.DataFrame() + + tm.assert_frame_equal(actual, expected) + + def test_margin_dropna(self): + # GH 12577 + # pivot_table counts null into margin ('All') + # when margins=true and dropna=true + + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) + expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + # GH 12642 + # _add_margins raises KeyError: Level None not found + # when margins=True and dropna=False + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) + expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = pd.crosstab(df.a, 
df.b, margins=True, dropna=False) + expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") + tm.assert_frame_equal(actual, expected) + + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + def test_crosstab_normalize(self): + # Issue 12578 + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = pd.Index([1, 2], name="a") + cindex = pd.Index([3, 4], name="b") + full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = pd.DataFrame( + [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex + ) + col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) + + # Check all normalize args + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=1), + pd.crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=0), + pd.crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = pd.DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = pd.DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=pd.Index([1, 2], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = pd.DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, 
normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize="columns", margins=True), + col_normal_margins, + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) + + # Test arrays + pd.crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) + + # Test with aggfunc + norm_counts = pd.DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_counts) + + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = pd.DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_sum) + + def test_crosstab_with_empties(self): + # Check handling of empties + df = pd.DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = pd.DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=i + ) + tm.assert_frame_equal(empty, calculated) + + nans = pd.DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) + + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=False + ) + tm.assert_frame_equal(nans, calculated) + + def test_crosstab_errors(self): + # Issue 12578 + + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + error = "values cannot be used without an aggfunc." 
+ with pytest.raises(ValueError, match=error): + pd.crosstab(df.a, df.b, values=df.c) + + error = "aggfunc cannot be used without values" + with pytest.raises(ValueError, match=error): + pd.crosstab(df.a, df.b, aggfunc=np.mean) + + error = "Not a valid normalize argument" + with pytest.raises(ValueError, match=error): + pd.crosstab(df.a, df.b, normalize="42") + + with pytest.raises(ValueError, match=error): + pd.crosstab(df.a, df.b, normalize=42) + + error = "Not a valid margins argument" + with pytest.raises(ValueError, match=error): + pd.crosstab(df.a, df.b, normalize="all", margins=42) + + def test_crosstab_with_categorial_columns(self): + # GH 8860 + df = pd.DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = pd.crosstab(df["MAKE"], df["MODEL"]) + + expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = pd.CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) + expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_with_numpy_size(self): + # GH 4003 + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = pd.crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = pd.MultiIndex( + levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_dup_index_names(self): + # GH 13279 + s = pd.Series(range(3), name="foo") + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name="foo") + expected = pd.DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) + def test_crosstab_tuple_name(self, names): + s1 = pd.Series(range(3), name=names[0]) + s2 = pd.Series(range(1, 4), name=names[1]) + + mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names) + expected = pd.Series(1, index=mi).unstack(1, fill_value=0) + + result = pd.crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) + result = pd.crosstab(df.index, [df.b, df.a]) + e_idx = pd.Index(["A", "B", "C"], name="row_0") + e_columns = pd.MultiIndex.from_tuples( + [(1, 4), (2, 6), (3, 5)], names=["b", "a"] + ) + expected = pd.DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], 
index=e_idx, columns=e_columns + ) + tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = pd.DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = pd.DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = pd.DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_qcut.py b/venv/Lib/site-packages/pandas/tests/reshape/test_qcut.py new file mode 100644 index 0000000..95406a5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_qcut.py @@ -0,0 +1,288 @@ +import os + +import numpy as np +import pytest + +from pandas import ( + Categorical, + DatetimeIndex, + Interval, + IntervalIndex, + NaT, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT +from pandas.core.algorithms import quantile + +from pandas.tseries.offsets import Day, Nano + + +def test_qcut(): + arr = np.random.randn(1000) + + # We store the bins as Index that have been + # rounded to comparisons are a bit tricky. 
+ labels, bins = qcut(arr, 4, retbins=True) + ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0]) + + result = labels.categories.left.values + assert np.allclose(result, ex_bins[:-1], atol=1e-2) + + result = labels.categories.right.values + assert np.allclose(result, ex_bins[1:], atol=1e-2) + + ex_levels = cut(arr, ex_bins, include_lowest=True) + tm.assert_categorical_equal(labels, ex_levels) + + +def test_qcut_bounds(): + arr = np.random.randn(1000) + + factor = qcut(arr, 10, labels=False) + assert len(np.unique(factor)) == 10 + + +def test_qcut_specify_quantiles(): + arr = np.random.randn(100) + factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0]) + + expected = qcut(arr, 4) + tm.assert_categorical_equal(factor, expected) + + +def test_qcut_all_bins_same(): + with pytest.raises(ValueError, match="edges.*unique"): + qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) + + +def test_qcut_include_lowest(): + values = np.arange(10) + ii = qcut(values, 4) + + ex_levels = IntervalIndex( + [ + Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9), + ] + ) + tm.assert_index_equal(ii.categories, ex_levels) + + +def test_qcut_nas(): + arr = np.random.randn(100) + arr[:20] = np.nan + + result = qcut(arr, 4) + assert isna(result[:20]).all() + + +def test_qcut_index(): + result = qcut([0, 2], 2) + intervals = [Interval(-0.001, 1), Interval(1, 2)] + + expected = Categorical(intervals, ordered=True) + tm.assert_categorical_equal(result, expected) + + +def test_qcut_binning_issues(datapath): + # see gh-1978, gh-1979 + cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv")) + arr = np.loadtxt(cut_file) + result = qcut(arr, 20) + + starts = [] + ends = [] + + for lev in np.unique(result): + s = lev.left + e = lev.right + assert s != e + + starts.append(float(s)) + ends.append(float(e)) + + for (sp, sn), (ep, en) in zip( + zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:]) + ): + assert sp < sn + assert ep < en + assert ep <= sn + + +def test_qcut_return_intervals(): + ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(ser, [0, 0.333, 0.666, 1]) + + exp_levels = np.array( + [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] + ) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + tm.assert_series_equal(res, exp) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_qcut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(labels): + # GH 13318 + values = range(10) + msg = "Bin labels must be one fewer than the number of bin edges" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize( + "labels, expected", + [ + (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), + (list(range(3)), Categorical([0, 1, 2], ordered=True)), + ], +) +def test_qcut_list_like_labels(labels, expected): + # GH 13318 + values = range(3) + result = qcut(values, 3, labels=labels) + tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) +def 
test_qcut_duplicates_bin(kwargs, msg): + # see gh-7751 + values = [0, 0, 0, 0, 1, 2, 3] + + if msg is not None: + with pytest.raises(ValueError, match=msg): + qcut(values, 3, **kwargs) + else: + result = qcut(values, 3, **kwargs) + expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)]) + tm.assert_index_equal(result.categories, expected) + + +@pytest.mark.parametrize( + "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)] +) +@pytest.mark.parametrize("length", [1, 2]) +@pytest.mark.parametrize("labels", [None, False]) +def test_single_quantile(data, start, end, length, labels): + # see gh-15431 + ser = Series([data] * length) + result = qcut(ser, 1, labels=labels) + + if labels is None: + intervals = IntervalIndex([Interval(start, end)] * length, closed="right") + expected = Series(intervals).astype(CDT(ordered=True)) + else: + expected = Series([0] * length) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser", + [ + Series(DatetimeIndex(["20180101", NaT, "20180103"])), + Series(TimedeltaIndex(["0 days", NaT, "2 days"])), + ], + ids=lambda x: str(x.dtype), +) +def test_qcut_nat(ser): + # see gh-19768 + intervals = IntervalIndex.from_tuples( + [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] + ) + expected = Series(Categorical(intervals, ordered=True)) + + result = qcut(ser, 2) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)]) +def test_datetime_tz_qcut(bins): + # see gh-19872 + tz = "US/Eastern" + ser = Series(date_range("20130101", periods=3, tz=tz)) + + result = qcut(ser, bins) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:59:59.999999999", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "arg,expected_bins", + [ + [ + timedelta_range("1day", periods=3), + TimedeltaIndex(["1 days", "2 days", "3 days"]), + ], + [ + date_range("20180101", periods=3), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]), + ], + ], +) +def test_date_like_qcut_bins(arg, expected_bins): + # see gh-19891 + ser = Series(arg) + result, result_bins = qcut(ser, 2, retbins=True) + tm.assert_index_equal(result_bins, expected_bins) + + +@pytest.mark.parametrize("bins", [6, 7]) +@pytest.mark.parametrize( + "box, compare", + [ + (Series, tm.assert_series_equal), + (np.array, tm.assert_categorical_equal), + (list, tm.assert_equal), + ], +) +def test_qcut_bool_coercion_to_int(bins, box, compare): + # issue 20303 + data_expected = box([0, 1, 1, 0, 1] * 10) + data_result = box([False, True, True, False, True] * 10) + expected = qcut(data_expected, bins, duplicates="drop") + result = qcut(data_result, bins, duplicates="drop") + compare(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_reshape.py b/venv/Lib/site-packages/pandas/tests/reshape/test_reshape.py new file mode 100644 index 0000000..f25291f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_reshape.py @@ -0,0 +1,647 @@ +from collections import OrderedDict + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer_dtype + +import pandas as pd +from pandas import Categorical, 
DataFrame, Index, Series, get_dummies +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype + + +class TestGetDummies: + @pytest.fixture + def df(self): + return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]}) + + @pytest.fixture(params=["uint8", "i8", np.float64, bool, None]) + def dtype(self, request): + return np.dtype(request.param) + + @pytest.fixture(params=["dense", "sparse"]) + def sparse(self, request): + # params are strings to simplify reading test results, + # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] + return request.param == "sparse" + + def effective_dtype(self, dtype): + if dtype is None: + return np.uint8 + return dtype + + def test_raises_on_dtype_object(self, df): + with pytest.raises(ValueError): + get_dummies(df, dtype="object") + + def test_basic(self, sparse, dtype): + s_list = list("abc") + s_series = Series(s_list) + s_series_index = Series(s_list, list("ABC")) + + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + ) + if sparse: + expected = expected.apply(SparseArray, fill_value=0.0) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) + tm.assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=sparse, dtype=dtype) + tm.assert_frame_equal(result, expected) + + expected.index = list("ABC") + result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_basic_types(self, sparse, dtype): + # GH 10531 + s_list = list("abc") + s_series = Series(s_list) + s_df = DataFrame( + {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]} + ) + + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + columns=list("abc"), + ) + if sparse: + if is_integer_dtype(dtype): + fill_value = 0 + elif dtype == bool: + fill_value = False + else: + fill_value = 0.0 + + expected = expected.apply(SparseArray, fill_value=fill_value) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) + tm.assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=sparse, dtype=dtype) + tm.assert_frame_equal(result, expected) + + result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype) + if sparse: + dtype_name = "Sparse[{}, {}]".format( + self.effective_dtype(dtype).name, fill_value + ) + else: + dtype_name = self.effective_dtype(dtype).name + + expected = Series({dtype_name: 8}) + result = result.dtypes.value_counts() + result.index = [str(i) for i in result.index] + tm.assert_series_equal(result, expected) + + result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) + + expected_counts = {"int64": 1, "object": 1} + expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) + + expected = Series(expected_counts).sort_index() + result = result.dtypes.value_counts() + result.index = [str(i) for i in result.index] + result = result.sort_index() + tm.assert_series_equal(result, expected) + + def test_just_na(self, sparse): + just_na_list = [np.nan] + just_na_series = Series(just_na_list) + just_na_series_index = Series(just_na_list, index=["A"]) + + res_list = get_dummies(just_na_list, sparse=sparse) + res_series = get_dummies(just_na_series, sparse=sparse) + res_series_index = get_dummies(just_na_series_index, sparse=sparse) + + assert res_list.empty + assert res_series.empty + assert res_series_index.empty + + assert 
res_list.index.tolist() == [0] + assert res_series.index.tolist() == [0] + assert res_series_index.index.tolist() == ["A"] + + def test_include_na(self, sparse, dtype): + s = ["a", "b", np.nan] + res = get_dummies(s, sparse=sparse, dtype=dtype) + exp = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) + ) + if sparse: + exp = exp.apply(SparseArray, fill_value=0.0) + tm.assert_frame_equal(res, exp) + + # Sparse dataframes do not allow nan labelled columns, see #GH8822 + res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) + exp_na = DataFrame( + {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]}, + dtype=self.effective_dtype(dtype), + ) + exp_na = exp_na.reindex(["a", "b", np.nan], axis=1) + # hack (NaN handling in assert_index_equal) + exp_na.columns = res_na.columns + if sparse: + exp_na = exp_na.apply(SparseArray, fill_value=0.0) + tm.assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) + exp_just_na = DataFrame( + Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype) + ) + tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) + + def test_unicode(self, sparse): + # See GH 6885 - get_dummies chokes on unicode values + import unicodedata + + e = "e" + eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE") + s = [e, eacute, eacute] + res = get_dummies(s, prefix="letter", sparse=sparse) + exp = DataFrame( + {"letter_e": [1, 0, 0], "letter_{eacute}".format(eacute=eacute): [0, 1, 1]}, + dtype=np.uint8, + ) + if sparse: + exp = exp.apply(SparseArray, fill_value=0) + tm.assert_frame_equal(res, exp) + + def test_dataframe_dummies_all_obj(self, df, sparse): + df = df[["A", "B"]] + result = get_dummies(df, sparse=sparse) + expected = DataFrame( + {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, + dtype=np.uint8, + ) + if sparse: + expected = pd.DataFrame( + { + "A_a": SparseArray([1, 0, 1], dtype="uint8"), + "A_b": SparseArray([0, 1, 0], dtype="uint8"), + "B_b": SparseArray([1, 1, 0], dtype="uint8"), + "B_c": SparseArray([0, 0, 1], dtype="uint8"), + } + ) + + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_mix_default(self, df, sparse, dtype): + result = get_dummies(df, sparse=sparse, dtype=dtype) + if sparse: + arr = SparseArray + typ = SparseDtype(dtype, 0) + else: + arr = np.array + typ = dtype + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + } + ) + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_list(self, df, sparse): + prefixes = ["from_A", "from_B"] + result = get_dummies(df, prefix=prefixes, sparse=sparse) + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] + expected = expected[["C"] + cols] + + typ = SparseArray if sparse else pd.Series + expected[cols] = expected[cols].apply(lambda x: typ(x)) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_str(self, df, sparse): + # not that you should do this... 
+ result = get_dummies(df, prefix="bad", sparse=sparse) + bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["C"] + bad_columns, + dtype=np.uint8, + ) + expected = expected.astype({"C": np.int64}) + if sparse: + # work around astyping & assigning with duplicate columns + # https://github.com/pandas-dev/pandas/issues/14427 + expected = pd.concat( + [ + pd.Series([1, 2, 3], name="C"), + pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), + pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + ], + axis=1, + ) + + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_subset(self, df, sparse): + result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse) + expected = DataFrame( + { + "B": ["b", "b", "c"], + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + if sparse: + cols = ["from_A_a", "from_A_b"] + expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_sep(self, df, sparse): + result = get_dummies(df, prefix_sep="..", sparse=sparse) + expected = DataFrame( + { + "C": [1, 2, 3], + "A..a": [1, 0, 1], + "A..b": [0, 1, 0], + "B..b": [1, 1, 0], + "B..c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] + if sparse: + cols = ["A..a", "A..b", "B..b", "B..c"] + expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + + tm.assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse) + expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"}) + tm.assert_frame_equal(result, expected) + + result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_prefix_bad_length(self, df, sparse): + with pytest.raises(ValueError): + get_dummies(df, prefix=["too few"], sparse=sparse) + + def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): + with pytest.raises(ValueError): + get_dummies(df, prefix_sep=["bad"], sparse=sparse) + + def test_dataframe_dummies_prefix_dict(self, sparse): + prefixes = {"A": "from_A", "B": "from_B"} + df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]}) + result = get_dummies(df, prefix=prefixes, sparse=sparse) + + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + } + ) + + columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] + expected[columns] = expected[columns].astype(np.uint8) + if sparse: + expected[columns] = expected[columns].astype(pd.SparseDtype("uint8", 0)) + + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_na(self, df, sparse, dtype): + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index( + axis=1 + ) + + if sparse: + arr = SparseArray + typ = SparseDtype(dtype, 0) + else: + arr = np.array + typ = dtype + + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_a": arr([1, 0, 1, 0], dtype=typ), + "A_b": arr([0, 1, 0, 0], dtype=typ), + "A_nan": arr([0, 0, 0, 1], dtype=typ), + "B_b": arr([1, 1, 
0, 0], dtype=typ), + "B_c": arr([0, 0, 1, 0], dtype=typ), + "B_nan": arr([0, 0, 0, 1], dtype=typ), + } + ).sort_index(axis=1) + + tm.assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): + df["cat"] = pd.Categorical(["x", "y", "y"]) + result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) + if sparse: + arr = SparseArray + typ = SparseDtype(dtype, 0) + else: + arr = np.array + typ = dtype + + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + "cat_x": arr([1, 0, 0], dtype=typ), + "cat_y": arr([0, 1, 1], dtype=typ), + } + ).sort_index(axis=1) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "get_dummies_kwargs,expected", + [ + ( + {"data": pd.DataFrame(({"ä": ["a"]}))}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["ä"]})}, + pd.DataFrame({"x_ä": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, + pd.DataFrame({"xäa": [1]}, dtype=np.uint8), + ), + ], + ) + def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): + # GH22084 pd.get_dummies incorrectly encodes unicode characters + # in dataframe column names + result = get_dummies(**get_dummies_kwargs) + tm.assert_frame_equal(result, expected) + + def test_basic_drop_first(self, sparse): + # GH12402 Add a new parameter `drop_first` to avoid collinearity + # Basic case + s_list = list("abc") + s_series = Series(s_list) + s_series_index = Series(s_list, list("ABC")) + + expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) + + result = get_dummies(s_list, drop_first=True, sparse=sparse) + if sparse: + expected = expected.apply(SparseArray, fill_value=0) + tm.assert_frame_equal(result, expected) + + result = get_dummies(s_series, drop_first=True, sparse=sparse) + tm.assert_frame_equal(result, expected) + + expected.index = list("ABC") + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) + tm.assert_frame_equal(result, expected) + + def test_basic_drop_first_one_level(self, sparse): + # Test the case that categorical variable only has one level. 
+ s_list = list("aaa") + s_series = Series(s_list) + s_series_index = Series(s_list, list("ABC")) + + expected = DataFrame(index=np.arange(3)) + + result = get_dummies(s_list, drop_first=True, sparse=sparse) + tm.assert_frame_equal(result, expected) + + result = get_dummies(s_series, drop_first=True, sparse=sparse) + tm.assert_frame_equal(result, expected) + + expected = DataFrame(index=list("ABC")) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) + tm.assert_frame_equal(result, expected) + + def test_basic_drop_first_NA(self, sparse): + # Test NA handling together with drop_first + s_NA = ["a", "b", np.nan] + res = get_dummies(s_NA, drop_first=True, sparse=sparse) + exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) + if sparse: + exp = exp.apply(SparseArray, fill_value=0) + + tm.assert_frame_equal(res, exp) + + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) + exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex( + ["b", np.nan], axis=1 + ) + if sparse: + exp_na = exp_na.apply(SparseArray, fill_value=0) + tm.assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies( + [np.nan], dummy_na=True, drop_first=True, sparse=sparse + ) + exp_just_na = DataFrame(index=np.arange(1)) + tm.assert_frame_equal(res_just_na, exp_just_na) + + def test_dataframe_dummies_drop_first(self, df, sparse): + df = df[["A", "B"]] + result = get_dummies(df, drop_first=True, sparse=sparse) + expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) + if sparse: + expected = expected.apply(SparseArray, fill_value=0) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): + df["cat"] = pd.Categorical(["x", "y", "y"]) + result = get_dummies(df, drop_first=True, sparse=sparse) + expected = DataFrame( + {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} + ) + cols = ["A_b", "B_c", "cat_y"] + expected[cols] = expected[cols].astype(np.uint8) + expected = expected[["C", "A_b", "B_c", "cat_y"]] + if sparse: + for col in cols: + expected[col] = SparseArray(expected[col]) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_na(self, df, sparse): + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies( + df, dummy_na=True, drop_first=True, sparse=sparse + ).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_b": [0, 1, 0, 0], + "A_nan": [0, 0, 0, 1], + "B_c": [0, 0, 1, 0], + "B_nan": [0, 0, 0, 1], + } + ) + cols = ["A_b", "A_nan", "B_c", "B_nan"] + expected[cols] = expected[cols].astype(np.uint8) + expected = expected.sort_index(axis=1) + if sparse: + for col in cols: + expected[col] = SparseArray(expected[col]) + + tm.assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse) + expected = expected[["C", "A_b", "B_c"]] + tm.assert_frame_equal(result, expected) + + def test_int_int(self): + data = Series([1, 2, 1]) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + data = Series(pd.Categorical(["a", "b", "a"])) + result = pd.get_dummies(data) + expected = DataFrame( + [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8 + ) + tm.assert_frame_equal(result, expected) + + def test_int_df(self, dtype): + data = DataFrame( + { + "A": [1, 2, 1], + "B": pd.Categorical(["a", "b", "a"]), + "C": [1, 
2, 1], + "D": [1.0, 2.0, 1.0], + } + ) + columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"] + expected = DataFrame( + [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]], + columns=columns, + ) + expected[columns[2:]] = expected[columns[2:]].astype(dtype) + result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): + # GH13854 + for ordered in [False, True]: + cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) + result = get_dummies(cat, dtype=dtype) + + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) + cols = pd.CategoricalIndex( + cat.categories, categories=cat.categories, ordered=ordered + ) + expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("sparse", [True, False]) + def test_get_dummies_dont_sparsify_all_columns(self, sparse): + # GH18914 + df = DataFrame.from_dict( + OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])]) + ) + df = get_dummies(df, columns=["Nation"], sparse=sparse) + df2 = df.reindex(columns=["GDP"]) + + tm.assert_frame_equal(df[["GDP"]], df2) + + def test_get_dummies_duplicate_columns(self, df): + # GH20839 + df.columns = ["A", "A", "A"] + result = get_dummies(df).sort_index(axis=1) + + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["A", "A_a", "A_b", "A_b", "A_c"], + dtype=np.uint8, + ).sort_index(axis=1) + + expected = expected.astype({"A": np.int64}) + + tm.assert_frame_equal(result, expected) + + def test_get_dummies_all_sparse(self): + df = pd.DataFrame({"A": [1, 2]}) + result = pd.get_dummies(df, columns=["A"], sparse=True) + dtype = SparseDtype("uint8", 0) + expected = pd.DataFrame( + { + "A_1": SparseArray([1, 0], dtype=dtype), + "A_2": SparseArray([0, 1], dtype=dtype), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("values", ["baz"]) + def test_get_dummies_with_string_values(self, values): + # issue #28383 + df = pd.DataFrame( + { + "bar": [1, 2, 3, 4, 5, 6], + "foo": ["one", "one", "one", "two", "two", "two"], + "baz": ["A", "B", "C", "A", "B", "C"], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + msg = "Input must be a list-like for parameter `columns`" + + with pytest.raises(TypeError, match=msg): + pd.get_dummies(df, columns=values) + + +class TestCategoricalReshape: + def test_reshaping_multi_index_categorical(self): + + cols = ["ItemA", "ItemB", "ItemC"] + data = {c: tm.makeTimeDataFrame() for c in cols} + df = pd.concat({c: data[c].stack() for c in data}, axis="columns") + df.index.names = ["major", "minor"] + df["str"] = "foo" + + df["category"] = df["str"].astype("category") + result = df["category"].unstack() + + dti = df.index.levels[0] + c = Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=Index(list("ABCD"), name="minor"), + index=dti.rename("major"), + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_union_categoricals.py b/venv/Lib/site-packages/pandas/tests/reshape/test_union_categoricals.py new file mode 100644 index 0000000..a503173 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_union_categoricals.py @@ -0,0 +1,348 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.concat import union_categoricals + +import pandas 
as pd +from pandas import Categorical, CategoricalIndex, Series +import pandas._testing as tm + + +class TestUnionCategoricals: + def test_union_categorical(self): + # GH 13361 + data = [ + (list("abc"), list("abd"), list("abcabd")), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + ( + ["b", "b", np.nan, "a"], + ["a", np.nan, "c"], + ["b", "b", np.nan, "a", "a", np.nan, "c"], + ), + ( + pd.date_range("2014-01-01", "2014-01-05"), + pd.date_range("2014-01-06", "2014-01-07"), + pd.date_range("2014-01-01", "2014-01-07"), + ), + ( + pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"), + pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"), + pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"), + ), + ( + pd.period_range("2014-01-01", "2014-01-05"), + pd.period_range("2014-01-06", "2014-01-07"), + pd.period_range("2014-01-01", "2014-01-07"), + ), + ] + + for a, b, combined in data: + for box in [Categorical, CategoricalIndex, Series]: + result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, check_category_order=True) + + # new categories ordered by appearance + s = Categorical(["x", "y", "z"]) + s2 = Categorical(["a", "b", "c"]) + result = union_categoricals([s, s2]) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) + tm.assert_categorical_equal(result, expected) + + s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) + tm.assert_categorical_equal(result, expected) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 4]) + msg = "dtype of categories must be the same" + with pytest.raises(TypeError, match=msg): + union_categoricals([s, s2]) + + msg = "No Categoricals to union" + with pytest.raises(ValueError, match=msg): + union_categoricals([]) + + def test_union_categoricals_nan(self): + # GH 13759 + res = union_categoricals( + [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])] + ) + exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals( + [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])] + ) + exp = Categorical(["A", "B", "B", "B", np.nan]) + tm.assert_categorical_equal(res, exp) + + val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT] + val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")] + + res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) + exp = Categorical( + val1 + val2, + categories=[ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-03-01"), + pd.Timestamp("2011-02-01"), + ], + ) + tm.assert_categorical_equal(res, exp) + + # all NaN + res = union_categoricals( + [ + pd.Categorical(np.array([np.nan, np.nan], dtype=object)), + pd.Categorical(["X"]), + ] + ) + exp = Categorical([np.nan, np.nan, "X"]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals( + [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])] + ) + exp = Categorical([np.nan, np.nan, np.nan, np.nan]) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_empty(self): + # GH 13759 + res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) + exp = Categorical([]) + tm.assert_categorical_equal(res, exp) + + res = 
union_categoricals([Categorical([]), Categorical(["1"])]) + exp = Categorical(["1"]) + tm.assert_categorical_equal(res, exp) + + def test_union_categorical_same_category(self): + # check fastpath + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"]) + c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"]) + res = union_categoricals([c1, c2]) + exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]) + tm.assert_categorical_equal(res, exp) + + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"]) + result = union_categoricals([c1, c2]) + expected = Categorical( + ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"] + ) + tm.assert_categorical_equal(result, expected) + + def test_union_categoricals_ordered(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + msg = "Categorical.ordered must be the same" + with pytest.raises(TypeError, match=msg): + union_categoricals([c1, c2]) + + res = union_categoricals([c1, c1]) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + msg = "to union ordered Categoricals, all categories must be the same" + with pytest.raises(TypeError, match=msg): + union_categoricals([c1, c2]) + + def test_union_categoricals_ignore_order(self): + # GH 15219 + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + msg = "Categorical.ordered must be the same" + with pytest.raises(TypeError, match=msg): + union_categoricals([c1, c2], ignore_order=False) + + res = union_categoricals([c1, c1], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c1, c1], ignore_order=False) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, np.nan, 3, 2]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + c1 = 
Categorical([1, 2, 3], ordered=True) + c2 = Categorical([4, 5, 6], ordered=True) + result = union_categoricals([c1, c2], ignore_order=True) + expected = Categorical([1, 2, 3, 4, 5, 6]) + tm.assert_categorical_equal(result, expected) + + msg = "to union ordered Categoricals, all categories must be the same" + with pytest.raises(TypeError, match=msg): + union_categoricals([c1, c2], ignore_order=False) + + with pytest.raises(TypeError, match=msg): + union_categoricals([c1, c2]) + + def test_union_categoricals_sort(self): + # GH 13846 + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"] + ) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(["a", "b"], categories=["c", "a", "b"]) + c2 = Categorical(["b", "c"], categories=["c", "a", "b"]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([np.nan, np.nan]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) + with pytest.raises(TypeError): + union_categoricals([c1, c2], sort_categories=True) + + def test_union_categoricals_sort_false(self): + # GH 13846 + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"]) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) + result = union_categoricals([c1, c2], sort_categories=False) + 
expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([np.nan, np.nan]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical( + ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_union_categorical_unwrap(self): + # GH 14173 + c1 = Categorical(["a", "b"]) + c2 = pd.Series(["b", "c"], dtype="category") + result = union_categoricals([c1, c2]) + expected = Categorical(["a", "b", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + c2 = CategoricalIndex(c2) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + c1 = Series(c1) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + with pytest.raises(TypeError): + union_categoricals([c1, ["a", "b", "c"]]) diff --git a/venv/Lib/site-packages/pandas/tests/reshape/test_util.py b/venv/Lib/site-packages/pandas/tests/reshape/test_util.py new file mode 100644 index 0000000..cd518dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/reshape/test_util.py @@ -0,0 +1,51 @@ +import numpy as np +import pytest + +from pandas import Index, date_range +import pandas._testing as tm +from pandas.core.reshape.util import cartesian_product + + +class TestCartesianProduct: + def test_simple(self): + x, y = list("ABC"), [1, 22] + result1, result2 = cartesian_product([x, y]) + expected1 = np.array(["A", "A", "B", "B", "C", "C"]) + expected2 = np.array([1, 22, 1, 22, 1, 22]) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) + + def test_datetimeindex(self): + # regression test for GitHub issue #6439 + # make sure that the ordering on datetimeindex is consistent + x = date_range("2000-01-01", periods=2) + result1, result2 = [Index(y).day for y in cartesian_product([x, x])] + expected1 = Index([1, 1, 2, 2]) + expected2 = Index([1, 2, 1, 2]) + tm.assert_index_equal(result1, expected1) + tm.assert_index_equal(result2, expected2) + + def test_empty(self): + # product of empty factors + X = [[], [0, 1], []] + Y = [[], [], ["a", "b", "c"]] + for x, y in zip(X, Y): + expected1 = np.array([], dtype=np.asarray(x).dtype) + expected2 = np.array([], dtype=np.asarray(y).dtype) + result1, result2 = cartesian_product([x, y]) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) + + # empty product (empty input): + result = cartesian_product([]) + expected = [] + assert result == expected + + @pytest.mark.parametrize( + "X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] + ) + def test_invalid_input(self, X): + msg = "Input must be a 
list-like of list-likes" + + with pytest.raises(TypeError, match=msg): + cartesian_product(X=X) diff --git a/venv/Lib/site-packages/pandas/tests/scalar/__init__.py b/venv/Lib/site-packages/pandas/tests/scalar/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/interval/__init__.py b/venv/Lib/site-packages/pandas/tests/scalar/interval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/interval/test_interval.py b/venv/Lib/site-packages/pandas/tests/scalar/interval/test_interval.py new file mode 100644 index 0000000..b51429d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/interval/test_interval.py @@ -0,0 +1,259 @@ +import numpy as np +import pytest + +from pandas import Interval, Period, Timedelta, Timestamp +import pandas.core.common as com + + +@pytest.fixture +def interval(): + return Interval(0, 1) + + +class TestInterval: + def test_properties(self, interval): + assert interval.closed == "right" + assert interval.left == 0 + assert interval.right == 1 + assert interval.mid == 0.5 + + def test_repr(self, interval): + assert repr(interval) == "Interval(0, 1, closed='right')" + assert str(interval) == "(0, 1]" + + interval_left = Interval(0, 1, closed="left") + assert repr(interval_left) == "Interval(0, 1, closed='left')" + assert str(interval_left) == "[0, 1)" + + def test_contains(self, interval): + assert 0.5 in interval + assert 1 in interval + assert 0 not in interval + + msg = "__contains__ not defined for two intervals" + with pytest.raises(TypeError, match=msg): + interval in interval + + interval_both = Interval(0, 1, closed="both") + assert 0 in interval_both + assert 1 in interval_both + + interval_neither = Interval(0, 1, closed="neither") + assert 0 not in interval_neither + assert 0.5 in interval_neither + assert 1 not in interval_neither + + def test_equal(self): + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, closed="left") + assert Interval(0, 1) != 0 + + def test_comparison(self): + with pytest.raises(TypeError, match="unorderable types"): + Interval(0, 1) < 2 + + assert Interval(0, 1) < Interval(1, 2) + assert Interval(0, 1) < Interval(0, 2) + assert Interval(0, 1) < Interval(0.5, 1.5) + assert Interval(0, 1) <= Interval(0, 1) + assert Interval(0, 1) > Interval(-1, 2) + assert Interval(0, 1) >= Interval(0, 1) + + def test_hash(self, interval): + # should not raise + hash(interval) + + @pytest.mark.parametrize( + "left, right, expected", + [ + (0, 5, 5), + (-2, 5.5, 7.5), + (10, 10, 0), + (10, np.inf, np.inf), + (-np.inf, -5, np.inf), + (-np.inf, np.inf, np.inf), + (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), + (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), + (Timedelta("1H10M"), Timedelta("5H5M"), Timedelta("3H55M")), + (Timedelta("5S"), Timedelta("1H"), Timedelta("59M55S")), + ], + ) + def test_length(self, left, right, expected): + # GH 18789 + iv = Interval(left, right) + result = iv.length + assert result == expected + + @pytest.mark.parametrize( + "left, right, expected", + [ + ("2017-01-01", "2017-01-06", "5 days"), + ("2017-01-01", "2017-01-01 12:00:00", "12 hours"), + ("2017-01-01 12:00", "2017-01-01 12:00:00", "0 days"), + ("2017-01-01 12:01", "2017-01-05 17:31:00", "4 days 5 hours 30 min"), + ], + ) + @pytest.mark.parametrize("tz", (None, "UTC", "CET", "US/Eastern")) + def test_length_timestamp(self, tz, left, right, expected): + # GH 
18789 + iv = Interval(Timestamp(left, tz=tz), Timestamp(right, tz=tz)) + result = iv.length + expected = Timedelta(expected) + assert result == expected + + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + ), + ], + ) + def test_is_empty(self, left, right, closed): + # GH27219 + # non-empty always return False + iv = Interval(left, right, closed) + assert iv.is_empty is False + + # same endpoint is empty except when closed='both' (contains one point) + iv = Interval(left, left, closed) + result = iv.is_empty + expected = closed != "both" + assert result is expected + + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) + def test_construct_errors(self, left, right): + # GH 23013 + msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" + with pytest.raises(ValueError, match=msg): + Interval(left, right) + + def test_math_add(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(1, 2, closed=closed) + + result = interval + 1 + assert result == expected + + result = 1 + interval + assert result == expected + + result = interval + result += 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for \+" + with pytest.raises(TypeError, match=msg): + interval + interval + + with pytest.raises(TypeError, match=msg): + interval + "foo" + + def test_math_sub(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(-1, 0, closed=closed) + + result = interval - 1 + assert result == expected + + result = interval + result -= 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + interval - interval + + with pytest.raises(TypeError, match=msg): + interval - "foo" + + def test_math_mult(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 2, closed=closed) + + result = interval * 2 + assert result == expected + + result = 2 * interval + assert result == expected + + result = interval + result *= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for \*" + with pytest.raises(TypeError, match=msg): + interval * interval + + msg = r"can\'t multiply sequence by non-int" + with pytest.raises(TypeError, match=msg): + interval * "foo" + + def test_math_div(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 0.5, closed=closed) + + result = interval / 2.0 + assert result == expected + + result = interval + result /= 2.0 + assert result == expected + + msg = r"unsupported operand type\(s\) for /" + with pytest.raises(TypeError, match=msg): + interval / interval + + with pytest.raises(TypeError, match=msg): + interval / "foo" + + def test_math_floordiv(self, closed): + interval = Interval(1, 2, closed=closed) + expected = Interval(0, 1, closed=closed) + + result = interval // 2 + assert result == expected + + result = interval + result //= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for //" + with pytest.raises(TypeError, match=msg): + interval // interval + + with pytest.raises(TypeError, match=msg): + interval // "foo" + + def test_constructor_errors(self): + msg = "invalid option for 
'closed': foo" + with pytest.raises(ValueError, match=msg): + Interval(0, 1, closed="foo") + + msg = "left side of interval must be <= right side" + with pytest.raises(ValueError, match=msg): + Interval(1, 0) + + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) + def test_constructor_errors_tz(self, tz_left, tz_right): + # GH 18538 + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) + error = TypeError if com.any_none(tz_left, tz_right) else ValueError + with pytest.raises(error): + Interval(left, right) diff --git a/venv/Lib/site-packages/pandas/tests/scalar/interval/test_ops.py b/venv/Lib/site-packages/pandas/tests/scalar/interval/test_ops.py new file mode 100644 index 0000000..2d9f095 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/interval/test_ops.py @@ -0,0 +1,64 @@ +"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" +import pytest + +from pandas import Interval, Timedelta, Timestamp + + +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) +def start_shift(request): + """ + Fixture for generating intervals of types from a start value and a shift + value that can be added to start to generate an endpoint + """ + return request.param + + +class TestOverlaps: + def test_overlaps_self(self, start_shift, closed): + start, shift = start_shift + interval = Interval(start, start + shift, closed) + assert interval.overlaps(interval) + + def test_overlaps_nested(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + 3 * shift, other_closed) + interval2 = Interval(start + shift, start + 2 * shift, closed) + + # nested intervals should always overlap + assert interval1.overlaps(interval2) + + def test_overlaps_disjoint(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + shift, other_closed) + interval2 = Interval(start + 2 * shift, start + 3 * shift, closed) + + # disjoint intervals should never overlap + assert not interval1.overlaps(interval2) + + def test_overlaps_endpoint(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + shift, other_closed) + interval2 = Interval(start + shift, start + 2 * shift, closed) + + # overlap if shared endpoint is closed for both (overlap at a point) + result = interval1.overlaps(interval2) + expected = interval1.closed_right and interval2.closed_left + assert result == expected + + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) + def test_overlaps_invalid_type(self, other): + interval = Interval(0, 1) + msg = f"`other` must be an Interval, got {type(other).__name__}" + with pytest.raises(TypeError, match=msg): + interval.overlaps(other) diff --git a/venv/Lib/site-packages/pandas/tests/scalar/period/__init__.py b/venv/Lib/site-packages/pandas/tests/scalar/period/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/period/test_asfreq.py b/venv/Lib/site-packages/pandas/tests/scalar/period/test_asfreq.py new file mode 100644 index 0000000..357274e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/period/test_asfreq.py @@ -0,0 +1,780 @@ +import pytest + +from 
pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map +from pandas.errors import OutOfBoundsDatetime + +from pandas import Period, offsets + + +class TestFreqConversion: + """Test frequency conversion of date objects""" + + @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) + def test_asfreq_near_zero(self, freq): + # GH#19643, GH#19650 + per = Period("0001-01-01", freq=freq) + tup1 = (per.year, per.hour, per.day) + + prev = per - 1 + assert prev.ordinal == per.ordinal - 1 + tup2 = (prev.year, prev.month, prev.day) + assert tup2 < tup1 + + def test_asfreq_near_zero_weekly(self): + # GH#19834 + per1 = Period("0001-01-01", "D") + 6 + per2 = Period("0001-01-01", "D") - 6 + week1 = per1.asfreq("W") + week2 = per2.asfreq("W") + assert week1 != week2 + assert week1.asfreq("D", "E") >= per1 + assert week2.asfreq("D", "S") <= per2 + + def test_to_timestamp_out_of_bounds(self): + # GH#19643, used to incorrectly give Timestamp in 1754 + per = Period("0001-01-01", freq="B") + with pytest.raises(OutOfBoundsDatetime): + per.to_timestamp() + + def test_asfreq_corner(self): + val = Period(freq="A", year=2007) + result1 = val.asfreq("5t") + result2 = val.asfreq("t") + expected = Period("2007-12-31 23:59", freq="t") + assert result1.ordinal == expected.ordinal + assert result1.freqstr == "5T" + assert result2.ordinal == expected.ordinal + assert result2.freqstr == "T" + + def test_conv_annual(self): + # frequency conversion tests: from Annual Frequency + + ival_A = Period(freq="A", year=2007) + + ival_AJAN = Period(freq="A-JAN", year=2007) + ival_AJUN = Period(freq="A-JUN", year=2007) + ival_ANOV = Period(freq="A-NOV", year=2007) + + ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) + ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) + ival_A_to_M_start = Period(freq="M", year=2007, month=1) + ival_A_to_M_end = Period(freq="M", year=2007, month=12) + ival_A_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq="W", year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_A_to_T_end = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_A_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_A_to_S_end = Period( + freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + ) + + ival_AJAN_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq="D", year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq="D", year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq="D", year=2006, month=12, day=1) + + assert ival_A.asfreq("Q", "S") == ival_A_to_Q_start + assert ival_A.asfreq("Q", "e") == ival_A_to_Q_end + assert ival_A.asfreq("M", "s") == ival_A_to_M_start + assert ival_A.asfreq("M", "E") == ival_A_to_M_end + assert ival_A.asfreq("W", "S") == ival_A_to_W_start + assert 
ival_A.asfreq("W", "E") == ival_A_to_W_end + assert ival_A.asfreq("B", "S") == ival_A_to_B_start + assert ival_A.asfreq("B", "E") == ival_A_to_B_end + assert ival_A.asfreq("D", "S") == ival_A_to_D_start + assert ival_A.asfreq("D", "E") == ival_A_to_D_end + assert ival_A.asfreq("H", "S") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end + assert ival_A.asfreq("min", "S") == ival_A_to_T_start + assert ival_A.asfreq("min", "E") == ival_A_to_T_end + assert ival_A.asfreq("T", "S") == ival_A_to_T_start + assert ival_A.asfreq("T", "E") == ival_A_to_T_end + assert ival_A.asfreq("S", "S") == ival_A_to_S_start + assert ival_A.asfreq("S", "E") == ival_A_to_S_end + + assert ival_AJAN.asfreq("D", "S") == ival_AJAN_to_D_start + assert ival_AJAN.asfreq("D", "E") == ival_AJAN_to_D_end + + assert ival_AJUN.asfreq("D", "S") == ival_AJUN_to_D_start + assert ival_AJUN.asfreq("D", "E") == ival_AJUN_to_D_end + + assert ival_ANOV.asfreq("D", "S") == ival_ANOV_to_D_start + assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end + + assert ival_A.asfreq("A") == ival_A + + def test_conv_quarterly(self): + # frequency conversion tests: from Quarterly Frequency + + ival_Q = Period(freq="Q", year=2007, quarter=1) + ival_Q_end_of_year = Period(freq="Q", year=2007, quarter=4) + + ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) + ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) + + ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_M_start = Period(freq="M", year=2007, month=1) + ival_Q_to_M_end = Period(freq="M", year=2007, month=3) + ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq="W", year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_Q_to_T_end = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_Q_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_Q_to_S_end = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + + ival_QEJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq="D", year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) + + assert ival_Q.asfreq("A") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + + assert ival_Q.asfreq("M", "S") == ival_Q_to_M_start + assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end + assert ival_Q.asfreq("W", "S") == ival_Q_to_W_start + assert ival_Q.asfreq("W", "E") == ival_Q_to_W_end + assert ival_Q.asfreq("B", "S") == ival_Q_to_B_start + assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end + assert ival_Q.asfreq("D", "S") == ival_Q_to_D_start + assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end + assert ival_Q.asfreq("H", "S") == ival_Q_to_H_start + assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("Min", "S") == ival_Q_to_T_start + assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end + assert ival_Q.asfreq("S", "S") 
== ival_Q_to_S_start + assert ival_Q.asfreq("S", "E") == ival_Q_to_S_end + + assert ival_QEJAN.asfreq("D", "S") == ival_QEJAN_to_D_start + assert ival_QEJAN.asfreq("D", "E") == ival_QEJAN_to_D_end + assert ival_QEJUN.asfreq("D", "S") == ival_QEJUN_to_D_start + assert ival_QEJUN.asfreq("D", "E") == ival_QEJUN_to_D_end + + assert ival_Q.asfreq("Q") == ival_Q + + def test_conv_monthly(self): + # frequency conversion tests: from Monthly Frequency + + ival_M = Period(freq="M", year=2007, month=1) + ival_M_end_of_year = Period(freq="M", year=2007, month=12) + ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) + ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_M_to_T_end = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_M_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_M_to_S_end = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + + assert ival_M.asfreq("A") == ival_M_to_A + assert ival_M_end_of_year.asfreq("A") == ival_M_to_A + assert ival_M.asfreq("Q") == ival_M_to_Q + assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q + + assert ival_M.asfreq("W", "S") == ival_M_to_W_start + assert ival_M.asfreq("W", "E") == ival_M_to_W_end + assert ival_M.asfreq("B", "S") == ival_M_to_B_start + assert ival_M.asfreq("B", "E") == ival_M_to_B_end + assert ival_M.asfreq("D", "S") == ival_M_to_D_start + assert ival_M.asfreq("D", "E") == ival_M_to_D_end + assert ival_M.asfreq("H", "S") == ival_M_to_H_start + assert ival_M.asfreq("H", "E") == ival_M_to_H_end + assert ival_M.asfreq("Min", "S") == ival_M_to_T_start + assert ival_M.asfreq("Min", "E") == ival_M_to_T_end + assert ival_M.asfreq("S", "S") == ival_M_to_S_start + assert ival_M.asfreq("S", "E") == ival_M_to_S_end + + assert ival_M.asfreq("M") == ival_M + + def test_conv_weekly(self): + # frequency conversion tests: from Weekly Frequency + ival_W = Period(freq="W", year=2007, month=1, day=1) + + ival_WSUN = Period(freq="W", year=2007, month=1, day=7) + ival_WSAT = Period(freq="W-SAT", year=2007, month=1, day=6) + ival_WFRI = Period(freq="W-FRI", year=2007, month=1, day=5) + ival_WTHU = Period(freq="W-THU", year=2007, month=1, day=4) + ival_WWED = Period(freq="W-WED", year=2007, month=1, day=3) + ival_WTUE = Period(freq="W-TUE", year=2007, month=1, day=2) + ival_WMON = Period(freq="W-MON", year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq="D", year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq="D", year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq="D", year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq="D", year=2007, month=1, day=5) + ival_WTHU_to_D_start = 
Period(freq="D", year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq="D", year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq="D", year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq="D", year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq="D", year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq="D", year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq="D", year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq="D", year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) + ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_W_to_M = Period(freq="M", year=2007, month=1) + + if Period(freq="D", year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq="A", year=2007) + else: + ival_W_to_A_end_of_year = Period(freq="A", year=2008) + + if Period(freq="D", year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) + else: + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=2) + + if Period(freq="D", year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=1) + else: + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=2) + + ival_W_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_W_to_T_end = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_W_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_W_to_S_end = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + + assert ival_W.asfreq("A") == ival_W_to_A + assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + + assert ival_W.asfreq("Q") == ival_W_to_Q + assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter + + assert ival_W.asfreq("M") == ival_W_to_M + assert ival_W_end_of_month.asfreq("M") == ival_W_to_M_end_of_month + + assert ival_W.asfreq("B", "S") == ival_W_to_B_start + assert ival_W.asfreq("B", "E") == ival_W_to_B_end + + assert ival_W.asfreq("D", "S") == ival_W_to_D_start + assert ival_W.asfreq("D", "E") == ival_W_to_D_end + + assert ival_WSUN.asfreq("D", "S") == ival_WSUN_to_D_start + assert ival_WSUN.asfreq("D", "E") == ival_WSUN_to_D_end + assert ival_WSAT.asfreq("D", "S") == ival_WSAT_to_D_start + assert ival_WSAT.asfreq("D", "E") == ival_WSAT_to_D_end + assert ival_WFRI.asfreq("D", "S") == ival_WFRI_to_D_start + assert ival_WFRI.asfreq("D", "E") == ival_WFRI_to_D_end + assert ival_WTHU.asfreq("D", "S") == ival_WTHU_to_D_start + assert ival_WTHU.asfreq("D", "E") == ival_WTHU_to_D_end + assert ival_WWED.asfreq("D", "S") == ival_WWED_to_D_start + assert ival_WWED.asfreq("D", "E") == ival_WWED_to_D_end + assert ival_WTUE.asfreq("D", "S") == ival_WTUE_to_D_start + assert ival_WTUE.asfreq("D", "E") == 
ival_WTUE_to_D_end + assert ival_WMON.asfreq("D", "S") == ival_WMON_to_D_start + assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end + + assert ival_W.asfreq("H", "S") == ival_W_to_H_start + assert ival_W.asfreq("H", "E") == ival_W_to_H_end + assert ival_W.asfreq("Min", "S") == ival_W_to_T_start + assert ival_W.asfreq("Min", "E") == ival_W_to_T_end + assert ival_W.asfreq("S", "S") == ival_W_to_S_start + assert ival_W.asfreq("S", "E") == ival_W_to_S_end + + assert ival_W.asfreq("W") == ival_W + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + ival_W.asfreq("WK") + + def test_conv_weekly_legacy(self): + # frequency conversion tests: from Weekly Frequency + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + Period(freq="WK", year=2007, month=1, day=1) + + with pytest.raises(ValueError, match=msg): + Period(freq="WK-SAT", year=2007, month=1, day=6) + with pytest.raises(ValueError, match=msg): + Period(freq="WK-FRI", year=2007, month=1, day=5) + with pytest.raises(ValueError, match=msg): + Period(freq="WK-THU", year=2007, month=1, day=4) + with pytest.raises(ValueError, match=msg): + Period(freq="WK-WED", year=2007, month=1, day=3) + with pytest.raises(ValueError, match=msg): + Period(freq="WK-TUE", year=2007, month=1, day=2) + with pytest.raises(ValueError, match=msg): + Period(freq="WK-MON", year=2007, month=1, day=1) + + def test_conv_business(self): + # frequency conversion tests: from Business Frequency" + + ival_B = Period(freq="B", year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq="B", year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq="B", year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) + + ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_B_to_M = Period(freq="M", year=2007, month=1) + ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_B_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_B_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_B_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_B_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_B.asfreq("A") == ival_B_to_A + assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Q") == ival_B_to_Q + assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q + assert ival_B.asfreq("M") == ival_B_to_M + assert ival_B_end_of_month.asfreq("M") == ival_B_to_M + assert ival_B.asfreq("W") == ival_B_to_W + assert ival_B_end_of_week.asfreq("W") == ival_B_to_W + + assert ival_B.asfreq("D") == ival_B_to_D + + assert ival_B.asfreq("H", "S") == ival_B_to_H_start + assert ival_B.asfreq("H", "E") == ival_B_to_H_end + assert ival_B.asfreq("Min", "S") == ival_B_to_T_start + assert ival_B.asfreq("Min", "E") == ival_B_to_T_end + assert ival_B.asfreq("S", "S") == ival_B_to_S_start + assert ival_B.asfreq("S", "E") == ival_B_to_S_end + + assert ival_B.asfreq("B") == ival_B + + def test_conv_daily(self): + # frequency conversion tests: from Business Frequency" + + ival_D = Period(freq="D", 
year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq="D", year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq="D", year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq="D", year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq="D", year=2007, month=1, day=7) + + ival_D_friday = Period(freq="D", year=2007, month=1, day=5) + ival_D_saturday = Period(freq="D", year=2007, month=1, day=6) + ival_D_sunday = Period(freq="D", year=2007, month=1, day=7) + + # TODO: unused? + # ival_D_monday = Period(freq='D', year=2007, month=1, day=8) + + ival_B_friday = Period(freq="B", year=2007, month=1, day=5) + ival_B_monday = Period(freq="B", year=2007, month=1, day=8) + + ival_D_to_A = Period(freq="A", year=2007) + + ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) + + ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) + ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) + ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) + + ival_D_to_M = Period(freq="M", year=2007, month=1) + ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_D_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_D_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_D_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_D.asfreq("A") == ival_D_to_A + + assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + + assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC + assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN + assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN + assert ival_D.asfreq("Q-DEC") == ival_D_to_QEDEC + assert ival_D.asfreq("M") == ival_D_to_M + assert ival_D_end_of_month.asfreq("M") == ival_D_to_M + assert ival_D.asfreq("W") == ival_D_to_W + assert ival_D_end_of_week.asfreq("W") == ival_D_to_W + + assert ival_D_friday.asfreq("B") == ival_B_friday + assert ival_D_saturday.asfreq("B", "S") == ival_B_friday + assert ival_D_saturday.asfreq("B", "E") == ival_B_monday + assert ival_D_sunday.asfreq("B", "S") == ival_B_friday + assert ival_D_sunday.asfreq("B", "E") == ival_B_monday + + assert ival_D.asfreq("H", "S") == ival_D_to_H_start + assert ival_D.asfreq("H", "E") == ival_D_to_H_end + assert ival_D.asfreq("Min", "S") == ival_D_to_T_start + assert ival_D.asfreq("Min", "E") == ival_D_to_T_end + assert ival_D.asfreq("S", "S") == ival_D_to_S_start + assert ival_D.asfreq("S", "E") == ival_D_to_S_end + + assert ival_D.asfreq("D") == ival_D + + def test_conv_hourly(self): + # frequency conversion tests: from Hourly Frequency" + + ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="H", 
year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + + ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_H_to_M = Period(freq="M", year=2007, month=1) + ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_H_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_H_to_B = Period(freq="B", year=2007, month=1, day=1) + + ival_H_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_H_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + ival_H_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_H_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + + assert ival_H.asfreq("A") == ival_H_to_A + assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Q") == ival_H_to_Q + assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q + assert ival_H.asfreq("M") == ival_H_to_M + assert ival_H_end_of_month.asfreq("M") == ival_H_to_M + assert ival_H.asfreq("W") == ival_H_to_W + assert ival_H_end_of_week.asfreq("W") == ival_H_to_W + assert ival_H.asfreq("D") == ival_H_to_D + assert ival_H_end_of_day.asfreq("D") == ival_H_to_D + assert ival_H.asfreq("B") == ival_H_to_B + assert ival_H_end_of_bus.asfreq("B") == ival_H_to_B + + assert ival_H.asfreq("Min", "S") == ival_H_to_T_start + assert ival_H.asfreq("Min", "E") == ival_H_to_T_end + assert ival_H.asfreq("S", "S") == ival_H_to_S_start + assert ival_H.asfreq("S", "E") == ival_H_to_S_end + + assert ival_H.asfreq("H") == ival_H + + def test_conv_minutely(self): + # frequency conversion tests: from Minutely Frequency" + + ival_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + ival_T_end_of_year = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_T_end_of_quarter = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_T_end_of_month = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_T_end_of_week = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_T_end_of_day = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_bus = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_hour = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + + ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_T_to_M = Period(freq="M", year=2007, month=1) + ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_T_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + assert ival_T.asfreq("A") == ival_T_to_A + assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Q") == ival_T_to_Q + assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q + assert ival_T.asfreq("M") == ival_T_to_M + assert ival_T_end_of_month.asfreq("M") == ival_T_to_M + assert ival_T.asfreq("W") == ival_T_to_W + assert 
ival_T_end_of_week.asfreq("W") == ival_T_to_W + assert ival_T.asfreq("D") == ival_T_to_D + assert ival_T_end_of_day.asfreq("D") == ival_T_to_D + assert ival_T.asfreq("B") == ival_T_to_B + assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B + assert ival_T.asfreq("H") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + + assert ival_T.asfreq("S", "S") == ival_T_to_S_start + assert ival_T.asfreq("S", "E") == ival_T_to_S_end + + assert ival_T.asfreq("Min") == ival_T + + def test_conv_secondly(self): + # frequency conversion tests: from Secondly Frequency" + + ival_S = Period(freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0) + ival_S_end_of_year = Period( + freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_quarter = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_month = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_week = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + ival_S_end_of_day = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_bus = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_hour = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + ival_S_end_of_minute = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_S_to_M = Period(freq="M", year=2007, month=1) + ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + + assert ival_S.asfreq("A") == ival_S_to_A + assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Q") == ival_S_to_Q + assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q + assert ival_S.asfreq("M") == ival_S_to_M + assert ival_S_end_of_month.asfreq("M") == ival_S_to_M + assert ival_S.asfreq("W") == ival_S_to_W + assert ival_S_end_of_week.asfreq("W") == ival_S_to_W + assert ival_S.asfreq("D") == ival_S_to_D + assert ival_S_end_of_day.asfreq("D") == ival_S_to_D + assert ival_S.asfreq("B") == ival_S_to_B + assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B + assert ival_S.asfreq("H") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("Min") == ival_S_to_T + assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T + + assert ival_S.asfreq("S") == ival_S + + def test_asfreq_mult(self): + # normal freq to mult freq + p = Period(freq="A", year=2007) + # ordinal will not change + for freq in ["3A", offsets.YearEnd(3)]: + result = p.asfreq(freq) + expected = Period("2007", freq="3A") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + # ordinal will not change + for freq in ["3A", offsets.YearEnd(3)]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="3A") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + + # mult freq to normal freq + p = Period(freq="3A", year=2007) + # ordinal will change because how=E is the default + for freq in ["A", 
offsets.YearEnd()]: + result = p.asfreq(freq) + expected = Period("2009", freq="A") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + # ordinal will not change + for freq in ["A", offsets.YearEnd()]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="A") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + + p = Period(freq="A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period("2007-12", freq="2M") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + + p = Period(freq="3A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq) + expected = Period("2009-12", freq="2M") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") + + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + + def test_asfreq_combined(self): + # normal freq to combined freq + p = Period("2007", freq="H") + + # ordinal will not change + expected = Period("2007", freq="25H") + for freq, how in zip(["1D1H", "1H1D"], ["E", "S"]): + result = p.asfreq(freq, how=how) + assert result == expected + assert result.ordinal == expected.ordinal + assert result.freq == expected.freq + + # combined freq to normal freq + p1 = Period(freq="1D1H", year=2007) + p2 = Period(freq="1H1D", year=2007) + + # ordinal will change because how=E is the default + result1 = p1.asfreq("H") + result2 = p2.asfreq("H") + expected = Period("2007-01-02", freq="H") + assert result1 == expected + assert result1.ordinal == expected.ordinal + assert result1.freq == expected.freq + assert result2 == expected + assert result2.ordinal == expected.ordinal + assert result2.freq == expected.freq + + # ordinal will not change + result1 = p1.asfreq("H", how="S") + result2 = p2.asfreq("H", how="S") + expected = Period("2007-01-01", freq="H") + assert result1 == expected + assert result1.ordinal == expected.ordinal + assert result1.freq == expected.freq + assert result2 == expected + assert result2.ordinal == expected.ordinal + assert result2.freq == expected.freq + + def test_asfreq_MS(self): + initial = Period("2013") + + assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + initial.asfreq(freq="MS", how="S") + + with pytest.raises(ValueError, match=msg): + Period("2013-01", "MS") + + assert _period_code_map.get("MS") is None diff --git a/venv/Lib/site-packages/pandas/tests/scalar/period/test_period.py b/venv/Lib/site-packages/pandas/tests/scalar/period/test_period.py new file mode 100644 index 0000000..6af9c98 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/period/test_period.py @@ -0,0 +1,1567 @@ +from datetime import date, datetime, timedelta +from distutils.version import StrictVersion + +import dateutil +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import iNaT, period as libperiod +from 
pandas._libs.tslibs.ccalendar import DAYS, MONTHS +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.parsing import DateParseError +from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz +from pandas.compat.numpy import np_datetime64_compat + +import pandas as pd +from pandas import NaT, Period, Timedelta, Timestamp, offsets +import pandas._testing as tm + + +class TestPeriodConstruction: + def test_construction(self): + i1 = Period("1/1/2005", freq="M") + i2 = Period("Jan 2005") + + assert i1 == i2 + + i1 = Period("2005", freq="A") + i2 = Period("2005") + i3 = Period("2005", freq="a") + + assert i1 == i2 + assert i1 == i3 + + i4 = Period("2005", freq="M") + i5 = Period("2005", freq="m") + + msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + i1 != i4 + assert i4 == i5 + + i1 = Period.now("Q") + i2 = Period(datetime.now(), freq="Q") + i3 = Period.now("q") + + assert i1 == i2 + assert i1 == i3 + + i1 = Period("1982", freq="min") + i2 = Period("1982", freq="MIN") + assert i1 == i2 + i2 = Period("1982", freq=("Min", 1)) + assert i1 == i2 + + i1 = Period(year=2005, month=3, day=1, freq="D") + i2 = Period("3/1/2005", freq="D") + assert i1 == i2 + + i3 = Period(year=2005, month=3, day=1, freq="d") + assert i1 == i3 + + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + assert i1 == expected + + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") + assert i1 == expected + + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + assert i1 == expected + + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") + assert i1 == expected + + msg = "Must supply freq for ordinal value" + with pytest.raises(ValueError, match=msg): + Period(ordinal=200701) + + with pytest.raises(ValueError, match="Invalid frequency: X"): + Period("2007-1-1", freq="X") + + def test_construction_bday(self): + + # Biz day construction, roll forward if non-weekday + i1 = Period("3/10/12", freq="B") + i2 = Period("3/10/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/11/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/12/12", freq="D") + assert i1 == i2.asfreq("B") + + i3 = Period("3/10/12", freq="b") + assert i1 == i3 + + i1 = Period(year=2012, month=3, day=10, freq="B") + i2 = Period("3/12/12", freq="B") + assert i1 == i2 + + def test_construction_quarter(self): + + i1 = Period(year=2005, quarter=1, freq="Q") + i2 = Period("1/1/2005", freq="Q") + assert i1 == i2 + + i1 = Period(year=2005, quarter=3, freq="Q") + i2 = Period("9/1/2005", freq="Q") + assert i1 == i2 + + i1 = Period("2005Q1") + i2 = Period(year=2005, quarter=1, freq="Q") + i3 = Period("2005q1") + assert i1 == i2 + assert i1 == i3 + + i1 = Period("05Q1") + assert i1 == i2 + lower = Period("05q1") + assert i1 == lower + + i1 = Period("1Q2005") + assert i1 == i2 + lower = Period("1q2005") + assert i1 == lower + + i1 = Period("1Q05") + assert i1 == i2 + lower = Period("1q05") + assert i1 == lower + + i1 = Period("4Q1984") + assert i1.year == 1984 + lower = Period("4q1984") + assert i1 == lower + + def test_construction_month(self): + + expected = Period("2007-01", freq="M") + i1 = Period("200701", freq="M") + assert i1 == expected + + i1 = Period("200701", freq="M") + assert i1 == expected + + i1 = 
Period(200701, freq="M") + assert i1 == expected + + i1 = Period(ordinal=200701, freq="M") + assert i1.year == 18695 + + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") + assert i1 == i2 + + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") + assert i1 == i2 + assert i1 == i3 + assert i1 == i4 + assert i1 == i5 + + def test_period_constructor_offsets(self): + assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( + "1/1/2005", freq="M" + ) + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") + assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( + "3/10/12", freq="B" + ) + assert Period("3/10/12", freq=offsets.Day()) == Period("3/10/12", freq="D") + + assert Period( + year=2005, quarter=1, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=1, freq="Q") + assert Period( + year=2005, quarter=2, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=2, freq="Q") + + assert Period(year=2005, month=3, day=1, freq=offsets.Day()) == Period( + year=2005, month=3, day=1, freq="D" + ) + assert Period(year=2012, month=3, day=10, freq=offsets.BDay()) == Period( + year=2012, month=3, day=10, freq="B" + ) + + expected = Period("2005-03-01", freq="3D") + assert Period(year=2005, month=3, day=1, freq=offsets.Day(3)) == expected + assert Period(year=2005, month=3, day=1, freq="3D") == expected + + assert Period(year=2012, month=3, day=10, freq=offsets.BDay(3)) == Period( + year=2012, month=3, day=10, freq="3B" + ) + + assert Period(200701, freq=offsets.MonthEnd()) == Period(200701, freq="M") + + i1 = Period(ordinal=200701, freq=offsets.MonthEnd()) + i2 = Period(ordinal=200701, freq="M") + assert i1 == i2 + assert i1.year == 18695 + assert i2.year == 18695 + + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") + assert i1 == i2 + + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") + assert i1 == i2 + assert i1 == i3 + assert i1 == i4 + assert i1 == i5 + + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + assert i1 == expected + + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") + assert i1 == expected + + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + assert i1 == expected + + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") + assert i1 == expected + + def test_invalid_arguments(self): + with pytest.raises(ValueError): + Period(datetime.now()) + with pytest.raises(ValueError): + Period(datetime.now().date()) + + with pytest.raises(ValueError): + Period(1.6, freq="D") + with pytest.raises(ValueError): + Period(ordinal=1.6, freq="D") + with pytest.raises(ValueError): + Period(ordinal=2, value=1, freq="D") + + with pytest.raises(ValueError): + Period(month=1) + + with pytest.raises(ValueError): + Period("-2000", "A") + with pytest.raises(DateParseError): + 
Period("0", "A") + with pytest.raises(DateParseError): + Period("1/1/-2000", "A") + + def test_constructor_corner(self): + expected = Period("2007-01", freq="2M") + assert Period(year=2007, month=1, freq="2M") == expected + + assert Period(None) is NaT + + p = Period("2007-01-01", freq="D") + + result = Period(p, freq="A") + exp = Period("2007", freq="A") + assert result == exp + + def test_constructor_infer_freq(self): + p = Period("2007-01-01") + assert p.freq == "D" + + p = Period("2007-01-01 07") + assert p.freq == "H" + + p = Period("2007-01-01 07:10") + assert p.freq == "T" + + p = Period("2007-01-01 07:10:15") + assert p.freq == "S" + + p = Period("2007-01-01 07:10:15.123") + assert p.freq == "L" + + p = Period("2007-01-01 07:10:15.123000") + assert p.freq == "L" + + p = Period("2007-01-01 07:10:15.123400") + assert p.freq == "U" + + def test_multiples(self): + result1 = Period("1989", freq="2A") + result2 = Period("1989", freq="A") + assert result1.ordinal == result2.ordinal + assert result1.freqstr == "2A-DEC" + assert result2.freqstr == "A-DEC" + assert result1.freq == offsets.YearEnd(2) + assert result2.freq == offsets.YearEnd() + + assert (result1 + 1).ordinal == result1.ordinal + 2 + assert (1 + result1).ordinal == result1.ordinal + 2 + assert (result1 - 1).ordinal == result2.ordinal - 2 + assert (-1 + result1).ordinal == result2.ordinal - 2 + + @pytest.mark.parametrize("month", MONTHS) + def test_period_cons_quarterly(self, month): + # bugs in scikits.timeseries + freq = "Q-{month}".format(month=month) + exp = Period("1989Q3", freq=freq) + assert "1989Q3" in str(exp) + stamp = exp.to_timestamp("D", how="end") + p = Period(stamp, freq=freq) + assert p == exp + + stamp = exp.to_timestamp("3D", how="end") + p = Period(stamp, freq=freq) + assert p == exp + + @pytest.mark.parametrize("month", MONTHS) + def test_period_cons_annual(self, month): + # bugs in scikits.timeseries + freq = "A-{month}".format(month=month) + exp = Period("1989", freq=freq) + stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) + p = Period(stamp, freq=freq) + + assert p == exp + 1 + assert isinstance(p, Period) + + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("num", range(10, 17)) + def test_period_cons_weekly(self, num, day): + daystr = "2011-02-{num}".format(num=num) + freq = "W-{day}".format(day=day) + + result = Period(daystr, freq=freq) + expected = Period(daystr, freq="D").asfreq(freq) + assert result == expected + assert isinstance(result, Period) + + def test_period_from_ordinal(self): + p = Period("2011-01", freq="M") + res = Period._from_ordinal(p.ordinal, freq="M") + assert p == res + assert isinstance(res, Period) + + def test_period_cons_nat(self): + p = Period("NaT", freq="M") + assert p is NaT + + p = Period("nat", freq="W-SUN") + assert p is NaT + + p = Period(iNaT, freq="D") + assert p is NaT + + p = Period(iNaT, freq="3D") + assert p is NaT + + p = Period(iNaT, freq="1D1H") + assert p is NaT + + p = Period("NaT") + assert p is NaT + + p = Period(iNaT) + assert p is NaT + + def test_period_cons_mult(self): + p1 = Period("2011-01", freq="3M") + p2 = Period("2011-01", freq="M") + assert p1.ordinal == p2.ordinal + + assert p1.freq == offsets.MonthEnd(3) + assert p1.freqstr == "3M" + + assert p2.freq == offsets.MonthEnd() + assert p2.freqstr == "M" + + result = p1 + 1 + assert result.ordinal == (p2 + 3).ordinal + + assert result.freq == p1.freq + assert result.freqstr == "3M" + + result = p1 - 1 + assert result.ordinal == (p2 - 3).ordinal + assert result.freq == 
p1.freq + assert result.freqstr == "3M" + + msg = "Frequency must be positive, because it represents span: -3M" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="-3M") + + msg = "Frequency must be positive, because it represents span: 0M" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="0M") + + def test_period_cons_combined(self): + p = [ + ( + Period("2011-01", freq="1D1H"), + Period("2011-01", freq="1H1D"), + Period("2011-01", freq="H"), + ), + ( + Period(ordinal=1, freq="1D1H"), + Period(ordinal=1, freq="1H1D"), + Period(ordinal=1, freq="H"), + ), + ] + + for p1, p2, p3 in p: + assert p1.ordinal == p3.ordinal + assert p2.ordinal == p3.ordinal + + assert p1.freq == offsets.Hour(25) + assert p1.freqstr == "25H" + + assert p2.freq == offsets.Hour(25) + assert p2.freqstr == "25H" + + assert p3.freq == offsets.Hour() + assert p3.freqstr == "H" + + result = p1 + 1 + assert result.ordinal == (p3 + 25).ordinal + assert result.freq == p1.freq + assert result.freqstr == "25H" + + result = p2 + 1 + assert result.ordinal == (p3 + 25).ordinal + assert result.freq == p2.freq + assert result.freqstr == "25H" + + result = p1 - 1 + assert result.ordinal == (p3 - 25).ordinal + assert result.freq == p1.freq + assert result.freqstr == "25H" + + result = p2 - 1 + assert result.ordinal == (p3 - 25).ordinal + assert result.freq == p2.freq + assert result.freqstr == "25H" + + msg = "Frequency must be positive, because it represents span: -25H" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="-1D1H") + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="-1H1D") + with pytest.raises(ValueError, match=msg): + Period(ordinal=1, freq="-1D1H") + with pytest.raises(ValueError, match=msg): + Period(ordinal=1, freq="-1H1D") + + msg = "Frequency must be positive, because it represents span: 0D" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="0D0H") + with pytest.raises(ValueError, match=msg): + Period(ordinal=1, freq="0D0H") + + # You can only combine together day and intraday offsets + msg = "Invalid frequency: 1W1D" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="1W1D") + msg = "Invalid frequency: 1D1W" + with pytest.raises(ValueError, match=msg): + Period("2011-01", freq="1D1W") + + +class TestPeriodMethods: + def test_round_trip(self): + p = Period("2000Q1") + new_p = tm.round_trip_pickle(p) + assert new_p == p + + def test_hash(self): + assert hash(Period("2011-01", freq="M")) == hash(Period("2011-01", freq="M")) + + assert hash(Period("2011-01-01", freq="D")) != hash(Period("2011-01", freq="M")) + + assert hash(Period("2011-01", freq="3M")) != hash(Period("2011-01", freq="2M")) + + assert hash(Period("2011-01", freq="M")) != hash(Period("2011-02", freq="M")) + + # -------------------------------------------------------------- + # to_timestamp + + @pytest.mark.parametrize("tzstr", ["Europe/Brussels", "Asia/Tokyo", "US/Pacific"]) + def test_to_timestamp_tz_arg(self, tzstr): + p = Period("1/1/2005", freq="M").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period("1/1/2005", freq="3H").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period("1/1/2005", 
freq="A").to_timestamp(freq="A", tz=tzstr) + exp = Timestamp("31/12/2005", tz="UTC").tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period("1/1/2005", freq="A").to_timestamp(freq="3H", tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + @pytest.mark.parametrize( + "tzstr", + ["dateutil/Europe/Brussels", "dateutil/Asia/Tokyo", "dateutil/US/Pacific"], + ) + def test_to_timestamp_tz_arg_dateutil(self, tzstr): + tz = maybe_get_tz(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) + assert p == exp + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) + assert p.tz == exp.tz + + p = Period("1/1/2005", freq="M").to_timestamp(freq="3H", tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) + assert p == exp + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) + assert p.tz == exp.tz + + def test_to_timestamp_tz_arg_dateutil_from_string(self): + p = Period("1/1/2005", freq="M").to_timestamp(tz="dateutil/Europe/Brussels") + assert p.tz == dateutil_gettz("Europe/Brussels") + + def test_to_timestamp_mult(self): + p = Period("2011-01", freq="M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-02-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected + + p = Period("2011-01", freq="3M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-04-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected + + def test_to_timestamp(self): + p = Period("1982", freq="A") + start_ts = p.to_timestamp(how="S") + aliases = ["s", "StarT", "BEGIn"] + for a in aliases: + assert start_ts == p.to_timestamp("D", how=a) + # freq with mult should not affect to the result + assert start_ts == p.to_timestamp("3D", how=a) + + end_ts = p.to_timestamp(how="E") + aliases = ["e", "end", "FINIsH"] + for a in aliases: + assert end_ts == p.to_timestamp("D", how=a) + assert end_ts == p.to_timestamp("3D", how=a) + + from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "S"] + + def _ex(p): + return Timestamp((p + p.freq).start_time.value - 1) + + for i, fcode in enumerate(from_lst): + p = Period("1982", freq=fcode) + result = p.to_timestamp().to_period(fcode) + assert result == p + + assert p.start_time == p.to_timestamp(how="S") + + assert p.end_time == _ex(p) + + # Frequency other than daily + + p = Period("1985", freq="A") + + result = p.to_timestamp("H", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") + assert result == expected + result = p.to_timestamp("3H", how="end") + assert result == expected + + result = p.to_timestamp("T", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") + assert result == expected + result = p.to_timestamp("2T", how="end") + assert result == expected + + result = p.to_timestamp(how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") + assert result == expected + + expected = datetime(1985, 1, 1) + result = p.to_timestamp("H", how="start") + assert result == expected + result = p.to_timestamp("T", how="start") + assert result == expected + result = p.to_timestamp("S", how="start") + assert result == expected + result = p.to_timestamp("3H", how="start") + assert result == expected + result = p.to_timestamp("5S", how="start") + 
assert result == expected + + # -------------------------------------------------------------- + # Rendering: __repr__, strftime, etc + + def test_repr(self): + p = Period("Jan-2000") + assert "2000-01" in repr(p) + + p = Period("2000-12-15") + assert "2000-12-15" in repr(p) + + def test_repr_nat(self): + p = Period("nat", freq="M") + assert repr(NaT) in repr(p) + + def test_millisecond_repr(self): + p = Period("2000-01-01 12:15:02.123") + + assert repr(p) == "Period('2000-01-01 12:15:02.123', 'L')" + + def test_microsecond_repr(self): + p = Period("2000-01-01 12:15:02.123567") + + assert repr(p) == "Period('2000-01-01 12:15:02.123567', 'U')" + + def test_strftime(self): + # GH#3363 + p = Period("2000-1-1 12:34:12", freq="S") + res = p.strftime("%Y-%m-%d %H:%M:%S") + assert res == "2000-01-01 12:34:12" + assert isinstance(res, str) + + +class TestPeriodProperties: + "Test properties such as year, month, weekday, etc...." + + @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + def test_is_leap_year(self, freq): + # GH 13727 + p = Period("2000-01-01 00:00:00", freq=freq) + assert p.is_leap_year + assert isinstance(p.is_leap_year, bool) + + p = Period("1999-01-01 00:00:00", freq=freq) + assert not p.is_leap_year + + p = Period("2004-01-01 00:00:00", freq=freq) + assert p.is_leap_year + + p = Period("2100-01-01 00:00:00", freq=freq) + assert not p.is_leap_year + + def test_quarterly_negative_ordinals(self): + p = Period(ordinal=-1, freq="Q-DEC") + assert p.year == 1969 + assert p.quarter == 4 + assert isinstance(p, Period) + + p = Period(ordinal=-2, freq="Q-DEC") + assert p.year == 1969 + assert p.quarter == 3 + assert isinstance(p, Period) + + p = Period(ordinal=-2, freq="M") + assert p.year == 1969 + assert p.month == 11 + assert isinstance(p, Period) + + def test_freq_str(self): + i1 = Period("1982", freq="Min") + assert i1.freq == offsets.Minute() + assert i1.freqstr == "T" + + def test_period_deprecated_freq(self): + cases = { + "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], + "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], + "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], + "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "T": ["minute", "MINUTE", "MINUTELY", "minutely"], + "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], + } + + msg = INVALID_FREQ_ERR_MSG + for exp, freqs in cases.items(): + for freq in freqs: + with pytest.raises(ValueError, match=msg): + Period("2016-03-01 09:00", freq=freq) + with pytest.raises(ValueError, match=msg): + Period(ordinal=1, freq=freq) + + # check supported freq-aliases still works + p1 = Period("2016-03-01 09:00", freq=exp) + p2 = Period(ordinal=1, freq=exp) + assert isinstance(p1, Period) + assert isinstance(p2, Period) + + def test_start_time(self): + freq_lst = ["A", "Q", "M", "D", "H", "T", "S"] + xp = datetime(2012, 1, 1) + for f in freq_lst: + p = Period("2012", freq=f) + assert p.start_time == xp + assert Period("2012", freq="B").start_time == datetime(2012, 1, 2) + assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) + + def test_end_time(self): + p = Period("2012", freq="A") + + def _ex(*args): + return Timestamp(Timestamp(datetime(*args)).value - 1) + + xp = _ex(2013, 1, 1) + assert xp == p.end_time + + p = Period("2012", freq="Q") + xp = _ex(2012, 4, 1) + assert xp == p.end_time + + p = Period("2012", freq="M") + 
xp = _ex(2012, 2, 1) + assert xp == p.end_time + + p = Period("2012", freq="D") + xp = _ex(2012, 1, 2) + assert xp == p.end_time + + p = Period("2012", freq="H") + xp = _ex(2012, 1, 1, 1) + assert xp == p.end_time + + p = Period("2012", freq="B") + xp = _ex(2012, 1, 3) + assert xp == p.end_time + + p = Period("2012", freq="W") + xp = _ex(2012, 1, 2) + assert xp == p.end_time + + # Test for GH 11738 + p = Period("2012", freq="15D") + xp = _ex(2012, 1, 16) + assert xp == p.end_time + + p = Period("2012", freq="1D1H") + xp = _ex(2012, 1, 2, 1) + assert xp == p.end_time + + p = Period("2012", freq="1H1D") + xp = _ex(2012, 1, 2, 1) + assert xp == p.end_time + + def test_anchor_week_end_time(self): + def _ex(*args): + return Timestamp(Timestamp(datetime(*args)).value - 1) + + p = Period("2013-1-1", "W-SAT") + xp = _ex(2013, 1, 6) + assert p.end_time == xp + + def test_properties_annually(self): + # Test properties on Periods with annually frequency. + a_date = Period(freq="A", year=2007) + assert a_date.year == 2007 + + def test_properties_quarterly(self): + # Test properties on Periods with daily frequency. + qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) + qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) + qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) + # + for x in range(3): + for qd in (qedec_date, qejan_date, qejun_date): + assert (qd + x).qyear == 2007 + assert (qd + x).quarter == x + 1 + + def test_properties_monthly(self): + # Test properties on Periods with daily frequency. + m_date = Period(freq="M", year=2007, month=1) + for x in range(11): + m_ival_x = m_date + x + assert m_ival_x.year == 2007 + if 1 <= x + 1 <= 3: + assert m_ival_x.quarter == 1 + elif 4 <= x + 1 <= 6: + assert m_ival_x.quarter == 2 + elif 7 <= x + 1 <= 9: + assert m_ival_x.quarter == 3 + elif 10 <= x + 1 <= 12: + assert m_ival_x.quarter == 4 + assert m_ival_x.month == x + 1 + + def test_properties_weekly(self): + # Test properties on Periods with daily frequency. + w_date = Period(freq="W", year=2007, month=1, day=7) + # + assert w_date.year == 2007 + assert w_date.quarter == 1 + assert w_date.month == 1 + assert w_date.week == 1 + assert (w_date - 1).week == 52 + assert w_date.days_in_month == 31 + assert Period(freq="W", year=2012, month=2, day=1).days_in_month == 29 + + def test_properties_weekly_legacy(self): + # Test properties on Periods with daily frequency. + w_date = Period(freq="W", year=2007, month=1, day=7) + assert w_date.year == 2007 + assert w_date.quarter == 1 + assert w_date.month == 1 + assert w_date.week == 1 + assert (w_date - 1).week == 52 + assert w_date.days_in_month == 31 + + exp = Period(freq="W", year=2012, month=2, day=1) + assert exp.days_in_month == 29 + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + Period(freq="WK", year=2007, month=1, day=7) + + def test_properties_daily(self): + # Test properties on Periods with daily frequency. 
+ b_date = Period(freq="B", year=2007, month=1, day=1) + # + assert b_date.year == 2007 + assert b_date.quarter == 1 + assert b_date.month == 1 + assert b_date.day == 1 + assert b_date.weekday == 0 + assert b_date.dayofyear == 1 + assert b_date.days_in_month == 31 + assert Period(freq="B", year=2012, month=2, day=1).days_in_month == 29 + + d_date = Period(freq="D", year=2007, month=1, day=1) + + assert d_date.year == 2007 + assert d_date.quarter == 1 + assert d_date.month == 1 + assert d_date.day == 1 + assert d_date.weekday == 0 + assert d_date.dayofyear == 1 + assert d_date.days_in_month == 31 + assert Period(freq="D", year=2012, month=2, day=1).days_in_month == 29 + + def test_properties_hourly(self): + # Test properties on Periods with hourly frequency. + h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) + + for h_date in [h_date1, h_date2]: + assert h_date.year == 2007 + assert h_date.quarter == 1 + assert h_date.month == 1 + assert h_date.day == 1 + assert h_date.weekday == 0 + assert h_date.dayofyear == 1 + assert h_date.hour == 0 + assert h_date.days_in_month == 31 + assert ( + Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + ) + + def test_properties_minutely(self): + # Test properties on Periods with minutely frequency. + t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + # + assert t_date.quarter == 1 + assert t_date.month == 1 + assert t_date.day == 1 + assert t_date.weekday == 0 + assert t_date.dayofyear == 1 + assert t_date.hour == 0 + assert t_date.minute == 0 + assert t_date.days_in_month == 31 + assert ( + Period(freq="D", year=2012, month=2, day=1, hour=0, minute=0).days_in_month + == 29 + ) + + def test_properties_secondly(self): + # Test properties on Periods with secondly frequency. 
+ s_date = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + # + assert s_date.year == 2007 + assert s_date.quarter == 1 + assert s_date.month == 1 + assert s_date.day == 1 + assert s_date.weekday == 0 + assert s_date.dayofyear == 1 + assert s_date.hour == 0 + assert s_date.minute == 0 + assert s_date.second == 0 + assert s_date.days_in_month == 31 + assert ( + Period( + freq="Min", year=2012, month=2, day=1, hour=0, minute=0, second=0 + ).days_in_month + == 29 + ) + + +class TestPeriodField: + def test_get_period_field_array_raises_on_out_of_range(self): + msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'" + with pytest.raises(ValueError, match=msg): + libperiod.get_period_field_arr(-1, np.empty(1), 0) + + +class TestComparisons: + def setup_method(self, method): + self.january1 = Period("2000-01", "M") + self.january2 = Period("2000-01", "M") + self.february = Period("2000-02", "M") + self.march = Period("2000-03", "M") + self.day = Period("2012-01-01", "D") + + def test_equal(self): + assert self.january1 == self.january2 + + def test_equal_Raises_Value(self): + with pytest.raises(IncompatibleFrequency): + self.january1 == self.day + + def test_notEqual(self): + assert self.january1 != 1 + assert self.january1 != self.february + + def test_greater(self): + assert self.february > self.january1 + + def test_greater_Raises_Value(self): + with pytest.raises(IncompatibleFrequency): + self.january1 > self.day + + def test_greater_Raises_Type(self): + with pytest.raises(TypeError): + self.january1 > 1 + + def test_greaterEqual(self): + assert self.january1 >= self.january2 + + def test_greaterEqual_Raises_Value(self): + with pytest.raises(IncompatibleFrequency): + self.january1 >= self.day + + with pytest.raises(TypeError): + print(self.january1 >= 1) + + def test_smallerEqual(self): + assert self.january1 <= self.january2 + + def test_smallerEqual_Raises_Value(self): + with pytest.raises(IncompatibleFrequency): + self.january1 <= self.day + + def test_smallerEqual_Raises_Type(self): + with pytest.raises(TypeError): + self.january1 <= 1 + + def test_smaller(self): + assert self.january1 < self.february + + def test_smaller_Raises_Value(self): + with pytest.raises(IncompatibleFrequency): + self.january1 < self.day + + def test_smaller_Raises_Type(self): + with pytest.raises(TypeError): + self.january1 < 1 + + def test_sort(self): + periods = [self.march, self.january1, self.february] + correctPeriods = [self.january1, self.february, self.march] + assert sorted(periods) == correctPeriods + + def test_period_nat_comp(self): + p_nat = Period("NaT", freq="D") + p = Period("2011-01-01", freq="D") + + nat = Timestamp("NaT") + t = Timestamp("2011-01-01") + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [ + (p_nat, p), + (p, p_nat), + (p_nat, p_nat), + (nat, t), + (t, nat), + (nat, nat), + ]: + assert not left < right + assert not left > right + assert not left == right + assert left != right + assert not left <= right + assert not left >= right + + +class TestArithmetic: + def test_sub_delta(self): + left, right = Period("2011", freq="A"), Period("2007", freq="A") + result = left - right + assert result == 4 * right.freq + + with pytest.raises(IncompatibleFrequency): + left - Period("2007-01", freq="M") + + def test_add_integer(self): + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + assert per1 + 1 == per2 + assert 1 + per1 == per2 + + def 
test_add_sub_nat(self): + # GH#13071 + p = Period("2011-01", freq="M") + assert p + NaT is NaT + assert NaT + p is NaT + assert p - NaT is NaT + assert NaT - p is NaT + + p = Period("NaT", freq="M") + assert p is NaT + assert p + NaT is NaT + assert NaT + p is NaT + assert p - NaT is NaT + assert NaT - p is NaT + + def test_add_invalid(self): + # GH#4731 + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + + msg = r"unsupported operand type\(s\)" + with pytest.raises(TypeError, match=msg): + per1 + "str" + with pytest.raises(TypeError, match=msg): + "str" + per1 + with pytest.raises(TypeError, match=msg): + per1 + per2 + + boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] + ids = ["identity", "Series", "Index"] + + @pytest.mark.parametrize("lbox", boxes, ids=ids) + @pytest.mark.parametrize("rbox", boxes, ids=ids) + def test_add_timestamp_raises(self, rbox, lbox): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + # We may get a different message depending on which class raises + # the error. + msg = ( + r"cannot add|unsupported operand|" + r"can only operate on a|incompatible type|" + r"ufunc add cannot use operands" + ) + with pytest.raises(TypeError, match=msg): + lbox(ts) + rbox(per) + + with pytest.raises(TypeError, match=msg): + lbox(per) + rbox(ts) + + with pytest.raises(TypeError, match=msg): + lbox(per) + rbox(per) + + def test_sub(self): + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") + + off = per1.freq + assert per1 - per2 == -14 * off + assert per2 - per1 == 14 * off + + msg = r"Input has different freq=M from Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per1 - Period("2011-02", freq="M") + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH 23878 + p1 = Period("19910905", freq=tick_classes(n)) + p2 = Period("19920406", freq=tick_classes(n)) + + expected = Period(str(p2), freq=p2.freq.base) - Period( + str(p1), freq=p1.freq.base + ) + + assert (p2 - p1) == expected + + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (offsets.YearEnd, "month"), + (offsets.QuarterEnd, "startingMonth"), + (offsets.MonthEnd, None), + (offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): + # GH 23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) + p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) + + expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) + + assert (p2 - p1) == expected + + def test_add_offset(self): + # freq is DateOffset + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + exp = Period("2013", freq=freq) + assert p + offsets.YearEnd(2) == exp + assert offsets.YearEnd(2) + p == exp + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with pytest.raises(TypeError): + o + p + else: + with pytest.raises(IncompatibleFrequency): + o + p + + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) + assert p + offsets.MonthEnd(2) == exp + assert 
offsets.MonthEnd(2) + p == exp + + exp = Period("2012-03", freq=freq) + assert p + offsets.MonthEnd(12) == exp + assert offsets.MonthEnd(12) + p == exp + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with pytest.raises(TypeError): + o + p + else: + with pytest.raises(IncompatibleFrequency): + o + p + + # freq is Tick + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) + + exp = Period("2011-04-06", freq=freq) + assert p + offsets.Day(5) == exp + assert offsets.Day(5) + p == exp + + exp = Period("2011-04-02", freq=freq) + assert p + offsets.Hour(24) == exp + assert offsets.Hour(24) + p == exp + + exp = Period("2011-04-03", freq=freq) + assert p + np.timedelta64(2, "D") == exp + with pytest.raises(TypeError): + np.timedelta64(2, "D") + p + + exp = Period("2011-04-02", freq=freq) + assert p + np.timedelta64(3600 * 24, "s") == exp + with pytest.raises(TypeError): + np.timedelta64(3600 * 24, "s") + p + + exp = Period("2011-03-30", freq=freq) + assert p + timedelta(-2) == exp + assert timedelta(-2) + p == exp + + exp = Period("2011-04-03", freq=freq) + assert p + timedelta(hours=48) == exp + assert timedelta(hours=48) + p == exp + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with pytest.raises(TypeError): + o + p + else: + with pytest.raises(IncompatibleFrequency): + o + p + + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) + + exp = Period("2011-04-03 09:00", freq=freq) + assert p + offsets.Day(2) == exp + assert offsets.Day(2) + p == exp + + exp = Period("2011-04-01 12:00", freq=freq) + assert p + offsets.Hour(3) == exp + assert offsets.Hour(3) + p == exp + + exp = Period("2011-04-01 12:00", freq=freq) + assert p + np.timedelta64(3, "h") == exp + with pytest.raises(TypeError): + np.timedelta64(3, "h") + p + + exp = Period("2011-04-01 10:00", freq=freq) + assert p + np.timedelta64(3600, "s") == exp + with pytest.raises(TypeError): + np.timedelta64(3600, "s") + p + + exp = Period("2011-04-01 11:00", freq=freq) + assert p + timedelta(minutes=120) == exp + assert timedelta(minutes=120) + p == exp + + exp = Period("2011-04-05 12:00", freq=freq) + assert p + timedelta(days=4, minutes=180) == exp + assert timedelta(days=4, minutes=180) + p == exp + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency): + p + o + + if isinstance(o, np.timedelta64): + with pytest.raises(TypeError): + o + p + else: + with pytest.raises(IncompatibleFrequency): + o + p + + def test_add_offset_nat(self): + # freq is DateOffset + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [offsets.YearEnd(2)]: + assert p + o is NaT + assert o + p is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + assert p + o is NaT + assert o + p is NaT + + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: + assert p + o is NaT + assert o + p is NaT + + for o in [ + offsets.YearBegin(2), + 
offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + assert p + o is NaT + assert o + p is NaT + + # freq is Tick + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: + assert p + o is NaT + assert o + p is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + assert p + o is NaT + assert o + p is NaT + + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: + assert p + o is NaT + assert o + p is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + assert p + o is NaT + assert o + p is NaT + + def test_sub_offset(self): + # freq is DateOffset + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + assert p - offsets.YearEnd(2) == Period("2009", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency): + p - o + + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency): + p - o + + # freq is Tick + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) + assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) + assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) + assert p - timedelta(-2) == Period("2011-04-03", freq=freq) + assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency): + p - o + + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) + assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) + assert p - timedelta(days=4, minutes=180) == Period( + "2011-03-28 06:00", freq=freq + ) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency): + p - o + + def test_sub_offset_nat(self): + # freq is DateOffset + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [offsets.YearEnd(2)]: + assert p - o is NaT + + for o in [ + offsets.YearBegin(2), + 
offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + assert p - o is NaT + + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: + assert p - o is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + assert p - o is NaT + + # freq is Tick + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: + assert p - o is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + assert p - o is NaT + + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + assert p is NaT + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: + assert p - o is NaT + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + assert p - o is NaT + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_nat_ops(self, freq): + p = Period("NaT", freq=freq) + assert p is NaT + assert p + 1 is NaT + assert 1 + p is NaT + assert p - 1 is NaT + assert p - Period("2011-01", freq=freq) is NaT + assert Period("2011-01", freq=freq) - p is NaT + + def test_period_ops_offset(self): + p = Period("2011-04-01", freq="D") + result = p + offsets.Day() + exp = Period("2011-04-02", freq="D") + assert result == exp + + result = p - offsets.Day(2) + exp = Period("2011-03-30", freq="D") + assert result == exp + + msg = r"Input cannot be converted to Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + p + offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + p - offsets.Hour(2) + + +def test_period_immutable(): + # see gh-17116 + per = Period("2014Q1") + with pytest.raises(AttributeError): + per.ordinal = 14 + + freq = per.freq + with pytest.raises(AttributeError): + per.freq = 2 * freq + + +@pytest.mark.xfail( + StrictVersion(dateutil.__version__.split(".dev")[0]) < StrictVersion("2.7.0"), + reason="Bug in dateutil < 2.7.0 when parsing old dates: Period('0001-01-07', 'D')", + strict=False, +) +def test_small_year_parsing(): + per1 = Period("0001-01-07", "D") + assert per1.year == 1 + assert per1.day == 7 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/test_na_scalar.py b/venv/Lib/site-packages/pandas/tests/scalar/test_na_scalar.py new file mode 100644 index 0000000..dcb9d66 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/test_na_scalar.py @@ -0,0 +1,269 @@ +import numpy as np +import pytest + +from pandas._libs.missing import NA + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +import pandas._testing as tm + + +def test_singleton(): + assert NA is NA + new_NA = type(NA)() + assert new_NA is NA + + +def test_repr(): + assert repr(NA) == "<NA>" + assert str(NA) == "<NA>" + + +def test_truthiness(): + with pytest.raises(TypeError): + bool(NA) + + with pytest.raises(TypeError): + not NA + + +def test_hashable(): + assert hash(NA) == hash(NA) + d = {NA: "test"} + assert d[NA] == "test" + + +def test_arithmetic_ops(all_arithmetic_functions): + op = 
all_arithmetic_functions + + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: + if op.__name__ in ("pow", "rpow", "rmod") and isinstance(other, str): + continue + if op.__name__ in ("divmod", "rdivmod"): + assert op(NA, other) is (NA, NA) + else: + if op.__name__ == "rpow": + # avoid special case + other += 1 + assert op(NA, other) is NA + + +def test_comparison_ops(): + + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]: + assert (NA == other) is NA + assert (NA != other) is NA + assert (NA > other) is NA + assert (NA >= other) is NA + assert (NA < other) is NA + assert (NA <= other) is NA + assert (other == NA) is NA + assert (other != NA) is NA + assert (other > NA) is NA + assert (other >= NA) is NA + assert (other < NA) is NA + assert (other <= NA) is NA + + +@pytest.mark.parametrize( + "value", + [ + 0, + 0.0, + -0, + -0.0, + False, + np.bool_(False), + np.int_(0), + np.float_(0), + np.int_(-0), + np.float_(-0), + ], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_pow_special(value, asarray): + if asarray: + value = np.array([value]) + result = pd.NA ** value + + if asarray: + result = result[0] + else: + # this assertion isn't possible for ndarray. + assert isinstance(result, type(value)) + assert result == 1 + + +@pytest.mark.parametrize( + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_special(value, asarray): + if asarray: + value = np.array([value]) + result = value ** pd.NA + + if asarray: + result = result[0] + elif not isinstance(value, (np.float_, np.bool_, np.int_)): + # this assertion isn't possible with asarray=True + assert isinstance(result, type(value)) + + assert result == value + + +@pytest.mark.parametrize( + "value", [-1, -1.0, np.int_(-1), np.float_(-1)], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_minus_one(value, asarray): + if asarray: + value = np.array([value]) + result = value ** pd.NA + + if asarray: + result = result[0] + + assert pd.isna(result) + + +def test_unary_ops(): + assert +NA is NA + assert -NA is NA + assert abs(NA) is NA + assert ~NA is NA + + +def test_logical_and(): + + assert NA & True is NA + assert True & NA is NA + assert NA & False is False + assert False & NA is False + assert NA & NA is NA + + with pytest.raises(TypeError): + NA & 5 + + +def test_logical_or(): + + assert NA | True is True + assert True | NA is True + assert NA | False is NA + assert False | NA is NA + assert NA | NA is NA + + with pytest.raises(TypeError): + NA | 5 + + +def test_logical_xor(): + + assert NA ^ True is NA + assert True ^ NA is NA + assert NA ^ False is NA + assert False ^ NA is NA + assert NA ^ NA is NA + + with pytest.raises(TypeError): + NA ^ 5 + + +def test_logical_not(): + assert ~NA is NA + + +@pytest.mark.parametrize( + "shape", [(3,), (3, 3), (1, 2, 3)], +) +def test_arithmetic_ndarray(shape, all_arithmetic_functions): + op = all_arithmetic_functions + a = np.zeros(shape) + if op.__name__ == "pow": + a += 5 + result = op(pd.NA, a) + expected = np.full(a.shape, pd.NA, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_is_scalar(): + assert is_scalar(NA) is True + + +def test_isna(): + assert pd.isna(NA) is True + assert pd.notna(NA) is False + + +def test_series_isna(): + s = pd.Series([1, NA], dtype=object) + expected = pd.Series([False, True]) + tm.assert_series_equal(s.isna(), expected) + + +def test_ufunc(): + assert np.log(pd.NA) is pd.NA + assert np.add(pd.NA, 1) 
is pd.NA + result = np.divmod(pd.NA, 1) + assert result[0] is pd.NA and result[1] is pd.NA + + result = np.frexp(pd.NA) + assert result[0] is pd.NA and result[1] is pd.NA + + +def test_ufunc_raises(): + with pytest.raises(ValueError, match="ufunc method 'at'"): + np.log.at(pd.NA, 0) + + +def test_binary_input_not_dunder(): + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + result = np.logaddexp(a, pd.NA) + tm.assert_numpy_array_equal(result, expected) + + result = np.logaddexp(pd.NA, a) + tm.assert_numpy_array_equal(result, expected) + + # all NA, multiple inputs + assert np.logaddexp(pd.NA, pd.NA) is pd.NA + + result = np.modf(pd.NA, pd.NA) + assert len(result) == 2 + assert all(x is pd.NA for x in result) + + +def test_divmod_ufunc(): + # binary in, binary out. + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + + result = np.divmod(a, pd.NA) + assert isinstance(result, tuple) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + result = np.divmod(pd.NA, a) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + +def test_integer_hash_collision_dict(): + # GH 30013 + result = {NA: "foo", hash(NA): "bar"} + + assert result[NA] == "foo" + assert result[hash(NA)] == "bar" + + +def test_integer_hash_collision_set(): + # GH 30013 + result = {NA, hash(NA)} + + assert len(result) == 2 + assert NA in result + assert hash(NA) in result diff --git a/venv/Lib/site-packages/pandas/tests/scalar/test_nat.py b/venv/Lib/site-packages/pandas/tests/scalar/test_nat.py new file mode 100644 index 0000000..a537f00 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/test_nat.py @@ -0,0 +1,510 @@ +from datetime import datetime, timedelta +import operator + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import iNaT +import pandas.compat as compat + +from pandas.core.dtypes.common import is_datetime64_any_dtype + +from pandas import ( + DatetimeIndex, + Index, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray +from pandas.core.ops import roperator + + +@pytest.mark.parametrize( + "nat,idx", + [ + (Timestamp("NaT"), DatetimeIndex), + (Timedelta("NaT"), TimedeltaIndex), + (Period("NaT", freq="M"), PeriodArray), + ], +) +def test_nat_fields(nat, idx): + + for field in idx._field_ops: + # weekday is a property of DTI, but a method + # on NaT/Timestamp for compat with datetime + if field == "weekday": + continue + + result = getattr(NaT, field) + assert np.isnan(result) + + result = getattr(nat, field) + assert np.isnan(result) + + for field in idx._bool_ops: + + result = getattr(NaT, field) + assert result is False + + result = getattr(nat, field) + assert result is False + + +def test_nat_vector_field_access(): + idx = DatetimeIndex(["1/1/2000", None, None, "1/4/2000"]) + + for field in DatetimeIndex._field_ops: + # weekday is a property of DTI, but a method + # on NaT/Timestamp for compat with datetime + if field == "weekday": + continue + + result = getattr(idx, field) + expected = Index([getattr(x, field) for x in idx]) + tm.assert_index_equal(result, expected) + + ser = Series(idx) + + for field in DatetimeIndex._field_ops: + # weekday is a property of DTI, but a method + # on NaT/Timestamp for compat with datetime + if field == "weekday": + continue 
+ + result = getattr(ser.dt, field) + expected = [getattr(x, field) for x in idx] + tm.assert_series_equal(result, Series(expected)) + + for field in DatetimeIndex._bool_ops: + result = getattr(ser.dt, field) + expected = [getattr(x, field) for x in idx] + tm.assert_series_equal(result, Series(expected)) + + +@pytest.mark.parametrize("klass", [Timestamp, Timedelta, Period]) +@pytest.mark.parametrize("value", [None, np.nan, iNaT, float("nan"), NaT, "NaT", "nat"]) +def test_identity(klass, value): + assert klass(value) is NaT + + +@pytest.mark.parametrize("klass", [Timestamp, Timedelta, Period]) +@pytest.mark.parametrize("value", ["", "nat", "NAT", None, np.nan]) +def test_equality(klass, value): + if klass is Period and value == "": + pytest.skip("Period cannot parse empty string") + + assert klass(value).value == iNaT + + +@pytest.mark.parametrize("klass", [Timestamp, Timedelta]) +@pytest.mark.parametrize("method", ["round", "floor", "ceil"]) +@pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) +def test_round_nat(klass, method, freq): + # see gh-14940 + ts = klass("nat") + + round_method = getattr(ts, method) + assert round_method(freq) is ts + + +@pytest.mark.parametrize( + "method", + [ + "astimezone", + "combine", + "ctime", + "dst", + "fromordinal", + "fromtimestamp", + pytest.param( + "fromisocalendar", + marks=pytest.mark.skipif( + not compat.PY38, + reason="'fromisocalendar' was added in stdlib datetime in python 3.8", + ), + ), + "isocalendar", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "toordinal", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "timestamp", + ], +) +def test_nat_methods_raise(method): + # see gh-9513, gh-17329 + msg = f"NaTType does not support {method}" + + with pytest.raises(ValueError, match=msg): + getattr(NaT, method)() + + +@pytest.mark.parametrize("method", ["weekday", "isoweekday"]) +def test_nat_methods_nan(method): + # see gh-9513, gh-17329 + assert np.isnan(getattr(NaT, method)()) + + +@pytest.mark.parametrize( + "method", ["date", "now", "replace", "today", "tz_convert", "tz_localize"] +) +def test_nat_methods_nat(method): + # see gh-8254, gh-9513, gh-17329 + assert getattr(NaT, method)() is NaT + + +@pytest.mark.parametrize( + "get_nat", [lambda x: NaT, lambda x: Timedelta(x), lambda x: Timestamp(x)] +) +def test_nat_iso_format(get_nat): + # see gh-12300 + assert get_nat("NaT").isoformat() == "NaT" + + +@pytest.mark.parametrize( + "klass,expected", + [ + (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), + ( + Timedelta, + [ + "components", + "delta", + "is_populated", + "resolution_string", + "to_pytimedelta", + "to_timedelta64", + "view", + ], + ), + ], +) +def test_missing_public_nat_methods(klass, expected): + # see gh-17327 + # + # NaT should have *most* of the Timestamp and Timedelta methods. + # Here, we check which public methods NaT does not have. We + # ignore any missing private methods. + nat_names = dir(NaT) + klass_names = dir(klass) + + missing = [x for x in klass_names if x not in nat_names and not x.startswith("_")] + missing.sort() + + assert missing == expected + + +def _get_overlap_public_nat_methods(klass, as_tuple=False): + """ + Get overlapping public methods between NaT and another class. + + Parameters + ---------- + klass : type + The class to compare with NaT + as_tuple : bool, default False + Whether to return a list of tuples of the form (klass, method). 
+ + Returns + ------- + overlap : list + """ + nat_names = dir(NaT) + klass_names = dir(klass) + + overlap = [ + x + for x in nat_names + if x in klass_names and not x.startswith("_") and callable(getattr(klass, x)) + ] + + # Timestamp takes precedence over Timedelta in terms of overlap. + if klass is Timedelta: + ts_names = dir(Timestamp) + overlap = [x for x in overlap if x not in ts_names] + + if as_tuple: + overlap = [(klass, method) for method in overlap] + + overlap.sort() + return overlap + + +@pytest.mark.parametrize( + "klass,expected", + [ + ( + Timestamp, + [ + "astimezone", + "ceil", + "combine", + "ctime", + "date", + "day_name", + "dst", + "floor", + "fromisocalendar", + "fromisoformat", + "fromordinal", + "fromtimestamp", + "isocalendar", + "isoformat", + "isoweekday", + "month_name", + "now", + "replace", + "round", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "to_datetime64", + "to_numpy", + "to_pydatetime", + "today", + "toordinal", + "tz_convert", + "tz_localize", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "weekday", + ], + ), + (Timedelta, ["total_seconds"]), + ], +) +def test_overlap_public_nat_methods(klass, expected): + # see gh-17327 + # + # NaT should have *most* of the Timestamp and Timedelta methods. + # In case when Timestamp, Timedelta, and NaT are overlap, the overlap + # is considered to be with Timestamp and NaT, not Timedelta. + + # "fromisoformat" was introduced in 3.7 + if klass is Timestamp and not compat.PY37: + expected.remove("fromisoformat") + + # "fromisocalendar" was introduced in 3.8 + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") + + assert _get_overlap_public_nat_methods(klass) == expected + + +@pytest.mark.parametrize( + "compare", + ( + _get_overlap_public_nat_methods(Timestamp, True) + + _get_overlap_public_nat_methods(Timedelta, True) + ), +) +def test_nat_doc_strings(compare): + # see gh-17327 + # + # The docstrings for overlapping methods should match. 
+ klass, method = compare + klass_doc = getattr(klass, method).__doc__ + + nat_doc = getattr(NaT, method).__doc__ + assert klass_doc == nat_doc + + +_ops = { + "left_plus_right": lambda a, b: a + b, + "right_plus_left": lambda a, b: b + a, + "left_minus_right": lambda a, b: a - b, + "right_minus_left": lambda a, b: b - a, + "left_times_right": lambda a, b: a * b, + "right_times_left": lambda a, b: b * a, + "left_div_right": lambda a, b: a / b, + "right_div_left": lambda a, b: b / a, +} + + +@pytest.mark.parametrize("op_name", list(_ops.keys())) +@pytest.mark.parametrize( + "value,val_type", + [ + (2, "scalar"), + (1.5, "floating"), + (np.nan, "floating"), + ("foo", "str"), + (timedelta(3600), "timedelta"), + (Timedelta("5s"), "timedelta"), + (datetime(2014, 1, 1), "timestamp"), + (Timestamp("2014-01-01"), "timestamp"), + (Timestamp("2014-01-01", tz="UTC"), "timestamp"), + (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), + (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + ], +) +def test_nat_arithmetic_scalar(op_name, value, val_type): + # see gh-6873 + invalid_ops = { + "scalar": {"right_div_left"}, + "floating": { + "right_div_left", + "left_minus_right", + "right_minus_left", + "left_plus_right", + "right_plus_left", + }, + "str": set(_ops.keys()), + "timedelta": {"left_times_right", "right_times_left"}, + "timestamp": { + "left_times_right", + "right_times_left", + "left_div_right", + "right_div_left", + }, + } + + op = _ops[op_name] + + if op_name in invalid_ops.get(val_type, set()): + if ( + val_type == "timedelta" + and "times" in op_name + and isinstance(value, Timedelta) + ): + msg = "Cannot multiply" + elif val_type == "str": + # un-specific check here because the message comes from str + # and varies by method + msg = ( + "can only concatenate str|" + "unsupported operand type|" + "can't multiply sequence|" + "Can't convert 'NaTType'|" + "must be str, not NaTType" + ) + else: + msg = "unsupported operand type" + + with pytest.raises(TypeError, match=msg): + op(NaT, value) + else: + if val_type == "timedelta" and "div" in op_name: + expected = np.nan + else: + expected = NaT + + assert op(NaT, value) is expected + + +@pytest.mark.parametrize( + "val,expected", [(np.nan, NaT), (NaT, np.nan), (np.timedelta64("NaT"), np.nan)] +) +def test_nat_rfloordiv_timedelta(val, expected): + # see gh-#18846 + # + # See also test_timedelta.TestTimedeltaArithmetic.test_floordiv + td = Timedelta(hours=3, minutes=4) + assert td // val is expected + + +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", "left_minus_right", "right_minus_left"], +) +@pytest.mark.parametrize( + "value", + [ + DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"]), + DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], tz="US/Pacific"), + TimedeltaIndex(["1 day", "2 day"], name="x"), + ], +) +def test_nat_arithmetic_index(op_name, value): + # see gh-11718 + exp_name = "x" + exp_data = [NaT] * 2 + + if is_datetime64_any_dtype(value.dtype) and "plus" in op_name: + expected = DatetimeIndex(exp_data, tz=value.tz, name=exp_name) + else: + expected = TimedeltaIndex(exp_data, name=exp_name) + + if not isinstance(value, Index): + expected = expected.array + + op = _ops[op_name] + result = op(NaT, value) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", 
"left_minus_right", "right_minus_left"], +) +@pytest.mark.parametrize("box", [TimedeltaIndex, Series, TimedeltaArray._from_sequence]) +def test_nat_arithmetic_td64_vector(op_name, box): + # see gh-19124 + vec = box(["1 day", "2 day"], dtype="timedelta64[ns]") + box_nat = box([NaT, NaT], dtype="timedelta64[ns]") + tm.assert_equal(_ops[op_name](vec, NaT), box_nat) + + +@pytest.mark.parametrize( + "dtype,op,out_dtype", + [ + ("datetime64[ns]", operator.add, "datetime64[ns]"), + ("datetime64[ns]", roperator.radd, "datetime64[ns]"), + ("datetime64[ns]", operator.sub, "timedelta64[ns]"), + ("datetime64[ns]", roperator.rsub, "timedelta64[ns]"), + ("timedelta64[ns]", operator.add, "datetime64[ns]"), + ("timedelta64[ns]", roperator.radd, "datetime64[ns]"), + ("timedelta64[ns]", operator.sub, "datetime64[ns]"), + ("timedelta64[ns]", roperator.rsub, "timedelta64[ns]"), + ], +) +def test_nat_arithmetic_ndarray(dtype, op, out_dtype): + other = np.arange(10).astype(dtype) + result = op(NaT, other) + + expected = np.empty(other.shape, dtype=out_dtype) + expected.fill("NaT") + tm.assert_numpy_array_equal(result, expected) + + +def test_nat_pinned_docstrings(): + # see gh-17327 + assert NaT.ctime.__doc__ == datetime.ctime.__doc__ + + +def test_to_numpy_alias(): + # GH 24653: alias .to_numpy() for scalars + expected = NaT.to_datetime64() + result = NaT.to_numpy() + + assert isna(expected) and isna(result) + + +@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)]) +def test_nat_comparisons(compare_operators_no_eq_ne, other): + # GH 26039 + assert getattr(NaT, compare_operators_no_eq_ne)(other) is False + assert getattr(other, compare_operators_no_eq_ne)(NaT) is False diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timedelta/__init__.py b/venv/Lib/site-packages/pandas/tests/scalar/timedelta/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timedelta/test_arithmetic.py b/venv/Lib/site-packages/pandas/tests/scalar/timedelta/test_arithmetic.py new file mode 100644 index 0000000..6a9ef86 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -0,0 +1,744 @@ +""" +Tests for scalar Timedelta arithmetic ops +""" +from datetime import datetime, timedelta +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import NaT, Timedelta, Timestamp, offsets +import pandas._testing as tm +from pandas.core import ops + + +class TestTimedeltaAdditionSubtraction: + """ + Tests for Timedelta methods: + + __add__, __radd__, + __sub__, __rsub__ + """ + + @pytest.mark.parametrize( + "ten_seconds", + [ + Timedelta(10, unit="s"), + timedelta(seconds=10), + np.timedelta64(10, "s"), + np.timedelta64(10000000000, "ns"), + offsets.Second(10), + ], + ) + def test_td_add_sub_ten_seconds(self, ten_seconds): + # GH#6808 + base = Timestamp("20130101 09:01:12.123456") + expected_add = Timestamp("20130101 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") + + result = base + ten_seconds + assert result == expected_add + + result = base - ten_seconds + assert result == expected_sub + + @pytest.mark.parametrize( + "one_day_ten_secs", + [ + Timedelta("1 day, 00:00:10"), + Timedelta("1 days, 00:00:10"), + timedelta(days=1, seconds=10), + np.timedelta64(1, "D") + np.timedelta64(10, "s"), + offsets.Day() + offsets.Second(10), + ], + ) + def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): + # GH#6808 + base = Timestamp("20130102 09:01:12.123456") + expected_add = 
Timestamp("20130103 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") + + result = base + one_day_ten_secs + assert result == expected_add + + result = base - one_day_ten_secs + assert result == expected_sub + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_datetimelike_scalar(self, op): + # GH#19738 + td = Timedelta(10, unit="d") + + result = op(td, datetime(2016, 1, 1)) + if op is operator.add: + # datetime + Timedelta does _not_ call Timedelta.__radd__, + # so we get a datetime back instead of a Timestamp + assert isinstance(result, Timestamp) + assert result == Timestamp(2016, 1, 11) + + result = op(td, Timestamp("2018-01-12 18:09")) + assert isinstance(result, Timestamp) + assert result == Timestamp("2018-01-22 18:09") + + result = op(td, np.datetime64("2018-01-12")) + assert isinstance(result, Timestamp) + assert result == Timestamp("2018-01-22") + + result = op(td, NaT) + assert result is NaT + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_td(self, op): + td = Timedelta(10, unit="d") + + result = op(td, Timedelta(days=10)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=20) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_pytimedelta(self, op): + td = Timedelta(10, unit="d") + result = op(td, timedelta(days=9)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=19) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_timedelta64(self, op): + td = Timedelta(10, unit="d") + result = op(td, np.timedelta64(-4, "D")) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=6) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_offset(self, op): + td = Timedelta(10, unit="d") + + result = op(td, offsets.Hour(6)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=10, hours=6) + + def test_td_sub_td(self): + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") + result = td - td + assert isinstance(result, Timedelta) + assert result == expected + + def test_td_sub_pytimedelta(self): + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") + + result = td - td.to_pytimedelta() + assert isinstance(result, Timedelta) + assert result == expected + + result = td.to_pytimedelta() - td + assert isinstance(result, Timedelta) + assert result == expected + + def test_td_sub_timedelta64(self): + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") + + result = td - td.to_timedelta64() + assert isinstance(result, Timedelta) + assert result == expected + + result = td.to_timedelta64() - td + assert isinstance(result, Timedelta) + assert result == expected + + def test_td_sub_nat(self): + # In this context pd.NaT is treated as timedelta-like + td = Timedelta(10, unit="d") + result = td - NaT + assert result is NaT + + def test_td_sub_td64_nat(self): + td = Timedelta(10, unit="d") + td_nat = np.timedelta64("NaT") + + result = td - td_nat + assert result is NaT + + result = td_nat - td + assert result is NaT + + def test_td_sub_offset(self): + td = Timedelta(10, unit="d") + result = td - offsets.Hour(1) + assert isinstance(result, Timedelta) + assert result == Timedelta(239, unit="h") + + def test_td_add_sub_numeric_raises(self): + td = Timedelta(10, unit="d") + for other in [2, 2.0, np.int64(2), np.float64(2)]: + with pytest.raises(TypeError): + td + other + with pytest.raises(TypeError): + other + td + with 
pytest.raises(TypeError): + td - other + with pytest.raises(TypeError): + other - td + + def test_td_rsub_nat(self): + td = Timedelta(10, unit="d") + result = NaT - td + assert result is NaT + + result = np.datetime64("NaT") - td + assert result is NaT + + def test_td_rsub_offset(self): + result = offsets.Hour(1) - Timedelta(10, unit="d") + assert isinstance(result, Timedelta) + assert result == Timedelta(-239, unit="h") + + def test_td_sub_timedeltalike_object_dtype_array(self): + # GH#21980 + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20121231 9:01"), Timestamp("20121229 9:02")]) + res = arr - Timedelta("1D") + tm.assert_numpy_array_equal(res, exp) + + def test_td_sub_mixed_most_timedeltalike_object_dtype_array(self): + # GH#21980 + now = Timestamp.now() + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) + exp = np.array( + [ + now - Timedelta("1D"), + Timedelta("0D"), + np.timedelta64(2, "h") - Timedelta("1D"), + ] + ) + res = arr - Timedelta("1D") + tm.assert_numpy_array_equal(res, exp) + + def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self): + # GH#21980 + now = Timestamp.now() + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) + with pytest.raises(TypeError): + Timedelta("1D") - arr + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_timedeltalike_object_dtype_array(self, op): + # GH#21980 + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20130102 9:01"), Timestamp("20121231 9:02")]) + res = op(arr, Timedelta("1D")) + tm.assert_numpy_array_equal(res, exp) + + @pytest.mark.parametrize("op", [operator.add, ops.radd]) + def test_td_add_mixed_timedeltalike_object_dtype_array(self, op): + # GH#21980 + now = Timestamp.now() + arr = np.array([now, Timedelta("1D")]) + exp = np.array([now + Timedelta("1D"), Timedelta("2D")]) + res = op(arr, Timedelta("1D")) + tm.assert_numpy_array_equal(res, exp) + + # TODO: moved from index tests following #24365, may need de-duplication + def test_ops_ndarray(self): + td = Timedelta("1 day") + + # timedelta, timedelta + other = pd.to_timedelta(["1 day"]).values + expected = pd.to_timedelta(["2 days"]).values + tm.assert_numpy_array_equal(td + other, expected) + tm.assert_numpy_array_equal(other + td, expected) + msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td + np.array([1]) + msg = r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'" + with pytest.raises(TypeError, match=msg): + np.array([1]) + td + + expected = pd.to_timedelta(["0 days"]).values + tm.assert_numpy_array_equal(td - other, expected) + tm.assert_numpy_array_equal(-other + td, expected) + msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td - np.array([1]) + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timedelta'" + with pytest.raises(TypeError, match=msg): + np.array([1]) - td + + expected = pd.to_timedelta(["2 days"]).values + tm.assert_numpy_array_equal(td * np.array([2]), expected) + tm.assert_numpy_array_equal(np.array([2]) * td, expected) + msg = ( + "ufunc '?multiply'? 
cannot use operands with types" + r" dtype\(' right + + assert not left == right + assert left != right + + def test_ops_notimplemented(self): + class Other: + pass + + other = Other() + + td = Timedelta("1 day") + assert td.__add__(other) is NotImplemented + assert td.__sub__(other) is NotImplemented + assert td.__truediv__(other) is NotImplemented + assert td.__mul__(other) is NotImplemented + assert td.__floordiv__(other) is NotImplemented + + def test_unary_ops(self): + td = Timedelta(10, unit="d") + + # __neg__, __pos__ + assert -td == Timedelta(-10, unit="d") + assert -td == Timedelta("-10d") + assert +td == Timedelta(10, unit="d") + + # __abs__, __abs__(__neg__) + assert abs(td) == td + assert abs(-td) == td + assert abs(-td) == Timedelta("10d") + + +class TestTimedeltaComparison: + def test_compare_tick(self, tick_classes): + cls = tick_classes + + off = cls(4) + td = off.delta + assert isinstance(td, Timedelta) + + assert td == off + assert not td != off + assert td <= off + assert td >= off + assert not td < off + assert not td > off + + assert not td == 2 * off + assert td != 2 * off + assert td <= 2 * off + assert td < 2 * off + assert not td >= 2 * off + assert not td > 2 * off + + def test_comparison_object_array(self): + # analogous to GH#15183 + td = Timedelta("2 days") + other = Timedelta("3 hours") + + arr = np.array([other, td], dtype=object) + res = arr == td + expected = np.array([False, True], dtype=bool) + assert (res == expected).all() + + # 2D case + arr = np.array([[other, td], [td, other]], dtype=object) + res = arr != td + expected = np.array([[True, False], [False, True]], dtype=bool) + assert res.shape == expected.shape + assert (res == expected).all() + + def test_compare_timedelta_ndarray(self): + # GH11835 + periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] + arr = np.array(periods) + result = arr[0] > arr + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0") + def test_compare_custom_object(self): + """ + Make sure non supported operations on Timedelta returns NonImplemented + and yields to other operand (GH#20829). 
+ """ + + class CustomClass: + def __init__(self, cmp_result=None): + self.cmp_result = cmp_result + + def generic_result(self): + if self.cmp_result is None: + return NotImplemented + else: + return self.cmp_result + + def __eq__(self, other): + return self.generic_result() + + def __gt__(self, other): + return self.generic_result() + + t = Timedelta("1s") + + assert not (t == "string") + assert not (t == 1) + assert not (t == CustomClass()) + assert not (t == CustomClass(cmp_result=False)) + + assert t < CustomClass(cmp_result=True) + assert not (t < CustomClass(cmp_result=False)) + + assert t == CustomClass(cmp_result=True) + + @pytest.mark.parametrize("val", ["string", 1]) + def test_compare_unknown_type(self, val): + # GH20829 + t = Timedelta("1s") + with pytest.raises(TypeError): + t >= val + with pytest.raises(TypeError): + t > val + with pytest.raises(TypeError): + t <= val + with pytest.raises(TypeError): + t < val + + +class TestTimedeltas: + @pytest.mark.parametrize( + "unit, value, expected", + [ + ("us", 9.999, 9999), + ("ms", 9.999999, 9999999), + ("s", 9.999999999, 9999999999), + ], + ) + def test_rounding_on_int_unit_construction(self, unit, value, expected): + # GH 12690 + result = Timedelta(value, unit=unit) + assert result.value == expected + result = Timedelta(str(value) + unit) + assert result.value == expected + + def test_total_seconds_scalar(self): + # see gh-10939 + rng = Timedelta("1 days, 10:11:12.100123456") + expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9 + tm.assert_almost_equal(rng.total_seconds(), expt) + + rng = Timedelta(np.nan) + assert np.isnan(rng.total_seconds()) + + def test_conversion(self): + + for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + pydt = td.to_pytimedelta() + assert td == Timedelta(pydt) + assert td == pydt + assert isinstance(pydt, timedelta) and not isinstance(pydt, Timedelta) + + assert td == np.timedelta64(td.value, "ns") + td64 = td.to_timedelta64() + + assert td64 == np.timedelta64(td.value, "ns") + assert td == td64 + + assert isinstance(td64, np.timedelta64) + + # this is NOT equal and cannot be roundtripped (because of the nanos) + td = Timedelta("1 days, 10:11:12.012345678") + assert td != td.to_pytimedelta() + + def test_freq_conversion(self): + + # truediv + td = Timedelta("1 days 2 hours 3 ns") + result = td / np.timedelta64(1, "D") + assert result == td.value / float(86400 * 1e9) + result = td / np.timedelta64(1, "s") + assert result == td.value / float(1e9) + result = td / np.timedelta64(1, "ns") + assert result == td.value + + # floordiv + td = Timedelta("1 days 2 hours 3 ns") + result = td // np.timedelta64(1, "D") + assert result == 1 + result = td // np.timedelta64(1, "s") + assert result == 93600 + result = td // np.timedelta64(1, "ns") + assert result == td.value + + def test_fields(self): + def check(value): + # that we are int + assert isinstance(value, int) + + # compat to datetime.timedelta + rng = to_timedelta("1 days, 10:11:12") + assert rng.days == 1 + assert rng.seconds == 10 * 3600 + 11 * 60 + 12 + assert rng.microseconds == 0 + assert rng.nanoseconds == 0 + + msg = "'Timedelta' object has no attribute '{}'" + with pytest.raises(AttributeError, match=msg.format("hours")): + rng.hours + with pytest.raises(AttributeError, match=msg.format("minutes")): + rng.minutes + with pytest.raises(AttributeError, match=msg.format("milliseconds")): + rng.milliseconds + + # GH 10050 + check(rng.days) + check(rng.seconds) + check(rng.microseconds) + check(rng.nanoseconds) + + td = 
Timedelta("-1 days, 10:11:12") + assert abs(td) == Timedelta("13:48:48") + assert str(td) == "-1 days +10:11:12" + assert -td == Timedelta("0 days 13:48:48") + assert -Timedelta("-1 days, 10:11:12").value == 49728000000000 + assert Timedelta("-1 days, 10:11:12").value == -49728000000000 + + rng = to_timedelta("-1 days, 10:11:12.100123456") + assert rng.days == -1 + assert rng.seconds == 10 * 3600 + 11 * 60 + 12 + assert rng.microseconds == 100 * 1000 + 123 + assert rng.nanoseconds == 456 + msg = "'Timedelta' object has no attribute '{}'" + with pytest.raises(AttributeError, match=msg.format("hours")): + rng.hours + with pytest.raises(AttributeError, match=msg.format("minutes")): + rng.minutes + with pytest.raises(AttributeError, match=msg.format("milliseconds")): + rng.milliseconds + + # components + tup = to_timedelta(-1, "us").components + assert tup.days == -1 + assert tup.hours == 23 + assert tup.minutes == 59 + assert tup.seconds == 59 + assert tup.milliseconds == 999 + assert tup.microseconds == 999 + assert tup.nanoseconds == 0 + + # GH 10050 + check(tup.days) + check(tup.hours) + check(tup.minutes) + check(tup.seconds) + check(tup.milliseconds) + check(tup.microseconds) + check(tup.nanoseconds) + + tup = Timedelta("-1 days 1 us").components + assert tup.days == -2 + assert tup.hours == 23 + assert tup.minutes == 59 + assert tup.seconds == 59 + assert tup.milliseconds == 999 + assert tup.microseconds == 999 + assert tup.nanoseconds == 0 + + def test_iso_conversion(self): + # GH #21877 + expected = Timedelta(1, unit="s") + assert to_timedelta("P0DT0H0M1S") == expected + + def test_nat_converters(self): + result = to_timedelta("nat").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT + + result = to_timedelta("nan").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT + + @pytest.mark.parametrize( + "units, np_unit", + [ + (["W", "w"], "W"), + (["D", "d", "days", "day", "Days", "Day"], "D"), + ( + ["m", "minute", "min", "minutes", "t", "Minute", "Min", "Minutes", "T"], + "m", + ), + (["s", "seconds", "sec", "second", "S", "Seconds", "Sec", "Second"], "s"), + ( + [ + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "l", + "MS", + "Milliseconds", + "Millisecond", + "Milli", + "Millis", + "L", + ], + "ms", + ), + ( + [ + "us", + "microseconds", + "microsecond", + "micro", + "micros", + "u", + "US", + "Microseconds", + "Microsecond", + "Micro", + "Micros", + "U", + ], + "us", + ), + ( + [ + "ns", + "nanoseconds", + "nanosecond", + "nano", + "nanos", + "n", + "NS", + "Nanoseconds", + "Nanosecond", + "Nano", + "Nanos", + "N", + ], + "ns", + ), + ], + ) + @pytest.mark.parametrize("wrapper", [np.array, list, pd.Index]) + def test_unit_parser(self, units, np_unit, wrapper): + # validate all units, GH 6855, GH 21762 + for unit in units: + # array-likes + expected = TimedeltaIndex( + [np.timedelta64(i, np_unit) for i in np.arange(5).tolist()] + ) + result = to_timedelta(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + result = TimedeltaIndex(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + + if unit == "M": + # M is treated as minutes in string repr + expected = TimedeltaIndex( + [np.timedelta64(i, "m") for i in np.arange(5).tolist()] + ) + + str_repr = [f"{x}{unit}" for x in np.arange(5)] + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + result = TimedeltaIndex(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + + # 
scalar + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) + + result = to_timedelta(2, unit=unit) + assert result == expected + result = Timedelta(2, unit=unit) + assert result == expected + + if unit == "M": + expected = Timedelta(np.timedelta64(2, "m").astype("timedelta64[ns]")) + + result = to_timedelta(f"2{unit}") + assert result == expected + result = Timedelta(f"2{unit}") + assert result == expected + + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) + def test_unit_m_y_raises(self, unit): + msg = "Units 'M' and 'Y' are no longer supported" + with pytest.raises(ValueError, match=msg): + Timedelta(10, unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta(10, unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta([1, 2], unit) + + def test_numeric_conversions(self): + assert Timedelta(0) == np.timedelta64(0, "ns") + assert Timedelta(10) == np.timedelta64(10, "ns") + assert Timedelta(10, unit="ns") == np.timedelta64(10, "ns") + + assert Timedelta(10, unit="us") == np.timedelta64(10, "us") + assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") + assert Timedelta(10, unit="s") == np.timedelta64(10, "s") + assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + + def test_timedelta_conversions(self): + assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(microseconds=1)) == np.timedelta64(1, "us").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(days=1)) == np.timedelta64(1, "D").astype("m8[ns]") + + def test_to_numpy_alias(self): + # GH 24653: alias .to_numpy() for scalars + td = Timedelta("10m7s") + assert td.to_timedelta64() == td.to_numpy() + + def test_round(self): + + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") + + for (freq, s1, s2) in [ + ("N", t1, t2), + ( + "U", + Timedelta("1 days 02:34:56.789123000"), + Timedelta("-1 days 02:34:56.789123000"), + ), + ( + "L", + Timedelta("1 days 02:34:56.789000000"), + Timedelta("-1 days 02:34:56.789000000"), + ), + ("S", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2S", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5S", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("T", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), + ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("d", Timedelta("1 days"), Timedelta("-1 days")), + ]: + r1 = t1.round(freq) + assert r1 == s1 + r2 = t2.round(freq) + assert r2 == s2 + + # invalid + for freq, msg in [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) + + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + t2 = -1 * t1 + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = TimedeltaIndex([1, 1, 1], unit="D") + + # note that negative times round DOWN! 
so don't give whole numbers + for (freq, s1, s2) in [ + ("N", t1, t2), + ("U", t1, t2), + ( + "L", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "S", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "12T", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "H", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), + ]: + + r1 = t1.round(freq) + tm.assert_index_equal(r1, s1) + r2 = t2.round(freq) + tm.assert_index_equal(r2, s2) + + # invalid + for freq, msg in [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) + + def test_contains(self): + # Checking for any NaT-like objects + # GH 13603 + td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + for v in [NaT, None, float("nan"), np.nan]: + assert not (v in td) + + td = to_timedelta([NaT]) + for v in [NaT, None, float("nan"), np.nan]: + assert v in td + + def test_identity(self): + + td = Timedelta(10, unit="d") + assert isinstance(td, Timedelta) + assert isinstance(td, timedelta) + + def test_short_format_converters(self): + def conv(v): + return v.astype("m8[ns]") + + assert Timedelta("10") == np.timedelta64(10, "ns") + assert Timedelta("10ns") == np.timedelta64(10, "ns") + assert Timedelta("100") == np.timedelta64(100, "ns") + assert Timedelta("100ns") == np.timedelta64(100, "ns") + + assert Timedelta("1000") == np.timedelta64(1000, "ns") + assert Timedelta("1000ns") == np.timedelta64(1000, "ns") + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + assert Timedelta("10us") == np.timedelta64(10000, "ns") + assert Timedelta("100us") == np.timedelta64(100000, "ns") + assert Timedelta("1000us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000Us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000uS") == np.timedelta64(1000000, "ns") + + assert Timedelta("1ms") == np.timedelta64(1000000, "ns") + assert Timedelta("10ms") == np.timedelta64(10000000, "ns") + assert Timedelta("100ms") == np.timedelta64(100000000, "ns") + assert Timedelta("1000ms") == np.timedelta64(1000000000, "ns") + + assert Timedelta("-1s") == -np.timedelta64(1000000000, "ns") + assert Timedelta("1s") == np.timedelta64(1000000000, "ns") + assert Timedelta("10s") == np.timedelta64(10000000000, "ns") + assert Timedelta("100s") == np.timedelta64(100000000000, "ns") + assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") + + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + assert Timedelta("1D") == conv(np.timedelta64(1, "D")) + assert Timedelta("10D") == conv(np.timedelta64(10, "D")) + assert Timedelta("100D") == conv(np.timedelta64(100, "D")) + assert Timedelta("1000D") == conv(np.timedelta64(1000, "D")) + assert Timedelta("10000D") == conv(np.timedelta64(10000, "D")) + + # space + assert Timedelta(" 10000D ") == conv(np.timedelta64(10000, "D")) + assert Timedelta(" - 10000D ") == -conv(np.timedelta64(10000, "D")) + + # invalid + with pytest.raises(ValueError): + Timedelta("1foo") + with pytest.raises(ValueError): + Timedelta("foo") + + def 
test_full_format_converters(self): + def conv(v): + return v.astype("m8[ns]") + + d1 = np.timedelta64(1, "D") + + assert Timedelta("1days") == conv(d1) + assert Timedelta("1days,") == conv(d1) + assert Timedelta("- 1days,") == -conv(d1) + + assert Timedelta("00:00:01") == conv(np.timedelta64(1, "s")) + assert Timedelta("06:00:01") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.0") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.01") == conv( + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) + + assert Timedelta("- 1days, 00:00:01") == conv(-d1 + np.timedelta64(1, "s")) + assert Timedelta("1days, 06:00:01") == conv( + d1 + np.timedelta64(6 * 3600 + 1, "s") + ) + assert Timedelta("1days, 06:00:01.01") == conv( + d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) + + # invalid + with pytest.raises(ValueError): + Timedelta("- 1days, 00") + + def test_overflow(self): + # GH 9442 + s = Series(pd.date_range("20130101", periods=100000, freq="H")) + s[0] += Timedelta("1s 1ms") + + # mean + result = (s - s.min()).mean() + expected = Timedelta((TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) + + # the computation is converted to float so + # might be some loss of precision + assert np.allclose(result.value / 1000, expected.value / 1000) + + # sum + msg = "overflow in timedelta operation" + with pytest.raises(ValueError, match=msg): + (s - s.min()).sum() + s1 = s[0:10000] + with pytest.raises(ValueError, match=msg): + (s1 - s1.min()).sum() + s2 = s[0:1000] + result = (s2 - s2.min()).sum() + + def test_pickle(self): + + v = Timedelta("1 days 10:11:12.0123456") + v_p = tm.round_trip_pickle(v) + assert v == v_p + + def test_timedelta_hash_equality(self): + # GH 11129 + v = Timedelta(1, "D") + td = timedelta(days=1) + assert hash(v) == hash(td) + + d = {td: 2} + assert d[v] == 2 + + tds = timedelta_range("1 second", periods=20) + assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds) + + # python timedeltas drop ns resolution + ns_td = Timedelta(1, "ns") + assert hash(ns_td) != hash(ns_td.to_pytimedelta()) + + def test_implementation_limits(self): + min_td = Timedelta(Timedelta.min) + max_td = Timedelta(Timedelta.max) + + # GH 12727 + # timedelta limits correspond to int64 boundaries + assert min_td.value == np.iinfo(np.int64).min + 1 + assert max_td.value == np.iinfo(np.int64).max + + # Beyond lower limit, a NAT before the Overflow + assert (min_td - Timedelta(1, "ns")) is NaT + + with pytest.raises(OverflowError): + min_td - Timedelta(2, "ns") + + with pytest.raises(OverflowError): + max_td + Timedelta(1, "ns") + + # Same tests using the internal nanosecond values + td = Timedelta(min_td.value - 1, "ns") + assert td is NaT + + with pytest.raises(OverflowError): + Timedelta(min_td.value - 2, "ns") + + with pytest.raises(OverflowError): + Timedelta(max_td.value + 1, "ns") + + def test_total_seconds_precision(self): + # GH 19458 + assert Timedelta("30S").total_seconds() == 30.0 + assert Timedelta("0").total_seconds() == 0.0 + assert Timedelta("-2S").total_seconds() == -2.0 + assert Timedelta("5.324S").total_seconds() == 5.324 + assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 + + def test_timedelta_arithmetic(self): + data = Series(["nat", "32 days"], dtype="timedelta64[ns]") + deltas = [timedelta(days=1), Timedelta(1, unit="D")] + for delta in deltas: + result_method = data.add(delta) + result_operator = data + delta + expected = Series(["nat", "33 days"], 
dtype="timedelta64[ns]") + tm.assert_series_equal(result_operator, expected) + tm.assert_series_equal(result_method, expected) + + result_method = data.sub(delta) + result_operator = data - delta + expected = Series(["nat", "31 days"], dtype="timedelta64[ns]") + tm.assert_series_equal(result_operator, expected) + tm.assert_series_equal(result_method, expected) + # GH 9396 + result_method = data.div(delta) + result_operator = data / delta + expected = Series([np.nan, 32.0], dtype="float64") + tm.assert_series_equal(result_operator, expected) + tm.assert_series_equal(result_method, expected) + + def test_apply_to_timedelta(self): + timedelta_NaT = to_timedelta("NaT") + + list_of_valid_strings = ["00:00:01", "00:00:02"] + a = to_timedelta(list_of_valid_strings) + b = Series(list_of_valid_strings).apply(to_timedelta) + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) + + list_of_strings = ["00:00:01", np.nan, NaT, timedelta_NaT] + + # TODO: unused? + a = to_timedelta(list_of_strings) # noqa + b = Series(list_of_strings).apply(to_timedelta) # noqa + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) + + def test_components(self): + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") + rng.components + + # with nat + s = Series(rng) + s[1] = np.nan + + result = s.dt.components + assert not result.iloc[0].isna().all() + assert result.iloc[1].isna().all() + + def test_resolution_string(self): + assert Timedelta(days=1).resolution_string == "D" + assert Timedelta(days=1, hours=6).resolution_string == "H" + assert Timedelta(days=1, minutes=6).resolution_string == "T" + assert Timedelta(days=1, seconds=6).resolution_string == "S" + assert Timedelta(days=1, milliseconds=6).resolution_string == "L" + assert Timedelta(days=1, microseconds=6).resolution_string == "U" + assert Timedelta(days=1, nanoseconds=6).resolution_string == "N" + + def test_resolution_deprecated(self): + # GH#21344 + td = Timedelta(days=4, hours=3) + result = td.resolution + assert result == Timedelta(nanoseconds=1) + + # Check that the attribute is available on the class, mirroring + # the stdlib timedelta behavior + result = Timedelta.resolution + assert result == Timedelta(nanoseconds=1) + + +@pytest.mark.parametrize( + "value, expected", + [ + (Timedelta("10S"), True), + (Timedelta("-10S"), True), + (Timedelta(10, unit="ns"), True), + (Timedelta(0, unit="ns"), False), + (Timedelta(-10, unit="ns"), True), + (Timedelta(None), True), + (NaT, True), + ], +) +def test_truthiness(value, expected): + # https://github.com/pandas-dev/pandas/issues/21484 + assert bool(value) is expected diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/__init__.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_arithmetic.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_arithmetic.py new file mode 100644 index 0000000..1cab007 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -0,0 +1,214 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas import Timedelta, Timestamp + +from pandas.tseries import offsets +from pandas.tseries.frequencies import to_offset + + +class TestTimestampArithmetic: + def test_overflow_offset(self): + # no overflow expected + + stamp = Timestamp("2000/1/1") + offset_no_overflow = 
to_offset("D") * 100 + + expected = Timestamp("2000/04/10") + assert stamp + offset_no_overflow == expected + + assert offset_no_overflow + stamp == expected + + expected = Timestamp("1999/09/23") + assert stamp - offset_no_overflow == expected + + def test_overflow_offset_raises(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + # ends up multiplying really large numbers which overflow + + stamp = Timestamp("2017-01-13 00:00:00", freq="D") + offset_overflow = 20169940 * offsets.Day(1) + msg = ( + "the add operation between " + r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " + "will overflow" + ) + + with pytest.raises(OverflowError, match=msg): + stamp + offset_overflow + + with pytest.raises(OverflowError, match=msg): + offset_overflow + stamp + + with pytest.raises(OverflowError, match=msg): + stamp - offset_overflow + + # xref https://github.com/pandas-dev/pandas/issues/14080 + # used to crash, so check for proper overflow exception + + stamp = Timestamp("2000/1/1") + offset_overflow = to_offset("D") * 100 ** 25 + + with pytest.raises(OverflowError, match=msg): + stamp + offset_overflow + + with pytest.raises(OverflowError, match=msg): + offset_overflow + stamp + + with pytest.raises(OverflowError, match=msg): + stamp - offset_overflow + + def test_delta_preserve_nanos(self): + val = Timestamp(1337299200000000123) + result = val + timedelta(1) + assert result.nanosecond == val.nanosecond + + def test_rsub_dtscalars(self, tz_naive_fixture): + # In particular, check that datetime64 - Timestamp works GH#28286 + td = Timedelta(1235345642000) + ts = Timestamp.now(tz_naive_fixture) + other = ts + td + + assert other - ts == td + assert other.to_pydatetime() - ts == td + if tz_naive_fixture is None: + assert other.to_datetime64() - ts == td + else: + with pytest.raises(TypeError, match="subtraction must have"): + other.to_datetime64() - ts + + def test_timestamp_sub_datetime(self): + dt = datetime(2013, 10, 12) + ts = Timestamp(datetime(2013, 10, 13)) + assert (ts - dt).days == 1 + assert (dt - ts).days == -1 + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time + # objects + dt = datetime(2014, 3, 4) + td = timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports + # addition/subtraction of integers + ts = Timestamp(dt, freq="D") + + msg = "Addition/subtraction of integers" + with pytest.raises(TypeError, match=msg): + # GH#22535 add/sub with integers is deprecated + ts + 1 + with pytest.raises(TypeError, match=msg): + ts - 1 + + # Timestamp + datetime not supported, though subtraction is supported + # and yields timedelta more tests in tseries/base/tests/test_base.py + assert type(ts - dt) == Timedelta + assert type(ts + td) == Timestamp + assert type(ts - td) == Timestamp + + # Timestamp +/- datetime64 not supported, so not tested (could possibly + # assert error raised?) 
+ td64 = np.timedelta64(1, "D") + assert type(ts + td64) == Timestamp + assert type(ts - td64) == Timestamp + + @pytest.mark.parametrize( + "freq, td, td64", + [ + ("S", timedelta(seconds=1), np.timedelta64(1, "s")), + ("min", timedelta(minutes=1), np.timedelta64(1, "m")), + ("H", timedelta(hours=1), np.timedelta64(1, "h")), + ("D", timedelta(days=1), np.timedelta64(1, "D")), + ("W", timedelta(weeks=1), np.timedelta64(1, "W")), + ("M", None, np.timedelta64(1, "M")), + ], + ) + def test_addition_subtraction_preserve_frequency(self, freq, td, td64): + ts = Timestamp("2014-03-05 00:00:00", freq=freq) + original_freq = ts.freq + + assert (ts + 1 * original_freq).freq == original_freq + assert (ts - 1 * original_freq).freq == original_freq + + if td is not None: + # timedelta does not support months as unit + assert (ts + td).freq == original_freq + assert (ts - td).freq == original_freq + + assert (ts + td64).freq == original_freq + assert (ts - td64).freq == original_freq + + @pytest.mark.parametrize( + "td", [Timedelta(hours=3), np.timedelta64(3, "h"), timedelta(hours=3)] + ) + def test_radd_tdscalar(self, td): + # GH#24775 timedelta64+Timestamp should not raise + ts = Timestamp.now() + assert td + ts == ts + td + + @pytest.mark.parametrize( + "other,expected_difference", + [ + (np.timedelta64(-123, "ns"), -123), + (np.timedelta64(1234567898, "ns"), 1234567898), + (np.timedelta64(-123, "us"), -123000), + (np.timedelta64(-123, "ms"), -123000000), + ], + ) + def test_timestamp_add_timedelta64_unit(self, other, expected_difference): + ts = Timestamp(datetime.utcnow()) + result = ts + other + valdiff = result.value - ts.value + assert valdiff == expected_difference + + @pytest.mark.parametrize("ts", [Timestamp.now(), Timestamp.now("utc")]) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_no_freq_raises(self, ts, other): + msg = "Addition/subtraction of integers and integer-arrays" + with pytest.raises(TypeError, match=msg): + ts + other + with pytest.raises(TypeError, match=msg): + other + ts + + with pytest.raises(TypeError, match=msg): + ts - other + with pytest.raises(TypeError): + other - ts + + @pytest.mark.parametrize( + "ts", + [ + Timestamp("1776-07-04", freq="D"), + Timestamp("1776-07-04", tz="UTC", freq="D"), + ], + ) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_with_freq(self, ts, other): + + with pytest.raises(TypeError): + ts + other + with pytest.raises(TypeError): + other + ts + + with pytest.raises(TypeError): + ts - other + + with pytest.raises(TypeError): + other - ts diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_comparisons.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_comparisons.py new file mode 100644 index 0000000..fce4fa6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_comparisons.py @@ -0,0 +1,192 @@ +from datetime import datetime +import operator + +import numpy as np +import pytest + +from pandas import Timestamp + + +class TestTimestampComparison: + def test_comparison_object_array(self): + # GH#15183 + ts = Timestamp("2011-01-03 00:00:00-0500", tz="US/Eastern") + other = Timestamp("2011-01-01 00:00:00-0500", tz="US/Eastern") + naive = Timestamp("2011-01-01 00:00:00") + + arr = np.array([other, ts], dtype=object) + res = arr == ts + expected = np.array([False, True], 
dtype=bool) + assert (res == expected).all() + + # 2D case + arr = np.array([[other, ts], [ts, other]], dtype=object) + res = arr != ts + expected = np.array([[True, False], [False, True]], dtype=bool) + assert res.shape == expected.shape + assert (res == expected).all() + + # tzaware mismatch + arr = np.array([naive], dtype=object) + with pytest.raises(TypeError): + arr < ts + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = 1337299200000000000 + + val = Timestamp(stamp) + + assert val == val + assert not val != val + assert not val < val + assert val <= val + assert not val > val + assert val >= val + + other = datetime(2012, 5, 18) + assert val == other + assert not val != other + assert not val < other + assert val <= other + assert not val > other + assert val >= other + + other = Timestamp(stamp + 100) + + assert val != other + assert val != other + assert val < other + assert val <= other + assert other > val + assert other >= val + + def test_compare_invalid(self): + # GH#8058 + val = Timestamp("20130101 12:01:02") + assert not val == "foo" + assert not val == 10.0 + assert not val == 1 + assert not val == [] + assert not val == {"foo": 1} + assert not val == np.float64(1) + assert not val == np.int64(1) + + assert val != "foo" + assert val != 10.0 + assert val != 1 + assert val != [] + assert val != {"foo": 1} + assert val != np.float64(1) + assert val != np.int64(1) + + def test_cant_compare_tz_naive_w_aware(self, utc_fixture): + # see GH#1404 + a = Timestamp("3/12/2012") + b = Timestamp("3/12/2012", tz=utc_fixture) + + with pytest.raises(TypeError): + a == b + with pytest.raises(TypeError): + a != b + with pytest.raises(TypeError): + a < b + with pytest.raises(TypeError): + a <= b + with pytest.raises(TypeError): + a > b + with pytest.raises(TypeError): + a >= b + + with pytest.raises(TypeError): + b == a + with pytest.raises(TypeError): + b != a + with pytest.raises(TypeError): + b < a + with pytest.raises(TypeError): + b <= a + with pytest.raises(TypeError): + b > a + with pytest.raises(TypeError): + b >= a + + assert not a == b.to_pydatetime() + assert not a.to_pydatetime() == b + + def test_timestamp_compare_scalars(self): + # case where ndim == 0 + lhs = np.datetime64(datetime(2013, 12, 6)) + rhs = Timestamp("now") + nat = Timestamp("nat") + + ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + expected = left_f(lhs, rhs) + + result = right_f(rhs, lhs) + assert result == expected + + expected = left_f(rhs, nat) + result = right_f(nat, rhs) + assert result == expected + + def test_timestamp_compare_with_early_datetime(self): + # e.g. 
datetime.min + stamp = Timestamp("2012-01-01") + + assert not stamp == datetime.min + assert not stamp == datetime(1600, 1, 1) + assert not stamp == datetime(2700, 1, 1) + assert stamp != datetime.min + assert stamp != datetime(1600, 1, 1) + assert stamp != datetime(2700, 1, 1) + assert stamp > datetime(1600, 1, 1) + assert stamp >= datetime(1600, 1, 1) + assert stamp < datetime(2700, 1, 1) + assert stamp <= datetime(2700, 1, 1) + + def test_compare_zerodim_array(self): + # GH#26916 + ts = Timestamp.now() + dt64 = np.datetime64("2016-01-01", "ns") + arr = np.array(dt64) + assert arr.ndim == 0 + + result = arr < ts + assert result is True + result = arr > ts + assert result is False + + +def test_rich_comparison_with_unsupported_type(): + # Comparisons with unsupported objects should return NotImplemented + # (it previously raised TypeError, see #24011) + + class Inf: + def __lt__(self, o): + return False + + def __le__(self, o): + return isinstance(o, Inf) + + def __gt__(self, o): + return not isinstance(o, Inf) + + def __ge__(self, o): + return True + + def __eq__(self, other) -> bool: + return isinstance(other, Inf) + + inf = Inf() + timestamp = Timestamp("2018-11-30") + + for left, right in [(inf, timestamp), (timestamp, inf)]: + assert left > right or left < right + assert left >= right or left <= right + assert not (left == right) + assert left != right diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_rendering.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_rendering.py new file mode 100644 index 0000000..6b64b23 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_rendering.py @@ -0,0 +1,87 @@ +import pprint + +import pytest +import pytz # noqa # a test below uses pytz but only inside a `eval` call + +from pandas import Timestamp + + +class TestTimestampRendering: + + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + + @pytest.mark.parametrize("tz", timezones) + @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) + @pytest.mark.parametrize( + "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] + ) + def test_repr(self, date, freq, tz): + # avoid to match with timezone name + freq_repr = "'{0}'".format(freq) + if tz.startswith("dateutil"): + tz_repr = tz.replace("dateutil", "") + else: + tz_repr = tz + + date_only = Timestamp(date) + assert date in repr(date_only) + assert tz_repr not in repr(date_only) + assert freq_repr not in repr(date_only) + assert date_only == eval(repr(date_only)) + + date_tz = Timestamp(date, tz=tz) + assert date in repr(date_tz) + assert tz_repr in repr(date_tz) + assert freq_repr not in repr(date_tz) + assert date_tz == eval(repr(date_tz)) + + date_freq = Timestamp(date, freq=freq) + assert date in repr(date_freq) + assert tz_repr not in repr(date_freq) + assert freq_repr in repr(date_freq) + assert date_freq == eval(repr(date_freq)) + + date_tz_freq = Timestamp(date, tz=tz, freq=freq) + assert date in repr(date_tz_freq) + assert tz_repr in repr(date_tz_freq) + assert freq_repr in repr(date_tz_freq) + assert date_tz_freq == eval(repr(date_tz_freq)) + + def test_repr_utcoffset(self): + # This can cause the tz field to be populated, but it's redundant to + # include this information in the date-string. 
+ date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) + assert "tzoffset" not in repr(date_with_utc_offset) + assert "pytz.FixedOffset(-240)" in repr(date_with_utc_offset) + expr = repr(date_with_utc_offset).replace( + "'pytz.FixedOffset(-240)'", "pytz.FixedOffset(-240)" + ) + assert date_with_utc_offset == eval(expr) + + def test_timestamp_repr_pre1900(self): + # pre-1900 + stamp = Timestamp("1850-01-01", tz="US/Eastern") + repr(stamp) + + iso8601 = "1850-01-01 01:23:45.012345" + stamp = Timestamp(iso8601, tz="US/Eastern") + result = repr(stamp) + assert iso8601 in result + + def test_pprint(self): + # GH#12622 + nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10} + result = pprint.pformat(nested_obj, width=50) + expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], + 'foo': 1}""" + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timestamp.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timestamp.py new file mode 100644 index 0000000..1ce1087 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timestamp.py @@ -0,0 +1,1087 @@ +""" test the scalar Timestamp """ + +import calendar +from datetime import datetime, timedelta +import locale +import unicodedata + +import dateutil +from dateutil.tz import tzutc +import numpy as np +import pytest +import pytz +from pytz import timezone, utc + +from pandas._libs.tslibs import conversion +from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone +import pandas.compat as compat +from pandas.compat.numpy import np_datetime64_compat +from pandas.errors import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +from pandas import NaT, Period, Timedelta, Timestamp +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestTimestampProperties: + def test_properties_business(self): + ts = Timestamp("2017-10-01", freq="B") + control = Timestamp("2017-10-01") + assert ts.dayofweek == 6 + assert not ts.is_month_start # not a weekday + assert not ts.is_quarter_start # not a weekday + # Control case: non-business is month/qtr start + assert control.is_month_start + assert control.is_quarter_start + + ts = Timestamp("2017-09-30", freq="B") + control = Timestamp("2017-09-30") + assert ts.dayofweek == 5 + assert not ts.is_month_end # not a weekday + assert not ts.is_quarter_end # not a weekday + # Control case: non-business is month/qtr start + assert control.is_month_end + assert control.is_quarter_end + + def test_fields(self): + def check(value, equal): + # that we are int like + assert isinstance(value, int) + assert value == equal + + # GH 10050 + ts = Timestamp("2015-05-10 09:06:03.000100001") + check(ts.year, 2015) + check(ts.month, 5) + check(ts.day, 10) + check(ts.hour, 9) + check(ts.minute, 6) + check(ts.second, 3) + msg = "'Timestamp' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + ts.millisecond + 
check(ts.microsecond, 100) + check(ts.nanosecond, 1) + check(ts.dayofweek, 6) + check(ts.quarter, 2) + check(ts.dayofyear, 130) + check(ts.week, 19) + check(ts.daysinmonth, 31) + check(ts.daysinmonth, 31) + + # GH 13303 + ts = Timestamp("2014-12-31 23:59:00-05:00", tz="US/Eastern") + check(ts.year, 2014) + check(ts.month, 12) + check(ts.day, 31) + check(ts.hour, 23) + check(ts.minute, 59) + check(ts.second, 0) + msg = "'Timestamp' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + ts.millisecond + check(ts.microsecond, 0) + check(ts.nanosecond, 0) + check(ts.dayofweek, 2) + check(ts.quarter, 4) + check(ts.dayofyear, 365) + check(ts.week, 1) + check(ts.daysinmonth, 31) + + ts = Timestamp("2014-01-01 00:00:00+01:00") + starts = ["is_month_start", "is_quarter_start", "is_year_start"] + for start in starts: + assert getattr(ts, start) + ts = Timestamp("2014-12-31 23:59:59+01:00") + ends = ["is_month_end", "is_year_end", "is_quarter_end"] + for end in ends: + assert getattr(ts, end) + + # GH 12806 + @pytest.mark.parametrize( + "data", + [Timestamp("2017-08-28 23:00:00"), Timestamp("2017-08-28 23:00:00", tz="EST")], + ) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) + def test_names(self, data, time_locale): + # GH 17354 + # Test .day_name(), .month_name + if time_locale is None: + expected_day = "Monday" + expected_month = "August" + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_day = calendar.day_name[0].capitalize() + expected_month = calendar.month_name[8].capitalize() + + result_day = data.day_name(time_locale) + result_month = data.month_name(time_locale) + + # Work around https://github.com/pandas-dev/pandas/issues/22342 + # different normalizations + expected_day = unicodedata.normalize("NFD", expected_day) + expected_month = unicodedata.normalize("NFD", expected_month) + + result_day = unicodedata.normalize("NFD", result_day) + result_month = unicodedata.normalize("NFD", result_month) + + assert result_day == expected_day + assert result_month == expected_month + + # Test NaT + nan_ts = Timestamp(NaT) + assert np.isnan(nan_ts.day_name(time_locale)) + assert np.isnan(nan_ts.month_name(time_locale)) + + def test_is_leap_year(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH 13727 + dt = Timestamp("2000-01-01 00:00:00", tz=tz) + assert dt.is_leap_year + assert isinstance(dt.is_leap_year, bool) + + dt = Timestamp("1999-01-01 00:00:00", tz=tz) + assert not dt.is_leap_year + + dt = Timestamp("2004-01-01 00:00:00", tz=tz) + assert dt.is_leap_year + + dt = Timestamp("2100-01-01 00:00:00", tz=tz) + assert not dt.is_leap_year + + def test_woy_boundary(self): + # make sure weeks at year boundaries are correct + d = datetime(2013, 12, 31) + result = Timestamp(d).week + expected = 1 # ISO standard + assert result == expected + + d = datetime(2008, 12, 28) + result = Timestamp(d).week + expected = 52 # ISO standard + assert result == expected + + d = datetime(2009, 12, 31) + result = Timestamp(d).week + expected = 53 # ISO standard + assert result == expected + + d = datetime(2010, 1, 1) + result = Timestamp(d).week + expected = 53 # ISO standard + assert result == expected + + d = datetime(2010, 1, 3) + result = Timestamp(d).week + expected = 53 # ISO standard + assert result == expected + + result = np.array( + [ + Timestamp(datetime(*args)).week + for args in [(2000, 1, 1), (2000, 1, 2), (2005, 1, 1), (2005, 1, 2)] + ] + ) + assert (result == [52, 52, 53, 
53]).all() + + def test_resolution(self): + # GH#21336, GH#21365 + dt = Timestamp("2100-01-01 00:00:00") + assert dt.resolution == Timedelta(nanoseconds=1) + + # Check that the attribute is available on the class, mirroring + # the stdlib datetime behavior + assert Timestamp.resolution == Timedelta(nanoseconds=1) + + +class TestTimestampConstructors: + def test_constructor(self): + base_str = "2014-07-01 09:00" + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1_404_205_200_000_000_000 + + # confirm base representation is correct + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected + + tests = [ + (base_str, base_dt, base_expected), + ( + "2014-07-01 10:00", + datetime(2014, 7, 1, 10), + base_expected + 3600 * 1_000_000_000, + ), + ( + "2014-07-01 09:00:00.000008000", + datetime(2014, 7, 1, 9, 0, 0, 8), + base_expected + 8000, + ), + ( + "2014-07-01 09:00:00.000000005", + Timestamp("2014-07-01 09:00:00.000000005"), + base_expected + 5, + ), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] + + for date_str, date, expected in tests: + for result in [Timestamp(date_str), Timestamp(date)]: + # only with timestring + assert result.value == expected + assert conversion.pydt_to_i8(result) == expected + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + assert result.value == expected + assert conversion.pydt_to_i8(result) == expected + + # with timezone + for tz, offset in timezones: + for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: + expected_tz = expected - offset * 3600 * 1_000_000_000 + assert result.value == expected_tz + assert conversion.pydt_to_i8(result) == expected_tz + + # should preserve tz + result = Timestamp(result) + assert result.value == expected_tz + assert conversion.pydt_to_i8(result) == expected_tz + + # should convert to UTC + if tz is not None: + result = Timestamp(result).tz_convert("UTC") + else: + result = Timestamp(result, tz="UTC") + expected_utc = expected - offset * 3600 * 1_000_000_000 + assert result.value == expected_utc + assert conversion.pydt_to_i8(result) == expected_utc + + def test_constructor_with_stringoffset(self): + # GH 7833 + base_str = "2014-07-01 11:00:00+02:00" + base_dt = datetime(2014, 7, 1, 9) + base_expected = 1_404_205_200_000_000_000 + + # confirm base representation is correct + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected + + tests = [ + (base_str, base_expected), + ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000), + ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), + ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] + + for date_str, expected in tests: + for result in [Timestamp(date_str)]: + # only with timestring + assert result.value == expected + assert conversion.pydt_to_i8(result) == expected + + # re-creation shouldn't affect to internal value + result = Timestamp(result) + assert result.value == expected + assert conversion.pydt_to_i8(result) == expected + + # with timezone + for tz, offset in timezones: + result = Timestamp(date_str, tz=tz) + expected_tz = expected + assert result.value == 
expected_tz + assert conversion.pydt_to_i8(result) == expected_tz + + # should preserve tz + result = Timestamp(result) + assert result.value == expected_tz + assert conversion.pydt_to_i8(result) == expected_tz + + # should convert to UTC + result = Timestamp(result).tz_convert("UTC") + expected_utc = expected + assert result.value == expected_utc + assert conversion.pydt_to_i8(result) == expected_utc + + # This should be 2013-11-01 05:00 in UTC + # converted to Chicago tz + result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") + assert result.value == Timestamp("2013-11-01 05:00").value + expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa + assert repr(result) == expected + assert result == eval(repr(result)) + + # This should be 2013-11-01 05:00 in UTC + # converted to Tokyo tz (+09:00) + result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo") + assert result.value == Timestamp("2013-11-01 05:00").value + expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" + assert repr(result) == expected + assert result == eval(repr(result)) + + # GH11708 + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Katmandu + result = Timestamp("2015-11-18 15:45:00+05:45", tz="Asia/Katmandu") + assert result.value == Timestamp("2015-11-18 10:00").value + expected = "Timestamp('2015-11-18 15:45:00+0545', tz='Asia/Katmandu')" + assert repr(result) == expected + assert result == eval(repr(result)) + + # This should be 2015-11-18 10:00 in UTC + # converted to Asia/Kolkata + result = Timestamp("2015-11-18 15:30:00+05:30", tz="Asia/Kolkata") + assert result.value == Timestamp("2015-11-18 10:00").value + expected = "Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata')" + assert repr(result) == expected + assert result == eval(repr(result)) + + def test_constructor_invalid(self): + with pytest.raises(TypeError, match="Cannot convert input"): + Timestamp(slice(2)) + with pytest.raises(ValueError, match="Cannot convert Period"): + Timestamp(Period("1000-01-01")) + + def test_constructor_invalid_tz(self): + # GH#17690 + with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + Timestamp("2017-10-22", tzinfo="US/Eastern") + + with pytest.raises(ValueError, match="at most one of"): + Timestamp("2017-10-22", tzinfo=utc, tz="UTC") + + with pytest.raises(ValueError, match="Invalid frequency:"): + # GH#5168 + # case where user tries to pass tz as an arg, not kwarg, gets + # interpreted as a `freq` + Timestamp("2012-01-01", "US/Pacific") + + def test_constructor_strptime(self): + # GH25016 + # Test support for Timestamp.strptime + fmt = "%Y%m%d-%H%M%S-%f%z" + ts = "20190129-235348-000001+0000" + with pytest.raises(NotImplementedError): + Timestamp.strptime(ts, fmt) + + def test_constructor_tz_or_tzinfo(self): + # GH#17943, GH#17690, GH#5168 + stamps = [ + Timestamp(year=2017, month=10, day=22, tz="UTC"), + Timestamp(year=2017, month=10, day=22, tzinfo=utc), + Timestamp(year=2017, month=10, day=22, tz=utc), + Timestamp(datetime(2017, 10, 22), tzinfo=utc), + Timestamp(datetime(2017, 10, 22), tz="UTC"), + Timestamp(datetime(2017, 10, 22), tz=utc), + ] + assert all(ts == stamps[0] for ts in stamps) + + def test_constructor_positional(self): + # see gh-10758 + with pytest.raises(TypeError): + Timestamp(2000, 1) + with pytest.raises(ValueError): + Timestamp(2000, 0, 1) + with pytest.raises(ValueError): + Timestamp(2000, 13, 1) + with pytest.raises(ValueError): + Timestamp(2000, 1, 0) + with pytest.raises(ValueError): + Timestamp(2000, 1, 32) 
+ + # see gh-11630 + assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112")) + assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( + Timestamp("2015-11-12 01:02:03.999999") + ) + + def test_constructor_keyword(self): + # GH 10758 + with pytest.raises(TypeError): + Timestamp(year=2000, month=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=0, day=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=13, day=1) + with pytest.raises(ValueError): + Timestamp(year=2000, month=1, day=0) + with pytest.raises(ValueError): + Timestamp(year=2000, month=1, day=32) + + assert repr(Timestamp(year=2015, month=11, day=12)) == repr( + Timestamp("20151112") + ) + + assert repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) + ) == repr(Timestamp("2015-11-12 01:02:03.999999")) + + def test_constructor_fromordinal(self): + base = datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq="D") + assert base == ts + assert ts.freq == "D" + assert base.toordinal() == ts.toordinal() + + ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") + assert Timestamp("2000-01-01", tz="US/Eastern") == ts + assert base.toordinal() == ts.toordinal() + + # GH#3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + assert ts.to_pydatetime() == dt + + # with a tzinfo + stamp = Timestamp("2011-4-16", tz="US/Eastern") + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") + assert ts.to_pydatetime() == dt_tz + + @pytest.mark.parametrize( + "result", + [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + ), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + tz="UTC", + ), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + ], + ) + def test_constructor_nanosecond(self, result): + # GH 18898 + expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) + expected = expected + Timedelta(nanoseconds=1) + assert result == expected + + @pytest.mark.parametrize("z", ["Z0", "Z00"]) + def test_constructor_invalid_Z0_isostring(self, z): + # GH 8910 + with pytest.raises(ValueError): + Timestamp("2014-11-02 01:00{}".format(z)) + + @pytest.mark.parametrize( + "arg", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + ], + ) + def test_invalid_date_kwarg_with_string_input(self, arg): + kwarg = {arg: 1} + with pytest.raises(ValueError): + Timestamp("2010-10-10 12:59:59.999999999", **kwarg) + + def test_out_of_bounds_integer_value(self): + # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime): + Timestamp(Timestamp.min.value * 2) + + def test_out_of_bounds_value(self): + one_us = np.timedelta64(1).astype("timedelta64[us]") + + # By definition we can't go out of bounds in [ns], so we + # convert the datetime64s to [us] so we can go out of bounds + min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]") + max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]") + + # No error for the min/max datetimes + Timestamp(min_ts_us) + Timestamp(max_ts_us) + + # One us less than the minimum is an error + with 
pytest.raises(ValueError): + Timestamp(min_ts_us - one_us) + + # One us more than the maximum is an error + with pytest.raises(ValueError): + Timestamp(max_ts_us + one_us) + + def test_out_of_bounds_string(self): + with pytest.raises(ValueError): + Timestamp("1676-01-01") + with pytest.raises(ValueError): + Timestamp("2263-01-01") + + def test_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + with pytest.raises(OutOfBoundsDatetime): + Timestamp("2262-04-11 23:47:16.854775808") + + def test_bounds_with_different_units(self): + out_of_bounds_dates = ("1677-09-21", "2262-04-12") + + time_units = ("D", "h", "m", "s", "ms", "us") + + for date_string in out_of_bounds_dates: + for unit in time_units: + dt64 = np.datetime64(date_string, unit) + with pytest.raises(ValueError): + Timestamp(dt64) + + in_bounds_dates = ("1677-09-23", "2262-04-11") + + for date_string in in_bounds_dates: + for unit in time_units: + dt64 = np.datetime64(date_string, unit) + Timestamp(dt64) + + def test_min_valid(self): + # Ensure that Timestamp.min is a valid Timestamp + Timestamp(Timestamp.min) + + def test_max_valid(self): + # Ensure that Timestamp.max is a valid Timestamp + Timestamp(Timestamp.max) + + def test_now(self): + # GH#9000 + ts_from_string = Timestamp("now") + ts_from_method = Timestamp.now() + ts_datetime = datetime.now() + + ts_from_string_tz = Timestamp("now", tz="US/Eastern") + ts_from_method_tz = Timestamp.now(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + def test_today(self): + ts_from_string = Timestamp("today") + ts_from_method = Timestamp.today() + ts_datetime = datetime.today() + + ts_from_string_tz = Timestamp("today", tz="US/Eastern") + ts_from_method_tz = Timestamp.today(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + def test_disallow_setting_tz(self, tz): + # GH 3746 + ts = Timestamp("2010") + with pytest.raises(AttributeError): + ts.tz = tz + + @pytest.mark.parametrize("offset", ["+0300", "+0200"]) + def test_construct_timestamp_near_dst(self, offset): + # GH 20854 + expected = Timestamp( + "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki" + ) + result = Timestamp(expected).tz_convert("Europe/Helsinki") + assert result == expected + + @pytest.mark.parametrize( + "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"] + ) + def test_construct_with_different_string_format(self, arg): + # GH 12064 + result = Timestamp(arg) + expected = Timestamp(datetime(2013, 1, 1), tz=pytz.FixedOffset(540)) + assert result == expected + + def test_construct_timestamp_preserve_original_frequency(self): + # GH 22311 + result = Timestamp(Timestamp("2010-08-08", freq="D")).freq + expected = offsets.Day() + 
assert result == expected + + def test_constructor_invalid_frequency(self): + # GH 22311 + with pytest.raises(ValueError, match="Invalid frequency:"): + Timestamp("2012-01-01", freq=[]) + + @pytest.mark.parametrize("box", [datetime, Timestamp]) + def test_raise_tz_and_tzinfo_in_datetime_input(self, box): + # GH 23579 + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc} + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + Timestamp(box(**kwargs), tz="US/Pacific") + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) + + def test_dont_convert_dateutil_utc_to_pytz_utc(self): + result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) + expected = Timestamp(datetime(2018, 1, 1)).tz_localize(tzutc()) + assert result == expected + + def test_constructor_subclassed_datetime(self): + # GH 25851 + # ensure that subclassed datetime works for + # Timestamp creation + class SubDatetime(datetime): + pass + + data = SubDatetime(2000, 1, 1) + result = Timestamp(data) + expected = Timestamp(2000, 1, 1) + assert result == expected + + @pytest.mark.skipif( + not compat.PY38, + reason="datetime.fromisocalendar was added in Python version 3.8", + ) + def test_constructor_fromisocalendar(self): + # GH 30395 + expected_timestamp = Timestamp("2000-01-03 00:00:00") + expected_stdlib = datetime.fromisocalendar(2000, 1, 1) + result = Timestamp.fromisocalendar(2000, 1, 1) + assert result == expected_timestamp + assert result == expected_stdlib + assert isinstance(result, Timestamp) + + +class TestTimestamp: + def test_tz(self): + tstr = "2014-02-01 09:00" + ts = Timestamp(tstr) + local = ts.tz_localize("Asia/Tokyo") + assert local.hour == 9 + assert local == Timestamp(tstr, tz="Asia/Tokyo") + conv = local.tz_convert("US/Eastern") + assert conv == Timestamp("2014-01-31 19:00", tz="US/Eastern") + assert conv.hour == 19 + + # preserves nanosecond + ts = Timestamp(tstr) + offsets.Nano(5) + local = ts.tz_localize("Asia/Tokyo") + assert local.hour == 9 + assert local.nanosecond == 5 + conv = local.tz_convert("US/Eastern") + assert conv.nanosecond == 5 + assert conv.hour == 19 + + def test_utc_z_designator(self): + assert get_timezone(Timestamp("2014-11-02 01:00Z").tzinfo) is utc + + def test_asm8(self): + np.random.seed(7_960_929) + ns = [Timestamp.min.value, Timestamp.max.value, 1000] + + for n in ns: + assert ( + Timestamp(n).asm8.view("i8") == np.datetime64(n, "ns").view("i8") == n + ) + + assert Timestamp("nat").asm8.view("i8") == np.datetime64("nat", "ns").view("i8") + + def test_class_ops_pytz(self): + def compare(x, y): + assert int((Timestamp(x).value - Timestamp(y).value) / 1e9) == 0 + + compare(Timestamp.now(), datetime.now()) + compare(Timestamp.now("UTC"), datetime.now(timezone("UTC"))) + compare(Timestamp.utcnow(), datetime.utcnow()) + compare(Timestamp.today(), datetime.today()) + current_time = calendar.timegm(datetime.now().utctimetuple()) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) + + date_component = datetime.utcnow() + time_component = (date_component + timedelta(minutes=10)).time() + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) + + def test_class_ops_dateutil(self): + def compare(x, y): + assert ( + int( + np.round(Timestamp(x).value / 1e9) + - 
np.round(Timestamp(y).value / 1e9) + ) + == 0 + ) + + compare(Timestamp.now(), datetime.now()) + compare(Timestamp.now("UTC"), datetime.now(tzutc())) + compare(Timestamp.utcnow(), datetime.utcnow()) + compare(Timestamp.today(), datetime.today()) + current_time = calendar.timegm(datetime.now().utctimetuple()) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) + + date_component = datetime.utcnow() + time_component = (date_component + timedelta(minutes=10)).time() + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) + + def test_basics_nanos(self): + val = np.int64(946_684_800_000_000_000).view("M8[ns]") + stamp = Timestamp(val.view("i8") + 500) + assert stamp.year == 2000 + assert stamp.month == 1 + assert stamp.microsecond == 0 + assert stamp.nanosecond == 500 + + # GH 14415 + val = np.iinfo(np.int64).min + 80_000_000_000_000 + stamp = Timestamp(val) + assert stamp.year == 1677 + assert stamp.month == 9 + assert stamp.day == 21 + assert stamp.microsecond == 145224 + assert stamp.nanosecond == 192 + + @pytest.mark.parametrize( + "value, check_kwargs", + [ + [946688461000000000, {}], + [946688461000000000 / 1000, dict(unit="us")], + [946688461000000000 / 1_000_000, dict(unit="ms")], + [946688461000000000 / 1_000_000_000, dict(unit="s")], + [10957, dict(unit="D", h=0)], + [ + (946688461000000000 + 500000) / 1000000000, + dict(unit="s", us=499, ns=964), + ], + [(946688461000000000 + 500000000) / 1000000000, dict(unit="s", us=500000)], + [(946688461000000000 + 500000) / 1000000, dict(unit="ms", us=500)], + [(946688461000000000 + 500000) / 1000, dict(unit="us", us=500)], + [(946688461000000000 + 500000000) / 1000000, dict(unit="ms", us=500000)], + [946688461000000000 / 1000.0 + 5, dict(unit="us", us=5)], + [946688461000000000 / 1000.0 + 5000, dict(unit="us", us=5000)], + [946688461000000000 / 1000000.0 + 0.5, dict(unit="ms", us=500)], + [946688461000000000 / 1000000.0 + 0.005, dict(unit="ms", us=5, ns=5)], + [946688461000000000 / 1000000000.0 + 0.5, dict(unit="s", us=500000)], + [10957 + 0.5, dict(unit="D", h=12)], + ], + ) + def test_unit(self, value, check_kwargs): + def check(value, unit=None, h=1, s=1, us=0, ns=0): + stamp = Timestamp(value, unit=unit) + assert stamp.year == 2000 + assert stamp.month == 1 + assert stamp.day == 1 + assert stamp.hour == h + if unit != "D": + assert stamp.minute == 1 + assert stamp.second == s + assert stamp.microsecond == us + else: + assert stamp.minute == 0 + assert stamp.second == 0 + assert stamp.microsecond == 0 + assert stamp.nanosecond == ns + + check(value, **check_kwargs) + + def test_roundtrip(self): + + # test value to string and back conversions + # further test accessors + base = Timestamp("20140101 00:00:00") + + result = Timestamp(base.value + Timedelta("5ms").value) + assert result == Timestamp(f"{base}.005000") + assert result.microsecond == 5000 + + result = Timestamp(base.value + Timedelta("5us").value) + assert result == Timestamp(f"{base}.000005") + assert result.microsecond == 5 + + result = Timestamp(base.value + Timedelta("5ns").value) + assert result == Timestamp(f"{base}.000000005") + assert result.nanosecond == 5 + assert result.microsecond == 0 + + result = Timestamp(base.value + Timedelta("6ms 5us").value) + assert result == Timestamp(f"{base}.006005") + assert result.microsecond == 5 + 6 * 1000 + + result = 
Timestamp(base.value + Timedelta("200ms 5us").value) + assert result == Timestamp(f"{base}.200005") + assert result.microsecond == 5 + 200 * 1000 + + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1): 5} + stamp = Timestamp(datetime(2011, 1, 1)) + assert d[stamp] == 5 + + def test_tz_conversion_freq(self, tz_naive_fixture): + # GH25241 + t1 = Timestamp("2019-01-01 10:00", freq="H") + assert t1.tz_localize(tz=tz_naive_fixture).freq == t1.freq + t2 = Timestamp("2019-01-02 12:00", tz="UTC", freq="T") + assert t2.tz_convert(tz="UTC").freq == t2.freq + + +class TestTimestampNsOperations: + def test_nanosecond_string_parsing(self): + ts = Timestamp("2013-05-01 07:15:45.123456789") + # GH 7878 + expected_repr = "2013-05-01 07:15:45.123456789" + expected_value = 1_367_392_545_123_456_789 + assert ts.value == expected_value + assert expected_repr in repr(ts) + + ts = Timestamp("2013-05-01 07:15:45.123456789+09:00", tz="Asia/Tokyo") + assert ts.value == expected_value - 9 * 3600 * 1_000_000_000 + assert expected_repr in repr(ts) + + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="UTC") + assert ts.value == expected_value + assert expected_repr in repr(ts) + + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="US/Eastern") + assert ts.value == expected_value + 4 * 3600 * 1_000_000_000 + assert expected_repr in repr(ts) + + # GH 10041 + ts = Timestamp("20130501T071545.123456789") + assert ts.value == expected_value + assert expected_repr in repr(ts) + + def test_nanosecond_timestamp(self): + # GH 7610 + expected = 1_293_840_000_000_000_005 + t = Timestamp("2011-01-01") + offsets.Nano(5) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" + assert t.value == expected + assert t.nanosecond == 5 + + t = Timestamp(t) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" + assert t.value == expected + assert t.nanosecond == 5 + + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000005Z")) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" + assert t.value == expected + assert t.nanosecond == 5 + + expected = 1_293_840_000_000_000_010 + t = t + offsets.Nano(5) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" + assert t.value == expected + assert t.nanosecond == 10 + + t = Timestamp(t) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" + assert t.value == expected + assert t.nanosecond == 10 + + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000010Z")) + assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" + assert t.value == expected + assert t.nanosecond == 10 + + +class TestTimestampToJulianDate: + def test_compare_1700(self): + r = Timestamp("1700-06-23").to_julian_date() + assert r == 2_342_145.5 + + def test_compare_2000(self): + r = Timestamp("2000-04-12").to_julian_date() + assert r == 2_451_646.5 + + def test_compare_2100(self): + r = Timestamp("2100-08-12").to_julian_date() + assert r == 2_488_292.5 + + def test_compare_hour01(self): + r = Timestamp("2000-08-12T01:00:00").to_julian_date() + assert r == 2_451_768.5416666666666666 + + def test_compare_hour13(self): + r = Timestamp("2000-08-12T13:00:00").to_julian_date() + assert r == 2_451_769.0416666666666666 + + +class TestTimestampConversion: + def test_conversion(self): + # GH#9255 + ts = Timestamp("2000-01-01") + + result = ts.to_pydatetime() + expected = datetime(2000, 1, 1) + assert result == expected + assert type(result) == type(expected) + + result = ts.to_datetime64() + expected = np.datetime64(ts.value, "ns") + assert result 
== expected + assert type(result) == type(expected) + assert result.dtype == expected.dtype + + def test_to_pydatetime_nonzero_nano(self): + ts = Timestamp("2011-01-01 9:00:00.123456789") + + # Warn the user of data loss (nanoseconds). + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + expected = datetime(2011, 1, 1, 9, 0, 0, 123456) + result = ts.to_pydatetime() + assert result == expected + + def test_timestamp_to_datetime(self): + stamp = Timestamp("20090415", tz="US/Eastern", freq="D") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_timestamp_to_datetime_dateutil(self): + stamp = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_timestamp_to_datetime_explicit_pytz(self): + stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + @td.skip_if_windows_python_3 + def test_timestamp_to_datetime_explicit_dateutil(self): + stamp = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. + exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert ( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000 + == Timestamp.max.value / 1000 + ) + + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + assert ( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000 + == Timestamp.min.value / 1000 + ) + + def test_to_period_tz_warning(self): + # GH#21333 make sure a warning is issued when timezone + # info is lost + ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") + with tm.assert_produces_warning(UserWarning): + # warning that timezone info will be lost + ts.to_period("D") + + def test_to_numpy_alias(self): + # GH 24653: alias .to_numpy() for scalars + ts = Timestamp(datetime.now()) + assert ts.to_datetime64() == ts.to_numpy() + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) +def test_dt_subclass_add_timedelta(lh, rh): + # GH#25851 + # ensure that subclassed datetime works for + # Timedelta operations + result = lh + rh + expected = SubDatetime(2000, 1, 1, 1) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timezones.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timezones.py new file mode 100644 index 0000000..6537f6c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_timezones.py @@ -0,0 +1,418 @@ +""" +Tests for Timestamp timezone-related methods +""" +from datetime import date, datetime, timedelta + +import dateutil +from dateutil.tz import gettz, tzoffset +import pytest +import pytz +from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError + +from pandas._libs.tslibs import timezones +from pandas.errors import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +from pandas import NaT, Timestamp + + 
+class TestTimestampTZOperations: + # -------------------------------------------------------------- + # Timestamp.tz_localize + + def test_tz_localize_pushes_out_of_bounds(self): + # GH#12677 + # tz_localize that pushes away from the boundary is OK + pac = Timestamp.min.tz_localize("US/Pacific") + assert pac.value > Timestamp.min.value + pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.min.tz_localize("Asia/Tokyo") + + # tz_localize that pushes away from the boundary is OK + tokyo = Timestamp.max.tz_localize("Asia/Tokyo") + assert tokyo.value < Timestamp.max.value + tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.max.tz_localize("US/Pacific") + + def test_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + # GH#14402 + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + + with pytest.raises(pytz.AmbiguousTimeError): + ts.tz_localize("US/Central") + + result = ts.tz_localize("US/Central", ambiguous=True) + assert result == expected0 + + result = ts.tz_localize("US/Central", ambiguous=False) + assert result == expected1 + + def test_tz_localize_ambiguous(self): + ts = Timestamp("2014-11-02 01:00") + ts_dst = ts.tz_localize("US/Eastern", ambiguous=True) + ts_no_dst = ts.tz_localize("US/Eastern", ambiguous=False) + + assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 + with pytest.raises(ValueError): + ts.tz_localize("US/Eastern", ambiguous="infer") + + # GH#8025 + msg = "Cannot localize tz-aware Timestamp, use tz_convert for conversions" + with pytest.raises(TypeError, match=msg): + Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") + + msg = "Cannot convert tz-naive Timestamp, use tz_localize to localize" + with pytest.raises(TypeError, match=msg): + Timestamp("2011-01-01").tz_convert("Asia/Tokyo") + + @pytest.mark.parametrize( + "stamp, tz", + [ + ("2015-03-08 02:00", "US/Eastern"), + ("2015-03-08 02:30", "US/Pacific"), + ("2015-03-29 02:00", "Europe/Paris"), + ("2015-03-29 02:30", "Europe/Belgrade"), + ], + ) + def test_tz_localize_nonexistent(self, stamp, tz): + # GH#13057 + ts = Timestamp(stamp) + with pytest.raises(NonExistentTimeError): + ts.tz_localize(tz) + # GH 22644 + with pytest.raises(NonExistentTimeError): + ts.tz_localize(tz, nonexistent="raise") + assert ts.tz_localize(tz, nonexistent="NaT") is NaT + + def test_tz_localize_ambiguous_raise(self): + # GH#13057 + ts = Timestamp("2015-11-1 01:00") + with pytest.raises(AmbiguousTimeError): + ts.tz_localize("US/Pacific", ambiguous="raise") + + def test_tz_localize_nonexistent_invalid_arg(self): + # GH 22644 + tz = "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:00:00") + with pytest.raises(ValueError): + ts.tz_localize(tz, nonexistent="foo") + + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) + def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): + tz = tz_aware_fixture + ts = Timestamp(stamp) + localized = ts.tz_localize(tz) + assert localized == Timestamp(stamp, tz=tz) + + with pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + assert reset == ts + assert reset.tzinfo is None + + def test_tz_localize_ambiguous_compat(self): + # validate that pytz and 
dateutil are compat for dst + # when the transition happens + naive = Timestamp("2013-10-27 01:00:00") + + pytz_zone = "Europe/London" + dateutil_zone = "dateutil/Europe/London" + result_pytz = naive.tz_localize(pytz_zone, ambiguous=0) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382835600000000000 + + # fixed ambiguous behavior + # see gh-14621 + assert result_pytz.to_pydatetime().tzname() == "GMT" + assert result_dateutil.to_pydatetime().tzname() == "BST" + assert str(result_pytz) != str(result_dateutil) + + # 1 hour difference + result_pytz = naive.tz_localize(pytz_zone, ambiguous=1) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=1) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382832000000000000 + + # see gh-14621 + assert str(result_pytz) == str(result_dateutil) + assert ( + result_pytz.to_pydatetime().tzname() + == result_dateutil.to_pydatetime().tzname() + ) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_tz_localize(self, tz): + stamp = Timestamp("3/11/2012 04:00") + + result = stamp.tz_localize(tz) + expected = Timestamp("3/11/2012 04:00", tz=tz) + assert result.hour == expected.hour + assert result == expected + + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): + # GH 8917, 24466 + tz = tz_type + tz + if isinstance(shift, str): + shift = "shift_" + shift + ts = Timestamp(start_ts) + result = ts.tz_localize(tz, nonexistent=shift) + expected = Timestamp(end_ts).tz_localize(tz) + assert result == expected + + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): + # GH 8917, 24466 + tz = tz_type + "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:20:00") + msg = "The provided timedelta will relocalize on a nonexistent time" + with pytest.raises(ValueError, match=msg): + ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) + + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + def test_timestamp_tz_localize_nonexistent_NaT(self, tz): + # GH 8917 + ts = Timestamp("2015-03-29 02:20:00") + result = ts.tz_localize(tz, nonexistent="NaT") + assert result is NaT + + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + def test_timestamp_tz_localize_nonexistent_raise(self, tz): + # GH 8917 + ts = Timestamp("2015-03-29 
02:20:00") + with pytest.raises(pytz.NonExistentTimeError): + ts.tz_localize(tz, nonexistent="raise") + with pytest.raises(ValueError): + ts.tz_localize(tz, nonexistent="foo") + + # ------------------------------------------------------------------ + # Timestamp.tz_convert + + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) + def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture): + tz = tz_aware_fixture + + ts = Timestamp(stamp, tz="UTC") + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + assert reset == Timestamp(stamp) + assert reset.tzinfo is None + assert reset == converted.tz_convert("UTC").tz_localize(None) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_astimezone(self, tzstr): + # astimezone is an alias for tz_convert, so keep it with + # the tz_convert tests + utcdate = Timestamp("3/11/2012 22:00", tz="UTC") + expected = utcdate.tz_convert(tzstr) + result = utcdate.astimezone(tzstr) + assert expected == result + assert isinstance(result, Timestamp) + + @td.skip_if_windows + def test_tz_convert_utc_with_system_utc(self): + + # from system utc to real utc + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) + # check that the time hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # from system utc to real utc + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) + # check that the time hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # ------------------------------------------------------------------ + # Timestamp.__init__ with tz str or tzinfo + + def test_timestamp_constructor_tz_utc(self): + utc_stamp = Timestamp("3/11/2012 05:00", tz="utc") + assert utc_stamp.tzinfo is pytz.utc + assert utc_stamp.hour == 5 + + utc_stamp = Timestamp("3/11/2012 05:00").tz_localize("utc") + assert utc_stamp.hour == 5 + + def test_timestamp_to_datetime_tzoffset(self): + tzinfo = tzoffset(None, 7200) + expected = Timestamp("3/11/2012 04:00", tz=tzinfo) + result = Timestamp(expected.to_pydatetime()) + assert expected == result + + def test_timestamp_constructor_near_dst_boundary(self): + # GH#11481 & GH#15777 + # Naive string timestamps were being localized incorrectly + # with tz_convert_single instead of tz_localize_to_utc + + for tz in ["Europe/Brussels", "Europe/Prague"]: + result = Timestamp("2015-10-25 01:00", tz=tz) + expected = Timestamp("2015-10-25 01:00").tz_localize(tz) + assert result == expected + + with pytest.raises(pytz.AmbiguousTimeError): + Timestamp("2015-10-25 02:00", tz=tz) + + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + Timestamp("2017-03-26 02:00", tz="Europe/Paris") + + # GH#11708 + naive = Timestamp("2015-11-18 10:00:00") + result = naive.tz_localize("UTC").tz_convert("Asia/Kolkata") + expected = Timestamp("2015-11-18 15:30:00+0530", tz="Asia/Kolkata") + assert result == expected + + # GH#15823 + result = Timestamp("2017-03-26 00:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 00:00:00+0100", tz="Europe/Paris") + assert result == expected + + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + 
Timestamp("2017-03-26 02:00", tz="Europe/Paris") + + result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") + naive = Timestamp(result.value) + expected = naive.tz_localize("UTC").tz_convert("Europe/Paris") + assert result == expected + + result = Timestamp("2017-03-26 03:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 03:00:00+0200", tz="Europe/Paris") + assert result == expected + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_constructed_by_date_and_tz(self, tz): + # GH#2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=tz) + + expected = Timestamp("3/11/2012", tz=tz) + assert result.hour == expected.hour + assert result == expected + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): + # GH#1389 + + # 4 hours before DST transition + stamp = Timestamp("3/10/2012 22:00", tz=tz) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp("3/11/2012 05:00", tz=tz) + + assert result == expected + + def test_timestamp_timetz_equivalent_with_datetime_tz(self, tz_naive_fixture): + # GH21358 + tz = timezones.maybe_get_tz(tz_naive_fixture) + + stamp = Timestamp("2018-06-04 10:20:30", tz=tz) + _datetime = datetime(2018, 6, 4, hour=10, minute=20, second=30, tzinfo=tz) + + result = stamp.timetz() + expected = _datetime.timetz() + + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_unary_ops.py b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_unary_ops.py new file mode 100644 index 0000000..65066fd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -0,0 +1,420 @@ +from datetime import datetime + +from dateutil.tz import gettz +import pytest +import pytz +from pytz import utc + +from pandas._libs.tslibs import conversion +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +import pandas.util._test_decorators as td + +from pandas import NaT, Timestamp +import pandas._testing as tm + +from pandas.tseries.frequencies import to_offset + + +class TestTimestampUnaryOps: + + # -------------------------------------------------------------- + # Timestamp.round + @pytest.mark.parametrize( + "timestamp, freq, expected", + [ + ("20130101 09:10:11", "D", "20130101"), + ("20130101 19:10:11", "D", "20130102"), + ("20130201 12:00:00", "D", "20130202"), + ("20130104 12:00:00", "D", "20130105"), + ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), + ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "S", "2000-01-05 05:09:15"), + ], + ) + def test_round_frequencies(self, timestamp, freq, expected): + dt = Timestamp(timestamp) + result = dt.round(freq) + expected = Timestamp(expected) + assert result == expected + + def test_round_tzaware(self): + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("D") + expected = Timestamp("20130101", tz="US/Eastern") + assert result == expected + + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("s") + assert result == dt + + def test_round_30min(self): + # round + dt = Timestamp("20130104 12:32:00") + result = dt.round("30Min") + expected = Timestamp("20130104 12:30:00") + assert 
result == expected + + def test_round_subsecond(self): + # GH#14440 & GH#15578 + result = Timestamp("2016-10-17 12:00:00.0015").round("ms") + expected = Timestamp("2016-10-17 12:00:00.002000") + assert result == expected + + result = Timestamp("2016-10-17 12:00:00.00149").round("ms") + expected = Timestamp("2016-10-17 12:00:00.001000") + assert result == expected + + ts = Timestamp("2016-10-17 12:00:00.0015") + for freq in ["us", "ns"]: + assert ts == ts.round(freq) + + result = Timestamp("2016-10-17 12:00:00.001501031").round("10ns") + expected = Timestamp("2016-10-17 12:00:00.001501030") + assert result == expected + + def test_round_nonstandard_freq(self): + with tm.assert_produces_warning(False): + Timestamp("2016-10-17 12:00:00.001501031").round("1010ns") + + def test_round_invalid_arg(self): + stamp = Timestamp("2000-01-05 05:09:15.13") + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + stamp.round("foo") + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + ("2117-01-01 00:00:45", "floor", "15s", "2117-01-01 00:00:45"), + ("2117-01-01 00:00:45", "ceil", "15s", "2117-01-01 00:00:45"), + ( + "2117-01-01 00:00:45.000000012", + "floor", + "10ns", + "2117-01-01 00:00:45.000000010", + ), + ( + "1823-01-01 00:00:01.000000012", + "ceil", + "10ns", + "1823-01-01 00:00:01.000000020", + ), + ("1823-01-01 00:00:01", "floor", "1s", "1823-01-01 00:00:01"), + ("1823-01-01 00:00:01", "ceil", "1s", "1823-01-01 00:00:01"), + ("NaT", "floor", "1s", "NaT"), + ("NaT", "ceil", "1s", "NaT"), + ], + ) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = Timestamp(test_input) + func = getattr(dt, rounder) + result = func(freq) + + if dt is NaT: + assert result is NaT + else: + expected = Timestamp(expected) + assert result == expected + + @pytest.mark.parametrize( + "test_input, freq, expected", + [ + ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), + ("2018-01-01 00:02:00", "2T", "2018-01-01 00:02:00"), + ("2018-01-01 00:04:00", "4T", "2018-01-01 00:04:00"), + ("2018-01-01 00:15:00", "15T", "2018-01-01 00:15:00"), + ("2018-01-01 00:20:00", "20T", "2018-01-01 00:20:00"), + ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), + ], + ) + @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) + def test_round_minute_freq(self, test_input, freq, expected, rounder): + # Ensure timestamps that shouldn't round dont! 
+ # GH#21262 + + dt = Timestamp(test_input) + expected = Timestamp(expected) + func = getattr(dt, rounder) + result = func(freq) + assert result == expected + + def test_ceil(self): + dt = Timestamp("20130101 09:10:11") + result = dt.ceil("D") + expected = Timestamp("20130102") + assert result == expected + + def test_floor(self): + dt = Timestamp("20130101 09:10:11") + result = dt.floor("D") + expected = Timestamp("20130101") + assert result == expected + + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) + def test_round_dst_border_ambiguous(self, method): + # GH 18946 round near "fall back" DST + ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") + # + result = getattr(ts, method)("H", ambiguous=True) + assert result == ts + + result = getattr(ts, method)("H", ambiguous=False) + expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( + "Europe/Madrid" + ) + assert result == expected + + result = getattr(ts, method)("H", ambiguous="NaT") + assert result is NaT + + with pytest.raises(pytz.AmbiguousTimeError): + getattr(ts, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) + def test_round_dst_border_nonexistent(self, method, ts_str, freq): + # GH 23324 round near "spring forward" DST + ts = Timestamp(ts_str, tz="America/Chicago") + result = getattr(ts, method)(freq, nonexistent="shift_forward") + expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") + assert result == expected + + result = getattr(ts, method)(freq, nonexistent="NaT") + assert result is NaT + + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(ts, method)(freq, nonexistent="raise") + + @pytest.mark.parametrize( + "timestamp", + [ + "2018-01-01 0:0:0.124999360", + "2018-01-01 0:0:0.125000367", + "2018-01-01 0:0:0.125500", + "2018-01-01 0:0:0.126500", + "2018-01-01 12:00:00", + "2019-01-01 12:00:00", + ], + ) + @pytest.mark.parametrize( + "freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "1D", + ], + ) + def test_round_int64(self, timestamp, freq): + """check that all rounding modes are accurate to int64 precision + see GH#22591 + """ + dt = Timestamp(timestamp) + unit = to_offset(freq).nanos + + # test floor + result = dt.floor(freq) + assert result.value % unit == 0, "floor not a {} multiple".format(freq) + assert 0 <= dt.value - result.value < unit, "floor error" + + # test ceil + result = dt.ceil(freq) + assert result.value % unit == 0, "ceil not a {} multiple".format(freq) + assert 0 <= result.value - dt.value < unit, "ceil error" + + # test round + result = dt.round(freq) + assert result.value % unit == 0, "round not a {} multiple".format(freq) + assert abs(result.value - dt.value) <= unit // 2, "round error" + if unit % 2 == 0 and abs(result.value - dt.value) == unit // 2: + # round half to even + assert result.value // unit % 2 == 0, "round half to even error" + + # -------------------------------------------------------------- + # Timestamp.replace + + def test_replace_naive(self): + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00") + result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00") + assert result == expected + + def test_replace_aware(self, tz_aware_fixture): + tz = 
tz_aware_fixture + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + ts = Timestamp("2016-01-01 09:00:00", tz=tz) + result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00", tz=tz) + assert result == expected + + def test_replace_preserves_nanos(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00.000000123", tz=tz) + assert result == expected + + def test_replace_multiple(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + # test all + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace( + year=2015, + month=2, + day=2, + hour=0, + minute=5, + second=5, + microsecond=5, + nanosecond=5, + ) + expected = Timestamp("2015-02-02 00:05:05.000005005", tz=tz) + assert result == expected + + def test_replace_invalid_kwarg(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + with pytest.raises(TypeError): + ts.replace(foo=5) + + def test_replace_integer_args(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + with pytest.raises(ValueError): + ts.replace(hour=0.1) + + def test_replace_tzinfo_equiv_tz_localize_none(self): + # GH#14621, GH#7825 + # assert conversion to naive is the same as replacing tzinfo with None + ts = Timestamp("2013-11-03 01:59:59.999999-0400", tz="US/Eastern") + assert ts.tz_localize(None) == ts.replace(tzinfo=None) + + @td.skip_if_windows + def test_replace_tzinfo(self): + # GH#15683 + dt = datetime(2016, 3, 27, 1) + tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + + result_dt = dt.replace(tzinfo=tzinfo) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo) + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + @pytest.mark.parametrize( + "tz, normalize", + [ + (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + (gettz("US/Eastern"), lambda x: x), + ], + ) + def test_replace_across_dst(self, tz, normalize): + # GH#18319 check that 1) timezone is correctly normalized and + # 2) that hour is not incorrectly changed by this normalization + ts_naive = Timestamp("2017-12-03 16:03:30") + ts_aware = conversion.localize_pydatetime(ts_naive, tz) + + # Preliminary sanity-check + assert ts_aware == normalize(ts_aware) + + # Replace across DST boundary + ts2 = ts_aware.replace(month=6) + + # Check that `replace` preserves hour literal + assert (ts2.hour, ts2.minute) == (ts_aware.hour, ts_aware.minute) + + # Check that post-replace object is appropriately normalized + ts2b = normalize(ts2) + assert ts2 == ts2b + + def test_replace_dst_border(self): + # Gh 7825 + t = Timestamp("2013-11-3", tz="America/Chicago") + result = t.replace(hour=3) + expected = 
Timestamp("2013-11-3 03:00:00", tz="America/Chicago") + assert result == expected + + @pytest.mark.parametrize("fold", [0, 1]) + @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) + def test_replace_dst_fold(self, fold, tz): + # GH 25017 + d = datetime(2019, 10, 27, 2, 30) + ts = Timestamp(d, tz=tz) + result = ts.replace(hour=1, fold=fold) + expected = Timestamp(datetime(2019, 10, 27, 1, 30)).tz_localize( + tz, ambiguous=not fold + ) + assert result == expected + + # -------------------------------------------------------------- + # Timestamp.normalize + + @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) + def test_normalize(self, tz_naive_fixture, arg): + tz = tz_naive_fixture + ts = Timestamp(arg, tz=tz) + result = ts.normalize() + expected = Timestamp("2013-11-30", tz=tz) + assert result == expected + + # -------------------------------------------------------------- + + @td.skip_if_windows + def test_timestamp(self): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = Timestamp.now() + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") + utsc = tsc.tz_convert("UTC") + + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() diff --git a/venv/Lib/site-packages/pandas/tests/series/__init__.py b/venv/Lib/site-packages/pandas/tests/series/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/series/conftest.py b/venv/Lib/site-packages/pandas/tests/series/conftest.py new file mode 100644 index 0000000..ff0b0c7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/conftest.py @@ -0,0 +1,33 @@ +import pytest + +import pandas._testing as tm + + +@pytest.fixture +def datetime_series(): + """ + Fixture for Series of floats with DatetimeIndex + """ + s = tm.makeTimeSeries() + s.name = "ts" + return s + + +@pytest.fixture +def string_series(): + """ + Fixture for Series of floats with Index of unique strings + """ + s = tm.makeStringSeries() + s.name = "series" + return s + + +@pytest.fixture +def object_series(): + """ + Fixture for Series of dtype object with Index of unique strings + """ + s = tm.makeObjectSeries() + s.name = "objects" + return s diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/__init__.py b/venv/Lib/site-packages/pandas/tests/series/indexing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_alter_index.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_alter_index.py new file mode 100644 index 0000000..47f40e2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_alter_index.py @@ -0,0 +1,559 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, Series, date_range, isna +import pandas._testing as tm + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) +def test_align(datetime_series, first_slice, second_slice, join_type, fill): + a = 
datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_align_fill_method( + datetime_series, first_slice, second_slice, join_type, method, limit +): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + + +def test_align_nocopy(datetime_series): + b = datetime_series[:5].copy() + + # do copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left") + ra[:5] = 5 + assert not (a[:5] == 5).any() + + # do not copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left", copy=False) + ra[:5] = 5 + assert (a[:5] == 5).all() + + # do copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right") + rb[:3] = 5 + assert not (b[:3] == 5).any() + + # do not copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right", copy=False) + rb[:2] = 5 + assert (b[:2] == 5).all() + + +def test_align_same_index(datetime_series): + a, b = datetime_series.align(datetime_series, copy=False) + assert a.index is datetime_series.index + assert b.index is datetime_series.index + + a, b = datetime_series.align(datetime_series, copy=True) + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + + +def test_align_multiindex(): + # GH 10665 + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) + s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = 
pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + +def test_reindex(datetime_series, string_series): + identity = string_series.reindex(string_series.index) + + # __array_interface__ is not defined for older numpies + # and on some pythons + try: + assert np.may_share_memory(string_series.index, identity.index) + except AttributeError: + pass + + assert identity.index.is_(string_series.index) + assert identity.index.identical(string_series.index) + + subIndex = string_series.index[10:20] + subSeries = string_series.reindex(subIndex) + + for idx, val in subSeries.items(): + assert val == string_series[idx] + + subIndex2 = datetime_series.index[10:20] + subTS = datetime_series.reindex(subIndex2) + + for idx, val in subTS.items(): + assert val == datetime_series[idx] + stuffSeries = datetime_series.reindex(subIndex) + + assert np.isnan(stuffSeries).all() + + # This is extremely important for the Cython code to not screw up + nonContigIndex = datetime_series.index[::2] + subNonContig = datetime_series.reindex(nonContigIndex) + for idx, val in subNonContig.items(): + assert val == datetime_series[idx] + + # return a copy the same index here + result = datetime_series.reindex() + assert not (result is datetime_series) + + +def test_reindex_nan(): + ts = Series([2, 3, 5, 7], index=[1, 4, np.nan, 8]) + + i, j = [np.nan, 1, np.nan, 8, 4, np.nan], [2, 0, 2, 3, 1, 2] + tm.assert_series_equal(ts.reindex(i), ts.iloc[j]) + + ts.index = ts.index.astype("object") + + # reindex coerces index.dtype to float, loc/iloc doesn't + tm.assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) + + +def test_reindex_series_add_nat(): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + series = Series(rng) + + result = series.reindex(range(15)) + assert np.issubdtype(result.dtype, np.dtype("M8[ns]")) + + mask = result.isna() + assert mask[-5:].all() + assert not mask[:-5].any() + + +def test_reindex_with_datetimes(): + rng = date_range("1/1/2000", periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + +def test_reindex_corner(datetime_series): + # (don't forget to fix this) I think it's fixed + empty = Series(dtype=object) + empty.reindex(datetime_series.index, method="pad") # it works + + # corner case: pad empty series + reindexed = empty.reindex(datetime_series.index, method="pad") + + # pass non-Index + reindexed = datetime_series.reindex(list(datetime_series.index)) + tm.assert_series_equal(datetime_series, reindexed) + + # bad fill method + ts = datetime_series[::2] + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. 
Got foo" + ) + with pytest.raises(ValueError, match=msg): + ts.reindex(datetime_series.index, method="foo") + + +def test_reindex_pad(): + s = Series(np.arange(10), dtype="int64") + s2 = s[::2] + + reindexed = s2.reindex(s.index, method="pad") + reindexed2 = s2.reindex(s.index, method="ffill") + tm.assert_series_equal(reindexed, reindexed2) + + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) + tm.assert_series_equal(reindexed, expected) + + # GH4604 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + new_index = ["a", "g", "c", "f"] + expected = Series([1, 1, 3, 3], index=new_index) + + # this changes dtype because the ffill happens after + result = s.reindex(new_index).ffill() + tm.assert_series_equal(result, expected.astype("float64")) + + result = s.reindex(new_index).ffill(downcast="infer") + tm.assert_series_equal(result, expected) + + expected = Series([1, 5, 3, 5], index=new_index) + result = s.reindex(new_index, method="ffill") + tm.assert_series_equal(result, expected) + + # inference of new dtype + s = Series([True, False, False, True], index=list("abcd")) + new_index = "agc" + result = s.reindex(list(new_index)).ffill() + expected = Series([True, True, False], index=list(new_index)) + tm.assert_series_equal(result, expected) + + # GH4618 shifted series downcasting + s = Series(False, index=range(0, 5)) + result = s.shift(1).fillna(method="bfill") + expected = Series(False, index=range(0, 5)) + tm.assert_series_equal(result, expected) + + +def test_reindex_nearest(): + s = Series(np.arange(10, dtype="int64")) + target = [0.1, 0.9, 1.5, 2.0] + actual = s.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) + tm.assert_series_equal(expected, actual) + + actual = s.reindex_like(actual, method="nearest") + tm.assert_series_equal(expected, actual) + + actual = s.reindex_like(actual, method="nearest", tolerance=1) + tm.assert_series_equal(expected, actual) + actual = s.reindex_like(actual, method="nearest", tolerance=[1, 2, 3, 4]) + tm.assert_series_equal(expected, actual) + + actual = s.reindex(target, method="nearest", tolerance=0.2) + expected = Series([0, 1, np.nan, 2], target) + tm.assert_series_equal(expected, actual) + + actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) + expected = Series([0, np.nan, np.nan, 2], target) + tm.assert_series_equal(expected, actual) + + +def test_reindex_backfill(): + pass + + +def test_reindex_int(datetime_series): + ts = datetime_series[::2] + int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) + + # this should work fine + reindexed_int = int_ts.reindex(datetime_series.index) + + # if NaNs introduced + assert reindexed_int.dtype == np.float_ + + # NO NaNs introduced + reindexed_int = int_ts.reindex(int_ts.index[::2]) + assert reindexed_int.dtype == np.int_ + + +def test_reindex_bool(datetime_series): + # A series other than float, int, string, or object + ts = datetime_series[::2] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + + # this should work fine + reindexed_bool = bool_ts.reindex(datetime_series.index) + + # if NaNs introduced + assert reindexed_bool.dtype == np.object_ + + # NO NaNs introduced + reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) + assert reindexed_bool.dtype == np.bool_ + + +def test_reindex_bool_pad(datetime_series): + # fail + ts = datetime_series[5:] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + filled_bool = bool_ts.reindex(datetime_series.index, 
method="pad") + assert isna(filled_bool[:5]).all() + + +def test_reindex_categorical(): + index = date_range("20000101", periods=3) + + # reindexing to an invalid Categorical + s = Series(["a", "b", "c"], dtype="category") + result = s.reindex(index) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) + expected.index = index + tm.assert_series_equal(result, expected) + + # partial reindexing + expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) + expected.index = [1, 2] + result = s.reindex([1, 2]) + tm.assert_series_equal(result, expected) + + expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) + expected.index = [2, 3] + result = s.reindex([2, 3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_like(datetime_series): + other = datetime_series[::2] + tm.assert_series_equal( + datetime_series.reindex(other.index), datetime_series.reindex_like(other) + ) + + # GH 7179 + day1 = datetime(2013, 3, 5) + day2 = datetime(2013, 5, 5) + day3 = datetime(2014, 3, 5) + + series1 = Series([5, None, None], [day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + result = series1.reindex_like(series2, method="pad") + expected = Series([5, np.nan], index=[day1, day3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_fill_value(): + # ----------------------------------------------------------- + # floats + floats = Series([1.0, 2.0, 3.0]) + result = floats.reindex([1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + result = floats.reindex([1, 2, 3], fill_value=0) + expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # ints + ints = Series([1, 2, 3]) + + result = ints.reindex([1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + # don't upcast + result = ints.reindex([1, 2, 3], fill_value=0) + expected = Series([2, 3, 0], index=[1, 2, 3]) + assert issubclass(result.dtype.type, np.integer) + tm.assert_series_equal(result, expected) + + # ----------------------------------------------------------- + # objects + objects = Series([1, 2, 3], dtype=object) + + result = objects.reindex([1, 2, 3]) + expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + result = objects.reindex([1, 2, 3], fill_value="foo") + expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------ + # bools + bools = Series([True, False, True]) + + result = bools.reindex([1, 2, 3]) + expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + result = bools.reindex([1, 2, 3], fill_value=False) + expected = Series([False, True, False], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_datetimeindexes_tz_naive_and_aware(): + # GH 8306 + idx = date_range("20131101", tz="America/Chicago", periods=7) + newidx = date_range("20131103", periods=10, freq="H") + s = Series(range(7), index=idx) + with pytest.raises(TypeError): + s.reindex(newidx, method="ffill") + + +def test_reindex_empty_series_tz_dtype(): + # GH 20869 + result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) + expected = Series([pd.NaT] * 2, 
dtype="datetime64[ns, UTC]") + tm.assert_equal(result, expected) + + +def test_rename(): + # GH 17407 + s = Series(range(1, 6), index=pd.Index(range(2, 7), name="IntIndex")) + result = s.rename(str) + expected = s.rename(lambda i: str(i)) + tm.assert_series_equal(result, expected) + + assert result.name == expected.name + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, expected_data, expected_index", + [ + # Unique Index + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), + # GH 5248 Non-Unique Index + ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, error_type, error_desc", + [ + # single string/tuple-like + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), + # bad axis + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): + + with pytest.raises(error_type, match=error_desc): + Series(data, index=index).drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): + # errors='ignore' + s = Series(range(3), index=list("abc")) + result = s.drop("bc", errors="ignore") + tm.assert_series_equal(result, s) + result = s.drop(["a", "d"], errors="ignore") + expected = s.iloc[1:] + tm.assert_series_equal(result, expected) + + # GH 8522 + s = Series([2, 3], index=[True, False]) + assert s.index.is_object() + result = s.drop(True) + expected = Series([3], index=[False]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index, dtype=object).drop(drop_labels) + expected = pd.Series(index=expected_index, dtype=object) + tm.assert_series_equal(series, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + with pytest.raises(KeyError, match="not found in axis"): + dtype = object if data is None else None + pd.Series(data=data, index=index, dtype=dtype).drop(drop_labels) diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_boolean.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_boolean.py new file mode 100644 index 0000000..d75efcf --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_boolean.py @@ -0,0 +1,627 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer + +import pandas as pd +from pandas import Index, Series, Timestamp, date_range, isna +import pandas._testing as tm +from 
pandas.core.indexing import IndexingError + +from pandas.tseries.offsets import BDay + + +def test_getitem_boolean(string_series): + s = string_series + mask = s > s.median() + + # passing list is OK + result = s[list(mask)] + expected = s[mask] + tm.assert_series_equal(result, expected) + tm.assert_index_equal(result.index, s.index[mask]) + + +def test_getitem_boolean_empty(): + s = Series([], dtype=np.int64) + s.index.name = "index_name" + s = s[s.isna()] + assert s.index.name == "index_name" + assert s.dtype == np.int64 + + # GH5877 + # indexing with empty series + s = Series(["A", "B"]) + expected = Series(np.nan, index=["C"], dtype=object) + result = s[Series(["C"], dtype=object)] + tm.assert_series_equal(result, expected) + + s = Series(["A", "B"]) + expected = Series(dtype=object, index=Index([], dtype="int64")) + result = s[Series([], dtype=object)] + tm.assert_series_equal(result, expected) + + # invalid because of the boolean indexer + # that's empty or not-aligned + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + s[Series([], dtype=bool)] + + with pytest.raises(IndexingError, match=msg): + s[Series([True], dtype=bool)] + + +def test_getitem_boolean_object(string_series): + # using column from DataFrame + + s = string_series + mask = s > s.median() + omask = mask.astype(object) + + # getitem + result = s[omask] + expected = s[mask] + tm.assert_series_equal(result, expected) + + # setitem + s2 = s.copy() + cop = s.copy() + cop[omask] = 5 + s2[mask] = 5 + tm.assert_series_equal(cop, s2) + + # nans raise exception + omask[5:10] = np.nan + msg = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=msg): + s[omask] + with pytest.raises(ValueError, match=msg): + s[omask] = 5 + + +def test_getitem_setitem_boolean_corner(datetime_series): + ts = datetime_series + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() + + # these used to raise...?? 
+ + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + ts[mask_shifted] + with pytest.raises(IndexingError, match=msg): + ts[mask_shifted] = 1 + + with pytest.raises(IndexingError, match=msg): + ts.loc[mask_shifted] + with pytest.raises(IndexingError, match=msg): + ts.loc[mask_shifted] = 1 + + +def test_setitem_boolean(string_series): + mask = string_series > string_series.median() + + # similar indexed series + result = string_series.copy() + result[mask] = string_series * 2 + expected = string_series * 2 + tm.assert_series_equal(result[mask], expected[mask]) + + # needs alignment + result = string_series.copy() + result[mask] = (string_series * 2)[0:5] + expected = (string_series * 2)[0:5].reindex_like(string_series) + expected[-mask] = string_series[mask] + tm.assert_series_equal(result[mask], expected[mask]) + + +def test_get_set_boolean_different_order(string_series): + ordered = string_series.sort_values() + + # setting + copy = string_series.copy() + copy[ordered > 0] = 0 + + expected = string_series.copy() + expected[expected > 0] = 0 + + tm.assert_series_equal(copy, expected) + + # getting + sel = string_series[ordered > 0] + exp = string_series[string_series > 0] + tm.assert_series_equal(sel, exp) + + +def test_where_unsafe_int(sint_dtype): + s = Series(np.arange(10), dtype=sint_dtype) + mask = s < 5 + + s[mask] = range(2, 7) + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype=sint_dtype) + + tm.assert_series_equal(s, expected) + + +def test_where_unsafe_float(float_dtype): + s = Series(np.arange(10), dtype=float_dtype) + mask = s < 5 + + s[mask] = range(2, 7) + data = list(range(2, 7)) + list(range(5, 10)) + expected = Series(data, dtype=float_dtype) + + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize( + "dtype,expected_dtype", + [ + (np.int8, np.float64), + (np.int16, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + (np.float32, np.float32), + (np.float64, np.float64), + ], +) +def test_where_unsafe_upcast(dtype, expected_dtype): + # see gh-9743 + s = Series(np.arange(10), dtype=dtype) + values = [2.5, 3.5, 4.5, 5.5, 6.5] + mask = s < 5 + expected = Series(values + list(range(5, 10)), dtype=expected_dtype) + s[mask] = values + tm.assert_series_equal(s, expected) + + +def test_where_unsafe(): + # see gh-9731 + s = Series(np.arange(10), dtype="int64") + values = [2.5, 3.5, 4.5, 5.5] + + mask = s > 5 + expected = Series(list(range(6)) + values, dtype="float64") + + s[mask] = values + tm.assert_series_equal(s, expected) + + # see gh-3235 + s = Series(np.arange(10), dtype="int64") + mask = s < 5 + s[mask] = range(2, 7) + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64") + tm.assert_series_equal(s, expected) + assert s.dtype == expected.dtype + + s = Series(np.arange(10), dtype="int64") + mask = s > 5 + s[mask] = [0] * 4 + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64") + tm.assert_series_equal(s, expected) + + s = Series(np.arange(10)) + mask = s > 5 + + msg = "cannot assign mismatch length to masked array" + with pytest.raises(ValueError, match=msg): + s[mask] = [5, 4, 3, 2, 1] + + with pytest.raises(ValueError, match=msg): + s[mask] = [0] * 5 + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.where(s > 2, np.nan) + expected = Series([np.nan, np.nan, 3, 4]) + tm.assert_series_equal(result, expected) + + # GH 4667 + # setting with None changes dtype + 
s = Series(range(10)).astype(float) + s[8] = None + result = s[8] + assert isna(result) + + s = Series(range(10)).astype(float) + s[s > 8] = None + result = s[isna(s)] + expected = Series(np.nan, index=[9]) + tm.assert_series_equal(result, expected) + + +def test_where(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + tm.assert_series_equal(rs, rs2) + + rs = s.where(cond, -s) + tm.assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert s.shape == rs.shape + assert rs is not s + + # test alignment + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + + expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) + rs = s2.where(cond[:3]) + tm.assert_series_equal(rs, expected) + + expected = s2.abs() + expected.iloc[0] = s2[0] + rs = s2.where(cond[:3], -s2) + tm.assert_series_equal(rs, expected) + + +def test_where_error(): + s = Series(np.random.randn(5)) + cond = s > 0 + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.where(1) + with pytest.raises(ValueError, match=msg): + s.where(cond[:3].values, -s) + + # GH 2745 + s = Series([1, 2]) + s[[True, False]] = [0, 1] + expected = Series([0, 2]) + tm.assert_series_equal(s, expected) + + # failures + msg = "cannot assign mismatch length to masked array" + with pytest.raises(ValueError, match=msg): + s[[True, False]] = [0, 2, 3] + msg = ( + "NumPy boolean array indexing assignment cannot assign 0 input " + "values to the 1 output values where the mask is true" + ) + with pytest.raises(ValueError, match=msg): + s[[True, False]] = [] + + +@pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) +def test_where_array_like(klass): + # see gh-15414 + s = Series([1, 2, 3]) + cond = [False, True, True] + expected = Series([np.nan, 2, 3]) + + result = s.where(klass(cond)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "cond", + [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")], + ], +) +def test_where_invalid_input(cond): + # see gh-15414: only boolean arrays accepted + s = Series([1, 2, 3]) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + s.where(cond) + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.where([True]) + + +def test_where_ndframe_align(): + msg = "Array conditional must be same shape as self" + s = Series([1, 2, 3]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + s.where(cond) + + expected = Series([1, np.nan, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + s.where(cond) + + expected = Series([np.nan, 2, np.nan]) + + out = s.where(Series(cond)) + tm.assert_series_equal(out, expected) + + +def test_where_setitem_invalid(): + # GH 2702 + # make sure correct exceptions are raised on invalid list assignment + + msg = "cannot set using a {} indexer with a different length than the value" + + # slice + s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("slice")): + s[0:3] = list(range(27)) + + s[0:3] = list(range(3)) + expected = Series([0, 1, 2]) + tm.assert_series_equal(s.astype(np.int64), expected) + + # slice with step + s = Series(list("abcdef")) + + with pytest.raises(ValueError, match=msg.format("slice")): + 
s[0:4:2] = list(range(27)) + + s = Series(list("abcdef")) + s[0:4:2] = list(range(2)) + expected = Series([0, "b", 1, "d", "e", "f"]) + tm.assert_series_equal(s, expected) + + # neg slices + s = Series(list("abcdef")) + + with pytest.raises(ValueError, match=msg.format("slice")): + s[:-1] = list(range(27)) + + s[-3:-1] = list(range(2)) + expected = Series(["a", "b", "c", 0, 1, "f"]) + tm.assert_series_equal(s, expected) + + # list + s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("list-like")): + s[[0, 1, 2]] = list(range(27)) + + s = Series(list("abc")) + + with pytest.raises(ValueError, match=msg.format("list-like")): + s[[0, 1, 2]] = list(range(2)) + + # scalar + s = Series(list("abc")) + s[0] = list(range(10)) + expected = Series([list(range(10)), "b", "c"]) + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] +) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) +def test_broadcast(size, mask, item, box): + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + # Construct the expected series by taking the source + # data or item based on the selection + expected = Series( + [item if use_item else data[i] for i, use_item in enumerate(selection)] + ) + + s = Series(data) + s[selection] = box(item) + tm.assert_series_equal(s, expected) + + s = Series(data) + result = s.where(~selection, box(item)) + tm.assert_series_equal(result, expected) + + s = Series(data) + result = s.mask(selection, box(item)) + tm.assert_series_equal(result, expected) + + +def test_where_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + + rs.where(cond, inplace=True) + tm.assert_series_equal(rs.dropna(), s[cond]) + tm.assert_series_equal(rs, s.where(cond)) + + rs = s.copy() + rs.where(cond, -s, inplace=True) + tm.assert_series_equal(rs, s.where(cond, -s)) + + +def test_where_dups(): + # GH 4550 + # where crashes with dups in index + s1 = Series(list(range(3))) + s2 = Series(list(range(3))) + comb = pd.concat([s1, s2]) + result = comb.where(comb < 2) + expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + # GH 4548 + # inplace updating not working with dups + comb[comb < 1] = 5 + expected = Series([5, 1, 2, 5, 1, 2], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(comb, expected) + + comb[comb < 2] += 10 + expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(comb, expected) + + +def test_where_numeric_with_string(): + # GH 9280 + s = pd.Series([1, 2, 3]) + w = s.where(s > 1, "X") + + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + w = s.where(s > 1, ["X", "Y", "Z"]) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + w = s.where(s > 1, np.array(["X", "Y", "Z"])) + assert not is_integer(w[0]) + assert is_integer(w[1]) + assert is_integer(w[2]) + assert isinstance(w[0], str) + assert w.dtype == "object" + + +def test_where_timedelta_coerce(): + s = Series([1, 2], dtype="timedelta64[ns]") + expected = 
Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype="object") + tm.assert_series_equal(rs, expected) + + +def test_where_datetime_conversion(): + s = Series(date_range("20130102", periods=2)) + expected = Series([10, 10]) + mask = np.array([False, False]) + + rs = s.where(mask, [10, 10]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, 10.0) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, 10.0]) + tm.assert_series_equal(rs, expected) + + rs = s.where(mask, [10.0, np.nan]) + expected = Series([10, None], dtype="object") + tm.assert_series_equal(rs, expected) + + # GH 15701 + timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] + s = Series([pd.Timestamp(t) for t in timestamps]) + rs = s.where(Series([False, True])) + expected = Series([pd.NaT, s[1]]) + tm.assert_series_equal(rs, expected) + + +def test_where_dt_tz_values(tz_naive_fixture): + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) + ) + mask = pd.Series([True, True, False]) + result = ser1.where(mask, ser2) + exp = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) + ) + tm.assert_series_equal(exp, result) + + +def test_mask(): + # compare with tested results in test_where + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(~cond, np.nan) + tm.assert_series_equal(rs, s.mask(cond)) + + rs = s.where(~cond) + rs2 = s.mask(cond) + tm.assert_series_equal(rs, rs2) + + rs = s.where(~cond, -s) + rs2 = s.mask(cond, -s) + tm.assert_series_equal(rs, rs2) + + cond = Series([True, False, False, True, False], index=s.index) + s2 = -(s.abs()) + rs = s2.where(~cond[:3]) + rs2 = s2.mask(cond[:3]) + tm.assert_series_equal(rs, rs2) + + rs = s2.where(~cond[:3], -s2) + rs2 = s2.mask(cond[:3], -s2) + tm.assert_series_equal(rs, rs2) + + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + s.mask(1) + with pytest.raises(ValueError, match=msg): + s.mask(cond[:3].values, -s) + + # dtype changes + s = Series([1, 2, 3, 4]) + result = s.mask(s > 2, np.nan) + expected = Series([1, 2, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + # see gh-21891 + s = Series([1, 2]) + res = s.mask([True, False]) + + exp = Series([np.nan, 2]) + tm.assert_series_equal(res, exp) + + +def test_mask_inplace(): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + rs.mask(cond, inplace=True) + tm.assert_series_equal(rs.dropna(), s[~cond]) + tm.assert_series_equal(rs, s.mask(cond)) + + rs = s.copy() + rs.mask(cond, -s, inplace=True) + tm.assert_series_equal(rs, s.mask(cond, -s)) diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_callable.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_callable.py new file mode 100644 index 0000000..fe575cf --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_callable.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas._testing as tm + + +def 
test_getitem_callable(): + # GH 12533 + s = pd.Series(4, index=list("ABCD")) + result = s[lambda x: "A"] + assert result == s.loc["A"] + + result = s[lambda x: ["A", "B"]] + tm.assert_series_equal(result, s.loc[["A", "B"]]) + + result = s[lambda x: [True, False, True, True]] + tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) + + +def test_setitem_callable(): + # GH 12533 + s = pd.Series([1, 2, 3, 4], index=list("ABCD")) + s[lambda x: "A"] = -1 + tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list("ABCD"))) + + +def test_setitem_other_callable(): + # GH 13299 + inc = lambda x: x + 1 + + s = pd.Series([1, 2, -1, 4]) + s[s < 0] = inc + + expected = pd.Series([1, 2, inc, 4]) + tm.assert_series_equal(s, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_datetime.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_datetime.py new file mode 100644 index 0000000..15ff5f6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_datetime.py @@ -0,0 +1,772 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas._libs import iNaT +import pandas._libs.index as _index + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range +import pandas._testing as tm + + +""" +Also test support for datetime64[ns] in Series / DataFrame +""" + + +def test_fancy_getitem(): + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) + + s = Series(np.arange(len(dti)), index=dti) + + assert s[48] == 48 + assert s["1/2/2009"] == 48 + assert s["2009-1-2"] == 48 + assert s[datetime(2009, 1, 2)] == 48 + assert s[Timestamp(datetime(2009, 1, 2))] == 48 + with pytest.raises(KeyError, match=r"^'2009-1-3'$"): + s["2009-1-3"] + tm.assert_series_equal( + s["3/6/2009":"2009-06-05"], s[datetime(2009, 3, 6) : datetime(2009, 6, 5)] + ) + + +def test_fancy_setitem(): + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + assert s[48] == -1 + s["1/2/2009"] = -2 + assert s[48] == -2 + s["1/2/2009":"2009-06-05"] = -3 + assert (s[48:54] == -3).all() + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) +@pytest.mark.parametrize("name", [None, "my_dti"]) +def test_dti_snap(name, tz): + dti = DatetimeIndex( + [ + "1/1/2002", + "1/2/2002", + "1/3/2002", + "1/4/2002", + "1/5/2002", + "1/6/2002", + "1/7/2002", + ], + name=name, + tz=tz, + freq="D", + ) + + result = dti.snap(freq="W-MON") + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + expected = expected.repeat([3, 4]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz + + result = dti.snap(freq="B") + + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + expected = expected.repeat([1, 1, 1, 2, 2]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz + + +def test_dti_reset_index_round_trip(): + dti = date_range(start="1/1/2001", end="6/1/2001", freq="D") + d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + assert d2.dtypes[0] == np.dtype("M8[ns]") + d3 = d2.set_index("index") + tm.assert_frame_equal(d1, d3, check_names=False) + + # #2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=["Date", "Value"]) + df = df.set_index("Date") + + assert df.index[0] == 
stamp + assert df.reset_index()["Date"][0] == stamp + + +def test_series_set_value(): + # #1561 + + dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] + index = DatetimeIndex(dates) + + s = Series(dtype=object)._set_value(dates[0], 1.0) + s2 = s._set_value(dates[1], np.nan) + + expected = Series([1.0, np.nan], index=index) + + tm.assert_series_equal(s2, expected) + + # FIXME: dont leave commented-out + # s = Series(index[:1], index[:1]) + # s2 = s._set_value(dates[1], index[1]) + # assert s2.values.dtype == 'M8[ns]' + + +@pytest.mark.slow +def test_slice_locs_indexerror(): + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)] + s = Series(range(100000), times) + s.loc[datetime(1900, 1, 1) : datetime(2100, 1, 1)] + + +def test_slicing_datetimes(): + # GH 7523 + + # unique + df = DataFrame( + np.arange(4.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]], + ) + result = df.loc[datetime(2001, 1, 1, 10) :] + tm.assert_frame_equal(result, df) + result = df.loc[: datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11) :] + expected = df.iloc[1:] + tm.assert_frame_equal(result, expected) + result = df.loc["20010101 11":] + tm.assert_frame_equal(result, expected) + + # duplicates + df = pd.DataFrame( + np.arange(5.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]], + ) + + result = df.loc[datetime(2001, 1, 1, 10) :] + tm.assert_frame_equal(result, df) + result = df.loc[: datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] + tm.assert_frame_equal(result, df) + + result = df.loc[datetime(2001, 1, 1, 11) :] + expected = df.iloc[1:] + tm.assert_frame_equal(result, expected) + result = df.loc["20010101 11":] + tm.assert_frame_equal(result, expected) + + +def test_frame_datetime64_duplicated(): + dates = date_range("2010-07-01", end="2010-08-05") + + tst = DataFrame({"symbol": "AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) + assert (-result).all() + + tst = DataFrame({"date": dates}) + result = tst.duplicated() + assert (-result).all() + + +def test_getitem_setitem_datetime_tz_pytz(): + from pytz import timezone as tz + from pandas import date_range + + N = 50 + # testing with timezone, GH #2785 + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + tm.assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts.copy() + + # comparison dates with datetime MUST be localized! 
+ date = tz("US/Central").localize(datetime(1990, 1, 1, 3)) + result[date] = 0 + result[date] = ts[4] + tm.assert_series_equal(result, ts) + + +def test_getitem_setitem_datetime_tz_dateutil(): + from dateutil.tz import tzutc + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + + tz = ( + lambda x: tzutc() if x == "UTC" else gettz(x) + ) # handle special case for utc in dateutil + + from pandas import date_range + + N = 50 + + # testing with timezone, GH #2785 + rng = date_range("1/1/1990", periods=N, freq="H", tz="America/New_York") + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + tm.assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts.copy() + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = ts[4] + tm.assert_series_equal(result, ts) + + +def test_getitem_setitem_datetimeindex(): + N = 50 + # testing with timezone, GH #2785 + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04:00:00"] + expected = ts[4] + assert result == expected + + result = ts.copy() + result["1990-01-01 04:00:00"] = 0 + result["1990-01-01 04:00:00"] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] + tm.assert_series_equal(result, ts) + + lb = "1990-01-01 04:00:00" + rb = "1990-01-01 07:00:00" + # GH#18435 strings get a pass from tzawareness compat + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + lb = "1990-01-01 04:00:00-0500" + rb = "1990-01-01 07:00:00-0500" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + # repeat all the above with naive datetimes + result = ts[datetime(1990, 1, 1, 4)] + expected = ts[4] + assert result == expected + + result = ts.copy() + result[datetime(1990, 1, 1, 4)] = 0 + result[datetime(1990, 1, 1, 4)] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = ts[4:8] + tm.assert_series_equal(result, ts) + + lb = datetime(1990, 1, 1, 4) + rb = datetime(1990, 1, 1, 7) + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): + # tznaive vs tzaware comparison is invalid + # see GH#18376, GH#18162 + ts[(ts.index >= lb) & (ts.index <= rb)] + + lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo) + rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo) + result = ts[(ts.index >= 
lb) & (ts.index <= rb)] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts[ts.index[4]] + expected = ts[4] + assert result == expected + + result = ts[ts.index[4:8]] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + tm.assert_series_equal(result, ts) + + # also test partial date slicing + result = ts["1990-01-02"] + expected = ts[24:48] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-02"] = 0 + result["1990-01-02"] = ts[24:48] + tm.assert_series_equal(result, ts) + + +def test_getitem_setitem_periodindex(): + from pandas import period_range + + N = 50 + rng = period_range("1/1/1990", periods=N, freq="H") + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04"] + expected = ts[4] + assert result == expected + + result = ts.copy() + result["1990-01-01 04"] = 0 + result["1990-01-01 04"] = ts[4] + tm.assert_series_equal(result, ts) + + result = ts["1990-01-01 04":"1990-01-01 07"] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04":"1990-01-01 07"] = 0 + result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] + tm.assert_series_equal(result, ts) + + lb = "1990-01-01 04" + rb = "1990-01-01 07" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + # GH 2782 + result = ts[ts.index[4]] + expected = ts[4] + assert result == expected + + result = ts[ts.index[4:8]] + expected = ts[4:8] + tm.assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + tm.assert_series_equal(result, ts) + + +# FutureWarning from NumPy. 
+@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") +def test_getitem_median_slice_bug(): + index = date_range("20090415", "20090519", freq="2B") + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + tm.assert_series_equal(result, expected) + + +def test_datetime_indexing(): + from pandas import date_range + + index = date_range("1/1/2000", "1/7/2000") + index = index.repeat(3) + + s = Series(len(index), index=index) + stamp = Timestamp("1/8/2000") + + with pytest.raises(KeyError, match=r"^947289600000000000$"): + s[stamp] + s[stamp] = 0 + assert s[stamp] == 0 + + # not monotonic + s = Series(len(index), index=index) + s = s[::-1] + + with pytest.raises(KeyError, match=r"^947289600000000000$"): + s[stamp] + s[stamp] = 0 + assert s[stamp] == 0 + + +""" +test duplicates in time series +""" + + +@pytest.fixture +def dups(): + dates = [ + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] + + return Series(np.random.randn(len(dates)), index=dates) + + +def test_constructor(dups): + assert isinstance(dups, Series) + assert isinstance(dups.index, DatetimeIndex) + + +def test_is_unique_monotonic(dups): + assert not dups.index.is_unique + + +def test_index_unique(dups): + uniques = dups.index.unique() + expected = DatetimeIndex( + [ + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] + ) + assert uniques.dtype == "M8[ns]" # sanity + tm.assert_index_equal(uniques, expected) + assert dups.index.nunique() == 4 + + # #2563 + assert isinstance(uniques, DatetimeIndex) + + dups_local = dups.index.tz_localize("US/Eastern") + dups_local.name = "foo" + result = dups_local.unique() + expected = DatetimeIndex(expected, name="foo") + expected = expected.tz_localize("US/Eastern") + assert result.tz is not None + assert result.name == "foo" + tm.assert_index_equal(result, expected) + + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [iNaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + arr = [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) + ] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_index_dupes_contains(): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + assert d in ix + + +def test_duplicate_dates_indexing(dups): + ts = dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + tm.assert_series_equal(result, expected) + else: + tm.assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + tm.assert_series_equal(cp, expected) + + with pytest.raises(KeyError, match=r"^947116800000000000$"): + ts[datetime(2000, 1, 6)] + + # new index + ts[datetime(2000, 1, 6)] = 0 + assert ts[datetime(2000, 1, 6)] == 0 + + +def test_range_slice(): + idx = DatetimeIndex(["1/1/2000", "1/2/2000", "1/2/2000", "1/3/2000", "1/4/2000"]) + + ts = Series(np.random.randn(len(idx)), 
index=idx) + + result = ts["1/2/2000":] + expected = ts[1:] + tm.assert_series_equal(result, expected) + + result = ts["1/2/2000":"1/3/2000"] + expected = ts[1:4] + tm.assert_series_equal(result, expected) + + +def test_groupby_average_dup_values(dups): + result = dups.groupby(level=0).mean() + expected = dups.groupby(dups.index).mean() + tm.assert_series_equal(result, expected) + + +def test_indexing_over_size_cutoff(): + import datetime + + # #1821 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + # create large list of non periodic datetime + dates = [] + sec = datetime.timedelta(seconds=1) + half_sec = datetime.timedelta(microseconds=500000) + d = datetime.datetime(2011, 12, 5, 20, 30) + n = 1100 + for i in range(n): + dates.append(d) + dates.append(d + sec) + dates.append(d + sec + half_sec) + dates.append(d + sec + sec + half_sec) + d += 3 * sec + + # duplicate some values in the list + duplicate_positions = np.random.randint(0, len(dates) - 1, 20) + for p in duplicate_positions: + dates[p + 1] = dates[p] + + df = DataFrame( + np.random.randn(len(dates), 4), index=dates, columns=list("ABCD") + ) + + pos = n * 3 + timestamp = df.index[pos] + assert timestamp in df.index + + # it works! + df.loc[timestamp] + assert len(df.loc[[timestamp]]) > 0 + finally: + _index._SIZE_CUTOFF = old_cutoff + + +def test_indexing_over_size_cutoff_period_index(monkeypatch): + # GH 27136 + + monkeypatch.setattr(_index, "_SIZE_CUTOFF", 1000) + + n = 1100 + idx = pd.period_range("1/1/2000", freq="T", periods=n) + assert idx._engine.over_size_threshold + + s = pd.Series(np.random.randn(len(idx)), index=idx) + + pos = n - 1 + timestamp = idx[pos] + assert timestamp in s.index + + # it works! + s[timestamp] + assert len(s.loc[[timestamp]]) > 0 + + +def test_indexing_unordered(): + # GH 2437 + rng = date_range(start="2011-01-01", end="2011-01-15") + ts = Series(np.random.rand(len(rng)), index=rng) + ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) + + for t in ts.index: + # TODO: unused? 
+ s = str(t) # noqa + + expected = ts[t] + result = ts2[t] + assert expected == result + + # GH 3448 (ranges) + def compare(slobj): + result = ts2[slobj].copy() + result = result.sort_index() + expected = ts[slobj] + tm.assert_series_equal(result, expected) + + compare(slice("2011-01-01", "2011-01-15")) + compare(slice("2010-12-30", "2011-01-15")) + compare(slice("2011-01-01", "2011-01-16")) + + # partial ranges + compare(slice("2011-01-01", "2011-01-6")) + compare(slice("2011-01-06", "2011-01-8")) + compare(slice("2011-01-06", "2011-01-12")) + + # single values + result = ts2["2011"].sort_index() + expected = ts["2011"] + tm.assert_series_equal(result, expected) + + # diff freq + rng = date_range(datetime(2005, 1, 1), periods=20, freq="M") + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + result = ts["2005"] + for t in result.index: + assert t.year == 2005 + + +def test_indexing(): + idx = date_range("2001-1-1", periods=20, freq="M") + ts = Series(np.random.rand(len(idx)), index=idx) + + # getting + + # GH 3070, make sure semantics work on Series/Frame + expected = ts["2001"] + expected.name = "A" + + df = DataFrame(dict(A=ts)) + result = df["2001"]["A"] + tm.assert_series_equal(expected, result) + + # setting + ts["2001"] = 1 + expected = ts["2001"] + expected.name = "A" + + df.loc["2001", "A"] = 1 + + result = df["2001"]["A"] + tm.assert_series_equal(expected, result) + + # GH3546 (not including times on the last day) + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") + ts = Series(range(len(idx)), index=idx) + expected = ts["2013-05"] + tm.assert_series_equal(expected, ts) + + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="S") + ts = Series(range(len(idx)), index=idx) + expected = ts["2013-05"] + tm.assert_series_equal(expected, ts) + + idx = [ + Timestamp("2013-05-31 00:00"), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999)), + ] + ts = Series(range(len(idx)), index=idx) + expected = ts["2013"] + tm.assert_series_equal(expected, ts) + + # GH14826, indexing with a seconds resolution string / datetime object + df = DataFrame( + np.random.rand(5, 5), + columns=["open", "high", "low", "close", "volume"], + index=date_range("2012-01-02 18:01:00", periods=5, tz="US/Central", freq="s"), + ) + expected = df.loc[[df.index[2]]] + + # this is a single date, so will raise + with pytest.raises(KeyError, match=r"^'2012-01-02 18:01:02'$"): + df["2012-01-02 18:01:02"] + msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central', freq='S'\)" + with pytest.raises(KeyError, match=msg): + df[df.index[2]] + + +""" +test NaT support +""" + + +def test_set_none_nan(): + series = Series(date_range("1/1/2000", periods=10)) + series[3] = None + assert series[3] is NaT + + series[3:5] = None + assert series[4] is NaT + + series[5] = np.nan + assert series[5] is NaT + + series[5:7] = np.nan + assert series[6] is NaT + + +def test_nat_operations(): + # GH 8617 + s = Series([0, pd.NaT], dtype="m8[ns]") + exp = s[0] + assert s.median() == exp + assert s.min() == exp + assert s.max() == exp + + +@pytest.mark.parametrize("method", ["round", "floor", "ceil"]) +@pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) +def test_round_nat(method, freq): + # GH14940 + s = Series([pd.NaT]) + expected = Series(pd.NaT) + round_method = getattr(s.dt, method) + tm.assert_series_equal(round_method(freq), expected) + + +def test_setitem_tuple_with_datetimetz(): + # GH 20441 + arr = date_range("2017", periods=4, 
tz="US/Eastern") + index = [(0, 1), (0, 2), (0, 3), (0, 4)] + result = Series(arr, index=index) + expected = result.copy() + result[(0, 1)] = np.nan + expected.iloc[0] = np.nan + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_iloc.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_iloc.py new file mode 100644 index 0000000..f276eb5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_iloc.py @@ -0,0 +1,32 @@ +import numpy as np + +from pandas import Series +import pandas._testing as tm + + +def test_iloc(): + s = Series(np.random.randn(10), index=list(range(0, 20, 2))) + + for i in range(len(s)): + result = s.iloc[i] + exp = s[s.index[i]] + tm.assert_almost_equal(result, exp) + + # pass a slice + result = s.iloc[slice(1, 3)] + expected = s.loc[2:4] + tm.assert_series_equal(result, expected) + + # test slice is a view + result[:] = 0 + assert (s[1:3] == 0).all() + + # list of integers + result = s.iloc[[0, 2, 3, 4, 5]] + expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) + tm.assert_series_equal(result, expected) + + +def test_iloc_nonunique(): + s = Series([0, 1, 2], index=[0, 1, 0]) + assert s.iloc[2] == 2 diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_indexing.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_indexing.py new file mode 100644 index 0000000..4fa7079 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_indexing.py @@ -0,0 +1,937 @@ +""" test get/set & misc """ + +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +def test_basic_indexing(): + s = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"]) + + msg = "index out of bounds" + with pytest.raises(IndexError, match=msg): + s[5] + msg = "index 5 is out of bounds for axis 0 with size 5" + with pytest.raises(IndexError, match=msg): + s[5] = 0 + + with pytest.raises(KeyError, match=r"^'c'$"): + s["c"] + + s = s.sort_index() + + msg = r"index out of bounds|^5$" + with pytest.raises(IndexError, match=msg): + s[5] + msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" + with pytest.raises(IndexError, match=msg): + s[5] = 0 + + +def test_basic_getitem_with_labels(datetime_series): + indices = datetime_series.index[[5, 10, 15]] + + result = datetime_series[indices] + expected = datetime_series.reindex(indices) + tm.assert_series_equal(result, expected) + + result = datetime_series[indices[0] : indices[2]] + expected = datetime_series.loc[indices[0] : indices[2]] + tm.assert_series_equal(result, expected) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=list(range(0, 20, 2))) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + with pytest.raises(KeyError, match="with any missing labels"): + s[inds] + + with pytest.raises(KeyError, match="with any missing labels"): + s[arr_inds] + + # GH12089 + # with tz for values + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) + expected = Timestamp("2011-01-01", tz="US/Eastern") + result = s.loc["a"] + assert result == expected + result = s.iloc[0] + assert result == expected + result = s["a"] + assert result == expected + + +def test_getitem_setitem_ellipsis(): + s = 
Series(np.random.randn(10)) + + np.fix(s) + + result = s[...] + tm.assert_series_equal(result, s) + + s[...] = 5 + assert (result == 5).all() + + +def test_getitem_get(datetime_series, string_series, object_series): + idx1 = string_series.index[5] + idx2 = object_series.index[5] + + assert string_series[idx1] == string_series.get(idx1) + assert object_series[idx2] == object_series.get(idx2) + + assert string_series[idx1] == string_series[5] + assert object_series[idx2] == object_series[5] + + assert string_series.get(-1) == string_series.get(string_series.index[-1]) + assert string_series[5] == string_series.get(string_series.index[5]) + + # missing + d = datetime_series.index[0] - BDay() + msg = r"Timestamp\('1999-12-31 00:00:00', freq='B'\)" + with pytest.raises(KeyError, match=msg): + datetime_series[d] + + # None + # GH 5652 + s1 = Series(dtype=object) + s2 = Series(dtype=object, index=list("abc")) + for s in [s1, s2]: + result = s.get(None) + assert result is None + + +def test_getitem_fancy(string_series, object_series): + slice1 = string_series[[1, 2, 3]] + slice2 = object_series[[1, 2, 3]] + assert string_series.index[2] == slice1.index[1] + assert object_series.index[2] == slice2.index[1] + assert string_series[2] == slice1[1] + assert object_series[2] == slice2[1] + + +def test_getitem_generator(string_series): + gen = (x > 0 for x in string_series) + result = string_series[gen] + result2 = string_series[iter(string_series > 0)] + expected = string_series[string_series > 0] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + +def test_type_promotion(): + # GH12599 + s = pd.Series(dtype=object) + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s["c"] = "foo" + expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize( + "result_1, duplicate_item, expected_1", + [ + [ + pd.Series({1: 12, 2: [1, 2, 2, 3]}), + pd.Series({1: 313}), + pd.Series({1: 12}, dtype=object), + ], + [ + pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), + pd.Series({1: [1, 2, 3]}), + pd.Series({1: [1, 2, 3]}), + ], + ], +) +def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): + # GH 17610 + result = result_1.append(duplicate_item) + expected = expected_1.append(duplicate_item) + tm.assert_series_equal(result[1], expected) + assert result[2] == result_1[2] + + +def test_getitem_out_of_bounds(datetime_series): + # don't segfault, GH #495 + msg = "index out of bounds" + with pytest.raises(IndexError, match=msg): + datetime_series[len(datetime_series)] + + # GH #917 + s = Series([], dtype=object) + with pytest.raises(IndexError, match=msg): + s[-1] + + +def test_getitem_setitem_integers(): + # caused bug without test + s = Series([1, 2, 3], ["a", "b", "c"]) + + assert s.iloc[0] == s["a"] + s.iloc[0] = 5 + tm.assert_almost_equal(s["a"], 5) + + +def test_getitem_box_float64(datetime_series): + value = datetime_series[5] + assert isinstance(value, np.float64) + + +@pytest.mark.parametrize( + "arr", + [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], +) +def test_get(arr): + # GH 21260 + s = Series(arr, index=[2 * i for i in range(len(arr))]) + assert s.get(4) == s.iloc[2] + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + tm.assert_series_equal(result, expected) + + result = s.get(slice(2)) + expected = s.iloc[[0, 1]] + tm.assert_series_equal(result, expected) + + assert s.get(-1) is None + assert s.get(s.index.max() + 1) 
is None + + s = Series(arr[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] + + result = s.get(slice("b", "d")) + expected = s.iloc[[1, 2, 3]] + tm.assert_series_equal(result, expected) + + result = s.get("Z") + assert result is None + + assert s.get(4) == s.iloc[4] + assert s.get(-1) == s.iloc[-1] + assert s.get(len(s)) is None + + # GH 21257 + s = pd.Series(arr) + s2 = s[::2] + assert s2.get(1) is None + + +def test_series_box_timestamp(): + rng = pd.date_range("20090415", "20090519", freq="B") + ser = Series(rng) + + assert isinstance(ser[5], pd.Timestamp) + + rng = pd.date_range("20090415", "20090519", freq="B") + ser = Series(rng, index=rng) + assert isinstance(ser[5], pd.Timestamp) + + assert isinstance(ser.iat[5], pd.Timestamp) + + +def test_getitem_ambiguous_keyerror(): + s = Series(range(10), index=list(range(0, 20, 2))) + with pytest.raises(KeyError, match=r"^1$"): + s[1] + with pytest.raises(KeyError, match=r"^1$"): + s.loc[1] + + +def test_getitem_unordered_dup(): + obj = Series(range(5), index=["c", "a", "a", "b", "b"]) + assert is_scalar(obj["c"]) + assert obj["c"] == 0 + + +def test_getitem_dups_with_missing(): + # breaks reindex, so need to use .loc internally + # GH 4246 + s = Series([1, 2, 3, 4], ["foo", "bar", "foo", "bah"]) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[["foo", "bar", "bah", "bam"]] + + with pytest.raises(KeyError, match="with any missing labels"): + s[["foo", "bar", "bah", "bam"]] + + +def test_getitem_dups(): + s = Series(range(5), index=["A", "A", "B", "C", "C"], dtype=np.int64) + expected = Series([3, 4], index=["C", "C"], dtype=np.int64) + result = s["C"] + tm.assert_series_equal(result, expected) + + +def test_setitem_ambiguous_keyerror(): + s = Series(range(10), index=list(range(0, 20, 2))) + + # equivalent of an append + s2 = s.copy() + s2[1] = 5 + expected = s.append(Series([5], index=[1])) + tm.assert_series_equal(s2, expected) + + s2 = s.copy() + s2.loc[1] = 5 + expected = s.append(Series([5], index=[1])) + tm.assert_series_equal(s2, expected) + + +def test_getitem_dataframe(): + rng = list(range(10)) + s = pd.Series(10, index=rng) + df = pd.DataFrame(rng, index=rng) + msg = ( + "Indexing a Series with DataFrame is not supported, " + "use the appropriate DataFrame column" + ) + with pytest.raises(TypeError, match=msg): + s[df > 5] + + +def test_setitem(datetime_series, string_series): + datetime_series[datetime_series.index[5]] = np.NaN + datetime_series[[1, 2, 17]] = np.NaN + datetime_series[6] = np.NaN + assert np.isnan(datetime_series[6]) + assert np.isnan(datetime_series[2]) + datetime_series[np.isnan(datetime_series)] = 5 + assert not np.isnan(datetime_series[2]) + + # caught this bug when writing tests + series = Series(tm.makeIntIndex(20).astype(float), index=tm.makeIntIndex(20)) + + series[::2] = 0 + assert (series[::2] == 0).all() + + # set item that's not contained + s = string_series.copy() + s["foobar"] = 1 + + app = Series([1], index=["foobar"], name="series") + expected = string_series.append(app) + tm.assert_series_equal(s, expected) + + # Test for issue #10193 + key = pd.Timestamp("2012-01-01") + series = pd.Series(dtype=object) + series[key] = 47 + expected = pd.Series(47, [key]) + tm.assert_series_equal(series, expected) + + series = pd.Series([], pd.DatetimeIndex([], freq="D"), dtype=object) + series[key] = 47 + expected = pd.Series(47, pd.DatetimeIndex([key], freq="D")) + tm.assert_series_equal(series, expected) + + +def test_setitem_dtypes(): + # change dtypes + # GH 4463 + expected = 
Series([np.nan, 2, 3]) + + s = Series([1, 2, 3]) + s.iloc[0] = np.nan + tm.assert_series_equal(s, expected) + + s = Series([1, 2, 3]) + s.loc[0] = np.nan + tm.assert_series_equal(s, expected) + + s = Series([1, 2, 3]) + s[0] = np.nan + tm.assert_series_equal(s, expected) + + s = Series([False]) + s.loc[0] = np.nan + tm.assert_series_equal(s, Series([np.nan])) + + s = Series([False, True]) + s.loc[0] = np.nan + tm.assert_series_equal(s, Series([np.nan, 1.0])) + + +def test_set_value(datetime_series, string_series): + idx = datetime_series.index[10] + res = datetime_series._set_value(idx, 0) + assert res is datetime_series + assert datetime_series[idx] == 0 + + # equiv + s = string_series.copy() + res = s._set_value("foobar", 0) + assert res is s + assert res.index[-1] == "foobar" + assert res["foobar"] == 0 + + s = string_series.copy() + s.loc["foobar"] = 0 + assert s.index[-1] == "foobar" + assert s["foobar"] == 0 + + +def test_setslice(datetime_series): + sl = datetime_series[5:20] + assert len(sl) == len(sl.index) + assert sl.index.is_unique is True + + +def test_2d_to_1d_assignment_raises(): + x = np.random.randn(2, 2) + y = pd.Series(range(2)) + + msg = ( + r"shape mismatch: value array of shape \(2,2\) could not be" + r" broadcast to indexing result of shape \(2,\)" + ) + with pytest.raises(ValueError, match=msg): + y.loc[range(2)] = x + + msg = r"could not broadcast input array from shape \(2,2\) into shape \(2\)" + with pytest.raises(ValueError, match=msg): + y.loc[:] = x + + +# FutureWarning from NumPy about [slice(None, 5). +@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") +def test_basic_getitem_setitem_corner(datetime_series): + # invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2] + msg = "Can only tuple-index with a MultiIndex" + with pytest.raises(ValueError, match=msg): + datetime_series[:, 2] + with pytest.raises(ValueError, match=msg): + datetime_series[:, 2] = 2 + + # weird lists. [slice(0, 5)] will work but not two slices + result = datetime_series[[slice(None, 5)]] + expected = datetime_series[:5] + tm.assert_series_equal(result, expected) + + # OK + msg = r"unhashable type(: 'slice')?" 
+ with pytest.raises(TypeError, match=msg): + datetime_series[[5, slice(None, None)]] + with pytest.raises(TypeError, match=msg): + datetime_series[[5, slice(None, None)]] = 2 + + +@pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) +def test_setitem_with_tz(tz): + orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) + + # scalar + s = orig.copy() + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2016-01-01 02:00", tz=tz), + ] + ) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(s, exp) + + # vector + vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) + + s[[1, 2]] = vals + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + +def test_setitem_with_tz_dst(): + # GH XXX + tz = "US/Eastern" + orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) + + # scalar + s = orig.copy() + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00-04:00", tz=tz), + pd.Timestamp("2011-01-01 00:00-05:00", tz=tz), + pd.Timestamp("2016-11-06 01:00-05:00", tz=tz), + ] + ) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(s, exp) + + # vector + vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) + + s[[1, 2]] = vals + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.loc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.iloc[[1, 2]] = vals + tm.assert_series_equal(s, exp) + + +def test_categorical_assigning_ops(): + orig = Series(Categorical(["b", "b"], categories=["a", "b"])) + s = orig.copy() + s[:] = "a" + exp = Series(Categorical(["a", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[1] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[s.index > 0] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s[[False, True]] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(s, exp) + + s = orig.copy() + s.index = ["x", "y"] + s["y"] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"]), index=["x", "y"]) + tm.assert_series_equal(s, exp) + + # ensure that one can set something to np.nan + s = 
Series(Categorical([1, 2, 3])) + exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + + +def test_slice(string_series, object_series): + numSlice = string_series[10:20] + numSliceEnd = string_series[-10:] + objSlice = object_series[10:20] + + assert string_series.index[9] not in numSlice.index + assert object_series.index[9] not in objSlice.index + + assert len(numSlice) == len(numSlice.index) + assert string_series[numSlice.index[0]] == numSlice[numSlice.index[0]] + + assert numSlice.index[1] == string_series.index[11] + assert tm.equalContents(numSliceEnd, np.array(string_series)[-10:]) + + # Test return view. + sl = string_series[10:20] + sl[:] = 0 + + assert (string_series[10:20] == 0).all() + + +def test_slice_can_reorder_not_uniquely_indexed(): + s = Series(1, index=["a", "a", "b", "b", "c"]) + s[::-1] # it works! + + +def test_loc_setitem(string_series): + inds = string_series.index[[3, 4, 7]] + + result = string_series.copy() + result.loc[inds] = 5 + + expected = string_series.copy() + expected[[3, 4, 7]] = 5 + tm.assert_series_equal(result, expected) + + result.iloc[5:10] = 10 + expected[5:10] = 10 + tm.assert_series_equal(result, expected) + + # set slice with indices + d1, d2 = string_series.index[[5, 15]] + result.loc[d1:d2] = 6 + expected[5:16] = 6 # because it's inclusive + tm.assert_series_equal(result, expected) + + # set index value + string_series.loc[d1] = 4 + string_series.loc[d2] = 6 + assert string_series[d1] == 4 + assert string_series[d2] == 6 + + +def test_setitem_na(): + # these induce dtype changes + expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) + s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) + s[::2] = np.nan + tm.assert_series_equal(s, expected) + + # gets coerced to float, right? + expected = Series([np.nan, 1, np.nan, 0]) + s = Series([True, True, False, False]) + s[::2] = np.nan + tm.assert_series_equal(s, expected) + + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) + s = Series(np.arange(10)) + s[:5] = np.nan + tm.assert_series_equal(s, expected) + + +def test_timedelta_assignment(): + # GH 8209 + s = Series([], dtype=object) + s.loc["B"] = timedelta(1) + tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + + s = s.reindex(s.index.insert(0, "A")) + tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + + result = s.fillna(timedelta(1)) + expected = Series(Timedelta("1 days"), index=["A", "B"]) + tm.assert_series_equal(result, expected) + + s.loc["A"] = timedelta(1) + tm.assert_series_equal(s, expected) + + # GH 14155 + s = Series(10 * [np.timedelta64(10, "m")]) + s.loc[[1, 2, 3]] = np.timedelta64(20, "m") + expected = pd.Series(10 * [np.timedelta64(10, "m")]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, "m")) + tm.assert_series_equal(s, expected) + + +@pytest.mark.parametrize( + "nat_val,should_cast", + [ + (pd.NaT, True), + (np.timedelta64("NaT", "ns"), False), + (np.datetime64("NaT", "ns"), True), + ], +) +@pytest.mark.parametrize("tz", [None, "UTC"]) +def test_dt64_series_assign_nat(nat_val, should_cast, tz): + # some nat-like values should be cast to datetime64 when inserting + # into a datetime64 series. Others should coerce to object + # and retain their dtypes. 
+ dti = pd.date_range("2016-01-01", periods=3, tz=tz) + base = pd.Series(dti) + expected = pd.Series([pd.NaT] + list(dti[1:]), dtype=dti.dtype) + if not should_cast: + expected = expected.astype(object) + + ser = base.copy(deep=True) + ser[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.loc[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.iloc[0] = nat_val + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize( + "nat_val,should_cast", + [ + (pd.NaT, True), + (np.timedelta64("NaT", "ns"), True), + (np.datetime64("NaT", "ns"), False), + ], +) +def test_td64_series_assign_nat(nat_val, should_cast): + # some nat-like values should be cast to timedelta64 when inserting + # into a timedelta64 series. Others should coerce to object + # and retain their dtypes. + base = pd.Series([0, 1, 2], dtype="m8[ns]") + expected = pd.Series([pd.NaT, 1, 2], dtype="m8[ns]") + if not should_cast: + expected = expected.astype(object) + + ser = base.copy(deep=True) + ser[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.loc[0] = nat_val + tm.assert_series_equal(ser, expected) + + ser = base.copy(deep=True) + ser.iloc[0] = nat_val + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize( + "td", + [ + pd.Timedelta("9 days"), + pd.Timedelta("9 days").to_timedelta64(), + pd.Timedelta("9 days").to_pytimedelta(), + ], +) +def test_append_timedelta_does_not_cast(td): + # GH#22717 inserting a Timedelta should _not_ cast to int64 + expected = pd.Series(["x", td], index=[0, "td"], dtype=object) + + ser = pd.Series(["x"]) + ser["td"] = td + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], pd.Timedelta) + + ser = pd.Series(["x"]) + ser.loc["td"] = pd.Timedelta("9 days") + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], pd.Timedelta) + + +def test_underlying_data_conversion(): + # GH 4080 + df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) + df.set_index(["a", "b", "c"], inplace=True) + s = Series([1], index=[(2, 2, 2)]) + df["val"] = 0 + df + df["val"].update(s) + + expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) + expected.set_index(["a", "b", "c"], inplace=True) + tm.assert_frame_equal(df, expected) + + # GH 3970 + # these are chained assignments as well + pd.set_option("chained_assignment", None) + df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) + df["cc"] = 0.0 + + ck = [True] * len(df) + + df["bb"].iloc[0] = 0.13 + + # TODO: unused + df_tmp = df.iloc[ck] # noqa + + df["bb"].iloc[0] = 0.15 + assert df["bb"].iloc[0] == 0.15 + pd.set_option("chained_assignment", "raise") + + # GH 3217 + df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) + df["c"] = np.nan + df["c"].update(pd.Series(["foo"], index=[0])) + + expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan])) + tm.assert_frame_equal(df, expected) + + +def test_preserve_refs(datetime_series): + seq = datetime_series[[5, 10, 15]] + seq[1] = np.NaN + assert not np.isnan(datetime_series[10]) + + +def test_cast_on_putmask(): + # GH 2746 + + # need to upcast + s = Series([1, 2], index=[1, 2], dtype="int64") + s[[True, False]] = Series([0], index=[1], dtype="int64") + expected = Series([0, 2], index=[1, 2], dtype="int64") + + tm.assert_series_equal(s, expected) + + +def test_type_promote_putmask(): + # GH8387: test that changing types does not break alignment + ts = Series(np.random.randn(100), index=np.arange(100, 0, -1)).round(5) + 
left, mask = ts.copy(), ts > 0 + right = ts[mask].copy().map(str) + left[mask] = right + tm.assert_series_equal(left, ts.map(lambda t: str(t) if t > 0 else t)) + + s = Series([0, 1, 2, 0]) + mask = s > 0 + s2 = s[mask].map(str) + s[mask] = s2 + tm.assert_series_equal(s, Series([0, "1", "2", 0])) + + s = Series([0, "foo", "bar", 0]) + mask = Series([False, True, True, False]) + s2 = s[mask] + s[mask] = s2 + tm.assert_series_equal(s, Series([0, "foo", "bar", 0])) + + +def test_multilevel_preserve_name(): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(np.random.randn(len(index)), index=index, name="sth") + + result = s["foo"] + result2 = s.loc["foo"] + assert result.name == s.name + assert result2.name == s.name + + +def test_setitem_scalar_into_readonly_backing_data(): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + for n in range(len(series)): + msg = "assignment destination is read-only" + with pytest.raises(ValueError, match=msg): + series[n] = 1 + + assert array[n] == 0 + + +def test_setitem_slice_into_readonly_backing_data(): + # GH14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array) + + msg = "assignment destination is read-only" + with pytest.raises(ValueError, match=msg): + series[1:3] = 1 + + assert not array.any() + + +""" +miscellaneous methods +""" + + +def test_pop(): + # GH 6600 + df = DataFrame({"A": 0, "B": np.arange(5, dtype="int64"), "C": 0}) + k = df.iloc[4] + + result = k.pop("B") + assert result == 4 + + expected = Series([0, 0], index=["A", "C"], name=4) + tm.assert_series_equal(k, expected) + + +def test_take(): + s = Series([-1, 5, 6, 2, 4]) + + actual = s.take([1, 3, 4]) + expected = Series([5, 2, 4], index=[1, 3, 4]) + tm.assert_series_equal(actual, expected) + + actual = s.take([-1, 3, 4]) + expected = Series([4, 2, 4], index=[4, 3, 4]) + tm.assert_series_equal(actual, expected) + + msg = "index {} is out of bounds for( axis 0 with)? 
size 5" + with pytest.raises(IndexError, match=msg.format(10)): + s.take([1, 10]) + with pytest.raises(IndexError, match=msg.format(5)): + s.take([2, 5]) + + +def test_take_categorical(): + # https://github.com/pandas-dev/pandas/issues/20664 + s = Series(pd.Categorical(["a", "b", "c"])) + result = s.take([-2, -2, 0]) + expected = Series( + pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0] + ) + tm.assert_series_equal(result, expected) + + +def test_head_tail(string_series): + tm.assert_series_equal(string_series.head(), string_series[:5]) + tm.assert_series_equal(string_series.head(0), string_series[0:0]) + tm.assert_series_equal(string_series.tail(), string_series[-5:]) + tm.assert_series_equal(string_series.tail(0), string_series[0:0]) + + +def test_uint_drop(any_int_dtype): + # see GH18311 + # assigning series.loc[0] = 4 changed series.dtype to int + series = pd.Series([1, 2, 3], dtype=any_int_dtype) + series.loc[0] = 4 + expected = pd.Series([4, 2, 3], dtype=any_int_dtype) + tm.assert_series_equal(series, expected) + + +def test_getitem_2d_no_warning(): + # https://github.com/pandas-dev/pandas/issues/30867 + # Don't want to support this long-term, but + # for now ensure that the warning from Index + # doesn't comes through via Series.__getitem__. + series = pd.Series([1, 2, 3], index=[1, 2, 3]) + with tm.assert_produces_warning(None): + series[:, None] diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_loc.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_loc.py new file mode 100644 index 0000000..7d6b6c7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_loc.py @@ -0,0 +1,159 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) +def test_loc_uint64(val, expected): + # see gh-19399 + s = Series({2 ** 63 - 1: 3, 2 ** 63: 4}) + assert s.loc[val] == expected + + +def test_loc_getitem(string_series, datetime_series): + inds = string_series.index[[3, 4, 7]] + tm.assert_series_equal(string_series.loc[inds], string_series.reindex(inds)) + tm.assert_series_equal(string_series.iloc[5::2], string_series[5::2]) + + # slice with indices + d1, d2 = datetime_series.index[[5, 15]] + result = datetime_series.loc[d1:d2] + expected = datetime_series.truncate(d1, d2) + tm.assert_series_equal(result, expected) + + # boolean + mask = string_series > string_series.median() + tm.assert_series_equal(string_series.loc[mask], string_series[mask]) + + # ask for index value + assert datetime_series.loc[d1] == datetime_series[d1] + assert datetime_series.loc[d2] == datetime_series[d2] + + +def test_loc_getitem_not_monotonic(datetime_series): + d1, d2 = datetime_series.index[[5, 15]] + + ts2 = datetime_series[::2][[1, 2, 0]] + + msg = r"Timestamp\('2000-01-10 00:00:00'\)" + with pytest.raises(KeyError, match=msg): + ts2.loc[d1:d2] + with pytest.raises(KeyError, match=msg): + ts2.loc[d1:d2] = 0 + + +def test_loc_getitem_setitem_integer_slice_keyerrors(): + s = Series(np.random.randn(10), index=list(range(0, 20, 2))) + + # this is OK + cp = s.copy() + cp.iloc[4:10] = 0 + assert (cp.iloc[4:10] == 0).all() + + # so is this + cp = s.copy() + cp.iloc[3:11] = 0 + assert (cp.iloc[3:11] == 0).values.all() + + result = s.iloc[2:6] + result2 = s.loc[3:11] + expected = s.reindex([4, 6, 8, 10]) + + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + # 
non-monotonic, raise KeyError + s2 = s.iloc[list(range(5)) + list(range(9, 4, -1))] + with pytest.raises(KeyError, match=r"^3$"): + s2.loc[3:11] + with pytest.raises(KeyError, match=r"^3$"): + s2.loc[3:11] = 0 + + +def test_loc_getitem_iterator(string_series): + idx = iter(string_series.index[:10]) + result = string_series.loc[idx] + tm.assert_series_equal(result, string_series[:10]) + + +def test_loc_setitem_boolean(string_series): + mask = string_series > string_series.median() + + result = string_series.copy() + result.loc[mask] = 0 + expected = string_series + expected[mask] = 0 + tm.assert_series_equal(result, expected) + + +def test_loc_setitem_corner(string_series): + inds = list(string_series.index[[5, 8, 12]]) + string_series.loc[inds] = 5 + msg = r"\['foo'\] not in index" + with pytest.raises(KeyError, match=msg): + string_series.loc[inds + ["foo"]] = 5 + + +def test_basic_setitem_with_labels(datetime_series): + indices = datetime_series.index[[5, 10, 15]] + + cp = datetime_series.copy() + exp = datetime_series.copy() + cp[indices] = 0 + exp.loc[indices] = 0 + tm.assert_series_equal(cp, exp) + + cp = datetime_series.copy() + exp = datetime_series.copy() + cp[indices[0] : indices[2]] = 0 + exp.loc[indices[0] : indices[2]] = 0 + tm.assert_series_equal(cp, exp) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=list(range(0, 20, 2))) + inds = [0, 4, 6] + arr_inds = np.array([0, 4, 6]) + + cp = s.copy() + exp = s.copy() + s[inds] = 0 + s.loc[inds] = 0 + tm.assert_series_equal(cp, exp) + + cp = s.copy() + exp = s.copy() + s[arr_inds] = 0 + s.loc[arr_inds] = 0 + tm.assert_series_equal(cp, exp) + + inds_notfound = [0, 4, 5, 6] + arr_inds_notfound = np.array([0, 4, 5, 6]) + msg = r"\[5\] not contained in the index" + with pytest.raises(ValueError, match=msg): + s[inds_notfound] = 0 + with pytest.raises(Exception, match=msg): + s[arr_inds_notfound] = 0 + + # GH12089 + # with tz for values + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) + s2 = s.copy() + expected = Timestamp("2011-01-03", tz="US/Eastern") + s2.loc["a"] = expected + result = s2.loc["a"] + assert result == expected + + s2 = s.copy() + s2.iloc[0] = expected + result = s2.iloc[0] + assert result == expected + + s2 = s.copy() + s2["a"] = expected + result = s2["a"] + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/series/indexing/test_numeric.py b/venv/Lib/site-packages/pandas/tests/series/indexing/test_numeric.py new file mode 100644 index 0000000..ce0d04f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/indexing/test_numeric.py @@ -0,0 +1,313 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +def test_get(): + # GH 6383 + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ) + ) + + result = s.get(25, 0) + expected = 0 + assert result == expected + + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ), + index=pd.Float64Index( + [ + 25.0, + 36.0, + 49.0, + 64.0, + 81.0, + 100.0, + 121.0, + 144.0, + 169.0, + 196.0, + 1225.0, + 1296.0, + 1369.0, + 1444.0, + 1521.0, + 1600.0, + 1681.0, + 1764.0, + 1849.0, + 1936.0, + ] + ), + ) + + result = s.get(25, 0) + expected = 43 + assert result == expected + + # GH 7407 + # with a 
boolean accessor + df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3}) + vc = df.i.value_counts() + result = vc.get(99, default="Missing") + assert result == "Missing" + + vc = df.b.value_counts() + result = vc.get(False, default="Missing") + assert result == 3 + + result = vc.get(True, default="Missing") + assert result == "Missing" + + +def test_get_nan(): + # GH 8569 + s = pd.Float64Index(range(10)).to_series() + assert s.get(np.nan) is None + assert s.get(np.nan, default="Missing") == "Missing" + + +def test_get_nan_multiple(): + # GH 8569 + # ensure that fixing "test_get_nan" above hasn't broken get + # with multiple elements + s = pd.Float64Index(range(10)).to_series() + + idx = [2, 30] + assert s.get(idx) is None + + idx = [2, np.nan] + assert s.get(idx) is None + + # GH 17295 - all missing keys + idx = [20, 30] + assert s.get(idx) is None + + idx = [np.nan, np.nan] + assert s.get(idx) is None + + +def test_delitem(): + # GH 5542 + # should delete the item inplace + s = Series(range(5)) + del s[0] + + expected = Series(range(1, 5), index=range(1, 5)) + tm.assert_series_equal(s, expected) + + del s[1] + expected = Series(range(2, 5), index=range(2, 5)) + tm.assert_series_equal(s, expected) + + # empty + s = Series(dtype=object) + + with pytest.raises(KeyError, match=r"^0$"): + del s[0] + + # only 1 left, del, add, del + s = Series(1) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + s[0] = 1 + tm.assert_series_equal(s, Series(1)) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + + # Index(dtype=object) + s = Series(1, index=["a"]) + del s["a"] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) + s["a"] = 1 + tm.assert_series_equal(s, Series(1, index=["a"])) + del s["a"] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) + + +def test_slice_float64(): + values = np.arange(10.0, 50.0, 2) + index = Index(values) + + start, end = values[[5, 15]] + + s = Series(np.random.randn(20), index=index) + + result = s[start:end] + expected = s.iloc[5:16] + tm.assert_series_equal(result, expected) + + result = s.loc[start:end] + tm.assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(20, 3), index=index) + + result = df[start:end] + expected = df.iloc[5:16] + tm.assert_frame_equal(result, expected) + + result = df.loc[start:end] + tm.assert_frame_equal(result, expected) + + +def test_getitem_negative_out_of_bounds(): + s = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10)) + + msg = "index out of bounds" + with pytest.raises(IndexError, match=msg): + s[-11] + msg = "index -11 is out of bounds for axis 0 with size 10" + with pytest.raises(IndexError, match=msg): + s[-11] = "foo" + + +def test_getitem_regression(): + s = Series(range(5), index=list(range(5))) + result = s[list(range(5))] + tm.assert_series_equal(result, s) + + +def test_getitem_setitem_slice_bug(): + s = Series(range(10), index=list(range(10))) + result = s[-12:] + tm.assert_series_equal(result, s) + + result = s[-7:] + tm.assert_series_equal(result, s[3:]) + + result = s[:-12] + tm.assert_series_equal(result, s[:0]) + + s = Series(range(10), index=list(range(10))) + s[-12:] = 0 + assert (s == 0).all() + + s[:-12] = 5 + assert (s == 0).all() + + +def test_getitem_setitem_slice_integers(): + s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) + + result = s[:4] + expected = s.reindex([2, 4, 6, 8]) + tm.assert_series_equal(result, 
[Vendored third-party files from the committed virtual environment: the diff continues by adding the upstream pandas test suite under venv/Lib/site-packages/pandas/tests/series/, covering the remaining Series indexing tests and the new modules methods/__init__.py, test_append.py, test_argsort.py, test_asof.py, test_clip.py, test_count.py, test_cov_corr.py, test_describe.py, test_diff.py, test_drop_duplicates.py, test_duplicated.py, test_explode.py, test_isin.py, test_nlargest.py, test_pct_change.py, test_quantile.py, and test_rank.py.]
3, 5, 5], [2.0 / 6, 2.0 / 6, 4.0 / 6, 4.0 / 6, 6.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_max_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="max", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_average_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="average", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 2.0 / 2.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_first_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="first", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.single +@pytest.mark.high_memory +def test_pct_max_many_rows(): + # GH 18271 + s = Series(np.arange(2 ** 24 + 1)) + result = s.rank(pct=True).max() + assert result == 1 diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_replace.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_replace.py new file mode 100644 index 0000000..b20baa2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_replace.py @@ -0,0 +1,364 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestSeriesReplace: + def test_replace(self, datetime_series): + N = 100 + ser = pd.Series(np.random.randn(N)) + ser[0:4] = np.nan + ser[6:10] = 0 + + # replace list with a single value + ser.replace([np.nan], -1, inplace=True) + + exp = ser.fillna(-1) + tm.assert_series_equal(ser, exp) + + rs = ser.replace(0.0, np.nan) + ser[ser == 0.0] = np.nan + tm.assert_series_equal(rs, ser) + + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) + ser[:5] = np.nan + ser[6:10] = "foo" + ser[20:30] = "bar" + + # replace list with a single value + rs = ser.replace([np.nan, "foo", "bar"], -1) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -1).all() + assert (rs[20:30] == -1).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -2).all() + assert (rs[20:30] == -3).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + tm.assert_series_equal(rs, rs2) + + # 
replace inplace + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + + assert (ser[:5] == -1).all() + assert (ser[6:10] == -1).all() + assert (ser[20:30] == -1).all() + + ser = pd.Series([np.nan, 0, np.inf]) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT]) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + filled = ser.copy() + filled[4] = 0 + tm.assert_series_equal(ser.replace(np.inf, 0), filled) + + ser = pd.Series(datetime_series.index) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + # malformed + msg = r"Replacement lists must match in length\. Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + ser.replace([1, 2, 3], [np.nan, 0]) + + # make sure that we aren't just masking a TypeError because bools don't + # implement indexing + with pytest.raises(TypeError, match="Cannot compare types .+"): + ser.replace([1, 2], [np.nan, 0]) + + ser = pd.Series([0, 1, 2, 3, 4]) + result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0])) + + def test_replace_gh5319(self): + # API change from 0.12? + # GH 5319 + ser = pd.Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + result = ser.replace([np.nan]) + tm.assert_series_equal(result, expected) + + ser = pd.Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + result = ser.replace(np.nan) + tm.assert_series_equal(result, expected) + # GH 5797 + ser = pd.Series(pd.date_range("20130101", periods=5)) + expected = ser.copy() + expected.loc[2] = pd.Timestamp("20120101") + result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")}) + tm.assert_series_equal(result, expected) + result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) + tm.assert_series_equal(result, expected) + + # GH 11792: Test with replacing NaT in a list with tz data + ts = pd.Timestamp("2015/01/01", tz="UTC") + s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) + result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) + expected = pd.Series([pd.Timestamp.min, ts], dtype=object) + tm.assert_series_equal(expected, result) + + def test_replace_with_single_list(self): + ser = pd.Series([0, 1, 2, 3, 4]) + result = ser.replace([1, 2, 3]) + tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) + + s = ser.copy() + s.replace([1, 2, 3], inplace=True) + tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) + + # make sure things don't get corrupted when fillna call fails + s = ser.copy() + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\) or backfill" + r" \(bfill\)\. 
Got crash_cymbal" + ) + with pytest.raises(ValueError, match=msg): + s.replace([1, 2, 3], inplace=True, method="crash_cymbal") + tm.assert_series_equal(s, ser) + + def test_replace_with_empty_list(self): + # GH 21977 + s = pd.Series([[1], [2, 3], [], np.nan, [4]]) + expected = s + result = s.replace([], np.nan) + tm.assert_series_equal(result, expected) + + # GH 19266 + with pytest.raises(ValueError, match="cannot assign mismatch"): + s.replace({np.nan: []}) + with pytest.raises(ValueError, match="cannot assign mismatch"): + s.replace({np.nan: ["dummy", "alt"]}) + + def test_replace_mixed_types(self): + s = pd.Series(np.arange(5), dtype="int64") + + def check_replace(to_rep, val, expected): + sc = s.copy() + r = s.replace(to_rep, val) + sc.replace(to_rep, val, inplace=True) + tm.assert_series_equal(expected, r) + tm.assert_series_equal(expected, sc) + + # MUST upcast to float + e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) + tr, v = [3], [3.0] + check_replace(tr, v, e) + + # MUST upcast to float + e = pd.Series([0, 1, 2, 3.5, 4]) + tr, v = [3], [3.5] + check_replace(tr, v, e) + + # casts to object + e = pd.Series([0, 1, 2, 3.5, "a"]) + tr, v = [3, 4], [3.5, "a"] + check_replace(tr, v, e) + + # again casts to object + e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")]) + tr, v = [3, 4], [3.5, pd.Timestamp("20130101")] + check_replace(tr, v, e) + + # casts to object + e = pd.Series([0, 1, 2, 3.5, True], dtype="object") + tr, v = [3, 4], [3.5, True] + check_replace(tr, v, e) + + # test an object with dates + floats + integers + strings + dr = ( + pd.date_range("1/1/2001", "1/10/2001", freq="D") + .to_series() + .reset_index(drop=True) + ) + result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"]) + expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object) + tm.assert_series_equal(result, expected) + + def test_replace_bool_with_string_no_op(self): + s = pd.Series([True, False, True]) + result = s.replace("fun", "in-the-sun") + tm.assert_series_equal(s, result) + + def test_replace_bool_with_string(self): + # nonexistent elements + s = pd.Series([True, False, True]) + result = s.replace(True, "2u") + expected = pd.Series(["2u", False, "2u"]) + tm.assert_series_equal(expected, result) + + def test_replace_bool_with_bool(self): + s = pd.Series([True, False, True]) + result = s.replace(True, False) + expected = pd.Series([False] * len(s)) + tm.assert_series_equal(expected, result) + + def test_replace_with_dict_with_bool_keys(self): + s = pd.Series([True, False, True]) + with pytest.raises(TypeError, match="Cannot compare types .+"): + s.replace({"asdf": "asdb", True: "yes"}) + + def test_replace2(self): + N = 100 + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) + ser[:5] = np.nan + ser[6:10] = "foo" + ser[20:30] = "bar" + + # replace list with a single value + rs = ser.replace([np.nan, "foo", "bar"], -1) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -1).all() + assert (rs[20:30] == -1).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -2).all() + assert (rs[20:30] == -3).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + tm.assert_series_equal(rs, rs2) + + # replace inplace + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + assert (ser[:5] == -1).all() + assert (ser[6:10] == -1).all() + assert 
(ser[20:30] == -1).all() + + def test_replace_with_empty_dictlike(self): + # GH 15289 + s = pd.Series(list("abcd")) + tm.assert_series_equal(s, s.replace(dict())) + + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty_series = pd.Series([]) + tm.assert_series_equal(s, s.replace(empty_series)) + + def test_replace_string_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace("2", np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_replacer_equals_replacement(self): + # GH 20656 + # make sure all replacers are matching against original values + s = pd.Series(["a", "b"]) + expected = pd.Series(["b", "a"]) + result = s.replace({"a": "b", "b": "a"}) + tm.assert_series_equal(expected, result) + + def test_replace_unicode_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace("2", np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_mixed_types_with_string(self): + # Testing mixed + s = pd.Series([1, 2, 3, "4", 4, 5]) + result = s.replace([2, "4"], np.nan) + expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize( + "categorical, numeric", + [ + (pd.Categorical("A", categories=["A", "B"]), [1]), + (pd.Categorical(("A",), categories=["A", "B"]), [1]), + (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + ], + ) + def test_replace_categorical(self, categorical, numeric): + # GH 24971 + # Do not check if dtypes are equal due to a known issue that + # Categorical.replace sometimes coerces to object (GH 23305) + s = pd.Series(categorical) + result = s.replace({"A": 1, "B": 2}) + expected = pd.Series(numeric) + tm.assert_series_equal(expected, result, check_dtype=False) + + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + s = pd.Series(dti) + c = s.astype("category") + + expected = c.copy() + expected = expected.cat.add_categories("foo") + expected[2] = "foo" + expected = expected.cat.remove_unused_categories() + assert c[2] != "foo" + + result = c.replace(c[2], "foo") + tm.assert_series_equal(expected, result) + assert c[2] != "foo" # ensure non-inplace call does not alter original + + c.replace(c[2], "foo", inplace=True) + tm.assert_series_equal(expected, c) + + first_value = c[0] + c.replace(c[1], c[0], inplace=True) + assert c[0] == c[1] == first_value # test replacing with existing value + + def test_replace_with_no_overflowerror(self): + # GH 25616 + # casts to object without Exception from OverflowError + s = pd.Series([0, 1, 2, 3, 4]) + result = s.replace([3], ["100000000000000000000"]) + expected = pd.Series([0, 1, 2, "100000000000000000000", 4]) + tm.assert_series_equal(result, expected) + + s = pd.Series([0, "100000000000000000000", "100000000000000000001"]) + result = s.replace(["100000000000000000000"], [1]) + expected = pd.Series([0, 1, "100000000000000000001"]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, to_replace, exp", + [ + ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]), + (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]), + ], + ) + def test_replace_commutative(self, ser, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + + series = pd.Series(ser) + + expected = pd.Series(exp) + result = series.replace(to_replace) + + tm.assert_series_equal(result, 
expected) + + @pytest.mark.parametrize( + "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])] + ) + def test_replace_no_cast(self, ser, exp): + # GH 9113 + # BUG: replace int64 dtype with bool coerces to int64 + + series = pd.Series(ser) + result = series.replace(2, True) + expected = pd.Series(exp) + + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_round.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_round.py new file mode 100644 index 0000000..7f0711a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_round.py @@ -0,0 +1,46 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesRound: + def test_round(self, datetime_series): + datetime_series.index.name = "index_name" + result = datetime_series.round(2) + expected = Series( + np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" + ) + tm.assert_series_equal(result, expected) + assert result.name == datetime_series.name + + def test_round_numpy(self): + # See GH#12600 + ser = Series([1.53, 1.36, 0.06]) + out = np.round(ser, decimals=0) + expected = Series([2.0, 1.0, 0.0]) + tm.assert_series_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(ser, decimals=0, out=ser) + + def test_round_numpy_with_nan(self): + # See GH#14197 + ser = Series([1.53, np.nan, 0.06]) + with tm.assert_produces_warning(None): + result = ser.round() + expected = Series([2.0, np.nan, 0.0]) + tm.assert_series_equal(result, expected) + + def test_round_builtin(self): + ser = Series([1.123, 2.123, 3.123], index=range(3)) + result = round(ser) + expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3)) + tm.assert_series_equal(result, expected_rounded0) + + decimals = 2 + expected_rounded = Series([1.12, 2.12, 3.12], index=range(3)) + result = round(ser, decimals) + tm.assert_series_equal(result, expected_rounded) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_searchsorted.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_searchsorted.py new file mode 100644 index 0000000..fd6c6f7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_searchsorted.py @@ -0,0 +1,55 @@ +import numpy as np + +from pandas import Series, Timestamp, date_range +import pandas._testing as tm +from pandas.api.types import is_scalar + + +class TestSeriesSearchSorted: + def test_searchsorted(self): + ser = Series([1, 2, 3]) + + result = ser.searchsorted(1, side="left") + assert is_scalar(result) + assert result == 0 + + result = ser.searchsorted(1, side="right") + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted(30) + assert is_scalar(res) + assert res == 2 + + res = ser.searchsorted([30]) + exp = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_numeric_dtypes_vector(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted([91, 2e6]) + exp = np.array([3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_datetime64_scalar(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + val = Timestamp("20120102") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + + def test_searchsorted_datetime64_list(self): + ser = 
Series(date_range("20120101", periods=10, freq="2D")) + vals = [Timestamp("20120102"), Timestamp("20120104")] + res = ser.searchsorted(vals) + exp = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_sorter(self): + # GH8490 + ser = Series([3, 1, 2]) + res = ser.searchsorted([0, 3], sorter=np.argsort(ser)) + exp = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_shift.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_shift.py new file mode 100644 index 0000000..8256e2f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_shift.py @@ -0,0 +1,265 @@ +import numpy as np +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + NaT, + Series, + TimedeltaIndex, + date_range, + offsets, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestShift: + def test_shift(self, datetime_series): + shifted = datetime_series.shift(1) + unshifted = shifted.shift(-1) + + tm.assert_index_equal(shifted.index, datetime_series.index) + tm.assert_index_equal(unshifted.index, datetime_series.index) + tm.assert_numpy_array_equal( + unshifted.dropna().values, datetime_series.values[:-1] + ) + + offset = BDay() + shifted = datetime_series.shift(1, freq=offset) + unshifted = shifted.shift(-1, freq=offset) + + tm.assert_series_equal(unshifted, datetime_series) + + unshifted = datetime_series.shift(0, freq=offset) + tm.assert_series_equal(unshifted, datetime_series) + + shifted = datetime_series.shift(1, freq="B") + unshifted = shifted.shift(-1, freq="B") + + tm.assert_series_equal(unshifted, datetime_series) + + # corner case + unshifted = datetime_series.shift(0) + tm.assert_series_equal(unshifted, datetime_series) + + # Shifting with PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) + + shifted2 = ps.shift(1, "B") + shifted3 = ps.shift(1, BDay()) + tm.assert_series_equal(shifted2, shifted3) + tm.assert_series_equal(ps, shifted2.shift(-1, "B")) + + msg = "Given freq D does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="D") + + # legacy support + shifted4 = ps.shift(1, freq="B") + tm.assert_series_equal(shifted2, shifted4) + + shifted5 = ps.shift(1, freq=BDay()) + tm.assert_series_equal(shifted5, shifted4) + + # 32-bit taking + # GH#8129 + index = date_range("2000-01-01", periods=5) + for dtype in ["int32", "int64"]: + s1 = Series(np.arange(5, dtype=dtype), index=index) + p = s1.iloc[1] + result = s1.shift(periods=p) + expected = Series([np.nan, 0, 1, 2, 3], index=index) + tm.assert_series_equal(result, expected) + + # GH#8260 + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s - s.shift() + + exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + tm.assert_series_equal(result, exp) + + # incompat tz + s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") + msg = "DatetimeArray subtraction must have the same timezones or no timezones" + with pytest.raises(TypeError, match=msg): + s - s2 + + def test_shift2(self): + ts = Series( + np.random.randn(5), 
index=date_range("1/1/2000", periods=5, freq="H") + ) + + result = ts.shift(1, freq="5T") + exp_index = ts.index.shift(1, freq="5T") + tm.assert_index_equal(result.index, exp_index) + + # GH#1063, multiple of same base + result = ts.shift(1, freq="4H") + exp_index = ts.index + offsets.Hour(4) + tm.assert_index_equal(result.index, exp_index) + + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) + msg = "Cannot shift with no freq" + with pytest.raises(NullFrequencyError, match=msg): + idx.shift(1) + + def test_shift_fill_value(self): + # GH#24128 + ts = Series( + [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + + exp = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + # check that fill value works + result = ts.shift(1, fill_value=0.0) + tm.assert_series_equal(result, exp) + + exp = Series( + [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = ts.shift(2, fill_value=0.0) + tm.assert_series_equal(result, exp) + + ts = pd.Series([1, 2, 3]) + res = ts.shift(2, fill_value=0) + assert res.dtype == ts.dtype + + def test_shift_categorical_fill_value(self): + ts = pd.Series(["a", "b", "c", "d"], dtype="category") + res = ts.shift(1, fill_value="a") + expected = pd.Series( + pd.Categorical( + ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + ) + tm.assert_equal(res, expected) + + # check for incorrect fill_value + msg = "'fill_value=f' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + ts.shift(1, fill_value="f") + + def test_shift_dst(self): + # GH#13926 + dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + s = Series(dates) + + res = s.shift(0) + tm.assert_series_equal(res, s) + assert res.dtype == "datetime64[ns, US/Eastern]" + + res = s.shift(1) + exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + res = s.shift(-2) + exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + for ex in [10, -10, 20, -20]: + res = s.shift(ex) + exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + def test_tshift(self, datetime_series): + # PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_series_equal(unshifted, ps) + + shifted2 = ps.tshift(freq="B") + tm.assert_series_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=BDay()) + tm.assert_series_equal(shifted, shifted3) + + msg = "Given freq M does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.tshift(freq="M") + + # DatetimeIndex + shifted = datetime_series.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_series_equal(datetime_series, unshifted) + + shifted2 = datetime_series.tshift(freq=datetime_series.index.freq) + tm.assert_series_equal(shifted, shifted2) + + inferred_ts = Series( + datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" + ) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + tm.assert_series_equal(shifted, datetime_series.tshift(1)) + tm.assert_series_equal(unshifted, inferred_ts) + + no_freq = datetime_series[[0, 5, 7]] + msg = "Freq was not 
given and was not set in the index" + with pytest.raises(ValueError, match=msg): + no_freq.tshift() + + def test_shift_int(self, datetime_series): + ts = datetime_series.astype(int) + shifted = ts.shift(1) + expected = ts.astype(float).shift(1) + tm.assert_series_equal(shifted, expected) + + def test_shift_object_non_scalar_fill(self): + # shift requires scalar fill_value except for object dtype + ser = Series(range(3)) + with pytest.raises(ValueError, match="fill_value must be a scalar"): + ser.shift(1, fill_value=[]) + + df = ser.to_frame() + with pytest.raises(ValueError, match="fill_value must be a scalar"): + df.shift(1, fill_value=np.arange(3)) + + obj_ser = ser.astype(object) + result = obj_ser.shift(1, fill_value={}) + assert result[0] == {} + + obj_df = obj_ser.to_frame() + result = obj_df.shift(1, fill_value={}) + assert result.iloc[0, 0] == {} + + def test_shift_categorical(self): + # GH#9416 + s = pd.Series(["a", "b", "c", "d"], dtype="category") + + tm.assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) + + sp1 = s.shift(1) + tm.assert_index_equal(s.index, sp1.index) + assert np.all(sp1.values.codes[:1] == -1) + assert np.all(s.values.codes[:-1] == sp1.values.codes[1:]) + + sn2 = s.shift(-2) + tm.assert_index_equal(s.index, sn2.index) + assert np.all(sn2.values.codes[-2:] == -1) + assert np.all(s.values.codes[2:] == sn2.values.codes[:-2]) + + tm.assert_index_equal(s.values.categories, sp1.values.categories) + tm.assert_index_equal(s.values.categories, sn2.values.categories) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_index.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_index.py new file mode 100644 index 0000000..6fa4eea --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_index.py @@ -0,0 +1,168 @@ +import random + +import numpy as np +import pytest + +from pandas import IntervalIndex, MultiIndex, Series +import pandas._testing as tm + + +class TestSeriesSortIndex: + def test_sort_index(self, datetime_series): + rindex = list(datetime_series.index) + random.shuffle(rindex) + + random_order = datetime_series.reindex(rindex) + sorted_series = random_order.sort_index() + tm.assert_series_equal(sorted_series, datetime_series) + + # descending + sorted_series = random_order.sort_index(ascending=False) + tm.assert_series_equal( + sorted_series, datetime_series.reindex(datetime_series.index[::-1]) + ) + + # compat on level + sorted_series = random_order.sort_index(level=0) + tm.assert_series_equal(sorted_series, datetime_series) + + # compat on axis + sorted_series = random_order.sort_index(axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + msg = "No axis named 1 for object type " + with pytest.raises(ValueError, match=msg): + random_order.sort_values(axis=1) + + sorted_series = random_order.sort_index(level=0, axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + with pytest.raises(ValueError, match=msg): + random_order.sort_index(level=0, axis=1) + + def test_sort_index_inplace(self, datetime_series): + + # For GH#11402 + rindex = list(datetime_series.index) + random.shuffle(rindex) + + # descending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=False, inplace=True) + + assert result is None + tm.assert_series_equal( + random_order, datetime_series.reindex(datetime_series.index[::-1]) + ) + + # ascending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=True, 
inplace=True) + + assert result is None + tm.assert_series_equal(random_order, datetime_series) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + res = s.sort_index(level="A") + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level=["A", "B"]) + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level="A", sort_remaining=False) + tm.assert_series_equal(s, res) + + res = s.sort_index(level=["A", "B"], sort_remaining=False) + tm.assert_series_equal(s, res) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + # implicit sort_remaining=True + res = s.sort_index(level=level) + tm.assert_series_equal(backwards, res) + + # GH#13496 + # sort has no effect without remaining lvls + res = s.sort_index(level=level, sort_remaining=False) + tm.assert_series_equal(s, res) + + def test_sort_index_kind(self): + # GH#14444 & GH#13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(kind="mergesort") + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort") + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort") + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position(self): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first") + tm.assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last") + tm.assert_series_equal(expected_series_last, index_sorted_series) + + def test_sort_index_intervals(self): + s = Series( + [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) + ) + + result = s.sort_index() + expected = s + tm.assert_series_equal(result, expected) + + result = s.sort_index(ascending=False) + expected = Series( + [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ascending, ignore_index, output_index", + [ + ([2, 3, 6, 1], [2, 3, 6, 1], True, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [2, 3, 6, 1], True, False, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, False, [3, 2, 1, 0]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_list, sorted_list, ascending, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_ser = ser.copy() + result_ser.sort_index(**kwargs) + else: + result_ser = ser.sort_index(**kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, 
Series(original_list)) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_values.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_values.py new file mode 100644 index 0000000..caa2abd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_sort_values.py @@ -0,0 +1,183 @@ +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, Series +import pandas._testing as tm + + +class TestSeriesSortValues: + def test_sort_values(self, datetime_series): + + # check indexes are reordered corresponding with the values + ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) + expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"]) + result = ser.sort_values() + tm.assert_series_equal(expected, result) + + ts = datetime_series.copy() + ts[:5] = np.NaN + vals = ts.values + + result = ts.sort_values() + assert np.isnan(result[-5:]).all() + tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) + + # na_position + result = ts.sort_values(na_position="first") + assert np.isnan(result[:5]).all() + tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) + + # something object-type + ser = Series(["A", "B"], [1, 2]) + # no failure + ser.sort_values() + + # ascending=False + ordered = ts.sort_values(ascending=False) + expected = np.sort(ts.dropna().values)[::-1] + tm.assert_almost_equal(expected, ordered.dropna().values) + ordered = ts.sort_values(ascending=False, na_position="first") + tm.assert_almost_equal(expected, ordered.dropna().values) + + # ascending=[False] should behave the same as ascending=False + ordered = ts.sort_values(ascending=[False]) + expected = ts.sort_values(ascending=False) + tm.assert_series_equal(expected, ordered) + ordered = ts.sort_values(ascending=[False], na_position="first") + expected = ts.sort_values(ascending=False, na_position="first") + tm.assert_series_equal(expected, ordered) + + msg = "ascending must be boolean" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=None) + msg = r"Length of ascending \(0\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[]) + msg = r"Length of ascending \(3\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[1, 2, 3]) + msg = r"Length of ascending \(2\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[False, False]) + msg = "ascending must be boolean" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending="foobar") + + # inplace=True + ts = datetime_series.copy() + ts.sort_values(ascending=False, inplace=True) + tm.assert_series_equal(ts, datetime_series.sort_values(ascending=False)) + tm.assert_index_equal( + ts.index, datetime_series.sort_values(ascending=False).index + ) + + # GH#5856/5853 + # Series.sort_values operating on a view + df = DataFrame(np.random.randn(10, 4)) + s = df.iloc[:, 0] + + msg = ( + "This Series is a view of some other array, to sort in-place " + "you must create a copy" + ) + with pytest.raises(ValueError, match=msg): + s.sort_values(inplace=True) + + def test_sort_values_categorical(self): + + c = Categorical(["a", "b", "b", "a"], ordered=False) + cat = Series(c.copy()) + + # sort in the categories order + expected = Series( + Categorical(["a", "a", "b", "b"], ordered=False), index=[0, 3, 1, 2] + ) + result = cat.sort_values() + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "c", "b", "d"], ordered=True)) + res = 
cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + cat = Series( + Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + ) + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + res = cat.sort_values(ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + raw_cat1 = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + raw_cat2 = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + s = ["a", "b", "c", "d"] + df = DataFrame( + {"unsort": raw_cat1, "sort": raw_cat2, "string": s, "values": [1, 2, 3, 4]} + ) + + # Cats must be sorted in a dataframe + res = df.sort_values(by=["string"], ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp) + assert res["sort"].dtype == "category" + + res = df.sort_values(by=["sort"], ascending=False) + exp = df.sort_values(by=["string"], ascending=True) + tm.assert_series_equal(res["values"], exp["values"]) + assert res["sort"].dtype == "category" + assert res["unsort"].dtype == "category" + + # unordered cat, but we allow this + df.sort_values(by=["unsort"], ascending=False) + + # multi-columns sort + # GH#7848 + df = DataFrame( + {"id": [6, 5, 4, 3, 2, 1], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + df["grade"] = Categorical(df["raw_grade"], ordered=True) + df["grade"] = df["grade"].cat.set_categories(["b", "e", "a"]) + + # sorts 'grade' according to the order of the categories + result = df.sort_values(by=["grade"]) + expected = df.iloc[[1, 2, 5, 0, 3, 4]] + tm.assert_frame_equal(result, expected) + + # multi + result = df.sort_values(by=["grade", "id"]) + expected = df.iloc[[2, 1, 5, 4, 3, 0]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ignore_index, output_index", + [ + ([2, 3, 6, 1], [6, 3, 2, 1], True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [6, 3, 2, 1], False, [2, 1, 0, 3]), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_list, sorted_list, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_ser = ser.copy() + result_ser.sort_values(ascending=False, **kwargs) + else: + result_ser = ser.sort_values(ascending=False, **kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_to_dict.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_to_dict.py new file mode 100644 index 0000000..2fbf3e8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_to_dict.py @@ -0,0 +1,20 @@ +import collections + +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesToDict: + @pytest.mark.parametrize( + "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) + ) + def test_to_dict(self, mapping, datetime_series): + # GH#16122 + tm.assert_series_equal( + Series(datetime_series.to_dict(mapping), name="ts"), datetime_series + ) + from_method = 
Series(datetime_series.to_dict(collections.Counter)) + from_constructor = Series(collections.Counter(datetime_series.items())) + tm.assert_series_equal(from_method, from_constructor) diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_truncate.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_truncate.py new file mode 100644 index 0000000..d4e2890 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_truncate.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestTruncate: + def test_truncate(self, datetime_series): + offset = BDay() + + ts = datetime_series[::3] + + start, end = datetime_series.index[3], datetime_series.index[6] + start_missing, end_missing = datetime_series.index[2], datetime_series.index[7] + + # neither specified + truncated = ts.truncate() + tm.assert_series_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + tm.assert_series_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + tm.assert_series_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + tm.assert_series_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + tm.assert_series_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + tm.assert_series_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + tm.assert_series_equal(truncated, expected) + + # corner case, empty series returned + truncated = ts.truncate(after=datetime_series.index[0] - offset) + assert len(truncated) == 0 + + truncated = ts.truncate(before=datetime_series.index[-1] + offset) + assert len(truncated) == 0 + + msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" + with pytest.raises(ValueError, match=msg): + ts.truncate( + before=datetime_series.index[-1] + offset, + after=datetime_series.index[0] - offset, + ) + + def test_truncate_nonsortedindex(self): + # GH#17935 + + s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" + + with pytest.raises(ValueError, match=msg): + s.truncate(before=3, after=9) + + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") + ts = pd.Series(np.random.randn(len(rng)), index=rng) + msg = "truncate requires a sorted index" + + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") diff --git a/venv/Lib/site-packages/pandas/tests/series/methods/test_value_counts.py b/venv/Lib/site-packages/pandas/tests/series/methods/test_value_counts.py new file mode 100644 index 0000000..fdb35be --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/methods/test_value_counts.py @@ -0,0 +1,179 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, CategoricalIndex, Series +import pandas._testing as tm + + +class TestSeriesValueCounts: + def test_value_counts_datetime(self): + # most dtypes are tested in tests/base + values = [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 11:00"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ) + exp = 
pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.DatetimeIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_datetime_tz(self): + values = [ + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + tz="US/Eastern", + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + idx = pd.DatetimeIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_period(self): + values = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.PeriodIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_ordered(self): + # most dtypes are tested in tests/base + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_not_ordered(self): + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + 
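# Editor's note (illustrative, not part of the original patch): normalize=True
# divides each count by the total number of observations, so counts [3, 2, 1]
# over six values become [0.5, 0.333..., 0.166...], e.g.:
#
#     import pandas as pd
#     pd.Series([1, 1, 1, 3, 3, 2]).value_counts(normalize=True)
#     # 1    0.500000
#     # 3    0.333333
#     # 2    0.166667
#     # dtype: float64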
exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical(self): + # GH#12835 + cats = Categorical(list("abcccb"), categories=list("cabd")) + ser = Series(cats, name="xxx") + res = ser.value_counts(sort=False) + + exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(sort=True) + + exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + # check object dtype handles the Series.name as the same + # (tested in tests/base) + ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") + res = ser.value_counts() + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + tm.assert_series_equal(res, exp) + + def test_value_counts_categorical_with_nan(self): + # see GH#9443 + + # sanity check + ser = Series(["a", "b", "a"], dtype="category") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # same Series via two different constructions --> same behaviour + series = [ + Series(["a", "b", None, "a", None, None], dtype="category"), + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), + ] + + for ser in series: + # None is a NaN value, so we exclude its count here + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # we don't exclude the count of None and sort by counts + exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + res = ser.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. 
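# Editor's sketch (illustrative, not part of the original patch): with
# dropna=False the missing values are counted as well, and with sort=False the
# result follows the category order with the NaN bucket appended last, matching
# the expectation below, e.g.:
#
#     import pandas as pd
#     s = pd.Series(["a", "b", None, "a", None, None], dtype="category")
#     s.value_counts(dropna=False, sort=False)
#     # a      2
#     # b      1
#     # NaN    3
#     # dtype: int64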
+ exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + res = ser.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_alter_axes.py b/venv/Lib/site-packages/pandas/tests/series/test_alter_axes.py new file mode 100644 index 0000000..628c665 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_alter_axes.py @@ -0,0 +1,352 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series +import pandas._testing as tm + + +class TestSeriesAlterAxes: + def test_setindex(self, string_series): + # wrong type + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed" + ) + with pytest.raises(TypeError, match=msg): + string_series.index = None + + # wrong length + msg = ( + "Length mismatch: Expected axis has 30 elements, " + "new values have 29 elements" + ) + with pytest.raises(ValueError, match=msg): + string_series.index = np.arange(len(string_series) - 1) + + # works + string_series.index = np.arange(len(string_series)) + assert isinstance(string_series.index, Index) + + # Renaming + + def test_rename(self, datetime_series): + ts = datetime_series + renamer = lambda x: x.strftime("%Y%m%d") + renamed = ts.rename(renamer) + assert renamed.index[0] == renamer(ts.index[0]) + + # dict + rename_dict = dict(zip(ts.index, renamed.index)) + renamed2 = ts.rename(rename_dict) + tm.assert_series_equal(renamed, renamed2) + + # partial dict + s = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") + renamed = s.rename({"b": "foo", "d": "bar"}) + tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"])) + + # index with name + renamer = Series( + np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64" + ) + renamed = renamer.rename({}) + assert renamed.index.name == renamer.index.name + + def test_rename_by_series(self): + s = Series(range(5), name="foo") + renamer = Series({1: 10, 2: 20}) + result = s.rename(renamer) + expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") + tm.assert_series_equal(result, expected) + + def test_rename_set_name(self): + s = Series(range(4), index=list("abcd")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: + result = s.rename(name) + assert result.name == name + tm.assert_numpy_array_equal(result.index.values, s.index.values) + assert s.name is None + + def test_rename_set_name_inplace(self): + s = Series(range(3), index=list("abc")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: + s.rename(name, inplace=True) + assert s.name == name + + exp = np.array(["a", "b", "c"], dtype=np.object_) + tm.assert_numpy_array_equal(s.index.values, exp) + + def test_rename_axis_supported(self): + # Supporting axis for compatibility, detailed in GH-18589 + s = Series(range(5)) + s.rename({}, axis=0) + s.rename({}, axis="index") + # TODO: clean up shared index validation + # with pytest.raises(ValueError, match="No axis named 5"): + # s.rename({}, axis=5) + + def test_set_name_attribute(self): + s = Series([1, 2, 3]) + s2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: + s.name = name + assert s.name == name + s2.name = name + assert s2.name == name + + def test_set_name(self): + s = Series([1, 2, 3]) + s2 = s._set_name("foo") + assert s2.name == "foo" + assert s.name is None + assert s is not s2 + + def 
test_rename_inplace(self, datetime_series): + renamer = lambda x: x.strftime("%Y%m%d") + expected = renamer(datetime_series.index[0]) + + datetime_series.rename(renamer, inplace=True) + assert datetime_series.index[0] == expected + + def test_set_index_makes_timeseries(self): + idx = tm.makeDateIndex(10) + + s = Series(range(10)) + s.index = idx + assert s.index.is_all_dates + + def test_reset_index(self): + df = tm.makeDataFrame()[:5] + ser = df.stack() + ser.index.names = ["hash", "category"] + + ser.name = "value" + df = ser.reset_index() + assert "value" in df + + df = ser.reset_index(name="value2") + assert "value2" in df + + # check inplace + s = ser.reset_index(drop=True) + s2 = ser + s2.reset_index(drop=True, inplace=True) + tm.assert_series_equal(s, s2) + + # level + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.randn(6), index=index) + rs = s.reset_index(level=1) + assert len(rs.columns) == 2 + + rs = s.reset_index(level=[0, 2], drop=True) + tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) + assert isinstance(rs, Series) + + def test_reset_index_name(self): + s = Series([1, 2, 3], index=Index(range(3), name="x")) + assert s.reset_index().index.name is None + assert s.reset_index(drop=True).index.name is None + + def test_reset_index_level(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + + for levels in ["A", "B"], [0, 1]: + # With MultiIndex + s = df.set_index(["A", "B"])["C"] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = s.reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C"]]) + + with pytest.raises(KeyError, match="Level E "): + s.reset_index(level=["A", "E"]) + + # With single-level Index + s = df.set_index("A")["B"] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df[["A", "B"]]) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df[["A", "B"]]) + + result = s.reset_index(level=levels[0], drop=True) + tm.assert_series_equal(result, df["B"]) + + with pytest.raises(IndexError, match="Too many levels"): + s.reset_index(level=[0, 1, 2]) + + # Check that .reset_index([],drop=True) doesn't fail + result = Series(range(4)).reset_index([], drop=True) + expected = Series(range(4)) + tm.assert_series_equal(result, expected) + + def test_reset_index_range(self): + # GH 12071 + s = Series(range(2), name="A", dtype="int64") + series_result = s.reset_index() + assert isinstance(series_result.index, RangeIndex) + series_expected = DataFrame( + [[0, 0], [1, 1]], columns=["index", "A"], index=RangeIndex(stop=2) + ) + tm.assert_frame_equal(series_result, series_expected) + + def test_reorder_levels(self): + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + s = Series(np.arange(6), index=index) + + # no change, position + result = s.reorder_levels([0, 1, 2]) + tm.assert_series_equal(s, result) + + # no change, labels + result = s.reorder_levels(["L0", "L1", "L2"]) + tm.assert_series_equal(s, result) + + # rotate, position + result = s.reorder_levels([1, 2, 0]) + e_idx = 
MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = Series(np.arange(6), index=e_idx) + tm.assert_series_equal(result, expected) + + def test_rename_axis_mapper(self): + # GH 19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + s = Series(list(range(len(mi))), index=mi) + + result = s.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + result = s.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + result = s.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + with pytest.raises(TypeError, match="unexpected"): + s.rename_axis(columns="wrong") + + def test_rename_axis_inplace(self, datetime_series): + # GH 15704 + expected = datetime_series.rename_axis("foo") + result = datetime_series + no_return = result.rename_axis("foo", inplace=True) + + assert no_return is None + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) + def test_rename_axis_none(self, kwargs): + # GH 25034 + index = Index(list("abc"), name="foo") + df = Series([1, 2, 3], index=index) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if kwargs else index + expected = Series([1, 2, 3], index=expected_index) + tm.assert_series_equal(result, expected) + + def test_rename_with_custom_indexer(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]).rename(ix) + assert s.name is ix + + def test_rename_with_custom_indexer_inplace(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]) + s.rename(ix, inplace=True) + assert s.name is ix + + def test_set_axis_inplace_axes(self, axis_series): + # GH14636 + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + + expected = ser.copy() + expected.index = list("abcd") + + # inplace=True + # The FutureWarning comes from the fact that we would like to have + # inplace default to False some day + result = ser.copy() + result.set_axis(list("abcd"), axis=axis_series, inplace=True) + tm.assert_series_equal(result, expected) + + def test_set_axis_inplace(self): + # GH14636 + + s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + + expected = s.copy() + expected.index = list("abcd") + + # inplace=False + result = s.set_axis(list("abcd"), axis=0, inplace=False) + tm.assert_series_equal(expected, result) + + # omitting the "axis" parameter + with tm.assert_produces_warning(None): + result = s.set_axis(list("abcd"), inplace=False) + tm.assert_series_equal(result, expected) + + # wrong values for the "axis" parameter + for axis in [2, "foo"]: + with pytest.raises(ValueError, match="No axis named"): + s.set_axis(list("abcd"), axis=axis, inplace=False) + + def test_reset_index_drop_errors(self): + # GH 20925 + + # KeyError raised for series index when passed level name is missing + s = Series(range(4)) + with pytest.raises(KeyError, match="does not match index name"): + s.reset_index("wrong", drop=True) + with pytest.raises(KeyError, match="does not match index name"): + s.reset_index("wrong") + + # KeyError raised for series when level to be dropped is missing + s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2)) + with pytest.raises(KeyError, match="not found"): + s.reset_index("wrong", drop=True) + + def test_droplevel(self): + # GH20342 + ser = 
Series([1, 2, 3, 4]) + ser.index = MultiIndex.from_arrays( + [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] + ) + expected = ser.reset_index("b", drop=True) + result = ser.droplevel("b", axis="index") + tm.assert_series_equal(result, expected) + # test that droplevel raises ValueError on axis != 0 + with pytest.raises(ValueError): + ser.droplevel(1, axis="columns") diff --git a/venv/Lib/site-packages/pandas/tests/series/test_analytics.py b/venv/Lib/site-packages/pandas/tests/series/test_analytics.py new file mode 100644 index 0000000..c29bd3e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_analytics.py @@ -0,0 +1,270 @@ +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestSeriesAnalytics: + def test_prod_numpy16_bug(self): + s = Series([1.0, 1.0, 1.0], index=range(3)) + result = s.prod() + + assert not isinstance(result, Series) + + def test_dot(self): + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T + + result = a.dot(b) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # Check index alignment + b2 = b.reindex(index=reversed(b.index)) + result = a.dot(b) + tm.assert_series_equal(result, expected) + + # Check ndarray argument + result = a.dot(b.values) + assert np.all(result == expected.values) + tm.assert_almost_equal(a.dot(b["2"].values), expected["2"]) + + # Check series argument + tm.assert_almost_equal(a.dot(b["1"]), expected["1"]) + tm.assert_almost_equal(a.dot(b2["1"]), expected["1"]) + + msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + a.dot(a.values[:3]) + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + a.dot(b.T) + + def test_matmul(self): + # matmul test is for GH #10259 + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T + + # Series @ DataFrame -> Series + result = operator.matmul(a, b) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # DataFrame @ Series -> Series + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # Series @ Series -> scalar + result = operator.matmul(a, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH 21530 + # vector (1D np.array) @ Series (__rmatmul__) + result = operator.matmul(a.values, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH 21530 + # vector (1D list) @ Series (__rmatmul__) + result = operator.matmul(a.values.tolist(), a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH 21530 + # matrix (2D np.array) @ Series (__rmatmul__) + result = operator.matmul(b.T.values, a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH 21530 + # matrix (2D nested lists) @ Series (__rmatmul__) + result = operator.matmul(b.T.values.tolist(), a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # 
mixed dtype DataFrame @ Series + a["p"] = int(a.p) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # different dtypes DataFrame @ Series + a = a.astype(int) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + a.dot(a.values[:3]) + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + a.dot(b.T) + + def test_ptp(self): + # GH21614 + N = 1000 + arr = np.random.randn(N) + ser = Series(arr) + assert np.ptp(ser) == np.ptp(arr) + + def test_repeat(self): + s = Series(np.random.randn(3), index=["a", "b", "c"]) + + reps = s.repeat(5) + exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) + tm.assert_series_equal(reps, exp) + + to_rep = [2, 3, 4] + reps = s.repeat(to_rep) + exp = Series(s.values.repeat(to_rep), index=s.index.values.repeat(to_rep)) + tm.assert_series_equal(reps, exp) + + def test_numpy_repeat(self): + s = Series(np.arange(3), name="x") + expected = Series(s.values.repeat(2), name="x", index=s.index.values.repeat(2)) + tm.assert_series_equal(np.repeat(s, 2), expected) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.repeat(s, 2, axis=0) + + def test_is_monotonic(self): + + s = Series(np.random.randint(0, 10, size=1000)) + assert not s.is_monotonic + s = Series(np.arange(1000)) + assert s.is_monotonic is True + assert s.is_monotonic_increasing is True + s = Series(np.arange(1000, 0, -1)) + assert s.is_monotonic_decreasing is True + + s = Series(pd.date_range("20130101", periods=10)) + assert s.is_monotonic is True + assert s.is_monotonic_increasing is True + s = Series(list(reversed(s.tolist()))) + assert s.is_monotonic is False + assert s.is_monotonic_decreasing is True + + def test_unstack(self): + + index = MultiIndex( + levels=[["bar", "foo"], ["one", "three", "two"]], + codes=[[1, 1, 0, 0], [0, 1, 0, 2]], + ) + + s = Series(np.arange(4.0), index=index) + unstacked = s.unstack() + + expected = DataFrame( + [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], + index=["bar", "foo"], + columns=["one", "three", "two"], + ) + + tm.assert_frame_equal(unstacked, expected) + + unstacked = s.unstack(level=0) + tm.assert_frame_equal(unstacked, expected.T) + + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.randn(6), index=index) + exp_index = MultiIndex( + levels=[["one", "two", "three"], [0, 1]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) + unstacked = s.unstack(0).sort_index() + tm.assert_frame_equal(unstacked, expected) + + # GH5873 + idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) + ts = pd.Series([1, 2], index=idx) + left = ts.unstack() + right = DataFrame( + [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] + ) + tm.assert_frame_equal(left, right) + + idx = pd.MultiIndex.from_arrays( + [ + ["cat", "cat", "cat", "dog", "dog"], + ["a", "a", "b", "a", "b"], + [1, 2, 1, 1, np.nan], + ] + ) + ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) + right = DataFrame( + [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], 
+ columns=["cat", "dog"], + ) + tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] + right.index = pd.MultiIndex.from_tuples(tpls) + tm.assert_frame_equal(ts.unstack(level=0), right) + + @pytest.mark.parametrize("func", [np.any, np.all]) + @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) + @td.skip_if_np_lt("1.15") + def test_validate_any_all_out_keepdims_raises(self, kwargs, func): + s = pd.Series([1, 2]) + param = list(kwargs)[0] + name = func.__name__ + + msg = ( + r"the '{arg}' parameter is not " + r"supported in the pandas " + r"implementation of {fname}\(\)" + ).format(arg=param, fname=name) + with pytest.raises(ValueError, match=msg): + func(s, **kwargs) + + @td.skip_if_np_lt("1.15") + def test_validate_sum_initial(self): + s = pd.Series([1, 2]) + msg = ( + r"the 'initial' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) + with pytest.raises(ValueError, match=msg): + np.sum(s, initial=10) + + def test_validate_median_initial(self): + s = pd.Series([1, 2]) + msg = ( + r"the 'overwrite_input' parameter is not " + r"supported in the pandas " + r"implementation of median\(\)" + ) + with pytest.raises(ValueError, match=msg): + # It seems like np.median doesn't dispatch, so we use the + # method instead of the ufunc. + s.median(overwrite_input=True) + + @td.skip_if_np_lt("1.15") + def test_validate_stat_keepdims(self): + s = pd.Series([1, 2]) + msg = ( + r"the 'keepdims' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) + with pytest.raises(ValueError, match=msg): + np.sum(s, keepdims=True) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_api.py b/venv/Lib/site-packages/pandas/tests/series/test_api.py new file mode 100644 index 0000000..f96d6dd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_api.py @@ -0,0 +1,736 @@ +from collections import OrderedDict +import pydoc +import warnings + +import numpy as np +import pytest + +from pandas.util._test_decorators import async_mark + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import PeriodArray + +import pandas.io.formats.printing as printing + + +class TestSeriesMisc: + def test_scalarop_preserve_name(self, datetime_series): + result = datetime_series * 2 + assert result.name == datetime_series.name + + def test_copy_name(self, datetime_series): + result = datetime_series.copy() + assert result.name == datetime_series.name + + def test_copy_index_name_checking(self, datetime_series): + # don't want to be able to modify the index stored elsewhere after + # making a copy + + datetime_series.index.name = None + assert datetime_series.index.name is None + assert datetime_series is datetime_series + + cp = datetime_series.copy() + cp.index.name = "foo" + printing.pprint_thing(datetime_series.index.name) + assert datetime_series.index.name is None + + def test_append_preserve_name(self, datetime_series): + result = datetime_series[:5].append(datetime_series[5:]) + assert result.name == datetime_series.name + + def test_binop_maybe_preserve_name(self, datetime_series): + # names match, preserve + result = datetime_series * datetime_series + assert result.name == datetime_series.name + result = datetime_series.mul(datetime_series) + assert result.name == datetime_series.name + + result = datetime_series * 
datetime_series[:-2] + assert result.name == datetime_series.name + + # names don't match, don't preserve + cp = datetime_series.copy() + cp.name = "something else" + result = datetime_series + cp + assert result.name is None + result = datetime_series.add(cp) + assert result.name is None + + ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] + ops = ops + ["r" + op for op in ops] + for op in ops: + # names match, preserve + s = datetime_series.copy() + result = getattr(s, op)(s) + assert result.name == datetime_series.name + + # names don't match, don't preserve + cp = datetime_series.copy() + cp.name = "changed" + result = getattr(s, op)(cp) + assert result.name is None + + def test_combine_first_name(self, datetime_series): + result = datetime_series.combine_first(datetime_series[:5]) + assert result.name == datetime_series.name + + def test_getitem_preserve_name(self, datetime_series): + result = datetime_series[datetime_series > 0] + assert result.name == datetime_series.name + + result = datetime_series[[0, 2, 4]] + assert result.name == datetime_series.name + + result = datetime_series[5:10] + assert result.name == datetime_series.name + + def test_pickle_datetimes(self, datetime_series): + unp_ts = self._pickle_roundtrip(datetime_series) + tm.assert_series_equal(unp_ts, datetime_series) + + def test_pickle_strings(self, string_series): + unp_series = self._pickle_roundtrip(string_series) + tm.assert_series_equal(unp_series, string_series) + + def _pickle_roundtrip(self, obj): + + with tm.ensure_clean() as path: + obj.to_pickle(path) + unpickled = pd.read_pickle(path) + return unpickled + + def test_sort_index_name(self, datetime_series): + result = datetime_series.sort_index(ascending=False) + assert result.name == datetime_series.name + + def test_constructor_dict(self): + d = {"a": 0.0, "b": 1.0, "c": 2.0} + result = Series(d) + expected = Series(d, index=sorted(d.keys())) + tm.assert_series_equal(result, expected) + + result = Series(d, index=["b", "c", "d", "a"]) + expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) + tm.assert_series_equal(result, expected) + + def test_constructor_subclass_dict(self, dict_subclass): + data = dict_subclass((x, 10.0 * x) for x in range(10)) + series = Series(data) + expected = Series(dict(data.items())) + tm.assert_series_equal(series, expected) + + def test_constructor_ordereddict(self): + # GH3283 + data = OrderedDict( + ("col{i}".format(i=i), np.random.random()) for i in range(12) + ) + + series = Series(data) + expected = Series(list(data.values()), list(data.keys())) + tm.assert_series_equal(series, expected) + + # Test with subclass + class A(OrderedDict): + pass + + series = Series(A(data)) + tm.assert_series_equal(series, expected) + + def test_constructor_dict_multiindex(self): + d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} + _d = sorted(d.items()) + result = Series(d) + expected = Series( + [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) + ) + tm.assert_series_equal(result, expected) + + d["z"] = 111.0 + _d.insert(0, ("z", d["z"])) + result = Series(d) + expected = Series( + [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False) + ) + result = result.reindex(index=expected.index) + tm.assert_series_equal(result, expected) + + def test_constructor_dict_timedelta_index(self): + # GH #12169 : Resample category data with timedelta index + # construct Series from dict as data and TimedeltaIndex as index + # will result NaN in result Series data + 
expected = Series( + data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") + ) + + result = Series( + data={ + pd.to_timedelta(0, unit="s"): "A", + pd.to_timedelta(10, unit="s"): "B", + pd.to_timedelta(20, unit="s"): "C", + }, + index=pd.to_timedelta([0, 10, 20], unit="s"), + ) + tm.assert_series_equal(result, expected) + + def test_sparse_accessor_updates_on_inplace(self): + s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") + s.drop([0, 1], inplace=True) + assert s.sparse.density == 1.0 + + def test_tab_completion(self): + # GH 9910 + s = Series(list("abcd")) + # Series of str values should have .str but not .dt/.cat in __dir__ + assert "str" in dir(s) + assert "dt" not in dir(s) + assert "cat" not in dir(s) + + # similarly for .dt + s = Series(date_range("1/1/2015", periods=5)) + assert "dt" in dir(s) + assert "str" not in dir(s) + assert "cat" not in dir(s) + + # Similarly for .cat, but with the twist that str and dt should be + # there if the categories are of that type first cat and str. + s = Series(list("abbcd"), dtype="category") + assert "cat" in dir(s) + assert "str" in dir(s) # as it is a string categorical + assert "dt" not in dir(s) + + # similar to cat and str + s = Series(date_range("1/1/2015", periods=5)).astype("category") + assert "cat" in dir(s) + assert "str" not in dir(s) + assert "dt" in dir(s) # as it is a datetime categorical + + def test_tab_completion_with_categorical(self): + # test the tab completion display + ok_for_cat = [ + "categories", + "codes", + "ordered", + "set_categories", + "add_categories", + "remove_categories", + "rename_categories", + "reorder_categories", + "remove_unused_categories", + "as_ordered", + "as_unordered", + ] + + def get_dir(s): + results = [r for r in s.cat.__dir__() if not r.startswith("_")] + return sorted(set(results)) + + s = Series(list("aabbcde")).astype("category") + results = get_dir(s) + tm.assert_almost_equal(results, sorted(set(ok_for_cat))) + + @pytest.mark.parametrize( + "index", + [ + tm.makeUnicodeIndex(10), + tm.makeStringIndex(10), + tm.makeCategoricalIndex(10), + Index(["foo", "bar", "baz"] * 2), + tm.makeDateIndex(10), + tm.makePeriodIndex(10), + tm.makeTimedeltaIndex(10), + tm.makeIntIndex(10), + tm.makeUIntIndex(10), + tm.makeIntIndex(10), + tm.makeFloatIndex(10), + Index([True, False]), + Index(["a{}".format(i) for i in range(101)]), + pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")), + pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")), + ], + ) + def test_index_tab_completion(self, index): + # dir contains string-like values of the Index. 
+ s = pd.Series(index=index, dtype=object) + dir_s = dir(s) + for i, x in enumerate(s.index.unique(level=0)): + if i < 100: + assert not isinstance(x, str) or not x.isidentifier() or x in dir_s + else: + assert x not in dir_s + + def test_not_hashable(self): + s_empty = Series(dtype=object) + s = Series([1]) + msg = "'Series' objects are mutable, thus they cannot be hashed" + with pytest.raises(TypeError, match=msg): + hash(s_empty) + with pytest.raises(TypeError, match=msg): + hash(s) + + def test_contains(self, datetime_series): + tm.assert_contains_all(datetime_series.index, datetime_series) + + def test_iter_datetimes(self, datetime_series): + for i, val in enumerate(datetime_series): + assert val == datetime_series[i] + + def test_iter_strings(self, string_series): + for i, val in enumerate(string_series): + assert val == string_series[i] + + def test_keys(self, datetime_series): + # HACK: By doing this in two stages, we avoid 2to3 wrapping the call + # to .keys() in a list() + getkeys = datetime_series.keys + assert getkeys() is datetime_series.index + + def test_values(self, datetime_series): + tm.assert_almost_equal( + datetime_series.values, datetime_series, check_dtype=False + ) + + def test_iteritems_datetimes(self, datetime_series): + for idx, val in datetime_series.iteritems(): + assert val == datetime_series[idx] + + def test_iteritems_strings(self, string_series): + for idx, val in string_series.iteritems(): + assert val == string_series[idx] + + # assert is lazy (generators don't define reverse, lists do) + assert not hasattr(string_series.iteritems(), "reverse") + + def test_items_datetimes(self, datetime_series): + for idx, val in datetime_series.items(): + assert val == datetime_series[idx] + + def test_items_strings(self, string_series): + for idx, val in string_series.items(): + assert val == string_series[idx] + + # assert is lazy (generators don't define reverse, lists do) + assert not hasattr(string_series.items(), "reverse") + + def test_raise_on_info(self): + s = Series(np.random.randn(10)) + msg = "'Series' object has no attribute 'info'" + with pytest.raises(AttributeError, match=msg): + s.info() + + def test_copy(self): + + for deep in [None, False, True]: + s = Series(np.arange(10), dtype="float64") + + # default deep is True + if deep is None: + s2 = s.copy() + else: + s2 = s.copy(deep=deep) + + s2[::2] = np.NaN + + if deep is None or deep is True: + # Did not modify original Series + assert np.isnan(s2[0]) + assert not np.isnan(s[0]) + else: + # we DID modify the original Series + assert np.isnan(s2[0]) + assert np.isnan(s[0]) + + def test_copy_tzaware(self): + # GH#11794 + # copy of tz-aware + expected = Series([Timestamp("2012/01/01", tz="UTC")]) + expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) + + for deep in [None, False, True]: + + s = Series([Timestamp("2012/01/01", tz="UTC")]) + + if deep is None: + s2 = s.copy() + else: + s2 = s.copy(deep=deep) + + s2[0] = pd.Timestamp("1999/01/01", tz="UTC") + + # default deep is True + if deep is None or deep is True: + # Did not modify original Series + tm.assert_series_equal(s2, expected2) + tm.assert_series_equal(s, expected) + else: + # we DID modify the original Series + tm.assert_series_equal(s2, expected2) + tm.assert_series_equal(s, expected2) + + def test_axis_alias(self): + s = Series([1, 2, np.nan]) + tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) + assert s.dropna().sum("rows") == 3 + assert s._get_axis_number("rows") == 0 + assert s._get_axis_name("rows") == "index" + + 
def test_class_axis(self): + # https://github.com/pandas-dev/pandas/issues/18147 + # no exception and no empty docstring + assert pydoc.getdoc(Series.index) + + def test_numpy_unique(self, datetime_series): + # it works! + np.unique(datetime_series) + + def test_item(self): + s = Series([1]) + result = s.item() + assert result == 1 + assert result == s.iloc[0] + assert isinstance(result, int) # i.e. not np.int64 + + ser = Series([0.5], index=[3]) + result = ser.item() + assert isinstance(result, float) + assert result == 0.5 + + ser = Series([1, 2]) + msg = "can only convert an array of size 1" + with pytest.raises(ValueError, match=msg): + ser.item() + + dti = pd.date_range("2016-01-01", periods=2) + with pytest.raises(ValueError, match=msg): + dti.item() + with pytest.raises(ValueError, match=msg): + Series(dti).item() + + val = dti[:1].item() + assert isinstance(val, Timestamp) + val = Series(dti)[:1].item() + assert isinstance(val, Timestamp) + + tdi = dti - dti + with pytest.raises(ValueError, match=msg): + tdi.item() + with pytest.raises(ValueError, match=msg): + Series(tdi).item() + + val = tdi[:1].item() + assert isinstance(val, Timedelta) + val = Series(tdi)[:1].item() + assert isinstance(val, Timedelta) + + # Case where ser[0] would not work + ser = Series(dti, index=[5, 6]) + val = ser[:1].item() + assert val == dti[0] + + def test_ndarray_compat(self): + + # test numpy compat with Series as sub-class of NDFrame + tsdf = DataFrame( + np.random.randn(1000, 3), + columns=["A", "B", "C"], + index=date_range("1/1/2000", periods=1000), + ) + + def f(x): + return x[x.idxmax()] + + result = tsdf.apply(f) + expected = tsdf.max() + tm.assert_series_equal(result, expected) + + # using an ndarray like function + s = Series(np.random.randn(10)) + result = Series(np.ones_like(s)) + expected = Series(1, index=range(10), dtype="float64") + tm.assert_series_equal(result, expected) + + # ravel + s = Series(np.random.randn(10)) + tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) + + def test_str_accessor_updates_on_inplace(self): + s = pd.Series(list("abc")) + s.drop([0], inplace=True) + assert len(s.str.lower()) == 2 + + def test_str_attribute(self): + # GH9068 + methods = ["strip", "rstrip", "lstrip"] + s = Series([" jack", "jill ", " jesse ", "frank"]) + for method in methods: + expected = Series([getattr(str, method)(x) for x in s.values]) + tm.assert_series_equal(getattr(Series.str, method)(s.str), expected) + + # str accessor only valid with string values + s = Series(range(5)) + with pytest.raises(AttributeError, match="only use .str accessor"): + s.str.repeat(2) + + def test_empty_method(self): + s_empty = pd.Series(dtype=object) + assert s_empty.empty + + s2 = pd.Series(index=[1], dtype=object) + for full_series in [pd.Series([1]), s2]: + assert not full_series.empty + + @async_mark() + async def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; s = pd.Series()" + await ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter("ignore"): + list(ip.Completer.completions("s.", 1)) + + def test_integer_series_size(self): + # GH 25580 + s = Series(range(9)) + assert s.size == 9 + s = Series(range(9), dtype="Int64") + assert s.size == 9 + + def test_attrs(self): + s = pd.Series([0, 1], name="abc") + assert s.attrs == {} + s.attrs["version"] = 1 + result = s + 1 + 
assert result.attrs == {"version": 1} + + +class TestCategoricalSeries: + @pytest.mark.parametrize( + "method", + [ + lambda x: x.cat.set_categories([1, 2, 3]), + lambda x: x.cat.reorder_categories([2, 3, 1], ordered=True), + lambda x: x.cat.rename_categories([1, 2, 3]), + lambda x: x.cat.remove_unused_categories(), + lambda x: x.cat.remove_categories([2]), + lambda x: x.cat.add_categories([4]), + lambda x: x.cat.as_ordered(), + lambda x: x.cat.as_unordered(), + ], + ) + def test_getname_categorical_accessor(self, method): + # GH 17509 + s = Series([1, 2, 3], name="A").astype("category") + expected = "A" + result = method(s).name + assert result == expected + + def test_cat_accessor(self): + s = Series(Categorical(["a", "b", np.nan, "a"])) + tm.assert_index_equal(s.cat.categories, Index(["a", "b"])) + assert not s.cat.ordered, False + + exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) + s.cat.set_categories(["b", "a"], inplace=True) + tm.assert_categorical_equal(s.values, exp) + + res = s.cat.set_categories(["b", "a"]) + tm.assert_categorical_equal(res.values, exp) + + s[:] = "a" + s = s.cat.remove_unused_categories() + tm.assert_index_equal(s.cat.categories, Index(["a"])) + + def test_cat_accessor_api(self): + # GH 9322 + from pandas.core.arrays.categorical import CategoricalAccessor + + assert Series.cat is CategoricalAccessor + s = Series(list("aabbcde")).astype("category") + assert isinstance(s.cat, CategoricalAccessor) + + invalid = Series([1]) + with pytest.raises(AttributeError, match="only use .cat accessor"): + invalid.cat + assert not hasattr(invalid, "cat") + + def test_cat_accessor_no_new_attributes(self): + # https://github.com/pandas-dev/pandas/issues/10673 + c = Series(list("aabbcde")).astype("category") + with pytest.raises(AttributeError, match="You cannot add any new attribute"): + c.cat.xlabel = "a" + + def test_cat_accessor_updates_on_inplace(self): + s = Series(list("abc")).astype("category") + s.drop(0, inplace=True) + s.cat.remove_unused_categories(inplace=True) + assert len(s.cat.categories) == 2 + + def test_categorical_delegations(self): + + # invalid accessor + msg = r"Can only use \.cat accessor with a 'category' dtype" + with pytest.raises(AttributeError, match=msg): + Series([1, 2, 3]).cat + with pytest.raises(AttributeError, match=msg): + Series([1, 2, 3]).cat() + with pytest.raises(AttributeError, match=msg): + Series(["a", "b", "c"]).cat + with pytest.raises(AttributeError, match=msg): + Series(np.arange(5.0)).cat + with pytest.raises(AttributeError, match=msg): + Series([Timestamp("20130101")]).cat + + # Series should delegate calls to '.categories', '.codes', '.ordered' + # and the methods '.set_categories()' 'drop_unused_categories()' to the + # categorical + s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) + exp_categories = Index(["a", "b", "c"]) + tm.assert_index_equal(s.cat.categories, exp_categories) + s.cat.categories = [1, 2, 3] + exp_categories = Index([1, 2, 3]) + tm.assert_index_equal(s.cat.categories, exp_categories) + + exp_codes = Series([0, 1, 2, 0], dtype="int8") + tm.assert_series_equal(s.cat.codes, exp_codes) + + assert s.cat.ordered + s = s.cat.as_unordered() + assert not s.cat.ordered + s.cat.as_ordered(inplace=True) + assert s.cat.ordered + + # reorder + s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) + exp_categories = Index(["c", "b", "a"]) + exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) + s = s.cat.set_categories(["c", "b", "a"]) + tm.assert_index_equal(s.cat.categories, 
exp_categories) + tm.assert_numpy_array_equal(s.values.__array__(), exp_values) + tm.assert_numpy_array_equal(s.__array__(), exp_values) + + # remove unused categories + s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"])) + exp_categories = Index(["a", "b"]) + exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) + s = s.cat.remove_unused_categories() + tm.assert_index_equal(s.cat.categories, exp_categories) + tm.assert_numpy_array_equal(s.values.__array__(), exp_values) + tm.assert_numpy_array_equal(s.__array__(), exp_values) + + # This method is likely to be confused, so test that it raises an error + # on wrong inputs: + msg = "'Series' object has no attribute 'set_categories'" + with pytest.raises(AttributeError, match=msg): + s.set_categories([4, 3, 2, 1]) + + # right: s.cat.set_categories([4,3,2,1]) + + # GH18862 (let Series.cat.rename_categories take callables) + s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) + result = s.cat.rename_categories(lambda x: x.upper()) + expected = Series( + Categorical(["A", "B", "C", "A"], categories=["A", "B", "C"], ordered=True) + ) + tm.assert_series_equal(result, expected) + + def test_dt_accessor_api_for_categorical(self): + # https://github.com/pandas-dev/pandas/issues/10661 + from pandas.core.indexes.accessors import Properties + + s_dr = Series(date_range("1/1/2015", periods=5, tz="MET")) + c_dr = s_dr.astype("category") + + s_pr = Series(period_range("1/1/2015", freq="D", periods=5)) + c_pr = s_pr.astype("category") + + s_tdr = Series(timedelta_range("1 days", "10 days")) + c_tdr = s_tdr.astype("category") + + # only testing field (like .day) + # and bool (is_month_start) + get_ops = lambda x: x._datetimelike_ops + + test_data = [ + ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr), + ("Period", get_ops(PeriodArray), s_pr, c_pr), + ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr), + ] + + assert isinstance(c_dr.dt, Properties) + + special_func_defs = [ + ("strftime", ("%Y-%m-%d",), {}), + ("tz_convert", ("EST",), {}), + ("round", ("D",), {}), + ("floor", ("D",), {}), + ("ceil", ("D",), {}), + ("asfreq", ("D",), {}), + # FIXME: don't leave commented-out + # ('tz_localize', ("UTC",), {}), + ] + _special_func_names = [f[0] for f in special_func_defs] + + # the series is already localized + _ignore_names = ["tz_localize", "components"] + + for name, attr_names, s, c in test_data: + func_names = [ + f + for f in dir(s.dt) + if not ( + f.startswith("_") + or f in attr_names + or f in _special_func_names + or f in _ignore_names + ) + ] + + func_defs = [(f, (), {}) for f in func_names] + for f_def in special_func_defs: + if f_def[0] in dir(s.dt): + func_defs.append(f_def) + + for func, args, kwargs in func_defs: + with warnings.catch_warnings(): + if func == "to_period": + # dropping TZ + warnings.simplefilter("ignore", UserWarning) + res = getattr(c.dt, func)(*args, **kwargs) + exp = getattr(s.dt, func)(*args, **kwargs) + + tm.assert_equal(res, exp) + + for attr in attr_names: + res = getattr(c.dt, attr) + exp = getattr(s.dt, attr) + + if isinstance(res, DataFrame): + tm.assert_frame_equal(res, exp) + elif isinstance(res, Series): + tm.assert_series_equal(res, exp) + else: + tm.assert_almost_equal(res, exp) + + invalid = Series([1, 2, 3]).astype("category") + msg = "Can only use .dt accessor with datetimelike" + + with pytest.raises(AttributeError, match=msg): + invalid.dt + assert not hasattr(invalid, "str") diff --git a/venv/Lib/site-packages/pandas/tests/series/test_apply.py 
b/venv/Lib/site-packages/pandas/tests/series/test_apply.py new file mode 100644 index 0000000..a4c55a8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_apply.py @@ -0,0 +1,789 @@ +from collections import Counter, defaultdict +from itertools import chain + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, isna +import pandas._testing as tm +from pandas.conftest import _get_cython_table_params +from pandas.core.base import SpecificationError + + +class TestSeriesApply: + def test_apply(self, datetime_series): + with np.errstate(all="ignore"): + tm.assert_series_equal( + datetime_series.apply(np.sqrt), np.sqrt(datetime_series) + ) + + # element-wise apply + import math + + tm.assert_series_equal( + datetime_series.apply(math.exp), np.exp(datetime_series) + ) + + # empty series + s = Series(dtype=object, name="foo", index=pd.Index([], name="bar")) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + + # check all metadata (GH 9322) + assert s is not rs + assert s.index is rs.index + assert s.dtype == rs.dtype + assert s.name == rs.name + + # index but no data + s = Series(index=[1, 2, 3], dtype=np.float64) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + + def test_apply_same_length_inference_bug(self): + s = Series([1, 2]) + f = lambda x: (x, x + 1) + + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) + + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + tm.assert_series_equal(result, expected) + + def test_apply_dont_convert_dtype(self): + s = Series(np.random.randn(10)) + + f = lambda x: x if x > 0 else np.nan + result = s.apply(f, convert_dtype=False) + assert result.dtype == object + + def test_with_string_args(self, datetime_series): + + for arg in ["sum", "mean", "min", "max", "std"]: + result = datetime_series.apply(arg) + expected = getattr(datetime_series, arg)() + assert result == expected + + def test_apply_args(self): + s = Series(["foo,bar"]) + + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] + assert isinstance(result[0], list) + + def test_series_map_box_timestamps(self): + # GH#2689, GH#2627 + ser = Series(pd.date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + ser.map(func) + ser.apply(func) + + def test_apply_box(self): + # ufunc will not be boxed. 
Same test cases as the test_map_box + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + s = pd.Series(vals) + assert s.dtype == "datetime64[ns]" + # boxed value must be Timestamp instance + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + s = pd.Series(vals) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + s = pd.Series(vals) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + s = pd.Series(vals) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") + exp = pd.Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + def test_apply_datetimetz(self): + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") + + result = s.apply(lambda x: x + pd.offsets.Day()) + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) + + # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 + result = s.apply(lambda x: x.hour) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) + tm.assert_series_equal(result, exp) + + # not vectorized + def f(x): + if not isinstance(x, pd.Timestamp): + raise ValueError + return str(x.tz) + + result = s.map(f) + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) + + def test_apply_dict_depr(self): + + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + tsdf.A.agg({"foo": ["sum", "mean"]}) + + def test_apply_categorical(self): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = pd.Series(values, name="XX", index=list("abcdefg")) + result = ser.apply(lambda x: x.lower()) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = pd.Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == np.object + + @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) + def test_apply_categorical_with_nan_values(self, series): + # GH 20714 bug fixed in: GH 24275 + s = pd.Series(series, dtype="category") + result = s.apply(lambda x: x.split("-")[0]) + result = result.astype(object) + expected = pd.Series(["1", "1", np.NaN], dtype="category") + expected = expected.astype(object) + 
tm.assert_series_equal(result, expected) + + def test_apply_empty_integer_series_with_datetime_index(self): + # GH 21245 + s = pd.Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + result = s.apply(lambda x: x) + tm.assert_series_equal(result, s) + + +class TestSeriesAggregate: + def test_transform(self, string_series): + # transforming functions + + with np.errstate(all="ignore"): + + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + result = string_series.apply(np.sqrt) + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.transform([np.sqrt]) + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt"]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ["sqrt", "absolute"] + result = string_series.apply([np.sqrt, np.abs]) + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt", "abs"]) + expected.columns = ["sqrt", "abs"] + tm.assert_frame_equal(result, expected) + + # dict, provide renaming + expected = pd.concat([f_sqrt, f_abs], axis=1) + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") + + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) + tm.assert_series_equal(result.reindex_like(expected), expected) + + def test_transform_and_agg_error(self, string_series): + # we are trying to transform with an aggregator + with pytest.raises(ValueError): + string_series.transform(["min", "max"]) + + with pytest.raises(ValueError): + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) + + with pytest.raises(ValueError): + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) + + with pytest.raises(ValueError): + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) + + def test_demo(self): + # demonstration tests + s = Series(range(6), dtype="int64", name="series") + + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") + tm.assert_series_equal(result, expected) + + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") + tm.assert_series_equal(result, expected) + + # nested renaming + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) + + def test_multiple_aggregators_with_dict_api(self): + + s = Series(range(6), dtype="int64", name="series") + # nested renaming + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + + def test_agg_apply_evaluate_lambdas_the_same(self, string_series): + # test that we are evaluating row-by-row first + # before vectorized evaluation + result = string_series.apply(lambda x: str(x)) + expected = string_series.agg(lambda x: str(x)) + tm.assert_series_equal(result, expected) + + result = string_series.apply(str) + expected = string_series.agg(str) + tm.assert_series_equal(result, expected) + + def 
test_with_nested_series(self, datetime_series): + # GH 2316 + # .agg with a reducer and a transform, what to do + result = datetime_series.apply( + lambda x: Series([x, x ** 2], index=["x", "x^2"]) + ) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) + tm.assert_frame_equal(result, expected) + + result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + + def test_replicate_describe(self, string_series): + # this also tests a result set that is all scalars + expected = string_series.describe() + result = string_series.apply( + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } + ) + tm.assert_series_equal(result, expected) + + def test_reduce(self, string_series): + # reductions with named functions + result = string_series.agg(["sum", "mean"]) + expected = Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) + tm.assert_series_equal(result, expected) + + def test_non_callable_aggregates(self): + # test agg using non-callable series attributes + s = Series([1, 2, None]) + + # Calling agg w/ just a string arg same as calling s.arg + result = s.agg("size") + expected = s.size + assert result == expected + + # test when mixed w/ callable reducers + result = s.agg(["size", "count", "mean"]) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) + tm.assert_series_equal(result[expected.index], expected) + + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(dtype=np.float64), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + _get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", "c"), # see GH12863 + ("any", "a"), + ], + ), + ), + ) + def test_agg_cython_table(self, series, func, expected): + # GH21224 + # test reducing functions in + # pandas.core.base.SelectionMixin._cython_table + result = series.agg(func) + if tm.is_number(expected): + assert np.isclose(result, expected, equal_nan=True) + else: + assert result == expected + + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(dtype=np.float64), + [ + ("cumprod", Series([], Index([]), dtype=np.float64)), + ("cumsum", Series([], Index([]), dtype=np.float64)), + ], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], + ), + _get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), + ) + def test_agg_cython_table_transform(self, series, func, expected): + # GH21224 + # test transforming functions in + # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + result = series.agg(func) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises 
TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), + ) + def test_agg_cython_table_raises(self, series, func, expected): + # GH21224 + with pytest.raises(expected): + # e.g. Series('a b'.split()).cumprod() will raise + series.agg(func) + + +class TestSeriesMap: + def test_map(self, datetime_series): + index, data = tm.getMixedTypeDict() + + source = Series(data["B"], index=data["C"]) + target = Series(data["C"][:4], index=data["D"][:4]) + + merged = target.map(source) + + for k, v in merged.items(): + assert v == source[target[k]] + + # input could be a dict + merged = target.map(source.to_dict()) + + for k, v in merged.items(): + assert v == source[target[k]] + + # function + result = datetime_series.map(lambda x: x * 2) + tm.assert_series_equal(result, datetime_series * 2) + + # GH 10324 + a = Series([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = Series(["odd", "even", "odd", np.nan], dtype="category") + tm.assert_series_equal(a.map(b), exp) + exp = Series(["odd", "even", "odd", np.nan]) + tm.assert_series_equal(a.map(c), exp) + + a = Series(["a", "b", "c", "d"]) + b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) + c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) + + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, 1, 2, 3]) + tm.assert_series_equal(a.map(c), exp) + + a = Series(["a", "b", "c", "d"]) + b = Series( + ["B", "C", "D", "E"], + dtype="category", + index=pd.CategoricalIndex(["b", "c", "d", "e"]), + ) + c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) + + exp = Series( + pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) + ) + tm.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, "B", "C", "D"]) + tm.assert_series_equal(a.map(c), exp) + + @pytest.mark.parametrize("index", tm.all_index_generator(10)) + def test_map_empty(self, index): + s = Series(index) + result = s.map({}) + + expected = pd.Series(np.nan, index=s.index) + tm.assert_series_equal(result, expected) + + def test_map_compat(self): + # related GH 8024 + s = Series([True, True, False], index=[1, 2, 3]) + result = s.map({True: "foo", False: "bar"}) + expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + def test_map_int(self): + left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) + right = Series({1: 11, 2: 22, 3: 33}) + + assert left.dtype == np.float_ + assert issubclass(right.dtype.type, np.integer) + + merged = left.map(right) + assert merged.dtype == np.float_ + assert isna(merged["d"]) + assert not isna(merged["c"]) + + def test_map_type_inference(self): + s = Series(range(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + assert issubclass(s2.dtype.type, np.integer) + + def test_map_decimal(self, string_series): + from decimal import Decimal + + result = string_series.map(lambda x: Decimal(str(x))) + assert result.dtype == np.object_ + assert isinstance(result[0], Decimal) + + def test_map_na_exclusion(self): + s = Series([1.5, np.nan, 3, np.nan, 5]) + + result = s.map(lambda x: x * 2, na_action="ignore") + exp = s * 2 + tm.assert_series_equal(result, exp) + + def test_map_dict_with_tuple_keys(self): + """ + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple 
values + from being mapped properly. + """ + # GH 18496 + df = pd.DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} + + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) + + def test_map_counter(self): + s = Series(["a", "b", "c"], index=[1, 2, 3]) + counter = Counter() + counter["b"] = 5 + counter["c"] += 1 + result = s.map(counter) + expected = Series([0, 5, 1], index=[1, 2, 3]) + tm.assert_series_equal(result, expected) + + def test_map_defaultdict(self): + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" + result = s.map(default_dict) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + def test_map_dict_na_key(self): + # https://github.com/pandas-dev/pandas/issues/17648 + # Checks that np.nan key is appropriately mapped + s = Series([1, 2, np.nan]) + expected = Series(["a", "b", "c"]) + result = s.map({1: "a", 2: "b", np.nan: "c"}) + tm.assert_series_equal(result, expected) + + def test_map_dict_subclass_with_missing(self): + """ + Test Series.map with a dictionary subclass that defines __missing__, + i.e. sets a default value (GH #15999). + """ + + class DictWithMissing(dict): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + dictionary = DictWithMissing({3: "three"}) + result = s.map(dictionary) + expected = Series(["missing", "missing", "three"]) + tm.assert_series_equal(result, expected) + + def test_map_dict_subclass_without_missing(self): + class DictWithoutMissing(dict): + pass + + s = Series([1, 2, 3]) + dictionary = DictWithoutMissing({3: "three"}) + result = s.map(dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_abc_mapping(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + s = Series([1, 2, 3]) + not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + result = s.map(not_a_dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + class NonDictMappingWithMissing(non_mapping_dict_subclass): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + not_a_dictionary = NonDictMappingWithMissing({3: "three"}) + result = s.map(not_a_dictionary) + # __missing__ is a dict concept, not a Mapping concept, + # so it should not change the result! 
+ expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_box(self): + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] + s = pd.Series(vals) + assert s.dtype == "datetime64[ns]" + # boxed value must be Timestamp instance + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) + tm.assert_series_equal(res, exp) + + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] + s = pd.Series(vals) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] + s = pd.Series(vals) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) + tm.assert_series_equal(res, exp) + + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + s = pd.Series(vals) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") + exp = pd.Series(["Period_M", "Period_M"]) + tm.assert_series_equal(res, exp) + + def test_map_categorical(self): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = pd.Series(values, name="XX", index=list("abcdefg")) + + result = s.map(lambda x: x.lower()) + exp_values = pd.Categorical( + list("abbabcd"), categories=list("dcba"), ordered=True + ) + exp = pd.Series(exp_values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp_values) + + result = s.map(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == np.object + + with pytest.raises(NotImplementedError): + s.map(lambda x: x, na_action="ignore") + + def test_map_datetimetz(self): + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") + + # keep tz + result = s.map(lambda x: x + pd.offsets.Day()) + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") + tm.assert_series_equal(result, exp) + + # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 + result = s.map(lambda x: x.hour) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) + tm.assert_series_equal(result, exp) + + with pytest.raises(NotImplementedError): + s.map(lambda x: x, na_action="ignore") + + # not vectorized + def f(x): + if not isinstance(x, pd.Timestamp): + raise ValueError + return str(x.tz) + + result = s.map(f) + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "vals,mapping,exp", + [ + (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]), + (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3), + (list(range(3)), {0: 42}, [42] + [np.nan] * 3), + ], + ) + def test_map_missing_mixed(self, vals, mapping, exp): + # GH20495 + s = pd.Series(vals + [np.nan]) + result = s.map(mapping) + + tm.assert_series_equal(result, pd.Series(exp)) + + @pytest.mark.parametrize( + "dti,exp", + [ + ( + Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), + 
DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"), + ), + ( + tm.makeTimeSeries(nper=30), + DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"), + ), + ], + ) + def test_apply_series_on_date_time_index_aware_series(self, dti, exp): + # GH 25959 + # Calling apply on a localized time series should not cause an error + index = dti.tz_localize("UTC").index + result = pd.Series(index).apply(lambda x: pd.Series([1, 2])) + tm.assert_frame_equal(result, exp) + + def test_apply_scaler_on_date_time_index_aware_series(self): + # GH 25959 + # Calling apply on a localized time series should not cause an error + series = tm.makeTimeSeries(nper=30).tz_localize("UTC") + result = pd.Series(series.index).apply(lambda x: 1) + tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) + + def test_map_float_to_string_precision(self): + # GH 13228 + ser = pd.Series(1 / 3) + result = ser.map(lambda val: str(val)).to_dict() + expected = {0: "0.3333333333333333"} + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/series/test_arithmetic.py b/venv/Lib/site-packages/pandas/tests/series/test_arithmetic.py new file mode 100644 index 0000000..f3ffdc3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_arithmetic.py @@ -0,0 +1,205 @@ +import operator + +import numpy as np +import pytest + +from pandas._libs.tslibs import IncompatibleFrequency + +import pandas as pd +from pandas import Series +import pandas._testing as tm + + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestSeriesFlexArithmetic: + @pytest.mark.parametrize( + "ts", + [ + (lambda x: x, lambda x: x * 2, False), + (lambda x: x, lambda x: x[::2], False), + (lambda x: x, lambda x: 5, True), + (lambda x: tm.makeFloatSeries(), lambda x: tm.makeFloatSeries(), True), + ], + ) + @pytest.mark.parametrize( + "opname", ["add", "sub", "mul", "floordiv", "truediv", "pow"] + ) + def test_flex_method_equivalence(self, opname, ts): + # check that Series.{opname} behaves like Series.__{opname}__, + tser = tm.makeTimeSeries().rename("ts") + + series = ts[0](tser) + other = ts[1](tser) + check_reverse = ts[2] + + op = getattr(Series, opname) + alt = getattr(operator, opname) + + result = op(series, other) + expected = alt(series, other) + tm.assert_almost_equal(result, expected) + if check_reverse: + rop = getattr(Series, "r" + opname) + result = rop(series, other) + expected = alt(other, series) + tm.assert_almost_equal(result, expected) + + def test_flex_method_subclass_metadata_preservation(self, all_arithmetic_operators): + # GH 13208 + class MySeries(Series): + _metadata = ["x"] + + @property + def _constructor(self): + return MySeries + + opname = all_arithmetic_operators + op = getattr(Series, opname) + m = MySeries([1, 2, 3], name="test") + m.x = 42 + result = op(m, 1) + assert result.x == 42 + + +class TestSeriesArithmetic: + # Some of these may end up in tests/arithmetic, but are not yet sorted + + def test_add_series_with_period_index(self): + rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected[1::2] = np.nan + tm.assert_series_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_series_equal(result, expected) + + msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" + with pytest.raises(IncompatibleFrequency, match=msg): + ts + ts.asfreq("D", how="end") + + @pytest.mark.parametrize( + 
"target_add,input_value,expected_value", + [ + ("!", ["hello", "world"], ["hello!", "world!"]), + ("m", ["hello", "world"], ["hellom", "worldm"]), + ], + ) + def test_string_addition(self, target_add, input_value, expected_value): + # GH28658 - ensure adding 'm' does not raise an error + a = Series(input_value) + + result = a + target_add + expected = Series(expected_value) + tm.assert_series_equal(result, expected) + + +# ------------------------------------------------------------------ +# Comparisons + + +class TestSeriesFlexComparison: + def test_comparison_flex_basic(self): + left = pd.Series(np.random.randn(10)) + right = pd.Series(np.random.randn(10)) + + tm.assert_series_equal(left.eq(right), left == right) + tm.assert_series_equal(left.ne(right), left != right) + tm.assert_series_equal(left.le(right), left < right) + tm.assert_series_equal(left.lt(right), left <= right) + tm.assert_series_equal(left.gt(right), left > right) + tm.assert_series_equal(left.ge(right), left >= right) + + # axis + for axis in [0, None, "index"]: + tm.assert_series_equal(left.eq(right, axis=axis), left == right) + tm.assert_series_equal(left.ne(right, axis=axis), left != right) + tm.assert_series_equal(left.le(right, axis=axis), left < right) + tm.assert_series_equal(left.lt(right, axis=axis), left <= right) + tm.assert_series_equal(left.gt(right, axis=axis), left > right) + tm.assert_series_equal(left.ge(right, axis=axis), left >= right) + + # + msg = "No axis named 1 for object type" + for op in ["eq", "ne", "le", "le", "gt", "ge"]: + with pytest.raises(ValueError, match=msg): + getattr(left, op)(right, axis=1) + + +class TestSeriesComparison: + def test_comparison_different_length(self): + a = Series(["a", "b", "c"]) + b = Series(["b", "a"]) + with pytest.raises(ValueError): + a < b + + a = Series([1, 2]) + b = Series([2, 3, 4]) + with pytest.raises(ValueError): + a == b + + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) + def test_ser_flex_cmp_return_dtypes(self, opname): + # GH#15115 + ser = Series([1, 3, 2], index=range(3)) + const = 2 + result = getattr(ser, opname)(const).dtypes + expected = np.dtype("bool") + assert result == expected + + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) + def test_ser_flex_cmp_return_dtypes_empty(self, opname): + # GH#15115 empty Series case + ser = Series([1, 3, 2], index=range(3)) + empty = ser.iloc[:0] + const = 2 + result = getattr(empty, opname)(const).dtypes + expected = np.dtype("bool") + assert result == expected + + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.le, operator.lt, operator.ge, operator.gt], + ) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("baz", "baz", "baz")] + ) + def test_ser_cmp_result_names(self, names, op): + # datetime64 dtype + dti = pd.date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) + ser = Series(dti).rename(names[1]) + result = op(ser, dti) + assert result.name == names[2] + + # datetime64tz dtype + dti = dti.tz_localize("US/Central") + ser = Series(dti).rename(names[1]) + result = op(ser, dti) + assert result.name == names[2] + + # timedelta64 dtype + tdi = dti - dti.shift(1) + ser = Series(tdi).rename(names[1]) + result = op(ser, tdi) + assert result.name == names[2] + + # interval dtype + if op in [operator.eq, operator.ne]: + # interval dtype comparisons not yet implemented + ii = pd.interval_range(start=0, periods=5, name=names[0]) + ser = Series(ii).rename(names[1]) + result = op(ser, ii) 
+ assert result.name == names[2] + + # categorical + if op in [operator.eq, operator.ne]: + # categorical dtype comparisons raise for inequalities + cidx = tdi.astype("category") + ser = Series(cidx).rename(names[1]) + result = op(ser, cidx) + assert result.name == names[2] diff --git a/venv/Lib/site-packages/pandas/tests/series/test_block_internals.py b/venv/Lib/site-packages/pandas/tests/series/test_block_internals.py new file mode 100644 index 0000000..18e75c3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_block_internals.py @@ -0,0 +1,39 @@ +import pandas as pd + +# Segregated collection of methods that require the BlockManager internal data +# structure + + +class TestSeriesBlockInternals: + def test_setitem_invalidates_datetime_index_freq(self): + # GH#24096 altering a datetime64tz Series inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex + + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + ts = dti[1] + ser = pd.Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert dti.freq == "D" + ser.iloc[1] = pd.NaT + assert ser._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert dti[1] == ts + assert dti.freq == "D" + + def test_dt64tz_setitem_does_not_mutate_dti(self): + # GH#21907, GH#24096 + dti = pd.date_range("2016-01-01", periods=10, tz="US/Pacific") + ts = dti[0] + ser = pd.Series(dti) + assert ser._values is not dti + assert ser._values._data.base is not dti._data._data.base + assert ser._data.blocks[0].values is not dti + assert ser._data.blocks[0].values._data.base is not dti._data._data.base + + ser[::3] = pd.NaT + assert ser[0] is pd.NaT + assert dti[0] == ts diff --git a/venv/Lib/site-packages/pandas/tests/series/test_combine_concat.py b/venv/Lib/site-packages/pandas/tests/series/test_combine_concat.py new file mode 100644 index 0000000..239353d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_combine_concat.py @@ -0,0 +1,267 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestSeriesCombine: + def test_combine_scalar(self): + # GH 21248 + # Note - combine() with another Series is tested elsewhere because + # it is used when testing operators + s = pd.Series([i * 10 for i in range(5)]) + result = s.combine(3, lambda x, y: x + y) + expected = pd.Series([i * 10 + 3 for i in range(5)]) + tm.assert_series_equal(result, expected) + + result = s.combine(22, lambda x, y: min(x, y)) + expected = pd.Series([min(i * 10, 22) for i in range(5)]) + tm.assert_series_equal(result, expected) + + def test_combine_first(self): + values = tm.makeIntIndex(20).values.astype(float) + series = Series(values, index=tm.makeIntIndex(20)) + + series_copy = series * 2 + series_copy[::2] = np.NaN + + # nothing used from the input + combined = series.combine_first(series_copy) + + tm.assert_series_equal(combined, series) + + # Holes filled from input + combined = series_copy.combine_first(series) + assert np.isfinite(combined).all() + + tm.assert_series_equal(combined[::2], series[::2]) + tm.assert_series_equal(combined[1::2], series_copy[1::2]) + + # mixed types + index = tm.makeStringIndex(20) + floats = Series(tm.randn(20), index=index) + strings = Series(tm.makeStringIndex(10), index=index[::2]) + + combined = 
strings.combine_first(floats) + + tm.assert_series_equal(strings, combined.loc[index[::2]]) + tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]]) + + # corner case + s = Series([1.0, 2, 3], index=[0, 1, 2]) + empty = Series([], index=[], dtype=object) + result = s.combine_first(empty) + s.index = s.index.astype("O") + tm.assert_series_equal(s, result) + + def test_update(self): + s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) + s2 = Series([np.nan, 3.5, np.nan, 5.0]) + s.update(s2) + + expected = Series([1.5, 3.5, 3.0, 5.0, np.nan]) + tm.assert_series_equal(s, expected) + + # GH 3217 + df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) + df["c"] = np.nan + + df["c"].update(Series(["foo"], index=[0])) + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "other, dtype, expected", + [ + # other is int + ([61, 63], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61, 63], "int64", pd.Series([10, 61, 12])), + ([61, 63], float, pd.Series([10.0, 61.0, 12.0])), + ([61, 63], object, pd.Series([10, 61, 12], dtype=object)), + # other is float, but can be cast to int + ([61.0, 63.0], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61.0, 63.0], "int64", pd.Series([10, 61, 12])), + ([61.0, 63.0], float, pd.Series([10.0, 61.0, 12.0])), + ([61.0, 63.0], object, pd.Series([10, 61.0, 12], dtype=object)), + # others is float, cannot be cast to int + ([61.1, 63.1], "int32", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], "int64", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], float, pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)), + # other is object, cannot be cast + ([(61,), (63,)], "int32", pd.Series([10, (61,), 12])), + ([(61,), (63,)], "int64", pd.Series([10, (61,), 12])), + ([(61,), (63,)], float, pd.Series([10.0, (61,), 12.0])), + ([(61,), (63,)], object, pd.Series([10, (61,), 12])), + ], + ) + def test_update_dtypes(self, other, dtype, expected): + + s = Series([10, 11, 12], dtype=dtype) + other = Series(other, index=[1, 3]) + s.update(other) + + tm.assert_series_equal(s, expected) + + def test_concat_empty_series_dtypes_roundtrips(self): + + # round-tripping with self & like self + dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) + + for dtype in dtypes: + assert pd.concat([Series(dtype=dtype)]).dtype == dtype + assert pd.concat([Series(dtype=dtype), Series(dtype=dtype)]).dtype == dtype + + def int_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" + return None + + def float_result_type(dtype, dtype2): + typs = {dtype.kind, dtype2.kind} + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" + return None + + def get_result_type(dtype, dtype2): + result = float_result_type(dtype, dtype2) + if result is not None: + return result + result = int_result_type(dtype, dtype2) + if result is not None: + return result + return "O" + + for dtype in dtypes: + for dtype2 in dtypes: + if dtype == dtype2: + continue + + expected = get_result_type(dtype, dtype2) + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype + assert result.kind == expected + + def test_combine_first_dt_tz_values(self, tz_naive_fixture): + ser1 
= pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + name="ser1", + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture), + index=[2, 3, 4], + name="ser2", + ) + result = ser1.combine_first(ser2) + exp_vals = pd.DatetimeIndex( + ["20150101", "20150102", "20150103", "20160515", "20160516"], + tz=tz_naive_fixture, + ) + exp = pd.Series(exp_vals, name="ser1") + tm.assert_series_equal(exp, result) + + def test_concat_empty_series_dtypes(self): + + # booleans + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.int32)]).dtype + == np.int32 + ) + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.float32)]).dtype + == np.object_ + ) + + # datetime-like + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] + ).dtype + == np.object_ + ) + + # categorical + assert ( + pd.concat([Series(dtype="category"), Series(dtype="category")]).dtype + == "category" + ) + # GH 18515 + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + assert ( + pd.concat([Series(dtype="category"), Series(dtype="object")]).dtype + == "object" + ) + + # sparse + # TODO: move? + result = pd.concat( + [ + Series(dtype="float64").astype("Sparse"), + Series(dtype="float64").astype("Sparse"), + ] + ) + assert result.dtype == "Sparse[float64]" + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype(np.float64) + assert result.dtype == expected + + result = pd.concat( + [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] + ) + # TODO: release-note: concat sparse dtype + expected = pd.SparseDtype("object") + assert result.dtype == expected + + def test_combine_first_dt64(self): + from pandas.core.tools.datetimes import to_datetime + + s0 = to_datetime(Series(["2010", np.NaN])) + s1 = to_datetime(Series([np.NaN, "2011"])) + rs = s0.combine_first(s1) + xp = to_datetime(Series(["2010", "2011"])) + tm.assert_series_equal(rs, xp) + + s0 = to_datetime(Series(["2010", np.NaN])) + s1 = Series([np.NaN, "2011"]) + rs = s0.combine_first(s1) + xp = Series([datetime(2010, 1, 1), "2011"]) + tm.assert_series_equal(rs, xp) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_constructors.py b/venv/Lib/site-packages/pandas/tests/series/test_constructors.py new file mode 100644 index 0000000..c38e570 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_constructors.py @@ -0,0 +1,1406 @@ +from collections import OrderedDict +from datetime import datetime, timedelta + +import numpy as np +import numpy.ma as ma +import pytest + +from pandas._libs import lib +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, + 
period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray, period_array + + +class TestSeriesConstructors: + @pytest.mark.parametrize( + "constructor,check_index_type", + [ + # NOTE: some overlap with test_constructor_empty but that test does not + # test for None or an empty generator. + # test_constructor_pass_none tests None but only with the index also + # passed. + (lambda: Series(), True), + (lambda: Series(None), True), + (lambda: Series({}), True), + (lambda: Series(()), False), # creates a RangeIndex + (lambda: Series([]), False), # creates a RangeIndex + (lambda: Series((_ for _ in [])), False), # creates a RangeIndex + (lambda: Series(data=None), True), + (lambda: Series(data={}), True), + (lambda: Series(data=()), False), # creates a RangeIndex + (lambda: Series(data=[]), False), # creates a RangeIndex + (lambda: Series(data=(_ for _ in [])), False), # creates a RangeIndex + ], + ) + def test_empty_constructor(self, constructor, check_index_type): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + expected = Series() + result = constructor() + + assert len(result.index) == 0 + tm.assert_series_equal(result, expected, check_index_type=check_index_type) + + def test_invalid_dtype(self): + # GH15520 + msg = "not understood" + invalid_list = [pd.Timestamp, "pd.Timestamp", list] + for dtype in invalid_list: + with pytest.raises(TypeError, match=msg): + Series([], name="time", dtype=dtype) + + def test_invalid_compound_dtype(self): + # GH#13296 + c_dtype = np.dtype([("a", "i8"), ("b", "f4")]) + cdt_arr = np.array([(1, 0.4), (256, -13)], dtype=c_dtype) + + with pytest.raises(ValueError, match="Use DataFrame instead"): + Series(cdt_arr, index=["A", "B"]) + + def test_scalar_conversion(self): + + # Pass in scalar is disabled + scalar = Series(0.5) + assert not isinstance(scalar, float) + + # Coercion + assert float(Series([1.0])) == 1.0 + assert int(Series([1.0])) == 1 + + def test_constructor(self, datetime_series): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty_series = Series() + assert datetime_series.index.is_all_dates + + # Pass in Series + derived = Series(datetime_series) + assert derived.index.is_all_dates + + assert tm.equalContents(derived.index, datetime_series.index) + # Ensure new index is not created + assert id(datetime_series.index) == id(derived.index) + + # Mixed type Series + mixed = Series(["hello", np.NaN], index=[0, 1]) + assert mixed.dtype == np.object_ + assert mixed[1] is np.NaN + + assert not empty_series.index.is_all_dates + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + assert not Series().index.is_all_dates + + # exception raised is of type Exception + with pytest.raises(Exception, match="Data must be 1-dimensional"): + Series(np.random.randn(3, 3), index=np.arange(3)) + + mixed.name = "Series" + rs = Series(mixed).name + xp = "Series" + assert rs == xp + + # raise on MultiIndex GH4187 + m = MultiIndex.from_arrays([[1, 2], [3, 4]]) + msg = "initializing a Series from a MultiIndex is not supported" + with pytest.raises(NotImplementedError, match=msg): + Series(m) + + @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) + def test_constructor_empty(self, input_class): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty = Series() + empty2 = Series(input_class()) + + # these are Index() and RangeIndex() which don't compare type equal + # but are just .equals 
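+        # hence check_index_type=False below: the values agree even though one
+        # empty index is an Index and the other a RangeIndex.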
+ tm.assert_series_equal(empty, empty2, check_index_type=False) + + # With explicit dtype: + empty = Series(dtype="float64") + empty2 = Series(input_class(), dtype="float64") + tm.assert_series_equal(empty, empty2, check_index_type=False) + + # GH 18515 : with dtype=category: + empty = Series(dtype="category") + empty2 = Series(input_class(), dtype="category") + tm.assert_series_equal(empty, empty2, check_index_type=False) + + if input_class is not list: + # With index: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty = Series(index=range(10)) + empty2 = Series(input_class(), index=range(10)) + tm.assert_series_equal(empty, empty2) + + # With index and dtype float64: + empty = Series(np.nan, index=range(10)) + empty2 = Series(input_class(), index=range(10), dtype="float64") + tm.assert_series_equal(empty, empty2) + + # GH 19853 : with empty string, index and dtype str + empty = Series("", dtype=str, index=range(3)) + empty2 = Series("", index=range(3)) + tm.assert_series_equal(empty, empty2) + + @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) + def test_constructor_nan(self, input_arg): + empty = Series(dtype="float64", index=range(10)) + empty2 = Series(input_arg, index=range(10)) + + tm.assert_series_equal(empty, empty2, check_index_type=False) + + @pytest.mark.parametrize( + "dtype", + ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"], + ) + @pytest.mark.parametrize("index", [None, pd.Index([])]) + def test_constructor_dtype_only(self, dtype, index): + # GH-20865 + result = pd.Series(dtype=dtype, index=index) + assert result.dtype == dtype + assert len(result) == 0 + + def test_constructor_no_data_index_order(self): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + result = pd.Series(index=["b", "a", "c"]) + assert result.index.tolist() == ["b", "a", "c"] + + def test_constructor_no_data_string_type(self): + # GH 22477 + result = pd.Series(index=[1], dtype=str) + assert np.isnan(result.iloc[0]) + + @pytest.mark.parametrize("item", ["entry", "ѐ", 13]) + def test_constructor_string_element_string_type(self, item): + # GH 22477 + result = pd.Series(item, index=[1], dtype=str) + assert result.iloc[0] == str(item) + + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(["x", None], dtype=string_dtype) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + + ser = Series(["x", np.nan], dtype=string_dtype) + assert np.isnan(ser.iloc[1]) + + def test_constructor_series(self): + index1 = ["d", "b", "a", "c"] + index2 = sorted(index1) + s1 = Series([4, 7, -5, 3], index=index1) + s2 = Series(s1, index=index2) + + tm.assert_series_equal(s2, s1.sort_index()) + + def test_constructor_iterable(self): + # GH 21987 + class Iter: + def __iter__(self): + for i in range(10): + yield i + + expected = Series(list(range(10)), dtype="int64") + result = Series(Iter(), dtype="int64") + tm.assert_series_equal(result, expected) + + def test_constructor_sequence(self): + # GH 21987 + expected = Series(list(range(10)), dtype="int64") + result = Series(range(10), dtype="int64") + tm.assert_series_equal(result, expected) + + def test_constructor_single_str(self): + # GH 21987 + expected = Series(["abc"]) + result = Series("abc") + tm.assert_series_equal(result, expected) + + def test_constructor_list_like(self): + + # make sure that we are coercing 
different + # list-likes to standard dtypes and not + # platform specific + expected = Series([1, 2, 3], dtype="int64") + for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]: + result = Series(obj, index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"]) + def test_constructor_index_dtype(self, dtype): + # GH 17088 + + s = Series(Index([0, 2, 4]), dtype=dtype) + assert s.dtype == dtype + + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) + def test_constructor_list_str(self, input_vals, string_dtype): + # GH 16605 + # Ensure that data elements from a list are converted to strings + # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + tm.assert_series_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(["1.0", "2.0", np.nan], dtype=object) + tm.assert_series_equal(result, expected) + assert np.isnan(result[2]) + + def test_constructor_generator(self): + gen = (i for i in range(10)) + + result = Series(gen) + exp = Series(range(10)) + tm.assert_series_equal(result, exp) + + gen = (i for i in range(10)) + result = Series(gen, index=range(10, 20)) + exp.index = range(10, 20) + tm.assert_series_equal(result, exp) + + def test_constructor_map(self): + # GH8909 + m = map(lambda x: x, range(10)) + + result = Series(m) + exp = Series(range(10)) + tm.assert_series_equal(result, exp) + + m = map(lambda x: x, range(10)) + result = Series(m, index=range(10, 20)) + exp.index = range(10, 20) + tm.assert_series_equal(result, exp) + + def test_constructor_categorical(self): + cat = pd.Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True) + res = Series(cat) + tm.assert_categorical_equal(res.values, cat) + + # can cast to a new dtype + result = Series(pd.Categorical([1, 2, 3]), dtype="int64") + expected = pd.Series([1, 2, 3], dtype="int64") + tm.assert_series_equal(result, expected) + + # GH12574 + cat = Series(pd.Categorical([1, 2, 3]), dtype="category") + assert is_categorical_dtype(cat) + assert is_categorical_dtype(cat.dtype) + s = Series([1, 2, 3], dtype="category") + assert is_categorical_dtype(s) + assert is_categorical_dtype(s.dtype) + + def test_constructor_categorical_with_coercion(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) + # test basic creation / coercion of categoricals + s = Series(factor, name="A") + assert s.dtype == "category" + assert len(s) == len(factor) + str(s.values) + str(s) + + # in a frame + df = DataFrame({"A": factor}) + result = df["A"] + tm.assert_series_equal(result, s) + result = df.iloc[:, 0] + tm.assert_series_equal(result, s) + assert len(df) == len(factor) + str(df.values) + str(df) + + df = DataFrame({"A": s}) + result = df["A"] + tm.assert_series_equal(result, s) + assert len(df) == len(factor) + str(df.values) + str(df) + + # multiples + df = DataFrame({"A": s, "B": s, "C": 1}) + result1 = df["A"] + result2 = df["B"] + tm.assert_series_equal(result1, s) + tm.assert_series_equal(result2, s, check_names=False) + assert result2.name == "B" + assert len(df) == len(factor) + str(df.values) + str(df) + + # GH8623 + x = DataFrame( + [[1, "John P. 
Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) # doing this breaks transform + + expected = x.iloc[0].person_name + result = x.person_name.iloc[0] + assert result == expected + + result = x.person_name[0] + assert result == expected + + result = x.person_name.loc[0] + assert result == expected + + def test_constructor_categorical_dtype(self): + result = pd.Series( + ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True) + ) + assert is_categorical_dtype(result) is True + tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) + assert result.cat.ordered + + result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) + assert is_categorical_dtype(result) + tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) + assert result.cat.ordered is False + + # GH 19565 - Check broadcasting of scalar with Categorical dtype + result = Series( + "a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) + expected = Series( + ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) + tm.assert_series_equal(result, expected, check_categorical=True) + + def test_constructor_categorical_string(self): + # GH 26336: the string 'category' maintains existing CategoricalDtype + cdt = CategoricalDtype(categories=list("dabc"), ordered=True) + expected = Series(list("abcabc"), dtype=cdt) + + # Series(Categorical, dtype='category') keeps existing dtype + cat = Categorical(list("abcabc"), dtype=cdt) + result = Series(cat, dtype="category") + tm.assert_series_equal(result, expected) + + # Series(Series[Categorical], dtype='category') keeps existing dtype + result = Series(result, dtype="category") + tm.assert_series_equal(result, expected) + + def test_categorical_sideeffects_free(self): + # Passing a categorical to a Series and then changing values in either + # the series or the categorical should not change the values in the + # other one, IF you specify copy! 
+ cat = Categorical(["a", "b", "c", "a"]) + s = Series(cat, copy=True) + assert s.cat is not cat + s.cat.categories = [1, 2, 3] + exp_s = np.array([1, 2, 3, 1], dtype=np.int64) + exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(s.__array__(), exp_s) + tm.assert_numpy_array_equal(cat.__array__(), exp_cat) + + # setting + s[0] = 2 + exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) + tm.assert_numpy_array_equal(s.__array__(), exp_s2) + tm.assert_numpy_array_equal(cat.__array__(), exp_cat) + + # however, copy is False by default + # so this WILL change values + cat = Categorical(["a", "b", "c", "a"]) + s = Series(cat) + assert s.values is cat + s.cat.categories = [1, 2, 3] + exp_s = np.array([1, 2, 3, 1], dtype=np.int64) + tm.assert_numpy_array_equal(s.__array__(), exp_s) + tm.assert_numpy_array_equal(cat.__array__(), exp_s) + + s[0] = 2 + exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) + tm.assert_numpy_array_equal(s.__array__(), exp_s2) + tm.assert_numpy_array_equal(cat.__array__(), exp_s2) + + def test_unordered_compare_equal(self): + left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) + right = pd.Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) + tm.assert_series_equal(left, right) + + def test_constructor_maskedarray(self): + data = ma.masked_all((3,), dtype=float) + result = Series(data) + expected = Series([np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + data[0] = 0.0 + data[2] = 2.0 + index = ["a", "b", "c"] + result = Series(data, index=index) + expected = Series([0.0, np.nan, 2.0], index=index) + tm.assert_series_equal(result, expected) + + data[1] = 1.0 + result = Series(data, index=index) + expected = Series([0.0, 1.0, 2.0], index=index) + tm.assert_series_equal(result, expected) + + data = ma.masked_all((3,), dtype=int) + result = Series(data) + expected = Series([np.nan, np.nan, np.nan], dtype=float) + tm.assert_series_equal(result, expected) + + data[0] = 0 + data[2] = 2 + index = ["a", "b", "c"] + result = Series(data, index=index) + expected = Series([0, np.nan, 2], index=index, dtype=float) + tm.assert_series_equal(result, expected) + + data[1] = 1 + result = Series(data, index=index) + expected = Series([0, 1, 2], index=index, dtype=int) + tm.assert_series_equal(result, expected) + + data = ma.masked_all((3,), dtype=bool) + result = Series(data) + expected = Series([np.nan, np.nan, np.nan], dtype=object) + tm.assert_series_equal(result, expected) + + data[0] = True + data[2] = False + index = ["a", "b", "c"] + result = Series(data, index=index) + expected = Series([True, np.nan, False], index=index, dtype=object) + tm.assert_series_equal(result, expected) + + data[1] = True + result = Series(data, index=index) + expected = Series([True, True, False], index=index, dtype=bool) + tm.assert_series_equal(result, expected) + + data = ma.masked_all((3,), dtype="M8[ns]") + result = Series(data) + expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]") + tm.assert_series_equal(result, expected) + + data[0] = datetime(2001, 1, 1) + data[2] = datetime(2001, 1, 3) + index = ["a", "b", "c"] + result = Series(data, index=index) + expected = Series( + [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) + tm.assert_series_equal(result, expected) + + data[1] = datetime(2001, 1, 2) + result = Series(data, index=index) + expected = Series( + [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) + 
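+        # masked slots of an M8[ns] masked array are expected to surface as
+        # NaT (iNaT) in the Series, mirroring the NaN fill for the float, int
+        # and bool cases above.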
tm.assert_series_equal(result, expected) + + def test_constructor_maskedarray_hardened(self): + # Check numpy masked arrays with hard masks -- from GH24574 + data = ma.masked_all((3,), dtype=float).harden_mask() + result = pd.Series(data) + expected = pd.Series([np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + def test_series_ctor_plus_datetimeindex(self): + rng = date_range("20090415", "20090519", freq="B") + data = {k: 1 for k in rng} + + result = Series(data, index=rng) + assert result.index is rng + + def test_constructor_default_index(self): + s = Series([0, 1, 2]) + tm.assert_index_equal(s.index, pd.Index(np.arange(3))) + + @pytest.mark.parametrize( + "input", + [ + [1, 2, 3], + (1, 2, 3), + list(range(3)), + pd.Categorical(["a", "b", "a"]), + (i for i in range(3)), + map(lambda x: x, range(3)), + ], + ) + def test_constructor_index_mismatch(self, input): + # GH 19342 + # test that construction of a Series with an index of different length + # raises an error + msg = "Length of passed values is 3, index implies 4" + with pytest.raises(ValueError, match=msg): + Series(input, index=np.arange(4)) + + def test_constructor_numpy_scalar(self): + # GH 19342 + # construction with a numpy scalar + # should not raise + result = Series(np.array(100), index=np.arange(4), dtype="int64") + expected = Series(100, index=np.arange(4), dtype="int64") + tm.assert_series_equal(result, expected) + + def test_constructor_broadcast_list(self): + # GH 19342 + # construction with single-element container and index + # should raise + msg = "Length of passed values is 1, index implies 3" + with pytest.raises(ValueError, match=msg): + Series(["foo"], index=["a", "b", "c"]) + + def test_constructor_corner(self): + df = tm.makeTimeDataFrame() + objs = [df, df] + s = Series(objs, index=[0, 1]) + assert isinstance(s, Series) + + def test_constructor_sanitize(self): + s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") + assert s.dtype == np.dtype("i8") + + s = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") + assert s.dtype == np.dtype("f8") + + def test_constructor_copy(self): + # GH15125 + # test dtype parameter has no side effects on copy=True + for data in [[1.0], np.array([1.0])]: + x = Series(data) + y = pd.Series(x, copy=True, dtype=float) + + # copy=True maintains original data in Series + tm.assert_series_equal(x, y) + + # changes to origin of copy does not affect the copy + x[0] = 2.0 + assert not x.equals(y) + assert x[0] == 2.0 + assert y[0] == 1.0 + + @pytest.mark.parametrize( + "index", + [ + pd.date_range("20170101", periods=3, tz="US/Eastern"), + pd.date_range("20170101", periods=3), + pd.timedelta_range("1 day", periods=3), + pd.period_range("2012Q1", periods=3, freq="Q"), + pd.Index(list("abc")), + pd.Int64Index([1, 2, 3]), + pd.RangeIndex(0, 3), + ], + ids=lambda x: type(x).__name__, + ) + def test_constructor_limit_copies(self, index): + # GH 17449 + # limit copies of input + s = pd.Series(index) + + # we make 1 copy; this is just a smoke test here + assert s._data.blocks[0].values is not index + + def test_constructor_pass_none(self): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + s = Series(None, index=range(5)) + assert s.dtype == np.float64 + + s = Series(None, index=range(5), dtype=object) + assert s.dtype == np.object_ + + # GH 7431 + # inference on the index + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + s = Series(index=np.array([None])) + expected = Series(index=Index([None])) + 
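+        # i.e. an ndarray containing only None passed as index should be
+        # treated the same as an explicit Index([None]).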
tm.assert_series_equal(s, expected) + + def test_constructor_pass_nan_nat(self): + # GH 13467 + exp = Series([np.nan, np.nan], dtype=np.float64) + assert exp.dtype == np.float64 + tm.assert_series_equal(Series([np.nan, np.nan]), exp) + tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) + + exp = Series([pd.NaT, pd.NaT]) + assert exp.dtype == "datetime64[ns]" + tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) + + tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) + + tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) + + def test_constructor_cast(self): + msg = "could not convert string to float" + with pytest.raises(ValueError, match=msg): + Series(["a", "b", "c"], dtype=float) + + def test_constructor_unsigned_dtype_overflow(self, uint_dtype): + # see gh-15832 + msg = "Trying to coerce negative values to unsigned integers" + with pytest.raises(OverflowError, match=msg): + Series([-1], dtype=uint_dtype) + + def test_constructor_coerce_float_fail(self, any_int_dtype): + # see gh-15832 + msg = "Trying to coerce float values to integers" + with pytest.raises(ValueError, match=msg): + Series([1, 2, 3.5], dtype=any_int_dtype) + + def test_constructor_coerce_float_valid(self, float_dtype): + s = Series([1, 2, 3.5], dtype=float_dtype) + expected = Series([1, 2, 3.5]).astype(float_dtype) + tm.assert_series_equal(s, expected) + + def test_constructor_dtype_no_cast(self): + # see gh-1572 + s = Series([1, 2, 3]) + s2 = Series(s, dtype=np.int64) + + s2[1] = 5 + assert s[1] == 5 + + def test_constructor_datelike_coercion(self): + + # GH 9477 + # incorrectly inferring on dateimelike looking when object dtype is + # specified + s = Series([Timestamp("20130101"), "NOV"], dtype=object) + assert s.iloc[0] == Timestamp("20130101") + assert s.iloc[1] == "NOV" + assert s.dtype == object + + # the dtype was being reset on the slicing and re-inferred to datetime + # even thought the blocks are mixed + belly = "216 3T19".split() + wing1 = "2T15 4H19".split() + wing2 = "416 4T20".split() + mat = pd.to_datetime("2016-01-22 2019-09-07".split()) + df = pd.DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly) + + result = df.loc["3T19"] + assert result.dtype == object + result = df.loc["216"] + assert result.dtype == object + + def test_constructor_datetimes_with_nulls(self): + # gh-15869 + for arr in [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + ]: + result = Series(arr) + assert result.dtype == "M8[ns]" + + def test_constructor_dtype_datetime64(self): + + s = Series(iNaT, dtype="M8[ns]", index=range(5)) + assert isna(s).all() + + # in theory this should be all nulls, but since + # we are not specifying a dtype is ambiguous + s = Series(iNaT, index=range(5)) + assert not isna(s).all() + + s = Series(np.nan, dtype="M8[ns]", index=range(5)) + assert isna(s).all() + + s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]") + assert isna(s[1]) + assert s.dtype == "M8[ns]" + + s = Series([datetime(2001, 1, 2, 0, 0), np.nan], dtype="M8[ns]") + assert isna(s[1]) + assert s.dtype == "M8[ns]" + + # GH3416 + dates = [ + np.datetime64(datetime(2013, 1, 1)), + np.datetime64(datetime(2013, 1, 2)), + np.datetime64(datetime(2013, 1, 3)), + ] + + s = Series(dates) + assert s.dtype == "M8[ns]" + + s.iloc[0] = np.nan + assert 
s.dtype == "M8[ns]" + + # GH3414 related + expected = Series( + [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)], + dtype="datetime64[ns]", + ) + + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + tm.assert_series_equal(result, expected) + + result = Series(dates, dtype="datetime64[ns]") + tm.assert_series_equal(result, expected) + + expected = Series( + [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]" + ) + result = Series([np.nan] + dates[1:], dtype="datetime64[ns]") + tm.assert_series_equal(result, expected) + + dts = Series(dates, dtype="datetime64[ns]") + + # valid astype + dts.astype("int64") + + # invalid casting + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to \[int32\]" + with pytest.raises(TypeError, match=msg): + dts.astype("int32") + + # ints are ok + # we test with np.int64 to get similar results on + # windows / 32-bit platforms + result = Series(dts, dtype=np.int64) + expected = Series(dts.astype(np.int64)) + tm.assert_series_equal(result, expected) + + # invalid dates can be help as object + result = Series([datetime(2, 1, 1)]) + assert result[0] == datetime(2, 1, 1, 0, 0) + + result = Series([datetime(3000, 1, 1)]) + assert result[0] == datetime(3000, 1, 1, 0, 0) + + # don't mix types + result = Series([Timestamp("20130101"), 1], index=["a", "b"]) + assert result["a"] == Timestamp("20130101") + assert result["b"] == 1 + + # GH6529 + # coerce datetime64 non-ns properly + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + values2 = dates.view(np.ndarray).astype("datetime64[ns]") + expected = Series(values2, index=dates) + + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) + result = Series(values1, dates) + tm.assert_series_equal(result, expected) + + # GH 13876 + # coerce to non-ns to object properly + expected = Series(values2, index=dates, dtype=object) + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) + result = Series(values1, index=dates, dtype=object) + tm.assert_series_equal(result, expected) + + # leave datetime.date alone + dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) + series1 = Series(dates2, dates) + tm.assert_numpy_array_equal(series1.values, dates2) + assert series1.dtype == object + + # these will correctly infer a datetime + s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + + # tz-aware (UTC and other tz's) + # GH 8411 + dr = date_range("20130101", periods=3) + assert Series(dr).iloc[0].tz is None + dr = date_range("20130101", periods=3, tz="UTC") + assert str(Series(dr).iloc[0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + assert str(Series(dr).iloc[0].tz) == "US/Eastern" + + # non-convertible + s = Series([1479596223000, -1479590, pd.NaT]) + assert s.dtype == "object" + assert s[2] is pd.NaT + assert "NaT" in str(s) + + # if we passed a NaT it remains + s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) + assert s.dtype == "object" + assert s[2] is pd.NaT + assert "NaT" in str(s) + + # if we passed a nan it remains + s = 
Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) + assert s.dtype == "object" + assert s[2] is np.nan + assert "NaN" in str(s) + + def test_constructor_with_datetime_tz(self): + + # 8260 + # support datetime64 with tz + + dr = date_range("20130101", periods=3, tz="US/Eastern") + s = Series(dr) + assert s.dtype.name == "datetime64[ns, US/Eastern]" + assert s.dtype == "datetime64[ns, US/Eastern]" + assert is_datetime64tz_dtype(s.dtype) + assert "datetime64[ns, US/Eastern]" in str(s) + + # export + result = s.values + assert isinstance(result, np.ndarray) + assert result.dtype == "datetime64[ns]" + + exp = pd.DatetimeIndex(result) + exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) + tm.assert_index_equal(dr, exp) + + # indexing + result = s.iloc[0] + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) + result = s[0] + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) + + result = s[Series([True, True, False], index=s.index)] + tm.assert_series_equal(result, s[0:2]) + + result = s.iloc[0:1] + tm.assert_series_equal(result, Series(dr[0:1])) + + # concat + result = pd.concat([s.iloc[0:1], s.iloc[1:]]) + tm.assert_series_equal(result, s) + + # short str + assert "datetime64[ns, US/Eastern]" in str(s) + + # formatting with NaT + result = s.shift() + assert "datetime64[ns, US/Eastern]" in str(result) + assert "NaT" in str(result) + + # long str + t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) + assert "datetime64[ns, US/Eastern]" in str(t) + + result = pd.DatetimeIndex(s, freq="infer") + tm.assert_index_equal(result, dr) + + # inference + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] + ) + assert s.dtype == "datetime64[ns, US/Pacific]" + assert lib.infer_dtype(s, skipna=True) == "datetime64" + + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), + ] + ) + assert s.dtype == "object" + assert lib.infer_dtype(s, skipna=True) == "datetime" + + # with all NaT + s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) + tm.assert_series_equal(s, expected) + + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units + # gh-19223 + dtype = "{}[{}]".format(dtype, unit) + arr = np.array([1, 2, 3], dtype=arr_dtype) + s = Series(arr) + result = s.astype(dtype) + expected = Series(arr.astype(dtype)) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", pd.NaT, np.nan, None]) + def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): + # GH 17415: With naive string + result = Series([arg], dtype="datetime64[ns, CET]") + expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("interval_constructor", [IntervalIndex, IntervalArray]) + def test_construction_interval(self, interval_constructor): + # construction from interval & array of intervals + intervals = interval_constructor.from_breaks(np.arange(3), closed="right") + result = Series(intervals) + assert result.dtype == "interval[int64]" 
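+        # from_breaks(np.arange(3)) yields the two intervals (0, 1] and
+        # (1, 2], so the Series should carry the interval[int64] dtype
+        # asserted above.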
+ tm.assert_index_equal(Index(result.values), Index(intervals)) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_interval(self, data_constructor): + # GH 23563: consistent closed results in interval dtype + data = [pd.Interval(0, 1), pd.Interval(0, 2), None] + result = pd.Series(data_constructor(data)) + expected = pd.Series(IntervalArray(data)) + assert result.dtype == "interval[float64]" + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_interval_mixed_closed(self, data_constructor): + # GH 23563: mixed closed results in object dtype (not interval dtype) + data = [pd.Interval(0, 1, closed="both"), pd.Interval(0, 2, closed="neither")] + result = Series(data_constructor(data)) + assert result.dtype == object + assert result.tolist() == data + + def test_construction_consistency(self): + + # make sure that we are not re-localizing upon construction + # GH 14928 + s = Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) + + result = Series(s, dtype=s.dtype) + tm.assert_series_equal(result, s) + + result = Series(s.dt.tz_convert("UTC"), dtype=s.dtype) + tm.assert_series_equal(result, s) + + result = Series(s.values, dtype=s.dtype) + tm.assert_series_equal(result, s) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_period(self, data_constructor): + data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] + result = pd.Series(data_constructor(data)) + expected = pd.Series(period_array(data)) + tm.assert_series_equal(result, expected) + assert result.dtype == "Period[D]" + + def test_constructor_period_incompatible_frequency(self): + data = [pd.Period("2000", "D"), pd.Period("2001", "A")] + result = pd.Series(data) + assert result.dtype == object + assert result.tolist() == data + + def test_constructor_periodindex(self): + # GH7932 + # converting a PeriodIndex when put in a Series + + pi = period_range("20130101", periods=5, freq="D") + s = Series(pi) + assert s.dtype == "Period[D]" + expected = Series(pi.astype(object)) + tm.assert_series_equal(s, expected) + + def test_constructor_dict(self): + d = {"a": 0.0, "b": 1.0, "c": 2.0} + result = Series(d, index=["b", "c", "d", "a"]) + expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) + tm.assert_series_equal(result, expected) + + pidx = tm.makePeriodIndex(100) + d = {pidx[0]: 0, pidx[1]: 1} + result = Series(d, index=pidx) + expected = Series(np.nan, pidx, dtype=np.float64) + expected.iloc[0] = 0 + expected.iloc[1] = 1 + tm.assert_series_equal(result, expected) + + def test_constructor_dict_list_value_explicit_dtype(self): + # GH 18625 + d = {"a": [[2], [3], [4]]} + result = Series(d, index=["a"], dtype="object") + expected = Series(d, index=["a"]) + tm.assert_series_equal(result, expected) + + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {"b": 1, "a": 0, "c": 2} + result = Series(d) + expected = Series([1, 0, 2], index=list("bac")) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) + def test_constructor_dict_nan_key(self, value): + # GH 18480 + d = {1: "a", value: "b", float("nan"): "c", 4: "d"} + result = Series(d).sort_values() + expected = Series(["a", "b", "c", 
"d"], index=[1, value, np.nan, 4]) + tm.assert_series_equal(result, expected) + + # MultiIndex: + d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"} + result = Series(d).sort_values() + expected = Series( + ["a", "b", "c"], index=Index([(1, 1), (2, np.nan), (3, value)]) + ) + tm.assert_series_equal(result, expected) + + def test_constructor_dict_datetime64_index(self): + # GH 9456 + + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] + values = [42544017.198965244, 1234565, 40512335.181958228, -1] + + def create_data(constructor): + return dict(zip((constructor(x) for x in dates_as_str), values)) + + data_datetime64 = create_data(np.datetime64) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) + data_Timestamp = create_data(Timestamp) + + expected = Series(values, (Timestamp(x) for x in dates_as_str)) + + result_datetime64 = Series(data_datetime64) + result_datetime = Series(data_datetime) + result_Timestamp = Series(data_Timestamp) + + tm.assert_series_equal(result_datetime64, expected) + tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal(result_Timestamp, expected) + + def test_constructor_mapping(self, non_mapping_dict_subclass): + # GH 29788 + ndm = non_mapping_dict_subclass({3: "three"}) + result = Series(ndm) + expected = Series(["three"], index=[3]) + + tm.assert_series_equal(result, expected) + + def test_constructor_list_of_tuples(self): + data = [(1, 1), (2, 2), (2, 3)] + s = Series(data) + assert list(s) == data + + def test_constructor_tuple_of_tuples(self): + data = ((1, 1), (2, 2), (2, 3)) + s = Series(data) + assert tuple(s) == data + + def test_constructor_dict_of_tuples(self): + data = {(1, 2): 3, (None, 5): 6} + result = Series(data).sort_values() + expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + tm.assert_series_equal(result, expected) + + def test_constructor_set(self): + values = {1, 2, 3, 4, 5} + with pytest.raises(TypeError, match="'set' type is unordered"): + Series(values) + values = frozenset(values) + with pytest.raises(TypeError, match="'frozenset' type is unordered"): + Series(values) + + # https://github.com/pandas-dev/pandas/issues/22698 + @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") + def test_fromDict(self): + data = {"a": 0, "b": 1, "c": 2, "d": 3} + + series = Series(data) + tm.assert_is_sorted(series.index) + + data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()} + series = Series(data) + assert series.dtype == np.object_ + + data = {"a": 0, "b": "1", "c": "2", "d": "3"} + series = Series(data) + assert series.dtype == np.object_ + + data = {"a": "0", "b": "1"} + series = Series(data, dtype=float) + assert series.dtype == np.float64 + + def test_fromValue(self, datetime_series): + + nans = Series(np.NaN, index=datetime_series.index, dtype=np.float64) + assert nans.dtype == np.float_ + assert len(nans) == len(datetime_series) + + strings = Series("foo", index=datetime_series.index) + assert strings.dtype == np.object_ + assert len(strings) == len(datetime_series) + + d = datetime.now() + dates = Series(d, index=datetime_series.index) + assert dates.dtype == "M8[ns]" + assert len(dates) == len(datetime_series) + + # GH12336 + # Test construction of categorical series from value + categorical = Series(0, index=datetime_series.index, dtype="category") + expected = Series(0, index=datetime_series.index).astype("category") + assert categorical.dtype == "category" + assert len(categorical) == len(datetime_series) + 
tm.assert_series_equal(categorical, expected) + + def test_constructor_dtype_timedelta64(self): + + # basic + td = Series([timedelta(days=i) for i in range(3)]) + assert td.dtype == "timedelta64[ns]" + + td = Series([timedelta(days=1)]) + assert td.dtype == "timedelta64[ns]" + + td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")]) + + assert td.dtype == "timedelta64[ns]" + + # mixed with NaT + td = Series([timedelta(days=1), NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" + + td = Series([timedelta(days=1), np.nan], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" + + td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" + + # improved inference + # GH5689 + td = Series([np.timedelta64(300000000), NaT]) + assert td.dtype == "timedelta64[ns]" + + # because iNaT is int, not coerced to timedelta + td = Series([np.timedelta64(300000000), iNaT]) + assert td.dtype == "object" + + td = Series([np.timedelta64(300000000), np.nan]) + assert td.dtype == "timedelta64[ns]" + + td = Series([pd.NaT, np.timedelta64(300000000)]) + assert td.dtype == "timedelta64[ns]" + + td = Series([np.timedelta64(1, "s")]) + assert td.dtype == "timedelta64[ns]" + + # these are frequency conversion astypes + # for t in ['s', 'D', 'us', 'ms']: + # with pytest.raises(TypeError): + # td.astype('m8[%s]' % t) + + # valid astype + td.astype("int64") + + # invalid casting + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" + with pytest.raises(TypeError, match=msg): + td.astype("int32") + + # this is an invalid casting + msg = "Could not convert object to NumPy timedelta" + with pytest.raises(ValueError, match=msg): + Series([timedelta(days=1), "foo"], dtype="m8[ns]") + + # leave as object here + td = Series([timedelta(days=i) for i in range(3)] + ["foo"]) + assert td.dtype == "object" + + # these will correctly infer a timedelta + s = Series([None, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([np.nan, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, None, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, np.nan, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + + # GH 16406 + def test_constructor_mixed_tz(self): + s = Series([Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")]) + expected = Series( + [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")], + dtype="object", + ) + tm.assert_series_equal(s, expected) + + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") + + val = series[3] + assert isna(val) + + series[2] = val + assert isna(series[2]) + + def test_NaT_cast(self): + # GH10747 + result = Series([np.nan]).astype("M8[ns]") + expected = Series([NaT]) + tm.assert_series_equal(result, expected) + + def test_constructor_name_hashable(self): + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05D0"]: + for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]: + s = Series(data, name=n) + assert s.name == n + + def test_constructor_name_unhashable(self): + msg = r"Series\.name must be a hashable type" + for n in [["name_list"], np.ones(2), {1: 2}]: + for data in [["name_list"], np.ones(2), {1: 2}]: + with pytest.raises(TypeError, match=msg): + Series(data, name=n) + + def test_auto_conversion(self): + series = Series(list(date_range("1/1/2000", periods=10))) + assert series.dtype == "M8[ns]" + + def test_convert_non_ns(self): + # convert from a numpy array of non-ns 
timedelta64 + arr = np.array([1, 2, 3], dtype="timedelta64[s]") + s = Series(arr) + expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s")) + tm.assert_series_equal(s, expected) + + # convert from a numpy array of non-ns datetime64 + # note that creating a numpy datetime64 is in LOCAL time!!!! + # seems to work for M8[D], but not for M8[s] + + s = Series( + np.array(["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]") + ) + tm.assert_series_equal(s, Series(date_range("20130101", periods=3, freq="D"))) + + # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 + # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) + + # tm.assert_series_equal(s,date_range('20130101 + # 00:00:01',period=3,freq='s')) + + @pytest.mark.parametrize( + "index", + [ + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) + def test_constructor_cant_cast_datetimelike(self, index): + + # floats are not ok + msg = "Cannot cast {}.*? to ".format( + # strip Index to convert PeriodIndex -> Period + # We don't care whether the error message says + # PeriodIndex or PeriodArray + type(index).__name__.rstrip("Index") + ) + with pytest.raises(TypeError, match=msg): + Series(index, dtype=float) + + # ints are ok + # we test with np.int64 to get similar results on + # windows / 32-bit platforms + result = Series(index, dtype=np.int64) + expected = Series(index.astype(np.int64)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "index", + [ + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) + def test_constructor_cast_object(self, index): + s = Series(index, dtype=object) + exp = Series(index).astype(object) + tm.assert_series_equal(s, exp) + + s = Series(pd.Index(index, dtype=object), dtype=object) + exp = Series(index).astype(object) + tm.assert_series_equal(s, exp) + + s = Series(index.astype(object), dtype=object) + exp = Series(index).astype(object) + tm.assert_series_equal(s, exp) + + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) + def test_constructor_generic_timestamp_no_frequency(self, dtype): + # see gh-15524, gh-15987 + msg = "dtype has no unit. 
Please pass in" + + with pytest.raises(ValueError, match=msg): + Series([], dtype=dtype) + + @pytest.mark.parametrize( + "dtype,msg", + [ + ("m8[ps]", "cannot convert timedeltalike"), + ("M8[ps]", "cannot convert datetimelike"), + ], + ) + def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg): + # see gh-15524, gh-15987 + + with pytest.raises(TypeError, match=msg): + Series([], dtype=dtype) + + @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) + def test_constructor_range_dtype(self, dtype): + # GH 16804 + expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64") + result = Series(range(5), dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_constructor_tz_mixed_data(self): + # GH 13051 + dt_list = [ + Timestamp("2016-05-01 02:03:37"), + Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"), + ] + result = Series(dt_list) + expected = Series(dt_list, dtype=object) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_convert_dtypes.py b/venv/Lib/site-packages/pandas/tests/series/test_convert_dtypes.py new file mode 100644 index 0000000..923b5a9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_convert_dtypes.py @@ -0,0 +1,248 @@ +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestSeriesConvertDtypes: + # The answerdict has keys that have 4 tuples, corresponding to the arguments + # infer_objects, convert_string, convert_integer, convert_boolean + # This allows all 16 possible combinations to be tested. Since common + # combinations expect the same answer, this provides an easy way to list + # all the possibilities + @pytest.mark.parametrize( + "data, maindtype, answerdict", + [ + ( + [1, 2, 3], + np.dtype("int32"), + { + ((True, False), (True, False), (True,), (True, False)): "Int32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int32" + ), + }, + ), + ( + [1, 2, 3], + np.dtype("int64"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "int64" + ), + }, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [True, False, np.nan], + np.dtype("O"), + { + ( + (True, False), + (True, False), + (True, False), + (True,), + ): pd.BooleanDtype(), + ((True, False), (True, False), (True, False), (False,)): np.dtype( + "O" + ), + }, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + { + ( + (True, False), + (True,), + (True, False), + (True, False), + ): pd.StringDtype(), + ((True, False), (False,), (True, False), (True, False)): np.dtype( + "O" + ), + }, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("float"), + }, + ), + ( + [3, 4, 5], + "Int8", + {((True, False), (True, False), (True, False), (True, False)): "Int8"}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("O"), + }, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + { + 
((True, False), (True, False), (True,), (True, False)): "UInt32", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "uint32" + ), + }, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + { + ((True, False), (True, False), (True,), (True, False)): "Int8", + ((True, False), (True, False), (False,), (True, False)): np.dtype( + "i1" + ), + }, + ), + ( + [1, 2.0], + object, + { + ((True, False), (True, False), (True,), (True, False)): "Int64", + ((True,), (True, False), (False,), (True, False)): np.dtype( + "float" + ), + ((False,), (True, False), (False,), (True, False)): np.dtype( + "object" + ), + }, + ), + ( + ["a", "b"], + pd.CategoricalDtype(), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.CategoricalDtype(), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.DatetimeTZDtype(tz="UTC"), + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.DatetimeTZDtype(tz="UTC"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + "datetime64[ns]", + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): np.dtype("datetime64[ns]"), + }, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + object, + { + ((True,), (True, False), (True, False), (True, False),): np.dtype( + "datetime64[ns]" + ), + ((False,), (True, False), (True, False), (True, False),): np.dtype( + "O" + ), + }, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.PeriodDtype("M"), + }, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + { + ( + (True, False), + (True, False), + (True, False), + (True, False), + ): pd.IntervalDtype("int64"), + }, + ), + ], + ) + @pytest.mark.parametrize("params", product(*[(True, False)] * 4)) + def test_convert_dtypes(self, data, maindtype, params, answerdict): + if maindtype is not None: + series = pd.Series(data, dtype=maindtype) + else: + series = pd.Series(data) + answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} + + ns = series.convert_dtypes(*params) + expected_dtype = answers[tuple(params)] + expected = pd.Series(series.values, dtype=expected_dtype) + tm.assert_series_equal(ns, expected) + + # Test that it is a copy + copy = series.copy(deep=True) + ns[ns.notna()] = np.nan + + # Make sure original not changed + tm.assert_series_equal(series, copy) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_cumulative.py b/venv/Lib/site-packages/pandas/tests/series/test_cumulative.py new file mode 100644 index 0000000..885b5bf --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_cumulative.py @@ -0,0 +1,170 @@ +""" +Tests for Series cumulative operations. 
+ +See also +-------- +tests.frame.test_cumulative +""" +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def _check_accum_op(name, series, check_dtype=True): + func = getattr(np, name) + tm.assert_numpy_array_equal( + func(series).values, func(np.array(series)), check_dtype=check_dtype, + ) + + # with missing values + ts = series.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) + + +class TestSeriesCumulativeOps: + def test_cumsum(self, datetime_series): + _check_accum_op("cumsum", datetime_series) + + def test_cumprod(self, datetime_series): + _check_accum_op("cumprod", datetime_series) + + def test_cummin(self, datetime_series): + tm.assert_numpy_array_equal( + datetime_series.cummin().values, + np.minimum.accumulate(np.array(datetime_series)), + ) + ts = datetime_series.copy() + ts[::2] = np.NaN + result = ts.cummin()[1::2] + expected = np.minimum.accumulate(ts.dropna()) + + tm.assert_series_equal(result, expected) + + def test_cummax(self, datetime_series): + tm.assert_numpy_array_equal( + datetime_series.cummax().values, + np.maximum.accumulate(np.array(datetime_series)), + ) + ts = datetime_series.copy() + ts[::2] = np.NaN + result = ts.cummax()[1::2] + expected = np.maximum.accumulate(ts.dropna()) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummin_datetime64(self, tz): + s = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"] + ).tz_localize(tz) + ) + result = s.cummin(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] + ).tz_localize(tz) + ) + result = s.cummin(skipna=False) + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummax_datetime64(self, tz): + s = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + result = s.cummax(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] + ).tz_localize(tz) + ) + result = s.cummax(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummin_timedelta64(self): + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]) + ) + result = s.cummin(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]) + ) + result = s.cummin(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummax_timedelta64(self): + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]) + ) + result = s.cummax(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + 
pd.to_timedelta(["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]) + ) + result = s.cummax(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummethods_bool(self): + # GH#6270 + + a = pd.Series([False, False, False, True, True, False, False]) + b = ~a + c = pd.Series([False] * len(b)) + d = ~c + methods = { + "cumsum": np.cumsum, + "cumprod": np.cumprod, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + } + args = product((a, b, c, d), methods) + for s, method in args: + expected = pd.Series(methods[method](s.values)) + result = getattr(s, method)() + tm.assert_series_equal(result, expected) + + e = pd.Series([False, True, np.nan, False]) + cse = pd.Series([0, 1, np.nan, 1], dtype=object) + cpe = pd.Series([False, 0, np.nan, 0]) + cmin = pd.Series([False, False, np.nan, False]) + cmax = pd.Series([False, True, np.nan, True]) + expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} + + for method in methods: + res = getattr(e, method)() + tm.assert_series_equal(res, expecteds[method]) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_datetime_values.py b/venv/Lib/site-packages/pandas/tests/series/test_datetime_values.py new file mode 100644 index 0000000..b8be4ea --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_datetime_values.py @@ -0,0 +1,689 @@ +import calendar +from datetime import date, datetime, time +import locale +import unicodedata + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs.timezones import maybe_get_tz + +from pandas.core.dtypes.common import is_integer_dtype, is_list_like + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Series, + TimedeltaIndex, + bdate_range, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import PeriodArray +import pandas.core.common as com + + +class TestSeriesDatetimeValues: + def test_dt_namespace_accessor(self): + + # GH 7207, 11128 + # test .dt namespace accessor + + ok_for_period = PeriodArray._datetimelike_ops + ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] + ok_for_dt = DatetimeIndex._datetimelike_ops + ok_for_dt_methods = [ + "to_period", + "to_pydatetime", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "day_name", + "month_name", + ] + ok_for_td = TimedeltaIndex._datetimelike_ops + ok_for_td_methods = [ + "components", + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] + + def get_expected(s, name): + result = getattr(Index(s._values), prop) + if isinstance(result, np.ndarray): + if is_integer_dtype(result): + result = result.astype("int64") + elif not is_list_like(result): + return result + return Series(result, index=s.index, name=s.name) + + def compare(s, name): + a = getattr(s.dt, prop) + b = get_expected(s, prop) + if not (is_list_like(a) and is_list_like(b)): + assert a == b + else: + tm.assert_series_equal(a, b) + + # datetimeindex + cases = [ + Series(date_range("20130101", periods=5), name="xxx"), + Series(date_range("20130101", periods=5, freq="s"), name="xxx"), + Series(date_range("20130101 00:00:00", periods=5, freq="ms"), name="xxx"), + ] + for s in cases: + for prop in ok_for_dt: + # we test freq below + if prop != "freq": + compare(s, prop) + + for prop in ok_for_dt_methods: + getattr(s.dt, prop) + + result = s.dt.to_pydatetime() + assert isinstance(result, np.ndarray) + assert result.dtype == object + + 
result = s.dt.tz_localize("US/Eastern") + exp_values = DatetimeIndex(s.values).tz_localize("US/Eastern") + expected = Series(exp_values, index=s.index, name="xxx") + tm.assert_series_equal(result, expected) + + tz_result = result.dt.tz + assert str(tz_result) == "US/Eastern" + freq_result = s.dt.freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq + + # let's localize, then convert + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") + exp_values = ( + DatetimeIndex(s.values).tz_localize("UTC").tz_convert("US/Eastern") + ) + expected = Series(exp_values, index=s.index, name="xxx") + tm.assert_series_equal(result, expected) + + # datetimeindex with tz + s = Series(date_range("20130101", periods=5, tz="US/Eastern"), name="xxx") + for prop in ok_for_dt: + + # we test freq below + if prop != "freq": + compare(s, prop) + + for prop in ok_for_dt_methods: + getattr(s.dt, prop) + + result = s.dt.to_pydatetime() + assert isinstance(result, np.ndarray) + assert result.dtype == object + + result = s.dt.tz_convert("CET") + expected = Series(s._values.tz_convert("CET"), index=s.index, name="xxx") + tm.assert_series_equal(result, expected) + + tz_result = result.dt.tz + assert str(tz_result) == "CET" + freq_result = s.dt.freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq + + # timedelta index + cases = [ + Series( + timedelta_range("1 day", periods=5), index=list("abcde"), name="xxx" + ), + Series(timedelta_range("1 day 01:23:45", periods=5, freq="s"), name="xxx"), + Series( + timedelta_range("2 days 01:23:45.012345", periods=5, freq="ms"), + name="xxx", + ), + ] + for s in cases: + for prop in ok_for_td: + # we test freq below + if prop != "freq": + compare(s, prop) + + for prop in ok_for_td_methods: + getattr(s.dt, prop) + + result = s.dt.components + assert isinstance(result, DataFrame) + tm.assert_index_equal(result.index, s.index) + + result = s.dt.to_pytimedelta() + assert isinstance(result, np.ndarray) + assert result.dtype == object + + result = s.dt.total_seconds() + assert isinstance(result, pd.Series) + assert result.dtype == "float64" + + freq_result = s.dt.freq + assert freq_result == TimedeltaIndex(s.values, freq="infer").freq + + # both + index = date_range("20130101", periods=3, freq="D") + s = Series(date_range("20140204", periods=3, freq="s"), index=index, name="xxx") + exp = Series( + np.array([2014, 2014, 2014], dtype="int64"), index=index, name="xxx" + ) + tm.assert_series_equal(s.dt.year, exp) + + exp = Series(np.array([2, 2, 2], dtype="int64"), index=index, name="xxx") + tm.assert_series_equal(s.dt.month, exp) + + exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx") + tm.assert_series_equal(s.dt.second, exp) + + exp = pd.Series([s[0]] * 3, index=index, name="xxx") + tm.assert_series_equal(s.dt.normalize(), exp) + + # periodindex + cases = [Series(period_range("20130101", periods=5, freq="D"), name="xxx")] + for s in cases: + for prop in ok_for_period: + # we test freq below + if prop != "freq": + compare(s, prop) + + for prop in ok_for_period_methods: + getattr(s.dt, prop) + + freq_result = s.dt.freq + assert freq_result == PeriodIndex(s.values).freq + + # test limited display api + def get_dir(s): + results = [r for r in s.dt.__dir__() if not r.startswith("_")] + return sorted(set(results)) + + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") + results = get_dir(s) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) + + s = Series( + period_range("20130101", 
periods=5, freq="D", name="xxx").astype(object) + ) + results = get_dir(s) + tm.assert_almost_equal( + results, sorted(set(ok_for_period + ok_for_period_methods)) + ) + + # 11295 + # ambiguous time error on the conversions + s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") + results = get_dir(s) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) + exp_values = pd.date_range( + "2015-01-01", "2016-01-01", freq="T", tz="UTC" + ).tz_convert("America/Chicago") + expected = Series(exp_values, name="xxx") + tm.assert_series_equal(s, expected) + + # no setting allowed + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") + with pytest.raises(ValueError, match="modifications"): + s.dt.hour = 5 + + # trying to set a copy + with pd.option_context("chained_assignment", "raise"): + with pytest.raises(com.SettingWithCopyError): + s.dt.hour[0] = 5 + + @pytest.mark.parametrize( + "method, dates", + [ + ["round", ["2012-01-02", "2012-01-02", "2012-01-01"]], + ["floor", ["2012-01-01", "2012-01-01", "2012-01-01"]], + ["ceil", ["2012-01-02", "2012-01-02", "2012-01-02"]], + ], + ) + def test_dt_round(self, method, dates): + # round + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = getattr(s.dt, method)("D") + expected = Series(pd.to_datetime(dates), name="xxx") + tm.assert_series_equal(result, expected) + + def test_dt_round_tz(self): + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern").dt.round("D") + + exp_values = pd.to_datetime( + ["2012-01-01", "2012-01-01", "2012-01-01"] + ).tz_localize("US/Eastern") + expected = Series(exp_values, name="xxx") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) + def test_dt_round_tz_ambiguous(self, method): + # GH 18946 round near "fall back" DST + df1 = pd.DataFrame( + [ + pd.to_datetime("2017-10-29 02:00:00+02:00", utc=True), + pd.to_datetime("2017-10-29 02:00:00+01:00", utc=True), + pd.to_datetime("2017-10-29 03:00:00+01:00", utc=True), + ], + columns=["date"], + ) + df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") + # infer + result = getattr(df1.date.dt, method)("H", ambiguous="infer") + expected = df1["date"] + tm.assert_series_equal(result, expected) + + # bool-array + result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) + tm.assert_series_equal(result, expected) + + # NaT + result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + expected = df1["date"].copy() + expected.iloc[0:2] = pd.NaT + tm.assert_series_equal(result, expected) + + # raise + with pytest.raises(pytz.AmbiguousTimeError): + getattr(df1.date.dt, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) + def test_dt_round_tz_nonexistent(self, method, ts_str, freq): + # GH 23324 round near "spring forward" DST + s = Series([pd.Timestamp(ts_str, tz="America/Chicago")]) + result = getattr(s.dt, method)(freq, nonexistent="shift_forward") + expected = Series([pd.Timestamp("2018-03-11 03:00:00", tz="America/Chicago")]) + tm.assert_series_equal(result, expected) + + result = 
getattr(s.dt, method)(freq, nonexistent="NaT") + expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) + tm.assert_series_equal(result, expected) + + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(s.dt, method)(freq, nonexistent="raise") + + def test_dt_namespace_accessor_categorical(self): + # GH 19468 + dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) + s = Series(pd.Categorical(dti), name="foo") + result = s.dt.year + expected = Series([2017, 2017, 2018, 2018], name="foo") + tm.assert_series_equal(result, expected) + + def test_dt_tz_localize_categorical(self, tz_aware_fixture): + # GH 27952 + tz = tz_aware_fixture + datetimes = pd.Series( + ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns]" + ) + categorical = datetimes.astype("category") + result = categorical.dt.tz_localize(tz) + expected = datetimes.dt.tz_localize(tz) + tm.assert_series_equal(result, expected) + + def test_dt_tz_convert_categorical(self, tz_aware_fixture): + # GH 27952 + tz = tz_aware_fixture + datetimes = pd.Series( + ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns, MET]" + ) + categorical = datetimes.astype("category") + result = categorical.dt.tz_convert(tz) + expected = datetimes.dt.tz_convert(tz) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("accessor", ["year", "month", "day"]) + def test_dt_other_accessors_categorical(self, accessor): + # GH 27952 + datetimes = pd.Series( + ["2018-01-01", "2018-01-01", "2019-01-02"], dtype="datetime64[ns]" + ) + categorical = datetimes.astype("category") + result = getattr(categorical.dt, accessor) + expected = getattr(datetimes.dt, accessor) + tm.assert_series_equal(result, expected) + + def test_dt_accessor_no_new_attributes(self): + # https://github.com/pandas-dev/pandas/issues/10673 + s = Series(date_range("20130101", periods=5, freq="D")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): + s.dt.xlabel = "a" + + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) + def test_dt_accessor_datetime_name_accessors(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day-name and month_name should + # return the english attributes + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + s = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): + name = name.capitalize() + assert s.dt.day_name(locale=time_locale)[day] == name + s = s.append(Series([pd.NaT])) + assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) + + s = Series(date_range(freq="M", start="2012", end="2013")) + result = s.dt.month_name(locale=time_locale) + expected = Series([month.capitalize() for month in expected_months]) + + # work around https://github.com/pandas-dev/pandas/issues/22342 + result = result.str.normalize("NFD") + expected = 
expected.str.normalize("NFD") + + tm.assert_series_equal(result, expected) + + for s_date, expected in zip(s, expected_months): + result = s_date.month_name(locale=time_locale) + expected = expected.capitalize() + + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", expected) + + assert result == expected + + s = s.append(Series([pd.NaT])) + assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) + + def test_strftime(self): + # GH 10086 + s = Series(date_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) + tm.assert_series_equal(result, expected) + + s = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/04 11-22-33", + "2015/02/05 11-22-33", + "2015/02/06 11-22-33", + "2015/02/07 11-22-33", + ] + ) + tm.assert_series_equal(result, expected) + + s = Series(period_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) + tm.assert_series_equal(result, expected) + + s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/03 11-22-34", + "2015/02/03 11-22-35", + "2015/02/03 11-22-36", + "2015/02/03 11-22-37", + ] + ) + tm.assert_series_equal(result, expected) + + s = Series(date_range("20130101", periods=5)) + s.iloc[0] = pd.NaT + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) + tm.assert_series_equal(result, expected) + + datetime_index = date_range("20150301", periods=5) + result = datetime_index.strftime("%Y/%m/%d") + + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype=np.object_, + ) + # dtype may be S10 or U10 depending on python version + tm.assert_index_equal(result, expected) + + period_index = period_range("20150301", periods=5) + result = period_index.strftime("%Y/%m/%d") + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype="=U10", + ) + tm.assert_index_equal(result, expected) + + s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) + result = s.dt.strftime("%Y-%m-%d %H:%M:%S") + expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"]) + tm.assert_series_equal(result, expected) + + s = Series(period_range("20130101", periods=4, freq="H")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S") + expected = Series( + [ + "2013/01/01 00:00:00", + "2013/01/01 01:00:00", + "2013/01/01 02:00:00", + "2013/01/01 03:00:00", + ] + ) + + s = Series(period_range("20130101", periods=4, freq="L")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l") + expected = Series( + [ + "2013/01/01 00:00:00.000", + "2013/01/01 00:00:00.001", + "2013/01/01 00:00:00.002", + "2013/01/01 00:00:00.003", + ] + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data", + [ + DatetimeIndex(["2019-01-01", pd.NaT]), + PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]"), + ], + ) + def test_strftime_nat(self, data): + # GH 29578 + s = Series(data) + result = s.dt.strftime("%Y-%m-%d") + expected = Series(["2019-01-01", np.nan]) + tm.assert_series_equal(result, expected) + + def 
test_valid_dt_with_missing_values(self): + + from datetime import date, time + + # GH 8689 + s = Series(date_range("20130101", periods=5, freq="D")) + s.iloc[2] = pd.NaT + + for attr in ["microsecond", "nanosecond", "second", "minute", "hour", "day"]: + expected = getattr(s.dt, attr).copy() + expected.iloc[2] = np.nan + result = getattr(s.dt, attr) + tm.assert_series_equal(result, expected) + + result = s.dt.date + expected = Series( + [ + date(2013, 1, 1), + date(2013, 1, 2), + np.nan, + date(2013, 1, 4), + date(2013, 1, 5), + ], + dtype="object", + ) + tm.assert_series_equal(result, expected) + + result = s.dt.time + expected = Series([time(0), time(0), np.nan, time(0), time(0)], dtype="object") + tm.assert_series_equal(result, expected) + + def test_dt_accessor_api(self): + # GH 9322 + from pandas.core.indexes.accessors import ( + CombinedDatetimelikeProperties, + DatetimeProperties, + ) + + assert Series.dt is CombinedDatetimelikeProperties + + s = Series(date_range("2000-01-01", periods=3)) + assert isinstance(s.dt, DatetimeProperties) + + @pytest.mark.parametrize( + "ser", [Series(np.arange(5)), Series(list("abcde")), Series(np.random.randn(5))] + ) + def test_dt_accessor_invalid(self, ser): + # GH#9322 check that series with incorrect dtypes don't have attr + with pytest.raises(AttributeError, match="only use .dt accessor"): + ser.dt + assert not hasattr(ser, "dt") + + def test_dt_accessor_updates_on_inplace(self): + s = Series(pd.date_range("2018-01-01", periods=10)) + s[2] = None + s.fillna(pd.Timestamp("2018-01-01"), inplace=True) + result = s.dt.date + assert result[0] == result[2] + + def test_between(self): + s = Series(bdate_range("1/1/2000", periods=20).astype(object)) + s[::2] = np.nan + + result = s[s.between(s[3], s[17])] + expected = s[3:18].dropna() + tm.assert_series_equal(result, expected) + + result = s[s.between(s[3], s[17], inclusive=False)] + expected = s[5:16].dropna() + tm.assert_series_equal(result, expected) + + def test_date_tz(self): + # GH11757 + rng = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], + tz="US/Eastern", + ) + s = Series(rng) + expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) + tm.assert_series_equal(s.dt.date, expected) + tm.assert_series_equal(s.apply(lambda x: x.date()), expected) + + def test_datetime_understood(self): + # Ensures it doesn't fail to create the right series + # reported in issue#16726 + series = pd.Series(pd.date_range("2012-01-01", periods=3)) + offset = pd.offsets.DateOffset(days=6) + result = series - offset + expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) + tm.assert_series_equal(result, expected) + + def test_dt_timetz_accessor(self, tz_naive_fixture): + # GH21358 + tz = maybe_get_tz(tz_naive_fixture) + + dtindex = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], tz=tz + ) + s = Series(dtindex) + expected = Series( + [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)] + ) + result = s.dt.timetz + tm.assert_series_equal(result, expected) + + def test_setitem_with_string_index(self): + # GH 23451 + x = pd.Series([1, 2, 3], index=["Date", "b", "other"]) + x["Date"] = date.today() + assert x.Date == date.today() + assert x["Date"] == date.today() + + def test_setitem_with_different_tz(self): + # GH#24024 + ser = pd.Series(pd.date_range("2000", periods=2, tz="US/Central")) + ser[0] = pd.Timestamp("2000", tz="US/Eastern") + expected = pd.Series( + [ + 
pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), + pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), + ], + dtype=object, + ) + tm.assert_series_equal(ser, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_dtypes.py b/venv/Lib/site-packages/pandas/tests/series/test_dtypes.py new file mode 100644 index 0000000..a57ec2b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_dtypes.py @@ -0,0 +1,489 @@ +from datetime import datetime, timedelta +from importlib import reload +import string +import sys + +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestSeriesDtypes: + def test_dt64_series_astype_object(self): + dt64ser = Series(date_range("20130101", periods=3)) + result = dt64ser.astype(object) + assert isinstance(result.iloc[0], datetime) + assert result.dtype == np.object_ + + def test_td64_series_astype_object(self): + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") + result = tdser.astype(object) + assert isinstance(result.iloc[0], timedelta) + assert result.dtype == np.object_ + + @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) + def test_astype(self, dtype): + s = Series(np.random.randn(5), name="foo") + as_typed = s.astype(dtype) + + assert as_typed.dtype == dtype + assert as_typed.name == s.name + + def test_dtype(self, datetime_series): + + assert datetime_series.dtype == np.dtype("float64") + assert datetime_series.dtypes == np.dtype("float64") + + @pytest.mark.parametrize("value", [np.nan, np.inf]) + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) + def test_astype_cast_nan_inf_int(self, dtype, value): + # gh-14265: check NaN and inf raise error when converting to int + msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" + s = Series([value]) + + with pytest.raises(ValueError, match=msg): + s.astype(dtype) + + @pytest.mark.parametrize("dtype", [int, np.int8, np.int64]) + def test_astype_cast_object_int_fail(self, dtype): + arr = Series(["car", "house", "tree", "1"]) + msg = r"invalid literal for int\(\) with base 10: 'car'" + with pytest.raises(ValueError, match=msg): + arr.astype(dtype) + + def test_astype_cast_object_int(self): + arr = Series(["1", "2", "3", "4"], dtype=object) + result = arr.astype(int) + + tm.assert_series_equal(result, Series(np.arange(1, 5))) + + def test_astype_datetime(self): + s = Series(iNaT, dtype="M8[ns]", index=range(5)) + + s = s.astype("O") + assert s.dtype == np.object_ + + s = Series([datetime(2001, 1, 2, 0, 0)]) + + s = s.astype("O") + assert s.dtype == np.object_ + + s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) + + s[1] = np.nan + assert s.dtype == "M8[ns]" + + s = s.astype("O") + assert s.dtype == np.object_ + + def test_astype_datetime64tz(self): + s = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + # astype + result = s.astype(object) + expected = Series(s.astype(object), dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz) + tm.assert_series_equal(result, s) + + # astype - object, preserves on construction + result = Series(s.astype(object)) + expected = s.astype(object) + tm.assert_series_equal(result, expected) + + # astype - datetime64[ns, tz] + 
result = Series(s.values).astype("datetime64[ns, US/Eastern]") + tm.assert_series_equal(result, s) + + result = Series(s.values).astype(s.dtype) + tm.assert_series_equal(result, s) + + result = s.astype("datetime64[ns, CET]") + expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET")) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [str, np.str_]) + @pytest.mark.parametrize( + "series", + [ + Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), + Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]), + ], + ) + def test_astype_str_map(self, dtype, series): + # see gh-4405 + result = series.astype(dtype) + expected = series.map(str) + tm.assert_series_equal(result, expected) + + def test_astype_str_cast(self): + # see gh-9757 + ts = Series([Timestamp("2010-01-04 00:00:00")]) + s = ts.astype(str) + + expected = Series([str("2010-01-04")]) + tm.assert_series_equal(s, expected) + + ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) + s = ts.astype(str) + + expected = Series([str("2010-01-04 00:00:00-05:00")]) + tm.assert_series_equal(s, expected) + + td = Series([Timedelta(1, unit="d")]) + s = td.astype(str) + + expected = Series([str("1 days 00:00:00.000000000")]) + tm.assert_series_equal(s, expected) + + def test_astype_unicode(self): + # see gh-7758: A bit of magic is required to set + # default encoding to utf-8 + digits = string.digits + test_series = [ + Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), + Series(["データーサイエンス、お前はもう死んでいる"]), + ] + + former_encoding = None + + if sys.getdefaultencoding() == "utf-8": + test_series.append(Series(["野菜食べないとやばい".encode("utf-8")])) + + for s in test_series: + res = s.astype("unicode") + expec = s.map(str) + tm.assert_series_equal(res, expec) + + # Restore the former encoding + if former_encoding is not None and former_encoding != "utf-8": + reload(sys) + sys.setdefaultencoding(former_encoding) + + @pytest.mark.parametrize("dtype_class", [dict, Series]) + def test_astype_dict_like(self, dtype_class): + # see gh-7271 + s = Series(range(0, 10, 2), name="abc") + + dt1 = dtype_class({"abc": str}) + result = s.astype(dt1) + expected = Series(["0", "2", "4", "6", "8"], name="abc") + tm.assert_series_equal(result, expected) + + dt2 = dtype_class({"abc": "float64"}) + result = s.astype(dt2) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc") + tm.assert_series_equal(result, expected) + + dt3 = dtype_class({"abc": str, "def": str}) + msg = ( + "Only the Series name can be used for the key in Series dtype" + r" mappings\." 
+ ) + with pytest.raises(KeyError, match=msg): + s.astype(dt3) + + dt4 = dtype_class({0: str}) + with pytest.raises(KeyError, match=msg): + s.astype(dt4) + + # GH16717 + # if dtypes provided is empty, it should error + if dtype_class is Series: + dt5 = dtype_class({}, dtype=object) + else: + dt5 = dtype_class({}) + + with pytest.raises(KeyError, match=msg): + s.astype(dt5) + + def test_astype_categories_raises(self): + # deprecated 17636, removed in GH-27141 + s = Series(["a", "b", "a"]) + with pytest.raises(TypeError, match="got an unexpected"): + s.astype("category", categories=["a", "b"], ordered=True) + + def test_astype_from_categorical(self): + items = ["a", "b", "c", "a"] + s = Series(items) + exp = Series(Categorical(items)) + res = s.astype("category") + tm.assert_series_equal(res, exp) + + items = [1, 2, 3, 1] + s = Series(items) + exp = Series(Categorical(items)) + res = s.astype("category") + tm.assert_series_equal(res, exp) + + df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]}) + cats = Categorical([1, 2, 3, 4, 5, 6]) + exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) + df["cats"] = df["cats"].astype("category") + tm.assert_frame_equal(exp_df, df) + + df = DataFrame( + {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]} + ) + cats = Categorical(["a", "b", "b", "a", "a", "d"]) + exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) + df["cats"] = df["cats"].astype("category") + tm.assert_frame_equal(exp_df, df) + + # with keywords + lst = ["a", "b", "c", "a"] + s = Series(lst) + exp = Series(Categorical(lst, ordered=True)) + res = s.astype(CategoricalDtype(None, ordered=True)) + tm.assert_series_equal(res, exp) + + exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True)) + res = s.astype(CategoricalDtype(list("abcdef"), ordered=True)) + tm.assert_series_equal(res, exp) + + def test_astype_categorical_to_other(self): + + value = np.random.RandomState(0).randint(0, 10000, 100) + df = DataFrame({"value": value}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + + s = df["value_group"] + expected = s + tm.assert_series_equal(s.astype("category"), expected) + tm.assert_series_equal(s.astype(CategoricalDtype()), expected) + msg = r"could not convert string to float|invalid literal for float\(\)" + with pytest.raises(ValueError, match=msg): + s.astype("float64") + + cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_series_equal(cat.astype("str"), exp) + s2 = Series(Categorical(["1", "2", "3", "4"])) + exp2 = Series([1, 2, 3, 4]).astype(int) + tm.assert_series_equal(s2.astype("int"), exp2) + + # object don't sort correctly, so just compare that we have the same + # values + def cmp(a, b): + tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) + + expected = Series(np.array(s.values), name="value_group") + cmp(s.astype("object"), expected) + cmp(s.astype(np.object_), expected) + + # array conversion + tm.assert_almost_equal(np.array(s), np.array(s.values)) + + # valid conversion + for valid in [ + lambda x: x.astype("category"), + lambda x: x.astype(CategoricalDtype()), + lambda x: x.astype("object").astype("category"), + lambda x: x.astype("object").astype(CategoricalDtype()), + ]: + + 
result = valid(s) + # compare series values + # internal .categories can't be compared because it is sorted + tm.assert_series_equal(result, s, check_categorical=False) + + # invalid conversion (these are NOT a dtype) + msg = ( + r"invalid type for astype" + ) + for invalid in [ + lambda x: x.astype(Categorical), + lambda x: x.astype("object").astype(Categorical), + ]: + with pytest.raises(TypeError, match=msg): + invalid(s) + + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("series_ordered", [True, False]) + def test_astype_categorical_to_categorical( + self, name, dtype_ordered, series_ordered + ): + # GH 10696/18593 + s_data = list("abcaacbab") + s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) + s = Series(s_data, dtype=s_dtype, name=name) + + # unspecified categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = s.astype(dtype) + exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered) + expected = Series(s_data, name=name, dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + # different categories + dtype = CategoricalDtype(list("adc"), dtype_ordered) + result = s.astype(dtype) + expected = Series(s_data, name=name, dtype=dtype) + tm.assert_series_equal(result, expected) + + if dtype_ordered is False: + # not specifying ordered, so only test once + expected = s + result = s.astype("category") + tm.assert_series_equal(result, expected) + + def test_astype_bool_missing_to_categorical(self): + # GH-19182 + s = Series([True, False, np.nan]) + assert s.dtypes == np.object_ + + result = s.astype(CategoricalDtype(categories=[True, False])) + expected = Series(Categorical([True, False, np.nan], categories=[True, False])) + tm.assert_series_equal(result, expected) + + def test_astype_categoricaldtype(self): + s = Series(["a", "b", "a"]) + result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) + expected = Series(Categorical(["a", "b", "a"], ordered=True)) + tm.assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(["a", "b"], ordered=False)) + expected = Series(Categorical(["a", "b", "a"], ordered=False)) + tm.assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) + expected = Series( + Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) + ) + tm.assert_series_equal(result, expected) + tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) + + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) + def test_astype_generic_timestamp_no_frequency(self, dtype): + # see gh-15524, gh-15987 + data = [1] + s = Series(data) + + msg = ( + r"The '{dtype}' dtype has no unit\. " + r"Please pass in '{dtype}\[ns\]' instead." + ).format(dtype=dtype.__name__) + with pytest.raises(ValueError, match=msg): + s.astype(dtype) + + @pytest.mark.parametrize("dtype", np.typecodes["All"]) + def test_astype_empty_constructor_equality(self, dtype): + # see gh-15524 + + if dtype not in ( + "S", + "V", # poor support (if any) currently + "M", + "m", # Generic timestamps raise a ValueError. Already tested. 
+ ): + init_empty = Series([], dtype=dtype) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + as_type_empty = Series([]).astype(dtype) + tm.assert_series_equal(init_empty, as_type_empty) + + def test_arg_for_errors_in_astype(self): + # see gh-14878 + s = Series([1, 2, 3]) + + msg = ( + r"Expected value of kwarg 'errors' to be one of \['raise'," + r" 'ignore'\]\. Supplied value is 'False'" + ) + with pytest.raises(ValueError, match=msg): + s.astype(np.float64, errors=False) + + s.astype(np.int8, errors="raise") + + def test_intercept_astype_object(self): + series = Series(date_range("1/1/2000", periods=10)) + + # This test no longer makes sense, as + # Series is by default already M8[ns]. + expected = series.astype("object") + + df = DataFrame({"a": series, "b": np.random.randn(len(series))}) + exp_dtypes = Series( + [np.dtype("datetime64[ns]"), np.dtype("float64")], index=["a", "b"] + ) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + df = DataFrame({"a": series, "b": ["foo"] * len(series)}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + def test_series_to_categorical(self): + # see gh-16524: test conversion of Series to Categorical + series = Series(["a", "b", "c"]) + + result = Series(series, dtype="category") + expected = Series(["a", "b", "c"], dtype="category") + + tm.assert_series_equal(result, expected) + + def test_infer_objects_series(self): + # GH 11221 + actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects() + expected = Series([1, 2, 3]) + tm.assert_series_equal(actual, expected) + + actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects() + expected = Series([1.0, 2.0, 3.0, np.nan]) + tm.assert_series_equal(actual, expected) + + # only soft conversions, unconvertable pass thru unchanged + actual = Series(np.array([1, 2, 3, None, "a"], dtype="O")).infer_objects() + expected = Series([1, 2, 3, None, "a"]) + + assert actual.dtype == "object" + tm.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( + "data", + [ + pd.period_range("2000", periods=4), + pd.IntervalIndex.from_breaks([1, 2, 3, 4]), + ], + ) + def test_values_compatibility(self, data): + # https://github.com/pandas-dev/pandas/issues/23995 + result = pd.Series(data).values + expected = np.array(data.astype(object)) + tm.assert_numpy_array_equal(result, expected) + + def test_reindex_astype_order_consistency(self): + # GH 17444 + s = Series([1, 2, 3], index=[2, 0, 1]) + new_index = [0, 1, 2] + temp_dtype = "category" + new_dtype = str + s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype) + s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) + tm.assert_series_equal(s1, s2) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_duplicates.py b/venv/Lib/site-packages/pandas/tests/series/test_duplicates.py new file mode 100644 index 0000000..3513db6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_duplicates.py @@ -0,0 +1,92 @@ +import numpy as np +import pytest + +from pandas import Categorical, Series +import pandas._testing as tm +from pandas.core.construction import create_series_with_explicit_dtype + + +def test_nunique(): + # basics.rst doc example + series = Series(np.random.randn(500)) + series[20:500] = np.nan + series[10:20] = 5000 + result = series.nunique() + assert result == 11 + + # GH 18051 + s = Series(Categorical([])) + assert s.nunique() == 0 + s = 
Series(Categorical([np.nan])) + assert s.nunique() == 0 + + +def test_unique(): + # GH714 also, dtype=float + s = Series([1.2345] * 100) + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + s = Series([1.2345] * 100, dtype="f4") + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # NAs in object arrays #714 + s = Series(["foo"] * 100, dtype="O") + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # decision about None + s = Series([1, 2, 3, None, None, None], dtype=object) + result = s.unique() + expected = np.array([1, 2, 3, None], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # GH 18051 + s = Series(Categorical([])) + tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + s = Series(Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) + + +def test_unique_data_ownership(): + # it works! #1807 + Series(Series(["a", "c", "b"]).unique()).sort_values() + + +@pytest.mark.parametrize( + "data, expected", + [ + (np.random.randint(0, 10, size=1000), False), + (np.arange(1000), True), + ([], True), + ([np.nan], True), + (["foo", "bar", np.nan], True), + (["foo", "foo", np.nan], False), + (["foo", "bar", np.nan, np.nan], False), + ], +) +def test_is_unique(data, expected): + # GH11946 / GH25180 + s = create_series_with_explicit_dtype(data, dtype_if_empty=object) + assert s.is_unique is expected + + +def test_is_unique_class_ne(capsys): + # GH 20661 + class Foo: + def __init__(self, val): + self._value = val + + def __ne__(self, other): + raise Exception("NEQ not supported") + + with capsys.disabled(): + li = [Foo(i) for i in range(5)] + s = Series(li, index=list(range(5))) + s.is_unique + captured = capsys.readouterr() + assert len(captured.err) == 0 diff --git a/venv/Lib/site-packages/pandas/tests/series/test_internals.py b/venv/Lib/site-packages/pandas/tests/series/test_internals.py new file mode 100644 index 0000000..4c817ed --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_internals.py @@ -0,0 +1,244 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import NaT, Series, Timestamp +import pandas._testing as tm +from pandas.core.internals.blocks import IntBlock + + +class TestSeriesInternals: + + # GH 10265 + def test_convert(self): + # Tests: All to nans, coerce, true + # Test coercion returns correct type + s = Series(["a", "b", "c"]) + results = s._convert(datetime=True, coerce=True) + expected = Series([NaT] * 3) + tm.assert_series_equal(results, expected) + + results = s._convert(numeric=True, coerce=True) + expected = Series([np.nan] * 3) + tm.assert_series_equal(results, expected) + + expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) + results = s._convert(timedelta=True, coerce=True) + tm.assert_series_equal(results, expected) + + dt = datetime(2001, 1, 1, 0, 0) + td = dt - datetime(2000, 1, 1, 0, 0) + + # Test coercion with mixed types + s = Series(["a", "3.1415", dt, td]) + results = s._convert(datetime=True, coerce=True) + expected = Series([NaT, NaT, dt, NaT]) + tm.assert_series_equal(results, expected) + + results = s._convert(numeric=True, coerce=True) + expected = Series([np.nan, 3.1415, np.nan, np.nan]) + tm.assert_series_equal(results, expected) + + results = s._convert(timedelta=True, coerce=True) + expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) + tm.assert_series_equal(results, expected) + + # Test standard conversion returns original 
+ results = s._convert(datetime=True) + tm.assert_series_equal(results, s) + results = s._convert(numeric=True) + expected = Series([np.nan, 3.1415, np.nan, np.nan]) + tm.assert_series_equal(results, expected) + results = s._convert(timedelta=True) + tm.assert_series_equal(results, s) + + # test pass-through and non-conversion when other types selected + s = Series(["1.0", "2.0", "3.0"]) + results = s._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(results, expected) + results = s._convert(True, False, True) + tm.assert_series_equal(results, s) + + s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O") + results = s._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)]) + tm.assert_series_equal(results, expected) + results = s._convert(datetime=False, numeric=True, timedelta=True) + tm.assert_series_equal(results, s) + + td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) + s = Series([td, td], dtype="O") + results = s._convert(datetime=True, numeric=True, timedelta=True) + expected = Series([td, td]) + tm.assert_series_equal(results, expected) + results = s._convert(True, True, False) + tm.assert_series_equal(results, s) + + s = Series([1.0, 2, 3], index=["a", "b", "c"]) + result = s._convert(numeric=True) + tm.assert_series_equal(result, s) + + # force numeric conversion + r = s.copy().astype("O") + r["a"] = "1" + result = r._convert(numeric=True) + tm.assert_series_equal(result, s) + + r = s.copy().astype("O") + r["a"] = "1." + result = r._convert(numeric=True) + tm.assert_series_equal(result, s) + + r = s.copy().astype("O") + r["a"] = "garbled" + result = r._convert(numeric=True) + expected = s.copy() + expected["a"] = np.nan + tm.assert_series_equal(result, expected) + + # GH 4119, not converting a mixed type (e.g.floats and object) + s = Series([1, "na", 3, 4]) + result = s._convert(datetime=True, numeric=True) + expected = Series([1, np.nan, 3, 4]) + tm.assert_series_equal(result, expected) + + s = Series([1, "", 3, 4]) + result = s._convert(datetime=True, numeric=True) + tm.assert_series_equal(result, expected) + + # dates + s = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + ] + ) + s2 = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + "foo", + 1.0, + 1, + Timestamp("20010104"), + "20010105", + ], + dtype="O", + ) + + result = s._convert(datetime=True) + expected = Series( + [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], + dtype="M8[ns]", + ) + tm.assert_series_equal(result, expected) + + result = s._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + expected = Series( + [ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20010103"), + NaT, + NaT, + NaT, + Timestamp("20010104"), + Timestamp("20010105"), + ], + dtype="M8[ns]", + ) + result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) + tm.assert_series_equal(result, expected) + result = s2._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + s = Series(["foo", "bar", 1, 1.0], dtype="O") + result = s._convert(datetime=True, coerce=True) + expected = Series([NaT] * 2 + [Timestamp(1)] * 2) + tm.assert_series_equal(result, expected) + + # preserver if non-object + s = Series([1], dtype="float32") + result = 
s._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, s) + + # r = s.copy() + # r[0] = np.nan + # result = r._convert(convert_dates=True,convert_numeric=False) + # assert result.dtype == 'M8[ns]' + + # dateutil parses some single letters into today's value as a date + expected = Series([NaT]) + for x in "abcdefghijklmnopqrstuvwxyz": + s = Series([x]) + result = s._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + s = Series([x.upper()]) + result = s._convert(datetime=True, coerce=True) + tm.assert_series_equal(result, expected) + + def test_convert_no_arg_error(self): + s = Series(["1.0", "2"]) + msg = r"At least one of datetime, numeric or timedelta must be True\." + with pytest.raises(ValueError, match=msg): + s._convert() + + def test_convert_preserve_bool(self): + s = Series([1, True, 3, 5], dtype=object) + r = s._convert(datetime=True, numeric=True) + e = Series([1, 1, 3, 5], dtype="i8") + tm.assert_series_equal(r, e) + + def test_convert_preserve_all_bool(self): + s = Series([False, True, False, False], dtype=object) + r = s._convert(datetime=True, numeric=True) + e = Series([False, True, False, False], dtype=bool) + tm.assert_series_equal(r, e) + + def test_constructor_no_pandas_array(self): + ser = pd.Series([1, 2, 3]) + result = pd.Series(ser.array) + tm.assert_series_equal(ser, result) + assert isinstance(result._data.blocks[0], IntBlock) + + def test_astype_no_pandas_dtype(self): + # https://github.com/pandas-dev/pandas/pull/24866 + ser = pd.Series([1, 2], dtype="int64") + # Don't have PandasDtype in the public API, so we use `.array.dtype`, + # which is a PandasDtype. + result = ser.astype(ser.array.dtype) + tm.assert_series_equal(result, ser) + + def test_from_array(self): + result = pd.Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) + assert result._data.blocks[0].is_extension is False + + result = pd.Series(pd.array(["2015"], dtype="datetime64[ns]")) + assert result._data.blocks[0].is_extension is False + + def test_from_list_dtype(self): + result = pd.Series(["1H", "2H"], dtype="timedelta64[ns]") + assert result._data.blocks[0].is_extension is False + + result = pd.Series(["2015"], dtype="datetime64[ns]") + assert result._data.blocks[0].is_extension is False + + +def test_hasnans_unchached_for_series(): + # GH#19700 + idx = pd.Index([0, 1]) + assert idx.hasnans is False + assert "hasnans" in idx._cache + ser = idx.to_series() + assert ser.hasnans is False + assert not hasattr(ser, "_cache") + ser.iloc[-1] = np.nan + assert ser.hasnans is True + assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ diff --git a/venv/Lib/site-packages/pandas/tests/series/test_io.py b/venv/Lib/site-packages/pandas/tests/series/test_io.py new file mode 100644 index 0000000..510c11a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_io.py @@ -0,0 +1,240 @@ +from datetime import datetime +from io import StringIO + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + +from pandas.io.common import get_handle + + +class TestSeriesToCSV: + def read_csv(self, path, **kwargs): + params = dict(squeeze=True, index_col=0, header=None, parse_dates=True) + params.update(**kwargs) + + header = params.get("header") + out = pd.read_csv(path, **params) + + if header is None: + out.name = out.index.name = None + + return out + + def test_from_csv(self, datetime_series, string_series): + + with tm.ensure_clean() as path: + datetime_series.to_csv(path, 
header=False) + ts = self.read_csv(path) + tm.assert_series_equal(datetime_series, ts, check_names=False) + + assert ts.name is None + assert ts.index.name is None + + # see gh-10483 + datetime_series.to_csv(path, header=True) + ts_h = self.read_csv(path, header=0) + assert ts_h.name == "ts" + + string_series.to_csv(path, header=False) + series = self.read_csv(path) + tm.assert_series_equal(string_series, series, check_names=False) + + assert series.name is None + assert series.index.name is None + + string_series.to_csv(path, header=True) + series_h = self.read_csv(path, header=0) + assert series_h.name == "series" + + with open(path, "w") as outfile: + outfile.write("1998-01-01|1.0\n1999-01-01|2.0") + + series = self.read_csv(path, sep="|") + check_series = Series( + {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} + ) + tm.assert_series_equal(check_series, series) + + series = self.read_csv(path, sep="|", parse_dates=False) + check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0}) + tm.assert_series_equal(check_series, series) + + def test_to_csv(self, datetime_series): + import io + + with tm.ensure_clean() as path: + datetime_series.to_csv(path, header=False) + + with io.open(path, newline=None) as f: + lines = f.readlines() + assert lines[1] != "\n" + + datetime_series.to_csv(path, index=False, header=False) + arr = np.loadtxt(path) + tm.assert_almost_equal(arr, datetime_series.values) + + def test_to_csv_unicode_index(self): + buf = StringIO() + s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"]) + + s.to_csv(buf, encoding="UTF-8", header=False) + buf.seek(0) + + s2 = self.read_csv(buf, index_col=0, encoding="UTF-8") + tm.assert_series_equal(s, s2) + + def test_to_csv_float_format(self): + + with tm.ensure_clean() as filename: + ser = Series([0.123456, 0.234567, 0.567567]) + ser.to_csv(filename, float_format="%.2f", header=False) + + rs = self.read_csv(filename) + xp = Series([0.12, 0.23, 0.57]) + tm.assert_series_equal(rs, xp) + + def test_to_csv_list_entries(self): + s = Series(["jack and jill", "jesse and frank"]) + + split = s.str.split(r"\s+and\s+") + + buf = StringIO() + split.to_csv(buf, header=False) + + def test_to_csv_path_is_none(self): + # GH 8215 + # Series.to_csv() was returning None, inconsistent with + # DataFrame.to_csv() which returned string + s = Series([1, 2, 3]) + csv_str = s.to_csv(path_or_buf=None, header=False) + assert isinstance(csv_str, str) + + @pytest.mark.parametrize( + "s,encoding", + [ + ( + Series([0.123456, 0.234567, 0.567567], index=["A", "B", "C"], name="X"), + None, + ), + # GH 21241, 21118 + (Series(["abc", "def", "ghi"], name="X"), "ascii"), + (Series(["123", "你好", "世界"], name="中文"), "gb2312"), + (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), "cp737"), + ], + ) + def test_to_csv_compression(self, s, encoding, compression): + + with tm.ensure_clean() as filename: + + s.to_csv(filename, compression=compression, encoding=encoding, header=True) + # test the round trip - to_csv -> read_csv + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) + tm.assert_series_equal(s, result) + + # test the round trip using file handle - to_csv -> read_csv + f, _handles = get_handle( + filename, "w", compression=compression, encoding=encoding + ) + with f: + s.to_csv(f, encoding=encoding, header=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) + tm.assert_series_equal(s, result) + + # explicitly 
ensure file was compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + assert s.name in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_series_equal( + s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) + ) + + def test_to_csv_interval_index(self): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0, squeeze=True) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + expected.index = expected.index.astype(str) + + tm.assert_series_equal(result, expected) + + +class TestSeriesIO: + def test_to_frame(self, datetime_series): + datetime_series.name = None + rs = datetime_series.to_frame() + xp = pd.DataFrame(datetime_series.values, index=datetime_series.index) + tm.assert_frame_equal(rs, xp) + + datetime_series.name = "testname" + rs = datetime_series.to_frame() + xp = pd.DataFrame( + dict(testname=datetime_series.values), index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + rs = datetime_series.to_frame(name="testdifferent") + xp = pd.DataFrame( + dict(testdifferent=datetime_series.values), index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + def test_timeseries_periodindex(self): + # GH2891 + from pandas import period_range + + prng = period_range("1/1/2011", "1/1/2012", freq="M") + ts = Series(np.random.randn(len(prng)), prng) + new_ts = tm.round_trip_pickle(ts) + assert new_ts.index.freq == "M" + + def test_pickle_preserve_name(self): + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, 2)]: + unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) + assert unpickled.name == n + + def _pickle_roundtrip_name(self, obj): + + with tm.ensure_clean() as path: + obj.to_pickle(path) + unpickled = pd.read_pickle(path) + return unpickled + + def test_to_frame_expanddim(self): + # GH 9762 + + class SubclassedSeries(Series): + @property + def _constructor_expanddim(self): + return SubclassedFrame + + class SubclassedFrame(DataFrame): + pass + + s = SubclassedSeries([1, 2, 3], name="X") + result = s.to_frame() + assert isinstance(result, SubclassedFrame) + expected = SubclassedFrame({"X": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_missing.py b/venv/Lib/site-packages/pandas/tests/series/test_missing.py new file mode 100644 index 0000000..7b6d921 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_missing.py @@ -0,0 +1,1650 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslib import iNaT +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timedelta, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm + + +def _simple_ts(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + +class TestSeriesMissingData: + def test_timedelta_fillna(self): + # GH 3371 + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + td = s.diff() + + # reg fillna + result = td.fillna(Timedelta(seconds=0)) + expected = Series( + [ 
+ timedelta(0), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + # interpreted as seconds, deprecated + with pytest.raises(TypeError, match="Passing integers to fillna"): + td.fillna(1) + + result = td.fillna(Timedelta(seconds=1)) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(timedelta(days=1, seconds=1)) + expected = Series( + [ + timedelta(days=1, seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(np.timedelta64(int(1e9))) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(NaT) + expected = Series( + [ + NaT, + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ], + dtype="m8[ns]", + ) + tm.assert_series_equal(result, expected) + + # ffill + td[2] = np.nan + result = td.ffill() + expected = td.fillna(Timedelta(seconds=0)) + expected[0] = np.nan + tm.assert_series_equal(result, expected) + + # bfill + td[2] = np.nan + result = td.bfill() + expected = td.fillna(Timedelta(seconds=0)) + expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) + tm.assert_series_equal(result, expected) + + def test_datetime64_fillna(self): + + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + s[2] = np.nan + + # reg fillna + result = s.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + result = s.fillna(NaT) + expected = s + tm.assert_series_equal(result, expected) + + # ffill + result = s.ffill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # bfill + result = s.bfill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # GH 6587 + # make sure that we are treating as integer when filling + # this also tests inference of a datetime-like with NaT's + s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"]) + expected = Series( + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = s.fillna(method="backfill") + tm.assert_series_equal(result, expected) + + def test_datetime64_tz_fillna(self): + + for tz in ["US/Eastern", "Asia/Tokyo"]: + # DatetimeBlock + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) + null_loc = pd.Series([False, True, False, True]) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(pd.isna(s), null_loc) + + result = 
s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00"), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + # DatetimeBlockTZ + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz + ) + s = pd.Series(idx) + assert s.dtype == f"datetime64[ns, {tz}]" + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + 
tm.assert_series_equal(pd.isna(s), null_loc) + + # filling with a naive/other zone, coerce to object + result = s.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + # with timezone + # GH 15855 + df = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(df.fillna(method="pad"), exp) + + df = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(df.fillna(method="bfill"), exp) + + def test_datetime64_non_nano_fillna(self): + # GH#27419 + ser = Series([Timestamp("2010-01-01"), pd.NaT, Timestamp("2000-01-01")]) + val = np.datetime64("1975-04-05", "ms") + + result = ser.fillna(val) + expected = Series( + [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] + ) + tm.assert_series_equal(result, expected) + + def test_fillna_consistency(self): + # GH 16402 + # fillna with a tz aware to a tz-naive, should result in object + + s = Series([Timestamp("20130101"), pd.NaT]) + + result = s.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) + tm.assert_series_equal(result, expected) + + # where (we ignore the errors=) + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) + tm.assert_series_equal(result, expected) + + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) + tm.assert_series_equal(result, expected) + + # with a non-datetime + result = s.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) + tm.assert_series_equal(result, expected) + + # assignment + s2 = s.copy() + s2[1] = "foo" + tm.assert_series_equal(s2, expected) + + def test_where_sparse(self): + # GH#17198 make sure we dont get an AttributeError for sp_index + ser = pd.Series(pd.arrays.SparseArray([1, 2])) + result = ser.where(ser >= 2, 0) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) + tm.assert_series_equal(result, expected) + + def test_datetime64tz_fillna_round_issue(self): + # GH 14872 + + data = pd.Series( + [pd.NaT, pd.NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) + + filled = data.fillna(method="bfill") + + expected = pd.Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) + + tm.assert_series_equal(filled, expected) + + def test_fillna_downcast(self): + # GH 15277 + # infer int64 from float64 + s = pd.Series([1.0, np.nan]) + result = s.fillna(0, downcast="infer") + expected = pd.Series([1, 0]) + tm.assert_series_equal(result, expected) 
+ + # infer int64 from float64 when fillna value is a dict + s = pd.Series([1.0, np.nan]) + result = s.fillna({1: 0}, downcast="infer") + expected = pd.Series([1, 0]) + tm.assert_series_equal(result, expected) + + def test_fillna_int(self): + s = Series(np.random.randint(-100, 100, 50)) + s.fillna(method="ffill", inplace=True) + tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) + + def test_fillna_raise(self): + s = Series(np.random.randint(-100, 100, 50)) + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + s.fillna([1, 2]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + s.fillna((1, 2)) + + # related GH 9217, make sure limit is an int and greater than 0 + s = Series([1, 2, 3, None]) + msg = ( + r"Cannot specify both 'value' and 'method'\.|" + r"Limit must be greater than 0|" + "Limit must be an integer" + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: + with pytest.raises(ValueError, match=msg): + s.fillna(1, limit=limit, method=method) + + def test_categorical_nan_equality(self): + cat = Series(Categorical(["a", "b", "c", np.nan])) + exp = Series([True, True, True, False]) + res = cat == cat + tm.assert_series_equal(res, exp) + + def test_categorical_nan_handling(self): + + # NaNs are represented as -1 in labels + s = Series(Categorical(["a", "b", np.nan, "a"])) + tm.assert_index_equal(s.cat.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal( + s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) + ) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) + def test_fillna_categorical(self, fill_value, expected_output): + # GH 17033 + # Test fillna for a Categorical series + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) + tm.assert_series_equal(s.fillna(fill_value), exp) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] + ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH 26215 + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + tm.assert_series_equal(s.fillna(fill_value), exp) + + def test_fillna_categorical_raise(self): + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) + + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna("d") + + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna(Series("d")) + + with 
pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna({1: "d", 3: "a"}) + + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + s.fillna(["a", "b"]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + s.fillna(("a", "b")) + + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + s.fillna(DataFrame({1: ["a"], 3: ["b"]})) + + def test_fillna_nat(self): + series = Series([0, 1, 2, iNaT], dtype="M8[ns]") + + filled = series.fillna(method="pad") + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + + df = DataFrame({"A": series}) + filled = df.fillna(method="pad") + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + series = Series([iNaT, 0, 1, 2], dtype="M8[ns]") + + filled = series.fillna(method="bfill") + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + tm.assert_series_equal(filled, expected) + tm.assert_series_equal(filled2, expected) + + df = DataFrame({"A": series}) + filled = df.fillna(method="bfill") + filled2 = df.fillna(value=series[1]) + expected = DataFrame({"A": expected}) + tm.assert_frame_equal(filled, expected) + tm.assert_frame_equal(filled2, expected) + + def test_isna_for_inf(self): + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_na", True): + r = s.isna() + dr = s.dropna() + e = Series([False, True, True, False]) + de = Series(["a", 1.0], index=[0, 3]) + tm.assert_series_equal(r, e) + tm.assert_series_equal(dr, de) + + def test_isnull_for_inf_deprecated(self): + # gh-17115 + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_null", True): + r = s.isna() + dr = s.dropna() + + e = Series([False, True, True, False]) + de = Series(["a", 1.0], index=[0, 3]) + tm.assert_series_equal(r, e) + tm.assert_series_equal(dr, de) + + def test_fillna(self, datetime_series): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + + tm.assert_series_equal(ts, ts.fillna(method="ffill")) + + ts[2] = np.NaN + + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) + + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) + + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(value=5), exp) + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + ts.fillna() + + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_series.fillna(value=0, method="ffill") + + # GH 5703 + s1 = Series([np.nan]) + s2 = Series([1]) + result = s1.fillna(s2) + expected = Series([1.0]) + tm.assert_series_equal(result, expected) + result = s1.fillna({}) + tm.assert_series_equal(result, s1) + result = s1.fillna(Series((), dtype=object)) + tm.assert_series_equal(result, s1) + result = s2.fillna(s1) + tm.assert_series_equal(result, s2) + result = s1.fillna({0: 1}) + tm.assert_series_equal(result, expected) + 
result = s1.fillna({1: 1}) + tm.assert_series_equal(result, Series([np.nan])) + result = s1.fillna({0: 1, 1: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1})) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) + tm.assert_series_equal(result, s1) + + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) + result = s2.fillna(s1) + expected = Series([0, 0, 2.0], list("bac")) + tm.assert_series_equal(result, expected) + + # limit + s = Series(np.nan, index=[0, 1, 2]) + result = s.fillna(999, limit=1) + expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + result = s.fillna(999, limit=2) + expected = Series([999, 999, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + # GH 9043 + # make sure a string representation of int/float values can be filled + # correctly without raising errors or being converted + vals = ["0", "1.5", "-0.3"] + for val in vals: + s = Series([0, 1, np.nan, np.nan, 4], dtype="float64") + result = s.fillna(val) + expected = Series([0, 1, val, val, 4], dtype="object") + tm.assert_series_equal(result, expected) + + def test_fillna_bug(self): + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + filled = x.fillna(method="ffill") + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], x.index) + tm.assert_series_equal(filled, expected) + + filled = x.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) + tm.assert_series_equal(filled, expected) + + def test_fillna_inplace(self): + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + y = x.copy() + + y.fillna(value=0, inplace=True) + + expected = x.fillna(value=0) + tm.assert_series_equal(y, expected) + + def test_fillna_invalid_method(self, datetime_series): + try: + datetime_series.fillna(method="ffil") + except ValueError as inst: + assert "ffil" in str(inst) + + def test_ffill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) + + def test_ffill_mixed_dtypes_without_missing_data(self): + # GH14956 + series = pd.Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + result = series.ffill() + tm.assert_series_equal(series, result) + + def test_bfill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) + + def test_timedelta64_nan(self): + + td = Series([timedelta(days=i) for i in range(10)]) + + # nan ops on timedeltas + td1 = td.copy() + td1[0] = np.nan + assert isna(td1[0]) + assert td1[0].value == iNaT + td1[0] = td[0] + assert not isna(td1[0]) + + # GH#16674 iNaT is treated as an integer when given by the user + td1[1] = iNaT + assert not isna(td1[1]) + assert td1.dtype == np.object_ + assert td1[1] == iNaT + td1[1] = td[1] + assert not isna(td1[1]) + + td1[2] = NaT + assert isna(td1[2]) + assert td1[2].value == iNaT + td1[2] = td[2] + assert not isna(td1[2]) + + # FIXME: don't leave commented-out + # boolean setting + # this doesn't work, not sure numpy even supports it + # result = td[(td>np.timedelta64(timedelta(days=3))) & + # td= -0.5) & (datetime_series <= 0.5) + # tm.assert_series_equal(selector, expected) + + def test_dropna_empty(self): + s = Series([], dtype=object) + + assert len(s.dropna()) == 0 + s.dropna(inplace=True) + assert len(s) == 0 + + # invalid 
axis + msg = "No axis named 1 for object type " + with pytest.raises(ValueError, match=msg): + s.dropna(axis=1) + + def test_datetime64_tz_dropna(self): + # DatetimeBlock + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) + result = s.dropna() + expected = Series( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] + ) + tm.assert_series_equal(result, expected) + + # DatetimeBlockTZ + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz="Asia/Tokyo" + ) + s = pd.Series(idx) + assert s.dtype == "datetime64[ns, Asia/Tokyo]" + result = s.dropna() + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), + ], + index=[0, 2], + ) + assert result.dtype == "datetime64[ns, Asia/Tokyo]" + tm.assert_series_equal(result, expected) + + def test_dropna_no_nan(self): + for s in [Series([1, 2, 3], name="x"), Series([False, True, False], name="x")]: + + result = s.dropna() + tm.assert_series_equal(result, s) + assert result is not s + + s2 = s.copy() + s2.dropna(inplace=True) + tm.assert_series_equal(s2, s) + + def test_dropna_intervals(self): + s = Series( + [np.nan, 1, 2, 3], + IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]), + ) + + result = s.dropna() + expected = s.iloc[1:] + tm.assert_series_equal(result, expected) + + def test_valid(self, datetime_series): + ts = datetime_series.copy() + ts[::2] = np.NaN + + result = ts.dropna() + assert len(result) == ts.count() + tm.assert_series_equal(result, ts[1::2]) + tm.assert_series_equal(result, ts[pd.notna(ts)]) + + def test_isna(self): + ser = Series([0, 5.4, 3, np.nan, -0.001]) + expected = Series([False, False, False, True, False]) + tm.assert_series_equal(ser.isna(), expected) + + ser = Series(["hi", "", np.nan]) + expected = Series([False, False, True]) + tm.assert_series_equal(ser.isna(), expected) + + def test_notna(self): + ser = Series([0, 5.4, 3, np.nan, -0.001]) + expected = Series([True, True, True, False, True]) + tm.assert_series_equal(ser.notna(), expected) + + ser = Series(["hi", "", np.nan]) + expected = Series([True, True, False]) + tm.assert_series_equal(ser.notna(), expected) + + def test_pad_nan(self): + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) + + x.fillna(method="pad", inplace=True) + + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) + tm.assert_series_equal(x[1:], expected[1:]) + assert np.isnan(x[0]), np.isnan(expected[0]) + + def test_pad_require_monotonicity(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] + + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + rng2.get_indexer(rng, method="pad") + + def test_dropna_preserve_name(self, datetime_series): + datetime_series[:5] = np.nan + result = datetime_series.dropna() + assert result.name == datetime_series.name + name = datetime_series.name + ts = datetime_series.copy() + ts.dropna(inplace=True) + assert ts.name == name + + def test_fill_value_when_combine_const(self): + # GH12723 + s = Series([0, 1, np.nan, 3, 4, 5]) + + exp = s.fillna(0).add(2) + res = s.add(2, fill_value=0) + tm.assert_series_equal(res, exp) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = 
s[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method="bfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method="backfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.fixture( + params=[ + "linear", + "index", + "values", + "nearest", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) +def nontemporal_method(request): + """ Fixture that returns an (method name, required kwargs) pair. + + This fixture does not include method 'time' as a parameterization; that + method requires a Series with a DatetimeIndex, and is generally tested + separately from these non-temporal methods. + """ + method = request.param + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() + return method, kwargs + + +@pytest.fixture( + params=[ + "linear", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) +def interp_methods_ind(request): + """ Fixture that returns a (method name, required kwargs) pair to + be tested for various Index types. + + This fixture does not include methods - 'time', 'index', 'nearest', + 'values' as a parameterization + """ + method = request.param + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() + return method, kwargs + + +class TestSeriesInterpolateData: + def test_interpolate(self, datetime_series, string_series): + ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) + + ts_copy = ts.copy() + ts_copy[5:10] = np.NaN + + linear_interp = ts_copy.interpolate(method="linear") + tm.assert_series_equal(linear_interp, ts) + + ord_ts = Series( + [d.toordinal() for d in datetime_series.index], index=datetime_series.index + ).astype(float) + + ord_ts_copy = ord_ts.copy() + ord_ts_copy[5:10] = np.NaN + + time_interp = ord_ts_copy.interpolate(method="time") + tm.assert_series_equal(time_interp, ord_ts) + + def test_interpolate_time_raises_for_non_timeseries(self): + # When method='time' is used on a non-TimeSeries that contains a null + # value, a ValueError should be raised. 
+ non_ts = Series([0, 1, 2, np.NaN]) + msg = "time-weighted interpolation only works on Series.* with a DatetimeIndex" + with pytest.raises(ValueError, match=msg): + non_ts.interpolate(method="time") + + @td.skip_if_no_scipy + def test_interpolate_pchip(self): + + ser = Series(np.sort(np.random.uniform(size=100))) + + # interpolate at new_index + new_index = ser.index.union( + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + ).astype(float) + interp_s = ser.reindex(new_index).interpolate(method="pchip") + # does not blow up, GH5977 + interp_s[49:51] + + @td.skip_if_no_scipy + def test_interpolate_akima(self): + + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="akima") + tm.assert_series_equal(interp_s[1:3], expected) + + @td.skip_if_no_scipy + def test_interpolate_piecewise_polynomial(self): + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="piecewise_polynomial") + tm.assert_series_equal(interp_s[1:3], expected) + + @td.skip_if_no_scipy + def test_interpolate_from_derivatives(self): + ser = Series([10, 11, 12, 13]) + + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) + # interpolate at new_index + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="from_derivatives") + tm.assert_series_equal(interp_s[1:3], expected) + + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) + def test_interpolate_corners(self, kwargs): + s = Series([np.nan, np.nan]) + tm.assert_series_equal(s.interpolate(**kwargs), s) + + s = Series([], dtype=object).interpolate() + tm.assert_series_equal(s.interpolate(**kwargs), s) + + def test_interpolate_index_values(self): + s = Series(np.nan, index=np.sort(np.random.rand(30))) + s[::3] = np.random.randn(10) + + vals = s.index.values.astype(float) + + result = s.interpolate(method="index") + + expected = s.copy() + bad = isna(expected.values) + good = ~bad + expected = Series( + np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad] + ) + + tm.assert_series_equal(result[bad], expected) + + # 'values' is synonymous with 'index' for the method kwarg + other_result = s.interpolate(method="values") + + tm.assert_series_equal(other_result, result) + tm.assert_series_equal(other_result[bad], expected) + + def test_interpolate_non_ts(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + msg = ( + "time-weighted interpolation only works on Series or DataFrames " + "with a DatetimeIndex" + ) + with pytest.raises(ValueError, match=msg): + s.interpolate(method="time") + + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) + def test_nan_interpolate(self, 
kwargs): + s = Series([0, 1, np.nan, 3]) + result = s.interpolate(**kwargs) + expected = Series([0.0, 1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def test_nan_irregular_index(self): + s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) + result = s.interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) + tm.assert_series_equal(result, expected) + + def test_nan_str_index(self): + s = Series([0, 1, 2, np.nan], index=list("abcd")) + result = s.interpolate() + expected = Series([0.0, 1.0, 2.0, 2.0], index=list("abcd")) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_interp_quad(self): + sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) + result = sq.interpolate(method="quadratic") + expected = Series([1.0, 4.0, 9.0, 16.0], index=[1, 2, 3, 4]) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_interp_scipy_basic(self): + s = Series([1, 3, np.nan, 12, np.nan, 25]) + # slinear + expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) + result = s.interpolate(method="slinear") + tm.assert_series_equal(result, expected) + + result = s.interpolate(method="slinear", downcast="infer") + tm.assert_series_equal(result, expected) + # nearest + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method="nearest") + tm.assert_series_equal(result, expected.astype("float")) + + result = s.interpolate(method="nearest", downcast="infer") + tm.assert_series_equal(result, expected) + # zero + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method="zero") + tm.assert_series_equal(result, expected.astype("float")) + + result = s.interpolate(method="zero", downcast="infer") + tm.assert_series_equal(result, expected) + # quadratic + # GH #15662. + expected = Series([1, 3.0, 6.823529, 12.0, 18.058824, 25.0]) + result = s.interpolate(method="quadratic") + tm.assert_series_equal(result, expected) + + result = s.interpolate(method="quadratic", downcast="infer") + tm.assert_series_equal(result, expected) + # cubic + expected = Series([1.0, 3.0, 6.8, 12.0, 18.2, 25.0]) + result = s.interpolate(method="cubic") + tm.assert_series_equal(result, expected) + + def test_interp_limit(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + result = s.interpolate(method="linear", limit=2) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("limit", [-1, 0]) + def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, limit): + # GH 9217: make sure limit is greater than zero. + s = pd.Series([1, 2, np.nan, 4]) + method, kwargs = nontemporal_method + with pytest.raises(ValueError, match="Limit must be greater than 0"): + s.interpolate(limit=limit, method=method, **kwargs) + + def test_interpolate_invalid_float_limit(self, nontemporal_method): + # GH 9217: make sure limit is an integer. 
+ s = pd.Series([1, 2, np.nan, 4]) + method, kwargs = nontemporal_method + limit = 2.0 + with pytest.raises(ValueError, match="Limit must be an integer"): + s.interpolate(limit=limit, method=method, **kwargs) + + @pytest.mark.parametrize("invalid_method", [None, "nonexistent_method"]) + def test_interp_invalid_method(self, invalid_method): + s = Series([1, 3, np.nan, 12, np.nan, 25]) + + msg = f"method must be one of.* Got '{invalid_method}' instead" + with pytest.raises(ValueError, match=msg): + s.interpolate(method=invalid_method) + + # When an invalid method and invalid limit (such as -1) are + # provided, the error message reflects the invalid method. + with pytest.raises(ValueError, match=msg): + s.interpolate(method=invalid_method, limit=-1) + + def test_interp_limit_forward(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + # Provide 'forward' (the default) explicitly here. + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + + result = s.interpolate(method="linear", limit=2, limit_direction="forward") + tm.assert_series_equal(result, expected) + + result = s.interpolate(method="linear", limit=2, limit_direction="FORWARD") + tm.assert_series_equal(result, expected) + + def test_interp_unlimited(self): + # these test are for issue #16282 default Limit=None is unlimited + s = Series([np.nan, 1.0, 3.0, np.nan, np.nan, np.nan, 11.0, np.nan]) + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="both") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="forward") + tm.assert_series_equal(result, expected) + + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, np.nan]) + result = s.interpolate(method="linear", limit_direction="backward") + tm.assert_series_equal(result, expected) + + def test_interp_limit_bad_direction(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + msg = ( + r"Invalid limit_direction: expecting one of \['forward'," + r" 'backward', 'both'\], got 'abc'" + ) + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit=2, limit_direction="abc") + + # raises an error even if no limit is specified. + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit_direction="abc") + + # limit_area introduced GH #16284 + def test_interp_limit_area(self): + # These tests are for issue #9218 -- fill NaNs in both directions. 
+ s = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan]) + + expected = Series([np.nan, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit_area="inside") + tm.assert_series_equal(result, expected) + + expected = Series( + [np.nan, np.nan, 3.0, 4.0, np.nan, np.nan, 7.0, np.nan, np.nan] + ) + result = s.interpolate(method="linear", limit_area="inside", limit=1) + + expected = Series([np.nan, np.nan, 3.0, 4.0, np.nan, 6.0, 7.0, np.nan, np.nan]) + result = s.interpolate( + method="linear", limit_area="inside", limit_direction="both", limit=1 + ) + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0]) + result = s.interpolate(method="linear", limit_area="outside") + tm.assert_series_equal(result, expected) + + expected = Series( + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan] + ) + result = s.interpolate(method="linear", limit_area="outside", limit=1) + + expected = Series([np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan]) + result = s.interpolate( + method="linear", limit_area="outside", limit_direction="both", limit=1 + ) + tm.assert_series_equal(result, expected) + + expected = Series([3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan]) + result = s.interpolate( + method="linear", limit_area="outside", direction="backward" + ) + + # raises an error even if limit type is wrong. + msg = r"Invalid limit_area: expecting one of \['inside', 'outside'\], got abc" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="linear", limit_area="abc") + + def test_interp_limit_direction(self): + # These tests are for issue #9218 -- fill NaNs in both directions. + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + expected = Series([1.0, 3.0, np.nan, 7.0, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([1.0, 3.0, 5.0, np.nan, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + # Check that this works on a longer series of nans. + s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan]) + + expected = Series([1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") + tm.assert_series_equal(result, expected) + + expected = Series( + [1.0, 3.0, 4.0, np.nan, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0] + ) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + def test_interp_limit_to_ends(self): + # These test are for issue #10420 -- flow back to beginning. + s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) + + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, np.nan]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, 9.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") + tm.assert_series_equal(result, expected) + + def test_interp_limit_before_ends(self): + # These test are for issue #11115 -- limit ends properly. 
+ s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) + + expected = Series([np.nan, np.nan, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="forward") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 5.0, 5.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, 5.0, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_interp_all_good(self): + s = Series([1, 2, 3]) + result = s.interpolate(method="polynomial", order=1) + tm.assert_series_equal(result, s) + + # non-scipy + result = s.interpolate() + tm.assert_series_equal(result, s) + + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) + def test_interp_multiIndex(self, check_scipy): + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) + s = Series([1, 2, np.nan], index=idx) + + expected = s.copy() + expected.loc[2] = 2 + result = s.interpolate() + tm.assert_series_equal(result, expected) + + msg = "Only `method=linear` interpolation is supported on MultiIndexes" + if check_scipy: + with pytest.raises(ValueError, match=msg): + s.interpolate(method="polynomial", order=1) + + @td.skip_if_no_scipy + def test_interp_nonmono_raise(self): + s = Series([1, np.nan, 3], index=[0, 2, 1]) + msg = "krogh interpolation requires that the index be monotonic" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="krogh") + + @td.skip_if_no_scipy + @pytest.mark.parametrize("method", ["nearest", "pad"]) + def test_interp_datetime64(self, method, tz_naive_fixture): + df = Series( + [1, np.nan, 3], index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture) + ) + result = df.interpolate(method=method) + expected = Series( + [1.0, 1.0, 3.0], + index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture), + ) + tm.assert_series_equal(result, expected) + + def test_interp_pad_datetime64tz_values(self): + # GH#27628 missing.interpolate_2d should handle datetimetz values + dti = pd.date_range("2015-04-05", periods=3, tz="US/Central") + ser = pd.Series(dti) + ser[1] = pd.NaT + result = ser.interpolate(method="pad") + + expected = pd.Series(dti) + expected[1] = expected[0] + tm.assert_series_equal(result, expected) + + def test_interp_limit_no_nans(self): + # GH 7173 + s = pd.Series([1.0, 2.0, 3.0]) + result = s.interpolate(limit=1) + expected = s + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("method", ["polynomial", "spline"]) + def test_no_order(self, method): + # see GH-10633, GH-24014 + s = Series([0, 1, np.nan, 3]) + msg = "You must specify the order of the spline or polynomial" + with pytest.raises(ValueError, match=msg): + s.interpolate(method=method) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("order", [-1, -1.0, 0, 0.0, np.nan]) + def test_interpolate_spline_invalid_order(self, order): + s = Series([0, 1, np.nan, 3]) + msg = "order needs to be specified and greater than 0" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="spline", order=order) + + @td.skip_if_no_scipy + def test_spline(self): + s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) + result = s.interpolate(method="spline", order=1) + expected = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + tm.assert_series_equal(result, expected) + + 
@td.skip_if_no_scipy + def test_spline_extrapolate(self): + s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) + result3 = s.interpolate(method="spline", order=1, ext=3) + expected3 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0]) + tm.assert_series_equal(result3, expected3) + + result1 = s.interpolate(method="spline", order=1, ext=0) + expected1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + tm.assert_series_equal(result1, expected1) + + @td.skip_if_no_scipy + def test_spline_smooth(self): + s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7]) + assert ( + s.interpolate(method="spline", order=3, s=0)[5] + != s.interpolate(method="spline", order=3)[5] + ) + + @td.skip_if_no_scipy + def test_spline_interpolation(self): + s = Series(np.arange(10) ** 2) + s[np.random.randint(0, 9, 3)] = np.nan + result1 = s.interpolate(method="spline", order=1) + expected1 = s.interpolate(method="spline", order=1) + tm.assert_series_equal(result1, expected1) + + def test_interp_timedelta64(self): + # GH 6424 + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 3])) + result = df.interpolate(method="time") + expected = Series([1.0, 2.0, 3.0], index=pd.to_timedelta([1, 2, 3])) + tm.assert_series_equal(result, expected) + + # test for non uniform spacing + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 4])) + result = df.interpolate(method="time") + expected = Series([1.0, 1.666667, 3.0], index=pd.to_timedelta([1, 2, 4])) + tm.assert_series_equal(result, expected) + + def test_series_interpolate_method_values(self): + # #1646 + ts = _simple_ts("1/1/2000", "1/20/2000") + ts[::2] = np.nan + + result = ts.interpolate(method="values") + exp = ts.interpolate() + tm.assert_series_equal(result, exp) + + def test_series_interpolate_intraday(self): + # #1698 + index = pd.date_range("1/1/2012", periods=4, freq="12D") + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(days=1)).sort_values() + + exp = ts.reindex(new_index).interpolate(method="time") + + index = pd.date_range("1/1/2012", periods=4, freq="12H") + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() + result = ts.reindex(new_index).interpolate(method="time") + + tm.assert_numpy_array_equal(result.values, exp.values) + + @pytest.mark.parametrize( + "ind", + [ + ["a", "b", "c", "d"], + pd.period_range(start="2019-01-01", periods=4), + pd.interval_range(start=0, end=4), + ], + ) + def test_interp_non_timedelta_index(self, interp_methods_ind, ind): + # gh 21662 + df = pd.DataFrame([0, 1, np.nan, 3], index=ind) + + method, kwargs = interp_methods_ind + if method == "pchip": + pytest.importorskip("scipy") + + if method == "linear": + result = df[0].interpolate(**kwargs) + expected = pd.Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + tm.assert_series_equal(result, expected) + else: + expected_error = ( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + with pytest.raises(ValueError, match=expected_error): + df[0].interpolate(method=method, **kwargs) + + def test_interpolate_timedelta_index(self, interp_methods_ind): + """ + Tests for non numerical index types - object, period, timedelta + Note that all methods except time, index, nearest and values + are tested here. 
+ """ + # gh 21662 + ind = pd.timedelta_range(start=1, periods=4) + df = pd.DataFrame([0, 1, np.nan, 3], index=ind) + + method, kwargs = interp_methods_ind + if method == "pchip": + pytest.importorskip("scipy") + + if method in {"linear", "pchip"}: + result = df[0].interpolate(method=method, **kwargs) + expected = pd.Series([0.0, 1.0, 2.0, 3.0], name=0, index=ind) + tm.assert_series_equal(result, expected) + else: + pytest.skip( + "This interpolation method is not supported for Timedelta Index yet." + ) + + @pytest.mark.parametrize( + "ascending, expected_values", + [(True, [1, 2, 3, 9, 10]), (False, [10, 9, 3, 2, 1])], + ) + def test_interpolate_unsorted_index(self, ascending, expected_values): + # GH 21037 + ts = pd.Series(data=[10, 9, np.nan, 2, 1], index=[10, 9, 3, 2, 1]) + result = ts.sort_index(ascending=ascending).interpolate(method="index") + expected = pd.Series(data=expected_values, index=expected_values, dtype=float) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_operators.py b/venv/Lib/site-packages/pandas/tests/series/test_operators.py new file mode 100644 index 0000000..bdd9f92 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_operators.py @@ -0,0 +1,936 @@ +from datetime import datetime, timedelta +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +import pandas._testing as tm +from pandas.core import ops +import pandas.core.nanops as nanops + + +class TestSeriesLogicalOps: + @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) + def test_bool_operators_with_nas(self, bool_op): + # boolean &, |, ^ should work with object arrays and propagate NAs + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) + ser[::2] = np.nan + + mask = ser.isna() + filled = ser.fillna(ser[0]) + + result = bool_op(ser < ser[9], ser > ser[3]) + + expected = bool_op(filled < filled[9], filled > filled[3]) + expected[mask] = False + tm.assert_series_equal(result, expected) + + def test_logical_operators_bool_dtype_with_empty(self): + # GH#9016: support bitwise op for integer types + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_fff = Series([False, False, False], index=index) + s_empty = Series([], dtype=object) + + res = s_tft & s_empty + expected = s_fff + tm.assert_series_equal(res, expected) + + res = s_tft | s_empty + expected = s_tft + tm.assert_series_equal(res, expected) + + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(Series(left), Series(right)) + expected = Series(expected) + + tm.assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_int_dtype(self): + # GH#9016: support bitwise op for integer types + + # TODO: unused + # s_0101 = Series([0, 1, 0, 1]) + + s_0123 = Series(range(4), dtype="int64") + s_3333 = Series([3] * 4) + s_4444 = Series([4] * 4) + + res = s_0123 & s_3333 + expected = Series(range(4), 
dtype="int64") + tm.assert_series_equal(res, expected) + + res = s_0123 | s_4444 + expected = Series(range(4, 8), dtype="int64") + tm.assert_series_equal(res, expected) + + s_1111 = Series([1] * 4, dtype="int8") + res = s_0123 & s_1111 + expected = Series([0, 1, 0, 1], dtype="int64") + tm.assert_series_equal(res, expected) + + res = s_0123.astype(np.int16) | s_1111.astype(np.int32) + expected = Series([1, 1, 3, 3], dtype="int32") + tm.assert_series_equal(res, expected) + + def test_logical_operators_int_dtype_with_int_scalar(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + res = s_0123 & 0 + expected = Series([0] * 4) + tm.assert_series_equal(res, expected) + + res = s_0123 & 1 + expected = Series([0, 1, 0, 1]) + tm.assert_series_equal(res, expected) + + def test_logical_operators_int_dtype_with_float(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + with pytest.raises(TypeError): + s_0123 & np.NaN + with pytest.raises(TypeError): + s_0123 & 3.14 + with pytest.raises(TypeError): + s_0123 & [0.1, 4, 3.14, 2] + with pytest.raises(TypeError): + s_0123 & np.array([0.1, 4, 3.14, 2]) + with pytest.raises(TypeError): + s_0123 & Series([0.1, 4, -3.14, 2]) + + def test_logical_operators_int_dtype_with_str(self): + s_1111 = Series([1] * 4, dtype="int8") + + with pytest.raises(TypeError): + s_1111 & "a" + with pytest.raises(TypeError): + s_1111 & ["a", "b", "c", "d"] + + def test_logical_operators_int_dtype_with_bool(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + expected = Series([False] * 4) + + result = s_0123 & False + tm.assert_series_equal(result, expected) + + result = s_0123 & [False] + tm.assert_series_equal(result, expected) + + result = s_0123 & (False,) + tm.assert_series_equal(result, expected) + + result = s_0123 ^ False + expected = Series([False, True, True, True]) + tm.assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_object(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + result = s_0123 & Series([False, np.NaN, False, False]) + expected = Series([False] * 4) + tm.assert_series_equal(result, expected) + + s_abNd = Series(["a", "b", np.NaN, "d"]) + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd + + def test_logical_operators_bool_dtype_with_int(self): + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_fff = Series([False, False, False], index=index) + + res = s_tft & 0 + expected = s_fff + tm.assert_series_equal(res, expected) + + res = s_tft & 1 + expected = s_tft + tm.assert_series_equal(res, expected) + + def test_logical_ops_bool_dtype_with_ndarray(self): + # make sure we operate on ndarray the same as Series + left = pd.Series([True, True, True, False, True]) + right = [True, False, None, True, np.nan] + + expected = pd.Series([True, False, False, False, False]) + result = left & right + tm.assert_series_equal(result, expected) + result = left & np.array(right) + tm.assert_series_equal(result, expected) + result = left & pd.Index(right) + tm.assert_series_equal(result, expected) + result = left & pd.Series(right) + tm.assert_series_equal(result, expected) + + expected = pd.Series([True, True, True, True, True]) + result = left | right + tm.assert_series_equal(result, expected) + result = left | np.array(right) + tm.assert_series_equal(result, expected) + result 
= left | pd.Index(right) + tm.assert_series_equal(result, expected) + result = left | pd.Series(right) + tm.assert_series_equal(result, expected) + + expected = pd.Series([False, True, True, True, True]) + result = left ^ right + tm.assert_series_equal(result, expected) + result = left ^ np.array(right) + tm.assert_series_equal(result, expected) + result = left ^ pd.Index(right) + tm.assert_series_equal(result, expected) + result = left ^ pd.Series(right) + tm.assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): + # GH#9016: support bitwise op for integer types + + # with non-matching indexes, logical operators will cast to object + # before operating + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_tft = Series([True, False, True], index=index) + s_tff = Series([True, False, False], index=index) + + s_0123 = Series(range(4), dtype="int64") + + # s_0123 will be all false now because of reindexing like s_tft + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_tft & s_0123 + tm.assert_series_equal(result, expected) + + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_0123 & s_tft + tm.assert_series_equal(result, expected) + + s_a0b1c0 = Series([1], list("b")) + + res = s_tft & s_a0b1c0 + expected = s_tff.reindex(list("abc")) + tm.assert_series_equal(res, expected) + + res = s_tft | s_a0b1c0 + expected = s_tft.reindex(list("abc")) + tm.assert_series_equal(res, expected) + + def test_scalar_na_logical_ops_corners(self): + s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) + + with pytest.raises(TypeError): + s & datetime(2005, 1, 1) + + s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) + s[::2] = np.nan + + expected = Series(True, index=s.index) + expected[::2] = False + result = s & list(s) + tm.assert_series_equal(result, expected) + + d = DataFrame({"A": s}) + # TODO: Fix this exception - needs to be fixed! 
(see GH5035) + # (previously this was a TypeError because series returned + # NotImplemented + + # this is an alignment issue; these are equivalent + # https://github.com/pandas-dev/pandas/issues/5284 + + with pytest.raises(TypeError): + d.__and__(s, axis="columns") + with pytest.raises(TypeError): + d.__and__(s, axis=1) + + with pytest.raises(TypeError): + s & d + with pytest.raises(TypeError): + d & s + + expected = (s & s).to_frame("A") + result = d.__and__(s, axis="index") + tm.assert_frame_equal(result, expected) + + result = d.__and__(s, axis=0) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor]) + def test_logical_ops_with_index(self, op): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Series([op(ser[n], idx1[n]) for n in range(len(ser))]) + + result = op(ser, idx1) + tm.assert_series_equal(result, expected) + + expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], dtype=bool) + + result = op(ser, idx2) + tm.assert_series_equal(result, expected) + + def test_reversed_xor_with_index_returns_index(self): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Index.symmetric_difference(idx1, ser) + result = idx1 ^ ser + tm.assert_index_equal(result, expected) + + expected = Index.symmetric_difference(idx2, ser) + result = idx2 ^ ser + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "op", + [ + pytest.param( + ops.rand_, + marks=pytest.mark.xfail( + reason="GH#22092 Index __and__ returns Index intersection", + raises=AssertionError, + strict=True, + ), + ), + pytest.param( + ops.ror_, + marks=pytest.mark.xfail( + reason="GH#22092 Index __or__ returns Index union", + raises=AssertionError, + strict=True, + ), + ), + ], + ) + def test_reversed_logical_op_with_index_returns_series(self, op): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = pd.Series(op(idx1.values, ser.values)) + result = op(ser, idx1) + tm.assert_series_equal(result, expected) + + expected = pd.Series(op(idx2.values, ser.values)) + result = op(ser, idx2) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "op, expected", + [ + (ops.rand_, pd.Index([False, True])), + (ops.ror_, pd.Index([False, True])), + (ops.rxor, pd.Index([])), + ], + ) + def test_reverse_ops_with_index(self, op, expected): + # https://github.com/pandas-dev/pandas/pull/23628 + # multi-set Index ops are buggy, so let's avoid duplicates... 
+ ser = Series([True, False]) + idx = Index([False, True]) + result = op(ser, idx) + tm.assert_index_equal(result, expected) + + def test_logical_ops_label_based(self): + # GH#4947 + # logical ops should be label based + + a = Series([True, False, True], list("bca")) + b = Series([False, True, False], list("abc")) + + expected = Series([False, True, False], list("abc")) + result = a & b + tm.assert_series_equal(result, expected) + + expected = Series([True, True, False], list("abc")) + result = a | b + tm.assert_series_equal(result, expected) + + expected = Series([True, False, False], list("abc")) + result = a ^ b + tm.assert_series_equal(result, expected) + + # rhs is bigger + a = Series([True, False, True], list("bca")) + b = Series([False, True, False, True], list("abcd")) + + expected = Series([False, True, False, False], list("abcd")) + result = a & b + tm.assert_series_equal(result, expected) + + expected = Series([True, True, False, False], list("abcd")) + result = a | b + tm.assert_series_equal(result, expected) + + # filling + + # vs empty + empty = Series([], dtype=object) + + result = a & empty.copy() + expected = Series([False, False, False], list("bca")) + tm.assert_series_equal(result, expected) + + result = a | empty.copy() + expected = Series([True, False, True], list("bca")) + tm.assert_series_equal(result, expected) + + # vs non-matching + result = a & Series([1], ["z"]) + expected = Series([False, False, False, False], list("abcz")) + tm.assert_series_equal(result, expected) + + result = a | Series([1], ["z"]) + expected = Series([True, True, False, False], list("abcz")) + tm.assert_series_equal(result, expected) + + # identity + # we would like s[s|e] == s to hold for any e, whether empty or not + for e in [ + empty.copy(), + Series([1], ["z"]), + Series(np.nan, b.index), + Series(np.nan, a.index), + ]: + result = a[a | e] + tm.assert_series_equal(result, a[a]) + + for e in [Series(["z"])]: + result = a[a | e] + tm.assert_series_equal(result, a[a]) + + # vs scalars + index = list("bca") + t = Series([True, False, True]) + + for v in [True, 1, 2]: + result = Series([True, False, True], index=index) | v + expected = Series([True, True, True], index=index) + tm.assert_series_equal(result, expected) + + for v in [np.nan, "foo"]: + with pytest.raises(TypeError): + t | v + + for v in [False, 0]: + result = Series([True, False, True], index=index) | v + expected = Series([True, False, True], index=index) + tm.assert_series_equal(result, expected) + + for v in [True, 1]: + result = Series([True, False, True], index=index) & v + expected = Series([True, False, True], index=index) + tm.assert_series_equal(result, expected) + + for v in [False, 0]: + result = Series([True, False, True], index=index) & v + expected = Series([False, False, False], index=index) + tm.assert_series_equal(result, expected) + for v in [np.nan]: + with pytest.raises(TypeError): + t & v + + def test_logical_ops_df_compat(self): + # GH#1134 + s1 = pd.Series([True, False, True], index=list("ABC"), name="x") + s2 = pd.Series([True, True, False], index=list("ABD"), name="x") + + exp = pd.Series([True, False, False, False], index=list("ABCD"), name="x") + tm.assert_series_equal(s1 & s2, exp) + tm.assert_series_equal(s2 & s1, exp) + + # True | np.nan => True + exp_or1 = pd.Series([True, True, True, False], index=list("ABCD"), name="x") + tm.assert_series_equal(s1 | s2, exp_or1) + # np.nan | True => np.nan, filled with False + exp_or = pd.Series([True, True, False, False], index=list("ABCD"), name="x") + 
tm.assert_series_equal(s2 | s1, exp_or) + + # DataFrame doesn't fill nan with False + tm.assert_frame_equal(s1.to_frame() & s2.to_frame(), exp.to_frame()) + tm.assert_frame_equal(s2.to_frame() & s1.to_frame(), exp.to_frame()) + + exp = pd.DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD")) + tm.assert_frame_equal(s1.to_frame() | s2.to_frame(), exp_or1.to_frame()) + tm.assert_frame_equal(s2.to_frame() | s1.to_frame(), exp_or.to_frame()) + + # different length + s3 = pd.Series([True, False, True], index=list("ABC"), name="x") + s4 = pd.Series([True, True, True, True], index=list("ABCD"), name="x") + + exp = pd.Series([True, False, True, False], index=list("ABCD"), name="x") + tm.assert_series_equal(s3 & s4, exp) + tm.assert_series_equal(s4 & s3, exp) + + # np.nan | True => np.nan, filled with False + exp_or1 = pd.Series([True, True, True, False], index=list("ABCD"), name="x") + tm.assert_series_equal(s3 | s4, exp_or1) + # True | np.nan => True + exp_or = pd.Series([True, True, True, True], index=list("ABCD"), name="x") + tm.assert_series_equal(s4 | s3, exp_or) + + tm.assert_frame_equal(s3.to_frame() & s4.to_frame(), exp.to_frame()) + tm.assert_frame_equal(s4.to_frame() & s3.to_frame(), exp.to_frame()) + + tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp_or1.to_frame()) + tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) + + +class TestSeriesComparisons: + def test_comparisons(self): + left = np.random.randn(10) + right = np.random.randn(10) + left[:3] = np.nan + + result = nanops.nangt(left, right) + with np.errstate(invalid="ignore"): + expected = (left > right).astype("O") + expected[:3] = np.nan + + tm.assert_almost_equal(result, expected) + + s = Series(["a", "b", "c"]) + s2 = Series([False, True, False]) + + # it works! 
+ exp = Series([False, False, False]) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) + + def test_categorical_comparisons(self): + # GH 8938 + # allow equality comparisons + a = Series(list("abc"), dtype="category") + b = Series(list("abc"), dtype="object") + c = Series(["a", "b", "cc"], dtype="object") + d = Series(list("acb"), dtype="object") + e = Categorical(list("abc")) + f = Categorical(list("acb")) + + # vs scalar + assert not (a == "a").all() + assert ((a != "a") == ~(a == "a")).all() + + assert not ("a" == a).all() + assert (a == "a")[0] + assert ("a" == a)[0] + assert not ("a" != a)[0] + + # vs list-like + assert (a == a).all() + assert not (a != a).all() + + assert (a == list(a)).all() + assert (a == b).all() + assert (b == a).all() + assert ((~(a == b)) == (a != b)).all() + assert ((~(b == a)) == (b != a)).all() + + assert not (a == c).all() + assert not (c == a).all() + assert not (a == d).all() + assert not (d == a).all() + + # vs a cat-like + assert (a == e).all() + assert (e == a).all() + assert not (a == f).all() + assert not (f == a).all() + + assert (~(a == e) == (a != e)).all() + assert (~(e == a) == (e != a)).all() + assert (~(a == f) == (a != f)).all() + assert (~(f == a) == (f != a)).all() + + # non-equality is not comparable + with pytest.raises(TypeError): + a < b + with pytest.raises(TypeError): + b < a + with pytest.raises(TypeError): + a > b + with pytest.raises(TypeError): + b > a + + def test_comparison_tuples(self): + # GH11339 + # comparisons vs tuple + s = Series([(1, 1), (1, 2)]) + + result = s == (1, 2) + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 2) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + result = s == (0, 0) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + result = s != (0, 0) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + s = Series([(1, 1), (1, 1)]) + + result = s == (1, 1) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 1) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + s = Series([frozenset([1]), frozenset([1, 2])]) + + result = s == frozenset([1]) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + def test_comparison_operators_with_nas(self): + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) + ser[::2] = np.nan + + # test that comparisons work + ops = ["lt", "le", "gt", "ge", "eq", "ne"] + for op in ops: + val = ser[5] + + f = getattr(operator, op) + result = f(ser, val) + + expected = f(ser.dropna(), val).reindex(ser.index) + + if op == "ne": + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) + + tm.assert_series_equal(result, expected) + + # FIXME: dont leave commented-out + # fffffffuuuuuuuuuuuu + # result = f(val, s) + # expected = f(val, s.dropna()).reindex(s.index) + # tm.assert_series_equal(result, expected) + + def test_unequal_categorical_comparison_raises_type_error(self): + # unequal comparison should raise for unordered cats + cat = Series(Categorical(list("abc"))) + with pytest.raises(TypeError): + cat > "b" + + cat = Series(Categorical(list("abc"), ordered=False)) + with pytest.raises(TypeError): + cat > "b" + + # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 + # and following comparisons with scalars not in categories should raise + 
# for unequal comps, but not for equal/not equal + cat = Series(Categorical(list("abc"), ordered=True)) + + with pytest.raises(TypeError): + cat < "d" + with pytest.raises(TypeError): + cat > "d" + with pytest.raises(TypeError): + "d" < cat + with pytest.raises(TypeError): + "d" > cat + + tm.assert_series_equal(cat == "d", Series([False, False, False])) + tm.assert_series_equal(cat != "d", Series([True, True, True])) + + def test_ne(self): + ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) + expected = [True, True, False, True, True] + assert tm.equalContents(ts.index != 5, expected) + assert tm.equalContents(~(ts.index == 5), expected) + + def test_comp_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") + + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") + + for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: + + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + left == right + + with pytest.raises(ValueError, match=msg): + left != right + + with pytest.raises(ValueError, match=msg): + left < right + + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + left.to_frame() == right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() != right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() < right.to_frame() + + def test_compare_series_interval_keyword(self): + # GH 25338 + s = Series(["IntervalA", "IntervalB", "IntervalC"]) + result = s == "IntervalA" + expected = Series([True, False, False]) + tm.assert_series_equal(result, expected) + + +class TestSeriesFlexComparisonOps: + def test_comparison_flex_alignment(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.eq(right), exp) + + exp = pd.Series([True, True, False, True], index=list("abcd")) + tm.assert_series_equal(left.ne(right), exp) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.le(right), exp) + + exp = pd.Series([False, False, False, False], index=list("abcd")) + tm.assert_series_equal(left.lt(right), exp) + + exp = pd.Series([False, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right), exp) + + exp = pd.Series([False, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.gt(right), exp) + + def test_comparison_flex_alignment_fill(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.eq(right, fill_value=2), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.ne(right, fill_value=2), exp) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.le(right, fill_value=0), exp) + + exp = pd.Series([False, False, False, True], index=list("abcd")) + tm.assert_series_equal(left.lt(right, fill_value=0), exp) + + exp = pd.Series([True, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right, fill_value=0), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + 
tm.assert_series_equal(left.gt(right, fill_value=0), exp) + + +class TestSeriesOperators: + def test_operators_empty_int_corner(self): + s1 = Series([], [], dtype=np.int32) + s2 = Series({"x": 0.0}) + tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) + + def test_ops_datetimelike_align(self): + # GH 7500 + # datetimelike ops need to align + dt = Series(date_range("2012-1-1", periods=3, freq="D")) + dt.iloc[2] = np.nan + dt2 = dt[::-1] + + expected = Series([timedelta(0), timedelta(0), pd.NaT]) + # name is reset + result = dt2 - dt + tm.assert_series_equal(result, expected) + + expected = Series(expected, name=0) + result = (dt2.to_frame() - dt.to_frame())[0] + tm.assert_series_equal(result, expected) + + def test_operators_corner(self, datetime_series): + empty = Series([], index=Index([]), dtype=np.float64) + + result = datetime_series + empty + assert np.isnan(result).all() + + result = empty + empty.copy() + assert len(result) == 0 + + # TODO: this returned NotImplemented earlier, what to do? + # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) + # sub_deltas = deltas[::2] + # deltas5 = deltas * 5 + # deltas = deltas + sub_deltas + + # float + int + int_ts = datetime_series.astype(int)[:-5] + added = datetime_series + int_ts + expected = Series( + datetime_series.values[:-5] + int_ts.values, + index=datetime_series.index[:-5], + name="ts", + ) + tm.assert_series_equal(added[:-5], expected) + + pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] + for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: + fv = 0 + lop = getattr(Series, op) + lequiv = getattr(operator, op) + rop = getattr(Series, "r" + op) + # bind op at definition time... + requiv = lambda x, y, op=op: getattr(operator, op)(y, x) + pairings.append((lop, lequiv, fv)) + pairings.append((rop, requiv, fv)) + + @pytest.mark.parametrize("op, equiv_op, fv", pairings) + def test_operators_combine(self, op, equiv_op, fv): + def _check_fill(meth, op, a, b, fill_value=0): + exp_index = a.index.union(b.index) + a = a.reindex(exp_index) + b = b.reindex(exp_index) + + amask = isna(a) + bmask = isna(b) + + exp_values = [] + for i in range(len(exp_index)): + with np.errstate(all="ignore"): + if amask[i]: + if bmask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) + + result = meth(a, b, fill_value=fill_value) + expected = Series(exp_values, exp_index) + tm.assert_series_equal(result, expected) + + a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) + + result = op(a, b) + exp = equiv_op(a, b) + tm.assert_series_equal(result, exp) + _check_fill(op, equiv_op, a, b, fill_value=fv) + # should accept axis=0 or axis='rows' + op(a, b, axis=0) + + def test_operators_na_handling(self): + from decimal import Decimal + from datetime import date + + s = Series( + [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] + ) + + result = s + s.shift(1) + result2 = s.shift(1) + s + assert isna(result[0]) + assert isna(result2[0]) + + def test_op_duplicate_index(self): + # GH14227 + s1 = Series([1, 2], index=[1, 1]) + s2 = Series([10, 10], index=[1, 2]) + result = s1 + s2 + expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_divmod(self): + # GH25557 + 
a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + + result = a.divmod(b) + expected = divmod(a, b) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + result = a.rdivmod(b) + expected = divmod(b, a) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + @pytest.mark.parametrize("index", [None, range(9)]) + def test_series_integer_mod(self, index): + # see gh-24396 + s1 = Series(range(1, 10)) + s2 = Series("foo", index=index) + + msg = "not all arguments converted during string formatting" + + with pytest.raises(TypeError, match=msg): + s2 % s1 + + +class TestSeriesUnaryOps: + # __neg__, __pos__, __inv__ + + def test_neg(self): + ser = tm.makeStringSeries() + ser.name = "series" + tm.assert_series_equal(-ser, -1 * ser) + + def test_invert(self): + ser = tm.makeStringSeries() + ser.name = "series" + tm.assert_series_equal(-(ser < 0), ~(ser < 0)) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_period.py b/venv/Lib/site-packages/pandas/tests/series/test_period.py new file mode 100644 index 0000000..03fee38 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_period.py @@ -0,0 +1,170 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Period, Series, period_range +import pandas._testing as tm +from pandas.core.arrays import PeriodArray + + +class TestSeriesPeriod: + def setup_method(self, method): + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) + + def test_auto_conversion(self): + series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) + assert series.dtype == "Period[D]" + + series = pd.Series( + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")] + ) + assert series.dtype == "Period[D]" + + def test_getitem(self): + assert self.series[1] == pd.Period("2000-01-02", freq="D") + + result = self.series[[2, 4]] + exp = pd.Series( + [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], + index=[2, 4], + dtype="Period[D]", + ) + tm.assert_series_equal(result, exp) + assert result.dtype == "Period[D]" + + def test_isna(self): + # GH 13737 + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) + tm.assert_series_equal(s.isna(), Series([False, True])) + tm.assert_series_equal(s.notna(), Series([True, False])) + + def test_fillna(self): + # GH 13737 + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) + + res = s.fillna(pd.Period("2012-01", freq="M")) + exp = Series([pd.Period("2011-01", freq="M"), pd.Period("2012-01", freq="M")]) + tm.assert_series_equal(res, exp) + assert res.dtype == "Period[M]" + + def test_dropna(self): + # GH 13737 + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) + tm.assert_series_equal(s.dropna(), Series([pd.Period("2011-01", freq="M")])) + + def test_between(self): + left, right = self.series[[2, 7]] + result = self.series.between(left, right) + expected = (self.series >= left) & (self.series <= right) + tm.assert_series_equal(result, expected) + + # --------------------------------------------------------------------- + # NaT support + + @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]") + + val = series[3] + assert pd.isna(val) + + series[2] = val + assert pd.isna(series[2]) + + def 
test_NaT_cast(self): + result = Series([np.nan]).astype("period[D]") + expected = Series([pd.NaT], dtype="period[D]") + tm.assert_series_equal(result, expected) + + def test_set_none(self): + self.series[3] = None + assert self.series[3] is pd.NaT + + self.series[3:5] = None + assert self.series[4] is pd.NaT + + def test_set_nan(self): + # Do we want to allow this? + self.series[5] = np.nan + assert self.series[5] is pd.NaT + + self.series[5:7] = np.nan + assert self.series[6] is pd.NaT + + def test_intercept_astype_object(self): + expected = self.series.astype("object") + + df = DataFrame({"a": self.series, "b": np.random.randn(len(self.series))}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + df = DataFrame({"a": self.series, "b": ["foo"] * len(self.series)}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + def test_align_series(self, join_type): + rng = period_range("1/1/2000", "1/1/2010", freq="A") + ts = Series(np.random.randn(len(rng)), index=rng) + + ts.align(ts[::2], join=join_type) + + def test_truncate(self): + # GH 17717 + idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) + series1 = pd.Series([1, 2, 3], index=idx1) + result1 = series1.truncate(after="2017-09-02") + + expected_idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), pd.Period("2017-09-02")] + ) + tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1)) + + idx2 = pd.PeriodIndex( + [pd.Period("2017-09-03"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) + series2 = pd.Series([1, 2, 3], index=idx2) + result2 = series2.sort_index().truncate(after="2017-09-02") + + expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) + tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2)) + + @pytest.mark.parametrize( + "input_vals", + [ + [Period("2016-01", freq="M"), Period("2016-02", freq="M")], + [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], + [ + Period("2016-01-01 00:00:00", freq="H"), + Period("2016-01-01 01:00:00", freq="H"), + ], + [ + Period("2016-01-01 00:00:00", freq="M"), + Period("2016-01-01 00:01:00", freq="M"), + ], + [ + Period("2016-01-01 00:00:00", freq="S"), + Period("2016-01-01 00:00:01", freq="S"), + ], + ], + ) + def test_end_time_timevalues(self, input_vals): + # GH 17157 + # Check that the time part of the Period is adjusted by end_time + # when using the dt accessor on a Series + input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) + + s = Series(input_vals) + result = s.dt.end_time + expected = s.apply(lambda x: x.end_time) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) + def test_to_period(self, input_vals): + # GH 21205 + expected = Series([input_vals], dtype="Period[D]") + result = Series([input_vals], dtype="datetime64[ns]").dt.to_period("D") + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_repr.py b/venv/Lib/site-packages/pandas/tests/series/test_repr.py new file mode 100644 index 0000000..64a8c45 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_repr.py @@ -0,0 +1,489 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class 
TestSeriesRepr: + def test_multilevel_name_print(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + expected = [ + "first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: sth, dtype: int64", + ] + expected = "\n".join(expected) + assert repr(s) == expected + + def test_name_printing(self): + # Test small Series. + s = Series([0, 1, 2]) + + s.name = "test" + assert "Name: test" in repr(s) + + s.name = None + assert "Name:" not in repr(s) + + # Test big Series (diff code path). + s = Series(range(1000)) + + s.name = "test" + assert "Name: test" in repr(s) + + s.name = None + assert "Name:" not in repr(s) + + s = Series(index=date_range("20010101", "20020101"), name="test", dtype=object) + assert "Name: test" in repr(s) + + def test_repr(self, datetime_series, string_series, object_series): + str(datetime_series) + str(string_series) + str(string_series.astype(int)) + str(object_series) + + str(Series(tm.randn(1000), index=np.arange(1000))) + str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1))) + + # empty + str(Series(dtype=object)) + + # with NaNs + string_series[5:7] = np.NaN + str(string_series) + + # with Nones + ots = datetime_series.astype("O") + ots[::2] = None + repr(ots) + + # various names + for name in [ + "", + 1, + 1.2, + "foo", + "\u03B1\u03B2\u03B3", + "loooooooooooooooooooooooooooooooooooooooooooooooooooong", + ("foo", "bar", "baz"), + (1, 2), + ("foo", 1, 2.3), + ("\u03B1", "\u03B2", "\u03B3"), + ("\u03B1", "bar"), + ]: + string_series.name = name + repr(string_series) + + biggie = Series( + tm.randn(1000), index=np.arange(1000), name=("foo", "bar", "baz") + ) + repr(biggie) + + # 0 as name + ser = Series(np.random.randn(100), name=0) + rep_str = repr(ser) + assert "Name: 0" in rep_str + + # tidy repr + ser = Series(np.random.randn(1001), name=0) + rep_str = repr(ser) + assert "Name: 0" in rep_str + + ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + assert "\t" not in repr(ser) + assert "\r" not in repr(ser) + assert "a\n" not in repr(ser) + + # with empty series (#4651) + s = Series([], dtype=np.int64, name="foo") + assert repr(s) == "Series([], Name: foo, dtype: int64)" + + s = Series([], dtype=np.int64, name=None) + assert repr(s) == "Series([], dtype: int64)" + + def test_tidy_repr(self): + a = Series(["\u05d0"] * 1000) + a.name = "title1" + repr(a) # should not raise exception + + def test_repr_bool_fails(self, capsys): + s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)]) + + # It works (with no Cython exception barf)! + repr(s) + + captured = capsys.readouterr() + assert captured.err == "" + + def test_repr_name_iterable_indexable(self): + s = Series([1, 2, 3], name=np.int64(3)) + + # it works! + repr(s) + + s.name = ("\u05d0",) * 2 + repr(s) + + def test_repr_should_return_str(self): + # https://docs.python.org/3/reference/datamodel.html#object.__repr__ + # ...The return value must be a string object. 
+ + # (str on py2.x, str (unicode) on py3) + + data = [8, 5, 3, 5] + index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] + df = Series(data, index=index1) + assert type(df.__repr__() == str) # both py2 / 3 + + def test_repr_max_rows(self): + # GH 6863 + with pd.option_context("max_rows", None): + str(Series(range(1001))) # should not raise exception + + def test_unicode_string_with_unicode(self): + df = Series(["\u05d0"], name="\u05d1") + str(df) + + def test_str_to_bytes_raises(self): + # GH 26447 + df = Series(["abc"], name="abc") + msg = "^'str' object cannot be interpreted as an integer$" + with pytest.raises(TypeError, match=msg): + bytes(df) + + def test_timeseries_repr_object_dtype(self): + index = Index( + [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object + ) + ts = Series(np.random.randn(len(index)), index) + repr(ts) + + ts = tm.makeTimeSeries(1000) + assert repr(ts).splitlines()[-1].startswith("Freq:") + + ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] + repr(ts2).splitlines()[-1] + + def test_latex_repr(self): + result = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & $\alpha$ \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + with option_context("display.latex.escape", False, "display.latex.repr", True): + s = Series([r"$\alpha$", "b", "c"]) + assert result == s._repr_latex_() + + assert s._repr_latex_() is None + + def test_index_repr_in_frame_with_nan(self): + # see gh-25061 + i = Index([1, np.nan]) + s = Series([1, 2], index=i) + exp = """1.0 1\nNaN 2\ndtype: int64""" + + assert repr(s) == exp + + +class TestCategoricalRepr: + def test_categorical_repr_unicode(self): + # see gh-21002 + + class County: + name = "San Sebastián" + state = "PR" + + def __repr__(self) -> str: + return self.name + ", " + self.state + + cat = pd.Categorical([County() for _ in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + repr(ser) + str(ser) + + def test_categorical_repr(self): + a = Series(Categorical([1, 2, 3, 4])) + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) + + assert exp == a.__str__() + + a = Series(Categorical(["a", "b"] * 25)) + exp = ( + "0 a\n1 b\n" + + " ..\n" + + "48 a\n49 b\n" + + "Length: 50, dtype: category\nCategories (2, object): [a, b]" + ) + with option_context("display.max_rows", 5): + assert exp == repr(a) + + levs = list("abcdefghijklmnopqrstuvwxyz") + a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): [a < b < c < d ... w < x < y < z]" + ) + assert exp == a.__str__() + + def test_categorical_series_repr(self): + s = Series(Categorical([1, 2, 3])) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1, 2, 3]""" + + assert repr(s) == exp + + s = Series(Categorical(np.arange(10))) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" + + assert repr(s) == exp + + def test_categorical_series_repr_ordered(self): + s = Series(Categorical([1, 2, 3], ordered=True)) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(s) == exp + + s = Series(Categorical(np.arange(10), ordered=True)) + exp = """0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, int64): [0 < 1 < 2 < 3 ... 
6 < 7 < 8 < 9]""" + + assert repr(s) == exp + + def test_categorical_series_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + s = Series(Categorical(idx)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa + + assert repr(s) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + s = Series(Categorical(idx)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" # noqa + + assert repr(s) == exp + + def test_categorical_series_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="H", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa + + assert repr(s) == exp + + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa + + assert repr(s) == exp + + def test_categorical_series_repr_period(self): + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + s = Series(Categorical(idx)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa + + assert repr(s) == exp + + idx = period_range("2011-01", freq="M", periods=5) + s = Series(Categorical(idx)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + + assert repr(s) == exp + + def test_categorical_series_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="H", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa + + assert repr(s) == exp + + idx = period_range("2011-01", freq="M", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + + assert repr(s) == exp + + def test_categorical_series_repr_timedelta(self): + idx = timedelta_range("1 days", 
periods=5) + s = Series(Categorical(idx)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + + assert repr(s) == exp + + idx = timedelta_range("1 hours", periods=10) + s = Series(Categorical(idx)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, + 8 days 01:00:00, 9 days 01:00:00]""" # noqa + + assert repr(s) == exp + + def test_categorical_series_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa + + assert repr(s) == exp + + idx = timedelta_range("1 hours", periods=10) + s = Series(Categorical(idx, ordered=True)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 < + 8 days 01:00:00 < 9 days 01:00:00]""" # noqa + + assert repr(s) == exp diff --git a/venv/Lib/site-packages/pandas/tests/series/test_subclass.py b/venv/Lib/site-packages/pandas/tests/series/test_subclass.py new file mode 100644 index 0000000..73247bb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_subclass.py @@ -0,0 +1,37 @@ +import pandas._testing as tm + + +class TestSeriesSubclassing: + def test_indexing_sliced(self): + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd")) + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) + tm.assert_series_equal(res, exp) + + res = s.iloc[[2, 3]] + exp = tm.SubclassedSeries([3, 4], index=list("cd")) + tm.assert_series_equal(res, exp) + + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) + tm.assert_series_equal(res, exp) + + def test_to_frame(self): + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd"), name="xxx") + res = s.to_frame() + exp = tm.SubclassedDataFrame({"xxx": [1, 2, 3, 4]}, index=list("abcd")) + tm.assert_frame_equal(res, exp) + + def test_subclass_unstack(self): + # GH 15564 + s = tm.SubclassedSeries([1, 2, 3, 4], index=[list("aabb"), list("xyxy")]) + + res = s.unstack() + exp = tm.SubclassedDataFrame({"x": [1, 3], "y": [2, 4]}, index=["a", "b"]) + + tm.assert_frame_equal(res, exp) + + def test_subclass_empty_repr(self): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + sub_series = tm.SubclassedSeries() + assert "SubclassedSeries" in repr(sub_series) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_timeseries.py b/venv/Lib/site-packages/pandas/tests/series/test_timeseries.py new file mode 100644 index 0000000..459377f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_timeseries.py @@ -0,0 +1,767 @@ +from datetime import datetime, time, timedelta +from io import StringIO +from itertools import product + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT 
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + NaT, + Series, + Timestamp, + concat, + date_range, + timedelta_range, + to_datetime, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay, BMonthEnd + + +def _simple_ts(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + +def assert_range_equal(left, right): + assert left.equals(right) + assert left.freq == right.freq + assert left.tz == right.tz + + +class TestTimeSeries: + def test_asfreq(self): + ts = Series( + [0.0, 1.0, 2.0], + index=[ + datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31), + ], + ) + + daily_ts = ts.asfreq("B") + monthly_ts = daily_ts.asfreq("BM") + tm.assert_series_equal(monthly_ts, ts) + + daily_ts = ts.asfreq("B", method="pad") + monthly_ts = daily_ts.asfreq("BM") + tm.assert_series_equal(monthly_ts, ts) + + daily_ts = ts.asfreq(BDay()) + monthly_ts = daily_ts.asfreq(BMonthEnd()) + tm.assert_series_equal(monthly_ts, ts) + + result = ts[:0].asfreq("M") + assert len(result) == 0 + assert result is not ts + + daily_ts = ts.asfreq("D", fill_value=-1) + result = daily_ts.value_counts().sort_index() + expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + tm.assert_series_equal(result, expected) + + def test_asfreq_datetimeindex_empty_series(self): + # GH 14320 + index = pd.DatetimeIndex(["2016-09-29 11:00"]) + expected = Series(index=index, dtype=object).asfreq("H") + result = Series([3], index=index.copy()).asfreq("H") + tm.assert_index_equal(expected.index, result.index) + + def test_autocorr(self, datetime_series): + # Just run the function + corr1 = datetime_series.autocorr() + + # Now run it with the lag parameter + corr2 = datetime_series.autocorr(lag=1) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 + + # Choose a random lag between 1 and length of Series - 2 + # and compare the result with the Series corr() function + n = 1 + np.random.randint(max(1, len(datetime_series) - 2)) + corr1 = datetime_series.corr(datetime_series.shift(n)) + corr2 = datetime_series.autocorr(lag=n) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 + + def test_first_last_valid(self, datetime_series): + ts = datetime_series.copy() + ts[:5] = np.NaN + + index = ts.first_valid_index() + assert index == ts.index[5] + + ts[-5:] = np.NaN + index = ts.last_valid_index() + assert index == ts.index[-6] + + ts[:] = np.nan + assert ts.last_valid_index() is None + assert ts.first_valid_index() is None + + ser = Series([], index=[], dtype=object) + assert ser.last_valid_index() is None + assert ser.first_valid_index() is None + + # GH12800 + empty = Series(dtype=object) + assert empty.last_valid_index() is None + assert empty.first_valid_index() is None + + # GH20499: its preserves freq with holes + ts.index = date_range("20110101", periods=len(ts), freq="B") + ts.iloc[1] = 1 + ts.iloc[-2] = 1 + assert ts.first_valid_index() == ts.index[1] + assert ts.last_valid_index() == ts.index[-2] + assert ts.first_valid_index().freq == ts.index.freq + assert ts.last_valid_index().freq == ts.index.freq + + def test_mpl_compat_hack(self, datetime_series): 
+ + # This is currently failing because the test was relying on + # the DeprecationWarning coming through Index.__getitem__. + # We want to implement a warning specifically for Series.__getitem__ + # at which point this will become a Deprecation/FutureWarning + with tm.assert_produces_warning(None): + # GH#30588 multi-dimensional indexing deprecated + result = datetime_series[:, np.newaxis] + expected = datetime_series.values[:, np.newaxis] + tm.assert_almost_equal(result, expected) + + def test_timeseries_coercion(self): + idx = tm.makeDateIndex(10000) + ser = Series(np.random.randn(len(idx)), idx.astype(object)) + assert ser.index.is_all_dates + assert isinstance(ser.index, DatetimeIndex) + + def test_contiguous_boolean_preserve_freq(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + assert expected.freq is not None + assert_range_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + assert masked.freq is None + + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([epoch + t for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + # GH13834 + s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in np.arange(0, 2, 0.25) + ] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + s = concat( + [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], + ignore_index=True, + ) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + result = to_datetime([1, 2, "NaT", pd.NaT, np.nan], unit="D") + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + ) + tm.assert_index_equal(result, expected) + + msg = "non convertible value foo with the unit 'D'" + with pytest.raises(ValueError, match=msg): + to_datetime([1, 2, "foo"], unit="D") + msg = "cannot convert input 111111111 with the unit 'D'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime([1, 2, 111111111], unit="D") + + # coerce we can process + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + ) + result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") + tm.assert_index_equal(result, expected) + + result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") + 
tm.assert_index_equal(result, expected) + + def test_series_ctor_datetime64(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + dates = np.asarray(rng) + + series = Series(dates) + assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") + + result = repr(series) + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) + assert result == expected + + def test_asfreq_keep_index_name(self): + # GH #9854 + index_name = "bar" + index = pd.date_range("20130101", periods=20, name=index_name) + df = pd.DataFrame(list(range(20)), columns=["foo"], index=index) + + assert index_name == df.index.name + assert index_name == df.asfreq("10D").index.name + + def test_promote_datetime_date(self): + rng = date_range("1/1/2000", periods=20) + ts = Series(np.random.randn(20), index=rng) + + ts_slice = ts[5:] + ts2 = ts_slice.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts + ts2 + result2 = ts2 + ts + expected = ts + ts[5:] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + # test asfreq + result = ts2.asfreq("4H", method="ffill") + expected = ts[5:].asfreq("4H", method="ffill") + tm.assert_series_equal(result, expected) + + result = rng.get_indexer(ts2.index) + expected = rng.get_indexer(ts_slice.index) + tm.assert_numpy_array_equal(result, expected) + + def test_asfreq_normalize(self): + rng = date_range("1/1/2000 09:30", periods=20) + norm = date_range("1/1/2000", periods=20) + vals = np.random.randn(20) + ts = Series(vals, index=rng) + + result = ts.asfreq("D", normalize=True) + norm = date_range("1/1/2000", periods=20) + expected = Series(vals, index=norm) + + tm.assert_series_equal(result, expected) + + vals = np.random.randn(20, 3) + ts = DataFrame(vals, index=rng) + + result = ts.asfreq("D", normalize=True) + expected = DataFrame(vals, index=norm) + + tm.assert_frame_equal(result, expected) + + def test_first_subset(self): + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.first("10d") + assert len(result) == 20 + + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.first("10d") + assert len(result) == 10 + + result = ts.first("3M") + expected = ts[:"3/31/2000"] + tm.assert_series_equal(result, expected) + + result = ts.first("21D") + expected = ts[:21] + tm.assert_series_equal(result, expected) + + result = ts[:0].first("3M") + tm.assert_series_equal(result, ts[:0]) + + def test_first_raises(self): + # GH20725 + ser = pd.Series("a b c".split()) + msg = "'first' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): + ser.first("1D") + + def test_last_subset(self): + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.last("10d") + assert len(result) == 20 + + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.last("10d") + assert len(result) == 10 + + result = ts.last("21D") + expected = ts["12/12/2009":] + tm.assert_series_equal(result, expected) + + result = ts.last("21D") + expected = ts[-21:] + tm.assert_series_equal(result, expected) + + result = ts[:0].last("3M") + tm.assert_series_equal(result, ts[:0]) + + def test_last_raises(self): + # GH20725 + ser = pd.Series("a b c".split()) + msg = "'last' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): + ser.last("1D") + + def test_format_pre_1900_dates(self): + rng = 
date_range("1/1/1850", "1/1/1950", freq="A-DEC") + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_at_time(self): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = Series(np.random.randn(len(rng)), index=rng) + rs = ts.at_time(rng[1]) + assert (rs.index.hour == rng[1].hour).all() + assert (rs.index.minute == rng[1].minute).all() + assert (rs.index.second == rng[1].second).all() + + result = ts.at_time("9:30") + expected = ts.at_time(time(9, 30)) + tm.assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts[time(9, 30)] + result_df = df.loc[time(9, 30)] + expected = ts[(rng.hour == 9) & (rng.minute == 30)] + exp_df = df[(rng.hour == 9) & (rng.minute == 30)] + + # FIXME: dont leave commented-out + # expected.index = date_range('1/1/2000', '1/4/2000') + + tm.assert_series_equal(result, expected) + tm.assert_frame_equal(result_df, exp_df) + + chunk = df.loc["1/4/2000":] + result = chunk.loc[time(9, 30)] + expected = result_df[-1:] + tm.assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range("1/1/2000", "1/31/2000") + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.at_time(time(0, 0)) + tm.assert_series_equal(result, ts) + + # time doesn't exist + rng = date_range("1/1/2012", freq="23Min", periods=384) + ts = Series(np.random.randn(len(rng)), rng) + rs = ts.at_time("16:00") + assert len(rs) == 0 + + def test_at_time_raises(self): + # GH20725 + ser = pd.Series("a b c".split()) + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): + ser.at_time("00:00") + + def test_between(self): + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + result = series.between(left, right) + expected = (series >= left) & (series <= right) + tm.assert_series_equal(result, expected) + + def test_between_time(self): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inc_start: + assert t >= stime + else: + assert t > stime + + if inc_end: + assert t <= etime + else: + assert t < etime + + result = ts.between_time("00:00", "01:00") + expected = ts.between_time(stime, etime) + tm.assert_series_equal(result, expected) + + # across midnight + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inc_start: + assert (t >= stime) or (t <= etime) + else: + assert (t > stime) or (t <= etime) + + if inc_end: + assert (t <= etime) or (t >= stime) + else: + assert (t < etime) or (t >= stime) + + def test_between_time_raises(self): + # GH20725 + ser = pd.Series("a b c".split()) + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): + 
ser.between_time(start_time="00:00", end_time="12:00") + + def test_between_time_types(self): + # GH11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" + with pytest.raises(ValueError, match=msg): + rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + frame = DataFrame({"A": 0}, index=rng) + with pytest.raises(ValueError, match=msg): + frame.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + series = Series(0, index=rng) + with pytest.raises(ValueError, match=msg): + series.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + @td.skip_if_has_locale + def test_between_time_formats(self): + # GH11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + + strings = [ + ("2:00", "2:30"), + ("0200", "0230"), + ("2:00am", "2:30am"), + ("0200am", "0230am"), + ("2:00:00", "2:30:00"), + ("020000", "023000"), + ("2:00:00am", "2:30:00am"), + ("020000am", "023000am"), + ] + expected_length = 28 + + for time_string in strings: + assert len(ts.between_time(*time_string)) == expected_length + + def test_between_time_axis(self): + # issue 8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + ts = Series(np.random.randn(len(rng)), index=rng) + stime, etime = ("08:00:00", "09:00:00") + expected_length = 7 + + assert len(ts.between_time(stime, etime)) == expected_length + assert len(ts.between_time(stime, etime, axis=0)) == expected_length + msg = "No axis named 1 for object type " + with pytest.raises(ValueError, match=msg): + ts.between_time(stime, etime, axis=1) + + def test_to_period(self): + from pandas.core.indexes.period import period_range + + ts = _simple_ts("1/1/2000", "1/1/2001") + + pts = ts.to_period() + exp = ts.copy() + exp.index = period_range("1/1/2000", "1/1/2001") + tm.assert_series_equal(pts, exp) + + pts = ts.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) + tm.assert_series_equal(pts, exp) + + # GH 7606 without freq + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = pd.PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) + + s = Series(np.random.randn(4), index=idx) + expected = s.copy() + expected.index = exp_idx + tm.assert_series_equal(s.to_period(), expected) + + df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) + expected = df.copy() + expected.index = exp_idx + tm.assert_frame_equal(df.to_period(), expected) + + expected = df.copy() + expected.columns = exp_idx + tm.assert_frame_equal(df.to_period(axis=1), expected) + + def test_groupby_count_dateparseerror(self): + dr = date_range(start="1/1/2012", freq="5min", periods=10) + + # BAD Example, datetimes first + s = Series(np.arange(10), index=[dr, np.arange(10)]) + grouped = s.groupby(lambda x: x[1] % 2 == 0) + result = grouped.count() + + s = Series(np.arange(10), index=[np.arange(10), dr]) + grouped = s.groupby(lambda x: x[0] % 2 == 0) + expected = grouped.count() + + tm.assert_series_equal(result, expected) + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + assert "2000-01-01" in result + + def test_series_map_box_timedelta(self): + # GH 11349 + s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) + + def f(x): + return 
x.total_seconds() + + s.map(f) + s.apply(f) + DataFrame(s).applymap(f) + + def test_asfreq_resample_set_correct_freq(self): + # GH5613 + # we test if .asfreq() and .resample() set the correct value for .freq + df = pd.DataFrame( + {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} + ) + df = df.set_index(pd.to_datetime(df.date)) + + # testing the settings before calling .asfreq() and .resample() + assert df.index.freq is None + assert df.index.inferred_freq == "D" + + # does .asfreq() set .freq correctly? + assert df.asfreq("D").index.freq == "D" + + # does .resample() set .freq correctly? + assert df.resample("D").asfreq().index.freq == "D" + + def test_pickle(self): + + # GH4606 + p = tm.round_trip_pickle(NaT) + assert p is NaT + + idx = pd.to_datetime(["2013-01-01", NaT, "2014-01-06"]) + idx_p = tm.round_trip_pickle(idx) + assert idx_p[0] == idx[0] + assert idx_p[1] is NaT + assert idx_p[2] == idx[2] + + # GH11002 + # don't infer freq + idx = date_range("1750-1-1", "2050-1-1", freq="7D") + idx_p = tm.round_trip_pickle(idx) + tm.assert_index_equal(idx, idx_p) + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) + def test_setops_preserve_freq(self, tz): + rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) + + result = rng[:50].union(rng[50:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[30:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[60:100]) + assert result.name == rng.name + assert result.freq is None + assert result.tz == rng.tz + + result = rng[:50].intersection(rng[25:75]) + assert result.name == rng.name + assert result.freqstr == "D" + assert result.tz == rng.tz + + nofreq = DatetimeIndex(list(rng[25:75]), name="other") + result = rng[:50].union(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].intersection(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + + def test_from_M8_structured(self): + dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] + arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")]) + df = DataFrame(arr) + + assert df["Date"][0] == dates[0][0] + assert df["Forecasting"][0] == dates[0][1] + + s = Series(arr["Date"]) + assert isinstance(s[0], Timestamp) + assert s[0] == dates[0][0] + + def test_get_level_values_box(self): + from pandas import MultiIndex + + dates = date_range("1/1/2000", periods=4) + levels = [dates, [0, 1]] + codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] + + index = MultiIndex(levels=levels, codes=codes) + + assert isinstance(index.get_level_values(0)[0], Timestamp) + + def test_view_tz(self): + # GH#24024 + ser = pd.Series(pd.date_range("2000", periods=4, tz="US/Central")) + result = ser.view("i8") + expected = pd.Series( + [ + 946706400000000000, + 946792800000000000, + 946879200000000000, + 946965600000000000, + ] + ) + tm.assert_series_equal(result, expected) + + def test_asarray_tz_naive(self): + # This shouldn't produce a warning. 
+ ser = pd.Series(pd.date_range("2000", periods=2)) + expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") + result = np.asarray(ser) + + tm.assert_numpy_array_equal(result, expected) + + # optionally, object + result = np.asarray(ser, dtype=object) + + expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) + tm.assert_numpy_array_equal(result, expected) + + def test_asarray_tz_aware(self): + tz = "US/Central" + ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) + expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") + result = np.asarray(ser, dtype="datetime64[ns]") + + tm.assert_numpy_array_equal(result, expected) + + # Old behavior with no warning + result = np.asarray(ser, dtype="M8[ns]") + + tm.assert_numpy_array_equal(result, expected) + + # Future behavior with no warning + expected = np.array( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) + result = np.asarray(ser, dtype=object) + + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_timezones.py b/venv/Lib/site-packages/pandas/tests/series/test_timezones.py new file mode 100644 index 0000000..a363f92 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_timezones.py @@ -0,0 +1,366 @@ +""" +Tests for Series timezone-related methods +""" +from datetime import datetime + +from dateutil.tz import tzoffset +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import conversion, timezones + +from pandas import DatetimeIndex, Index, NaT, Series, Timestamp +import pandas._testing as tm +from pandas.core.indexes.datetimes import date_range + + +class TestSeriesTimezones: + # ----------------------------------------------------------------- + # Series.tz_localize + def test_series_tz_localize(self): + + rng = date_range("1/1/2011", periods=100, freq="H") + ts = Series(1, index=rng) + + result = ts.tz_localize("utc") + assert result.index.tz.zone == "UTC" + + # Can't localize if already tz-aware + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + ts = Series(1, index=rng) + + with pytest.raises(TypeError, match="Already tz-aware"): + ts.tz_localize("US/Eastern") + + def test_series_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + + # GH#14402 + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + + ser = Series([ts]) + expected0 = Series([expected0]) + expected1 = Series([expected1]) + + with pytest.raises(pytz.AmbiguousTimeError): + ser.dt.tz_localize("US/Central") + + result = ser.dt.tz_localize("US/Central", ambiguous=True) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize("US/Central", ambiguous=[True]) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize("US/Central", ambiguous=False) + tm.assert_series_equal(result, expected1) + + result = ser.dt.tz_localize("US/Central", ambiguous=[False]) + tm.assert_series_equal(result, expected1) + + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", + [ + ["shift_forward", "2015-03-29 03:00:00"], + ["NaT", NaT], + ["raise", None], + ["foo", "invalid"], + ], + ) + def test_series_tz_localize_nonexistent(self, tz, method, exp): + # GH 8917 + n = 60 + dti = date_range(start="2015-03-29 02:00:00", periods=n, 
freq="min") + s = Series(1, dti) + if method == "raise": + with pytest.raises(pytz.NonExistentTimeError): + s.tz_localize(tz, nonexistent=method) + elif exp == "invalid": + with pytest.raises(ValueError): + dti.tz_localize(tz, nonexistent=method) + else: + result = s.tz_localize(tz, nonexistent=method) + expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_series_tz_localize_empty(self, tzstr): + # GH#2248 + ser = Series(dtype=object) + + ser2 = ser.tz_localize("utc") + assert ser2.index.tz == pytz.utc + + ser2 = ser.tz_localize(tzstr) + timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr)) + + # ----------------------------------------------------------------- + # Series.tz_convert + + def test_series_tz_convert(self): + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + ts = Series(1, index=rng) + + result = ts.tz_convert("Europe/Berlin") + assert result.index.tz.zone == "Europe/Berlin" + + # can't convert tz-naive + rng = date_range("1/1/2011", periods=200, freq="D") + ts = Series(1, index=rng) + + with pytest.raises(TypeError, match="Cannot convert tz-naive"): + ts.tz_convert("US/Eastern") + + def test_series_tz_convert_to_utc(self): + base = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] + + res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) + tm.assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) + + # ----------------------------------------------------------------- + # Series.append + + def test_series_append_aware(self): + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex( + ["2011-01-01 01:00", "2011-01-01 02:00"], tz="US/Eastern" + ) + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="UTC") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="UTC") + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(["2011-01-01 01:00", "2011-01-01 02:00"], tz="UTC") + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + utc = rng1.tz + assert utc == ts_result.index.tz + + # GH#7795 + # different tz coerces to object dtype, not UTC + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Central") + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + exp_index = Index( + [ + Timestamp("1/1/2011 01:00", tz="US/Eastern"), + Timestamp("1/1/2011 02:00", tz="US/Central"), + ] + ) + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + + def test_series_append_aware_naive(self): + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = 
ser1.index.astype(object).append(ser2.index.astype(object)) + assert ts_result.index.equals(expected) + + # mixed + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") + rng2 = range(100) + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index) + assert ts_result.index.equals(expected) + + def test_series_append_dst(self): + rng1 = date_range("1/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") + rng2 = date_range("8/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") + ser1 = Series([1, 2, 3], index=rng1) + ser2 = Series([10, 11, 12], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex( + [ + "2016-01-01 01:00", + "2016-01-01 02:00", + "2016-01-01 03:00", + "2016-08-01 01:00", + "2016-08-01 02:00", + "2016-08-01 03:00", + ], + tz="US/Eastern", + ) + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + # ----------------------------------------------------------------- + + def test_dateutil_tzoffset_support(self): + values = [188.5, 328.25] + tzinfo = tzoffset(None, 7200) + index = [ + datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo), + ] + series = Series(data=values, index=index) + + assert series.index.tz == tzinfo + + # it works! #2443 + repr(series.index[0]) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_aware_asfreq(self, tz): + dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz) + + ser = Series(np.random.randn(len(dr)), index=dr) + + # it works! + ser.asfreq("T") + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range("1/1/2000", periods=10, tz=tz) + ser = Series(np.random.randn(len(rng)), index=rng) + + result = ser["1/3/2000"] + tm.assert_almost_equal(result, ser[2]) + + # TODO: De-duplicate with test below + def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + ser = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ser.tz_convert("Europe/Moscow") + + result = ser + ts_moscow + assert result.index.tz is pytz.utc + + result = ts_moscow + ser + assert result.index.tz is pytz.utc + + def test_series_add_tz_mismatch_converts_to_utc(self): + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + + perm = np.random.permutation(100)[:90] + ser1 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("US/Eastern") + ) + + perm = np.random.permutation(100)[:90] + ser2 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("Europe/Berlin") + ) + + result = ser1 + ser2 + + uts1 = ser1.tz_convert("utc") + uts2 = ser2.tz_convert("utc") + expected = uts1 + uts2 + + assert result.index.tz == pytz.UTC + tm.assert_series_equal(result, expected) + + def test_series_add_aware_naive_raises(self): + rng = date_range("1/1/2011", periods=10, freq="H") + ser = Series(np.random.randn(len(rng)), index=rng) + + ser_utc = ser.tz_localize("utc") + + with pytest.raises(Exception): + ser + ser_utc + + with pytest.raises(Exception): + ser_utc + ser + + def test_series_align_aware(self): + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert("US/Central") + # # 
different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_localized_at_time_between_time(self, tzstr): + from datetime import time + + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("4/16/2012", "5/1/2012", freq="H") + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = date_range( + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + ) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = conversion.localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] + + def test_series_truncate_datetimeindex_tz(self): + # GH 9243 + idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific") + s = Series(range(len(idx)), index=idx) + result = s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4)) + expected = Series([1, 2, 3], index=idx[1:4]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) + def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): + # GH 6326 + result = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + tm.assert_series_equal(result, expected) + + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): + # GH 25843 + tz = tz_aware_fixture + result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") + expected = Series([Timestamp("2019")]) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_ufunc.py b/venv/Lib/site-packages/pandas/tests/series/test_ufunc.py new file mode 100644 index 0000000..ece7f1f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_ufunc.py @@ -0,0 +1,304 @@ +from collections import deque +import string + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray + +UNARY_UFUNCS = [np.positive, np.floor, np.exp] +BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op +SPARSE = [True, False] +SPARSE_IDS = ["sparse", "dense"] +SHUFFLE = [True, False] + + +@pytest.fixture +def arrays_for_binary_ufunc(): + """ + A pair of random, length-100 integer-dtype arrays, that are mostly 0. 
+ """ + a1 = np.random.randint(0, 10, 100, dtype="int64") + a2 = np.random.randint(0, 10, 100, dtype="int64") + a1[::3] = 0 + a2[::4] = 0 + return a1, a2 + + +@pytest.mark.parametrize("ufunc", UNARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_unary_ufunc(ufunc, sparse): + # Test that ufunc(Series) == Series(ufunc) + array = np.random.randint(0, 10, 10, dtype="int64") + array[::2] = 0 + if sparse: + array = SparseArray(array, dtype=pd.SparseDtype("int64", 0)) + + index = list(string.ascii_letters[:10]) + name = "name" + series = pd.Series(array, index=index, name=name) + + result = ufunc(series) + expected = pd.Series(ufunc(array), index=index, name=name) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) +def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = a2 + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) +def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # * ufunc(Index, Series) dispatches to Series (returns a Series) + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + + name = "name" # op(Series, array) preserves the name. + series = pd.Series(a1, name=name) + other = pd.Index(a2, name=name).astype("int64") + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = reversed(array_args) + series_args = reversed(series_args) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", [True, False], ids=["unaligned", "aligned"]) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) +def test_binary_ufunc_with_series( + flip, shuffle, sparse, ufunc, arrays_for_binary_ufunc +): + # Test that + # * func(Series(a), Series(b)) == Series(ufunc(a, b)) + # with alignment between the indices + a1, a2 = arrays_for_binary_ufunc + if sparse: + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + + name = "name" # op(Series, array) preserves the name. 
+ series = pd.Series(a1, name=name) + other = pd.Series(a2, name=name) + + idx = np.random.permutation(len(a1)) + + if shuffle: + other = other.take(idx) + if flip: + index = other.align(series)[0].index + else: + index = series.align(other)[0].index + else: + index = series.index + + array_args = (a1, a2) + series_args = (series, other) # ufunc(series, array) + + if flip: + array_args = tuple(reversed(array_args)) + series_args = tuple(reversed(series_args)) # ufunc(array, series) + + expected = pd.Series(ufunc(*array_args), index=index, name=name) + result = ufunc(*series_args) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("flip", [True, False]) +def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): + # Test that + # * ufunc(Series, scalar) == Series(ufunc(array, scalar)) + # * ufunc(Series, scalar) == ufunc(scalar, Series) + array, _ = arrays_for_binary_ufunc + if sparse: + array = SparseArray(array) + other = 2 + series = pd.Series(array, name="name") + + series_args = (series, other) + array_args = (array, other) + + if flip: + series_args = tuple(reversed(series_args)) + array_args = tuple(reversed(array_args)) + + expected = pd.Series(ufunc(*array_args), name="name") + result = ufunc(*series_args) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.divmod]) # any others? +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("shuffle", SHUFFLE) +@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") +def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): + # Test that + # the same conditions from binary_ufunc_scalar apply to + # ufuncs with multiple outputs. + if sparse and ufunc is np.divmod: + pytest.skip("sparse divmod not implemented.") + + a1, a2 = arrays_for_binary_ufunc + # work around https://github.com/pandas-dev/pandas/issues/26987 + a1[a1 == 0] = 1 + a2[a2 == 0] = 1 + + if sparse: + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + + s1 = pd.Series(a1) + s2 = pd.Series(a2) + + if shuffle: + # ensure we align before applying the ufunc + s2 = s2.sample(frac=1) + + expected = ufunc(a1, a2) + assert isinstance(expected, tuple) + + result = ufunc(s1, s2) + assert isinstance(result, tuple) + tm.assert_series_equal(result[0], pd.Series(expected[0])) + tm.assert_series_equal(result[1], pd.Series(expected[1])) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): + # Test that the same conditions from unary input apply to multi-output + # ufuncs + array, _ = arrays_for_binary_ufunc + + if sparse: + array = SparseArray(array) + + series = pd.Series(array, name="name") + result = np.modf(series) + expected = np.modf(array) + + assert isinstance(result, tuple) + assert isinstance(expected, tuple) + + tm.assert_series_equal(result[0], pd.Series(expected[0], name="name")) + tm.assert_series_equal(result[1], pd.Series(expected[1], name="name")) + + +@pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) +@pytest.mark.parametrize("ufunc", BINARY_UFUNCS) +def test_binary_ufunc_drops_series_name(ufunc, sparse, arrays_for_binary_ufunc): + # Drop the names when they differ. 
+ a1, a2 = arrays_for_binary_ufunc + s1 = pd.Series(a1, name="a") + s2 = pd.Series(a2, name="b") + + result = ufunc(s1, s2) + assert result.name is None + + +def test_object_series_ok(): + class Dummy: + def __init__(self, value): + self.value = value + + def __add__(self, other): + return self.value + other.value + + arr = np.array([Dummy(0), Dummy(1)]) + ser = pd.Series(arr) + tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr))) + tm.assert_series_equal(np.add(ser, Dummy(1)), pd.Series(np.add(ser, Dummy(1)))) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 3, 2], dtype="int64"), + pd.array([1, 10, 0], dtype="Sparse[int]"), + pd.to_datetime(["2000", "2010", "2001"]), + pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), + pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), + ], +) +def test_reduce(values): + a = pd.Series(values) + assert np.maximum.reduce(a) == values[1] + + +@pytest.mark.parametrize("type_", [list, deque, tuple]) +def test_binary_ufunc_other_types(type_): + a = pd.Series([1, 2, 3], name="name") + b = type_([3, 4, 5]) + + result = np.add(a, b) + expected = pd.Series(np.add(a.to_numpy(), b), name="name") + tm.assert_series_equal(result, expected) + + +def test_object_dtype_ok(): + class Thing: + def __init__(self, value): + self.value = value + + def __add__(self, other): + other = getattr(other, "value", other) + return type(self)(self.value + other) + + def __eq__(self, other) -> bool: + return type(other) is Thing and self.value == other.value + + def __repr__(self) -> str: + return "Thing({})".format(self.value) + + s = pd.Series([Thing(1), Thing(2)]) + result = np.add(s, Thing(1)) + expected = pd.Series([Thing(2), Thing(3)]) + tm.assert_series_equal(result, expected) + + +def test_outer(): + # https://github.com/pandas-dev/pandas/issues/27186 + s = pd.Series([1, 2, 3]) + o = np.array([1, 2, 3]) + + with pytest.raises(NotImplementedError): + np.subtract.outer(s, o) diff --git a/venv/Lib/site-packages/pandas/tests/series/test_validate.py b/venv/Lib/site-packages/pandas/tests/series/test_validate.py new file mode 100644 index 0000000..c4311f5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/series/test_validate.py @@ -0,0 +1,20 @@ +import pytest + + +class TestSeriesValidate: + """Tests for error handling related to data types of method arguments.""" + + @pytest.mark.parametrize( + "func", + ["reset_index", "_set_name", "sort_values", "sort_index", "rename", "dropna"], + ) + @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) + def test_validate_bool_args(self, string_series, func, inplace): + msg = 'For argument "inplace" expected type bool' + kwargs = dict(inplace=inplace) + + if func == "_set_name": + kwargs["name"] = "hello" + + with pytest.raises(ValueError, match=msg): + getattr(string_series, func)(**kwargs) diff --git a/venv/Lib/site-packages/pandas/tests/test_algos.py b/venv/Lib/site-packages/pandas/tests/test_algos.py new file mode 100644 index 0000000..2b46f86 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_algos.py @@ -0,0 +1,2283 @@ +from datetime import datetime +from itertools import permutations +import struct + +import numpy as np +from numpy.random import RandomState +import pytest + +from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht +from pandas.compat.numpy import np_array_datetime64_compat +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_complex_dtype, + is_float_dtype, + 
is_integer_dtype, + is_object_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype as CDT + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timestamp, + compat, +) +import pandas._testing as tm +from pandas.conftest import BYTES_DTYPES, STRING_DTYPES +import pandas.core.algorithms as algos +from pandas.core.arrays import DatetimeArray +import pandas.core.common as com + + +class TestFactorize: + def test_basic(self): + + codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) + + codes, uniques = algos.factorize( + ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True + ) + exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = np.array(["a", "b", "c"], dtype=object) + tm.assert_numpy_array_equal(uniques, exp) + + codes, uniques = algos.factorize(list(reversed(range(5)))) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) + tm.assert_numpy_array_equal(uniques, exp) + + codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) + + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + tm.assert_numpy_array_equal(uniques, exp) + + codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) + tm.assert_numpy_array_equal(uniques, exp) + + codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) + tm.assert_numpy_array_equal(uniques, exp) + + def test_mixed(self): + + # doc example reshaping.rst + x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) + codes, uniques = algos.factorize(x) + + exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = Index(["A", "B", 3.14, np.inf]) + tm.assert_index_equal(uniques, exp) + + codes, uniques = algos.factorize(x, sort=True) + exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = Index([3.14, np.inf, "A", "B"]) + tm.assert_index_equal(uniques, exp) + + def test_datelike(self): + + # M8 + v1 = Timestamp("20130101 09:00:00.00004") + v2 = Timestamp("20130101") + x = Series([v1, v1, v1, v2, v2, v1]) + codes, uniques = algos.factorize(x) + + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = DatetimeIndex([v1, v2]) + tm.assert_index_equal(uniques, exp) + + codes, uniques = algos.factorize(x, sort=True) + exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + exp = DatetimeIndex([v2, v1]) + tm.assert_index_equal(uniques, exp) + + # period + v1 = pd.Period("201302", freq="M") + v2 = pd.Period("201303", freq="M") + x = Series([v1, v1, v1, v2, v2, v1]) + + # periods are not 'sorted' as they are converted back into an index + codes, uniques = algos.factorize(x) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) + + codes, uniques = 
algos.factorize(x, sort=True) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) + + # GH 5986 + v1 = pd.to_timedelta("1 day 1 min") + v2 = pd.to_timedelta("1 day") + x = Series([v1, v2, v1, v1, v2, v2, v1]) + codes, uniques = algos.factorize(x) + exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) + + codes, uniques = algos.factorize(x, sort=True) + exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, exp) + tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) + + def test_factorize_nan(self): + # nan should map to na_sentinel, not reverse_indexer[na_sentinel] + # rizer.factorize should not raise an exception if na_sentinel indexes + # outside of reverse_indexer + key = np.array([1, 2, 1, np.nan], dtype="O") + rizer = ht.Factorizer(len(key)) + for na_sentinel in (-1, 20): + ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) + expected = np.array([0, 1, 0, na_sentinel], dtype="int32") + assert len(set(key)) == len(set(expected)) + tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) + + # nan still maps to na_sentinel when sort=False + key = np.array([0, np.nan, 1], dtype="O") + na_sentinel = -1 + + # TODO(wesm): unused? + ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa + + expected = np.array([2, -1, 0], dtype="int32") + assert len(set(key)) == len(set(expected)) + tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) + + @pytest.mark.parametrize( + "data, expected_codes, expected_uniques", + [ + ( + [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), "nonsense"], + ), + ( + [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), (1, 2, 3)], + ), + ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), + ], + ) + def test_factorize_tuple_list(self, data, expected_codes, expected_uniques): + # GH9454 + codes, uniques = pd.factorize(data) + + tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp)) + + expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object) + tm.assert_numpy_array_equal(uniques, expected_uniques_array) + + def test_complex_sorting(self): + # gh 12666 - check no segfault + x17 = np.array([complex(i) for i in range(17)], dtype=object) + + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) + with pytest.raises(TypeError, match=msg): + algos.factorize(x17[::-1], sort=True) + + def test_float64_factorize(self, writable): + data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) + data.setflags(write=writable) + expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) + expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + def test_uint64_factorize(self, writable): + data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) + data.setflags(write=writable) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) 
+ tm.assert_numpy_array_equal(uniques, expected_uniques) + + def test_int64_factorize(self, writable): + data = np.array([2 ** 63 - 1, -(2 ** 63), 2 ** 63 - 1], dtype=np.int64) + data.setflags(write=writable) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 63 - 1, -(2 ** 63)], dtype=np.int64) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + def test_string_factorize(self, writable): + data = np.array(["a", "c", "a", "b", "c"], dtype=object) + data.setflags(write=writable) + expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + def test_object_factorize(self, writable): + data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) + data.setflags(write=writable) + expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) + + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + def test_deprecate_order(self): + # gh 19727 - check warning is raised for deprecated keyword, order. + # Test not valid once order keyword is removed. + data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) + with pytest.raises(TypeError, match="got an unexpected keyword"): + algos.factorize(data, order=True) + with tm.assert_produces_warning(False): + algos.factorize(data) + + @pytest.mark.parametrize( + "data", + [ + np.array([0, 1, 0], dtype="u8"), + np.array([-(2 ** 63), 1, -(2 ** 63)], dtype="i8"), + np.array(["__nan__", "foo", "__nan__"], dtype="object"), + ], + ) + def test_parametrized_factorize_na_value_default(self, data): + # arrays that include the NA default for that type, but isn't used. 
+ codes, uniques = algos.factorize(data) + expected_uniques = data[[0, 1]] + expected_codes = np.array([0, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + @pytest.mark.parametrize( + "data, na_value", + [ + (np.array([0, 1, 0, 2], dtype="u8"), 0), + (np.array([1, 0, 1, 2], dtype="u8"), 1), + (np.array([-(2 ** 63), 1, -(2 ** 63), 0], dtype="i8"), -(2 ** 63)), + (np.array([1, -(2 ** 63), 1, 0], dtype="i8"), 1), + (np.array(["a", "", "a", "b"], dtype=object), "a"), + (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), + (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), + ], + ) + def test_parametrized_factorize_na_value(self, data, na_value): + codes, uniques = algos._factorize_array(data, na_value=na_value) + expected_uniques = data[[1, 3]] + expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + + @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) + @pytest.mark.parametrize( + "data, uniques", + [ + ( + np.array(["b", "a", None, "b"], dtype=object), + np.array(["b", "a"], dtype=object), + ), + ( + pd.array([2, 1, np.nan, 2], dtype="Int64"), + pd.array([2, 1], dtype="Int64"), + ), + ], + ids=["numpy_array", "extension_array"], + ) + def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + if sort: + expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_uniques = algos.safe_sort(uniques) + else: + expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_uniques = uniques + tm.assert_numpy_array_equal(codes, expected_codes) + if isinstance(data, np.ndarray): + tm.assert_numpy_array_equal(uniques, expected_uniques) + else: + tm.assert_extension_array_equal(uniques, expected_uniques) + + +class TestUnique: + def test_ints(self): + arr = np.random.randint(0, 100, size=50) + + result = algos.unique(arr) + assert isinstance(result, np.ndarray) + + def test_objects(self): + arr = np.random.randint(0, 100, size=50).astype("O") + + result = algos.unique(arr) + assert isinstance(result, np.ndarray) + + def test_object_refcount_bug(self): + lst = ["A", "B", "C", "D", "E"] + for i in range(1000): + len(algos.unique(lst)) + + def test_on_index_object(self): + + mindex = pd.MultiIndex.from_arrays( + [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] + ) + expected = mindex.values + expected.sort() + + mindex = mindex.repeat(2) + + result = pd.unique(mindex) + result.sort() + + tm.assert_almost_equal(result, expected) + + def test_dtype_preservation(self, any_numpy_dtype): + # GH 15442 + if any_numpy_dtype in (BYTES_DTYPES + STRING_DTYPES): + pytest.skip("skip string dtype") + elif is_integer_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1, 2] + elif is_float_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1.0, 2.0] + elif is_complex_dtype(any_numpy_dtype): + data = [complex(1, 0), complex(2, 0), complex(2, 0)] + uniques = [complex(1, 0), complex(2, 0)] + elif is_bool_dtype(any_numpy_dtype): + data = [True, True, False] + uniques = [True, False] + elif is_object_dtype(any_numpy_dtype): + data = ["A", "B", "B"] + uniques = ["A", "B"] + else: + # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere + data = [1, 2, 2] + uniques = [1, 2] + + result = Series(data, 
dtype=any_numpy_dtype).unique() + expected = np.array(uniques, dtype=any_numpy_dtype) + + tm.assert_numpy_array_equal(result, expected) + + def test_datetime64_dtype_array_returned(self): + # GH 9431 + expected = np_array_datetime64_compat( + [ + "2015-01-03T00:00:00.000000000+0000", + "2015-01-01T00:00:00.000000000+0000", + ], + dtype="M8[ns]", + ) + + dt_index = pd.to_datetime( + [ + "2015-01-03T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + ] + ) + result = algos.unique(dt_index) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + s = Series(dt_index) + result = algos.unique(s) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + arr = s.values + result = algos.unique(arr) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + def test_timedelta64_dtype_array_returned(self): + # GH 9431 + expected = np.array([31200, 45678, 10000], dtype="m8[ns]") + + td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) + result = algos.unique(td_index) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + s = Series(td_index) + result = algos.unique(s) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + arr = s.values + result = algos.unique(arr) + tm.assert_numpy_array_equal(result, expected) + assert result.dtype == expected.dtype + + def test_uint64_overflow(self): + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(algos.unique(s), exp) + + def test_nan_in_object_array(self): + duplicated_items = ["a", np.nan, "c", "c"] + result = pd.unique(duplicated_items) + expected = np.array(["a", np.nan, "c"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_categorical(self): + + # we are expecting to return in the order + # of appearance + expected = Categorical(list("bac"), categories=list("bac")) + + # we are expecting to return in the order + # of the categories + expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True) + + # GH 15939 + c = Categorical(list("baabc")) + result = c.unique() + tm.assert_categorical_equal(result, expected) + + result = algos.unique(c) + tm.assert_categorical_equal(result, expected) + + c = Categorical(list("baabc"), ordered=True) + result = c.unique() + tm.assert_categorical_equal(result, expected_o) + + result = algos.unique(c) + tm.assert_categorical_equal(result, expected_o) + + # Series of categorical dtype + s = Series(Categorical(list("baabc")), name="foo") + result = s.unique() + tm.assert_categorical_equal(result, expected) + + result = pd.unique(s) + tm.assert_categorical_equal(result, expected) + + # CI -> return CI + ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) + expected = CategoricalIndex(expected) + result = ci.unique() + tm.assert_index_equal(result, expected) + + result = pd.unique(ci) + tm.assert_index_equal(result, expected) + + def test_datetime64tz_aware(self): + # GH 15939 + + result = Series( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ).unique() + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) + ) + tm.assert_extension_array_equal(result, expected) + + result = Index( + [ + Timestamp("20160101", tz="US/Eastern"), + 
Timestamp("20160101", tz="US/Eastern"), + ] + ).unique() + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) + tm.assert_index_equal(result, expected) + + result = pd.unique( + Series( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + ) + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01", tz="US/Eastern")]) + ) + tm.assert_extension_array_equal(result, expected) + + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) + tm.assert_index_equal(result, expected) + + def test_order_of_appearance(self): + # 9346 + # light testing of guarantee of order of appearance + # these also are the doc-examples + result = pd.unique(Series([2, 1, 3, 3])) + tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64")) + + result = pd.unique(Series([2] + [1] * 5)) + tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) + + result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")])) + expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]") + tm.assert_numpy_array_equal(result, expected) + + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) + tm.assert_index_equal(result, expected) + + result = pd.unique(list("aabc")) + expected = np.array(["a", "b", "c"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = pd.unique(Series(Categorical(list("aabc")))) + expected = Categorical(list("abc")) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize( + "arg ,expected", + [ + (("1", "1", "2"), np.array(["1", "2"], dtype=object)), + (("foo",), np.array(["foo"], dtype=object)), + ], + ) + def test_tuple_with_strings(self, arg, expected): + # see GH 17108 + result = pd.unique(arg) + tm.assert_numpy_array_equal(result, expected) + + def test_obj_none_preservation(self): + # GH 20866 + arr = np.array(["foo", None], dtype=object) + result = pd.unique(arr) + expected = np.array(["foo", None], dtype=object) + + tm.assert_numpy_array_equal(result, expected, strict_nan=True) + + def test_signed_zero(self): + # GH 21866 + a = np.array([-0.0, 0.0]) + result = pd.unique(a) + expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent + tm.assert_numpy_array_equal(result, expected) + + def test_different_nans(self): + # GH 21866 + # create different nans from bit-patterns: + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent + result = pd.unique(a) + expected = np.array([np.nan]) + tm.assert_numpy_array_equal(result, expected) + + def test_first_nan_kept(self): + # GH 22295 + # create different nans from bit-patterns: + bits_for_nan1 = 0xFFF8000000000001 + bits_for_nan2 = 0x7FF8000000000001 + NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + for el_type in [np.float64, np.object]: + a = np.array([NAN1, NAN2], dtype=el_type) 
+ result = pd.unique(a) + assert result.size == 1 + # use bit patterns to identify which nan was kept: + result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0] + assert result_nan_bits == bits_for_nan1 + + def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2): + # GH 22295 + if unique_nulls_fixture is unique_nulls_fixture2: + return # skip it, values not unique + a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) + result = pd.unique(a) + assert result.size == 2 + assert a[0] is unique_nulls_fixture + assert a[1] is unique_nulls_fixture2 + + +class TestIsin: + def test_invalid(self): + + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[int\]" + ) + with pytest.raises(TypeError, match=msg): + algos.isin(1, 1) + with pytest.raises(TypeError, match=msg): + algos.isin(1, [1]) + with pytest.raises(TypeError, match=msg): + algos.isin([1], 1) + + def test_basic(self): + + result = algos.isin([1, 2], [1]) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(np.array([1, 2]), [1]) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(Series([1, 2]), [1]) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(Series([1, 2]), Series([1])) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(Series([1, 2]), {1}) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(["a", "b"], ["a"]) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(Series(["a", "b"]), Series(["a"])) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(Series(["a", "b"]), {"a"}) + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(["a", "b"], [1]) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_i8(self): + + arr = pd.date_range("20130101", periods=3).values + result = algos.isin(arr, [arr[0]]) + expected = np.array([True, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(arr, arr[0:2]) + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(arr, set(arr[0:2])) + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.timedelta_range("1 day", periods=3).values + result = algos.isin(arr, [arr[0]]) + expected = np.array([True, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(arr, arr[0:2]) + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.isin(arr, set(arr[0:2])) + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_large(self): + + s = pd.date_range("20000101", periods=2000000, freq="s").values + result = algos.isin(s, s[0:2]) + expected = np.zeros(len(s), dtype=bool) + expected[0] = True + expected[1] = True + tm.assert_numpy_array_equal(result, expected) + + def test_categorical_from_codes(self): + # GH 16639 + vals = np.array([0, 1, 2, 0]) + cats = ["a", "b", "c"] + Sd = Series(Categorical(1).from_codes(vals, cats)) + St = 
Series(Categorical(1).from_codes(np.array([0, 1]), cats)) + expected = np.array([True, True, False, True]) + result = algos.isin(Sd, St) + tm.assert_numpy_array_equal(expected, result) + + def test_same_nan_is_in(self): + # GH 22160 + # nan is special, because from " a is b" doesn't follow "a == b" + # at least, isin() should follow python's "np.nan in [nan] == True" + # casting to -> np.float64 -> another float-object somewhere on + # the way could lead jepardize this behavior + comps = [np.nan] # could be casted to float64 + values = [np.nan] + expected = np.array([True]) + result = algos.isin(comps, values) + tm.assert_numpy_array_equal(expected, result) + + def test_same_object_is_in(self): + # GH 22160 + # there could be special treatment for nans + # the user however could define a custom class + # with similar behavior, then we at least should + # fall back to usual python's behavior: "a in [a] == True" + class LikeNan: + def __eq__(self, other) -> bool: + return False + + def __hash__(self): + return 0 + + a, b = LikeNan(), LikeNan() + # same object -> True + tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True])) + # different objects -> False + tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False])) + + def test_different_nans(self): + # GH 22160 + # all nans are handled as equivalent + + comps = [float("nan")] + values = [float("nan")] + assert comps[0] is not values[0] # different nan-objects + + # as list of python-objects: + result = algos.isin(comps, values) + tm.assert_numpy_array_equal(np.array([True]), result) + + # as object-array: + result = algos.isin( + np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object) + ) + tm.assert_numpy_array_equal(np.array([True]), result) + + # as float64-array: + result = algos.isin( + np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64) + ) + tm.assert_numpy_array_equal(np.array([True]), result) + + def test_no_cast(self): + # GH 22160 + # ensure 42 is not casted to a string + comps = ["ss", 42] + values = ["42"] + expected = np.array([False, False]) + result = algos.isin(comps, values) + tm.assert_numpy_array_equal(expected, result) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_empty(self, empty): + # see gh-16991 + vals = Index(["a", "b"]) + expected = np.array([False, False]) + + result = algos.isin(vals, empty) + tm.assert_numpy_array_equal(expected, result) + + def test_different_nan_objects(self): + # GH 22119 + comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object) + vals = np.array([float("nan")], dtype=np.object) + expected = np.array([False, False, True]) + result = algos.isin(comps, vals) + tm.assert_numpy_array_equal(expected, result) + + def test_different_nans_as_float64(self): + # GH 21866 + # create different nans from bit-patterns, + # these nans will land in different buckets in the hash-table + # if no special care is taken + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + + # check that NAN1 and NAN2 are equivalent: + arr = np.array([NAN1, NAN2], dtype=np.float64) + lookup1 = np.array([NAN1], dtype=np.float64) + result = algos.isin(arr, lookup1) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + lookup2 = np.array([NAN2], dtype=np.float64) + result = algos.isin(arr, lookup2) + expected = np.array([True, True]) + 
tm.assert_numpy_array_equal(result, expected) + + +class TestValueCounts: + def test_value_counts(self): + np.random.seed(1234) + from pandas.core.reshape.tile import cut + + arr = np.random.randn(4) + factor = cut(arr, 4) + + # assert isinstance(factor, n) + result = algos.value_counts(factor) + breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] + index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + expected = Series([1, 1, 1, 1], index=index) + tm.assert_series_equal(result.sort_index(), expected.sort_index()) + + def test_value_counts_bins(self): + s = [1, 2, 3, 4] + result = algos.value_counts(s, bins=1) + expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) + tm.assert_series_equal(result, expected) + + result = algos.value_counts(s, bins=2, sort=False) + expected = Series( + [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + ) + tm.assert_series_equal(result, expected) + + def test_value_counts_dtypes(self): + result = algos.value_counts([1, 1.0]) + assert len(result) == 1 + + result = algos.value_counts([1, 1.0], bins=1) + assert len(result) == 1 + + result = algos.value_counts(Series([1, 1.0, "1"])) # object + assert len(result) == 2 + + msg = "bins argument only works with numeric data" + with pytest.raises(TypeError, match=msg): + algos.value_counts(["1", 1], bins=1) + + def test_value_counts_nat(self): + td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]") + dt = pd.to_datetime(["NaT", "2014-01-01"]) + + for s in [td, dt]: + vc = algos.value_counts(s) + vc_with_na = algos.value_counts(s, dropna=False) + assert len(vc) == 1 + assert len(vc_with_na) == 2 + + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) + tm.assert_series_equal(algos.value_counts(dt), exp_dt) + # TODO same for (timedelta) + + def test_value_counts_datetime_outofbounds(self): + # GH 13663 + s = Series( + [ + datetime(3000, 1, 1), + datetime(5000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + datetime(3000, 1, 1), + datetime(3000, 1, 1), + ] + ) + res = s.value_counts() + + exp_index = Index( + [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], + dtype=object, + ) + exp = Series([3, 2, 1], index=exp_index) + tm.assert_series_equal(res, exp) + + # GH 12424 + res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan], dtype=object) + tm.assert_series_equal(res, exp) + + def test_categorical(self): + s = Series(Categorical(list("aaabbc"))) + result = s.value_counts() + expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) + + tm.assert_series_equal(result, expected, check_index_type=True) + + # preserve order? 
+ s = s.cat.as_ordered() + result = s.value_counts() + expected.index = expected.index.as_ordered() + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_categorical_nans(self): + s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan) + s.iloc[1] = np.nan + result = s.value_counts() + expected = Series( + [4, 3, 2], + index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + ) + tm.assert_series_equal(result, expected, check_index_type=True) + result = s.value_counts(dropna=False) + expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) + tm.assert_series_equal(result, expected, check_index_type=True) + + # out of order + s = Series( + Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"]) + ) + s.iloc[1] = np.nan + result = s.value_counts() + expected = Series( + [4, 3, 2], + index=CategoricalIndex( + ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ), + ) + tm.assert_series_equal(result, expected, check_index_type=True) + + result = s.value_counts(dropna=False) + expected = Series( + [4, 3, 2, 1], + index=CategoricalIndex( + ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True + ), + ) + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_categorical_zeroes(self): + # keep the `d` category with 0 + s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True)) + result = s.value_counts() + expected = Series( + [3, 2, 1, 0], + index=Categorical( + ["b", "a", "c", "d"], categories=list("abcd"), ordered=True + ), + ) + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_dropna(self): + # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328 + + tm.assert_series_equal( + Series([True, True, False]).value_counts(dropna=True), + Series([2, 1], index=[True, False]), + ) + tm.assert_series_equal( + Series([True, True, False]).value_counts(dropna=False), + Series([2, 1], index=[True, False]), + ) + + tm.assert_series_equal( + Series([True, True, False, None]).value_counts(dropna=True), + Series([2, 1], index=[True, False]), + ) + tm.assert_series_equal( + Series([True, True, False, None]).value_counts(dropna=False), + Series([2, 1, 1], index=[True, False, np.nan]), + ) + tm.assert_series_equal( + Series([10.3, 5.0, 5.0]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) + tm.assert_series_equal( + Series([10.3, 5.0, 5.0]).value_counts(dropna=False), + Series([2, 1], index=[5.0, 10.3]), + ) + + tm.assert_series_equal( + Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) + + # 32-bit linux has a different ordering + if not compat.is_platform_32bit(): + result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) + tm.assert_series_equal(result, expected) + + def test_value_counts_normalized(self): + # GH12558 + s = Series([1, 2, np.nan, np.nan, np.nan]) + dtypes = (np.float64, np.object, "M8[ns]") + for t in dtypes: + s_typed = s.astype(t) + result = s_typed.value_counts(normalize=True, dropna=False) + expected = Series( + [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) + ) + tm.assert_series_equal(result, expected) + + result = s_typed.value_counts(normalize=True, dropna=True) + expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t)) + tm.assert_series_equal(result, expected) + + def test_value_counts_uint64(self): + arr = np.array([2 ** 63], dtype=np.uint64) 
+ expected = Series([1], index=[2 ** 63]) + result = algos.value_counts(arr) + + tm.assert_series_equal(result, expected) + + arr = np.array([-1, 2 ** 63], dtype=object) + expected = Series([1, 1], index=[-1, 2 ** 63]) + result = algos.value_counts(arr) + + # 32-bit linux has a different ordering + if not compat.is_platform_32bit(): + tm.assert_series_equal(result, expected) + + +class TestDuplicated: + def test_duplicated_with_nas(self): + keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) + + result = algos.duplicated(keys) + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep="first") + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep="last") + expected = np.array([True, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array([True, False, True, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + keys = np.empty(8, dtype=object) + for i, t in enumerate( + zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2) + ): + keys[i] = t + + result = algos.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = np.array(falses + trues) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep="last") + expected = np.array(trues + falses) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array(trues + trues) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "case", + [ + np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array( + [ + 1 + 1j, + 2 + 2j, + 1 + 1j, + 5 + 5j, + 3 + 3j, + 2 + 2j, + 4 + 4j, + 1 + 1j, + 5 + 5j, + 6 + 6j, + ] + ), + np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object), + np.array( + [1, 2 ** 63, 1, 3 ** 5, 10, 2 ** 63, 39, 1, 3 ** 5, 7], dtype=np.uint64 + ), + ], + ) + def test_numeric_object_likes(self, case): + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) + exp_false = exp_first | exp_last + + res_first = algos.duplicated(case, keep="first") + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep="last") + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [Index(case), Index(case, dtype="category")]: + res_first = idx.duplicated(keep="first") + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [Series(case), Series(case, dtype="category")]: + res_first = s.duplicated(keep="first") + tm.assert_series_equal(res_first, Series(exp_first)) + + res_last = s.duplicated(keep="last") + tm.assert_series_equal(res_last, Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, Series(exp_false)) + + def test_datetime_likes(self): + + dt = [ + "2011-01-01", + "2011-01-02", + 
"2011-01-01", + "NaT", + "2011-01-03", + "2011-01-02", + "2011-01-04", + "2011-01-01", + "NaT", + "2011-01-06", + ] + td = [ + "1 days", + "2 days", + "1 days", + "NaT", + "3 days", + "2 days", + "4 days", + "1 days", + "NaT", + "6 days", + ] + + cases = [ + np.array([Timestamp(d) for d in dt]), + np.array([Timestamp(d, tz="US/Eastern") for d in dt]), + np.array([pd.Period(d, freq="D") for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td]), + ] + + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) + exp_false = exp_first | exp_last + + for case in cases: + res_first = algos.duplicated(case, keep="first") + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep="last") + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [ + Index(case), + Index(case, dtype="category"), + Index(case, dtype=object), + ]: + res_first = idx.duplicated(keep="first") + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [ + Series(case), + Series(case, dtype="category"), + Series(case, dtype=object), + ]: + res_first = s.duplicated(keep="first") + tm.assert_series_equal(res_first, Series(exp_first)) + + res_last = s.duplicated(keep="last") + tm.assert_series_equal(res_last, Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, Series(exp_false)) + + def test_unique_index(self): + cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)] + for case in cases: + assert case.is_unique is True + tm.assert_numpy_array_equal( + case.duplicated(), np.array([False, False, False]) + ) + + @pytest.mark.parametrize( + "arr, unique", + [ + ( + [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], + [(0, 0), (0, 1), (1, 0), (1, 1)], + ), + ( + [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")], + [("b", "c"), ("a", "b")], + ), + ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]), + ], + ) + def test_unique_tuples(self, arr, unique): + # https://github.com/pandas-dev/pandas/issues/16519 + expected = np.empty(len(unique), dtype=object) + expected[:] = unique + + result = pd.unique(arr) + tm.assert_numpy_array_equal(result, expected) + + +class GroupVarTestMixin: + def test_group_var_generic_1d(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 1))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(15, 1).astype(self.dtype) + labels = np.tile(np.arange(5), (3,)).astype("int64") + + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] + expected_counts = counts + 3 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_1d_flat_labels(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((1, 1))).astype(self.dtype) + counts = np.zeros(1, dtype="int64") + values = 10 * prng.rand(5, 1).astype(self.dtype) + labels = np.zeros(5, dtype="int64") + + expected_out = 
np.array([[values.std(ddof=1) ** 2]]) + expected_counts = counts + 5 + + self.algo(out, counts, values, labels) + + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_all_finite(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_some_nan(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + values[:, 1] = np.nan + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + tm.assert_almost_equal(out, expected_out, check_less_precise=6) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_constant(self): + # Regression test from GH 10448. + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) + labels = np.zeros(3, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 3 + assert out[0, 0] >= 0 + tm.assert_almost_equal(out[0, 0], 0.0) + + +class TestGroupVarFloat64(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(libgroupby.group_var_float64) + dtype = np.float64 + rtol = 1e-5 + + def test_group_var_large_inputs(self): + + prng = RandomState(1234) + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) + values.shape = (10 ** 6, 1) + labels = np.zeros(10 ** 6, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 10 ** 6 + tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True) + + +class TestGroupVarFloat32(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(libgroupby.group_var_float32) + dtype = np.float32 + rtol = 1e-2 + + +class TestHashTable: + def test_string_hashtable_set_item_signature(self): + # GH#30419 fix typing in StringHashTable.set_item to prevent segfault + tbl = ht.StringHashTable() + + tbl.set_item("key", 1) + assert tbl.get_item("key") == 1 + + with pytest.raises(TypeError, match="'key' has incorrect type"): + # key arg typed as string, not object + tbl.set_item(4, 6) + with pytest.raises(TypeError, match="'val' has incorrect type"): + tbl.get_item(4) + + def test_lookup_nan(self, writable): + xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) + # GH 21688 ensure we can deal with readonly memory views + xs.setflags(write=writable) + m = ht.Float64HashTable() + m.map_locations(xs) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + + def test_add_signed_zeros(self): + # GH 21866 inconsistent hash-function for float64 + # default hash-function would lead to different hash-buckets + # for 0.0 and -0.0 if there are more than 2^30 hash-buckets + # but 
this would mean 16GB + N = 4 # 12 * 10**8 would trigger the error, if you have enough memory + m = ht.Float64HashTable(N) + m.set_item(0.0, 0) + m.set_item(-0.0, 0) + assert len(m) == 1 # 0.0 and -0.0 are equivalent + + def test_add_different_nans(self): + # GH 21866 inconsistent hash-function for float64 + # create different nans from bit-patterns: + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + # default hash function would lead to different hash-buckets + # for NAN1 and NAN2 even if there are only 4 buckets: + m = ht.Float64HashTable() + m.set_item(NAN1, 0) + m.set_item(NAN2, 0) + assert len(m) == 1 # NAN1 and NAN2 are equivalent + + def test_lookup_overflow(self, writable): + xs = np.array([1, 2, 2 ** 63], dtype=np.uint64) + # GH 21688 ensure we can deal with readonly memory views + xs.setflags(write=writable) + m = ht.UInt64HashTable() + m.map_locations(xs) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + + def test_get_unique(self): + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) + tm.assert_numpy_array_equal(s.unique(), exp) + + @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize( + "htable, uniques, dtype, safely_resizes", + [ + (ht.PyObjectHashTable, ht.ObjectVector, "object", False), + (ht.StringHashTable, ht.ObjectVector, "object", True), + (ht.Float64HashTable, ht.Float64Vector, "float64", False), + (ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), + ], + ) + def test_vector_resize( + self, writable, htable, uniques, dtype, safely_resizes, nvals + ): + # Test for memory errors after internal vector + # reallocations (GH 7157) + vals = np.array(np.random.randn(1000), dtype=dtype) + + # GH 21688 ensures we can deal with read-only memory views + vals.setflags(write=writable) + + # initialise instances; cannot initialise in parametrization, + # as otherwise external views would be held on the array (which is + # one of the things this test is checking) + htable = htable() + uniques = uniques() + + # get_labels may append to uniques + htable.get_labels(vals[:nvals], uniques, 0, -1) + # to_array() sets an external_view_exists flag on uniques. 
+ tmp = uniques.to_array() + oldshape = tmp.shape + + # subsequent get_labels() calls can no longer append to it + # (except for StringHashTables + ObjectVector) + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with pytest.raises(ValueError, match="external reference.*"): + htable.get_labels(vals, uniques, 0, -1) + + uniques.to_array() # should not raise here + assert tmp.shape == oldshape + + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) + def test_hashtable_unique(self, htable, tm_dtype, writable): + # output of maker has guaranteed unique elements + maker = getattr(tm, "make" + tm_dtype + "Index") + s = Series(maker(1000)) + if htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) + s_duplicated.values.setflags(write=writable) + + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.unique() + expected_unique = s_duplicated.drop_duplicates(keep="first").values + result_unique = htable().unique(s_duplicated.values) + tm.assert_numpy_array_equal(result_unique, expected_unique) + + # test return_inverse=True + # reconstruction can only succeed if the inverse is correct + result_unique, result_inverse = htable().unique( + s_duplicated.values, return_inverse=True + ) + tm.assert_numpy_array_equal(result_unique, expected_unique) + reconstr = result_unique[result_inverse] + tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) + def test_hashtable_factorize(self, htable, tm_dtype, writable): + # output of maker has guaranteed unique elements + maker = getattr(tm, "make" + tm_dtype + "Index") + s = Series(maker(1000)) + if htable == ht.Float64HashTable: + # add NaN for float column + s.loc[500] = np.nan + elif htable == ht.PyObjectHashTable: + # use different NaN types for object column + s.loc[500:502] = [np.nan, None, pd.NaT] + + # create duplicated selection + s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True) + s_duplicated.values.setflags(write=writable) + na_mask = s_duplicated.isna().values + + result_unique, result_inverse = htable().factorize(s_duplicated.values) + + # drop_duplicates has own cython code (hash_table_func_helper.pxi) + # and is tested separately; keeps first occurrence like ht.factorize() + # since factorize removes all NaNs, we do the same here + expected_unique = s_duplicated.dropna().drop_duplicates().values + tm.assert_numpy_array_equal(result_unique, expected_unique) + + # reconstruction can only succeed if the inverse is correct. 
Since + # factorize removes the NaNs, those have to be excluded here as well + result_reconstruct = result_unique[result_inverse[~na_mask]] + expected_reconstruct = s_duplicated.dropna().values + tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) + + @pytest.mark.parametrize( + "hashtable", + [ + ht.PyObjectHashTable, + ht.StringHashTable, + ht.Float64HashTable, + ht.Int64HashTable, + ht.UInt64HashTable, + ], + ) + def test_hashtable_large_sizehint(self, hashtable): + # GH 22729 + size_hint = np.iinfo(np.uint32).max + 1 + tbl = hashtable(size_hint=size_hint) # noqa + + +def test_quantile(): + s = Series(np.random.randn(100)) + + result = algos.quantile(s, [0, 0.25, 0.5, 0.75, 1.0]) + expected = algos.quantile(s.values, [0, 0.25, 0.5, 0.75, 1.0]) + tm.assert_almost_equal(result, expected) + + +def test_unique_label_indices(): + + a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") + + left = ht.unique_label_indices(a) + right = np.unique(a, return_index=True)[1] + + tm.assert_numpy_array_equal(left, right, check_dtype=False) + + a[np.random.choice(len(a), 10)] = -1 + left = ht.unique_label_indices(a) + right = np.unique(a, return_index=True)[1][1:] + tm.assert_numpy_array_equal(left, right, check_dtype=False) + + +class TestRank: + @td.skip_if_no_scipy + def test_scipy_compat(self): + from scipy.stats import rankdata + + def _check(arr): + mask = ~np.isfinite(arr) + arr = arr.copy() + result = libalgos.rank_1d(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = np.nan + tm.assert_almost_equal(result, exp) + + _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan])) + _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan])) + + def test_basic(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in np.typecodes["AllInteger"]: + s = Series([1, 100], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_uint64_overflow(self): + exp = np.array([1, 2], dtype=np.float64) + + for dtype in [np.float64, np.uint64]: + s = Series([1, 2 ** 63], dtype=dtype) + tm.assert_numpy_array_equal(algos.rank(s), exp) + + def test_too_many_ndims(self): + arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]) + msg = "Array with ndim > 2 are not supported" + + with pytest.raises(TypeError, match=msg): + algos.rank(arr) + + @pytest.mark.single + @pytest.mark.high_memory + @pytest.mark.parametrize( + "values", + [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)], + ids=["1d", "2d"], + ) + def test_pct_max_many_rows(self, values): + # GH 18271 + result = algos.rank(values, pct=True).max() + assert result == 1 + + +def test_pad_backfill_object_segfault(): + + old = np.array([], dtype="O") + new = np.array([datetime(2010, 12, 31)], dtype="O") + + result = libalgos.pad["object"](old, new) + expected = np.array([-1], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.pad["object"](new, old) + expected = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.backfill["object"](old, new) + expected = np.array([-1], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.backfill["object"](new, old) + expected = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + +class TestTseriesUtil: + def test_combineFunc(self): + pass + + def test_reindex(self): + pass + + def test_isna(self): + pass + + def test_groupby(self): + pass + + def test_groupby_withnull(self): + pass + + 
def test_backfill(self): + old = Index([1, 5, 10]) + new = Index(list(range(12))) + + filler = libalgos.backfill["int64_t"](old.values, new.values) + + expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.int64) + tm.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([1, 4]) + new = Index(list(range(5, 10))) + filler = libalgos.backfill["int64_t"](old.values, new.values) + + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(filler, expect_filler) + + def test_pad(self): + old = Index([1, 5, 10]) + new = Index(list(range(12))) + + filler = libalgos.pad["int64_t"](old.values, new.values) + + expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.int64) + tm.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([5, 10]) + new = Index(np.arange(5)) + filler = libalgos.pad["int64_t"](old.values, new.values) + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(filler, expect_filler) + + +def test_is_lexsorted(): + failure = [ + np.array( + [ + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype="int64", + ), + np.array( + [ + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + ], + dtype="int64", + ), + ] + + assert not libalgos.is_lexsorted(failure) + + +def test_groupsort_indexer(): + a = np.random.randint(0, 1000, 100).astype(np.int64) + b = np.random.randint(0, 1000, 100).astype(np.int64) + + result = libalgos.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + # np.argsort returns int, groupsort_indexer + # always returns int64 + expected = np.argsort(a, kind="mergesort") + expected = expected.astype(np.int64) + + tm.assert_numpy_array_equal(result, expected) + + # compare with lexsort + # np.lexsort returns int, groupsort_indexer + # always returns int64 + key = a * 1000 + b + result = libalgos.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + expected = expected.astype(np.int64) + + tm.assert_numpy_array_equal(result, expected) + + +def test_infinity_sort(): + # GH 13445 + # numpy's argsort can be unhappy if something is less than + # itself. Instead, let's give our infinities a self-consistent + # ordering, but outside the float extended real line. 
+ + Inf = libalgos.Infinity() + NegInf = libalgos.NegInfinity() + + ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf] + + assert all(Inf >= x for x in ref_nums) + assert all(Inf > x or x is Inf for x in ref_nums) + assert Inf >= Inf and Inf == Inf + assert not Inf < Inf and not Inf > Inf + assert libalgos.Infinity() == libalgos.Infinity() + assert not libalgos.Infinity() != libalgos.Infinity() + + assert all(NegInf <= x for x in ref_nums) + assert all(NegInf < x or x is NegInf for x in ref_nums) + assert NegInf <= NegInf and NegInf == NegInf + assert not NegInf < NegInf and not NegInf > NegInf + assert libalgos.NegInfinity() == libalgos.NegInfinity() + assert not libalgos.NegInfinity() != libalgos.NegInfinity() + + for perm in permutations(ref_nums): + assert sorted(perm) == ref_nums + + # smoke tests + np.array([libalgos.Infinity()] * 32).argsort() + np.array([libalgos.NegInfinity()] * 32).argsort() + + +def test_infinity_against_nan(): + Inf = libalgos.Infinity() + NegInf = libalgos.NegInfinity() + + assert not Inf > np.nan + assert not Inf >= np.nan + assert not Inf < np.nan + assert not Inf <= np.nan + assert not Inf == np.nan + assert Inf != np.nan + + assert not NegInf > np.nan + assert not NegInf >= np.nan + assert not NegInf < np.nan + assert not NegInf <= np.nan + assert not NegInf == np.nan + assert NegInf != np.nan + + +def test_ensure_platform_int(): + arr = np.arange(100, dtype=np.intp) + + result = libalgos.ensure_platform_int(arr) + assert result is arr + + +def test_int64_add_overflow(): + # see gh-14068 + msg = "Overflow in int64 addition" + m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min + + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr(np.array([m, m]), m) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr(np.array([n, n]), n) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) + ) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) + ) + with pytest.raises(OverflowError, match=msg): + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True]), + ) + with pytest.raises(OverflowError, match=msg): + with tm.assert_produces_warning(RuntimeWarning): + algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) + + # Check that the nan boolean arrays override whether or not + # the addition overflows. We don't check the result but just + # the fact that an OverflowError is not raised. 
+ algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True]), + ) + + +class TestMode: + def test_no_mode(self): + exp = Series([], dtype=np.float64) + tm.assert_series_equal(algos.mode([]), exp) + + def test_mode_single(self): + # GH 15714 + exp_single = [1] + data_single = [1] + + exp_multi = [1] + data_multi = [1, 1] + + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: + s = Series(data_single, dtype=dt) + exp = Series(exp_single, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + s = Series(data_multi, dtype=dt) + exp = Series(exp_multi, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series([1], dtype=np.int) + tm.assert_series_equal(algos.mode([1]), exp) + + exp = Series(["a", "b", "c"], dtype=np.object) + tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp) + + def test_number_mode(self): + exp_single = [1] + data_single = [1] * 5 + [2] * 3 + + exp_multi = [1, 3] + data_multi = [1] * 5 + [2] * 3 + [3] * 5 + + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: + s = Series(data_single, dtype=dt) + exp = Series(exp_single, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + s = Series(data_multi, dtype=dt) + exp = Series(exp_multi, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + def test_strobj_mode(self): + exp = ["b"] + data = ["a"] * 2 + ["b"] * 3 + + s = Series(data, dtype="c") + exp = Series(exp, dtype="c") + tm.assert_series_equal(algos.mode(s), exp) + + exp = ["bar"] + data = ["foo"] * 2 + ["bar"] * 3 + + for dt in [str, object]: + s = Series(data, dtype=dt) + exp = Series(exp, dtype=dt) + tm.assert_series_equal(algos.mode(s), exp) + + def test_datelike_mode(self): + exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]") + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"], + dtype="M8[ns]", + ) + tm.assert_series_equal(algos.mode(s), exp) + + def test_timedelta_mode(self): + exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]") + s = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]") + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + s = Series( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) + tm.assert_series_equal(algos.mode(s), exp) + + def test_mixed_dtype(self): + exp = Series(["foo"]) + s = Series([1, "foo", "foo"]) + tm.assert_series_equal(algos.mode(s), exp) + + def test_uint64_overflow(self): + exp = Series([2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) + tm.assert_series_equal(algos.mode(s), exp) + + exp = Series([1, 2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) + tm.assert_series_equal(algos.mode(s), exp) + + def test_categorical(self): + c = Categorical([1, 2]) + exp = c + tm.assert_categorical_equal(algos.mode(c), exp) + tm.assert_categorical_equal(c.mode(), exp) + + c = Categorical([1, "a", "a"]) + exp = Categorical(["a"], categories=[1, "a"]) + tm.assert_categorical_equal(algos.mode(c), exp) 
+ tm.assert_categorical_equal(c.mode(), exp) + + c = Categorical([1, 1, 2, 3, 3]) + exp = Categorical([1, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(algos.mode(c), exp) + tm.assert_categorical_equal(c.mode(), exp) + + def test_index(self): + idx = Index([1, 2, 3]) + exp = Series([1, 2, 3], dtype=np.int64) + tm.assert_series_equal(algos.mode(idx), exp) + + idx = Index([1, "a", "a"]) + exp = Series(["a"], dtype=object) + tm.assert_series_equal(algos.mode(idx), exp) + + idx = Index([1, 1, 2, 3, 3]) + exp = Series([1, 3], dtype=np.int64) + tm.assert_series_equal(algos.mode(idx), exp) + + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + idx = Index( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) + tm.assert_series_equal(algos.mode(idx), exp) diff --git a/venv/Lib/site-packages/pandas/tests/test_common.py b/venv/Lib/site-packages/pandas/tests/test_common.py new file mode 100644 index 0000000..a8a0fce --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_common.py @@ -0,0 +1,131 @@ +import collections +from distutils.version import LooseVersion +from functools import partial +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timestamp +from pandas.core import ops +import pandas.core.common as com + + +def test_get_callable_name(): + getname = com.get_callable_name + + def fn(x): + return x + + lambda_ = lambda x: x # noqa: E731 + part1 = partial(fn) + part2 = partial(part1) + + class somecall: + def __call__(self): + return x # noqa + + assert getname(fn) == "fn" + assert getname(lambda_) + assert getname(part1) == "fn" + assert getname(part2) == "fn" + assert getname(somecall()) == "somecall" + assert getname(1) is None + + +def test_any_none(): + assert com.any_none(1, 2, 3, None) + assert not com.any_none(1, 2, 3, 4) + + +def test_all_not_none(): + assert com.all_not_none(1, 2, 3, 4) + assert not com.all_not_none(1, 2, 3, None) + assert not com.all_not_none(None, None, None, None) + + +def test_random_state(): + import numpy.random as npr + + # Check with seed + state = com.random_state(5) + assert state.uniform() == npr.RandomState(5).uniform() + + # Check with random state object + state2 = npr.RandomState(10) + assert com.random_state(state2).uniform() == npr.RandomState(10).uniform() + + # check with no arg random state + assert com.random_state() is np.random + + # Error for floats or strings + with pytest.raises(ValueError): + com.random_state("test") + + with pytest.raises(ValueError): + com.random_state(5.5) + + +@pytest.mark.parametrize( + "left, right, expected", + [ + (Series([1], name="x"), Series([2], name="x"), "x"), + (Series([1], name="x"), Series([2], name="y"), None), + (Series([1]), Series([2], name="x"), None), + (Series([1], name="x"), Series([2]), None), + (Series([1], name="x"), [2], "x"), + ([1], Series([2], name="y"), "y"), + ], +) +def test_maybe_match_name(left, right, expected): + assert ops._maybe_match_name(left, right) == expected + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert com.dict_compat(data_datetime64) == expected + assert com.dict_compat(expected) == expected + assert com.dict_compat(data_unchanged) == data_unchanged + + +def test_standardize_mapping(): + # No uninitialized defaultdicts + with pytest.raises(TypeError): + 
com.standardize_mapping(collections.defaultdict) + + # No non-mapping subtypes, instance + with pytest.raises(TypeError): + com.standardize_mapping([]) + + # No non-mapping subtypes, class + with pytest.raises(TypeError): + com.standardize_mapping(list) + + fill = {"bad": "data"} + assert com.standardize_mapping(fill) == dict + + # Convert instance to type + assert com.standardize_mapping({}) == dict + + dd = collections.defaultdict(list) + assert isinstance(com.standardize_mapping(dd), partial) + + +def test_git_version(): + # GH 21295 + git_version = pd.__git_version__ + assert len(git_version) == 40 + assert all(c in string.hexdigits for c in git_version) + + +def test_version_tag(): + version = pd.__version__ + try: + version > LooseVersion("0.0.1") + except TypeError: + raise ValueError( + "No git tags exist, please sync tags between upstream and your repo" + ) diff --git a/venv/Lib/site-packages/pandas/tests/test_compat.py b/venv/Lib/site-packages/pandas/tests/test_compat.py new file mode 100644 index 0000000..4ff8b0b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_compat.py @@ -0,0 +1,3 @@ +""" +Testing that functions from compat work as expected +""" diff --git a/venv/Lib/site-packages/pandas/tests/test_downstream.py b/venv/Lib/site-packages/pandas/tests/test_downstream.py new file mode 100644 index 0000000..ee00623 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_downstream.py @@ -0,0 +1,164 @@ +""" +Testing that we work in the downstream packages +""" +import importlib +import subprocess +import sys + +import numpy as np # noqa +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +def import_module(name): + # we *only* want to skip if the module is truly not available + # and NOT just an actual import error because of pandas changes + + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) + + +@pytest.fixture +def df(): + return DataFrame({"A": [1, 2, 3]}) + + +def test_dask(df): + + toolz = import_module("toolz") # noqa + dask = import_module("dask") # noqa + + import dask.dataframe as dd + + ddf = dd.from_pandas(df, npartitions=3) + assert ddf.A is not None + assert ddf.compute() is not None + + +@pytest.mark.filterwarnings("ignore:Panel class is removed") +def test_xarray(df): + + xarray = import_module("xarray") # noqa + + assert df.to_xarray() is not None + + +def test_oo_optimizable(): + # GH 21071 + subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) + + +@tm.network +# Cython import warning +@pytest.mark.filterwarnings("ignore:can't:ImportWarning") +@pytest.mark.filterwarnings( + # patsy needs to update their imports + "ignore:Using or importing the ABCs from 'collections:DeprecationWarning" +) +def test_statsmodels(): + + statsmodels = import_module("statsmodels") # noqa + import statsmodels.api as sm + import statsmodels.formula.api as smf + + df = sm.datasets.get_rdataset("Guerry", "HistData").data + smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit() + + +# Cython import warning +@pytest.mark.filterwarnings("ignore:can't:ImportWarning") +def test_scikit_learn(df): + + sklearn = import_module("sklearn") # noqa + from sklearn import svm, datasets + + digits = datasets.load_digits() + clf = svm.SVC(gamma=0.001, C=100.0) + clf.fit(digits.data[:-1], digits.target[:-1]) + clf.predict(digits.data[-1:]) + + +# Cython import warning and traitlets +@tm.network +@pytest.mark.filterwarnings("ignore") +def 
test_seaborn(): + + seaborn = import_module("seaborn") + tips = seaborn.load_dataset("tips") + seaborn.stripplot(x="day", y="total_bill", data=tips) + + +def test_pandas_gbq(df): + + pandas_gbq = import_module("pandas_gbq") # noqa + + +@pytest.mark.xfail(reason="0.7.0 pending") +@tm.network +def test_pandas_datareader(): + + pandas_datareader = import_module("pandas_datareader") # noqa + pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") + + +# importing from pandas, Cython import warning +@pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") +def test_geopandas(): + + geopandas = import_module("geopandas") # noqa + fp = geopandas.datasets.get_path("naturalearth_lowres") + assert geopandas.read_file(fp) is not None + + +def test_geopandas_coordinate_indexer(): + # this test is included to have coverage of one case in the indexing.py + # code that is only kept for compatibility with geopandas, see + # https://github.com/pandas-dev/pandas/issues/27258 + # We should be able to remove this after some time when its usage is + # removed in geopandas + from pandas.core.indexing import _NDFrameIndexer + + class _CoordinateIndexer(_NDFrameIndexer): + def _getitem_tuple(self, tup): + obj = self.obj + xs, ys = tup + return obj[xs][ys] + + Series._create_indexer("cx", _CoordinateIndexer) + s = Series(range(5)) + res = s.cx[:, :] + tm.assert_series_equal(s, res) + + +# Cython import warning +@pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") +@pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning") +def test_pyarrow(df): + + pyarrow = import_module("pyarrow") # noqa + table = pyarrow.Table.from_pandas(df) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + +@pytest.mark.xfail(reason="pandas-wheels-50", strict=False) +def test_missing_required_dependency(): + # GH 23868 + # To ensure proper isolation, we pass these flags + # -S : disable site-packages + # -s : disable user site-packages + # -E : disable PYTHON* env vars, especially PYTHONPATH + # And, that's apparently not enough, so we give up. 
Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = box(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = box(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") + + def test_unicode_repr_issues(self): + levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] + codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, codes=codes) + + repr(index.levels) + + # NumPy bug + # repr(index.get_level_values(1)) + + def test_unicode_repr_level_names(self): + index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) + + s = Series(range(2), index=index) + df = DataFrame(np.random.randn(2, 4), index=index) + repr(s) + repr(df) + + def test_join_segfault(self): + # 1532 + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) + # it works! + for how in ["left", "right", "outer"]: + df1.join(df2, how=how) + + def test_frame_dict_constructor_empty_series(self): + s1 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) + ) + s2 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) + ) + s3 = Series(dtype=object) + + # it works! 
+ DataFrame({"foo": s1, "bar": s2, "baz": s3}) + DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) + + @pytest.mark.parametrize("d", [4, "d"]) + def test_empty_frame_groupby_dtypes_consistency(self, d): + # GH 20888 + group_keys = ["a", "b", "c"] + df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]}) + + g = df[df.a == 2].groupby(group_keys) + result = g.first().index + expected = MultiIndex( + levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"] + ) + + tm.assert_index_equal(result, expected) + + def test_multiindex_na_repr(self): + # only an issue with long columns + df3 = DataFrame( + { + "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, + "B" * 30: {("A", "A0006000", "nuit"): np.nan}, + "C" * 30: {("A", "A0006000", "nuit"): np.nan}, + "D" * 30: {("A", "A0006000", "nuit"): np.nan}, + "E" * 30: {("A", "A0006000", "nuit"): "A"}, + "F" * 30: {("A", "A0006000", "nuit"): np.nan}, + } + ) + + idf = df3.set_index(["A" * 30, "C" * 30]) + repr(idf) + + def test_assign_index_sequences(self): + # #2200 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) + index = list(df.index) + index[0] = ("faz", "boo") + df.index = index + repr(df) + + # this travels an improper code path + index[0] = ["faz", "boo"] + df.index = index + repr(df) + + def test_tuples_have_na(self): + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) + + assert isna(index[4][0]) + assert isna(index.values[4][0]) + + def test_duplicate_groupby_issues(self): + idx_tp = [ + ("600809", "20061231"), + ("600809", "20070331"), + ("600809", "20070630"), + ("600809", "20070331"), + ] + dt = ["demo", "demo", "demo", "demo"] + + idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"]) + s = Series(dt, index=idx) + + result = s.groupby(s.index).first() + assert len(result) == 3 + + def test_duplicate_mi(self): + # GH 4516 + df = DataFrame( + [ + ["foo", "bar", 1.0, 1], + ["foo", "bar", 2.0, 2], + ["bah", "bam", 3.0, 3], + ["bah", "bam", 4.0, 4], + ["foo", "bar", 5.0, 5], + ["bah", "bam", 6.0, 6], + ], + columns=list("ABCD"), + ) + df = df.set_index(["A", "B"]) + df = df.sort_index(level=0) + expected = DataFrame( + [["foo", "bar", 1.0, 1], ["foo", "bar", 2.0, 2], ["foo", "bar", 5.0, 5]], + columns=list("ABCD"), + ).set_index(["A", "B"]) + result = df.loc[("foo", "bar")] + tm.assert_frame_equal(result, expected) + + def test_duplicated_drop_duplicates(self): + # GH 4060 + idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2])) + + expected = np.array([False, False, False, True, False, False], dtype=bool) + duplicated = idx.duplicated() + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(), expected) + + expected = np.array([True, False, False, False, False, False]) + duplicated = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) + + expected = np.array([True, False, False, True, False, False]) + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) + 
tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) + + def test_multiindex_set_index(self): + # segfault in #3308 + d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} + df = DataFrame(d) + tuples = [(0, 1), (0, 2), (1, 2)] + df["tuples"] = tuples + + index = MultiIndex.from_tuples(df["tuples"]) + # it works! + df.set_index(index) + + def test_datetimeindex(self): + idx1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, + tz="Asia/Tokyo", + ) + idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") + idx = MultiIndex.from_arrays([idx1, idx2]) + + expected1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" + ) + + tm.assert_index_equal(idx.levels[0], expected1) + tm.assert_index_equal(idx.levels[1], idx2) + + # from datetime combos + # GH 7888 + date1 = datetime.date.today() + date2 = datetime.datetime.today() + date3 = Timestamp.today() + + for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): + index = MultiIndex.from_product([[d1], [d2]]) + assert isinstance(index.levels[0], pd.DatetimeIndex) + assert isinstance(index.levels[1], pd.DatetimeIndex) + + def test_constructor_with_tz(self): + + index = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ) + columns = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ) + + result = MultiIndex.from_arrays([index, columns]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + result = MultiIndex.from_arrays([Series(index), Series(columns)]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + def test_set_index_datetime(self): + # GH 3950 + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + "value": range(6), + } + ) + df.index = pd.to_datetime(df.pop("datetime"), utc=True) + df.index = df.index.tz_convert("US/Pacific") + + expected = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", + ) + expected = expected.tz_localize("UTC").tz_convert("US/Pacific") + + df = df.set_index("label", append=True) + tm.assert_index_equal(df.index.levels[0], expected) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) + assert df.index.names == ["datetime", "label"] + + df = df.swaplevel(0, 1) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) + tm.assert_index_equal(df.index.levels[1], expected) + assert df.index.names == ["label", "datetime"] + + df = DataFrame(np.random.random(6)) + idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Eastern", + ) + idx2 = pd.DatetimeIndex( + [ + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + ], + tz="US/Eastern", + ) + idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.DatetimeIndex( + 
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Eastern", + ) + expected2 = pd.DatetimeIndex( + ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" + ) + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + # GH 7092 + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + + def test_reset_index_datetime(self): + # GH 3950 + for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]: + idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx = MultiIndex.from_arrays([idx1, idx2]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + + tm.assert_frame_equal(df.reset_index(), expected) + + idx3 = pd.date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) + idx = MultiIndex.from_arrays([idx1, idx2, idx3]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "idx3": [ + datetime.datetime(2012, 1, 1), + datetime.datetime(2012, 2, 1), + datetime.datetime(2012, 3, 1), + datetime.datetime(2012, 4, 1), + datetime.datetime(2012, 5, 1), + ], + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + expected["idx3"] = expected["idx3"].apply( + lambda d: Timestamp(d, tz="Europe/Paris") + ) + tm.assert_frame_equal(df.reset_index(), expected) + + # GH 7793 + idx = MultiIndex.from_product( + [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)] + ) + df = DataFrame( + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": [ + datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 2), + datetime.datetime(2013, 1, 3), + ] + * 2, + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + expected["level_1"] = expected["level_1"].apply( + lambda d: Timestamp(d, freq="D", tz=tz) + ) + tm.assert_frame_equal(df.reset_index(), expected) + + def test_reset_index_period(self): + # GH 7746 + idx = MultiIndex.from_product( + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + + [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", 
"c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + columns=["month", "feature", "a"], + ) + tm.assert_frame_equal(df.reset_index(), expected) + + def test_reset_index_multiindex_columns(self): + levels = [["A", ""], ["B", "b"]] + df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + result = df[["B"]].rename_axis("A").reset_index() + tm.assert_frame_equal(result, df) + + # gh-16120: already existing column + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.rename_axis("A").reset_index() + + # gh-16164: multiindex (tuple) full key + result = df.set_index([("A", "")]).reset_index() + tm.assert_frame_equal(result, df) + + # with additional (unnamed) index level + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) + ) + expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) + result = df.set_index([("B", "b")], append=True).reset_index() + tm.assert_frame_equal(result, expected) + + # with index name which is a too long tuple... + msg = "Item must have length equal to number of levels." + with pytest.raises(ValueError, match=msg): + df.rename_axis([("C", "c", "i")]).reset_index() + + # or too short... + levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) + expected = pd.concat([idx_col, df2], axis=1) + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") + tm.assert_frame_equal(result, expected) + + # ... which is incompatible with col_fill=None + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) + + # with col_level != 0 + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") + tm.assert_frame_equal(result, expected) + + def test_set_index_period(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = pd.period_range("2011-01-01", periods=3, freq="M") + idx1 = idx1.append(idx1) + idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") + idx2 = idx2.append(idx2).append(idx2) + idx3 = pd.period_range("2005", periods=6, freq="A") + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.period_range("2011-01-01", periods=3, freq="M") + expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + + def test_repeat(self): + # GH 9361 + # fixed by # GH 7891 + m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) + data = ["a", "b", "c", "d"] + m_df = Series(data, index=m_idx) + assert m_df.repeat(3).shape == (3 * len(data),) + + def test_subsets_multiindex_dtype(self): + # GH 20757 + data = [["x", 1]] + columns = [("a", "b", np.nan), ("a", "c", 0.0)] + df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns)) + expected = df.dtypes.a.b + result = df.a.b.dtypes + tm.assert_series_equal(result, expected) + + +class TestSorted(Base): + """ everything you wanted to test about sorting """ + + def 
test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + assert result.index.names == self.frame.index.names + + def test_sorting_repr_8017(self): + + np.random.seed(0) + data = np.random.randn(3, 4) + + for gen, extra in [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ]: + columns = MultiIndex.from_tuples([("red", i) for i in gen]) + df = DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. only 1 header of read + assert str(df2).splitlines()[0].split() == ["red"] + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:, [0, 2, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:, [0, 2, 1, 4, 3]] + tm.assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[("red", extra)] = "world" + + result = result.sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level(self): + df = self.frame.copy() + df.index = np.arange(len(df)) + + # axis=1 + + # series + a_sorted = self.frame["A"].sort_index(level=0) + + # preserve names + assert a_sorted.index.names == self.frame.index.names + + # inplace + rs = self.frame.copy() + rs.sort_index(level=0, inplace=True) + tm.assert_frame_equal(rs, self.frame.sort_index(level=0)) + + def test_sort_index_level_large_cardinality(self): + + # #2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + + # it works! + result = df.sort_index(level=0) + assert result.index.lexsort_depth == 3 + + # #2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + + # it works! 
+ result = df.sort_index(level=0) + assert (result.dtypes.values == df.dtypes.values).all() + assert result.index.lexsort_depth == 3 + + def test_sort_index_level_by_name(self): + self.frame.index.names = ["first", "second"] + result = self.frame.sort_index(level="second") + expected = self.frame.sort_index(level=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level_mixed(self): + sorted_before = self.frame.sort_index(level=1) + + df = self.frame.copy() + df["foo"] = "bar" + sorted_after = df.sort_index(level=1) + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) + + dft = self.frame.T + sorted_before = dft.sort_index(level=1, axis=1) + dft["foo", "three"] = "bar" + + sorted_after = dft.sort_index(level=1, axis=1) + tm.assert_frame_equal( + sorted_before.drop([("foo", "three")], axis=1), + sorted_after.drop([("foo", "three")], axis=1), + ) + + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + ) + assert index.is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] + ) + assert not index.is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] + ) + assert not index.is_lexsorted() + assert index.lexsort_depth == 0 + + def test_raise_invalid_sortorder(self): + # Test that the MultiIndex constructor raise when a incorrect sortorder is given + # Issue #28518 + + levels = [[0, 1], [0, 1, 2]] + + # Correct sortorder + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + + with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): + MultiIndex( + levels=levels, + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], + sortorder=2, + ) + + with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): + MultiIndex( + levels=levels, + codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], + sortorder=1, + ) + + def test_lexsort_depth(self): + # Test that lexsort_depth return the correct sortorder + # when it was given to the MultiIndex const. 
+ # Issue #28518 + + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + assert index.lexsort_depth == 2 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1 + ) + assert index.lexsort_depth == 1 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0 + ) + assert index.lexsort_depth == 0 + + def test_sort_index_and_reconstruction(self): + + # 15622 + # lexsortedness should be identical + # across MultiIndex construction methods + + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) + expected = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples( + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] + ), + ) + assert expected.index.is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), + ) + result = result.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex( + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = result.sort_index() + assert result.index.is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # 14015 + df = DataFrame( + [[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], + names=["l1", "Date"], + ), + ) + + df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + ) + assert not df.columns.is_lexsorted() + assert not df.columns.is_monotonic + result = df.sort_index(axis=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + result = df.sort_index(axis=1, level=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + + def test_sort_index_and_reconstruction_doc_example(self): + # doc example + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex( + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + assert df.index.is_lexsorted() + assert not df.index.is_monotonic + + # sort it + expected = DataFrame( + {"value": [2, 1, 4, 3]}, + index=MultiIndex( + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = df.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # reconstruct + result = df.sort_index().copy() + result.index = result.index._sort_levels_monotonic() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + def test_sort_index_non_existent_label_multiindex(self): + # GH 12261 + df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []])) + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic + assert result is True + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["red", "blu"]], + names=["letter", "size", "color"], + ), + columns=["near", "far"], + ) + df = df.sort_index() + + def 
my_func(group): + group.index = ["newz", "newa"] + return group + + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() + expected = MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["newa", "newz"]], + names=["letter", "size", None], + ) + + tm.assert_index_equal(result.index, expected) + + def test_sort_non_lexsorted(self): + # degenerate case where we sort but don't + # have a satisfying result :< + # GH 15797 + idx = MultiIndex( + [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]] + ) + + df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64") + assert df.index.is_lexsorted() is False + assert df.index.is_monotonic is False + + sorted = df.sort_index() + assert sorted.index.is_lexsorted() is True + assert sorted.index.is_monotonic is True + + expected = DataFrame( + {"col": [1, 4, 5, 2]}, + index=MultiIndex.from_tuples( + [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")] + ), + dtype="int64", + ) + result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] + tm.assert_frame_equal(result, expected) + + def test_sort_index_nan(self): + # GH 14784 + # incorrect sorting w.r.t. nans + tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] + mi = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) + s = Series(np.arange(4), index=mi) + + df2 = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + np.nan, + 280, + 259, + np.nan, + 623, + 90, + 312, + np.nan, + 301, + 359, + 801, + ], + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], + } + ).set_index(["date", "user_id"]) + + # sorting frame, default nan position is last + result = df.sort_index() + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position last + result = df.sort_index(na_position="last") + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position first + result = df.sort_index(na_position="first") + expected = df.iloc[[1, 2, 3, 0], :] + tm.assert_frame_equal(result, expected) + + # sorting frame with removed rows + result = df2.dropna().sort_index() + expected = df2.sort_index().dropna() + tm.assert_frame_equal(result, expected) + + # sorting series, default nan position is last + result = s.sort_index() + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position last + result = s.sort_index(na_position="last") + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position first + result = s.sort_index(na_position="first") + expected = s.iloc[[1, 2, 3, 0]] + tm.assert_series_equal(result, expected) + + def test_sort_ascending_list(self): + # GH: 16934 + + # Set up a Series with a three level MultiIndex + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + [4, 3, 2, 1, 4, 3, 2, 1], + ] + tuples = zip(*arrays) + mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) + s = Series(range(8), index=mi) + + # Sort with boolean ascending + result = s.sort_index(level=["third", "first"], ascending=False) + expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]] + 
tm.assert_series_equal(result, expected) + + # Sort with list of boolean ascending + result = s.sort_index(level=["third", "first"], ascending=[False, True]) + expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/test_nanops.py b/venv/Lib/site-packages/pandas/tests/test_nanops.py new file mode 100644 index 0000000..2c5d028 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_nanops.py @@ -0,0 +1,1074 @@ +from functools import partial +import operator +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_integer_dtype + +import pandas as pd +from pandas import Series, isna +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray +import pandas.core.nanops as nanops + +use_bn = nanops._USE_BOTTLENECK +has_c16 = hasattr(np, "complex128") + + +class TestnanopsDataFrame: + def setup_method(self, method): + np.random.seed(11235) + nanops._USE_BOTTLENECK = False + + arr_shape = (11, 7) + + self.arr_float = np.random.randn(*arr_shape) + self.arr_float1 = np.random.randn(*arr_shape) + self.arr_complex = self.arr_float + self.arr_float1 * 1j + self.arr_int = np.random.randint(-10, 10, arr_shape) + self.arr_bool = np.random.randint(0, 2, arr_shape) == 0 + self.arr_str = np.abs(self.arr_float).astype("S") + self.arr_utf = np.abs(self.arr_float).astype("U") + self.arr_date = np.random.randint(0, 20000, arr_shape).astype("M8[ns]") + self.arr_tdelta = np.random.randint(0, 20000, arr_shape).astype("m8[ns]") + + self.arr_nan = np.tile(np.nan, arr_shape) + self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan]) + self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan]) + self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1]) + self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan]) + + self.arr_inf = self.arr_float * np.inf + self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf]) + + self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf]) + self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf]) + self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf]) + self.arr_obj = np.vstack( + [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + self.arr_complex.astype("O"), + self.arr_str.astype("O"), + self.arr_utf.astype("O"), + self.arr_date.astype("O"), + self.arr_tdelta.astype("O"), + ] + ) + + with np.errstate(invalid="ignore"): + self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j + self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) + + self.arr_nan_infj = self.arr_inf * 1j + self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj]) + + self.arr_float_2d = self.arr_float + self.arr_float1_2d = self.arr_float1 + + self.arr_nan_2d = self.arr_nan + self.arr_float_nan_2d = self.arr_float_nan + self.arr_float1_nan_2d = self.arr_float1_nan + self.arr_nan_float1_2d = self.arr_nan_float1 + + self.arr_float_1d = self.arr_float[:, 0] + self.arr_float1_1d = self.arr_float1[:, 0] + + self.arr_nan_1d = self.arr_nan[:, 0] + self.arr_float_nan_1d = self.arr_float_nan[:, 0] + self.arr_float1_nan_1d = self.arr_float1_nan[:, 0] + self.arr_nan_float1_1d = self.arr_nan_float1[:, 0] + + def teardown_method(self, method): + nanops._USE_BOTTLENECK = use_bn + + def check_results(self, targ, res, axis, check_dtype=True): + res = getattr(res, "asm8", res) + res = getattr(res, 
"values", res) + + # timedeltas are a beast here + def _coerce_tds(targ, res): + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": + if len(targ) == 1: + targ = targ[0].item() + res = res.item() + else: + targ = targ.view("i8") + return targ, res + + try: + if ( + axis != 0 + and hasattr(targ, "shape") + and targ.ndim + and targ.shape != res.shape + ): + res = np.split(res, [targ.shape[0]], axis=0)[0] + except (ValueError, IndexError): + targ, res = _coerce_tds(targ, res) + + try: + tm.assert_almost_equal(targ, res, check_dtype=check_dtype) + except AssertionError: + + # handle timedelta dtypes + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": + targ, res = _coerce_tds(targ, res) + tm.assert_almost_equal(targ, res, check_dtype=check_dtype) + return + + # There are sometimes rounding errors with + # complex and object dtypes. + # If it isn't one of those, re-raise the error. + if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]: + raise + # convert object dtypes to something that can be split into + # real and imaginary parts + if res.dtype.kind == "O": + if targ.dtype.kind != "O": + res = res.astype(targ.dtype) + else: + cast_dtype = "c16" if has_c16 else "f8" + res = res.astype(cast_dtype) + targ = targ.astype(cast_dtype) + # there should never be a case where numpy returns an object + # but nanops doesn't, so make that an exception + elif targ.dtype.kind == "O": + raise + tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype) + tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype) + + def check_fun_data( + self, + testfunc, + targfunc, + testarval, + targarval, + check_dtype=True, + empty_targfunc=None, + **kwargs, + ): + for axis in list(range(targarval.ndim)) + [None]: + for skipna in [False, True]: + targartempval = targarval if skipna else testarval + if skipna and empty_targfunc and isna(targartempval).all(): + targ = empty_targfunc(targartempval, axis=axis, **kwargs) + else: + targ = targfunc(targartempval, axis=axis, **kwargs) + + res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna: + res = testfunc(testarval, axis=axis, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if axis is None: + res = testfunc(testarval, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna and axis is None: + res = testfunc(testarval, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + + if testarval.ndim <= 1: + return + + # Recurse on lower-dimension + testarval2 = np.take(testarval, 0, axis=-1) + targarval2 = np.take(targarval, 0, axis=-1) + self.check_fun_data( + testfunc, + targfunc, + testarval2, + targarval2, + check_dtype=check_dtype, + empty_targfunc=empty_targfunc, + **kwargs, + ) + + def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): + + targar = testar + if testar.endswith("_nan") and hasattr(self, testar[:-4]): + targar = testar[:-4] + + testarval = getattr(self, testar) + targarval = getattr(self, targar) + self.check_fun_data( + testfunc, + targfunc, + testarval, + targarval, + empty_targfunc=empty_targfunc, + **kwargs, + ) + + def check_funs( + self, + testfunc, + targfunc, + allow_complex=True, + allow_all_nan=True, + allow_date=True, + allow_tdelta=True, + allow_obj=True, + **kwargs, + ): + self.check_fun(testfunc, targfunc, "arr_float", **kwargs) + self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) + 
self.check_fun(testfunc, targfunc, "arr_int", **kwargs) + self.check_fun(testfunc, targfunc, "arr_bool", **kwargs) + objs = [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + ] + + if allow_all_nan: + self.check_fun(testfunc, targfunc, "arr_nan", **kwargs) + + if allow_complex: + self.check_fun(testfunc, targfunc, "arr_complex", **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex_nan", **kwargs) + if allow_all_nan: + self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs) + objs += [self.arr_complex.astype("O")] + + if allow_date: + targfunc(self.arr_date) + self.check_fun(testfunc, targfunc, "arr_date", **kwargs) + objs += [self.arr_date.astype("O")] + + if allow_tdelta: + try: + targfunc(self.arr_tdelta) + except TypeError: + pass + else: + self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs) + objs += [self.arr_tdelta.astype("O")] + + if allow_obj: + self.arr_obj = np.vstack(objs) + # some nanops handle object dtypes better than their numpy + # counterparts, so the numpy functions need to be given something + # else + if allow_obj == "convert": + targfunc = partial( + self._badobj_wrap, func=targfunc, allow_complex=allow_complex + ) + self.check_fun(testfunc, targfunc, "arr_obj", **kwargs) + + def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): + if value.dtype.kind == "O": + if allow_complex: + value = value.astype("c16") + else: + value = value.astype("f8") + return func(value, **kwargs) + + @pytest.mark.parametrize( + "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)] + ) + def test_nan_funcs(self, nan_op, np_op): + # TODO: allow tdelta, doesn't break tests + self.check_funs( + nan_op, np_op, allow_all_nan=False, allow_date=False, allow_tdelta=False + ) + + def test_nansum(self): + self.check_funs( + nanops.nansum, + np.sum, + allow_date=False, + check_dtype=False, + empty_targfunc=np.nansum, + ) + + def test_nanmean(self): + self.check_funs( + nanops.nanmean, + np.mean, + allow_complex=False, # TODO: allow this, doesn't break test + allow_obj=False, + allow_date=False, + ) + + def test_nanmean_overflow(self): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + + for a in [2 ** 55, -(2 ** 55), 20150515061816532]: + s = Series(a, index=range(500), dtype=np.int64) + result = s.mean() + np_result = s.values.mean() + assert result == a + assert result == np_result + assert result.dtype == np.float64 + + @pytest.mark.parametrize( + "dtype", + [ + np.int16, + np.int32, + np.int64, + np.float32, + np.float64, + getattr(np, "float128", None), + ], + ) + def test_returned_dtype(self, dtype): + if dtype is None: + # no float128 available + return + + s = Series(range(10), dtype=dtype) + group_a = ["mean", "std", "var", "skew", "kurt"] + group_b = ["min", "max"] + for method in group_a + group_b: + result = getattr(s, method)() + if is_integer_dtype(dtype) and method in group_a: + assert result.dtype == np.float64 + else: + assert result.dtype == dtype + + def test_nanmedian(self): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_funs( + nanops.nanmedian, + np.median, + allow_complex=False, + allow_date=False, + allow_obj="convert", + ) + + @pytest.mark.parametrize("ddof", range(3)) + def test_nanvar(self, ddof): + self.check_funs( + nanops.nanvar, + np.var, + allow_complex=False, + allow_date=False, + allow_obj="convert", + ddof=ddof, + ) + + @pytest.mark.parametrize("ddof", 
range(3)) + def test_nanstd(self, ddof): + self.check_funs( + nanops.nanstd, + np.std, + allow_complex=False, + allow_date=False, + allow_obj="convert", + ddof=ddof, + ) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("ddof", range(3)) + def test_nansem(self, ddof): + from scipy.stats import sem + + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nansem, + sem, + allow_complex=False, + allow_date=False, + allow_tdelta=False, + allow_obj="convert", + ddof=ddof, + ) + + @pytest.mark.parametrize( + "nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)] + ) + def test_nanops_with_warnings(self, nan_op, np_op): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_funs(nan_op, np_op, allow_obj=False) + + def _argminmax_wrap(self, value, axis=None, func=None): + res = func(value, axis) + nans = np.min(value, axis) + nullnan = isna(nans) + if res.ndim: + res[nullnan] = -1 + elif ( + hasattr(nullnan, "all") + and nullnan.all() + or not hasattr(nullnan, "all") + and nullnan + ): + res = -1 + return res + + def test_nanargmax(self): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + func = partial(self._argminmax_wrap, func=np.argmax) + self.check_funs(nanops.nanargmax, func, allow_obj=False) + + def test_nanargmin(self): + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + func = partial(self._argminmax_wrap, func=np.argmin) + self.check_funs(nanops.nanargmin, func, allow_obj=False) + + def _skew_kurt_wrap(self, values, axis=None, func=None): + if not isinstance(values.dtype.type, np.floating): + values = values.astype("f8") + result = func(values, axis=axis, bias=False) + # fix for handling cases where all elements in an axis are the same + if isinstance(result, np.ndarray): + result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0 + return result + elif np.max(values) == np.min(values): + return 0.0 + return result + + @td.skip_if_no_scipy + def test_nanskew(self): + from scipy.stats import skew + + func = partial(self._skew_kurt_wrap, func=skew) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nanskew, + func, + allow_complex=False, + allow_date=False, + allow_tdelta=False, + ) + + @td.skip_if_no_scipy + def test_nankurt(self): + from scipy.stats import kurtosis + + func1 = partial(kurtosis, fisher=True) + func = partial(self._skew_kurt_wrap, func=func1) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nankurt, + func, + allow_complex=False, + allow_date=False, + allow_tdelta=False, + ) + + def test_nanprod(self): + self.check_funs( + nanops.nanprod, + np.prod, + allow_date=False, + allow_tdelta=False, + empty_targfunc=np.nanprod, + ) + + def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): + res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) + res01 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs, + ) + tm.assert_almost_equal(targ0, res00) + tm.assert_almost_equal(targ0, res01) + + res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs) + res11 = checkfun( + self.arr_float_nan_2d, + self.arr_float1_nan_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs, + ) + tm.assert_almost_equal(targ1, res10) + tm.assert_almost_equal(targ1, res11) + + targ2 = np.nan + res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs) + res21 = checkfun(self.arr_float_2d, 
self.arr_nan_2d, **kwargs) + res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs) + res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs) + res24 = checkfun( + self.arr_float_nan_2d, + self.arr_nan_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs, + ) + res25 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) + 1, + **kwargs, + ) + tm.assert_almost_equal(targ2, res20) + tm.assert_almost_equal(targ2, res21) + tm.assert_almost_equal(targ2, res22) + tm.assert_almost_equal(targ2, res23) + tm.assert_almost_equal(targ2, res24) + tm.assert_almost_equal(targ2, res25) + + def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): + res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs) + res01 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs, + ) + tm.assert_almost_equal(targ0, res00) + tm.assert_almost_equal(targ0, res01) + + res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs) + res11 = checkfun( + self.arr_float_nan_1d, + self.arr_float1_nan_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs, + ) + tm.assert_almost_equal(targ1, res10) + tm.assert_almost_equal(targ1, res11) + + targ2 = np.nan + res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs) + res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs) + res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs) + res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs) + res24 = checkfun( + self.arr_float_nan_1d, + self.arr_nan_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs, + ) + res25 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) + 1, + **kwargs, + ) + tm.assert_almost_equal(targ2, res20) + tm.assert_almost_equal(targ2, res21) + tm.assert_almost_equal(targ2, res22) + tm.assert_almost_equal(targ2, res23) + tm.assert_almost_equal(targ2, res24) + tm.assert_almost_equal(targ2, res25) + + def test_nancorr(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1) + targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") + + def test_nancorr_pearson(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson") + targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") + + @td.skip_if_no_scipy + def test_nancorr_kendall(self): + from scipy.stats import kendalltau + + targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall") + targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0] + targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall") + + @td.skip_if_no_scipy + def 
test_nancorr_spearman(self): + from scipy.stats import spearmanr + + targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman") + targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0] + targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") + + @td.skip_if_no_scipy + def test_invalid_method(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + with pytest.raises(ValueError, match=msg): + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") + + def test_nancov(self): + targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1) + targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1) + + def check_nancomp(self, checkfun, targ0): + arr_float = self.arr_float + arr_float1 = self.arr_float1 + arr_nan = self.arr_nan + arr_nan_nan = self.arr_nan_nan + arr_float_nan = self.arr_float_nan + arr_float1_nan = self.arr_float1_nan + arr_nan_float1 = self.arr_nan_float1 + + while targ0.ndim: + res0 = checkfun(arr_float, arr_float1) + tm.assert_almost_equal(targ0, res0) + + if targ0.ndim > 1: + targ1 = np.vstack([targ0, arr_nan]) + else: + targ1 = np.hstack([targ0, arr_nan]) + res1 = checkfun(arr_float_nan, arr_float1_nan) + tm.assert_numpy_array_equal(targ1, res1, check_dtype=False) + + targ2 = arr_nan_nan + res2 = checkfun(arr_float_nan, arr_nan_float1) + tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) + + # Lower dimension for next step in the loop + arr_float = np.take(arr_float, 0, axis=-1) + arr_float1 = np.take(arr_float1, 0, axis=-1) + arr_nan = np.take(arr_nan, 0, axis=-1) + arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) + arr_float_nan = np.take(arr_float_nan, 0, axis=-1) + arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1) + arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) + targ0 = np.take(targ0, 0, axis=-1) + + @pytest.mark.parametrize( + "op,nanop", + [ + (operator.eq, nanops.naneq), + (operator.ne, nanops.nanne), + (operator.gt, nanops.nangt), + (operator.ge, nanops.nange), + (operator.lt, nanops.nanlt), + (operator.le, nanops.nanle), + ], + ) + def test_nan_comparison(self, op, nanop): + targ0 = op(self.arr_float, self.arr_float1) + self.check_nancomp(nanop, targ0) + + def check_bool(self, func, value, correct): + while getattr(value, "ndim", True): + res0 = func(value) + if correct: + assert res0 + else: + assert not res0 + + if not hasattr(value, "ndim"): + break + + # Reduce dimension for next step in the loop + value = np.take(value, 0, axis=-1) + + def test__has_infs(self): + pairs = [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", False), + ("arr_nan_nanj", False), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ] + pairs_float = [ + ("arr_float", False), + ("arr_nan", False), + ("arr_float_nan", False), + ("arr_nan_nan", False), + 
("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ] + + for arr, correct in pairs: + val = getattr(self, arr) + self.check_bool(nanops._has_infs, val, correct) + + for arr, correct in pairs_float: + val = getattr(self, arr) + self.check_bool(nanops._has_infs, val, correct) + self.check_bool(nanops._has_infs, val.astype("f4"), correct) + self.check_bool(nanops._has_infs, val.astype("f2"), correct) + + def test__bn_ok_dtype(self): + assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_int.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_str.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test") + + +class TestEnsureNumeric: + def test_numeric_values(self): + # Test integer + assert nanops._ensure_numeric(1) == 1 + + # Test float + assert nanops._ensure_numeric(1.1) == 1.1 + + # Test complex + assert nanops._ensure_numeric(1 + 2j) == 1 + 2j + + def test_ndarray(self): + # Test numeric ndarray + values = np.array([1, 2, 3]) + assert np.allclose(nanops._ensure_numeric(values), values) + + # Test object ndarray + o_values = values.astype(object) + assert np.allclose(nanops._ensure_numeric(o_values), values) + + # Test convertible string ndarray + s_values = np.array(["1", "2", "3"], dtype=object) + assert np.allclose(nanops._ensure_numeric(s_values), values) + + # Test non-convertible string ndarray + s_values = np.array(["foo", "bar", "baz"], dtype=object) + msg = r"could not convert string to float: '(foo|baz)'" + with pytest.raises(ValueError, match=msg): + nanops._ensure_numeric(s_values) + + def test_convertable_values(self): + assert np.allclose(nanops._ensure_numeric("1"), 1.0) + assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) + assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) + + def test_non_convertable_values(self): + msg = "Could not convert foo to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric("foo") + + # with the wrong type, python raises TypeError for us + msg = "argument must be a string or a number" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric({}) + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric([]) + + +class TestNanvarFixedValues: + + # xref GH10242 + + def setup_method(self, method): + # Samples from a normal distribution. 
+ self.variance = variance = 3.0 + self.samples = self.prng.normal(scale=variance ** 0.5, size=100000) + + def test_nanvar_all_finite(self): + samples = self.samples + actual_variance = nanops.nanvar(samples) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) + + def test_nanvar_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_variance = nanops.nanvar(samples, skipna=True) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) + + actual_variance = nanops.nanvar(samples, skipna=False) + tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2) + + def test_nanstd_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_std = nanops.nanstd(samples, skipna=True) + tm.assert_almost_equal(actual_std, self.variance ** 0.5, check_less_precise=2) + + actual_std = nanops.nanvar(samples, skipna=False) + tm.assert_almost_equal(actual_std, np.nan, check_less_precise=2) + + def test_nanvar_axis(self): + # Generate some sample data. + samples_norm = self.samples + samples_unif = self.prng.uniform(size=samples_norm.shape[0]) + samples = np.vstack([samples_norm, samples_unif]) + + actual_variance = nanops.nanvar(samples, axis=1) + tm.assert_almost_equal( + actual_variance, np.array([self.variance, 1.0 / 12]), check_less_precise=2 + ) + + def test_nanvar_ddof(self): + n = 5 + samples = self.prng.uniform(size=(10000, n + 1)) + samples[:, -1] = np.nan # Force use of our own algorithm. + + variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean() + variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean() + variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean() + + # The unbiased estimate. + var = 1.0 / 12 + tm.assert_almost_equal(variance_1, var, check_less_precise=2) + + # The underestimated variance. + tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, check_less_precise=2) + + # The overestimated variance. + tm.assert_almost_equal( + variance_2, (n - 1.0) / (n - 2.0) * var, check_less_precise=2 + ) + + def test_ground_truth(self): + # Test against values that were precomputed with Numpy. + samples = np.empty((4, 4)) + samples[:3, :3] = np.array( + [ + [0.97303362, 0.21869576, 0.55560287], + [0.72980153, 0.03109364, 0.99155171], + [0.09317602, 0.60078248, 0.15871292], + ] + ) + samples[3] = samples[:, 3] = np.nan + + # Actual variances along axis=0, 1 for ddof=0, 1, 2 + variance = np.array( + [ + [ + [0.13762259, 0.05619224, 0.11568816], + [0.20643388, 0.08428837, 0.17353224], + [0.41286776, 0.16857673, 0.34706449], + ], + [ + [0.09519783, 0.16435395, 0.05082054], + [0.14279674, 0.24653093, 0.07623082], + [0.28559348, 0.49306186, 0.15246163], + ], + ] + ) + + # Test nanvar. + for axis in range(2): + for ddof in range(3): + var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) + tm.assert_almost_equal(var[:3], variance[axis, ddof]) + assert np.isnan(var[3]) + + # Test nanstd. + for axis in range(2): + for ddof in range(3): + std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) + tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5) + assert np.isnan(std[3]) + + def test_nanstd_roundoff(self): + # Regression test for GH 10242 (test data taken from GH 10489). Ensure + # that variance is stable. 
+ data = Series(766897346 * np.ones(10)) + for ddof in range(3): + result = data.std(ddof=ddof) + assert result == 0.0 + + @property + def prng(self): + return np.random.RandomState(1234) + + +class TestNanskewFixedValues: + + # xref GH 11974 + + def setup_method(self, method): + # Test data + skewness value (computed with scipy.stats.skew) + self.samples = np.sin(np.linspace(0, 1, 200)) + self.actual_skew = -0.1875895205961754 + + def test_constant_series(self): + # xref GH 11974 + for val in [3075.2, 3075.3, 3075.5]: + data = val * np.ones(300) + skew = nanops.nanskew(data) + assert skew == 0.0 + + def test_all_finite(self): + alpha, beta = 0.3, 0.1 + left_tailed = self.prng.beta(alpha, beta, size=100) + assert nanops.nanskew(left_tailed) < 0 + + alpha, beta = 0.1, 0.3 + right_tailed = self.prng.beta(alpha, beta, size=100) + assert nanops.nanskew(right_tailed) > 0 + + def test_ground_truth(self): + skew = nanops.nanskew(self.samples) + tm.assert_almost_equal(skew, self.actual_skew) + + def test_axis(self): + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) + skew = nanops.nanskew(samples, axis=1) + tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan])) + + def test_nans(self): + samples = np.hstack([self.samples, np.nan]) + skew = nanops.nanskew(samples, skipna=False) + assert np.isnan(skew) + + def test_nans_skipna(self): + samples = np.hstack([self.samples, np.nan]) + skew = nanops.nanskew(samples, skipna=True) + tm.assert_almost_equal(skew, self.actual_skew) + + @property + def prng(self): + return np.random.RandomState(1234) + + +class TestNankurtFixedValues: + + # xref GH 11974 + + def setup_method(self, method): + # Test data + kurtosis value (computed with scipy.stats.kurtosis) + self.samples = np.sin(np.linspace(0, 1, 200)) + self.actual_kurt = -1.2058303433799713 + + def test_constant_series(self): + # xref GH 11974 + for val in [3075.2, 3075.3, 3075.5]: + data = val * np.ones(300) + kurt = nanops.nankurt(data) + assert kurt == 0.0 + + def test_all_finite(self): + alpha, beta = 0.3, 0.1 + left_tailed = self.prng.beta(alpha, beta, size=100) + assert nanops.nankurt(left_tailed) < 0 + + alpha, beta = 0.1, 0.3 + right_tailed = self.prng.beta(alpha, beta, size=100) + assert nanops.nankurt(right_tailed) > 0 + + def test_ground_truth(self): + kurt = nanops.nankurt(self.samples) + tm.assert_almost_equal(kurt, self.actual_kurt) + + def test_axis(self): + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) + kurt = nanops.nankurt(samples, axis=1) + tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan])) + + def test_nans(self): + samples = np.hstack([self.samples, np.nan]) + kurt = nanops.nankurt(samples, skipna=False) + assert np.isnan(kurt) + + def test_nans_skipna(self): + samples = np.hstack([self.samples, np.nan]) + kurt = nanops.nankurt(samples, skipna=True) + tm.assert_almost_equal(kurt, self.actual_kurt) + + @property + def prng(self): + return np.random.RandomState(1234) + + +class TestDatetime64NaNOps: + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.xfail(reason="disabled") + # Enabling mean changes the behavior of DataFrame.mean + # See https://github.com/pandas-dev/pandas/issues/24752 + def test_nanmean(self, tz): + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + expected = dti[1] + + for obj in [dti, DatetimeArray(dti), Series(dti)]: + result = nanops.nanmean(obj) + assert result == expected + + dti2 = dti.insert(1, pd.NaT) + + for obj in [dti2, DatetimeArray(dti2), Series(dti2)]: + 
result = nanops.nanmean(obj) + assert result == expected + + +def test_use_bottleneck(): + + if nanops._BOTTLENECK_INSTALLED: + + pd.set_option("use_bottleneck", True) + assert pd.get_option("use_bottleneck") + + pd.set_option("use_bottleneck", False) + assert not pd.get_option("use_bottleneck") + + pd.set_option("use_bottleneck", use_bn) + + +@pytest.mark.parametrize( + "numpy_op, expected", + [ + (np.sum, 10), + (np.nansum, 10), + (np.mean, 2.5), + (np.nanmean, 2.5), + (np.median, 2.5), + (np.nanmedian, 2.5), + (np.min, 1), + (np.max, 4), + (np.nanmin, 1), + (np.nanmax, 4), + ], +) +def test_numpy_ops(numpy_op, expected): + # GH8383 + result = numpy_op(pd.Series([1, 2, 3, 4])) + assert result == expected + + +@pytest.mark.parametrize( + "operation", + [ + nanops.nanany, + nanops.nanall, + nanops.nansum, + nanops.nanmean, + nanops.nanmedian, + nanops.nanstd, + nanops.nanvar, + nanops.nansem, + nanops.nanargmax, + nanops.nanargmin, + nanops.nanmax, + nanops.nanmin, + nanops.nanskew, + nanops.nankurt, + nanops.nanprod, + ], +) +def test_nanops_independent_of_mask_param(operation): + # GH22764 + s = pd.Series([1, 2, np.nan, 3, np.nan, 4]) + mask = s.isna() + median_expected = operation(s) + median_result = operation(s, mask=mask) + assert median_expected == median_result diff --git a/venv/Lib/site-packages/pandas/tests/test_optional_dependency.py b/venv/Lib/site-packages/pandas/tests/test_optional_dependency.py new file mode 100644 index 0000000..ce52721 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_optional_dependency.py @@ -0,0 +1,52 @@ +import sys +import types + +import pytest + +from pandas.compat._optional import VERSIONS, import_optional_dependency + +import pandas._testing as tm + + +def test_import_optional(): + match = "Missing .*notapackage.* pip .* conda .* notapackage" + with pytest.raises(ImportError, match=match): + import_optional_dependency("notapackage") + + result = import_optional_dependency("notapackage", raise_on_missing=False) + assert result is None + + +def test_xlrd_version_fallback(): + pytest.importorskip("xlrd") + import_optional_dependency("xlrd") + + +def test_bad_version(): + name = "fakemodule" + module = types.ModuleType(name) + module.__version__ = "0.9.0" + sys.modules[name] = module + VERSIONS[name] = "1.0.0" + + match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" + with pytest.raises(ImportError, match=match): + import_optional_dependency("fakemodule") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency("fakemodule", on_version="warn") + assert result is None + + module.__version__ = "1.0.0" # exact match is OK + result = import_optional_dependency("fakemodule") + assert result is module + + +def test_no_version_raises(): + name = "fakemodule" + module = types.ModuleType(name) + sys.modules[name] = module + VERSIONS[name] = "1.0.0" + + with pytest.raises(ImportError, match="Can't determine .* fakemodule"): + import_optional_dependency(name) diff --git a/venv/Lib/site-packages/pandas/tests/test_register_accessor.py b/venv/Lib/site-packages/pandas/tests/test_register_accessor.py new file mode 100644 index 0000000..08a5581 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_register_accessor.py @@ -0,0 +1,92 @@ +import contextlib + +import pytest + +import pandas as pd +import pandas._testing as tm + + +@contextlib.contextmanager +def ensure_removed(obj, attr): + """Ensure that an attribute added to 'obj' during the test is + removed when we're done""" + try: + yield + finally: + try: + 
delattr(obj, attr) + except AttributeError: + pass + obj._accessors.discard(attr) + + +class MyAccessor: + def __init__(self, obj): + self.obj = obj + self.item = "item" + + @property + def prop(self): + return self.item + + def method(self): + return self.item + + +@pytest.mark.parametrize( + "obj, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) +def test_register(obj, registrar): + with ensure_removed(obj, "mine"): + before = set(dir(obj)) + registrar("mine")(MyAccessor) + o = obj([]) if obj is not pd.Series else obj([], dtype=object) + assert o.mine.prop == "item" + after = set(dir(obj)) + assert (before ^ after) == {"mine"} + assert "mine" in obj._accessors + + +def test_accessor_works(): + with ensure_removed(pd.Series, "mine"): + pd.api.extensions.register_series_accessor("mine")(MyAccessor) + + s = pd.Series([1, 2]) + assert s.mine.obj is s + + assert s.mine.prop == "item" + assert s.mine.method() == "item" + + +def test_overwrite_warns(): + # Need to restore mean + mean = pd.Series.mean + try: + with tm.assert_produces_warning(UserWarning) as w: + pd.api.extensions.register_series_accessor("mean")(MyAccessor) + s = pd.Series([1, 2]) + assert s.mean.prop == "item" + msg = str(w[0].message) + assert "mean" in msg + assert "MyAccessor" in msg + assert "Series" in msg + finally: + pd.Series.mean = mean + + +def test_raises_attribute_error(): + + with ensure_removed(pd.Series, "bad"): + + @pd.api.extensions.register_series_accessor("bad") + class Bad: + def __init__(self, data): + raise AttributeError("whoops") + + with pytest.raises(AttributeError, match="whoops"): + pd.Series([], dtype=object).bad diff --git a/venv/Lib/site-packages/pandas/tests/test_sorting.py b/venv/Lib/site-packages/pandas/tests/test_sorting.py new file mode 100644 index 0000000..9829747 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_sorting.py @@ -0,0 +1,454 @@ +from collections import defaultdict +from datetime import datetime +from itertools import product + +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Series, array, concat, merge +import pandas._testing as tm +from pandas.core.algorithms import safe_sort +import pandas.core.common as com +from pandas.core.sorting import ( + decons_group_index, + get_group_index, + is_int64_overflow_possible, + lexsort_indexer, + nargsort, +) + + +class TestSorting: + @pytest.mark.slow + def test_int64_overflow(self): + + B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) + A = np.arange(2500) + df = DataFrame( + { + "A": A, + "B": B, + "C": A, + "D": B, + "E": A, + "F": B, + "G": A, + "H": B, + "values": np.random.randn(2500), + } + ) + + lg = df.groupby(["A", "B", "C", "D", "E", "F", "G", "H"]) + rg = df.groupby(["H", "G", "F", "E", "D", "C", "B", "A"]) + + left = lg.sum()["values"] + right = rg.sum()["values"] + + exp_index, _ = left.index.sortlevel() + tm.assert_index_equal(left.index, exp_index) + + exp_index, _ = right.index.sortlevel(0) + tm.assert_index_equal(right.index, exp_index) + + tups = list(map(tuple, df[["A", "B", "C", "D", "E", "F", "G", "H"]].values)) + tups = com.asarray_tuplesafe(tups) + + expected = df.groupby(tups).sum()["values"] + + for k, v in expected.items(): + assert left[k] == right[k[::-1]] + assert left[k] == v + assert len(left) == len(right) + + def test_int64_overflow_moar(self): + + # GH9096 + values = range(55109) + data 
= DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values}) + grouped = data.groupby(["a", "b", "c", "d"]) + assert len(grouped) == len(values) + + arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) + i = np.random.choice(len(arr), len(arr) * 4) + arr = np.vstack((arr, arr[i])) # add some duplicate rows + + i = np.random.permutation(len(arr)) + arr = arr[i] # shuffle rows + + df = DataFrame(arr, columns=list("abcde")) + df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 + gr = df.groupby(list("abcde")) + + # verify this is testing what it is supposed to test! + assert is_int64_overflow_possible(gr.grouper.shape) + + # manually compute groupings + jim, joe = defaultdict(list), defaultdict(list) + for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): + jim[key].append(a) + joe[key].append(b) + + assert len(gr) == len(jim) + mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) + + def aggr(func): + f = lambda a: np.fromiter(map(func, a), dtype="f8") + arr = np.vstack((f(jim.values()), f(joe.values()))).T + res = DataFrame(arr, columns=["jim", "joe"], index=mi) + return res.sort_index() + + tm.assert_frame_equal(gr.mean(), aggr(np.mean)) + tm.assert_frame_equal(gr.median(), aggr(np.median)) + + def test_lexsort_indexer(self): + keys = [[np.nan] * 5 + list(range(100)) + [np.nan] * 5] + # orders=True, na_position='last' + result = lexsort_indexer(keys, orders=True, na_position="last") + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=True, na_position='first' + result = lexsort_indexer(keys, orders=True, na_position="first") + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=False, na_position='last' + result = lexsort_indexer(keys, orders=False, na_position="last") + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + # orders=False, na_position='first' + result = lexsort_indexer(keys, orders=False, na_position="first") + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) + + def test_nargsort(self): + # np.argsort(items) places NaNs last + items = [np.nan] * 5 + list(range(100)) + [np.nan] * 5 + # np.argsort(items2) may not place NaNs first + items2 = np.array(items, dtype="O") + + # mergesort is the most difficult to get right because we want it to be + # stable.
+ + # According to numpy/core/tests/test_multiarray, """The number of + # sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + # mergesort, ascending=True, na_position='last' + result = nargsort(items, kind="mergesort", ascending=True, na_position="last") + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items, kind="mergesort", ascending=True, na_position="first") + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items, kind="mergesort", ascending=False, na_position="last") + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort(items, kind="mergesort", ascending=False, na_position="first") + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='last' + result = nargsort(items2, kind="mergesort", ascending=True, na_position="last") + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=True, na_position='first' + result = nargsort(items2, kind="mergesort", ascending=True, na_position="first") + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='last' + result = nargsort(items2, kind="mergesort", ascending=False, na_position="last") + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + # mergesort, ascending=False, na_position='first' + result = nargsort( + items2, kind="mergesort", ascending=False, na_position="first" + ) + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + + +class TestMerge: + @pytest.mark.slow + def test_int64_overflow_issues(self): + + # #2690, combinatorial explosion + df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) + df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) + + # it works! 
+ result = merge(df1, df2, how="outer") + assert len(result) == 2000 + + low, high, n = -1 << 10, 1 << 10, 1 << 20 + left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) + left["left"] = left.sum(axis=1) + + # one-2-one match + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + right.columns = right.columns[:-1].tolist() + ["right"] + right.index = np.arange(len(right)) + right["right"] *= -1 + + out = merge(left, right, how="outer") + assert len(out) == len(left) + tm.assert_series_equal(out["left"], -out["right"], check_names=False) + result = out.iloc[:, :-2].sum(axis=1) + tm.assert_series_equal(out["left"], result, check_names=False) + assert result.name is None + + out.sort_values(out.columns.tolist(), inplace=True) + out.index = np.arange(len(out)) + for how in ["left", "right", "outer", "inner"]: + tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + + # check that left merge w/ sort=False maintains left frame order + out = merge(left, right, how="left", sort=False) + tm.assert_frame_equal(left, out[left.columns.tolist()]) + + out = merge(right, left, how="left", sort=False) + tm.assert_frame_equal(right, out[right.columns.tolist()]) + + # one-2-many/none match + n = 1 << 11 + left = DataFrame( + np.random.randint(low, high, (n, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) + + # confirm that this is checking what it is supposed to check + shape = left.apply(Series.nunique).values + assert is_int64_overflow_possible(shape) + + # add duplicates to left frame + left = concat([left, left], ignore_index=True) + + right = DataFrame( + np.random.randint(low, high, (n // 2, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) + + # add duplicates & overlap with left to the right frame + i = np.random.choice(len(left), n) + right = concat([right, right, left.iloc[i]], ignore_index=True) + + left["left"] = np.random.randn(len(left)) + right["right"] = np.random.randn(len(right)) + + # shuffle left & right frames + i = np.random.permutation(len(left)) + left = left.iloc[i].copy() + left.index = np.arange(len(left)) + + i = np.random.permutation(len(right)) + right = right.iloc[i].copy() + right.index = np.arange(len(right)) + + # manually compute outer merge + ldict, rdict = defaultdict(list), defaultdict(list) + + for idx, row in left.set_index(list("ABCDEFG")).iterrows(): + ldict[idx].append(row["left"]) + + for idx, row in right.set_index(list("ABCDEFG")).iterrows(): + rdict[idx].append(row["right"]) + + vals = [] + for k, lval in ldict.items(): + rval = rdict.get(k, [np.nan]) + for lv, rv in product(lval, rval): + vals.append(k + tuple([lv, rv])) + + for k, rval in rdict.items(): + if k not in ldict: + for rv in rval: + vals.append(k + tuple([np.nan, rv])) + + def align(df): + df = df.sort_values(df.columns.tolist()) + df.index = np.arange(len(df)) + return df + + def verify_order(df): + kcols = list("ABCDEFG") + tm.assert_frame_equal( + df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort") + ) + + out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) + out = align(out) + + jmask = { + "left": out["left"].notna(), + "right": out["right"].notna(), + "inner": out["left"].notna() & out["right"].notna(), + "outer": np.ones(len(out), dtype="bool"), + } + + for how in "left", "right", "outer", "inner": + mask = jmask[how] + frame = align(out[mask].copy()) + assert mask.all() ^ mask.any() or how == "outer" + + for sort in [False, True]: + res = merge(left, right, how=how, sort=sort) + if sort: + 
verify_order(res) + + # as in GH9092 dtypes break with outer/right join + tm.assert_frame_equal( + frame, align(res), check_dtype=how not in ("right", "outer") + ) + + +def test_decons(): + def testit(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) + + for a, b in zip(codes_list, codes_list2): + tm.assert_numpy_array_equal(a, b) + + shape = (4, 5, 6) + codes_list = [ + np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), + ] + testit(codes_list, shape) + + shape = (10000, 10000) + codes_list = [ + np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5), + ] + testit(codes_list, shape) + + +class TestSafeSort: + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = safe_sort(values) + expected = np.array(list("aaabbc"), dtype="object") + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("verify", [True, False]) + def test_codes(self, verify): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + codes = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_codes, expected_codes) + + # na_sentinel + codes = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_codes, expected_codes) + + codes = [] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_codes, expected_codes) + + @pytest.mark.parametrize("na_sentinel", [-1, 99]) + def test_codes_out_of_bound(self, na_sentinel): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + # out of bound indices + codes = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) + expected_codes = np.array( + [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp + ) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_codes, expected_codes) + + def test_mixed_integer(self): + values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) + result = safe_sort(values) + expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(["b", 1, 0, "a"], dtype=object) + codes = [0, 1, 2, 3, 0, -1, 1] + result, result_codes = safe_sort(values, codes) + expected = np.array([0, 1, "a", "b"], dtype=object) + expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_codes, expected_codes) + + def test_mixed_integer_from_list(self): + values = ["b", 1, 0, "a", 0, "b"] + result = safe_sort(values) + expected = 
np.array([0, 0, 1, "a", "b", "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_unsortable(self): + # GH 13714 + arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) + with pytest.raises(TypeError, match=msg): + safe_sort(arr) + + def test_exceptions(self): + with pytest.raises(TypeError, match="Only list-like objects are allowed"): + safe_sort(values=1) + + with pytest.raises(TypeError, match="Only list-like objects or None"): + safe_sort(values=[0, 1, 2], codes=1) + + with pytest.raises(ValueError, match="values should be unique"): + safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) + + def test_extension_array(self): + # a = array([1, 3, np.nan, 2], dtype='Int64') + a = array([1, 3, 2], dtype="Int64") + result = safe_sort(a) + # expected = array([1, 2, 3, np.nan], dtype='Int64') + expected = array([1, 2, 3], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("verify", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, 99]) + def test_extension_array_codes(self, verify, na_sentinel): + a = array([1, 3, 2], dtype="Int64") + result, codes = safe_sort( + a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify + ) + expected_values = array([1, 2, 3], dtype="Int64") + expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + tm.assert_extension_array_equal(result, expected_values) + tm.assert_numpy_array_equal(codes, expected_codes) diff --git a/venv/Lib/site-packages/pandas/tests/test_strings.py b/venv/Lib/site-packages/pandas/tests/test_strings.py new file mode 100644 index 0000000..568b391 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_strings.py @@ -0,0 +1,3590 @@ +from datetime import datetime, timedelta +import re + +import numpy as np +from numpy.random import randint +import pytest + +from pandas._libs import lib + +from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna +import pandas._testing as tm +import pandas.core.strings as strings + + +def assert_series_or_index_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + else: # Index + tm.assert_index_equal(left, right) + + +_any_string_method = [ + ("cat", (), {"sep": ","}), + ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), + ("center", (10,), {}), + ("contains", ("a",), {}), + ("count", ("a",), {}), + ("decode", ("UTF-8",), {}), + ("encode", ("UTF-8",), {}), + ("endswith", ("a",), {}), + ("extract", ("([a-z]*)",), {"expand": False}), + ("extract", ("([a-z]*)",), {"expand": True}), + ("extractall", ("([a-z]*)",), {}), + ("find", ("a",), {}), + ("findall", ("a",), {}), + ("get", (0,), {}), + # because "index" (and "rindex") fail intentionally + # if the string is not found, search only for empty string + ("index", ("",), {}), + ("join", (",",), {}), + ("ljust", (10,), {}), + ("match", ("a",), {}), + ("normalize", ("NFC",), {}), + ("pad", (10,), {}), + ("partition", (" ",), {"expand": False}), + ("partition", (" ",), {"expand": True}), + ("repeat", (3,), {}), + ("replace", ("a", "z"), {}), + ("rfind", ("a",), {}), + ("rindex", ("",), {}), + ("rjust", (10,), {}), + ("rpartition", (" ",), {"expand": False}), + ("rpartition", (" ",), {"expand": True}), + ("slice", (0, 1), {}), + ("slice_replace", (0, 1, "z"), {}), + ("split", (" ",), {"expand": False}), + ("split", (" ",), {"expand": True}), + 
("startswith", ("a",), {}), + # translating unicode points of "a" to "d" + ("translate", ({97: 100},), {}), + ("wrap", (2,), {}), + ("zfill", (10,), {}), +] + list( + zip( + [ + # methods without positional arguments: zip with empty tuple and empty dict + "capitalize", + "cat", + "get_dummies", + "isalnum", + "isalpha", + "isdecimal", + "isdigit", + "islower", + "isnumeric", + "isspace", + "istitle", + "isupper", + "len", + "lower", + "lstrip", + "partition", + "rpartition", + "rsplit", + "rstrip", + "slice", + "slice_replace", + "split", + "strip", + "swapcase", + "title", + "upper", + "casefold", + ], + [()] * 100, + [{}] * 100, + ) +) +ids, _, _ = zip(*_any_string_method) # use method name as fixture-id + + +# test that the above list captures all methods of StringMethods +missing_methods = { + f for f in dir(strings.StringMethods) if not f.startswith("_") +} - set(ids) +assert not missing_methods + + +@pytest.fixture(params=_any_string_method, ids=ids) +def any_string_method(request): + """ + Fixture for all public methods of `StringMethods` + + This fixture returns a tuple of the method name and sample arguments + necessary to call the method. + + Returns + ------- + method_name : str + The name of the method in `StringMethods` + args : tuple + Sample values for the positional arguments + kwargs : dict + Sample values for the keyword arguments + + Examples + -------- + >>> def test_something(any_string_method): + ... s = pd.Series(['a', 'b', np.nan, 'd']) + ... + ... method_name, args, kwargs = any_string_method + ... method = getattr(s.str, method_name) + ... # will not raise + ... method(*args, **kwargs) + """ + return request.param + + +# subset of the full set from pandas/conftest.py +_any_allowed_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), +] +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id + + +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): + """ + Fixture for all (inferred) dtypes allowed in StringMethods.__init__ + + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
pd.Series(values).str + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + + +class TestStringMethods: + def test_api(self): + + # GH 6106, GH 9322 + assert Series.str is strings.StringMethods + assert isinstance(Series([""]).str, strings.StringMethods) + + def test_api_mi_raises(self): + # GH 23679 + mi = MultiIndex.from_arrays([["a", "b", "c"]]) + msg = "Can only use .str accessor with Index, not MultiIndex" + with pytest.raises(AttributeError, match=msg): + mi.str + assert not hasattr(mi, "str") + + @pytest.mark.parametrize("dtype", [object, "category"]) + def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype): + # one instance of parametrized fixture + box = index_or_series + inferred_dtype, values = any_skipna_inferred_dtype + + t = box(values, dtype=dtype) # explicit dtype to avoid casting + + # TODO: get rid of these xfails + if dtype == "category" and inferred_dtype in ["period", "interval"]: + pytest.xfail( + reason="Conversion to numpy array fails because " + "the ._values-attribute is not a numpy array for " + "PeriodArray/IntervalArray; see GH 23553" + ) + + types_passing_constructor = [ + "string", + "unicode", + "empty", + "bytes", + "mixed", + "mixed-integer", + ] + if inferred_dtype in types_passing_constructor: + # GH 6106 + assert isinstance(t.str, strings.StringMethods) + else: + # GH 9184, GH 23011, GH 23163 + msg = "Can only use .str accessor with string values.*" + with pytest.raises(AttributeError, match=msg): + t.str + assert not hasattr(t, "str") + + @pytest.mark.parametrize("dtype", [object, "category"]) + def test_api_per_method( + self, + index_or_series, + dtype, + any_allowed_skipna_inferred_dtype, + any_string_method, + ): + # this test does not check correctness of the different methods, + # just that the methods work on the specified (inferred) dtypes, + # and raise on all others + box = index_or_series + + # one instance of each parametrized fixture + inferred_dtype, values = any_allowed_skipna_inferred_dtype + method_name, args, kwargs = any_string_method + + # TODO: get rid of these xfails + if ( + method_name in ["partition", "rpartition"] + and box == Index + and inferred_dtype == "empty" + ): + pytest.xfail(reason="Method cannot deal with empty Index") + if ( + method_name == "split" + and box == Index + and values.size == 0 + and kwargs.get("expand", None) is not None + ): + pytest.xfail(reason="Split fails on empty Series when expand=True") + if ( + method_name == "get_dummies" + and box == Index + and inferred_dtype == "empty" + and (dtype == object or values.size == 0) + ): + pytest.xfail(reason="Need to fortify get_dummies corner cases") + + t = box(values, dtype=dtype) # explicit dtype to avoid casting + method = getattr(t.str, method_name) + + bytes_allowed = method_name in ["decode", "get", "len", "slice"] + # as of v0.23.4, all methods except 'cat' are very lenient with the + # allowed data types, just returning NaN for entries that error. + # This could be changed with an 'errors'-kwarg to the `str`-accessor, + # see discussion in GH 13877 + mixed_allowed = method_name not in ["cat"] + + allowed_types = ( + ["string", "unicode", "empty"] + + ["bytes"] * bytes_allowed + + ["mixed", "mixed-integer"] * mixed_allowed + ) + + if inferred_dtype in allowed_types: + # xref GH 23555, GH 23556 + method(*args, **kwargs) # works! 
+ else: + # GH 23011, GH 23163 + msg = ( + f"Cannot use .str.{method_name} with values of " + f"inferred dtype {repr(inferred_dtype)}." + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + + def test_api_for_categorical(self, any_string_method): + # https://github.com/pandas-dev/pandas/issues/10661 + s = Series(list("aabb")) + s = s + " " + s + c = s.astype("category") + assert isinstance(c.str, strings.StringMethods) + + method_name, args, kwargs = any_string_method + + result = getattr(c.str, method_name)(*args, **kwargs) + expected = getattr(s.str, method_name)(*args, **kwargs) + + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + elif isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + # str.cat(others=None) returns string, for example + assert result == expected + + def test_iter(self): + # GH3638 + strs = "google", "wikimedia", "wikipedia", "wikitravel" + ds = Series(strs) + + with tm.assert_produces_warning(FutureWarning): + for s in ds.str: + # iter must yield a Series + assert isinstance(s, Series) + + # indices of each yielded Series should be equal to the index of + # the original Series + tm.assert_index_equal(s.index, ds.index) + + for el in s: + # each element of the series is either a basestring/str or nan + assert isinstance(el, str) or isna(el) + + # desired behavior is to iterate until everything would be nan on the + # next iter so make sure the last element of the iterator was 'l' in + # this case since 'wikitravel' is the longest string + assert s.dropna().values.item() == "l" + + def test_iter_empty(self): + ds = Series([], dtype=object) + + i, s = 100, 1 + + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass + + # nothing to iterate over, so nothing gets defined; values should remain + # unchanged + assert i == 100 + assert s == 1 + + def test_iter_single_element(self): + ds = Series(["a"]) + + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass + + assert not i + tm.assert_series_equal(ds, s) + + def test_iter_object_try_string(self): + ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)]) + + i, s = 100, "h" + + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass + + assert i == 100 + assert s == "h" + + @pytest.mark.parametrize("other", [None, Series, Index]) + def test_str_cat_name(self, index_or_series, other): + # GH 21053 + box = index_or_series + values = ["a", "b"] + if other: + other = other(values) + else: + other = values + result = box(values, name="name").str.cat(other, sep=",") + assert result.name == "name" + + def test_str_cat(self, index_or_series): + box = index_or_series + # test_cat above tests "str_cat" from ndarray; + # here testing "str.cat" from Series/Index to ndarray/list + s = box(["a", "a", "b", "b", "c", np.nan]) + + # single array + result = s.str.cat() + expected = "aabbc" + assert result == expected + + result = s.str.cat(na_rep="-") + expected = "aabbc-" + assert result == expected + + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" + assert result == expected + + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) + + # Series/Index with array + result = s.str.cat(t, na_rep="-") + assert_series_or_index_equal(result, expected) + + # Series/Index with list + result = s.str.cat(list(t), na_rep="-") + assert_series_or_index_equal(result,
expected) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(z.values) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(list(z)) + + def test_str_cat_raises_intuitive_error(self, index_or_series): + # GH 11334 + box = index_or_series + s = box(["a", "b", "c", "d"]) + message = "Did you mean to supply a `sep` keyword?" + with pytest.raises(ValueError, match=message): + s.str.cat("|") + with pytest.raises(ValueError, match=message): + s.str.cat(" ") + + @pytest.mark.parametrize("sep", ["", None]) + @pytest.mark.parametrize("dtype_target", ["object", "category"]) + @pytest.mark.parametrize("dtype_caller", ["object", "category"]) + def test_str_cat_categorical( + self, index_or_series, dtype_caller, dtype_target, sep + ): + box = index_or_series + + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) + s = s if box == Index else Series(s, index=s) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) + + expected = Index(["ab", "aa", "bb", "ac"]) + expected = expected if box == Index else Series(expected, index=s) + + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + assert_series_or_index_equal(result, expected) + + # Series/Index with Series having matching Index + t = Series(t.values, index=s) + result = s.str.cat(t, sep=sep) + assert_series_or_index_equal(result, expected) + + # Series/Index with Series.values + result = s.str.cat(t.values, sep=sep) + assert_series_or_index_equal(result, expected) + + # Series/Index with Series having different Index + t = Series(t.values, index=t.values) + expected = Index(["aa", "aa", "aa", "bb", "bb"]) + expected = ( + expected if box == Index else Series(expected, index=expected.str[:1]) + ) + + result = s.str.cat(t, sep=sep) + assert_series_or_index_equal(result, expected) + + # test integer/float dtypes (inferred by constructor) and mixed + @pytest.mark.parametrize( + "data", + [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]], + ids=["integers", "floats", "mixed"], + ) + # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] + @pytest.mark.parametrize( + "box", + [Series, Index, list, lambda x: np.array(x, dtype=object)], + ids=["Series", "Index", "list", "np.array"], + ) + def test_str_cat_wrong_dtype_raises(self, box, data): + # GH 22722 + s = Series(["a", "b", "c"]) + t = box(data) + + msg = "Concatenation requires list-likes containing only strings.*" + with pytest.raises(TypeError, match=msg): + # need to use outer and na_rep, as otherwise Index would not raise + s.str.cat(t, join="outer", na_rep="-") + + def test_str_cat_mixed_inputs(self, index_or_series): + box = index_or_series + s = Index(["a", "b", "c", "d"]) + s = s if box == Index else Series(s, index=s) + + t = Series(["A", "B", "C", "D"], index=s.values) + d = concat([t, Series(s, index=s)], axis=1) + + expected = Index(["aAa", "bBb", "cCc", "dDd"]) + expected = expected if box == Index else Series(expected.values, index=s.values) + + # Series/Index with DataFrame + result = s.str.cat(d) + assert_series_or_index_equal(result, expected) + + # Series/Index with two-dimensional ndarray + result = s.str.cat(d.values) + assert_series_or_index_equal(result, expected) + + # Series/Index with list of Series + result = s.str.cat([t, s]) + assert_series_or_index_equal(result, expected) + + # Series/Index with mixed list of Series/array + result = s.str.cat([t, s.values]) + 
assert_series_or_index_equal(result, expected) + + # Series/Index with list of Series; different indexes + t.index = ["b", "c", "d", "a"] + expected = box(["aDa", "bAb", "cBc", "dCd"]) + expected = expected if box == Index else Series(expected.values, index=s.values) + result = s.str.cat([t, s]) + assert_series_or_index_equal(result, expected) + + # Series/Index with mixed list; different index + result = s.str.cat([t, s.values]) + assert_series_or_index_equal(result, expected) + + # Series/Index with DataFrame; different indexes + d.index = ["b", "c", "d", "a"] + expected = box(["aDd", "bAa", "cBb", "dCc"]) + expected = expected if box == Index else Series(expected.values, index=s.values) + result = s.str.cat(d) + assert_series_or_index_equal(result, expected) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) + e = concat([z, z], axis=1) + + # two-dimensional ndarray + with pytest.raises(ValueError, match=rgx): + s.str.cat(e.values) + + # list of list-likes + with pytest.raises(ValueError, match=rgx): + s.str.cat([z.values, s.values]) + + # mixed list of Series/list-like + with pytest.raises(ValueError, match=rgx): + s.str.cat([z.values, s]) + + # errors for incorrect arguments in list-like + rgx = "others must be Series, Index, DataFrame,.*" + # make sure None/NaN do not crash checks in _get_series_list + u = Series(["a", np.nan, "c", None]) + + # mix of string and Series + with pytest.raises(TypeError, match=rgx): + s.str.cat([u, "u"]) + + # DataFrame in list + with pytest.raises(TypeError, match=rgx): + s.str.cat([u, d]) + + # 2-dim ndarray in list + with pytest.raises(TypeError, match=rgx): + s.str.cat([u, d.values]) + + # nested lists + with pytest.raises(TypeError, match=rgx): + s.str.cat([u, [u, d]]) + + # forbidden input type: set + # GH 23009 + with pytest.raises(TypeError, match=rgx): + s.str.cat(set(u)) + + # forbidden input type: set in list + # GH 23009 + with pytest.raises(TypeError, match=rgx): + s.str.cat([u, set(u)]) + + # other forbidden input type, e.g. 
int + with pytest.raises(TypeError, match=rgx): + s.str.cat(1) + + # nested list-likes + with pytest.raises(TypeError, match=rgx): + s.str.cat(iter([t.values, list(s)])) + + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) + def test_str_cat_align_indexed(self, index_or_series, join): + # https://github.com/pandas-dev/pandas/issues/18657 + box = index_or_series + + s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) + t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) + sa, ta = s.align(t, join=join) + # result after manual alignment of inputs + expected = sa.str.cat(ta, na_rep="-") + + if box == Index: + s = Index(s) + sa = Index(sa) + expected = Index(expected) + + result = s.str.cat(t, join=join, na_rep="-") + assert_series_or_index_equal(result, expected) + + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) + def test_str_cat_align_mixed_inputs(self, join): + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) + d = concat([t, t], axis=1) + + expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) + expected = expected_outer.loc[s.index.join(t.index, how=join)] + + # list of Series + result = s.str.cat([t, t], join=join, na_rep="-") + tm.assert_series_equal(result, expected) + + # DataFrame + result = s.str.cat(d, join=join, na_rep="-") + tm.assert_series_equal(result, expected) + + # mixed list of indexed/unindexed + u = np.array(["A", "B", "C", "D"]) + expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) + # joint index of rhs [t, u]; u will be forced have index of s + rhs_idx = t.index & s.index if join == "inner" else t.index | s.index + + expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] + result = s.str.cat([t, u], join=join, na_rep="-") + tm.assert_series_equal(result, expected) + + with pytest.raises(TypeError, match="others must be Series,.*"): + # nested lists are forbidden + s.str.cat([t, list(u)], join=join) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]).values + + # unindexed object of wrong length + with pytest.raises(ValueError, match=rgx): + s.str.cat(z, join=join) + + # unindexed object of wrong length in list + with pytest.raises(ValueError, match=rgx): + s.str.cat([t, z], join=join) + + index_or_series2 = [Series, Index] # type: ignore + # List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" + # See GH#29725 + + @pytest.mark.parametrize("other", index_or_series2) + def test_str_cat_all_na(self, index_or_series, other): + # GH 24044 + box = index_or_series + + # check that all NaNs in caller / target work + s = Index(["a", "b", "c", "d"]) + s = s if box == Index else Series(s, index=s) + t = other([np.nan] * 4, dtype=object) + # add index of s for alignment + t = t if other == Index else Series(t, index=s) + + # all-NA target + if box == Series: + expected = Series([np.nan] * 4, index=s.index, dtype=object) + else: # box == Index + expected = Index([np.nan] * 4, dtype=object) + result = s.str.cat(t, join="left") + assert_series_or_index_equal(result, expected) + + # all-NA caller (only for Series) + if other == Series: + expected = Series([np.nan] * 4, dtype=object, index=t.index) + result = t.str.cat(s, join="left") + tm.assert_series_equal(result, expected) + + def test_str_cat_special_cases(self): + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) + + # iterator of elements with different 
types + expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"]) + result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-") + tm.assert_series_equal(result, expected) + + # right-align with different indexes in others + expected = Series(["aa-", "d-d"], index=[0, 3]) + result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-") + tm.assert_series_equal(result, expected) + + def test_cat_on_filtered_index(self): + df = DataFrame( + index=MultiIndex.from_product( + [[2011, 2012], [1, 2, 3]], names=["year", "month"] + ) + ) + + df = df.reset_index() + df = df[df.month > 1] + + str_year = df.year.astype("str") + str_month = df.month.astype("str") + str_both = str_year.str.cat(str_month, sep=" ") + + assert str_both.loc[1] == "2011 2" + + str_multiple = str_year.str.cat([str_month, str_month], sep=" ") + + assert str_multiple.loc[1] == "2011 2 2" + + def test_count(self): + values = np.array( + ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ + ) + + result = strings.str_count(values, "f[o]+") + exp = np.array([1, 2, np.nan, 4]) + tm.assert_numpy_array_equal(result, exp) + + result = Series(values).str.count("f[o]+") + exp = Series([1, 2, np.nan, 4]) + assert isinstance(result, Series) + tm.assert_series_equal(result, exp) + + # mixed + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) + rs = strings.str_count(mixed, "a") + xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) + tm.assert_numpy_array_equal(rs, xp) + + rs = Series(mixed).str.count("a") + xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + def test_contains(self): + values = np.array( + ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ + ) + pat = "mmm[_]+" + + result = strings.str_contains(values, pat) + expected = np.array([False, np.nan, True, True, False], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) + + result = strings.str_contains(values, pat, regex=False) + expected = np.array([False, np.nan, False, False, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) + result = strings.str_contains(values, pat) + expected = np.array([False, False, True, True]) + assert result.dtype == np.bool_ + tm.assert_numpy_array_equal(result, expected) + + # case insensitive using regex + values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) + result = strings.str_contains(values, "FOO|mmm", case=False) + expected = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + # case insensitive without regex + result = strings.str_contains(values, "foo", regex=False, case=False) + expected = np.array([True, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + # mixed + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) + rs = strings.str_contains(mixed, "o") + xp = np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) + tm.assert_numpy_array_equal(rs, xp) + + rs = Series(mixed).str.contains("o") + xp = Series( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] + ) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + # unicode + values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) + 
pat = "mmm[_]+" + + result = strings.str_contains(values, pat) + expected = np.array([False, np.nan, True, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) + + result = strings.str_contains(values, pat, na=False) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) + result = strings.str_contains(values, pat) + expected = np.array([False, False, True, True]) + assert result.dtype == np.bool_ + tm.assert_numpy_array_equal(result, expected) + + def test_contains_for_object_category(self): + # gh 22158 + + # na for category + values = Series(["a", "b", "c", "a", np.nan], dtype="category") + result = values.str.contains("a", na=True) + expected = Series([True, False, False, True, True]) + tm.assert_series_equal(result, expected) + + result = values.str.contains("a", na=False) + expected = Series([True, False, False, True, False]) + tm.assert_series_equal(result, expected) + + # na for objects + values = Series(["a", "b", "c", "a", np.nan]) + result = values.str.contains("a", na=True) + expected = Series([True, False, False, True, True]) + tm.assert_series_equal(result, expected) + + result = values.str.contains("a", na=False) + expected = Series([True, False, False, True, False]) + tm.assert_series_equal(result, expected) + + def test_startswith(self): + values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) + + result = values.str.startswith("foo") + exp = Series([False, np.nan, True, False, False, np.nan, True]) + tm.assert_series_equal(result, exp) + + result = values.str.startswith("foo", na=True) + tm.assert_series_equal(result, exp.fillna(True).astype(bool)) + + # mixed + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=np.object_, + ) + rs = strings.str_startswith(mixed, "f") + xp = np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) + tm.assert_numpy_array_equal(rs, xp) + + rs = Series(mixed).str.startswith("f") + assert isinstance(rs, Series) + xp = Series( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] + ) + tm.assert_series_equal(rs, xp) + + def test_endswith(self): + values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) + + result = values.str.endswith("foo") + exp = Series([False, np.nan, False, False, True, np.nan, True]) + tm.assert_series_equal(result, exp) + + result = values.str.endswith("foo", na=False) + tm.assert_series_equal(result, exp.fillna(False).astype(bool)) + + # mixed + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) + rs = strings.str_endswith(mixed, "f") + xp = np.array( + [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], + dtype=np.object_, + ) + tm.assert_numpy_array_equal(rs, xp) + + rs = Series(mixed).str.endswith("f") + xp = Series( + [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] + ) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + def test_title(self): + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) + + result = values.str.title() + exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] + ) + mixed = mixed.str.title() + exp = Series( + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", 
np.nan, np.nan, np.nan] + ) + tm.assert_almost_equal(mixed, exp) + + def test_lower_upper(self): + values = Series(["om", np.nan, "nom", "nom"]) + + result = values.str.upper() + exp = Series(["OM", np.nan, "NOM", "NOM"]) + tm.assert_series_equal(result, exp) + + result = result.str.lower() + tm.assert_series_equal(result, values) + + # mixed + mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) + mixed = mixed.str.upper() + rs = Series(mixed).str.lower() + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + def test_capitalize(self): + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) + result = values.str.capitalize() + exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] + ) + mixed = mixed.str.capitalize() + exp = Series( + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] + ) + tm.assert_almost_equal(mixed, exp) + + def test_swapcase(self): + values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) + result = values.str.swapcase() + exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0] + ) + mixed = mixed.str.swapcase() + exp = Series( + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan] + ) + tm.assert_almost_equal(mixed, exp) + + def test_casemethods(self): + values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"] + s = Series(values) + assert s.str.lower().tolist() == [v.lower() for v in values] + assert s.str.upper().tolist() == [v.upper() for v in values] + assert s.str.title().tolist() == [v.title() for v in values] + assert s.str.capitalize().tolist() == [v.capitalize() for v in values] + assert s.str.swapcase().tolist() == [v.swapcase() for v in values] + + def test_replace(self): + values = Series(["fooBAD__barBAD", np.nan]) + + result = values.str.replace("BAD[_]*", "") + exp = Series(["foobar", np.nan]) + tm.assert_series_equal(result, exp) + + result = values.str.replace("BAD[_]*", "", n=1) + exp = Series(["foobarBAD", np.nan]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) + + rs = Series(mixed).str.replace("BAD[_]*", "") + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # flags + unicode + values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) + exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) + result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) + tm.assert_series_equal(result, exp) + + # GH 13438 + msg = "repl must be a string or callable" + for klass in (Series, Index): + for repl in (None, 3, {"a": "b"}): + for data in (["a", "b", None], ["a", "b", "c", "ad"]): + values = klass(data) + with pytest.raises(TypeError, match=msg): + values.str.replace("a", repl) + + def test_replace_callable(self): + # GH 15055 + values = Series(["fooBAD__barBAD", np.nan]) + + # test with callable + repl = lambda m: m.group(0).swapcase() + result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) + exp = Series(["foObaD__baRbaD", np.nan]) + tm.assert_series_equal(result, exp) + + # test with wrong number of arguments, raising an error + p_err 
= ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + repl = lambda: None + with pytest.raises(TypeError, match=p_err): + values.str.replace("a", repl) + + repl = lambda m, x: None + with pytest.raises(TypeError, match=p_err): + values.str.replace("a", repl) + + repl = lambda m, x, y=None: None + with pytest.raises(TypeError, match=p_err): + values.str.replace("a", repl) + + # test regex named groups + values = Series(["Foo Bar Baz", np.nan]) + pat = r"(?P\w+) (?P\w+) (?P\w+)" + repl = lambda m: m.group("middle").swapcase() + result = values.str.replace(pat, repl) + exp = Series(["bAR", np.nan]) + tm.assert_series_equal(result, exp) + + def test_replace_compiled_regex(self): + # GH 15446 + values = Series(["fooBAD__barBAD", np.nan]) + + # test with compiled regex + pat = re.compile(r"BAD[_]*") + result = values.str.replace(pat, "") + exp = Series(["foobar", np.nan]) + tm.assert_series_equal(result, exp) + + result = values.str.replace(pat, "", n=1) + exp = Series(["foobarBAD", np.nan]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) + + rs = Series(mixed).str.replace(pat, "") + xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # flags + unicode + values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) + exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) + pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) + result = values.str.replace(pat, ", ") + tm.assert_series_equal(result, exp) + + # case and flags provided to str.replace will have no effect + # and will produce warnings + values = Series(["fooBAD__barBAD__bad", np.nan]) + pat = re.compile(r"BAD[_]*") + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", flags=re.IGNORECASE) + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=False) + + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=True) + + # test with callable + values = Series(["fooBAD__barBAD", np.nan]) + repl = lambda m: m.group(0).swapcase() + pat = re.compile("[a-z][A-Z]{2}") + result = values.str.replace(pat, repl, n=2) + exp = Series(["foObaD__baRbaD", np.nan]) + tm.assert_series_equal(result, exp) + + def test_replace_literal(self): + # GH16808 literal replace (regex=False vs regex=True) + values = Series(["f.o", "foo", np.nan]) + exp = Series(["bao", "bao", np.nan]) + result = values.str.replace("f.", "ba") + tm.assert_series_equal(result, exp) + + exp = Series(["bao", "foo", np.nan]) + result = values.str.replace("f.", "ba", regex=False) + tm.assert_series_equal(result, exp) + + # Cannot do a literal replace if given a callable repl or compiled + # pattern + callable_repl = lambda m: m.group(0).swapcase() + compiled_pat = re.compile("[a-z][A-Z]{2}") + + msg = "Cannot use a callable replacement when regex=False" + with pytest.raises(ValueError, match=msg): + values.str.replace("abc", callable_repl, regex=False) + + msg = "Cannot use a compiled regex as replacement pattern with regex=False" + with pytest.raises(ValueError, match=msg): + values.str.replace(compiled_pat, "", regex=False) + + def test_repeat(self): + values = Series(["a", "b", np.nan, "c", np.nan, "d"]) + + result = values.str.repeat(3) + exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, 
"ddd"]) + tm.assert_series_equal(result, exp) + + result = values.str.repeat([1, 2, 3, 4, 5, 6]) + exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) + + rs = Series(mixed).str.repeat(3) + xp = Series( + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] + ) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + def test_match(self): + # New match behavior introduced in 0.13 + values = Series(["fooBAD__barBAD", np.nan, "foo"]) + result = values.str.match(".*(BAD[_]+).*(BAD)") + exp = Series([True, np.nan, False]) + tm.assert_series_equal(result, exp) + + values = Series(["fooBAD__barBAD", np.nan, "foo"]) + result = values.str.match(".*BAD[_]+.*BAD") + exp = Series([True, np.nan, False]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + [ + "aBAD_BAD", + np.nan, + "BAD_b_BAD", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] + ) + rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") + xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) + assert isinstance(rs, Series) + tm.assert_series_equal(rs, xp) + + # na GH #6609 + res = Series(["a", 0, np.nan]).str.match("a", na=False) + exp = Series([True, False, False]) + tm.assert_series_equal(exp, res) + res = Series(["a", 0, np.nan]).str.match("a") + exp = Series([True, np.nan, np.nan]) + tm.assert_series_equal(exp, res) + + def test_extract_expand_None(self): + values = Series(["fooBAD__barBAD", np.nan, "foo"]) + with pytest.raises(ValueError, match="expand must be True or False"): + values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) + + def test_extract_expand_unspecified(self): + values = Series(["fooBAD__barBAD", np.nan, "foo"]) + result_unspecified = values.str.extract(".*(BAD[_]+).*") + assert isinstance(result_unspecified, DataFrame) + result_true = values.str.extract(".*(BAD[_]+).*", expand=True) + tm.assert_frame_equal(result_unspecified, result_true) + + def test_extract_expand_False(self): + # Contains tests like those in test_match and some others. 
+        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+        er = [np.nan, np.nan]  # empty row
+
+        result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
+        exp = DataFrame([["BAD__", "BAD"], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # mixed
+        mixed = Series(
+            [
+                "aBAD_BAD",
+                np.nan,
+                "BAD_b_BAD",
+                True,
+                datetime.today(),
+                "foo",
+                None,
+                1,
+                2.0,
+            ]
+        )
+
+        rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False)
+        exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+        tm.assert_frame_equal(rs, exp)
+
+        # unicode
+        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+
+        result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
+        exp = DataFrame([["BAD__", "BAD"], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # GH9980
+        # Index only works with one regex group since
+        # multi-group would expand to a frame
+        idx = Index(["A1", "A2", "A3", "A4", "B5"])
+        with pytest.raises(ValueError, match="supported"):
+            idx.str.extract("([AB])([123])", expand=False)
+
+        # these should work for both Series and Index
+        for klass in [Series, Index]:
+            # no groups
+            s_or_idx = klass(["A1", "B2", "C3"])
+            msg = "pattern contains no capture groups"
+            with pytest.raises(ValueError, match=msg):
+                s_or_idx.str.extract("[ABC][123]", expand=False)
+
+            # only non-capturing groups
+            with pytest.raises(ValueError, match=msg):
+                s_or_idx.str.extract("(?:[AB]).*", expand=False)
+
+            # single group renames series/index properly
+            s_or_idx = klass(["A1", "A2"])
+            result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
+            assert result.name == "uno"
+
+            exp = klass(["A", "A"], name="uno")
+            if klass == Series:
+                tm.assert_series_equal(result, exp)
+            else:
+                tm.assert_index_equal(result, exp)
+
+        s = Series(["A1", "B2", "C3"])
+        # one group, no matches
+        result = s.str.extract("(_)", expand=False)
+        exp = Series([np.nan, np.nan, np.nan], dtype=object)
+        tm.assert_series_equal(result, exp)
+
+        # two groups, no matches
+        result = s.str.extract("(_)(_)", expand=False)
+        exp = DataFrame(
+            [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object
+        )
+        tm.assert_frame_equal(result, exp)
+
+        # one group, some matches
+        result = s.str.extract("([AB])[123]", expand=False)
+        exp = Series(["A", "B", np.nan])
+        tm.assert_series_equal(result, exp)
+
+        # two groups, some matches
+        result = s.str.extract("([AB])([123])", expand=False)
+        exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
+        tm.assert_frame_equal(result, exp)
+
+        # one named group
+        result = s.str.extract("(?P<letter>[AB])", expand=False)
+        exp = Series(["A", "B", np.nan], name="letter")
+        tm.assert_series_equal(result, exp)
+
+        # two named groups
+        result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
+        exp = DataFrame(
+            [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=["letter", "number"]
+        )
+        tm.assert_frame_equal(result, exp)
+
+        # mix named and unnamed groups
+        result = s.str.extract("([AB])(?P<number>[123])", expand=False)
+        exp = DataFrame(
+            [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=[0, "number"]
+        )
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group, one non-capturing group
+        result = s.str.extract("([AB])(?:[123])", expand=False)
+        exp = Series(["A", "B", np.nan])
+        tm.assert_series_equal(result, exp)
+
+        # two normal groups, one non-capturing group
+        result = Series(["A11", "B22", "C33"]).str.extract(
+            "([AB])([123])(?:[123])", expand=False
+        )
+        exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]])
+        tm.assert_frame_equal(result, exp)
+
+        # one optional group followed by one normal group
+        result = Series(["A1", "B2", "3"]).str.extract(
+            "(?P<letter>[AB])?(?P<number>[123])", expand=False
+        )
+        exp = DataFrame(
+            [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"]
+        )
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group followed by one optional group
+        result = Series(["A1", "B2", "C"]).str.extract(
+            "(?P<letter>[ABC])(?P<number>[123])?", expand=False
+        )
+        exp = DataFrame(
+            [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"]
+        )
+        tm.assert_frame_equal(result, exp)
+
+        # GH6348
+        # not passing index to the extractor
+        def check_index(index):
+            data = ["A1", "B2", "C"]
+            index = index[: len(data)]
+            s = Series(data, index=index)
+            result = s.str.extract(r"(\d)", expand=False)
+            exp = Series(["1", "2", np.nan], index=index)
+            tm.assert_series_equal(result, exp)
+
+            result = Series(data, index=index).str.extract(
+                r"(?P<letter>\D)(?P<number>\d)?", expand=False
+            )
+            e_list = [["A", "1"], ["B", "2"], ["C", np.nan]]
+            exp = DataFrame(e_list, columns=["letter", "number"], index=index)
+            tm.assert_frame_equal(result, exp)
+
+        i_funs = [
+            tm.makeStringIndex,
+            tm.makeUnicodeIndex,
+            tm.makeIntIndex,
+            tm.makeDateIndex,
+            tm.makePeriodIndex,
+            tm.makeRangeIndex,
+        ]
+        for index in i_funs:
+            check_index(index())
+
+        # single_series_name_is_preserved.
+        s = Series(["a3", "b3", "c2"], name="bob")
+        r = s.str.extract(r"(?P<sue>[a-z])", expand=False)
+        e = Series(["a", "b", "c"], name="sue")
+        tm.assert_series_equal(r, e)
+        assert r.name == e.name
+
+    def test_extract_expand_True(self):
+        # Contains tests like those in test_match and some others.
+        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+        er = [np.nan, np.nan]  # empty row
+
+        result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
+        exp = DataFrame([["BAD__", "BAD"], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # mixed
+        mixed = Series(
+            [
+                "aBAD_BAD",
+                np.nan,
+                "BAD_b_BAD",
+                True,
+                datetime.today(),
+                "foo",
+                None,
+                1,
+                2.0,
+            ]
+        )
+
+        rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True)
+        exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+        tm.assert_frame_equal(rs, exp)
+
+        # these should work for both Series and Index
+        for klass in [Series, Index]:
+            # no groups
+            s_or_idx = klass(["A1", "B2", "C3"])
+            msg = "pattern contains no capture groups"
+            with pytest.raises(ValueError, match=msg):
+                s_or_idx.str.extract("[ABC][123]", expand=True)
+
+            # only non-capturing groups
+            with pytest.raises(ValueError, match=msg):
+                s_or_idx.str.extract("(?:[AB]).*", expand=True)
+
+            # single group renames series/index properly
+            s_or_idx = klass(["A1", "A2"])
+            result_df = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
+            assert isinstance(result_df, DataFrame)
+            result_series = result_df["uno"]
+            tm.assert_series_equal(result_series, Series(["A", "A"], name="uno"))
+
+    def test_extract_series(self):
+        # extract should give the same result whether or not the
+        # series has a name.
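+        # With expand=True every extract below returns a DataFrame, even for
+        # a single capture group.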
+ for series_name in None, "series_name": + s = Series(["A1", "B2", "C3"], name=series_name) + # one group, no matches + result = s.str.extract("(_)", expand=True) + exp = DataFrame([np.nan, np.nan, np.nan], dtype=object) + tm.assert_frame_equal(result, exp) + + # two groups, no matches + result = s.str.extract("(_)(_)", expand=True) + exp = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object + ) + tm.assert_frame_equal(result, exp) + + # one group, some matches + result = s.str.extract("([AB])[123]", expand=True) + exp = DataFrame(["A", "B", np.nan]) + tm.assert_frame_equal(result, exp) + + # two groups, some matches + result = s.str.extract("([AB])([123])", expand=True) + exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) + tm.assert_frame_equal(result, exp) + + # one named group + result = s.str.extract("(?P[AB])", expand=True) + exp = DataFrame({"letter": ["A", "B", np.nan]}) + tm.assert_frame_equal(result, exp) + + # two named groups + result = s.str.extract("(?P[AB])(?P[123])", expand=True) + e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]] + exp = DataFrame(e_list, columns=["letter", "number"]) + tm.assert_frame_equal(result, exp) + + # mix named and unnamed groups + result = s.str.extract("([AB])(?P[123])", expand=True) + exp = DataFrame(e_list, columns=[0, "number"]) + tm.assert_frame_equal(result, exp) + + # one normal group, one non-capturing group + result = s.str.extract("([AB])(?:[123])", expand=True) + exp = DataFrame(["A", "B", np.nan]) + tm.assert_frame_equal(result, exp) + + def test_extract_optional_groups(self): + + # two normal groups, one non-capturing group + result = Series(["A11", "B22", "C33"]).str.extract( + "([AB])([123])(?:[123])", expand=True + ) + exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) + tm.assert_frame_equal(result, exp) + + # one optional group followed by one normal group + result = Series(["A1", "B2", "3"]).str.extract( + "(?P[AB])?(?P[123])", expand=True + ) + e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]] + exp = DataFrame(e_list, columns=["letter", "number"]) + tm.assert_frame_equal(result, exp) + + # one normal group followed by one optional group + result = Series(["A1", "B2", "C"]).str.extract( + "(?P[ABC])(?P[123])?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] + exp = DataFrame(e_list, columns=["letter", "number"]) + tm.assert_frame_equal(result, exp) + + # GH6348 + # not passing index to the extractor + def check_index(index): + data = ["A1", "B2", "C"] + index = index[: len(data)] + result = Series(data, index=index).str.extract(r"(\d)", expand=True) + exp = DataFrame(["1", "2", np.nan], index=index) + tm.assert_frame_equal(result, exp) + + result = Series(data, index=index).str.extract( + r"(?P\D)(?P\d)?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] + exp = DataFrame(e_list, columns=["letter", "number"], index=index) + tm.assert_frame_equal(result, exp) + + i_funs = [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + tm.makeRangeIndex, + ] + for index in i_funs: + check_index(index()) + + def test_extract_single_group_returns_frame(self): + # GH11386 extract should always return DataFrame, even when + # there is only one group. Prior to v0.18.0, extract returned + # Series when there was only one group in the regex. 
+ s = Series(["a3", "b3", "c2"], name="series_name") + r = s.str.extract(r"(?P[a-z])", expand=True) + e = DataFrame({"letter": ["a", "b", "c"]}) + tm.assert_frame_equal(r, e) + + def test_extractall(self): + subject_list = [ + "dave@google.com", + "tdhock5@gmail.com", + "maudelaperriere@gmail.com", + "rob@gmail.com some text steve@gmail.com", + "a@b.com some text c@d.com and e@f.com", + np.nan, + "", + ] + expected_tuples = [ + ("dave", "google", "com"), + ("tdhock5", "gmail", "com"), + ("maudelaperriere", "gmail", "com"), + ("rob", "gmail", "com"), + ("steve", "gmail", "com"), + ("a", "b", "com"), + ("c", "d", "com"), + ("e", "f", "com"), + ] + named_pattern = r""" + (?P[a-z0-9]+) + @ + (?P[a-z]+) + \. + (?P[a-z]{2,4}) + """ + expected_columns = ["user", "domain", "tld"] + S = Series(subject_list) + # extractall should return a DataFrame with one row for each + # match, indexed by the subject from which the match came. + expected_index = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], + names=(None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) + computed_df = S.str.extractall(named_pattern, re.VERBOSE) + tm.assert_frame_equal(computed_df, expected_df) + + # The index of the input Series should be used to construct + # the index of the output DataFrame: + series_index = MultiIndex.from_tuples( + [ + ("single", "Dave"), + ("single", "Toby"), + ("single", "Maude"), + ("multiple", "robAndSteve"), + ("multiple", "abcdef"), + ("none", "missing"), + ("none", "empty"), + ] + ) + Si = Series(subject_list, series_index) + expected_index = MultiIndex.from_tuples( + [ + ("single", "Dave", 0), + ("single", "Toby", 0), + ("single", "Maude", 0), + ("multiple", "robAndSteve", 0), + ("multiple", "robAndSteve", 1), + ("multiple", "abcdef", 0), + ("multiple", "abcdef", 1), + ("multiple", "abcdef", 2), + ], + names=(None, None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) + computed_df = Si.str.extractall(named_pattern, re.VERBOSE) + tm.assert_frame_equal(computed_df, expected_df) + + # MultiIndexed subject with names. + Sn = Series(subject_list, series_index) + Sn.index.names = ("matches", "description") + expected_index.names = ("matches", "description", "match") + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) + computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) + tm.assert_frame_equal(computed_df, expected_df) + + # optional groups. + subject_list = ["", "A1", "32"] + named_pattern = "(?P[AB])?(?P[123])" + computed_df = Series(subject_list).str.extractall(named_pattern) + expected_index = MultiIndex.from_tuples( + [(1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + expected_df = DataFrame( + [("A", "1"), (np.nan, "3"), (np.nan, "2")], + expected_index, + columns=["letter", "number"], + ) + tm.assert_frame_equal(computed_df, expected_df) + + # only one of two groups has a name. + pattern = "([AB])?(?P[123])" + computed_df = Series(subject_list).str.extractall(pattern) + expected_df = DataFrame( + [("A", "1"), (np.nan, "3"), (np.nan, "2")], + expected_index, + columns=[0, "number"], + ) + tm.assert_frame_equal(computed_df, expected_df) + + def test_extractall_single_group(self): + # extractall(one named group) returns DataFrame with one named + # column. 
+ s = Series(["a3", "b3", "d4c2"], name="series_name") + r = s.str.extractall(r"(?P[a-z])") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame({"letter": ["a", "b", "d", "c"]}, i) + tm.assert_frame_equal(r, e) + + # extractall(one un-named group) returns DataFrame with one + # un-named column. + r = s.str.extractall(r"([a-z])") + e = DataFrame(["a", "b", "d", "c"], i) + tm.assert_frame_equal(r, e) + + def test_extractall_single_group_with_quantifier(self): + # extractall(one un-named group with quantifier) returns + # DataFrame with one un-named column (GH13382). + s = Series(["ab3", "abc3", "d4cd2"], name="series_name") + r = s.str.extractall(r"([a-z]+)") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame(["ab", "abc", "d", "cd"], i) + tm.assert_frame_equal(r, e) + + @pytest.mark.parametrize( + "data, names", + [ + ([], (None,)), + ([], ("i1",)), + ([], (None, "i2")), + ([], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None,)), + (["a3", "b3", "d4c2"], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None, "i2")), + (["a3", "b3", "d4c2"], ("i1", "i2")), + ], + ) + def test_extractall_no_matches(self, data, names): + # GH19075 extractall with no matches should return a valid MultiIndex + n = len(data) + if len(names) == 1: + i = Index(range(n), name=names[0]) + else: + a = (tuple([i] * (n - 1)) for i in range(n)) + i = MultiIndex.from_tuples(a, names=names) + s = Series(data, name="series_name", index=i, dtype="object") + ei = MultiIndex.from_tuples([], names=(names + ("match",))) + + # one un-named group. + r = s.str.extractall("(z)") + e = DataFrame(columns=[0], index=ei) + tm.assert_frame_equal(r, e) + + # two un-named groups. + r = s.str.extractall("(z)(z)") + e = DataFrame(columns=[0, 1], index=ei) + tm.assert_frame_equal(r, e) + + # one named group. + r = s.str.extractall("(?Pz)") + e = DataFrame(columns=["first"], index=ei) + tm.assert_frame_equal(r, e) + + # two named groups. + r = s.str.extractall("(?Pz)(?Pz)") + e = DataFrame(columns=["first", "second"], index=ei) + tm.assert_frame_equal(r, e) + + # one named, one un-named. + r = s.str.extractall("(z)(?Pz)") + e = DataFrame(columns=[0, "second"], index=ei) + tm.assert_frame_equal(r, e) + + def test_extractall_stringindex(self): + s = Series(["a1a2", "b1", "c1"], name="xxx") + res = s.str.extractall(r"[ab](?P\d)") + exp_idx = MultiIndex.from_tuples( + [(0, 0), (0, 1), (1, 0)], names=[None, "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + + # index should return the same result as the default index without name + # thus index.name doesn't affect to the result + for idx in [ + Index(["a1a2", "b1", "c1"]), + Index(["a1a2", "b1", "c1"], name="xxx"), + ]: + + res = idx.str.extractall(r"[ab](?P\d)") + tm.assert_frame_equal(res, exp) + + s = Series( + ["a1a2", "b1", "c1"], + name="s_name", + index=Index(["XX", "yy", "zz"], name="idx_name"), + ) + res = s.str.extractall(r"[ab](?P\d)") + exp_idx = MultiIndex.from_tuples( + [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + + def test_extractall_errors(self): + # Does not make sense to use extractall with a regex that has + # no capture groups. 
(it returns DataFrame with one column for + # each capture group) + s = Series(["a3", "b3", "d4c2"], name="series_name") + with pytest.raises(ValueError, match="no capture groups"): + s.str.extractall(r"[a-z]") + + def test_extract_index_one_two_groups(self): + s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name") + r = s.index.str.extract(r"([A-Z])", expand=True) + e = DataFrame(["A", "B", "D"]) + tm.assert_frame_equal(r, e) + + # Prior to v0.18.0, index.str.extract(regex with one group) + # returned Index. With more than one group, extract raised an + # error (GH9980). Now extract always returns DataFrame. + r = s.index.str.extract(r"(?P[A-Z])(?P[0-9])", expand=True) + e_list = [("A", "3"), ("B", "3"), ("D", "4")] + e = DataFrame(e_list, columns=["letter", "digit"]) + tm.assert_frame_equal(r, e) + + def test_extractall_same_as_extract(self): + s = Series(["a3", "b3", "c2"], name="series_name") + + pattern_two_noname = r"([a-z])([0-9])" + extract_two_noname = s.str.extract(pattern_two_noname, expand=True) + has_multi_index = s.str.extractall(pattern_two_noname) + no_multi_index = has_multi_index.xs(0, level="match") + tm.assert_frame_equal(extract_two_noname, no_multi_index) + + pattern_two_named = r"(?P[a-z])(?P[0-9])" + extract_two_named = s.str.extract(pattern_two_named, expand=True) + has_multi_index = s.str.extractall(pattern_two_named) + no_multi_index = has_multi_index.xs(0, level="match") + tm.assert_frame_equal(extract_two_named, no_multi_index) + + pattern_one_named = r"(?P[a-z])" + extract_one_named = s.str.extract(pattern_one_named, expand=True) + has_multi_index = s.str.extractall(pattern_one_named) + no_multi_index = has_multi_index.xs(0, level="match") + tm.assert_frame_equal(extract_one_named, no_multi_index) + + pattern_one_noname = r"([a-z])" + extract_one_noname = s.str.extract(pattern_one_noname, expand=True) + has_multi_index = s.str.extractall(pattern_one_noname) + no_multi_index = has_multi_index.xs(0, level="match") + tm.assert_frame_equal(extract_one_noname, no_multi_index) + + def test_extractall_same_as_extract_subject_index(self): + # same as above tests, but s has an MultiIndex. 
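+        # Taking .xs(0, level="match") keeps only the first match per row,
+        # which should reproduce the plain extract result.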
+ i = MultiIndex.from_tuples( + [("A", "first"), ("B", "second"), ("C", "third")], + names=("capital", "ordinal"), + ) + s = Series(["a3", "b3", "c2"], i, name="series_name") + + pattern_two_noname = r"([a-z])([0-9])" + extract_two_noname = s.str.extract(pattern_two_noname, expand=True) + has_match_index = s.str.extractall(pattern_two_noname) + no_match_index = has_match_index.xs(0, level="match") + tm.assert_frame_equal(extract_two_noname, no_match_index) + + pattern_two_named = r"(?P[a-z])(?P[0-9])" + extract_two_named = s.str.extract(pattern_two_named, expand=True) + has_match_index = s.str.extractall(pattern_two_named) + no_match_index = has_match_index.xs(0, level="match") + tm.assert_frame_equal(extract_two_named, no_match_index) + + pattern_one_named = r"(?P[a-z])" + extract_one_named = s.str.extract(pattern_one_named, expand=True) + has_match_index = s.str.extractall(pattern_one_named) + no_match_index = has_match_index.xs(0, level="match") + tm.assert_frame_equal(extract_one_named, no_match_index) + + pattern_one_noname = r"([a-z])" + extract_one_noname = s.str.extract(pattern_one_noname, expand=True) + has_match_index = s.str.extractall(pattern_one_noname) + no_match_index = has_match_index.xs(0, level="match") + tm.assert_frame_equal(extract_one_noname, no_match_index) + + def test_empty_str_methods(self): + empty_str = empty = Series(dtype=object) + empty_int = Series(dtype="int64") + empty_bool = Series(dtype=bool) + empty_bytes = Series(dtype=object) + + # GH7241 + # (extract) on empty series + + tm.assert_series_equal(empty_str, empty.str.cat(empty)) + assert "" == empty.str.cat() + tm.assert_series_equal(empty_str, empty.str.title()) + tm.assert_series_equal(empty_int, empty.str.count("a")) + tm.assert_series_equal(empty_bool, empty.str.contains("a")) + tm.assert_series_equal(empty_bool, empty.str.startswith("a")) + tm.assert_series_equal(empty_bool, empty.str.endswith("a")) + tm.assert_series_equal(empty_str, empty.str.lower()) + tm.assert_series_equal(empty_str, empty.str.upper()) + tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) + tm.assert_series_equal(empty_str, empty.str.repeat(3)) + tm.assert_series_equal(empty_bool, empty.str.match("^a")) + tm.assert_frame_equal( + DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True) + ) + tm.assert_frame_equal( + DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True) + ) + tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) + tm.assert_frame_equal( + DataFrame(columns=[0, 1], dtype=str), + empty.str.extract("()()", expand=False), + ) + tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) + tm.assert_series_equal(empty_str, empty_str.str.join("")) + tm.assert_series_equal(empty_int, empty.str.len()) + tm.assert_series_equal(empty_str, empty_str.str.findall("a")) + tm.assert_series_equal(empty_int, empty.str.find("a")) + tm.assert_series_equal(empty_int, empty.str.rfind("a")) + tm.assert_series_equal(empty_str, empty.str.pad(42)) + tm.assert_series_equal(empty_str, empty.str.center(42)) + tm.assert_series_equal(empty_str, empty.str.split("a")) + tm.assert_series_equal(empty_str, empty.str.rsplit("a")) + tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) + tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) + tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) + tm.assert_series_equal(empty_str, empty.str.slice(step=1)) + tm.assert_series_equal(empty_str, empty.str.strip()) + 
tm.assert_series_equal(empty_str, empty.str.lstrip()) + tm.assert_series_equal(empty_str, empty.str.rstrip()) + tm.assert_series_equal(empty_str, empty.str.wrap(42)) + tm.assert_series_equal(empty_str, empty.str.get(0)) + tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) + # ismethods should always return boolean (GH 29624) + tm.assert_series_equal(empty_bool, empty.str.isalnum()) + tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isdigit()) + tm.assert_series_equal(empty_bool, empty.str.isspace()) + tm.assert_series_equal(empty_bool, empty.str.islower()) + tm.assert_series_equal(empty_bool, empty.str.isupper()) + tm.assert_series_equal(empty_bool, empty.str.istitle()) + tm.assert_series_equal(empty_bool, empty.str.isnumeric()) + tm.assert_series_equal(empty_bool, empty.str.isdecimal()) + tm.assert_series_equal(empty_str, empty.str.capitalize()) + tm.assert_series_equal(empty_str, empty.str.swapcase()) + tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) + + table = str.maketrans("a", "b") + tm.assert_series_equal(empty_str, empty.str.translate(table)) + + def test_empty_str_methods_to_frame(self): + empty = Series(dtype=str) + empty_df = DataFrame() + tm.assert_frame_equal(empty_df, empty.str.partition("a")) + tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) + + def test_ismethods(self): + values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] + str_s = Series(values) + alnum_e = [True, True, True, True, True, False, True, True, False, False] + alpha_e = [True, True, True, False, False, False, True, False, False, False] + digit_e = [False, False, False, True, False, False, False, True, False, False] + + # TODO: unused + num_e = [ # noqa + False, + False, + False, + True, + False, + False, + False, + True, + False, + False, + ] + + space_e = [False, False, False, False, False, False, False, False, False, True] + lower_e = [False, True, False, False, False, False, False, False, False, False] + upper_e = [True, False, False, False, True, False, True, False, False, False] + title_e = [True, False, True, False, True, False, False, False, False, False] + + tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) + tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) + tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e)) + tm.assert_series_equal(str_s.str.isspace(), Series(space_e)) + tm.assert_series_equal(str_s.str.islower(), Series(lower_e)) + tm.assert_series_equal(str_s.str.isupper(), Series(upper_e)) + tm.assert_series_equal(str_s.str.istitle(), Series(title_e)) + + assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] + assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] + assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values] + assert str_s.str.isspace().tolist() == [v.isspace() for v in values] + assert str_s.str.islower().tolist() == [v.islower() for v in values] + assert str_s.str.isupper().tolist() == [v.isupper() for v in values] + assert str_s.str.istitle().tolist() == [v.istitle() for v in values] + + def test_isnumeric(self): + # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER + # 0x2605: ★ not number + # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY + # 0xFF13: 3 Em 3 + values = ["A", "3", "¼", "★", "፸", "3", "four"] + s = Series(values) + numeric_e = [False, True, True, False, True, True, False] + decimal_e = [False, True, False, False, False, True, False] + 
tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) + tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) + + unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] + assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] + assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] + + values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] + s = Series(values) + numeric_e = [False, np.nan, True, False, np.nan, True, False] + decimal_e = [False, np.nan, False, False, np.nan, True, False] + tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) + tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) + + def test_get_dummies(self): + s = Series(["a|b", "a|c", np.nan]) + result = s.str.get_dummies("|") + expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) + tm.assert_frame_equal(result, expected) + + s = Series(["a;b", "a", 7]) + result = s.str.get_dummies(";") + expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) + tm.assert_frame_equal(result, expected) + + # GH9980, GH8028 + idx = Index(["a|b", "a|c", "b|c"]) + result = idx.str.get_dummies("|") + + expected = MultiIndex.from_tuples( + [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c") + ) + tm.assert_index_equal(result, expected) + + def test_get_dummies_with_name_dummy(self): + # GH 12180 + # Dummies named 'name' should work as expected + s = Series(["a", "b,name", "b"]) + result = s.str.get_dummies(",") + expected = DataFrame( + [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"] + ) + tm.assert_frame_equal(result, expected) + + idx = Index(["a|b", "name|c", "b|name"]) + result = idx.str.get_dummies("|") + + expected = MultiIndex.from_tuples( + [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + ) + tm.assert_index_equal(result, expected) + + def test_join(self): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = values.str.split("_").str.join("_") + tm.assert_series_equal(values, result) + + # mixed + mixed = Series( + [ + "a_b", + np.nan, + "asdf_cas_asdf", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] + ) + + rs = Series(mixed).str.split("_").str.join("_") + xp = Series( + [ + "a_b", + np.nan, + "asdf_cas_asdf", + np.nan, + np.nan, + "foo", + np.nan, + np.nan, + np.nan, + ] + ) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_len(self): + values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) + + result = values.str.len() + exp = values.map(lambda x: len(x) if notna(x) else np.nan) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series( + [ + "a_b", + np.nan, + "asdf_cas_asdf", + True, + datetime.today(), + "foo", + None, + 1, + 2.0, + ] + ) + + rs = Series(mixed).str.len() + xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_findall(self): + values = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"]) + + result = values.str.findall("BAD[_]*") + exp = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series( + [ + "fooBAD__barBAD", + np.nan, + "foo", + True, + datetime.today(), + "BAD", + None, + 1, + 2.0, + ] + ) + + rs = Series(mixed).str.findall("BAD[_]*") + xp = Series( + [ + ["BAD__", "BAD"], + np.nan, + [], + np.nan, + np.nan, + ["BAD"], + np.nan, + np.nan, + np.nan, + ] + ) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def 
test_find(self): + values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"]) + result = values.str.find("EF") + tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) + expected = np.array([v.find("EF") for v in values.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.rfind("EF") + tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) + expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.find("EF", 3) + tm.assert_series_equal(result, Series([4, 3, 7, 4, -1])) + expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.rfind("EF", 3) + tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) + expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.find("EF", 3, 6) + tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) + expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.rfind("EF", 3, 6) + tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) + expected = np.array( + [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64 + ) + tm.assert_numpy_array_equal(result.values, expected) + + with pytest.raises(TypeError, match="expected a string object, not int"): + result = values.str.find(0) + + with pytest.raises(TypeError, match="expected a string object, not int"): + result = values.str.rfind(0) + + def test_find_nan(self): + values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"]) + result = values.str.find("EF") + tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) + + result = values.str.rfind("EF") + tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) + + result = values.str.find("EF", 3) + tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) + + result = values.str.rfind("EF", 3) + tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) + + result = values.str.find("EF", 3, 6) + tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) + + result = values.str.rfind("EF", 3, 6) + tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) + + def test_index(self): + def _check(result, expected): + if isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + + for klass in [Series, Index]: + s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) + + result = s.str.index("EF") + _check(result, klass([4, 3, 1, 0])) + expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = s.str.rindex("EF") + _check(result, klass([4, 5, 7, 4])) + expected = np.array([v.rindex("EF") for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = s.str.index("EF", 3) + _check(result, klass([4, 3, 7, 4])) + expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = s.str.rindex("EF", 3) + _check(result, klass([4, 5, 7, 4])) + expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = 
s.str.index("E", 4, 8) + _check(result, klass([4, 5, 7, 4])) + expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + result = s.str.rindex("E", 0, 5) + _check(result, klass([4, 3, 1, 4])) + expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) + + with pytest.raises(ValueError, match="substring not found"): + result = s.str.index("DE") + + msg = "expected a string object, not int" + with pytest.raises(TypeError, match=msg): + result = s.str.index(0) + + # test with nan + s = Series(["abcb", "ab", "bcbe", np.nan]) + result = s.str.index("b") + tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) + result = s.str.rindex("b") + tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) + + def test_pad(self): + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) + + result = values.str.pad(5, side="left") + exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side="right") + exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side="both") + exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) + + rs = Series(mixed).str.pad(5, side="left") + xp = Series( + [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan] + ) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) + + rs = Series(mixed).str.pad(5, side="right") + xp = Series( + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan] + ) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) + + rs = Series(mixed).str.pad(5, side="both") + xp = Series( + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan] + ) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_pad_fillchar(self): + + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) + + result = values.str.pad(5, side="left", fillchar="X") + exp = Series(["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side="right", fillchar="X") + exp = Series(["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side="both", fillchar="X") + exp = Series(["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + msg = "fillchar must be a character, not str" + with pytest.raises(TypeError, match=msg): + result = values.str.pad(5, fillchar="XY") + + msg = "fillchar must be a character, not int" + with pytest.raises(TypeError, match=msg): + result = values.str.pad(5, fillchar=5) + + @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"]) + def test_pad_width(self, f): + # see gh-13598 + s = Series(["1", "22", "a", "bb"]) + msg = "width must be of integer type, not*" + + with pytest.raises(TypeError, match=msg): + getattr(s.str, f)("f") + + def test_translate(self): + def _check(result, expected): + if isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + 
tm.assert_index_equal(result, expected) + + for klass in [Series, Index]: + s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"]) + table = str.maketrans("abc", "cde") + result = s.str.translate(table) + expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"]) + _check(result, expected) + + # Series with non-string values + s = Series(["a", "b", "c", 1.2]) + expected = Series(["c", "d", "e", np.nan]) + result = s.str.translate(table) + tm.assert_series_equal(result, expected) + + def test_center_ljust_rjust(self): + values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) + + result = values.str.center(5) + exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.ljust(5) + exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + result = values.str.rjust(5) + exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series( + ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0] + ) + + rs = Series(mixed).str.center(5) + xp = Series( + [ + " a ", + np.nan, + " b ", + np.nan, + np.nan, + " c ", + " eee ", + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.ljust(5) + xp = Series( + [ + "a ", + np.nan, + "b ", + np.nan, + np.nan, + "c ", + "eee ", + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.rjust(5) + xp = Series( + [ + " a", + np.nan, + " b", + np.nan, + np.nan, + " c", + " eee", + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_center_ljust_rjust_fillchar(self): + values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"]) + + result = values.str.center(5, fillchar="X") + expected = Series(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]) + tm.assert_series_equal(result, expected) + expected = np.array([v.center(5, "X") for v in values.values], dtype=np.object_) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.ljust(5, fillchar="X") + expected = Series(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]) + tm.assert_series_equal(result, expected) + expected = np.array([v.ljust(5, "X") for v in values.values], dtype=np.object_) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.rjust(5, fillchar="X") + expected = Series(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]) + tm.assert_series_equal(result, expected) + expected = np.array([v.rjust(5, "X") for v in values.values], dtype=np.object_) + tm.assert_numpy_array_equal(result.values, expected) + + # If fillchar is not a charatter, normal str raises TypeError + # 'aaa'.ljust(5, 'XY') + # TypeError: must be char, not str + template = "fillchar must be a character, not {dtype}" + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.center(5, fillchar="XY") + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.ljust(5, fillchar="XY") + + with pytest.raises(TypeError, match=template.format(dtype="str")): + values.str.rjust(5, fillchar="XY") + + with pytest.raises(TypeError, match=template.format(dtype="int")): + values.str.center(5, fillchar=1) + + with pytest.raises(TypeError, match=template.format(dtype="int")): + values.str.ljust(5, fillchar=1) + + with pytest.raises(TypeError, match=template.format(dtype="int")): + 
values.str.rjust(5, fillchar=1) + + def test_zfill(self): + values = Series(["1", "22", "aaa", "333", "45678"]) + + result = values.str.zfill(5) + expected = Series(["00001", "00022", "00aaa", "00333", "45678"]) + tm.assert_series_equal(result, expected) + expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) + tm.assert_numpy_array_equal(result.values, expected) + + result = values.str.zfill(3) + expected = Series(["001", "022", "aaa", "333", "45678"]) + tm.assert_series_equal(result, expected) + expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) + tm.assert_numpy_array_equal(result.values, expected) + + values = Series(["1", np.nan, "aaa", np.nan, "45678"]) + result = values.str.zfill(5) + expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"]) + tm.assert_series_equal(result, expected) + + def test_split(self): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + + result = values.str.split("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + result = values.str.split("__") + tm.assert_series_equal(result, exp) + + result = values.str.split("__", expand=False) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.split("_") + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + result = mixed.str.split("_", expand=False) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + # regex split + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) + result = values.str.split("[,_]") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + tm.assert_series_equal(result, exp) + + def test_rsplit(self): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = values.str.rsplit("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + result = values.str.rsplit("__") + tm.assert_series_equal(result, exp) + + result = values.str.rsplit("__", expand=False) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.rsplit("_") + exp = Series( + [ + ["a", "b", "c"], + np.nan, + ["d", "e", "f"], + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + ) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + result = mixed.str.rsplit("_", expand=False) + assert isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + # regex split is not supported by rsplit + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) + result = values.str.rsplit("[,_]") + exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) + tm.assert_series_equal(result, exp) + + # setting max number of splits, make sure it's from reverse + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = values.str.rsplit("_", n=1) + exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) + tm.assert_series_equal(result, exp) + + def test_split_blank_string(self): + # expand blank split GH 20067 + values = Series([""], name="test") + result = 
values.str.split(expand=True) + exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame + tm.assert_frame_equal(result, exp) + + values = Series(["a b c", "a b", "", " "], name="test") + result = values.str.split(expand=True) + exp = DataFrame( + [ + ["a", "b", "c"], + ["a", "b", np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ] + ) + tm.assert_frame_equal(result, exp) + + def test_split_noargs(self): + # #1859 + s = Series(["Wes McKinney", "Travis Oliphant"]) + result = s.str.split() + expected = ["Travis", "Oliphant"] + assert result[1] == expected + result = s.str.rsplit() + assert result[1] == expected + + def test_split_maxsplit(self): + # re.split 0, str.split -1 + s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) + + result = s.str.split(n=-1) + xp = s.str.split() + tm.assert_series_equal(result, xp) + + result = s.str.split(n=0) + tm.assert_series_equal(result, xp) + + xp = s.str.split("asdf") + result = s.str.split("asdf", n=0) + tm.assert_series_equal(result, xp) + + result = s.str.split("asdf", n=-1) + tm.assert_series_equal(result, xp) + + def test_split_no_pat_with_nonzero_n(self): + s = Series(["split once", "split once too!"]) + result = s.str.split(n=1) + expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) + tm.assert_series_equal(expected, result, check_index_type=False) + + def test_split_to_dataframe(self): + s = Series(["nosplit", "alsonosplit"]) + result = s.str.split("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + tm.assert_frame_equal(result, exp) + + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) + tm.assert_frame_equal(result, exp) + + s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + { + 0: ["some", "one"], + 1: ["unequal", "of"], + 2: ["splits", "these"], + 3: [np.nan, "things"], + 4: [np.nan, "is"], + 5: [np.nan, "not"], + } + ) + tm.assert_frame_equal(result, exp) + + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) + tm.assert_frame_equal(result, exp) + + with pytest.raises(ValueError, match="expand must be"): + s.str.split("_", expand="not_a_boolean") + + def test_split_to_multiindex_expand(self): + # https://github.com/pandas-dev/pandas/issues/23677 + + idx = Index(["nosplit", "alsonosplit", np.nan]) + result = idx.str.split("_", expand=True) + exp = idx + tm.assert_index_equal(result, exp) + assert result.nlevels == 1 + + idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "equal", "splits"), + ("with", "no", "nans"), + [np.nan, np.nan, np.nan], + [None, None, None], + ] + ) + tm.assert_index_equal(result, exp) + assert result.nlevels == 3 + + idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "unequal", "splits", np.nan, np.nan, np.nan), + ("one", "of", "these", "things", "is", "not"), + (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), + (None, None, None, None, None, None), + ] + ) + tm.assert_index_equal(result, exp) + assert result.nlevels == 6 + + with pytest.raises(ValueError, match="expand must 
be"): + idx.str.split("_", expand="not_a_boolean") + + def test_rsplit_to_dataframe_expand(self): + s = Series(["nosplit", "alsonosplit"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + tm.assert_frame_equal(result, exp) + + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) + tm.assert_frame_equal(result, exp) + + result = s.str.rsplit("_", expand=True, n=2) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) + tm.assert_frame_equal(result, exp) + + result = s.str.rsplit("_", expand=True, n=1) + exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) + tm.assert_frame_equal(result, exp) + + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) + tm.assert_frame_equal(result, exp) + + def test_rsplit_to_multiindex_expand(self): + idx = Index(["nosplit", "alsonosplit"]) + result = idx.str.rsplit("_", expand=True) + exp = idx + tm.assert_index_equal(result, exp) + assert result.nlevels == 1 + + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True) + exp = MultiIndex.from_tuples( + [("some", "equal", "splits"), ("with", "no", "nans")] + ) + tm.assert_index_equal(result, exp) + assert result.nlevels == 3 + + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True, n=1) + exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) + tm.assert_index_equal(result, exp) + assert result.nlevels == 2 + + def test_split_nan_expand(self): + # gh-18450 + s = Series(["foo,bar,baz", np.nan]) + result = s.str.split(",", expand=True) + exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) + tm.assert_frame_equal(result, exp) + + # check that these are actually np.nan and not None + # TODO see GH 18463 + # tm.assert_frame_equal does not differentiate + assert all(np.isnan(x) for x in result.iloc[1]) + + def test_split_with_name(self): + # GH 12617 + + # should preserve name + s = Series(["a,b", "c,d"], name="xxx") + res = s.str.split(",") + exp = Series([["a", "b"], ["c", "d"]], name="xxx") + tm.assert_series_equal(res, exp) + + res = s.str.split(",", expand=True) + exp = DataFrame([["a", "b"], ["c", "d"]]) + tm.assert_frame_equal(res, exp) + + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.split(",") + exp = Index([["a", "b"], ["c", "d"]], name="xxx") + assert res.nlevels == 1 + tm.assert_index_equal(res, exp) + + res = idx.str.split(",", expand=True) + exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")]) + assert res.nlevels == 2 + tm.assert_index_equal(res, exp) + + def test_partition_series(self): + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) + + result = values.str.partition("_", expand=False) + exp = Series( + [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None] + ) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition("_", expand=False) + exp = Series( + [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None] + ) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) + result = 
values.str.partition("__", expand=False) + exp = Series( + [ + ("a", "__", "b__c"), + ("c", "__", "d__e"), + np.nan, + ("f", "__", "g__h"), + None, + ] + ) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition("__", expand=False) + exp = Series( + [ + ("a__b", "__", "c"), + ("c__d", "__", "e"), + np.nan, + ("f__g", "__", "h"), + None, + ] + ) + tm.assert_series_equal(result, exp) + + # None + values = Series(["a b c", "c d e", np.nan, "f g h", None]) + result = values.str.partition(expand=False) + exp = Series( + [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None] + ) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition(expand=False) + exp = Series( + [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None] + ) + tm.assert_series_equal(result, exp) + + # Not split + values = Series(["abc", "cde", np.nan, "fgh", None]) + result = values.str.partition("_", expand=False) + exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition("_", expand=False) + exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None]) + tm.assert_series_equal(result, exp) + + # unicode + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + + result = values.str.partition("_", expand=False) + exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition("_", expand=False) + exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")]) + tm.assert_series_equal(result, exp) + + # compare to standard lib + values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"]) + result = values.str.partition("_", expand=False).tolist() + assert result == [v.partition("_") for v in values] + result = values.str.rpartition("_", expand=False).tolist() + assert result == [v.rpartition("_") for v in values] + + def test_partition_index(self): + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) + + result = values.str.partition("_", expand=False) + exp = Index( + np.array( + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], + dtype=object, + ) + ) + tm.assert_index_equal(result, exp) + assert result.nlevels == 1 + + result = values.str.rpartition("_", expand=False) + exp = Index( + np.array( + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], + dtype=object, + ) + ) + tm.assert_index_equal(result, exp) + assert result.nlevels == 1 + + result = values.str.partition("_") + exp = Index( + [ + ("a", "_", "b_c"), + ("c", "_", "d_e"), + ("f", "_", "g_h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) + tm.assert_index_equal(result, exp) + assert isinstance(result, MultiIndex) + assert result.nlevels == 3 + + result = values.str.rpartition("_") + exp = Index( + [ + ("a_b", "_", "c"), + ("c_d", "_", "e"), + ("f_g", "_", "h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) + tm.assert_index_equal(result, exp) + assert isinstance(result, MultiIndex) + assert result.nlevels == 3 + + def test_partition_to_dataframe(self): + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) + result = values.str.partition("_") + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) + 
tm.assert_frame_equal(result, exp) + + result = values.str.rpartition("_") + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) + tm.assert_frame_equal(result, exp) + + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) + result = values.str.partition("_", expand=True) + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) + tm.assert_frame_equal(result, exp) + + result = values.str.rpartition("_", expand=True) + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) + tm.assert_frame_equal(result, exp) + + def test_partition_with_name(self): + # GH 12617 + + s = Series(["a,b", "c,d"], name="xxx") + res = s.str.partition(",") + exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}) + tm.assert_frame_equal(res, exp) + + # should preserve name + res = s.str.partition(",", expand=False) + exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") + tm.assert_series_equal(res, exp) + + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.partition(",") + exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")]) + assert res.nlevels == 3 + tm.assert_index_equal(res, exp) + + # should preserve name + res = idx.str.partition(",", expand=False) + exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") + assert res.nlevels == 1 + tm.assert_index_equal(res, exp) + + def test_partition_sep_kwarg(self): + # GH 22676; depr kwarg "pat" in favor of "sep" + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + + expected = values.str.partition(sep="_") + result = values.str.partition("_") + tm.assert_frame_equal(result, expected) + + expected = values.str.rpartition(sep="_") + result = values.str.rpartition("_") + tm.assert_frame_equal(result, expected) + + def test_pipe_failures(self): + # #2119 + s = Series(["A|B|C"]) + + result = s.str.split("|") + exp = Series([["A", "B", "C"]]) + + tm.assert_series_equal(result, exp) + + result = s.str.replace("|", " ") + exp = Series(["A B C"]) + + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "start, stop, step, expected", + [ + (2, 5, None, Series(["foo", "bar", np.nan, "baz"])), + (0, 3, -1, Series(["", "", np.nan, ""])), + (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])), + (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])), + (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])), + ], + ) + def test_slice(self, start, stop, step, expected): + values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) + result = values.str.slice(start, stop, step) + tm.assert_series_equal(result, expected) + + # mixed + mixed = Series( + ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0] + ) + + rs = Series(mixed).str.slice(2, 5) + xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.slice(2, 5, -1) + xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]) + + def test_slice_replace(self): + values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) + + exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) + result = values.str.slice_replace(2, 3) + tm.assert_series_equal(result, exp) + + exp = Series(["shzrt", "a zit longer", 
"evznlongerthanthat", "z", np.nan]) + result = values.str.slice_replace(2, 3, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) + result = values.str.slice_replace(2, 2, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) + result = values.str.slice_replace(2, 1, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) + result = values.str.slice_replace(-1, None, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["zrt", "zer", "zat", "z", np.nan]) + result = values.str.slice_replace(None, -2, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) + result = values.str.slice_replace(6, 8, "z") + tm.assert_series_equal(result, exp) + + exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) + result = values.str.slice_replace(-10, 3, "z") + tm.assert_series_equal(result, exp) + + def test_strip_lstrip_rstrip(self): + values = Series([" aa ", " bb \n", np.nan, "cc "]) + + result = values.str.strip() + exp = Series(["aa", "bb", np.nan, "cc"]) + tm.assert_series_equal(result, exp) + + result = values.str.lstrip() + exp = Series(["aa ", "bb \n", np.nan, "cc "]) + tm.assert_series_equal(result, exp) + + result = values.str.rstrip() + exp = Series([" aa", " bb", np.nan, "cc"]) + tm.assert_series_equal(result, exp) + + def test_strip_lstrip_rstrip_mixed(self): + # mixed + mixed = Series( + [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0] + ) + + rs = Series(mixed).str.strip() + xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.lstrip() + xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.rstrip() + xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_strip_lstrip_rstrip_args(self): + values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"]) + + rs = values.str.strip("x") + xp = Series(["ABC", " BNSD", "LDFJH "]) + tm.assert_series_equal(rs, xp) + + rs = values.str.lstrip("x") + xp = Series(["ABCxx", " BNSD", "LDFJH xx"]) + tm.assert_series_equal(rs, xp) + + rs = values.str.rstrip("x") + xp = Series(["xxABC", "xx BNSD", "LDFJH "]) + tm.assert_series_equal(rs, xp) + + def test_wrap(self): + # test values are: two words less than width, two words equal to width, + # two words greater than width, one word less than width, one word + # equal to width, one word greater than width, multiple tokens with + # trailing whitespace equal to width + values = Series( + [ + "hello world", + "hello world!", + "hello world!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdefa", + "ab ab ab ab ", + "ab ab ab ab a", + "\t", + ] + ) + + # expected values + xp = Series( + [ + "hello world", + "hello world!", + "hello\nworld!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdef\na", + "ab ab ab ab", + "ab ab ab ab\na", + "", + ] + ) + + rs = values.str.wrap(12, break_long_words=True) + tm.assert_series_equal(rs, xp) + + # test with pre and post whitespace (non-unicode), NaN, and non-ascii + # Unicode + values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 
abadcafe"]) + xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"]) + rs = values.str.wrap(6) + tm.assert_series_equal(rs, xp) + + def test_get(self): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + + result = values.str.split("_").str.get(1) + expected = Series(["b", "d", np.nan, "g"]) + tm.assert_series_equal(result, expected) + + # mixed + mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) + + rs = Series(mixed).str.split("_").str.get(1) + xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan]) + + assert isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # bounds testing + values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) + + # positive index + result = values.str.split("_").str.get(2) + expected = Series(["3", "8", np.nan]) + tm.assert_series_equal(result, expected) + + # negative index + result = values.str.split("_").str.get(-3) + expected = Series(["3", "8", np.nan]) + tm.assert_series_equal(result, expected) + + def test_get_complex(self): + # GH 20671, getting value not in dict raising `KeyError` + values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) + + result = values.str.get(1) + expected = Series([2, 2, np.nan, "a"]) + tm.assert_series_equal(result, expected) + + result = values.str.get(-1) + expected = Series([3, 3, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("to_type", [tuple, list, np.array]) + def test_get_complex_nested(self, to_type): + values = Series([to_type([to_type([1, 2])])]) + + result = values.str.get(0) + expected = Series([to_type([1, 2])]) + tm.assert_series_equal(result, expected) + + result = values.str.get(1) + expected = Series([np.nan]) + tm.assert_series_equal(result, expected) + + def test_contains_moar(self): + # PR #1179 + s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) + + result = s.str.contains("a") + expected = Series( + [False, False, False, True, True, False, np.nan, False, False, True] + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("a", case=False) + expected = Series( + [True, False, False, True, True, False, np.nan, True, False, True] + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("Aa") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("ba") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) + tm.assert_series_equal(result, expected) + + result = s.str.contains("ba", case=False) + expected = Series( + [False, False, False, True, True, False, np.nan, True, False, False] + ) + tm.assert_series_equal(result, expected) + + def test_contains_nan(self): + # PR #14171 + s = Series([np.nan, np.nan, np.nan], dtype=np.object_) + + result = s.str.contains("foo", na=False) + expected = Series([False, False, False], dtype=np.bool_) + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo", na=True) + expected = Series([True, True, True], dtype=np.bool_) + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo", na="foo") + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + tm.assert_series_equal(result, expected) + + result = s.str.contains("foo") + expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) + tm.assert_series_equal(result, expected) + + def test_replace_moar(self): + # PR #1179 + s = 
Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) + + result = s.str.replace("A", "YYY") + expected = Series( + ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] + ) + tm.assert_series_equal(result, expected) + + result = s.str.replace("A", "YYY", case=False) + expected = Series( + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + "", + np.nan, + "CYYYBYYY", + "dog", + "cYYYt", + ] + ) + tm.assert_series_equal(result, expected) + + result = s.str.replace("^.a|dog", "XX-XX ", case=False) + expected = Series( + [ + "A", + "B", + "C", + "XX-XX ba", + "XX-XX ca", + "", + np.nan, + "XX-XX BA", + "XX-XX ", + "XX-XX t", + ] + ) + tm.assert_series_equal(result, expected) + + def test_string_slice_get_syntax(self): + s = Series( + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + np.nan, + "CYYYBYYY", + "dog", + "cYYYt", + ] + ) + + result = s.str[0] + expected = s.str.get(0) + tm.assert_series_equal(result, expected) + + result = s.str[:3] + expected = s.str.slice(stop=3) + tm.assert_series_equal(result, expected) + + result = s.str[2::-1] + expected = s.str.slice(start=2, step=-1) + tm.assert_series_equal(result, expected) + + def test_string_slice_out_of_bounds(self): + s = Series([(1, 2), (1,), (3, 4, 5)]) + + result = s.str[1] + expected = Series([2, np.nan, 4]) + + tm.assert_series_equal(result, expected) + + s = Series(["foo", "b", "ba"]) + result = s.str[1] + expected = Series(["o", np.nan, "a"]) + tm.assert_series_equal(result, expected) + + def test_match_findall_flags(self): + data = { + "Dave": "dave@google.com", + "Steve": "steve@gmail.com", + "Rob": "rob@gmail.com", + "Wes": np.nan, + } + data = Series(data) + + pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" + + result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) + assert result.iloc[0].tolist() == ["dave", "google", "com"] + + result = data.str.match(pat, flags=re.IGNORECASE) + assert result[0] + + result = data.str.findall(pat, flags=re.IGNORECASE) + assert result[0][0] == ("dave", "google", "com") + + result = data.str.count(pat, flags=re.IGNORECASE) + assert result[0] == 1 + + with tm.assert_produces_warning(UserWarning): + result = data.str.contains(pat, flags=re.IGNORECASE) + assert result[0] + + def test_encode_decode(self): + base = Series(["a", "b", "a\xe4"]) + series = base.str.encode("utf-8") + + f = lambda x: x.decode("utf-8") + result = series.str.decode("utf-8") + exp = series.map(f) + + tm.assert_series_equal(result, exp) + + def test_encode_decode_errors(self): + encodeBase = Series(["a", "b", "a\x9d"]) + + msg = ( + r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to " + ) + with pytest.raises(UnicodeEncodeError, match=msg): + encodeBase.str.encode("cp1252") + + f = lambda x: x.encode("cp1252", "ignore") + result = encodeBase.str.encode("cp1252", "ignore") + exp = encodeBase.map(f) + tm.assert_series_equal(result, exp) + + decodeBase = Series([b"a", b"b", b"a\x9d"]) + + msg = ( + "'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to " + ) + with pytest.raises(UnicodeDecodeError, match=msg): + decodeBase.str.decode("cp1252") + + f = lambda x: x.decode("cp1252", "ignore") + result = decodeBase.str.decode("cp1252", "ignore") + exp = decodeBase.map(f) + + tm.assert_series_equal(result, exp) + + def test_normalize(self): + values = ["ABC", "ABC", "123", np.nan, "アイエ"] + s = Series(values, index=["a", "b", "c", "d", "e"]) + + normed = ["ABC", "ABC", "123", np.nan, "アイエ"] + expected = 
Series(normed, index=["a", "b", "c", "d", "e"]) + + result = s.str.normalize("NFKC") + tm.assert_series_equal(result, expected) + + expected = Series( + ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] + ) + + result = s.str.normalize("NFC") + tm.assert_series_equal(result, expected) + + with pytest.raises(ValueError, match="invalid normalization form"): + s.str.normalize("xxx") + + s = Index(["ABC", "123", "アイエ"]) + expected = Index(["ABC", "123", "アイエ"]) + result = s.str.normalize("NFKC") + tm.assert_index_equal(result, expected) + + def test_index_str_accessor_visibility(self): + from pandas.core.strings import StringMethods + + cases = [ + (["a", "b"], "string"), + (["a", "b", 1], "mixed-integer"), + (["a", "b", 1.3], "mixed"), + (["a", "b", 1.3, 1], "mixed-integer"), + (["aa", datetime(2011, 1, 1)], "mixed"), + ] + for values, tp in cases: + idx = Index(values) + assert isinstance(Series(values).str, StringMethods) + assert isinstance(idx.str, StringMethods) + assert idx.inferred_type == tp + + for values, tp in cases: + idx = Index(values) + assert isinstance(Series(values).str, StringMethods) + assert isinstance(idx.str, StringMethods) + assert idx.inferred_type == tp + + cases = [ + ([1, np.nan], "floating"), + ([datetime(2011, 1, 1)], "datetime64"), + ([timedelta(1)], "timedelta64"), + ] + for values, tp in cases: + idx = Index(values) + message = "Can only use .str accessor with string values" + with pytest.raises(AttributeError, match=message): + Series(values).str + with pytest.raises(AttributeError, match=message): + idx.str + assert idx.inferred_type == tp + + # MultiIndex has mixed dtype, but not allow to use accessor + idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) + assert idx.inferred_type == "mixed" + message = "Can only use .str accessor with Index, not MultiIndex" + with pytest.raises(AttributeError, match=message): + idx.str + + def test_str_accessor_no_new_attributes(self): + # https://github.com/pandas-dev/pandas/issues/10673 + s = Series(list("aabbcde")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): + s.str.xlabel = "a" + + def test_method_on_bytes(self): + lhs = Series(np.array(list("abc"), "S1").astype(object)) + rhs = Series(np.array(list("def"), "S1").astype(object)) + with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): + lhs.str.cat(rhs) + + def test_casefold(self): + # GH25405 + expected = Series(["ss", np.nan, "case", "ssd"]) + s = Series(["ß", np.nan, "case", "ßd"]) + result = s.str.casefold() + + tm.assert_series_equal(result, expected) + + +def test_string_array(any_string_method): + method_name, args, kwargs = any_string_method + if method_name == "decode": + pytest.skip("decode requires bytes.") + + data = ["a", "bb", np.nan, "ccc"] + a = Series(data, dtype=object) + b = Series(data, dtype="string") + + expected = getattr(a.str, method_name)(*args, **kwargs) + result = getattr(b.str, method_name)(*args, **kwargs) + + if isinstance(expected, Series): + if expected.dtype == "object" and lib.is_string_array( + expected.dropna().values, + ): + assert result.dtype == "string" + result = result.astype(object) + + elif expected.dtype == "object" and lib.is_bool_array( + expected.values, skipna=True + ): + assert result.dtype == "boolean" + result = result.astype(object) + + elif expected.dtype == "float" and expected.isna().any(): + assert result.dtype == "Int64" + result = result.astype("float") + + elif isinstance(expected, DataFrame): + columns = 
expected.select_dtypes(include="object").columns + assert all(result[columns].dtypes == "string") + result[columns] = result[columns].astype(object) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("count", [2, None]), + ("find", [0, None]), + ("index", [0, None]), + ("rindex", [2, None]), + ], +) +def test_string_array_numeric_integer_array(method, expected): + s = Series(["aba", None], dtype="string") + result = getattr(s.str, method)("a") + expected = Series(expected, dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("isdigit", [False, None, True]), + ("isalpha", [True, None, False]), + ("isalnum", [True, None, True]), + ("isdigit", [False, None, True]), + ], +) +def test_string_array_boolean_array(method, expected): + s = Series(["a", None, "1"], dtype="string") + result = getattr(s.str, method)() + expected = Series(expected, dtype="boolean") + tm.assert_series_equal(result, expected) + + +def test_string_array_extract(): + # https://github.com/pandas-dev/pandas/issues/30969 + # Only expand=False & multiple groups was failing + a = Series(["a1", "b2", "cc"], dtype="string") + b = Series(["a1", "b2", "cc"], dtype="object") + pat = r"(\w)(\d)" + + result = a.str.extract(pat, expand=False) + expected = b.str.extract(pat, expand=False) + assert all(result.dtypes == "string") + + result = result.astype(object) + tm.assert_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/test_take.py b/venv/Lib/site-packages/pandas/tests/test_take.py new file mode 100644 index 0000000..465296a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/test_take.py @@ -0,0 +1,461 @@ +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +import pandas._testing as tm +import pandas.core.algorithms as algos + + +@pytest.fixture(params=[True, False]) +def writeable(request): + return request.param + + +# Check that take_nd works both with writeable arrays +# (in which case fast typed memory-views implementation) +# and read-only arrays alike. +@pytest.fixture( + params=[ + (np.float64, True), + (np.float32, True), + (np.uint64, False), + (np.uint32, False), + (np.uint16, False), + (np.uint8, False), + (np.int64, False), + (np.int32, False), + (np.int16, False), + (np.int8, False), + (np.object_, True), + (np.bool, False), + ] +) +def dtype_can_hold_na(request): + return request.param + + +@pytest.fixture( + params=[ + (np.int8, np.int16(127), np.int8), + (np.int8, np.int16(128), np.int16), + (np.int32, 1, np.int32), + (np.int32, 2.0, np.float64), + (np.int32, 3.0 + 4.0j, np.complex128), + (np.int32, True, np.object_), + (np.int32, "", np.object_), + (np.float64, 1, np.float64), + (np.float64, 2.0, np.float64), + (np.float64, 3.0 + 4.0j, np.complex128), + (np.float64, True, np.object_), + (np.float64, "", np.object_), + (np.complex128, 1, np.complex128), + (np.complex128, 2.0, np.complex128), + (np.complex128, 3.0 + 4.0j, np.complex128), + (np.complex128, True, np.object_), + (np.complex128, "", np.object_), + (np.bool_, 1, np.object_), + (np.bool_, 2.0, np.object_), + (np.bool_, 3.0 + 4.0j, np.object_), + (np.bool_, True, np.bool_), + (np.bool_, "", np.object_), + ] +) +def dtype_fill_out_dtype(request): + return request.param + + +class TestTake: + # Standard incompatible fill error. 
+ fill_error = re.compile("Incompatible type for fill_value") + + def test_1d_with_out(self, dtype_can_hold_na, writeable): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, 4).astype(dtype) + data.flags.writeable = writeable + + indexer = [2, 1, 0, 1] + out = np.empty(4, dtype=dtype) + algos.take_1d(data, indexer, out=out) + + expected = data.take(indexer) + tm.assert_almost_equal(out, expected) + + indexer = [2, 1, 0, -1] + out = np.empty(4, dtype=dtype) + + if can_hold_na: + algos.take_1d(data, indexer, out=out) + expected = data.take(indexer) + expected[3] = np.nan + tm.assert_almost_equal(out, expected) + else: + with pytest.raises(TypeError, match=self.fill_error): + algos.take_1d(data, indexer, out=out) + + # No Exception otherwise. + data.take(indexer, out=out) + + def test_1d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + data = np.random.randint(0, 2, 4).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_1d(data, indexer, fill_value=fill_value) + assert (result[[0, 1, 2]] == data[[2, 1, 0]]).all() + assert result[3] == fill_value + assert result.dtype == out_dtype + + indexer = [2, 1, 0, 1] + + result = algos.take_1d(data, indexer, fill_value=fill_value) + assert (result[[0, 1, 2, 3]] == data[indexer]).all() + assert result.dtype == dtype + + def test_2d_with_out(self, dtype_can_hold_na, writeable): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + data.flags.writeable = writeable + + indexer = [2, 1, 0, 1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + + if can_hold_na: + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected0[3, :] = np.nan + expected1[:, 3] = np.nan + + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + else: + for i, out in enumerate([out0, out1]): + with pytest.raises(TypeError, match=self.fill_error): + algos.take_nd(data, indexer, out=out, axis=i) + + # No Exception otherwise. 
+ data.take(indexer, out=out, axis=i) + + def test_2d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :] == data[[2, 1, 0], :]).all() + assert (result[3, :] == fill_value).all() + assert result.dtype == out_dtype + + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all() + assert (result[:, 3] == fill_value).all() + assert result.dtype == out_dtype + + indexer = [2, 1, 0, 1] + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2, 3], :] == data[indexer, :]).all() + assert result.dtype == dtype + + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3]] == data[:, indexer]).all() + assert result.dtype == dtype + + def test_3d_with_out(self, dtype_can_hold_na): + dtype, can_hold_na = dtype_can_hold_na + + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + indexer = [2, 1, 0, 1] + + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + algos.take_nd(data, indexer, out=out2, axis=2) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + + if can_hold_na: + algos.take_nd(data, indexer, out=out0, axis=0) + algos.take_nd(data, indexer, out=out1, axis=1) + algos.take_nd(data, indexer, out=out2, axis=2) + + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + + expected0[3, :, :] = np.nan + expected1[:, 3, :] = np.nan + expected2[:, :, 3] = np.nan + + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + else: + for i, out in enumerate([out0, out1, out2]): + with pytest.raises(TypeError, match=self.fill_error): + algos.take_nd(data, indexer, out=out, axis=i) + + # No Exception otherwise. 
+ data.take(indexer, out=out, axis=i) + + def test_3d_fill_nonna(self, dtype_fill_out_dtype): + dtype, fill_value, out_dtype = dtype_fill_out_dtype + + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + indexer = [2, 1, 0, -1] + + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all() + assert (result[3, :, :] == fill_value).all() + assert result.dtype == out_dtype + + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all() + assert (result[:, 3, :] == fill_value).all() + assert result.dtype == out_dtype + + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all() + assert (result[:, :, 3] == fill_value).all() + assert result.dtype == out_dtype + + indexer = [2, 1, 0, 1] + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all() + assert result.dtype == dtype + + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all() + assert result.dtype == dtype + + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all() + assert result.dtype == dtype + + def test_1d_other_dtypes(self): + arr = np.random.randn(10).astype(np.float32) + + indexer = [1, 2, 3, -1] + result = algos.take_1d(arr, indexer) + expected = arr.take(indexer) + expected[-1] = np.nan + tm.assert_almost_equal(result, expected) + + def test_2d_other_dtypes(self): + arr = np.random.randn(10, 5).astype(np.float32) + + indexer = [1, 2, 3, -1] + + # axis=0 + result = algos.take_nd(arr, indexer, axis=0) + expected = arr.take(indexer, axis=0) + expected[-1] = np.nan + tm.assert_almost_equal(result, expected) + + # axis=1 + result = algos.take_nd(arr, indexer, axis=1) + expected = arr.take(indexer, axis=1) + expected[:, -1] = np.nan + tm.assert_almost_equal(result, expected) + + def test_1d_bool(self): + arr = np.array([0, 1, 0], dtype=bool) + + result = algos.take_1d(arr, [0, 2, 2, 1]) + expected = arr.take([0, 2, 2, 1]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.take_1d(arr, [0, 2, -1]) + assert result.dtype == np.object_ + + def test_2d_bool(self): + arr = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 1]], dtype=bool) + + result = algos.take_nd(arr, [0, 2, 2, 1]) + expected = arr.take([0, 2, 2, 1], axis=0) + tm.assert_numpy_array_equal(result, expected) + + result = algos.take_nd(arr, [0, 2, 2, 1], axis=1) + expected = arr.take([0, 2, 2, 1], axis=1) + tm.assert_numpy_array_equal(result, expected) + + result = algos.take_nd(arr, [0, 2, -1]) + assert result.dtype == np.object_ + + def test_2d_float32(self): + arr = np.random.randn(4, 3).astype(np.float32) + indexer = [0, 2, -1, 1, -1] + + # axis=0 + result = algos.take_nd(arr, indexer, axis=0) + result2 = np.empty_like(result) + algos.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected[[2, 4], :] = np.nan + tm.assert_almost_equal(result, expected) + + # this now accepts a float32! # test with float64 out buffer + out = np.empty((len(indexer), arr.shape[1]), dtype="float32") + algos.take_nd(arr, indexer, out=out) # it works! 
+ + # axis=1 + result = algos.take_nd(arr, indexer, axis=1) + result2 = np.empty_like(result) + algos.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected[:, [2, 4]] = np.nan + tm.assert_almost_equal(result, expected) + + def test_2d_datetime64(self): + # 2005/01/01 - 2006/01/01 + arr = np.random.randint(11045376, 11360736, (5, 3)) * 100000000000 + arr = arr.view(dtype="datetime64[ns]") + indexer = [0, 2, -1, 1, -1] + + # axis=0 + result = algos.take_nd(arr, indexer, axis=0) + result2 = np.empty_like(result) + algos.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected.view(np.int64)[[2, 4], :] = iNaT + tm.assert_almost_equal(result, expected) + + result = algos.take_nd(arr, indexer, axis=0, fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + algos.take_nd( + arr, indexer, out=result2, axis=0, fill_value=datetime(2007, 1, 1) + ) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected[[2, 4], :] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) + + # axis=1 + result = algos.take_nd(arr, indexer, axis=1) + result2 = np.empty_like(result) + algos.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected.view(np.int64)[:, [2, 4]] = iNaT + tm.assert_almost_equal(result, expected) + + result = algos.take_nd(arr, indexer, axis=1, fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + algos.take_nd( + arr, indexer, out=result2, axis=1, fill_value=datetime(2007, 1, 1) + ) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected[:, [2, 4]] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) + + def test_take_axis_0(self): + arr = np.arange(12).reshape(4, 3) + result = algos.take(arr, [0, -1]) + expected = np.array([[0, 1, 2], [9, 10, 11]]) + tm.assert_numpy_array_equal(result, expected) + + # allow_fill=True + result = algos.take(arr, [0, -1], allow_fill=True, fill_value=0) + expected = np.array([[0, 1, 2], [0, 0, 0]]) + tm.assert_numpy_array_equal(result, expected) + + def test_take_axis_1(self): + arr = np.arange(12).reshape(4, 3) + result = algos.take(arr, [0, -1], axis=1) + expected = np.array([[0, 2], [3, 5], [6, 8], [9, 11]]) + tm.assert_numpy_array_equal(result, expected) + + # allow_fill=True + result = algos.take(arr, [0, -1], axis=1, allow_fill=True, fill_value=0) + expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]]) + tm.assert_numpy_array_equal(result, expected) + + # GH#26976 make sure we validate along the correct axis + with pytest.raises(IndexError, match="indices are out-of-bounds"): + algos.take(arr, [0, 3], axis=1, allow_fill=True, fill_value=0) + + +class TestExtensionTake: + # The take method found in pd.api.extensions + + def test_bounds_check_large(self): + arr = np.array([1, 2]) + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=True) + + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=False) + + def test_bounds_check_small(self): + arr = np.array([1, 2, 3], dtype=np.int64) + indexer = [0, -1, -2] + with pytest.raises(ValueError): + algos.take(arr, indexer, allow_fill=True) + + result = algos.take(arr, indexer) + expected = np.array([1, 3, 2], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("allow_fill", [True, 
False]) + def test_take_empty(self, allow_fill): + arr = np.array([], dtype=np.int64) + # empty take is ok + result = algos.take(arr, [], allow_fill=allow_fill) + tm.assert_numpy_array_equal(arr, result) + + with pytest.raises(IndexError): + algos.take(arr, [0], allow_fill=allow_fill) + + def test_take_na_empty(self): + result = algos.take(np.array([]), [-1, -1], allow_fill=True, fill_value=0.0) + expected = np.array([0.0, 0.0]) + tm.assert_numpy_array_equal(result, expected) + + def test_take_coerces_list(self): + arr = [1, 2, 3] + result = algos.take(arr, [0, 0]) + expected = np.array([1, 1]) + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/tools/__init__.py b/venv/Lib/site-packages/pandas/tests/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tools/test_numeric.py b/venv/Lib/site-packages/pandas/tests/tools/test_numeric.py new file mode 100644 index 0000000..2fd39d5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tools/test_numeric.py @@ -0,0 +1,629 @@ +import decimal + +import numpy as np +from numpy import iinfo +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, to_numeric +import pandas._testing as tm + + +@pytest.fixture(params=[None, "ignore", "raise", "coerce"]) +def errors(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def signed(request): + return request.param + + +@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"]) +def transform(request): + return request.param + + +@pytest.fixture(params=[47393996303418497800, 100000000000000000000]) +def large_val(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def multiple_elts(request): + return request.param + + +@pytest.fixture( + params=[ + (lambda x: Index(x, name="idx"), tm.assert_index_equal), + (lambda x: Series(x, name="ser"), tm.assert_series_equal), + (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal), + ] +) +def transform_assert_equal(request): + return request.param + + +@pytest.mark.parametrize( + "input_kwargs,result_kwargs", + [ + (dict(), dict(dtype=np.int64)), + (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)), + ], +) +def test_empty(input_kwargs, result_kwargs): + # see gh-16302 + ser = Series([], dtype=object) + result = to_numeric(ser, **input_kwargs) + + expected = Series([], **result_kwargs) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("last_val", ["7", 7]) +def test_series(last_val): + ser = Series(["1", "-3.14", last_val]) + result = to_numeric(ser) + + expected = Series([1, -3.14, 7]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [1, 3, 4, 5], + [1.0, 3.0, 4.0, 5.0], + # Bool is regarded as numeric. 
+ [True, False, True, True], + ], +) +def test_series_numeric(data): + ser = Series(data, index=list("ABCD"), name="EFG") + + result = to_numeric(ser) + tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize( + "data,msg", + [ + ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'), + ( + ["orange", 1, -3.14, "apple"], + 'Unable to parse string "orange" at position 0', + ), + ], +) +def test_error(data, msg): + ser = Series(data) + + with pytest.raises(ValueError, match=msg): + to_numeric(ser, errors="raise") + + +@pytest.mark.parametrize( + "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])] +) +def test_ignore_error(errors, exp_data): + ser = Series([1, -3.14, "apple"]) + result = to_numeric(ser, errors=errors) + + expected = Series(exp_data) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "errors,exp", + [ + ("raise", 'Unable to parse string "apple" at position 2'), + ("ignore", [True, False, "apple"]), + # Coerces to float. + ("coerce", [1.0, 0.0, np.nan]), + ], +) +def test_bool_handling(errors, exp): + ser = Series([True, False, "apple"]) + + if isinstance(exp, str): + with pytest.raises(ValueError, match=exp): + to_numeric(ser, errors=errors) + else: + result = to_numeric(ser, errors=errors) + expected = Series(exp) + + tm.assert_series_equal(result, expected) + + +def test_list(): + ser = ["1", "-3.14", "7"] + res = to_numeric(ser) + + expected = np.array([1, -3.14, 7]) + tm.assert_numpy_array_equal(res, expected) + + +@pytest.mark.parametrize( + "data,arr_kwargs", + [ + ([1, 3, 4, 5], dict(dtype=np.int64)), + ([1.0, 3.0, 4.0, 5.0], dict()), + # Boolean is regarded as numeric. + ([True, False, True, True], dict()), + ], +) +def test_list_numeric(data, arr_kwargs): + result = to_numeric(data) + expected = np.array(data, **arr_kwargs) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(dtype="O"), dict()]) +def test_numeric(kwargs): + data = [1, -3.14, 7] + + ser = Series(data, **kwargs) + result = to_numeric(ser) + + expected = Series(data) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "columns", + [ + # One column. + "a", + # Multiple columns. 
+ ["a", "b"], + ], +) +def test_numeric_df_columns(columns): + # see gh-14827 + df = DataFrame( + dict( + a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + b=[1.0, 2.0, 3.0, 4.0], + ) + ) + + expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0])) + + df_copy = df.copy() + df_copy[columns] = df_copy[columns].apply(to_numeric) + + tm.assert_frame_equal(df_copy, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[3.14, 1.0], 1.6, 0.1], + ), + ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ], +) +def test_numeric_embedded_arr_likes(data, exp_data): + # Test to_numeric with embedded lists and arrays + df = DataFrame(dict(a=data)) + df["a"] = df["a"].apply(to_numeric) + + expected = DataFrame(dict(a=exp_data)) + tm.assert_frame_equal(df, expected) + + +def test_all_nan(): + ser = Series(["a", "b", "c"]) + result = to_numeric(ser, errors="coerce") + + expected = Series([np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_type_check(errors): + # see gh-11776 + df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) + kwargs = dict(errors=errors) if errors is not None else dict() + error_ctx = pytest.raises(TypeError, match="1-d array") + + with error_ctx: + to_numeric(df, **kwargs) + + +@pytest.mark.parametrize("val", [1, 1.1, 20001]) +def test_scalar(val, signed, transform): + val = -val if signed else val + assert to_numeric(transform(val)) == float(val) + + +def test_really_large_scalar(large_val, signed, transform, errors): + # see gh-24910 + kwargs = dict(errors=errors) if errors is not None else dict() + val = -large_val if signed else large_val + + val = transform(val) + val_is_string = isinstance(val, str) + + if val_is_string and errors in (None, "raise"): + msg = "Integer out of range. at position 0" + with pytest.raises(ValueError, match=msg): + to_numeric(val, **kwargs) + else: + expected = float(val) if (errors == "coerce" and val_is_string) else val + tm.assert_almost_equal(to_numeric(val, **kwargs), expected) + + +def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): + # see gh-24910 + kwargs = dict(errors=errors) if errors is not None else dict() + val = -large_val if signed else large_val + val = transform(val) + + extra_elt = "string" + arr = [val] + multiple_elts * [extra_elt] + + val_is_string = isinstance(val, str) + coercing = errors == "coerce" + + if errors in (None, "raise") and (val_is_string or multiple_elts): + if val_is_string: + msg = "Integer out of range. at position 0" + else: + msg = 'Unable to parse string "string" at position 1' + + with pytest.raises(ValueError, match=msg): + to_numeric(arr, **kwargs) + else: + result = to_numeric(arr, **kwargs) + + exp_val = float(val) if (coercing and val_is_string) else val + expected = [exp_val] + + if multiple_elts: + if coercing: + expected.append(np.nan) + exp_dtype = float + else: + expected.append(extra_elt) + exp_dtype = object + else: + exp_dtype = float if isinstance(exp_val, (int, float)) else object + + tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) + + +def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors): + # see gh-24910 + # + # Even if we discover that we have to hold float, does not mean + # we should be lenient on subsequent elements that fail to be integer. 
+ kwargs = dict(errors=errors) if errors is not None else dict() + arr = [str(-large_val if signed else large_val)] + + if multiple_elts: + arr.insert(0, large_val) + + if errors in (None, "raise"): + index = int(multiple_elts) + msg = "Integer out of range. at position {index}".format(index=index) + + with pytest.raises(ValueError, match=msg): + to_numeric(arr, **kwargs) + else: + result = to_numeric(arr, **kwargs) + + if errors == "coerce": + expected = [float(i) for i in arr] + exp_dtype = float + else: + expected = arr + exp_dtype = object + + tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) + + +@pytest.mark.parametrize( + "errors,checker", + [ + ("raise", 'Unable to parse string "fail" at position 0'), + ("ignore", lambda x: x == "fail"), + ("coerce", lambda x: np.isnan(x)), + ], +) +def test_scalar_fail(errors, checker): + scalar = "fail" + + if isinstance(checker, str): + with pytest.raises(ValueError, match=checker): + to_numeric(scalar, errors=errors) + else: + assert checker(to_numeric(scalar, errors=errors)) + + +@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]]) +def test_numeric_dtypes(data, transform_assert_equal): + transform, assert_equal = transform_assert_equal + data = transform(data) + + result = to_numeric(data) + assert_equal(result, data) + + +@pytest.mark.parametrize( + "data,exp", + [ + (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")), + (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])), + ], +) +def test_str(data, exp, transform_assert_equal): + transform, assert_equal = transform_assert_equal + result = to_numeric(transform(data)) + + expected = transform(exp) + assert_equal(result, expected) + + +def test_datetime_like(tz_naive_fixture, transform_assert_equal): + transform, assert_equal = transform_assert_equal + idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture) + + result = to_numeric(transform(idx)) + expected = transform(idx.asi8) + assert_equal(result, expected) + + +def test_timedelta(transform_assert_equal): + transform, assert_equal = transform_assert_equal + idx = pd.timedelta_range("1 days", periods=3, freq="D") + + result = to_numeric(transform(idx)) + expected = transform(idx.asi8) + assert_equal(result, expected) + + +def test_period(transform_assert_equal): + transform, assert_equal = transform_assert_equal + + idx = pd.period_range("2011-01", periods=3, freq="M", name="") + inp = transform(idx) + + if isinstance(inp, Index): + result = to_numeric(inp) + expected = transform(idx.asi8) + assert_equal(result, expected) + else: + # TODO: PeriodDtype, so support it in to_numeric. 
+ pytest.skip("Missing PeriodDtype support in to_numeric") + + +@pytest.mark.parametrize( + "errors,expected", + [ + ("raise", "Invalid object type at position 0"), + ("ignore", Series([[10.0, 2], 1.0, "apple"])), + ("coerce", Series([np.nan, 1.0, np.nan])), + ], +) +def test_non_hashable(errors, expected): + # see gh-13324 + ser = Series([[10.0, 2], 1.0, "apple"]) + + if isinstance(expected, str): + with pytest.raises(TypeError, match=expected): + to_numeric(ser, errors=errors) + else: + result = to_numeric(ser, errors=errors) + tm.assert_series_equal(result, expected) + + +def test_downcast_invalid_cast(): + # see gh-13352 + data = ["1", 2, 3] + invalid_downcast = "unsigned-integer" + msg = "invalid downcasting method provided" + + with pytest.raises(ValueError, match=msg): + to_numeric(data, downcast=invalid_downcast) + + +def test_errors_invalid_value(): + # see gh-26466 + data = ["1", 2, 3] + invalid_error_value = "invalid" + msg = "invalid error value specified" + + with pytest.raises(ValueError, match=msg): + to_numeric(data, errors=invalid_error_value) + + +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) +@pytest.mark.parametrize( + "kwargs,exp_dtype", + [ + # Basic function tests. + (dict(), np.int64), + (dict(downcast=None), np.int64), + # Support below np.float32 is rare and far between. + (dict(downcast="float"), np.dtype(np.float32).char), + # Basic dtype support. + (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])), + ], +) +def test_downcast_basic(data, kwargs, exp_dtype): + # see gh-13352 + result = to_numeric(data, **kwargs) + expected = np.array([1, 2, 3], dtype=exp_dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("signed_downcast", ["integer", "signed"]) +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) +def test_signed_downcast(data, signed_downcast): + # see gh-13352 + smallest_int_dtype = np.dtype(np.typecodes["Integer"][0]) + expected = np.array([1, 2, 3], dtype=smallest_int_dtype) + + res = to_numeric(data, downcast=signed_downcast) + tm.assert_numpy_array_equal(res, expected) + + +def test_ignore_downcast_invalid_data(): + # If we can't successfully cast the given + # data to a numeric dtype, do not bother + # with the downcast parameter. + data = ["foo", 2, 3] + expected = np.array(data, dtype=object) + + res = to_numeric(data, errors="ignore", downcast="unsigned") + tm.assert_numpy_array_equal(res, expected) + + +def test_ignore_downcast_neg_to_unsigned(): + # Cannot cast to an unsigned integer + # because we have a negative number. + data = ["-1", 2, 3] + expected = np.array([-1, 2, 3], dtype=np.int64) + + res = to_numeric(data, downcast="unsigned") + tm.assert_numpy_array_equal(res, expected) + + +@pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) +@pytest.mark.parametrize( + "data,expected", + [ + (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)), + ( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], + np.array( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64 + ), + ), + ], +) +def test_ignore_downcast_cannot_convert_float(data, expected, downcast): + # Cannot cast to an integer (signed or unsigned) + # because we have a float number. 
+ res = to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + +@pytest.mark.parametrize( + "downcast,expected_dtype", + [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)], +) +def test_downcast_not8bit(downcast, expected_dtype): + # the smallest integer dtype need not be np.(u)int8 + data = ["256", 257, 258] + + expected = np.array([256, 257, 258], dtype=expected_dtype) + res = to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + +@pytest.mark.parametrize( + "dtype,downcast,min_max", + [ + ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]), + ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]), + ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]), + ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]), + ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]), + ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]), + ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]), + ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]), + ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]), + ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]), + ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), + ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), + ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]), + ], +) +def test_downcast_limits(dtype, downcast, min_max): + # see gh-14404: test the limits of each downcast. + series = to_numeric(Series(min_max), downcast=downcast) + assert series.dtype == dtype + + +@pytest.mark.parametrize( + "ser,expected", + [ + ( + pd.Series([0, 9223372036854775808]), + pd.Series([0, 9223372036854775808], dtype=np.uint64), + ) + ], +) +def test_downcast_uint64(ser, expected): + # see gh-14422: + # BUG: to_numeric doesn't work uint64 numbers + + result = pd.to_numeric(ser, downcast="unsigned") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [200, 300, "", "NaN", 30000000000000000000], + [200, 300, np.nan, np.nan, 30000000000000000000], + ), + ( + ["12345678901234567890", "1234567890", "ITEM"], + [12345678901234567890, 1234567890, np.nan], + ), + ], +) +def test_coerce_uint64_conflict(data, exp_data): + # see gh-17007 and gh-17125 + # + # Still returns float despite the uint64-nan conflict, + # which would normally force the casting to object. + result = to_numeric(Series(data), errors="coerce") + expected = Series(exp_data, dtype=float) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "errors,exp", + [ + ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), + ("raise", "Unable to parse string"), + ], +) +def test_non_coerce_uint64_conflict(errors, exp): + # see gh-17007 and gh-17125 + # + # For completeness. 
+ ser = Series(["12345678901234567890", "1234567890", "ITEM"]) + + if isinstance(exp, str): + with pytest.raises(ValueError, match=exp): + to_numeric(ser, errors=errors) + else: + result = to_numeric(ser, errors=errors) + tm.assert_series_equal(result, ser) diff --git a/venv/Lib/site-packages/pandas/tests/tseries/__init__.py b/venv/Lib/site-packages/pandas/tests/tseries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tseries/frequencies/__init__.py b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_freq_code.py b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_freq_code.py new file mode 100644 index 0000000..be07f82 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_freq_code.py @@ -0,0 +1,192 @@ +import pytest + +from pandas._libs.tslibs import frequencies as libfrequencies, resolution +from pandas._libs.tslibs.frequencies import ( + FreqGroup, + _period_code_map, + get_freq, + get_freq_code, +) + +import pandas.tseries.offsets as offsets + + +@pytest.fixture(params=list(_period_code_map.items())) +def period_code_item(request): + return request.param + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("W", 4000), + ("W-MON", 4001), + ("W-FRI", 4005), + ], +) +def test_freq_code(freqstr, expected): + assert get_freq(freqstr) == expected + + +def test_freq_code_match(period_code_item): + freqstr, code = period_code_item + assert get_freq(freqstr) == code + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("A-JAN", 1000), + ("A-MAY", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("Y-JAN", 1000), + ("Y-MAY", 1000), + (offsets.YearEnd(), 1000), + (offsets.YearEnd(month=1), 1000), + (offsets.YearEnd(month=5), 1000), + ("W", 4000), + ("W-MON", 4000), + ("W-FRI", 4000), + (offsets.Week(), 4000), + (offsets.Week(weekday=1), 4000), + (offsets.Week(weekday=5), 4000), + ("T", FreqGroup.FR_MIN), + ], +) +def test_freq_group(freqstr, expected): + assert resolution.get_freq_group(freqstr) == expected + + +def test_freq_group_match(period_code_item): + freqstr, code = period_code_item + + str_group = resolution.get_freq_group(freqstr) + code_group = resolution.get_freq_group(code) + + assert str_group == code_group == code // 1000 * 1000 + + +@pytest.mark.parametrize( + "freqstr,exp_freqstr", + [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], +) +def test_get_to_timestamp_base(freqstr, exp_freqstr): + tsb = libfrequencies.get_to_timestamp_base + + assert tsb(get_freq_code(freqstr)[0]) == get_freq_code(exp_freqstr)[0] + + +_reso = resolution.Resolution + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("H", "hour"), + ("T", "minute"), + ("S", "second"), + ("L", "millisecond"), + ("U", "microsecond"), + ("N", "nanosecond"), + ], +) +def test_get_str_from_freq(freqstr, expected): + assert _reso.get_str_from_freq(freqstr) == expected + + +@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", "T", "S", "L", "U", "N"]) +def test_get_freq_roundtrip(freq): + result = _reso.get_freq(_reso.get_str_from_freq(freq)) + assert freq == result + + +@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", 
"U"]) +def test_get_freq_roundtrip2(freq): + result = _reso.get_freq(_reso.get_str(_reso.get_reso_from_freq(freq))) + assert freq == result + + +@pytest.mark.parametrize( + "args,expected", + [ + ((1.5, "T"), (90, "S")), + ((62.4, "T"), (3744, "S")), + ((1.04, "H"), (3744, "S")), + ((1, "D"), (1, "D")), + ((0.342931, "H"), (1234551600, "U")), + ((1.2345, "D"), (106660800, "L")), + ], +) +def test_resolution_bumping(args, expected): + # see gh-14378 + assert _reso.get_stride_from_decimal(*args) == expected + + +@pytest.mark.parametrize( + "args", + [ + (0.5, "N"), + # Too much precision in the input can prevent. + (0.3429324798798269273987982, "H"), + ], +) +def test_cat(args): + msg = "Could not convert to integer offset at any resolution" + + with pytest.raises(ValueError, match=msg): + _reso.get_stride_from_decimal(*args) + + +@pytest.mark.parametrize( + "freq_input,expected", + [ + # Frequency string. + ("A", (get_freq("A"), 1)), + ("3D", (get_freq("D"), 3)), + ("-2M", (get_freq("M"), -2)), + # Tuple. + (("D", 1), (get_freq("D"), 1)), + (("A", 3), (get_freq("A"), 3)), + (("M", -2), (get_freq("M"), -2)), + ((5, "T"), (FreqGroup.FR_MIN, 5)), + # Numeric Tuple. + ((1000, 1), (1000, 1)), + # Offsets. + (offsets.Day(), (get_freq("D"), 1)), + (offsets.Day(3), (get_freq("D"), 3)), + (offsets.Day(-2), (get_freq("D"), -2)), + (offsets.MonthEnd(), (get_freq("M"), 1)), + (offsets.MonthEnd(3), (get_freq("M"), 3)), + (offsets.MonthEnd(-2), (get_freq("M"), -2)), + (offsets.Week(), (get_freq("W"), 1)), + (offsets.Week(3), (get_freq("W"), 3)), + (offsets.Week(-2), (get_freq("W"), -2)), + (offsets.Hour(), (FreqGroup.FR_HR, 1)), + # Monday is weekday=0. + (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), + (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), + (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), + ], +) +def test_get_freq_code(freq_input, expected): + assert get_freq_code(freq_input) == expected + + +def test_get_code_invalid(): + with pytest.raises(ValueError, match="Invalid frequency"): + get_freq_code((5, "baz")) diff --git a/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_inference.py b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_inference.py new file mode 100644 index 0000000..c466041 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_inference.py @@ -0,0 +1,535 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas._libs.tslibs.ccalendar import DAYS, MONTHS +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas.compat import is_platform_windows + +from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range +import pandas._testing as tm +from pandas.core.tools.datetimes import to_datetime + +import pandas.tseries.frequencies as frequencies +import pandas.tseries.offsets as offsets + + +def _check_generated_range(start, periods, freq): + """ + Check the range generated from a given start, frequency, and period count. + + Parameters + ---------- + start : str + The start date. + periods : int + The number of periods. + freq : str + The frequency of the range. 
+ """ + freq = freq.upper() + + gen = date_range(start, periods=periods, freq=freq) + index = DatetimeIndex(gen.values) + + if not freq.startswith("Q-"): + assert frequencies.infer_freq(index) == gen.freqstr + else: + inf_freq = frequencies.infer_freq(index) + is_dec_range = inf_freq == "Q-DEC" and gen.freqstr in ( + "Q", + "Q-DEC", + "Q-SEP", + "Q-JUN", + "Q-MAR", + ) + is_nov_range = inf_freq == "Q-NOV" and gen.freqstr in ( + "Q-NOV", + "Q-AUG", + "Q-MAY", + "Q-FEB", + ) + is_oct_range = inf_freq == "Q-OCT" and gen.freqstr in ( + "Q-OCT", + "Q-JUL", + "Q-APR", + "Q-JAN", + ) + assert is_dec_range or is_nov_range or is_oct_range + + +@pytest.fixture( + params=[ + (timedelta(1), "D"), + (timedelta(hours=1), "H"), + (timedelta(minutes=1), "T"), + (timedelta(seconds=1), "S"), + (np.timedelta64(1, "ns"), "N"), + (timedelta(microseconds=1), "U"), + (timedelta(microseconds=1000), "L"), + ] +) +def base_delta_code_pair(request): + return request.param + + +@pytest.fixture(params=[1, 2, 3, 4]) +def count(request): + return request.param + + +@pytest.fixture(params=DAYS) +def day(request): + return request.param + + +@pytest.fixture(params=MONTHS) +def month(request): + return request.param + + +@pytest.fixture(params=[5, 7]) +def periods(request): + return request.param + + +def test_raise_if_period_index(): + index = period_range(start="1/1/1990", periods=20, freq="M") + msg = "Check the `freq` attribute instead of using infer_freq" + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(index) + + +def test_raise_if_too_few(): + index = DatetimeIndex(["12/31/1998", "1/3/1999"]) + msg = "Need at least 3 dates to infer frequency" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(index) + + +def test_business_daily(): + index = DatetimeIndex(["01/01/1999", "1/4/1999", "1/5/1999"]) + assert frequencies.infer_freq(index) == "B" + + +def test_business_daily_look_alike(): + # see gh-16624 + # + # Do not infer "B when "weekend" (2-day gap) in wrong place. + index = DatetimeIndex(["12/31/1998", "1/3/1999", "1/4/1999"]) + assert frequencies.infer_freq(index) is None + + +def test_day_corner(): + index = DatetimeIndex(["1/1/2000", "1/2/2000", "1/3/2000"]) + assert frequencies.infer_freq(index) == "D" + + +def test_non_datetime_index(): + dates = to_datetime(["1/1/2000", "1/2/2000", "1/3/2000"]) + assert frequencies.infer_freq(dates) == "D" + + +def test_fifth_week_of_month_infer(): + # see gh-9425 + # + # Only attempt to infer up to WOM-4. + index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) + assert frequencies.infer_freq(index) is None + + +def test_week_of_month_fake(): + # All of these dates are on same day + # of week and are 4 or 5 weeks apart. + index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29", "2013-11-26"]) + assert frequencies.infer_freq(index) != "WOM-4TUE" + + +def test_fifth_week_of_month(): + # see gh-9425 + # + # Only supports freq up to WOM-4. 
+ msg = ( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) + + with pytest.raises(ValueError, match=msg): + date_range("2014-01-01", freq="WOM-5MON") + + +def test_monthly_ambiguous(): + rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"]) + assert rng.inferred_freq == "M" + + +def test_annual_ambiguous(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + assert rng.inferred_freq == "A-JAN" + + +def test_infer_freq_delta(base_delta_code_pair, count): + b = Timestamp(datetime.now()) + base_delta, code = base_delta_code_pair + + inc = base_delta * count + index = DatetimeIndex([b + inc * j for j in range(3)]) + + exp_freq = "{count:d}{code}".format(count=count, code=code) if count > 1 else code + assert frequencies.infer_freq(index) == exp_freq + + +@pytest.mark.parametrize( + "constructor", + [ + lambda now, delta: DatetimeIndex( + [now + delta * 7] + [now + delta * j for j in range(3)] + ), + lambda now, delta: DatetimeIndex( + [now + delta * j for j in range(3)] + [now + delta * 7] + ), + ], +) +def test_infer_freq_custom(base_delta_code_pair, constructor): + b = Timestamp(datetime.now()) + base_delta, _ = base_delta_code_pair + + index = constructor(b, base_delta) + assert frequencies.infer_freq(index) is None + + +def test_weekly_infer(periods, day): + _check_generated_range("1/1/2000", periods, "W-{day}".format(day=day)) + + +def test_week_of_month_infer(periods, day, count): + _check_generated_range( + "1/1/2000", periods, "WOM-{count}{day}".format(count=count, day=day) + ) + + +@pytest.mark.parametrize("freq", ["M", "BM", "BMS"]) +def test_monthly_infer(periods, freq): + _check_generated_range("1/1/2000", periods, "M") + + +def test_quarterly_infer(month, periods): + _check_generated_range("1/1/2000", periods, "Q-{month}".format(month=month)) + + +@pytest.mark.parametrize("annual", ["A", "BA"]) +def test_annually_infer(month, periods, annual): + _check_generated_range( + "1/1/2000", periods, "{annual}-{month}".format(annual=annual, month=month) + ) + + +@pytest.mark.parametrize( + "freq,expected", [("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT")] +) +def test_infer_freq_index(freq, expected): + rng = period_range("1959Q2", "2009Q3", freq=freq) + rng = Index(rng.to_timestamp("D", how="e").astype(object)) + + assert rng.inferred_freq == expected + + +@pytest.mark.parametrize( + "expected,dates", + list( + { + "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], + "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], + "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], + "H": [ + "2011-12-31 22:00", + "2011-12-31 23:00", + "2012-01-01 00:00", + "2012-01-01 01:00", + ], + }.items() + ), +) +def test_infer_freq_tz(tz_naive_fixture, expected, dates): + # see gh-7310 + tz = tz_naive_fixture + idx = DatetimeIndex(dates, tz=tz) + assert idx.inferred_freq == expected + + +@pytest.mark.parametrize( + "date_pair", + [ + ["2013-11-02", "2013-11-5"], # Fall DST + ["2014-03-08", "2014-03-11"], # Spring DST + ["2014-01-01", "2014-01-03"], # Regular Time + ], +) +@pytest.mark.parametrize( + "freq", ["3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N"] +) +def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): + # see gh-8772 + tz = tz_naive_fixture + idx = date_range(date_pair[0], date_pair[1], freq=freq, tz=tz) + assert 
idx.inferred_freq == freq + + +def test_infer_freq_tz_transition_custom(): + index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + "America/Chicago" + ) + assert index.inferred_freq is None + + +@pytest.mark.parametrize( + "data,expected", + [ + # Hourly freq in a day must result in "H" + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + ], + "H", + ), + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + "2014-07-01 15:00", + "2014-07-01 16:00", + "2014-07-02 09:00", + "2014-07-02 10:00", + "2014-07-02 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + "BH", + ), + ], +) +def test_infer_freq_business_hour(data, expected): + # see gh-7905 + idx = DatetimeIndex(data) + assert idx.inferred_freq == expected + + +def test_not_monotonic(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + rng = rng[::-1] + + assert rng.inferred_freq == "-1A-JAN" + + +def test_non_datetime_index2(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + vals = rng.to_pydatetime() + + result = frequencies.infer_freq(vals) + assert result == rng.inferred_freq + + +@pytest.mark.parametrize( + "idx", [tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10)] +) +def test_invalid_index_types(idx): + msg = ( + "(cannot infer freq from a non-convertible)|" + "(Check the `freq` attribute instead of using infer_freq)" + ) + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(idx) + + +@pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue") +@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]) +def test_invalid_index_types_unicode(idx): + # see gh-10822 + # + # Odd error message on conversions to datetime for unicode. 
+ msg = "Unknown string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(idx) + + +def test_string_datetime_like_compat(): + # see gh-6463 + data = ["2004-01", "2004-02", "2004-03", "2004-04"] + + expected = frequencies.infer_freq(data) + result = frequencies.infer_freq(Index(data)) + + assert result == expected + + +def test_series(): + # see gh-6407 + s = Series(date_range("20130101", "20130110")) + inferred = frequencies.infer_freq(s) + assert inferred == "D" + + +@pytest.mark.parametrize("end", [10, 10.0]) +def test_series_invalid_type(end): + # see gh-6407 + msg = "cannot infer freq from a non-convertible dtype on a Series" + s = Series(np.arange(end)) + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(s) + + +def test_series_inconvertible_string(): + # see gh-6407 + msg = "Unknown string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) + + +@pytest.mark.parametrize("freq", [None, "L"]) +def test_series_period_index(freq): + # see gh-6407 + # + # Cannot infer on PeriodIndex + msg = "cannot infer freq from a non-convertible dtype on a Series" + s = Series(period_range("2013", periods=10, freq=freq)) + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(s) + + +@pytest.mark.parametrize("freq", ["M", "L", "S"]) +def test_series_datetime_index(freq): + s = Series(date_range("20130101", periods=10, freq=freq)) + inferred = frequencies.infer_freq(s) + assert inferred == freq + + +@pytest.mark.parametrize( + "offset_func", + [ + frequencies._get_offset, + lambda freq: date_range("2011-01-01", periods=5, freq=freq), + ], +) +@pytest.mark.parametrize( + "freq", + [ + "WEEKDAY", + "EOM", + "W@MON", + "W@TUE", + "W@WED", + "W@THU", + "W@FRI", + "W@SAT", + "W@SUN", + "Q@JAN", + "Q@FEB", + "Q@MAR", + "A@JAN", + "A@FEB", + "A@MAR", + "A@APR", + "A@MAY", + "A@JUN", + "A@JUL", + "A@AUG", + "A@SEP", + "A@OCT", + "A@NOV", + "A@DEC", + "Y@JAN", + "WOM@1MON", + "WOM@2MON", + "WOM@3MON", + "WOM@4MON", + "WOM@1TUE", + "WOM@2TUE", + "WOM@3TUE", + "WOM@4TUE", + "WOM@1WED", + "WOM@2WED", + "WOM@3WED", + "WOM@4WED", + "WOM@1THU", + "WOM@2THU", + "WOM@3THU", + "WOM@4THU", + "WOM@1FRI", + "WOM@2FRI", + "WOM@3FRI", + "WOM@4FRI", + ], +) +def test_legacy_offset_warnings(offset_func, freq): + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + offset_func(freq) + + +def test_ms_vs_capital_ms(): + left = frequencies._get_offset("ms") + right = frequencies._get_offset("MS") + + assert left == offsets.Milli() + assert right == offsets.MonthBegin() diff --git a/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_to_offset.py b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_to_offset.py new file mode 100644 index 0000000..b6069c4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/frequencies/test_to_offset.py @@ -0,0 +1,176 @@ +import re + +import pytest + +from pandas import Timedelta + +import pandas.tseries.frequencies as frequencies +import pandas.tseries.offsets as offsets + + +@pytest.mark.parametrize( + "freq_input,expected", + [ + (frequencies.to_offset("10us"), offsets.Micro(10)), + (offsets.Hour(), offsets.Hour()), + ((5, "T"), offsets.Minute(5)), + ("2h30min", offsets.Minute(150)), + ("2h 30min", offsets.Minute(150)), + ("2h30min15s", offsets.Second(150 * 60 + 15)), + ("2h 60min", offsets.Hour(3)), + ("2h 20.5min", offsets.Second(8430)), + ("1.5min", offsets.Second(90)), + ("0.5S", offsets.Milli(500)), + ("15l500u", offsets.Micro(15500)), + 
("10s75L", offsets.Milli(10075)), + ("1s0.25ms", offsets.Micro(1000250)), + ("1s0.25L", offsets.Micro(1000250)), + ("2800N", offsets.Nano(2800)), + ("2SM", offsets.SemiMonthEnd(2)), + ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), + ("2SMS-15", offsets.SemiMonthBegin(2)), + ], +) +def test_to_offset(freq_input, expected): + result = frequencies.to_offset(freq_input) + assert result == expected + + +@pytest.mark.parametrize( + "freqstr,expected", [("-1S", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] +) +def test_to_offset_negative(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize( + "freqstr", + [ + "2h20m", + "U1", + "-U", + "3U1", + "-2-3U", + "-2D:3H", + "1.5.0S", + "2SMS-15-15", + "2SMS-15D", + "100foo", + # Invalid leading +/- signs. + "+-1d", + "-+1h", + "+1", + "-7", + "+d", + "-m", + # Invalid shortcut anchors. + "SM-0", + "SM-28", + "SM-29", + "SM-FOO", + "BSM", + "SM--1", + "SMS-1", + "SMS-28", + "SMS-30", + "SMS-BAR", + "SMS-BYR", + "BSMS", + "SMS--2", + ], +) +def test_to_offset_invalid(freqstr): + # see gh-13930 + + # We escape string because some of our + # inputs contain regex special characters. + msg = re.escape("Invalid frequency: {freqstr}".format(freqstr=freqstr)) + with pytest.raises(ValueError, match=msg): + frequencies.to_offset(freqstr) + + +def test_to_offset_no_evaluate(): + with pytest.raises(ValueError, match="Could not evaluate"): + frequencies.to_offset(("", "")) + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("2D 3H", offsets.Hour(51)), + ("2 D3 H", offsets.Hour(51)), + ("2 D 3 H", offsets.Hour(51)), + (" 2 D 3 H ", offsets.Hour(51)), + (" H ", offsets.Hour()), + (" 3 H ", offsets.Hour(3)), + ], +) +def test_to_offset_whitespace(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result == expected + + +@pytest.mark.parametrize( + "freqstr,expected", [("00H 00T 01S", 1), ("-00H 03T 14S", -194)] +) +def test_to_offset_leading_zero(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) +def test_to_offset_leading_plus(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(days=1, seconds=1), offsets.Second(86401)), + (dict(days=-1, seconds=1), offsets.Second(-86399)), + (dict(hours=1, minutes=10), offsets.Minute(70)), + (dict(hours=1, minutes=-10), offsets.Minute(50)), + (dict(weeks=1), offsets.Day(7)), + (dict(hours=1), offsets.Hour(1)), + (dict(hours=1), frequencies.to_offset("60min")), + (dict(microseconds=1), offsets.Micro(1)), + ], +) +def test_to_offset_pd_timedelta(kwargs, expected): + # see gh-9064 + td = Timedelta(**kwargs) + result = frequencies.to_offset(td) + assert result == expected + + +def test_to_offset_pd_timedelta_invalid(): + # see gh-9064 + msg = "Invalid frequency: 0 days 00:00:00" + td = Timedelta(microseconds=0) + + with pytest.raises(ValueError, match=msg): + frequencies.to_offset(td) + + +@pytest.mark.parametrize( + "shortcut,expected", + [ + ("W", offsets.Week(weekday=6)), + ("W-SUN", offsets.Week(weekday=6)), + ("Q", offsets.QuarterEnd(startingMonth=12)), + ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), + ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), + ("SM", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-15", 
offsets.SemiMonthEnd(day_of_month=15)), + ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), + ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), + ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), + ], +) +def test_anchored_shortcuts(shortcut, expected): + result = frequencies.to_offset(shortcut) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/tseries/holiday/__init__.py b/venv/Lib/site-packages/pandas/tests/tseries/holiday/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_calendar.py b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_calendar.py new file mode 100644 index 0000000..5b4a7c7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_calendar.py @@ -0,0 +1,100 @@ +from datetime import datetime + +import pytest + +from pandas import DatetimeIndex, offsets, to_datetime +import pandas._testing as tm + +from pandas.tseries.holiday import ( + AbstractHolidayCalendar, + Holiday, + Timestamp, + USFederalHolidayCalendar, + USLaborDay, + USThanksgivingDay, + get_calendar, +) + + +@pytest.mark.parametrize( + "transform", [lambda x: x, lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) +def test_calendar(transform): + start_date = datetime(2012, 1, 1) + end_date = datetime(2012, 12, 31) + + calendar = USFederalHolidayCalendar() + holidays = calendar.holidays(transform(start_date), transform(end_date)) + + expected = [ + datetime(2012, 1, 2), + datetime(2012, 1, 16), + datetime(2012, 2, 20), + datetime(2012, 5, 28), + datetime(2012, 7, 4), + datetime(2012, 9, 3), + datetime(2012, 10, 8), + datetime(2012, 11, 12), + datetime(2012, 11, 22), + datetime(2012, 12, 25), + ] + + assert list(holidays.to_pydatetime()) == expected + + +def test_calendar_caching(): + # see gh-9552. + + class TestCalendar(AbstractHolidayCalendar): + def __init__(self, name=None, rules=None): + super().__init__(name=name, rules=rules) + + jan1 = TestCalendar(rules=[Holiday("jan1", year=2015, month=1, day=1)]) + jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) + + # Getting holidays for Jan 1 should not alter results for Jan 2. + tm.assert_index_equal(jan1.holidays(), DatetimeIndex(["01-Jan-2015"])) + tm.assert_index_equal(jan2.holidays(), DatetimeIndex(["02-Jan-2015"])) + + +def test_calendar_observance_dates(): + # see gh-11477 + us_fed_cal = get_calendar("USFederalHolidayCalendar") + holidays0 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates + holidays1 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 6) + ) # <-- different start and end dates + holidays2 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates + + # These should all produce the same result. + # + # In addition, calling with different start and end + # dates should not alter the output if we call the + # function again with the same start and end date. + tm.assert_index_equal(holidays0, holidays1) + tm.assert_index_equal(holidays0, holidays2) + + +def test_rule_from_name(): + us_fed_cal = get_calendar("USFederalHolidayCalendar") + assert us_fed_cal.rule_from_name("Thanksgiving") == USThanksgivingDay + + +def test_calendar_2031(): + # See gh-27790 + # + # Labor Day 2031 is on September 1. Saturday before is August 30. + # Next working day after August 30 ought to be Tuesday, September 2. 
+ + class testCalendar(AbstractHolidayCalendar): + rules = [USLaborDay] + + cal = testCalendar() + workDay = offsets.CustomBusinessDay(calendar=cal) + Sat_before_Labor_Day_2031 = to_datetime("2031-08-30") + next_working_day = Sat_before_Labor_Day_2031 + 0 * workDay + assert next_working_day == to_datetime("2031-09-02") diff --git a/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_federal.py b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_federal.py new file mode 100644 index 0000000..64c60d4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_federal.py @@ -0,0 +1,38 @@ +from datetime import datetime + +from pandas.tseries.holiday import ( + AbstractHolidayCalendar, + USMartinLutherKingJr, + USMemorialDay, +) + + +def test_no_mlk_before_1986(): + # see gh-10278 + class MLKCalendar(AbstractHolidayCalendar): + rules = [USMartinLutherKingJr] + + holidays = MLKCalendar().holidays(start="1984", end="1988").to_pydatetime().tolist() + + # Testing to make sure holiday is not incorrectly observed before 1986. + assert holidays == [datetime(1986, 1, 20, 0, 0), datetime(1987, 1, 19, 0, 0)] + + +def test_memorial_day(): + class MemorialDay(AbstractHolidayCalendar): + rules = [USMemorialDay] + + holidays = MemorialDay().holidays(start="1971", end="1980").to_pydatetime().tolist() + + # Fixes 5/31 error and checked manually against Wikipedia. + assert holidays == [ + datetime(1971, 5, 31, 0, 0), + datetime(1972, 5, 29, 0, 0), + datetime(1973, 5, 28, 0, 0), + datetime(1974, 5, 27, 0, 0), + datetime(1975, 5, 26, 0, 0), + datetime(1976, 5, 31, 0, 0), + datetime(1977, 5, 30, 0, 0), + datetime(1978, 5, 29, 0, 0), + datetime(1979, 5, 28, 0, 0), + ] diff --git a/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_holiday.py b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_holiday.py new file mode 100644 index 0000000..a2c146d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_holiday.py @@ -0,0 +1,268 @@ +from datetime import datetime + +import pytest +from pytz import utc + +import pandas._testing as tm + +from pandas.tseries.holiday import ( + MO, + SA, + AbstractHolidayCalendar, + DateOffset, + EasterMonday, + GoodFriday, + Holiday, + HolidayCalendarFactory, + Timestamp, + USColumbusDay, + USLaborDay, + USMartinLutherKingJr, + USMemorialDay, + USPresidentsDay, + USThanksgivingDay, + get_calendar, + next_monday, +) + + +def _check_holiday_results(holiday, start, end, expected): + """ + Check that the dates for a given holiday match in date and timezone. + + Parameters + ---------- + holiday : Holiday + The holiday to check. + start : datetime-like + The start date of range in which to collect dates for a given holiday. + end : datetime-like + The end date of range in which to collect dates for a given holiday. + expected : list + The list of dates we expect to get. + """ + assert list(holiday.dates(start, end)) == expected + + # Verify that timezone info is preserved. 
+ assert list( + holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(end))) + ) == [utc.localize(dt) for dt in expected] + + +@pytest.mark.parametrize( + "holiday,start_date,end_date,expected", + [ + ( + USMemorialDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ], + ), + ( + Holiday("July 4th Eve", month=7, day=3), + "2001-01-01", + "2003-03-03", + [Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00")], + ), + ( + Holiday("July 4th Eve", month=7, day=3, days_of_week=(0, 1, 2, 3)), + "2001-01-01", + "2008-03-03", + [ + Timestamp("2001-07-03 00:00:00"), + Timestamp("2002-07-03 00:00:00"), + Timestamp("2003-07-03 00:00:00"), + Timestamp("2006-07-03 00:00:00"), + Timestamp("2007-07-03 00:00:00"), + ], + ), + ( + EasterMonday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-25 00:00:00"), + Timestamp("2012-04-09 00:00:00"), + Timestamp("2013-04-01 00:00:00"), + Timestamp("2014-04-21 00:00:00"), + Timestamp("2015-04-06 00:00:00"), + Timestamp("2016-03-28 00:00:00"), + Timestamp("2017-04-17 00:00:00"), + Timestamp("2018-04-02 00:00:00"), + Timestamp("2019-04-22 00:00:00"), + Timestamp("2020-04-13 00:00:00"), + ], + ), + ( + GoodFriday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-22 00:00:00"), + Timestamp("2012-04-06 00:00:00"), + Timestamp("2013-03-29 00:00:00"), + Timestamp("2014-04-18 00:00:00"), + Timestamp("2015-04-03 00:00:00"), + Timestamp("2016-03-25 00:00:00"), + Timestamp("2017-04-14 00:00:00"), + Timestamp("2018-03-30 00:00:00"), + Timestamp("2019-04-19 00:00:00"), + Timestamp("2020-04-10 00:00:00"), + ], + ), + ( + USThanksgivingDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 11, 24), + datetime(2012, 11, 22), + datetime(2013, 11, 28), + datetime(2014, 11, 27), + datetime(2015, 11, 26), + datetime(2016, 11, 24), + datetime(2017, 11, 23), + datetime(2018, 11, 22), + datetime(2019, 11, 28), + datetime(2020, 11, 26), + ], + ), + ], +) +def test_holiday_dates(holiday, start_date, end_date, expected): + _check_holiday_results(holiday, start_date, end_date, expected) + + +@pytest.mark.parametrize( + "holiday,start,expected", + [ + (USMemorialDay, datetime(2015, 7, 1), []), + (USMemorialDay, "2015-05-25", "2015-05-25"), + (USLaborDay, datetime(2015, 7, 1), []), + (USLaborDay, "2015-09-07", "2015-09-07"), + (USColumbusDay, datetime(2015, 7, 1), []), + (USColumbusDay, "2015-10-12", "2015-10-12"), + (USThanksgivingDay, datetime(2015, 7, 1), []), + (USThanksgivingDay, "2015-11-26", "2015-11-26"), + (USMartinLutherKingJr, datetime(2015, 7, 1), []), + (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), + (USPresidentsDay, datetime(2015, 7, 1), []), + (USPresidentsDay, "2015-02-16", "2015-02-16"), + (GoodFriday, datetime(2015, 7, 1), []), + (GoodFriday, "2015-04-03", "2015-04-03"), + (EasterMonday, "2015-04-06", "2015-04-06"), + (EasterMonday, datetime(2015, 7, 1), []), + (EasterMonday, "2015-04-05", []), + ("New Years Day", "2015-01-01", "2015-01-01"), + ("New Years Day", "2010-12-31", "2010-12-31"), + ("New Years Day", datetime(2015, 7, 1), []), + ("New Years Day", "2011-01-01", []), + ("July 4th", "2015-07-03", "2015-07-03"), + ("July 4th", datetime(2015, 7, 1), []), + ("July 4th", "2015-07-04", []), + ("Veterans Day", 
"2012-11-12", "2012-11-12"), + ("Veterans Day", datetime(2015, 7, 1), []), + ("Veterans Day", "2012-11-11", []), + ("Christmas", "2011-12-26", "2011-12-26"), + ("Christmas", datetime(2015, 7, 1), []), + ("Christmas", "2011-12-25", []), + ], +) +def test_holidays_within_dates(holiday, start, expected): + # see gh-11477 + # + # Fix holiday behavior where holiday.dates returned dates outside + # start/end date, or observed rules could not be applied because the + # holiday was not in the original date range (e.g., 7/4/2015 -> 7/3/2015). + if isinstance(holiday, str): + calendar = get_calendar("USFederalHolidayCalendar") + holiday = calendar.rule_from_name(holiday) + + if isinstance(expected, str): + expected = [Timestamp(expected)] + + _check_holiday_results(holiday, start, start, expected) + + +@pytest.mark.parametrize( + "transform", [lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) +def test_argument_types(transform): + start_date = datetime(2011, 1, 1) + end_date = datetime(2020, 12, 31) + + holidays = USThanksgivingDay.dates(start_date, end_date) + holidays2 = USThanksgivingDay.dates(transform(start_date), transform(end_date)) + tm.assert_index_equal(holidays, holidays2) + + +@pytest.mark.parametrize( + "name,kwargs", + [ + ("One-Time", dict(year=2012, month=5, day=28)), + ( + "Range", + dict( + month=5, + day=28, + start_date=datetime(2012, 1, 1), + end_date=datetime(2012, 12, 31), + offset=DateOffset(weekday=MO(1)), + ), + ), + ], +) +def test_special_holidays(name, kwargs): + base_date = [datetime(2012, 5, 28)] + holiday = Holiday(name, **kwargs) + + start_date = datetime(2011, 1, 1) + end_date = datetime(2020, 12, 31) + + assert base_date == holiday.dates(start_date, end_date) + + +def test_get_calendar(): + class TestCalendar(AbstractHolidayCalendar): + rules = [] + + calendar = get_calendar("TestCalendar") + assert TestCalendar == type(calendar) + + +def test_factory(): + class_1 = HolidayCalendarFactory( + "MemorialDay", AbstractHolidayCalendar, USMemorialDay + ) + class_2 = HolidayCalendarFactory( + "Thanksgiving", AbstractHolidayCalendar, USThanksgivingDay + ) + class_3 = HolidayCalendarFactory("Combined", class_1, class_2) + + assert len(class_1.rules) == 1 + assert len(class_2.rules) == 1 + assert len(class_3.rules) == 2 + + +def test_both_offset_observance_raises(): + # see gh-10217 + msg = "Cannot use both offset and observance" + with pytest.raises(NotImplementedError, match=msg): + Holiday( + "Cyber Monday", + month=11, + day=1, + offset=[DateOffset(weekday=SA(4))], + observance=next_monday, + ) diff --git a/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_observance.py b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_observance.py new file mode 100644 index 0000000..9ee63d2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/holiday/test_observance.py @@ -0,0 +1,87 @@ +from datetime import datetime + +import pytest + +from pandas.tseries.holiday import ( + after_nearest_workday, + before_nearest_workday, + nearest_workday, + next_monday, + next_monday_or_tuesday, + next_workday, + previous_friday, + previous_workday, + sunday_to_monday, + weekend_to_monday, +) + +_WEDNESDAY = datetime(2014, 4, 9) +_THURSDAY = datetime(2014, 4, 10) +_FRIDAY = datetime(2014, 4, 11) +_SATURDAY = datetime(2014, 4, 12) +_SUNDAY = datetime(2014, 4, 13) +_MONDAY = datetime(2014, 4, 14) +_TUESDAY = datetime(2014, 4, 15) + + +@pytest.mark.parametrize("day", [_SATURDAY, _SUNDAY]) +def test_next_monday(day): + assert next_monday(day) == _MONDAY + + 
+@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_MONDAY, _TUESDAY)] +) +def test_next_monday_or_tuesday(day, expected): + assert next_monday_or_tuesday(day) == expected + + +@pytest.mark.parametrize("day", [_SATURDAY, _SUNDAY]) +def test_previous_friday(day): + assert previous_friday(day) == _FRIDAY + + +def test_sunday_to_monday(): + assert sunday_to_monday(_SUNDAY) == _MONDAY + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) +def test_nearest_workday(day, expected): + assert nearest_workday(day) == expected + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) +def test_weekend_to_monday(day, expected): + assert weekend_to_monday(day) == expected + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _TUESDAY)] +) +def test_next_workday(day, expected): + assert next_workday(day) == expected + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) +def test_previous_workday(day, expected): + assert previous_workday(day) == expected + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _THURSDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) +def test_before_nearest_workday(day, expected): + assert before_nearest_workday(day) == expected + + +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_FRIDAY, _MONDAY)] +) +def test_after_nearest_workday(day, expected): + assert after_nearest_workday(day) == expected diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/__init__.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/common.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/common.py new file mode 100644 index 0000000..71953fd --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/common.py @@ -0,0 +1,26 @@ +""" +Assertion helpers for offsets tests +""" + + +def assert_offset_equal(offset, base, expected): + actual = offset + base + actual_swapped = base + offset + actual_apply = offset.apply(base) + try: + assert actual == expected + assert actual_swapped == expected + assert actual_apply == expected + except AssertionError: + raise AssertionError( + f"\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + f"\nAt Date: {base}" + ) + + +def assert_is_on_offset(offset, date, expected): + actual = offset.is_on_offset(date) + assert actual == expected, ( + f"\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + f"\nAt Date: {date}" + ) diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/conftest.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/conftest.py new file mode 100644 index 0000000..2f6868f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/conftest.py @@ -0,0 +1,25 @@ +import pytest + +import pandas.tseries.offsets as offsets + + +@pytest.fixture(params=[getattr(offsets, o) for o in offsets.__all__]) +def offset_types(request): + """ + Fixture for all the datetime offsets available for a time series. 
+ """ + return request.param + + +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), offsets.MonthOffset) and o != "MonthOffset" + ] +) +def month_classes(request): + """ + Fixture for month based datetime offsets available for a time series. + """ + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_fiscal.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_fiscal.py new file mode 100644 index 0000000..5686119 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_fiscal.py @@ -0,0 +1,692 @@ +""" +Tests for Fiscal Year and Fiscal Quarter offset classes +""" +from datetime import datetime + +from dateutil.relativedelta import relativedelta +import pytest + +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG + +from pandas import Timestamp +import pandas._testing as tm + +from pandas.tseries.frequencies import get_offset +from pandas.tseries.offsets import FY5253, FY5253Quarter + +from .common import assert_is_on_offset, assert_offset_equal +from .test_offsets import Base, WeekDay + + +def makeFY5253LastOfMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="last", **kwds) + + +def makeFY5253NearestEndMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="nearest", **kwds) + + +def makeFY5253NearestEndMonth(*args, **kwds): + return FY5253(*args, variation="nearest", **kwds) + + +def makeFY5253LastOfMonth(*args, **kwds): + return FY5253(*args, variation="last", **kwds) + + +def test_get_offset_name(): + assert ( + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ).freqstr + == "REQ-L-MAR-TUE-4" + ) + assert ( + makeFY5253NearestEndMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=3 + ).freqstr + == "REQ-N-MAR-TUE-3" + ) + + +def test_get_offset(): + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + with tm.assert_produces_warning(FutureWarning): + get_offset("gibberish") + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + with tm.assert_produces_warning(FutureWarning): + get_offset("QS-JAN-B") + + pairs = [ + ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), + ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), + ( + "REQ-L-MAR-TUE-4", + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ), + ), + ( + "REQ-L-DEC-MON-3", + makeFY5253LastOfMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ( + "REQ-N-DEC-MON-3", + makeFY5253NearestEndMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ] + + for name, expected in pairs: + with tm.assert_produces_warning(FutureWarning): + offset = get_offset(name) + assert offset == expected, ( + f"Expected {repr(name)} to yield {repr(expected)} " + f"(actual: {repr(offset)})" + ) + + +class TestFY5253LastOfMonth(Base): + offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, weekday=WeekDay.SAT) + offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, weekday=WeekDay.SAT) + + on_offset_cases = [ + # From Wikipedia (see: + # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end) + (offset_lom_sat_aug, datetime(2006, 8, 26), True), + (offset_lom_sat_aug, datetime(2007, 8, 25), True), + (offset_lom_sat_aug, datetime(2008, 8, 30), True), + (offset_lom_sat_aug, datetime(2009, 8, 29), True), + (offset_lom_sat_aug, 
datetime(2010, 8, 28), True), + (offset_lom_sat_aug, datetime(2011, 8, 27), True), + (offset_lom_sat_aug, datetime(2012, 8, 25), True), + (offset_lom_sat_aug, datetime(2013, 8, 31), True), + (offset_lom_sat_aug, datetime(2014, 8, 30), True), + (offset_lom_sat_aug, datetime(2015, 8, 29), True), + (offset_lom_sat_aug, datetime(2016, 8, 27), True), + (offset_lom_sat_aug, datetime(2017, 8, 26), True), + (offset_lom_sat_aug, datetime(2018, 8, 25), True), + (offset_lom_sat_aug, datetime(2019, 8, 31), True), + (offset_lom_sat_aug, datetime(2006, 8, 27), False), + (offset_lom_sat_aug, datetime(2007, 8, 28), False), + (offset_lom_sat_aug, datetime(2008, 8, 31), False), + (offset_lom_sat_aug, datetime(2009, 8, 30), False), + (offset_lom_sat_aug, datetime(2010, 8, 29), False), + (offset_lom_sat_aug, datetime(2011, 8, 28), False), + (offset_lom_sat_aug, datetime(2006, 8, 25), False), + (offset_lom_sat_aug, datetime(2007, 8, 24), False), + (offset_lom_sat_aug, datetime(2008, 8, 29), False), + (offset_lom_sat_aug, datetime(2009, 8, 28), False), + (offset_lom_sat_aug, datetime(2010, 8, 27), False), + (offset_lom_sat_aug, datetime(2011, 8, 26), False), + (offset_lom_sat_aug, datetime(2019, 8, 30), False), + # From GMCR (see for example: + # http://yahoo.brand.edgar-online.com/Default.aspx? + # companyid=3184&formtypeID=7) + (offset_lom_sat_sep, datetime(2010, 9, 25), True), + (offset_lom_sat_sep, datetime(2011, 9, 24), True), + (offset_lom_sat_sep, datetime(2012, 9, 29), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + def test_apply(self): + offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) + offset_lom_aug_sat_1 = makeFY5253LastOfMonth( + n=1, startingMonth=8, weekday=WeekDay.SAT + ) + + date_seq_lom_aug_sat = [ + datetime(2006, 8, 26), + datetime(2007, 8, 25), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 8, 27), + datetime(2012, 8, 25), + datetime(2013, 8, 31), + datetime(2014, 8, 30), + datetime(2015, 8, 29), + datetime(2016, 8, 27), + ] + + tests = [ + (offset_lom_aug_sat, date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, date_seq_lom_aug_sat), + (offset_lom_aug_sat, [datetime(2006, 8, 25)] + date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, [datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), + ( + makeFY5253LastOfMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_lom_aug_sat)), + ), + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + assert current == datum + + +class TestFY5253NearestEndMonth(Base): + def test_get_year_end(self): + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SAT + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 31) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SUN + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 9, 1) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.FRI + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 30) + + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") + assert offset_n.get_year_end(datetime(2012, 1, 1)) == datetime(2013, 1, 1) + assert offset_n.get_year_end(datetime(2012, 1, 10)) == datetime(2013, 1, 1) + + assert offset_n.get_year_end(datetime(2013, 1, 1)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 2)) 
== datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 3)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 10)) == datetime(2013, 12, 31) + + JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest") + assert JNJ.get_year_end(datetime(2006, 1, 1)) == datetime(2006, 12, 31) + + offset_lom_aug_sat = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.SAT + ) + offset_lom_aug_thu = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.THU + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") + + on_offset_cases = [ + # From Wikipedia (see: + # http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar + # #Saturday_nearest_the_end_of_month) + # 2006-09-02 2006 September 2 + # 2007-09-01 2007 September 1 + # 2008-08-30 2008 August 30 (leap year) + # 2009-08-29 2009 August 29 + # 2010-08-28 2010 August 28 + # 2011-09-03 2011 September 3 + # 2012-09-01 2012 September 1 (leap year) + # 2013-08-31 2013 August 31 + # 2014-08-30 2014 August 30 + # 2015-08-29 2015 August 29 + # 2016-09-03 2016 September 3 (leap year) + # 2017-09-02 2017 September 2 + # 2018-09-01 2018 September 1 + # 2019-08-31 2019 August 31 + (offset_lom_aug_sat, datetime(2006, 9, 2), True), + (offset_lom_aug_sat, datetime(2007, 9, 1), True), + (offset_lom_aug_sat, datetime(2008, 8, 30), True), + (offset_lom_aug_sat, datetime(2009, 8, 29), True), + (offset_lom_aug_sat, datetime(2010, 8, 28), True), + (offset_lom_aug_sat, datetime(2011, 9, 3), True), + (offset_lom_aug_sat, datetime(2016, 9, 3), True), + (offset_lom_aug_sat, datetime(2017, 9, 2), True), + (offset_lom_aug_sat, datetime(2018, 9, 1), True), + (offset_lom_aug_sat, datetime(2019, 8, 31), True), + (offset_lom_aug_sat, datetime(2006, 8, 27), False), + (offset_lom_aug_sat, datetime(2007, 8, 28), False), + (offset_lom_aug_sat, datetime(2008, 8, 31), False), + (offset_lom_aug_sat, datetime(2009, 8, 30), False), + (offset_lom_aug_sat, datetime(2010, 8, 29), False), + (offset_lom_aug_sat, datetime(2011, 8, 28), False), + (offset_lom_aug_sat, datetime(2006, 8, 25), False), + (offset_lom_aug_sat, datetime(2007, 8, 24), False), + (offset_lom_aug_sat, datetime(2008, 8, 29), False), + (offset_lom_aug_sat, datetime(2009, 8, 28), False), + (offset_lom_aug_sat, datetime(2010, 8, 27), False), + (offset_lom_aug_sat, datetime(2011, 8, 26), False), + (offset_lom_aug_sat, datetime(2019, 8, 30), False), + # From Micron, see: + # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_lom_aug_thu, datetime(2012, 8, 30), True), + (offset_lom_aug_thu, datetime(2011, 9, 1), True), + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + def test_apply(self): + date_seq_nem_8_sat = [ + datetime(2006, 9, 2), + datetime(2007, 9, 1), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 9, 3), + ] + + JNJ = [ + datetime(2005, 1, 2), + datetime(2006, 1, 1), + datetime(2006, 12, 31), + datetime(2007, 12, 30), + datetime(2008, 12, 28), + datetime(2010, 1, 3), + datetime(2011, 1, 2), + datetime(2012, 1, 1), + datetime(2012, 12, 30), + ] + + DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, variation="nearest") + + tests = [ + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + 
date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 1)] + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:], + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_nem_8_sat)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + JNJ, + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=12, weekday=WeekDay.SUN), + list(reversed(JNJ)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2005, 1, 2), datetime(2006, 1, 1)], + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2006, 1, 2), datetime(2006, 12, 31)], + ), + (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]), + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + assert current == datum + + +class TestFY5253LastOfMonthQuarter(Base): + def test_is_anchored(self): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).is_anchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + + def test_equality(self): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) == makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + def test_offset(self): + offset = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset2 = makeFY5253LastOfMonthQuarter( + 2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset4 = makeFY5253LastOfMonthQuarter( + 4, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + offset_neg1 = makeFY5253LastOfMonthQuarter( + -1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset_neg2 = makeFY5253LastOfMonthQuarter( + -2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + GMCR = [ + datetime(2010, 3, 27), + datetime(2010, 6, 26), + datetime(2010, 9, 25), + datetime(2010, 12, 25), + datetime(2011, 3, 26), + datetime(2011, 6, 25), + datetime(2011, 9, 24), + datetime(2011, 12, 24), + datetime(2012, 3, 24), + datetime(2012, 6, 23), + datetime(2012, 9, 29), + datetime(2012, 12, 29), + datetime(2013, 3, 30), + datetime(2013, 6, 29), + ] + + assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1]) + assert_offset_equal( + offset, base=GMCR[0] + relativedelta(days=-1), expected=GMCR[0] + ) + assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2]) + + assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2]) + assert_offset_equal(offset4, base=GMCR[0], 
expected=GMCR[4]) + + assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2]) + assert_offset_equal( + offset_neg1, base=GMCR[-1] + relativedelta(days=+1), expected=GMCR[-1] + ) + assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3]) + + date = GMCR[0] + relativedelta(days=-1) + for expected in GMCR: + assert_offset_equal(offset, date, expected) + date = date + offset + + date = GMCR[-1] + relativedelta(days=+1) + for expected in reversed(GMCR): + assert_offset_equal(offset_neg1, date, expected) + date = date + offset_neg1 + + lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + on_offset_cases = [ + # From Wikipedia + (lomq_aug_sat_4, datetime(2006, 8, 26), True), + (lomq_aug_sat_4, datetime(2007, 8, 25), True), + (lomq_aug_sat_4, datetime(2008, 8, 30), True), + (lomq_aug_sat_4, datetime(2009, 8, 29), True), + (lomq_aug_sat_4, datetime(2010, 8, 28), True), + (lomq_aug_sat_4, datetime(2011, 8, 27), True), + (lomq_aug_sat_4, datetime(2019, 8, 31), True), + (lomq_aug_sat_4, datetime(2006, 8, 27), False), + (lomq_aug_sat_4, datetime(2007, 8, 28), False), + (lomq_aug_sat_4, datetime(2008, 8, 31), False), + (lomq_aug_sat_4, datetime(2009, 8, 30), False), + (lomq_aug_sat_4, datetime(2010, 8, 29), False), + (lomq_aug_sat_4, datetime(2011, 8, 28), False), + (lomq_aug_sat_4, datetime(2006, 8, 25), False), + (lomq_aug_sat_4, datetime(2007, 8, 24), False), + (lomq_aug_sat_4, datetime(2008, 8, 29), False), + (lomq_aug_sat_4, datetime(2009, 8, 28), False), + (lomq_aug_sat_4, datetime(2010, 8, 27), False), + (lomq_aug_sat_4, datetime(2011, 8, 26), False), + (lomq_aug_sat_4, datetime(2019, 8, 30), False), + # From GMCR + (lomq_sep_sat_4, datetime(2010, 9, 25), True), + (lomq_sep_sat_4, datetime(2011, 9, 24), True), + (lomq_sep_sat_4, datetime(2012, 9, 29), True), + (lomq_sep_sat_4, datetime(2013, 6, 29), True), + (lomq_sep_sat_4, datetime(2012, 6, 23), True), + (lomq_sep_sat_4, datetime(2012, 6, 30), False), + (lomq_sep_sat_4, datetime(2013, 3, 30), True), + (lomq_sep_sat_4, datetime(2012, 3, 24), True), + (lomq_sep_sat_4, datetime(2012, 12, 29), True), + (lomq_sep_sat_4, datetime(2011, 12, 24), True), + # INTC (extra week in Q1) + # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844 + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 4, 2), + True, + ), + # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7 + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2012, 12, 29), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 12, 31), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2010, 12, 25), + True, + ), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + def test_year_has_extra_week(self): + # End of long Q1 + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2011, 4, 2)) + + # Start of long Q1 + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, 
weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 26)) + + # End of year before year with long Q1 + assert not makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 25)) + + for year in [ + x for x in range(1994, 2011 + 1) if x not in [2011, 2005, 2000, 1994] + ]: + assert not makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(year, 4, 2)) + + # Other long years + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2005, 4, 2)) + + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2000, 4, 2)) + + assert makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(1994, 4, 2)) + + def test_get_weeks(self): + sat_dec_1 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ) + sat_dec_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13] + assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14] + assert sat_dec_1.get_weeks(datetime(2010, 12, 25)) == [13, 13, 13, 13] + + +class TestFY5253NearestEndMonthQuarter(Base): + + offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") + + on_offset_cases = [ + # From Wikipedia + (offset_nem_sat_aug_4, datetime(2006, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2007, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2008, 8, 30), True), + (offset_nem_sat_aug_4, datetime(2009, 8, 29), True), + (offset_nem_sat_aug_4, datetime(2010, 8, 28), True), + (offset_nem_sat_aug_4, datetime(2011, 9, 3), True), + (offset_nem_sat_aug_4, datetime(2016, 9, 3), True), + (offset_nem_sat_aug_4, datetime(2017, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2018, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2019, 8, 31), True), + (offset_nem_sat_aug_4, datetime(2006, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 31), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 30), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2006, 8, 25), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 24), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 26), False), + (offset_nem_sat_aug_4, datetime(2019, 8, 30), False), + # From Micron, see: + # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_nem_thu_aug_4, datetime(2012, 8, 30), True), + (offset_nem_thu_aug_4, datetime(2011, 9, 1), True), + # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13 + (offset_nem_thu_aug_4, datetime(2013, 5, 30), True), + (offset_nem_thu_aug_4, 
datetime(2013, 2, 28), True), + (offset_nem_thu_aug_4, datetime(2012, 11, 29), True), + (offset_nem_thu_aug_4, datetime(2012, 5, 31), True), + (offset_nem_thu_aug_4, datetime(2007, 3, 1), True), + (offset_nem_thu_aug_4, datetime(1994, 3, 3), True), + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + def test_offset(self): + offset = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + + MU = [ + datetime(2012, 5, 31), + datetime(2012, 8, 30), + datetime(2012, 11, 29), + datetime(2013, 2, 28), + datetime(2013, 5, 30), + ] + + date = MU[0] + relativedelta(days=-1) + for expected in MU: + assert_offset_equal(offset, date, expected) + date = date + offset + + assert_offset_equal(offset, datetime(2012, 5, 31), datetime(2012, 8, 30)) + assert_offset_equal(offset, datetime(2012, 5, 30), datetime(2012, 5, 31)) + + offset2 = FY5253Quarter( + weekday=5, startingMonth=12, variation="last", qtr_with_extra_week=4 + ) + + assert_offset_equal(offset2, datetime(2013, 1, 15), datetime(2013, 3, 30)) + + +def test_bunched_yearends(): + # GH#14774 cases with two fiscal year-ends in the same calendar-year + fy = FY5253(n=1, weekday=5, startingMonth=12, variation="nearest") + dt = Timestamp("2004-01-01") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") + + assert fy.rollforward(dt) == Timestamp("2004-01-03") + assert fy.apply(dt) == Timestamp("2004-01-03") + assert fy + dt == Timestamp("2004-01-03") + assert dt + fy == Timestamp("2004-01-03") + + # Same thing, but starting from a Timestamp in the previous year. 
+ dt = Timestamp("2003-12-31") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") + + +def test_fy5253_last_onoffset(): + # GH#18877 dates on the year-end but not normalized to midnight + offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0) + ts = Timestamp("1984-05-28 06:29:43.955911354+0200", tz="Europe/San_Marino") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + +def test_fy5253_nearest_onoffset(): + # GH#18877 dates on the year-end but not normalized to midnight + offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2) + ts = Timestamp("2032-07-28 00:12:59.035729419+0000", tz="Africa/Dakar") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + +def test_fy5253qtr_onoffset_nearest(): + # GH#19036 + ts = Timestamp("1985-09-02 23:57:46.232550356-0300", tz="Atlantic/Bermuda") + offset = FY5253Quarter( + n=3, qtr_with_extra_week=1, startingMonth=2, variation="nearest", weekday=0 + ) + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + +def test_fy5253qtr_onoffset_last(): + # GH#19036 + offset = FY5253Quarter( + n=-2, qtr_with_extra_week=1, startingMonth=7, variation="last", weekday=2 + ) + ts = Timestamp("2011-01-26 19:03:40.331096129+0200", tz="Africa/Windhoek") + slow = (ts + offset) - offset == ts + fast = offset.is_on_offset(ts) + assert fast == slow diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets.py new file mode 100644 index 0000000..2f00a58 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets.py @@ -0,0 +1,4415 @@ +from datetime import date, datetime, time as dt_time, timedelta +from typing import Dict, List, Optional, Tuple, Type + +import numpy as np +import pytest + +from pandas._libs.tslibs import ( + NaT, + OutOfBoundsDatetime, + Timestamp, + conversion, + timezones, +) +from pandas._libs.tslibs.frequencies import ( + INVALID_FREQ_ERR_MSG, + get_freq_code, + get_freq_str, +) +import pandas._libs.tslibs.offsets as liboffsets +from pandas._libs.tslibs.offsets import ApplyTypeError +import pandas.compat as compat +from pandas.compat.numpy import np_datetime64_compat +from pandas.errors import PerformanceWarning + +import pandas._testing as tm +from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.series import Series + +from pandas.io.pickle import read_pickle +from pandas.tseries.frequencies import _get_offset, _offset_map +from pandas.tseries.holiday import USFederalHolidayCalendar +import pandas.tseries.offsets as offsets +from pandas.tseries.offsets import ( + FY5253, + BaseOffset, + BDay, + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BusinessHour, + BYearBegin, + BYearEnd, + CBMonthBegin, + CBMonthEnd, + CDay, + CustomBusinessDay, + CustomBusinessHour, + CustomBusinessMonthBegin, + CustomBusinessMonthEnd, + DateOffset, + Day, + Easter, + FY5253Quarter, + LastWeekOfMonth, + MonthBegin, + MonthEnd, + Nano, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Tick, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) + +from .common import assert_is_on_offset, assert_offset_equal + + +class WeekDay: + # TODO: Remove: This is not used outside of tests + MON = 0 + TUE = 1 + WED = 2 + THU = 3 + FRI = 4 + SAT = 5 + SUN = 6 + + +##### +# 
DateOffset Tests +##### +_ApplyCases = List[Tuple[BaseOffset, Dict[datetime, datetime]]] + + +class Base: + _offset: Optional[Type[DateOffset]] = None + d = Timestamp(datetime(2008, 1, 2)) + + timezones = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Tokyo", + "dateutil/US/Pacific", + ] + + def _get_offset(self, klass, value=1, normalize=False): + # create instance from offset class + if klass is FY5253: + klass = klass( + n=value, + startingMonth=1, + weekday=1, + variation="last", + normalize=normalize, + ) + elif klass is FY5253Quarter: + klass = klass( + n=value, + startingMonth=1, + weekday=1, + qtr_with_extra_week=1, + variation="last", + normalize=normalize, + ) + elif klass is LastWeekOfMonth: + klass = klass(n=value, weekday=5, normalize=normalize) + elif klass is WeekOfMonth: + klass = klass(n=value, week=1, weekday=5, normalize=normalize) + elif klass is Week: + klass = klass(n=value, weekday=5, normalize=normalize) + elif klass is DateOffset: + klass = klass(days=value, normalize=normalize) + else: + klass = klass(value, normalize=normalize) + return klass + + def test_apply_out_of_range(self, tz_naive_fixture): + tz = tz_naive_fixture + if self._offset is None: + return + + # try to create an out-of-bounds result timestamp; if we can't create + # the offset skip + try: + if self._offset in (BusinessHour, CustomBusinessHour): + # Using 10000 in BusinessHour fails in tz check because of DST + # difference + offset = self._get_offset(self._offset, value=100000) + else: + offset = self._get_offset(self._offset, value=10000) + + result = Timestamp("20080101") + offset + assert isinstance(result, datetime) + assert result.tzinfo is None + + # Check tz is preserved + t = Timestamp("20080101", tz=tz) + result = t + offset + assert isinstance(result, datetime) + assert t.tzinfo == result.tzinfo + + except OutOfBoundsDatetime: + pass + except (ValueError, KeyError): + # we are creating an invalid offset + # so ignore + pass + + def test_offsets_compare_equal(self): + # root cause of GH#456: __ne__ was not implemented + if self._offset is None: + return + offset1 = self._offset() + offset2 = self._offset() + assert not offset1 != offset2 + assert offset1 == offset2 + + def test_rsub(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + assert self.d - self.offset2 == (-self.offset2).apply(self.d) + + def test_radd(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + assert self.d + self.offset2 == self.offset2 + self.d + + def test_sub(self): + if self._offset is None or not hasattr(self, "offset2"): + # i.e. skip for TestCommon and YQM subclasses that do not have + # offset2 attr + return + off = self.offset2 + msg = "Cannot subtract datetime from offset" + with pytest.raises(TypeError, match=msg): + off - self.d + + assert 2 * off - off == off + assert self.d - self.offset2 == self.d + self._offset(-2) + assert self.d - self.offset2 == self.d - (2 * off - off) + + def testMult1(self): + if self._offset is None or not hasattr(self, "offset1"): + # i.e. 
skip for TestCommon and YQM subclasses that do not have + # offset1 attr + return + assert self.d + 10 * self.offset1 == self.d + self._offset(10) + assert self.d + 5 * self.offset1 == self.d + self._offset(5) + + def testMult2(self): + if self._offset is None: + return + assert self.d + (-5 * self._offset(-10)) == self.d + self._offset(50) + assert self.d + (-3 * self._offset(-2)) == self.d + self._offset(6) + + def test_compare_str(self): + # GH#23524 + # comparing to strings that cannot be cast to DateOffsets should + # not raise for __eq__ or __ne__ + if self._offset is None: + return + off = self._get_offset(self._offset) + + assert not off == "infer" + assert off != "foo" + # Note: inequalities are only implemented for Tick subclasses; + # tests for this are in test_ticks + + +class TestCommon(Base): + # exected value created by Base._get_offset + # are applied to 2011/01/01 09:00 (Saturday) + # used for .apply and .rollforward + expecteds = { + "Day": Timestamp("2011-01-02 09:00:00"), + "DateOffset": Timestamp("2011-01-02 09:00:00"), + "BusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthBegin": Timestamp("2011-02-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthEnd": Timestamp("2011-01-31 09:00:00"), + "SemiMonthEnd": Timestamp("2011-01-15 09:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 09:00:00"), + "BusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "YearBegin": Timestamp("2012-01-01 09:00:00"), + "BYearBegin": Timestamp("2011-01-03 09:00:00"), + "YearEnd": Timestamp("2011-12-31 09:00:00"), + "BYearEnd": Timestamp("2011-12-30 09:00:00"), + "QuarterBegin": Timestamp("2011-03-01 09:00:00"), + "BQuarterBegin": Timestamp("2011-03-01 09:00:00"), + "QuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BQuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BusinessHour": Timestamp("2011-01-03 10:00:00"), + "CustomBusinessHour": Timestamp("2011-01-03 10:00:00"), + "WeekOfMonth": Timestamp("2011-01-08 09:00:00"), + "LastWeekOfMonth": Timestamp("2011-01-29 09:00:00"), + "FY5253Quarter": Timestamp("2011-01-25 09:00:00"), + "FY5253": Timestamp("2011-01-25 09:00:00"), + "Week": Timestamp("2011-01-08 09:00:00"), + "Easter": Timestamp("2011-04-24 09:00:00"), + "Hour": Timestamp("2011-01-01 10:00:00"), + "Minute": Timestamp("2011-01-01 09:01:00"), + "Second": Timestamp("2011-01-01 09:00:01"), + "Milli": Timestamp("2011-01-01 09:00:00.001000"), + "Micro": Timestamp("2011-01-01 09:00:00.000001"), + "Nano": Timestamp(np_datetime64_compat("2011-01-01T09:00:00.000000001Z")), + } + + def test_immutable(self, offset_types): + # GH#21341 check that __setattr__ raises + offset = self._get_offset(offset_types) + with pytest.raises(AttributeError): + offset.normalize = True + with pytest.raises(AttributeError): + offset.n = 91 + + def test_return_type(self, offset_types): + offset = self._get_offset(offset_types) + + # make sure that we are returning a Timestamp + result = Timestamp("20080101") + offset + assert isinstance(result, Timestamp) + + # make sure that we are returning NaT + assert NaT + offset is NaT + assert offset + NaT is NaT + + assert NaT - offset is NaT + assert (-offset).apply(NaT) is NaT + + def test_offset_n(self, offset_types): + offset = self._get_offset(offset_types) + assert offset.n == 1 + + neg_offset = offset * -1 + assert neg_offset.n == -1 + + mul_offset = 
offset * 3 + assert mul_offset.n == 3 + + def test_offset_timedelta64_arg(self, offset_types): + # check that offset._validate_n raises TypeError on a timedelt64 + # object + off = self._get_offset(offset_types) + + td64 = np.timedelta64(4567, "s") + with pytest.raises(TypeError, match="argument must be an integer"): + type(off)(n=td64, **off.kwds) + + def test_offset_mul_ndarray(self, offset_types): + off = self._get_offset(offset_types) + + expected = np.array([[off, off * 2], [off * 3, off * 4]]) + + result = np.array([[1, 2], [3, 4]]) * off + tm.assert_numpy_array_equal(result, expected) + + result = off * np.array([[1, 2], [3, 4]]) + tm.assert_numpy_array_equal(result, expected) + + def test_offset_freqstr(self, offset_types): + offset = self._get_offset(offset_types) + + freqstr = offset.freqstr + if freqstr not in ("", "", "LWOM-SAT"): + code = _get_offset(freqstr) + assert offset.rule_code == code + + def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): + + if normalize and issubclass(offset, Tick): + # normalize=True disallowed for Tick subclasses GH#21427 + return + + offset_s = self._get_offset(offset, normalize=normalize) + func = getattr(offset_s, funcname) + + result = func(dt) + assert isinstance(result, Timestamp) + assert result == expected + + result = func(Timestamp(dt)) + assert isinstance(result, Timestamp) + assert result == expected + + # see gh-14101 + exp_warning = None + ts = Timestamp(dt) + Nano(5) + + if ( + type(offset_s).__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + result = func(ts) + assert isinstance(result, Timestamp) + if normalize is False: + assert result == expected + Nano(5) + else: + assert result == expected + + if isinstance(dt, np.datetime64): + # test tz when input is datetime or Timestamp + return + + for tz in self.timezones: + expected_localize = expected.tz_localize(tz) + tz_obj = timezones.maybe_get_tz(tz) + dt_tz = conversion.localize_pydatetime(dt, tz_obj) + + result = func(dt_tz) + assert isinstance(result, Timestamp) + assert result == expected_localize + + result = func(Timestamp(dt, tz=tz)) + assert isinstance(result, Timestamp) + assert result == expected_localize + + # see gh-14101 + exp_warning = None + ts = Timestamp(dt, tz=tz) + Nano(5) + + if ( + type(offset_s).__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + result = func(ts) + assert isinstance(result, Timestamp) + if normalize is False: + assert result == expected_localize + Nano(5) + else: + assert result == expected_localize + + def test_apply(self, offset_types): + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np_datetime64_compat("2011-01-01 09:00Z") + + for dt in [sdt, ndt]: + expected = self.expecteds[offset_types.__name__] + self._check_offsetfunc_works(offset_types, "apply", dt, expected) + + expected = Timestamp(expected.date()) + self._check_offsetfunc_works( + offset_types, "apply", dt, expected, normalize=True + ) + + def test_rollforward(self, offset_types): + expecteds = self.expecteds.copy() + + # result will not be changed if the target is on the offset + no_changes = [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + 
"Milli", + "Micro", + "Nano", + "DateOffset", + ] + for n in no_changes: + expecteds[n] = Timestamp("2011/01/01 09:00") + + expecteds["BusinessHour"] = Timestamp("2011-01-03 09:00:00") + expecteds["CustomBusinessHour"] = Timestamp("2011-01-03 09:00:00") + + # but be changed when normalize=True + norm_expected = expecteds.copy() + for k in norm_expected: + norm_expected[k] = Timestamp(norm_expected[k].date()) + + normalized = { + "Day": Timestamp("2011-01-02 00:00:00"), + "DateOffset": Timestamp("2011-01-02 00:00:00"), + "MonthBegin": Timestamp("2011-02-01 00:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 00:00:00"), + "YearBegin": Timestamp("2012-01-01 00:00:00"), + "Week": Timestamp("2011-01-08 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 00:00:00"), + } + norm_expected.update(normalized) + + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np_datetime64_compat("2011-01-01 09:00Z") + + for dt in [sdt, ndt]: + expected = expecteds[offset_types.__name__] + self._check_offsetfunc_works(offset_types, "rollforward", dt, expected) + expected = norm_expected[offset_types.__name__] + self._check_offsetfunc_works( + offset_types, "rollforward", dt, expected, normalize=True + ) + + def test_rollback(self, offset_types): + expecteds = { + "BusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "MonthEnd": Timestamp("2010-12-31 09:00:00"), + "SemiMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BYearBegin": Timestamp("2010-01-01 09:00:00"), + "YearEnd": Timestamp("2010-12-31 09:00:00"), + "BYearEnd": Timestamp("2010-12-31 09:00:00"), + "QuarterBegin": Timestamp("2010-12-01 09:00:00"), + "BQuarterBegin": Timestamp("2010-12-01 09:00:00"), + "QuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BQuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessHour": Timestamp("2010-12-31 17:00:00"), + "CustomBusinessHour": Timestamp("2010-12-31 17:00:00"), + "WeekOfMonth": Timestamp("2010-12-11 09:00:00"), + "LastWeekOfMonth": Timestamp("2010-12-25 09:00:00"), + "FY5253Quarter": Timestamp("2010-10-26 09:00:00"), + "FY5253": Timestamp("2010-01-26 09:00:00"), + "Easter": Timestamp("2010-04-04 09:00:00"), + } + + # result will not be changed if the target is on the offset + for n in [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", + ]: + expecteds[n] = Timestamp("2011/01/01 09:00") + + # but be changed when normalize=True + norm_expected = expecteds.copy() + for k in norm_expected: + norm_expected[k] = Timestamp(norm_expected[k].date()) + + normalized = { + "Day": Timestamp("2010-12-31 00:00:00"), + "DateOffset": Timestamp("2010-12-31 00:00:00"), + "MonthBegin": Timestamp("2010-12-01 00:00:00"), + "SemiMonthBegin": Timestamp("2010-12-15 00:00:00"), + "YearBegin": Timestamp("2010-01-01 00:00:00"), + "Week": Timestamp("2010-12-25 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 
00:00:00"), + } + norm_expected.update(normalized) + + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np_datetime64_compat("2011-01-01 09:00Z") + + for dt in [sdt, ndt]: + expected = expecteds[offset_types.__name__] + self._check_offsetfunc_works(offset_types, "rollback", dt, expected) + + expected = norm_expected[offset_types.__name__] + self._check_offsetfunc_works( + offset_types, "rollback", dt, expected, normalize=True + ) + + def test_is_on_offset(self, offset_types): + dt = self.expecteds[offset_types.__name__] + offset_s = self._get_offset(offset_types) + assert offset_s.is_on_offset(dt) + + # when normalize=True, is_on_offset checks time is 00:00:00 + if issubclass(offset_types, Tick): + # normalize=True disallowed for Tick subclasses GH#21427 + return + offset_n = self._get_offset(offset_types, normalize=True) + assert not offset_n.is_on_offset(dt) + + if offset_types in (BusinessHour, CustomBusinessHour): + # In default BusinessHour (9:00-17:00), normalized time + # cannot be in business hour range + return + date = datetime(dt.year, dt.month, dt.day) + assert offset_n.is_on_offset(date) + + def test_add(self, offset_types, tz_naive_fixture): + tz = tz_naive_fixture + dt = datetime(2011, 1, 1, 9, 0) + + offset_s = self._get_offset(offset_types) + expected = self.expecteds[offset_types.__name__] + + result_dt = dt + offset_s + result_ts = Timestamp(dt) + offset_s + for result in [result_dt, result_ts]: + assert isinstance(result, Timestamp) + assert result == expected + + expected_localize = expected.tz_localize(tz) + result = Timestamp(dt, tz=tz) + offset_s + assert isinstance(result, Timestamp) + assert result == expected_localize + + # normalize=True, disallowed for Tick subclasses GH#21427 + if issubclass(offset_types, Tick): + return + offset_s = self._get_offset(offset_types, normalize=True) + expected = Timestamp(expected.date()) + + result_dt = dt + offset_s + result_ts = Timestamp(dt) + offset_s + for result in [result_dt, result_ts]: + assert isinstance(result, Timestamp) + assert result == expected + + expected_localize = expected.tz_localize(tz) + result = Timestamp(dt, tz=tz) + offset_s + assert isinstance(result, Timestamp) + assert result == expected_localize + + def test_add_empty_datetimeindex(self, offset_types, tz_naive_fixture): + # GH#12724, GH#30336 + offset_s = self._get_offset(offset_types) + + dti = DatetimeIndex([], tz=tz_naive_fixture) + + warn = None + if isinstance( + offset_s, + ( + Easter, + WeekOfMonth, + LastWeekOfMonth, + CustomBusinessDay, + BusinessHour, + CustomBusinessHour, + CustomBusinessMonthBegin, + CustomBusinessMonthEnd, + FY5253, + FY5253Quarter, + ), + ): + # We don't have an optimized apply_index + warn = PerformanceWarning + + with tm.assert_produces_warning(warn): + result = dti + offset_s + tm.assert_index_equal(result, dti) + with tm.assert_produces_warning(warn): + result = offset_s + dti + tm.assert_index_equal(result, dti) + + dta = dti._data + with tm.assert_produces_warning(warn): + result = dta + offset_s + tm.assert_equal(result, dta) + with tm.assert_produces_warning(warn): + result = offset_s + dta + tm.assert_equal(result, dta) + + def test_pickle_v0_15_2(self, datapath): + offsets = { + "DateOffset": DateOffset(years=1), + "MonthBegin": MonthBegin(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "Week": Week(1), + } + + pickle_path = datapath("tseries", "offsets", "data", "dateoffset_0_15_2.pickle") + # This code was executed once on v0.15.2 to generate the pickle: + # with open(pickle_path, 'wb') as f: 
pickle.dump(offsets, f)
+        #
+        tm.assert_dict_equal(offsets, read_pickle(pickle_path))
+
+    def test_onOffset_deprecated(self, offset_types):
+        # GH#30340 use idiomatic naming
+        off = self._get_offset(offset_types)
+
+        ts = Timestamp.now()
+        with tm.assert_produces_warning(FutureWarning):
+            result = off.onOffset(ts)
+
+        expected = off.is_on_offset(ts)
+        assert result == expected
+
+    def test_isAnchored_deprecated(self, offset_types):
+        # GH#30340 use idiomatic naming
+        off = self._get_offset(offset_types)
+
+        with tm.assert_produces_warning(FutureWarning):
+            result = off.isAnchored()
+
+        expected = off.is_anchored()
+        assert result == expected
+
+
+class TestDateOffset(Base):
+    def setup_method(self, method):
+        self.d = Timestamp(datetime(2008, 1, 2))
+        _offset_map.clear()
+
+    def test_repr(self):
+        repr(DateOffset())
+        repr(DateOffset(2))
+        repr(2 * DateOffset())
+        repr(2 * DateOffset(months=2))
+
+    def test_mul(self):
+        assert DateOffset(2) == 2 * DateOffset(1)
+        assert DateOffset(2) == DateOffset(1) * 2
+
+    def test_constructor(self):
+
+        assert (self.d + DateOffset(months=2)) == datetime(2008, 3, 2)
+        assert (self.d - DateOffset(months=2)) == datetime(2007, 11, 2)
+
+        assert (self.d + DateOffset(2)) == datetime(2008, 1, 4)
+
+        assert not DateOffset(2).is_anchored()
+        assert DateOffset(1).is_anchored()
+
+        d = datetime(2008, 1, 31)
+        assert (d + DateOffset(months=1)) == datetime(2008, 2, 29)
+
+    def test_copy(self):
+        assert DateOffset(months=2).copy() == DateOffset(months=2)
+
+    def test_eq(self):
+        offset1 = DateOffset(days=1)
+        offset2 = DateOffset(days=365)
+
+        assert offset1 != offset2
+
+
+class TestBusinessDay(Base):
+    _offset = BDay
+
+    def setup_method(self, method):
+        self.d = datetime(2008, 1, 1)
+
+        self.offset = BDay()
+        self.offset1 = self.offset
+        self.offset2 = BDay(2)
+
+    def test_different_normalize_equals(self):
+        # GH#21404 changed __eq__ to return False when `normalize` does not match
+        offset = self._offset()
+        offset2 = self._offset(normalize=True)
+        assert offset != offset2
+
+    def test_repr(self):
+        assert repr(self.offset) == "<BusinessDay>"
+        assert repr(self.offset2) == "<2 * BusinessDays>"
+
+        if compat.PY37:
+            expected = "<BusinessDay: offset=datetime.timedelta(days=1)>"
+        else:
+            expected = "<BusinessDay: offset=datetime.timedelta(1)>"
+        assert repr(self.offset + timedelta(1)) == expected
+
+    def test_with_offset(self):
+        offset = self.offset + timedelta(hours=2)
+
+        assert (self.d + offset) == datetime(2008, 1, 2, 2)
+
+    def test_eq(self):
+        assert self.offset2 == self.offset2
+
+    def test_mul(self):
+        pass
+
+    def test_hash(self):
+        assert hash(self.offset2) == hash(self.offset2)
+
+    def test_call(self):
+        assert self.offset2(self.d) == datetime(2008, 1, 3)
+
+    def testRollback1(self):
+        assert BDay(10).rollback(self.d) == self.d
+
+    def testRollback2(self):
+        assert BDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4)
+
+    def testRollforward1(self):
+        assert BDay(10).rollforward(self.d) == self.d
+
+    def testRollforward2(self):
+        assert BDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7)
+
+    def test_roll_date_object(self):
+        offset = BDay()
+
+        dt = date(2012, 9, 15)
+
+        result = offset.rollback(dt)
+        assert result == datetime(2012, 9, 14)
+
+        result = offset.rollforward(dt)
+        assert result == datetime(2012, 9, 17)
+
+        offset = offsets.Day()
+        result = offset.rollback(dt)
+        assert result == datetime(2012, 9, 15)
+
+        result = offset.rollforward(dt)
+        assert result == datetime(2012, 9, 15)
+
+    def test_is_on_offset(self):
+        tests = [
+            (BDay(), datetime(2008, 1, 1), True),
+            (BDay(), datetime(2008, 1, 5), False),
+        ]
+
+        for offset,
d, expected in tests: + assert_is_on_offset(offset, d, expected) + + apply_cases: _ApplyCases = [] + apply_cases.append( + ( + BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), + }, + ) + ) + + apply_cases.append( + ( + 2 * BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + BDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + BDay(10) + assert result == datetime(2012, 11, 6) + + result = dt + BDay(100) - BDay(100) + assert result == dt + + off = BDay() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 12, 23) + assert rs == xp + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2011, 12, 26) + assert rs == xp + + off = BDay() * 10 + rs = datetime(2014, 1, 5) + off # see #5890 + xp = datetime(2014, 1, 17) + assert rs == xp + + def test_apply_corner(self): + msg = "Only know how to combine business day with datetime or timedelta" + with pytest.raises(ApplyTypeError, match=msg): + BDay().apply(BMonthEnd()) + + +class TestBusinessHour(Base): + _offset = BusinessHour + + def setup_method(self, method): + self.d = datetime(2014, 7, 1, 10, 00) + + self.offset1 = BusinessHour() + self.offset2 = BusinessHour(n=3) + + self.offset3 = BusinessHour(n=-1) + self.offset4 = BusinessHour(n=-4) + + from datetime import time as dt_time + + self.offset5 = BusinessHour(start=dt_time(11, 0), end=dt_time(14, 30)) + self.offset6 = BusinessHour(start="20:00", end="05:00") + self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), end=dt_time(6, 30)) + self.offset8 = BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]) + self.offset9 = BusinessHour( + n=3, start=["09:00", "22:00"], end=["13:00", "03:00"] + ) + self.offset10 = BusinessHour( + n=-1, start=["23:00", "13:00"], end=["02:00", "17:00"] + ) + + @pytest.mark.parametrize( + "start,end,match", + [ + ( + dt_time(11, 0, 5), + "17:00", + "time data must be specified only with hour and minute", + ), + ("AAA", "17:00", "time data must match 
'%H:%M' format"), + ("14:00:05", "17:00", "time data must match '%H:%M' format"), + ([], "17:00", "Must include at least 1 start time"), + ("09:00", [], "Must include at least 1 end time"), + ( + ["09:00", "11:00"], + "17:00", + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["10:00"], + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["12:00", "20:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ( + ["12:00", "20:00"], + ["09:00", "11:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ], + ) + def test_constructor_errors(self, start, end, match): + with pytest.raises(ValueError, match=match): + BusinessHour(start=start, end=end) + + def test_different_normalize_equals(self): + # GH#21404 changed __eq__ to return False when `normalize` does not match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 + + def test_repr(self): + assert repr(self.offset1) == "" + assert repr(self.offset2) == "<3 * BusinessHours: BH=09:00-17:00>" + assert repr(self.offset3) == "<-1 * BusinessHour: BH=09:00-17:00>" + assert repr(self.offset4) == "<-4 * BusinessHours: BH=09:00-17:00>" + + assert repr(self.offset5) == "" + assert repr(self.offset6) == "" + assert repr(self.offset7) == "<-2 * BusinessHours: BH=21:30-06:30>" + assert repr(self.offset8) == "" + assert repr(self.offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>" + assert repr(self.offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>" + + def test_with_offset(self): + expected = Timestamp("2014-07-01 13:00") + + assert self.d + BusinessHour() * 3 == expected + assert self.d + BusinessHour(n=3) == expected + + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) + def test_eq_attribute(self, offset_name): + offset = getattr(self, offset_name) + assert offset == offset + + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(start="09:00"), BusinessHour()), + ( + BusinessHour(start=["23:00", "13:00"], end=["12:00", "17:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) + def test_eq(self, offset1, offset2): + assert offset1 == offset2 + + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(), BusinessHour(-1)), + (BusinessHour(start="09:00"), BusinessHour(start="09:01")), + ( + BusinessHour(start="09:00", end="17:00"), + BusinessHour(start="17:00", end="09:01"), + ), + ( + BusinessHour(start=["13:00", "23:00"], end=["18:00", "07:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) + def test_neq(self, offset1, offset2): + assert offset1 != offset2 + + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) + def test_hash(self, offset_name): + offset = getattr(self, offset_name) + assert offset == offset + + def test_call(self): + assert self.offset1(self.d) == datetime(2014, 7, 1, 11) + assert self.offset2(self.d) == datetime(2014, 7, 1, 13) + assert self.offset3(self.d) == datetime(2014, 6, 30, 17) + assert self.offset4(self.d) == datetime(2014, 6, 30, 14) + assert self.offset8(self.d) == datetime(2014, 7, 1, 11) + assert self.offset9(self.d) == datetime(2014, 7, 1, 22) + assert self.offset10(self.d) == 
datetime(2014, 7, 1, 1) + + def test_sub(self): + # we have to override test_sub here because self.offset2 is not + # defined as self._offset(2) + off = self.offset2 + msg = "Cannot subtract datetime from offset" + with pytest.raises(TypeError, match=msg): + off - self.d + assert 2 * off - off == off + + assert self.d - self.offset2 == self.d + self._offset(-3) + + def testRollback1(self): + assert self.offset1.rollback(self.d) == self.d + assert self.offset2.rollback(self.d) == self.d + assert self.offset3.rollback(self.d) == self.d + assert self.offset4.rollback(self.d) == self.d + assert self.offset5.rollback(self.d) == datetime(2014, 6, 30, 14, 30) + assert self.offset6.rollback(self.d) == datetime(2014, 7, 1, 5, 0) + assert self.offset7.rollback(self.d) == datetime(2014, 7, 1, 6, 30) + assert self.offset8.rollback(self.d) == self.d + assert self.offset9.rollback(self.d) == self.d + assert self.offset10.rollback(self.d) == datetime(2014, 7, 1, 2) + + d = datetime(2014, 7, 1, 0) + assert self.offset1.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset2.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset3.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset4.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset5.rollback(d) == datetime(2014, 6, 30, 14, 30) + assert self.offset6.rollback(d) == d + assert self.offset7.rollback(d) == d + assert self.offset8.rollback(d) == datetime(2014, 6, 30, 17) + assert self.offset9.rollback(d) == d + assert self.offset10.rollback(d) == d + + assert self._offset(5).rollback(self.d) == self.d + + def testRollback2(self): + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) + + def testRollforward1(self): + assert self.offset1.rollforward(self.d) == self.d + assert self.offset2.rollforward(self.d) == self.d + assert self.offset3.rollforward(self.d) == self.d + assert self.offset4.rollforward(self.d) == self.d + assert self.offset5.rollforward(self.d) == datetime(2014, 7, 1, 11, 0) + assert self.offset6.rollforward(self.d) == datetime(2014, 7, 1, 20, 0) + assert self.offset7.rollforward(self.d) == datetime(2014, 7, 1, 21, 30) + assert self.offset8.rollforward(self.d) == self.d + assert self.offset9.rollforward(self.d) == self.d + assert self.offset10.rollforward(self.d) == datetime(2014, 7, 1, 13) + + d = datetime(2014, 7, 1, 0) + assert self.offset1.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset3.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset4.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset5.rollforward(d) == datetime(2014, 7, 1, 11) + assert self.offset6.rollforward(d) == d + assert self.offset7.rollforward(d) == d + assert self.offset8.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset9.rollforward(d) == d + assert self.offset10.rollforward(d) == d + + assert self._offset(5).rollforward(self.d) == self.d + + def testRollforward2(self): + assert self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) + + def test_roll_date_object(self): + offset = BusinessHour() + + dt = datetime(2014, 7, 6, 15, 0) + + result = offset.rollback(dt) + assert result == datetime(2014, 7, 4, 17) + + result = offset.rollforward(dt) + assert result == datetime(2014, 7, 7, 9) + + normalize_cases = [] + normalize_cases.append( + ( + BusinessHour(normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): 
datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + normalize_cases.append( + ( + BusinessHour(-1, normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + BusinessHour(1, normalize=True, start="17:00", end="04:00"), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", normalize_cases) + def test_normalize(self, case): + offset, cases = case + for dt, expected in cases.items(): + assert offset.apply(dt) == expected + + on_offset_cases = [] + on_offset_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="10:00", end="15:00"), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + datetime(2014, 7, 1, 12, 30): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["19:00", "23:00"], end=["21:00", "05:00"]), + { + datetime(2014, 7, 1, 9, 0): False, + 
datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + datetime(2014, 7, 4, 22): False, + }, + ) + ) + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, cases = case + for dt, expected in cases.items(): + assert offset.is_on_offset(dt) == expected + + opening_time_cases = [] + # opening time should be affected by sign of n, not by n's value and + # end + opening_time_cases.append( + ( + [ + BusinessHour(), + BusinessHour(n=2), + BusinessHour(n=4), + BusinessHour(end="10:00"), + BusinessHour(n=2, end="4:00"), + BusinessHour(n=4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9), + ), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="11:15"), + BusinessHour(n=2, start="11:15"), + BusinessHour(n=3, start="11:15"), + BusinessHour(start="11:15", end="10:00"), + BusinessHour(n=2, start="11:15", end="4:00"), + BusinessHour(n=3, start="11:15", end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + 
), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1), + BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end="10:00"), + BusinessHour(n=-2, end="4:00"), + BusinessHour(n=-4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="17:00", end="05:00"), + BusinessHour(n=3, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 4, 17): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 17, 1): ( + datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1, start="17:00", end="05:00"), + BusinessHour(n=-2, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + 
datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start=["11:15", "15:00"], end=["13:00", "20:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["12:00", "20:00"]), + BusinessHour(start=["11:15", "15:00"], end=["13:00", "17:00"]), + BusinessHour(n=2, start=["11:15", "15:00"], end=["12:00", "03:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["13:00", "16:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 12): ( + datetime(2014, 7, 7, 15), + datetime(2014, 7, 7, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(n=-1, start=["17:00", "08:00"], end=["05:00", "10:00"]), + BusinessHour(n=-2, start=["08:00", "17:00"], end=["10:00", "03:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 8), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 8), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 
17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 8), + ), + }, + ) + ) + + @pytest.mark.parametrize("case", opening_time_cases) + def test_opening_time(self, case): + _offsets, cases = case + for offset in _offsets: + for dt, (exp_next, exp_prev) in cases.items(): + assert offset._next_opening_time(dt) == exp_next + assert offset._prev_opening_time(dt) == exp_prev + + apply_cases = [] + apply_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(4), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(-1), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + 
BusinessHour(-4), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=2, start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="13:00", end="16:00"), + { + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-3, start="10:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): 
datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30), + }, + ) + ) + + # long business hours (see gh-26381) + apply_cases.append( + ( + BusinessHour(n=4, start="00:00", end="23:00"), + { + datetime(2014, 7, 3, 22): datetime(2014, 7, 4, 3), + datetime(2014, 7, 4, 22): datetime(2014, 7, 7, 3), + datetime(2014, 7, 3, 22, 30): datetime(2014, 7, 4, 3, 30), + datetime(2014, 7, 3, 22, 20): datetime(2014, 7, 4, 3, 20), + datetime(2014, 7, 4, 22, 30, 30): datetime(2014, 7, 7, 3, 30, 30), + datetime(2014, 7, 4, 22, 30, 20): datetime(2014, 7, 7, 3, 30, 20), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start="00:00", end="23:00"), + { + datetime(2014, 7, 4, 3): datetime(2014, 7, 3, 22), + datetime(2014, 7, 7, 3): datetime(2014, 7, 4, 22), + datetime(2014, 7, 4, 3, 30): datetime(2014, 7, 3, 22, 30), + datetime(2014, 7, 4, 3, 20): datetime(2014, 7, 3, 22, 20), + datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), + datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20), + }, + ) + ) + + # multiple business hours + apply_cases.append( + ( + BusinessHour(start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), + # out of business hours + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) 
+ ) + + apply_cases.append( + ( + BusinessHour(n=4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 12), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start=["19:00", "03:00"], end=["01:00", "05:00"]), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), + datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + apply_large_n_cases = [] + # A week later + apply_large_n_cases.append( + ( + BusinessHour(40), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): 
datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30), + }, + ) + ) + + # 3 days and 1 hour before + apply_large_n_cases.append( + ( + BusinessHour(-25), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) + + # 5 days and 3 hours later + apply_large_n_cases.append( + ( + BusinessHour(28, start="21:00", end="02:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) + + # large n for multiple opening hours (3 days and 1 hour before) + apply_large_n_cases.append( + ( + BusinessHour(n=-25, start=["09:00", "14:00"], end=["12:00", "19:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) + + # 5 days and 3 hours later + apply_large_n_cases.append( + ( + BusinessHour(28, start=["21:00", "03:00"], end=["01:00", "04:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), + 
datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), + datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_large_n_cases) + def test_apply_large_n(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_apply_nanoseconds(self): + tests = [] + + tests.append( + ( + BusinessHour(), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 16:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + + Nano(5): Timestamp("2014-07-07 09:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + - Nano(5): Timestamp("2014-07-04 17:00") + - Nano(5), + }, + ) + ) + + tests.append( + ( + BusinessHour(-1), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 14:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + + Nano(5): Timestamp("2014-07-04 09:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + - Nano(5): Timestamp("2014-07-03 17:00") + - Nano(5), + }, + ) + ) + + for offset, cases in tests: + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_datetimeindex(self): + idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH") + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH") + expected = DatetimeIndex( + [ + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + ], + freq="BH", + ) + for idx in [idx1, idx2, idx3]: + tm.assert_index_equal(idx, expected) + + idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH") + idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH") + + expected = DatetimeIndex( + [ + "2014-07-04 15:45", + "2014-07-04 16:45", + "2014-07-07 09:45", + "2014-07-07 10:45", + "2014-07-07 11:45", + "2014-07-07 12:45", + "2014-07-07 13:45", + "2014-07-07 14:45", + "2014-07-07 15:45", + "2014-07-07 16:45", + "2014-07-08 09:45", + "2014-07-08 10:45", + ], + freq="BH", + ) + expected = idx1 + for idx in [idx1, idx2, idx3]: + tm.assert_index_equal(idx, expected) + + +class TestCustomBusinessHour(Base): + _offset = CustomBusinessHour + holidays = ["2014-06-27", datetime(2014, 6, 30), np.datetime64("2014-07-02")] + + def setup_method(self, method): + # 2014 Calendar to check custom holidays + # Sun Mon Tue Wed Thu Fri Sat + # 6/22 23 24 25 26 27 28 + # 29 30 7/1 2 3 4 5 + # 6 7 8 9 10 11 12 + self.d = datetime(2014, 7, 1, 10, 00) + self.offset1 = CustomBusinessHour(weekmask="Tue Wed Thu Fri") + + self.offset2 = CustomBusinessHour(holidays=self.holidays) + + def test_constructor_errors(self): + from datetime import time as dt_time + + with pytest.raises(ValueError): + CustomBusinessHour(start=dt_time(11, 0, 5)) + with pytest.raises(ValueError): + CustomBusinessHour(start="AAA") + with pytest.raises(ValueError): + CustomBusinessHour(start="14:00:05") + + def test_different_normalize_equals(self): + # GH#21404 changed __eq__ to return False when `normalize` does not match 
+ offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 + + def test_repr(self): + assert repr(self.offset1) == "<CustomBusinessHour: CBH=09:00-17:00>" + assert repr(self.offset2) == "<CustomBusinessHour: CBH=09:00-17:00>" + + def test_with_offset(self): + expected = Timestamp("2014-07-01 13:00") + + assert self.d + CustomBusinessHour() * 3 == expected + assert self.d + CustomBusinessHour(n=3) == expected + + def test_eq(self): + for offset in [self.offset1, self.offset2]: + assert offset == offset + + assert CustomBusinessHour() != CustomBusinessHour(-1) + assert CustomBusinessHour(start="09:00") == CustomBusinessHour() + assert CustomBusinessHour(start="09:00") != CustomBusinessHour(start="09:01") + assert CustomBusinessHour(start="09:00", end="17:00") != CustomBusinessHour( + start="17:00", end="09:01" + ) + + assert CustomBusinessHour(weekmask="Tue Wed Thu Fri") != CustomBusinessHour( + weekmask="Mon Tue Wed Thu Fri" + ) + assert CustomBusinessHour(holidays=["2014-06-27"]) != CustomBusinessHour( + holidays=["2014-06-28"] + ) + + def test_sub(self): + # override the Base.test_sub implementation because self.offset2 is + # defined differently in this class than the test expects + pass + + def test_hash(self): + assert hash(self.offset1) == hash(self.offset1) + assert hash(self.offset2) == hash(self.offset2) + + def test_call(self): + assert self.offset1(self.d) == datetime(2014, 7, 1, 11) + assert self.offset2(self.d) == datetime(2014, 7, 1, 11) + + def testRollback1(self): + assert self.offset1.rollback(self.d) == self.d + assert self.offset2.rollback(self.d) == self.d + + d = datetime(2014, 7, 1, 0) + + # 2014/07/01 is Tuesday, 06/30 is Monday(holiday) + assert self.offset1.rollback(d) == datetime(2014, 6, 27, 17) + + # 2014/6/30 and 2014/6/27 are holidays + assert self.offset2.rollback(d) == datetime(2014, 6, 26, 17) + + def testRollback2(self): + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) + + def testRollforward1(self): + assert self.offset1.rollforward(self.d) == self.d + assert self.offset2.rollforward(self.d) == self.d + + d = datetime(2014, 7, 1, 0) + assert self.offset1.rollforward(d) == datetime(2014, 7, 1, 9) + assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9) + + def testRollforward2(self): + assert self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) + + def test_roll_date_object(self): + offset = BusinessHour() + + dt = datetime(2014, 7, 6, 15, 0) + + result = offset.rollback(dt) + assert result == datetime(2014, 7, 4, 17) + + result = offset.rollforward(dt) + assert result == datetime(2014, 7, 7, 9) + + normalize_cases = [] + normalize_cases.append( + ( + CustomBusinessHour(normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 3), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour(-1, normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 26), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10):
datetime(2014, 6, 26), + datetime(2014, 7, 1, 0): datetime(2014, 6, 26), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour( + 1, normalize=True, start="17:00", end="04:00", holidays=holidays + ), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 3), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("norm_cases", normalize_cases) + def test_normalize(self, norm_cases): + offset, cases = norm_cases + for dt, expected in cases.items(): + assert offset.apply(dt) == expected + + def test_is_on_offset(self): + tests = [] + + tests.append( + ( + CustomBusinessHour(start="10:00", end="15:00", holidays=self.holidays), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + for offset, cases in tests: + for dt, expected in cases.items(): + assert offset.is_on_offset(dt) == expected + + apply_cases = [] + apply_cases.append( + ( + CustomBusinessHour(holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + CustomBusinessHour(4, holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("apply_case", apply_cases) + def test_apply(self, 
apply_case): + offset, cases = apply_case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + nano_cases = [] + nano_cases.append( + ( + CustomBusinessHour(holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 16:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + + Nano(5): Timestamp("2014-07-03 09:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + - Nano(5): Timestamp("2014-07-01 17:00") + - Nano(5), + }, + ) + ) + + nano_cases.append( + ( + CustomBusinessHour(-1, holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 14:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + + Nano(5): Timestamp("2014-07-01 09:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + - Nano(5): Timestamp("2014-06-26 17:00") + - Nano(5), + }, + ) + ) + + @pytest.mark.parametrize("nano_case", nano_cases) + def test_apply_nanoseconds(self, nano_case): + offset, cases = nano_case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + +class TestCustomBusinessDay(Base): + _offset = CDay + + def setup_method(self, method): + self.d = datetime(2008, 1, 1) + self.nd = np_datetime64_compat("2008-01-01 00:00:00Z") + + self.offset = CDay() + self.offset1 = self.offset + self.offset2 = CDay(2) + + def test_different_normalize_equals(self): + # GH#21404 changed __eq__ to return False when `normalize` does not match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 + + def test_repr(self): + assert repr(self.offset) == "<CustomBusinessDay>" + assert repr(self.offset2) == "<2 * CustomBusinessDays>" + + if compat.PY37: + expected = "<CustomBusinessDay: offset=datetime.timedelta(days=1)>" + else: + expected = "<CustomBusinessDay: offset=datetime.timedelta(1)>" + assert repr(self.offset + timedelta(1)) == expected + + def test_with_offset(self): + offset = self.offset + timedelta(hours=2) + + assert (self.d + offset) == datetime(2008, 1, 2, 2) + + def test_eq(self): + assert self.offset2 == self.offset2 + + def test_mul(self): + pass + + def test_hash(self): + assert hash(self.offset2) == hash(self.offset2) + + def test_call(self): + assert self.offset2(self.d) == datetime(2008, 1, 3) + assert self.offset2(self.nd) == datetime(2008, 1, 3) + + def testRollback1(self): + assert CDay(10).rollback(self.d) == self.d + + def testRollback2(self): + assert CDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4) + + def testRollforward1(self): + assert CDay(10).rollforward(self.d) == self.d + + def testRollforward2(self): + assert CDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7) + + def test_roll_date_object(self): + offset = CDay() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + assert result == datetime(2012, 9, 14) + + result = offset.rollforward(dt) + assert result == datetime(2012, 9, 17) + + offset = offsets.Day() + result = offset.rollback(dt) + assert result == datetime(2012, 9, 15) + + result = offset.rollforward(dt) + assert result == datetime(2012, 9, 15) + + on_offset_cases = [ + (CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, d, expected = case + assert_is_on_offset(offset, d, expected) + + apply_cases: _ApplyCases = [] + apply_cases.append( + ( + CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), +
}, + ) + ) + + apply_cases.append( + ( + 2 * CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + CDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CDay(10) + assert result == datetime(2012, 11, 6) + + result = dt + CDay(100) - CDay(100) + assert result == dt + + off = CDay() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 12, 23) + assert rs == xp + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2011, 12, 26) + assert rs == xp + + def test_apply_corner(self): + msg = ( + "Only know how to combine trading day " + "with datetime, datetime64 or timedelta" + ) + with pytest.raises(ApplyTypeError, match=msg): + CDay().apply(BMonthEnd()) + + def test_holidays(self): + # Define a TradingDay offset + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] + tday = CDay(holidays=holidays) + for year in range(2012, 2015): + dt = datetime(year, 4, 30) + xp = datetime(year, 5, 2) + rs = dt + tday + assert rs == xp + + def test_weekmask(self): + weekmask_saudi = "Sat Sun Mon Tue Wed" # Thu-Fri Weekend + weekmask_uae = "1111001" # Fri-Sat Weekend + weekmask_egypt = [1, 1, 1, 1, 0, 0, 1] # Fri-Sat Weekend + bday_saudi = CDay(weekmask=weekmask_saudi) + bday_uae = CDay(weekmask=weekmask_uae) + bday_egypt = CDay(weekmask=weekmask_egypt) + dt = datetime(2013, 5, 1) + xp_saudi = datetime(2013, 5, 4) + xp_uae = datetime(2013, 5, 2) + xp_egypt = datetime(2013, 5, 2) + assert xp_saudi == dt + bday_saudi + assert xp_uae == dt + bday_uae + assert xp_egypt == dt + bday_egypt + xp2 = datetime(2013, 5, 5) + assert xp2 == dt + 2 * bday_saudi + assert xp2 == dt + 2 * bday_uae + assert xp2 == dt + 2 * bday_egypt + + def test_weekmask_and_holidays(self): + weekmask_egypt = "Sun Mon Tue Wed Thu" # Fri-Sat Weekend + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] + bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt) + dt = datetime(2013, 4, 30) + xp_egypt = datetime(2013, 5, 5) + assert xp_egypt == dt + 2 * bday_egypt + + @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") + def test_calendar(self): + calendar = 
USFederalHolidayCalendar() + dt = datetime(2014, 1, 17) + assert_offset_equal(CDay(calendar=calendar), dt, datetime(2014, 1, 21)) + + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = tm.round_trip_pickle(obj) + assert unpickled == obj + + _check_roundtrip(self.offset) + _check_roundtrip(self.offset2) + _check_roundtrip(self.offset * 2) + + def test_pickle_compat_0_14_1(self, datapath): + hdays = [datetime(2013, 1, 1) for ele in range(4)] + pth = datapath("tseries", "offsets", "data", "cday-0.14.1.pickle") + cday0_14_1 = read_pickle(pth) + cday = CDay(holidays=hdays) + assert cday == cday0_14_1 + + +class CustomBusinessMonthBase: + def setup_method(self, method): + self.d = datetime(2008, 1, 1) + + self.offset = self._offset() + self.offset1 = self.offset + self.offset2 = self._offset(2) + + def test_eq(self): + assert self.offset2 == self.offset2 + + def test_mul(self): + pass + + def test_hash(self): + assert hash(self.offset2) == hash(self.offset2) + + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = tm.round_trip_pickle(obj) + assert unpickled == obj + + _check_roundtrip(self._offset()) + _check_roundtrip(self._offset(2)) + _check_roundtrip(self._offset() * 2) + + def test_copy(self): + # GH 17452 + off = self._offset(weekmask="Mon Wed Fri") + assert off == off.copy() + + +class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): + _offset = CBMonthEnd + + def test_different_normalize_equals(self): + # GH#21404 changed __eq__ to return False when `normalize` does not match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 + + def test_repr(self): + assert repr(self.offset) == "<CustomBusinessMonthEnd>" + assert repr(self.offset2) == "<2 * CustomBusinessMonthEnds>" + + def testCall(self): + assert self.offset2(self.d) == datetime(2008, 2, 29) + + def testRollback1(self): + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) + + def testRollback2(self): + assert CBMonthEnd(10).rollback(self.d) == datetime(2007, 12, 31) + + def testRollforward1(self): + assert CBMonthEnd(10).rollforward(self.d) == datetime(2008, 1, 31) + + def test_roll_date_object(self): + offset = CBMonthEnd() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + assert result == datetime(2012, 8, 31) + + result = offset.rollforward(dt) + assert result == datetime(2012, 9, 28) + + offset = offsets.Day() + result = offset.rollback(dt) + assert result == datetime(2012, 9, 15) + + result = offset.rollforward(dt) + assert result == datetime(2012, 9, 15) + + on_offset_cases = [ + (CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, d, expected = case + assert_is_on_offset(offset, d, expected) + + apply_cases: _ApplyCases = [] + apply_cases.append( + ( + CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31), + }, + ) + ) + + apply_cases.append( + ( +
CBMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CBMonthEnd(10) + assert result == datetime(2013, 7, 31) + + result = dt + CDay(100) - CDay(100) + assert result == dt + + off = CBMonthEnd() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 7, 29) + assert rs == xp + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2012, 5, 31) + assert rs == xp + + def test_holidays(self): + # Define a TradingDay offset + holidays = ["2012-01-31", datetime(2012, 2, 28), np.datetime64("2012-02-29")] + bm_offset = CBMonthEnd(holidays=holidays) + dt = datetime(2012, 1, 1) + assert dt + bm_offset == datetime(2012, 1, 30) + assert dt + 2 * bm_offset == datetime(2012, 2, 27) + + @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") + def test_datetimeindex(self): + from pandas.tseries.holiday import USFederalHolidayCalendar + + hcal = USFederalHolidayCalendar() + freq = CBMonthEnd(calendar=hcal) + + assert date_range(start="20120101", end="20130101", freq=freq).tolist()[ + 0 + ] == datetime(2012, 1, 31) + + +class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): + _offset = CBMonthBegin + + def test_different_normalize_equals(self): + # GH#21404 changed __eq__ to return False when `normalize` does not match + offset = self._offset() + offset2 = self._offset(normalize=True) + assert offset != offset2 + + def test_repr(self): + assert repr(self.offset) == "<CustomBusinessMonthBegin>" + assert repr(self.offset2) == "<2 * CustomBusinessMonthBegins>" + + def testCall(self): + assert self.offset2(self.d) == datetime(2008, 3, 3) + + def testRollback1(self): + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) + + def testRollback2(self): + assert CBMonthBegin(10).rollback(self.d) == datetime(2008, 1, 1) + + def testRollforward1(self): + assert CBMonthBegin(10).rollforward(self.d) == datetime(2008, 1, 1) + + def test_roll_date_object(self): + offset = CBMonthBegin() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + assert result == datetime(2012, 9, 3) + + result = offset.rollforward(dt) + assert result == datetime(2012, 10, 1) + + offset = offsets.Day() + result = offset.rollback(dt) + assert result == datetime(2012, 9, 15) + + result = offset.rollforward(dt) + assert result == datetime(2012, 9, 15) + + on_offset_cases = [ + (CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + apply_cases: _ApplyCases = [] + apply_cases.append( + ( + CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 2, 7): datetime(2008, 4, 1), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 12, 3), + datetime(2008, 2, 8): datetime(2008, 2, 1), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 11, 1), + datetime(2008, 2, 9):
datetime(2008, 1, 1), + }, + ) + ) + + apply_cases.append( + ( + CBMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 7): datetime(2008, 2, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) + def test_apply(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CBMonthBegin(10) + assert result == datetime(2013, 8, 1) + + result = dt + CDay(100) - CDay(100) + assert result == dt + + off = CBMonthBegin() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 7, 1) + assert rs == xp + + st = datetime(2011, 12, 18) + rs = st + off + + xp = datetime(2012, 6, 1) + assert rs == xp + + def test_holidays(self): + # Define a TradingDay offset + holidays = ["2012-02-01", datetime(2012, 2, 2), np.datetime64("2012-03-01")] + bm_offset = CBMonthBegin(holidays=holidays) + dt = datetime(2012, 1, 1) + + assert dt + bm_offset == datetime(2012, 1, 2) + assert dt + 2 * bm_offset == datetime(2012, 2, 3) + + @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") + def test_datetimeindex(self): + hcal = USFederalHolidayCalendar() + cbmb = CBMonthBegin(calendar=hcal) + assert date_range(start="20120101", end="20130101", freq=cbmb).tolist()[ + 0 + ] == datetime(2012, 1, 3) + + +class TestWeek(Base): + _offset = Week + d = Timestamp(datetime(2008, 1, 2)) + offset1 = _offset() + offset2 = _offset(2) + + def test_repr(self): + assert repr(Week(weekday=0)) == "<Week: weekday=0>" + assert repr(Week(n=-1, weekday=0)) == "<-1 * Week: weekday=0>" + assert repr(Week(n=-2, weekday=0)) == "<-2 * Weeks: weekday=0>" + + def test_corner(self): + with pytest.raises(ValueError): + Week(weekday=7) + + with pytest.raises(ValueError, match="Day must be"): + Week(weekday=-1) + + def test_is_anchored(self): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() + + offset_cases = [] + # not business week + offset_cases.append( + ( + Week(), + { + datetime(2008, 1, 1): datetime(2008, 1, 8), + datetime(2008, 1, 4): datetime(2008, 1, 11), + datetime(2008, 1, 5): datetime(2008, 1, 12), + datetime(2008, 1, 6): datetime(2008, 1, 13), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) + + # Mon + offset_cases.append( + ( + Week(weekday=0), + { + datetime(2007, 12, 31): datetime(2008, 1, 7), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) + + # n=0 -> roll forward. Mon + offset_cases.append( + ( + Week(0, weekday=0), + { + datetime(2007, 12, 31): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + # n=-2, Tue
+ offset_cases.append( + ( + Week(-2, weekday=1), + { + datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + @pytest.mark.parametrize("weekday", range(7)) + def test_is_on_offset(self, weekday): + offset = Week(weekday=weekday) + + for day in range(1, 8): + date = datetime(2008, 1, day) + + if day % 7 == weekday: + expected = True + else: + expected = False + assert_is_on_offset(offset, date, expected) + + +class TestWeekOfMonth(Base): + _offset = WeekOfMonth + offset1 = _offset() + offset2 = _offset(2) + + def test_constructor(self): + with pytest.raises(ValueError, match="^Week"): + WeekOfMonth(n=1, week=4, weekday=0) + + with pytest.raises(ValueError, match="^Week"): + WeekOfMonth(n=1, week=-1, weekday=0) + + with pytest.raises(ValueError, match="^Day"): + WeekOfMonth(n=1, week=0, weekday=-1) + + with pytest.raises(ValueError, match="^Day"): + WeekOfMonth(n=1, week=0, weekday=-7) + + def test_repr(self): + assert ( + repr(WeekOfMonth(weekday=1, week=2)) == "<WeekOfMonth: week=2, weekday=1>" + ) + + def test_offset(self): + date1 = datetime(2011, 1, 4) # 1st Tuesday of Month + date2 = datetime(2011, 1, 11) # 2nd Tuesday of Month + date3 = datetime(2011, 1, 18) # 3rd Tuesday of Month + date4 = datetime(2011, 1, 25) # 4th Tuesday of Month + + # see for loop for structure + test_cases = [ + (-2, 2, 1, date1, datetime(2010, 11, 16)), + (-2, 2, 1, date2, datetime(2010, 11, 16)), + (-2, 2, 1, date3, datetime(2010, 11, 16)), + (-2, 2, 1, date4, datetime(2010, 12, 21)), + (-1, 2, 1, date1, datetime(2010, 12, 21)), + (-1, 2, 1, date2, datetime(2010, 12, 21)), + (-1, 2, 1, date3, datetime(2010, 12, 21)), + (-1, 2, 1, date4, datetime(2011, 1, 18)), + (0, 0, 1, date1, datetime(2011, 1, 4)), + (0, 0, 1, date2, datetime(2011, 2, 1)), + (0, 0, 1, date3, datetime(2011, 2, 1)), + (0, 0, 1, date4, datetime(2011, 2, 1)), + (0, 1, 1, date1, datetime(2011, 1, 11)), + (0, 1, 1, date2, datetime(2011, 1, 11)), + (0, 1, 1, date3, datetime(2011, 2, 8)), + (0, 1, 1, date4, datetime(2011, 2, 8)), + (0, 0, 1, date1, datetime(2011, 1, 4)), + (0, 1, 1, date2, datetime(2011, 1, 11)), + (0, 2, 1, date3, datetime(2011, 1, 18)), + (0, 3, 1, date4, datetime(2011, 1, 25)), + (1, 0, 0, date1, datetime(2011, 2, 7)), + (1, 0, 0, date2, datetime(2011, 2, 7)), + (1, 0, 0, date3, datetime(2011, 2, 7)), + (1, 0, 0, date4, datetime(2011, 2, 7)), + (1, 0, 1, date1, datetime(2011, 2, 1)), + (1, 0, 1, date2, datetime(2011, 2, 1)), + (1, 0, 1, date3, datetime(2011, 2, 1)), + (1, 0, 1, date4, datetime(2011, 2, 1)), + (1, 0, 2, date1, datetime(2011, 1, 5)), + (1, 0, 2, date2, datetime(2011, 2, 2)), + (1, 0, 2, date3, datetime(2011, 2, 2)), + (1, 0, 2, date4, datetime(2011, 2, 2)), + (1, 2, 1, date1, datetime(2011, 1, 18)), + (1, 2, 1, date2, datetime(2011, 1, 18)), + (1, 2, 1, date3, datetime(2011, 2, 15)), + (1, 2, 1, date4, datetime(2011, 2, 15)), + (2, 2, 1, date1, datetime(2011, 2, 15)), + (2, 2, 1, date2, datetime(2011, 2, 15)), + (2, 2, 1, date3, datetime(2011, 3, 15)), + (2, 2, 1, date4, datetime(2011, 3, 15)), + ] + + for n, week, weekday, dt, expected in test_cases: + offset = WeekOfMonth(n, week=week, weekday=weekday) + assert_offset_equal(offset, dt, expected) + + # try subtracting + result = datetime(2011, 2, 1) - WeekOfMonth(week=1, weekday=2) + assert result ==
datetime(2011, 1, 12) + + result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) + assert result == datetime(2011, 2, 2) + + on_offset_cases = [ + (0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + week, weekday, dt, expected = case + offset = WeekOfMonth(week=week, weekday=weekday) + assert offset.is_on_offset(dt) == expected + + +class TestLastWeekOfMonth(Base): + _offset = LastWeekOfMonth + offset1 = _offset() + offset2 = _offset(2) + + def test_constructor(self): + with pytest.raises(ValueError, match="^N cannot be 0"): + LastWeekOfMonth(n=0, weekday=1) + + with pytest.raises(ValueError, match="^Day"): + LastWeekOfMonth(n=1, weekday=-1) + + with pytest.raises(ValueError, match="^Day"): + LastWeekOfMonth(n=1, weekday=7) + + def test_offset(self): + # Saturday + last_sat = datetime(2013, 8, 31) + next_sat = datetime(2013, 9, 28) + offset_sat = LastWeekOfMonth(n=1, weekday=5) + + one_day_before = last_sat + timedelta(days=-1) + assert one_day_before + offset_sat == last_sat + + one_day_after = last_sat + timedelta(days=+1) + assert one_day_after + offset_sat == next_sat + + # Test On that day + assert last_sat + offset_sat == next_sat + + # Thursday + + offset_thur = LastWeekOfMonth(n=1, weekday=3) + last_thurs = datetime(2013, 1, 31) + next_thurs = datetime(2013, 2, 28) + + one_day_before = last_thurs + timedelta(days=-1) + assert one_day_before + offset_thur == last_thurs + + one_day_after = last_thurs + timedelta(days=+1) + assert one_day_after + offset_thur == next_thurs + + # Test on that day + assert last_thurs + offset_thur == next_thurs + + three_before = last_thurs + timedelta(days=-3) + assert three_before + offset_thur == last_thurs + + two_after = last_thurs + timedelta(days=+2) + assert two_after + offset_thur == next_thurs + + offset_sunday = LastWeekOfMonth(n=1, weekday=WeekDay.SUN) + assert datetime(2013, 7, 31) + offset_sunday == datetime(2013, 8, 25) + + on_offset_cases = [ + (WeekDay.SUN, datetime(2013, 1, 27), True), + (WeekDay.SAT, datetime(2013, 3, 30), True), + (WeekDay.MON, datetime(2013, 2, 18), False), # Not the last Mon + (WeekDay.SUN, datetime(2013, 2, 25), False), # Not a SUN + (WeekDay.MON, datetime(2013, 2, 25), True), + (WeekDay.SAT, datetime(2013, 11, 30), True), + (WeekDay.SAT, datetime(2006, 8, 26), True), + (WeekDay.SAT, datetime(2007, 8, 25), True), + (WeekDay.SAT, datetime(2008, 8, 30), True), + (WeekDay.SAT, datetime(2009, 8, 29), True), + (WeekDay.SAT, datetime(2010, 8, 28), True), + (WeekDay.SAT, datetime(2011, 8, 27), True), + (WeekDay.SAT, datetime(2019, 8, 31), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + weekday, dt, expected = case + offset = LastWeekOfMonth(weekday=weekday) + assert offset.is_on_offset(dt) == expected + + +class TestSemiMonthEnd(Base): + _offset = SemiMonthEnd + offset1 = _offset() + offset2 = _offset(2) + + def test_offset_whole_year(self): + dates = ( + datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + 
datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31), + ) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assert_offset_equal(SemiMonthEnd(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = SemiMonthEnd().apply_index(s) + + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SM") + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + offset_cases = [] + offset_cases.append( + ( + SemiMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 20), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 20), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 16): datetime(2008, 1, 31), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 16), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 16), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 11, 30), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 30): datetime(2006, 12, 
15), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2007, 1, 4): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 2, 15), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 14): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + @pytest.mark.parametrize("case", offset_cases) + def test_apply_index(self, case): + offset, cases = case + s = DatetimeIndex(cases.keys()) + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = offset.apply_index(s) + + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + on_offset_cases = [ + (datetime(2007, 12, 31), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 1), False), + (datetime(2008, 2, 29), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + dt, expected = case + assert_is_on_offset(SemiMonthEnd(), dt, expected) + + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) + def test_vectorized_offset_addition(self, klass): + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + + exp = klass( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) + + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + +class TestSemiMonthBegin(Base): + _offset = SemiMonthBegin + offset1 = _offset() + offset2 = _offset(2) + + def test_offset_whole_year(self): + dates = ( + datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + 
datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15), + ) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assert_offset_equal(SemiMonthBegin(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = SemiMonthBegin().apply_index(s) + + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SMS") + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + offset_cases = [] + offset_cases.append( + ( + SemiMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 2): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 5): datetime(2007, 1, 16), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 15): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 12, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 6, 14): datetime(2008, 6, 1), + datetime(2008, 12, 31): datetime(2008, 12, 
15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 4), + datetime(2007, 1, 4): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2006, 12, 2): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 4), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 1), + datetime(2008, 6, 14): datetime(2008, 5, 15), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 15): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + @pytest.mark.parametrize("case", offset_cases) + def test_apply_index(self, case): + offset, cases = case + s = DatetimeIndex(cases.keys()) + + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = offset.apply_index(s) + + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + on_offset_cases = [ + (datetime(2007, 12, 1), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 31), False), + (datetime(2008, 2, 15), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + dt, expected = case + assert_is_on_offset(SemiMonthBegin(), dt, expected) + + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) + def test_vectorized_offset_addition(self, klass): + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + + exp = klass( + [ + Timestamp("2000-02-01 00:15:00", tz="US/Central"), + Timestamp("2000-03-01", tz="US/Central"), + ], + name="a", + ) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) + with tm.assert_produces_warning(None): + # GH#22535 check that we don't get a FutureWarning from adding + # an integer array to PeriodIndex + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) + tm.assert_equal(result, exp) + tm.assert_equal(result2, exp) + + +def test_Easter(): + assert_offset_equal(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)) + assert_offset_equal(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)) + assert_offset_equal(Easter(2), datetime(2010, 1, 1), datetime(2011, 4, 24)) + + assert_offset_equal(Easter(), datetime(2010, 4, 4), datetime(2011, 4, 24)) + 
assert_offset_equal(Easter(2), datetime(2010, 4, 4), datetime(2012, 4, 8)) + + assert_offset_equal(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)) + assert_offset_equal(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)) + assert_offset_equal(-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)) + + assert_offset_equal(-Easter(), datetime(2010, 4, 4), datetime(2009, 4, 12)) + assert_offset_equal(-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)) + + +class TestOffsetNames: + def test_get_offset_name(self): + assert BDay().freqstr == "B" + assert BDay(2).freqstr == "2B" + assert BMonthEnd().freqstr == "BM" + assert Week(weekday=0).freqstr == "W-MON" + assert Week(weekday=1).freqstr == "W-TUE" + assert Week(weekday=2).freqstr == "W-WED" + assert Week(weekday=3).freqstr == "W-THU" + assert Week(weekday=4).freqstr == "W-FRI" + + assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN" + + +def test_get_offset(): + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + _get_offset("gibberish") + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + _get_offset("QS-JAN-B") + + pairs = [ + ("B", BDay()), + ("b", BDay()), + ("bm", BMonthEnd()), + ("Bm", BMonthEnd()), + ("W-MON", Week(weekday=0)), + ("W-TUE", Week(weekday=1)), + ("W-WED", Week(weekday=2)), + ("W-THU", Week(weekday=3)), + ("W-FRI", Week(weekday=4)), + ] + + for name, expected in pairs: + offset = _get_offset(name) + assert offset == expected, ( + f"Expected {repr(name)} to yield {repr(expected)} " + f"(actual: {repr(offset)})" + ) + + +def test_get_offset_legacy(): + pairs = [("w@Sat", Week(weekday=5))] + for name, expected in pairs: + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + _get_offset(name) + + +class TestOffsetAliases: + def setup_method(self, method): + _offset_map.clear() + + def test_alias_equality(self): + for k, v in _offset_map.items(): + if v is None: + continue + assert k == v.copy() + + def test_rule_code(self): + lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] + for k in lst: + assert k == _get_offset(k).rule_code + # should be cached - this is kind of an internals test... + assert k in _offset_map + assert k == (_get_offset(k) * 3).rule_code + + suffix_lst = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + base = "W" + for v in suffix_lst: + alias = "-".join([base, v]) + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code + + suffix_lst = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + for base in base_lst: + for v in suffix_lst: + alias = "-".join([base, v]) + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code + + lst = ["M", "D", "B", "H", "T", "S", "L", "U"] + for k in lst: + code, stride = get_freq_code("3" + k) + assert isinstance(code, int) + assert stride == 3 + assert k == get_freq_str(code) + + +def test_dateoffset_misc(): + oset = offsets.DateOffset(months=2, days=4) + # it works + oset.freqstr + + assert not offsets.DateOffset(months=2) == 2 + + +def test_freq_offsets(): + off = BDay(1, offset=timedelta(0, 1800)) + assert off.freqstr == "B+30Min" + + off = BDay(1, offset=timedelta(0, -1800)) + assert off.freqstr == "B-30Min" + + +class TestReprNames: + def test_str_for_named_is_name(self): + # look at all the amazing combinations! 
+ month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + names = [ + prefix + "-" + month + for prefix in month_prefixes + for month in [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + ] + days = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + names += ["W-" + day for day in days] + names += ["WOM-" + week + day for week in ("1", "2", "3", "4") for day in days] + _offset_map.clear() + for name in names: + offset = _get_offset(name) + assert offset.freqstr == name + + +def get_utc_offset_hours(ts): + # take a Timestamp and compute total hours of utc offset + o = ts.utcoffset() + return (o.days * 24 * 3600 + o.seconds) / 3600.0 + + +class TestDST: + """ + test DateOffset additions over Daylight Savings Time + """ + + # one microsecond before the DST transition + ts_pre_fallback = "2013-11-03 01:59:59.999999" + ts_pre_springfwd = "2013-03-10 01:59:59.999999" + + # test both basic names and dateutil timezones + timezone_utc_offsets = { + "US/Eastern": dict(utc_offset_daylight=-4, utc_offset_standard=-5), + "dateutil/US/Pacific": dict(utc_offset_daylight=-7, utc_offset_standard=-8), + } + valid_date_offsets_singular = [ + "weekday", + "day", + "hour", + "minute", + "second", + "microsecond", + ] + valid_date_offsets_plural = [ + "weeks", + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + ] + + def _test_all_offsets(self, n, **kwds): + valid_offsets = ( + self.valid_date_offsets_plural + if n > 1 + else self.valid_date_offsets_singular + ) + + for name in valid_offsets: + self._test_offset(offset_name=name, offset_n=n, **kwds) + + def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): + offset = DateOffset(**{offset_name: offset_n}) + + t = tstart + offset + if expected_utc_offset is not None: + assert get_utc_offset_hours(t) == expected_utc_offset + + if offset_name == "weeks": + # dates should match + assert t.date() == timedelta(days=7 * offset.kwds["weeks"]) + tstart.date() + # expect the same day of week, hour of day, minute, second, ... + assert ( + t.dayofweek == tstart.dayofweek + and t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) + elif offset_name == "days": + # dates should match + assert timedelta(offset.kwds["days"]) + tstart.date() == t.date() + # expect the same hour of day, minute, second, ... 
+ assert ( + t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) + elif offset_name in self.valid_date_offsets_singular: + # expect the singular offset value to match between tstart and t + datepart_offset = getattr( + t, offset_name if offset_name != "weekday" else "dayofweek" + ) + assert datepart_offset == offset.kwds[offset_name] + else: + # the offset should be the same as if it was done in UTC + assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") + + def _make_timestamp(self, string, hrs_offset, tz): + if hrs_offset >= 0: + offset_string = f"{hrs_offset:02d}00" + else: + offset_string = f"-{(hrs_offset * -1):02}00" + return Timestamp(string + offset_string).tz_convert(tz) + + def test_springforward_plural(self): + # test moving from standard to daylight savings + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets["utc_offset_standard"] + hrs_post = utc_offsets["utc_offset_daylight"] + self._test_all_offsets( + n=3, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=hrs_post, + ) + + def test_fallback_singular(self): + # in the case of singular offsets, we don't necessarily know which utc + # offset the new Timestamp will wind up in (the tz for 1 month may be + # different from 1 second) so we don't specify an expected_utc_offset + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), + expected_utc_offset=None, + ) + + def test_springforward_singular(self): + for tz, utc_offsets in self.timezone_utc_offsets.items(): + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=None, + ) + + offset_classes = { + MonthBegin: ["11/2/2012", "12/1/2012"], + MonthEnd: ["11/2/2012", "11/30/2012"], + BMonthBegin: ["11/2/2012", "12/3/2012"], + BMonthEnd: ["11/2/2012", "11/30/2012"], + CBMonthBegin: ["11/2/2012", "12/3/2012"], + CBMonthEnd: ["11/2/2012", "11/30/2012"], + SemiMonthBegin: ["11/2/2012", "11/15/2012"], + SemiMonthEnd: ["11/2/2012", "11/15/2012"], + Week: ["11/2/2012", "11/9/2012"], + YearBegin: ["11/2/2012", "1/1/2013"], + YearEnd: ["11/2/2012", "12/31/2012"], + BYearBegin: ["11/2/2012", "1/1/2013"], + BYearEnd: ["11/2/2012", "12/31/2012"], + QuarterBegin: ["11/2/2012", "12/1/2012"], + QuarterEnd: ["11/2/2012", "12/31/2012"], + BQuarterBegin: ["11/2/2012", "12/3/2012"], + BQuarterEnd: ["11/2/2012", "12/31/2012"], + Day: ["11/4/2012", "11/4/2012 23:00"], + }.items() + + @pytest.mark.parametrize("tup", offset_classes) + def test_all_offset_classes(self, tup): + offset, test_values = tup + + first = Timestamp(test_values[0], tz="US/Eastern") + offset() + second = Timestamp(test_values[1], tz="US/Eastern") + assert first == second + + +# --------------------------------------------------------------------- +def test_get_offset_day_error(): + # subclass of _BaseOffset must override _day_opt attribute, or we should + # get a NotImplementedError + + with pytest.raises(NotImplementedError): + DateOffset()._get_offset_day(datetime.now()) + + +def test_valid_default_arguments(offset_types): + # GH#19142 check that the calling the constructors without passing + # any keyword arguments produce valid offsets + cls = offset_types + cls() + + +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) +def 
test_valid_month_attributes(kwd, month_classes): + # GH#18226 + cls = month_classes + # check that we cannot create e.g. MonthEnd(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) +def test_valid_relativedelta_kwargs(kwd): + # Check that all the arguments specified in liboffsets.relativedelta_kwds + # are in fact valid relativedelta keyword args + DateOffset(**{kwd: 1}) + + +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) +def test_valid_tick_attributes(kwd, tick_classes): + # GH#18226 + cls = tick_classes + # check that we cannot create e.g. Hour(weeks=3) + with pytest.raises(TypeError): + cls(**{kwd: 3}) + + +def test_validate_n_error(): + with pytest.raises(TypeError): + DateOffset(n="Doh!") + + with pytest.raises(TypeError): + MonthBegin(n=timedelta(1)) + + with pytest.raises(TypeError): + BDay(n=np.array([1, 2], dtype=np.int64)) + + +def test_require_integers(offset_types): + cls = offset_types + with pytest.raises(ValueError): + cls(n=1.5) + + +def test_tick_normalize_raises(tick_classes): + # check that trying to create a Tick object with normalize=True raises + # GH#21427 + cls = tick_classes + with pytest.raises(ValueError): + cls(n=3, normalize=True) + + +def test_weeks_onoffset(): + # GH#18510 Week with weekday = None, normalize = False should always + # be is_on_offset + offset = Week(n=2, weekday=None) + ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + # negative n + offset = Week(n=2, weekday=None) + ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + +def test_weekofmonth_onoffset(): + # GH#18864 + # Make sure that nanoseconds don't trip up is_on_offset (and with it apply) + offset = WeekOfMonth(n=2, week=2, weekday=0) + ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + # negative n + offset = WeekOfMonth(n=-3, week=1, weekday=0) + ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") + fast = offset.is_on_offset(ts) + slow = (ts + offset) - offset == ts + assert fast == slow + + +def test_last_week_of_month_on_offset(): + # GH#19036, GH#18977 _adjust_dst was incorrect for LastWeekOfMonth + offset = LastWeekOfMonth(n=4, weekday=6) + ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") + slow = (ts + offset) - offset == ts + fast = offset.is_on_offset(ts) + assert fast == slow + + # negative n + offset = LastWeekOfMonth(n=-4, weekday=5) + ts = Timestamp("2005-08-27 05:01:42.799392561-0500", tz="America/Rainy_River") + slow = (ts + offset) - offset == ts + fast = offset.is_on_offset(ts) + assert fast == slow + + +def test_week_add_invalid(): + # Week with weekday should raise TypeError and _not_ AttributeError + # when adding invalid offset + offset = Week(weekday=1) + other = Day() + with pytest.raises(TypeError, match="Cannot add"): + offset + other diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets_properties.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets_properties.py new file mode 100644 index 0000000..716d3ff --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -0,0 +1,140 @@ 
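# [Editor's note] Hedged, standalone sketch of the invariant that the new
# test_offsets_properties.py file starting here exercises with Hypothesis:
# for an anchored offset, is_on_offset(dt) should agree with the round trip
# (dt + offset) - offset == dt. MonthEnd and the 1950-2050 bounds are my own
# illustrative choices, not values taken from the file.
from datetime import datetime

from hypothesis import given, strategies as st
from pandas.tseries.offsets import MonthEnd

@given(st.datetimes(min_value=datetime(1950, 1, 1), max_value=datetime(2050, 1, 1)))
def test_monthend_roundtrip_matches_is_on_offset(dt):
    offset = MonthEnd()
    # the fast class-specific check must match the generic round-trip definition
    assert offset.is_on_offset(dt) == ((dt + offset) - offset == dt)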
+""" +Behavioral based tests for offsets and date_range. + +This file is adapted from https://github.com/pandas-dev/pandas/pull/18761 - +which was more ambitious but less idiomatic in its use of Hypothesis. + +You may wish to consult the previous version for inspiration on further +tests, or when trying to pin down the bugs exposed by the tests below. +""" +import warnings + +from hypothesis import assume, given, strategies as st +from hypothesis.extra.dateutil import timezones as dateutil_timezones +from hypothesis.extra.pytz import timezones as pytz_timezones +import pytest + +import pandas as pd +from pandas import Timestamp + +from pandas.tseries.offsets import ( + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) + +# ---------------------------------------------------------------- +# Helpers for generating random data + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + min_dt = Timestamp(1900, 1, 1).to_pydatetime() + max_dt = Timestamp(1900, 1, 1).to_pydatetime() + +gen_date_range = st.builds( + pd.date_range, + start=st.datetimes( + # TODO: Choose the min/max values more systematically + min_value=Timestamp(1900, 1, 1).to_pydatetime(), + max_value=Timestamp(2100, 1, 1).to_pydatetime(), + ), + periods=st.integers(min_value=2, max_value=100), + freq=st.sampled_from("Y Q M D H T s ms us ns".split()), + tz=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), +) + +gen_random_datetime = st.datetimes( + min_value=min_dt, + max_value=max_dt, + timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), +) + +# The strategy for each type is registered in conftest.py, as they don't carry +# enough runtime information (e.g. type hints) to infer how to build them. +gen_yqm_offset = st.one_of( + *map( + st.from_type, + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], + ) +) + + +# ---------------------------------------------------------------- +# Offset-specific behaviour tests + + +# Based on CI runs: Always passes on OSX, fails on Linux, sometimes on Windows +@pytest.mark.xfail(strict=False, reason="inconsistent between OSs, Pythons") +@given(gen_random_datetime, gen_yqm_offset) +def test_on_offset_implementations(dt, offset): + assume(not offset.normalize) + # check that the class-specific implementations of is_on_offset match + # the general case definition: + # (dt + offset) - offset == dt + compare = (dt + offset) - offset + assert offset.is_on_offset(dt) == (compare == dt) + + +@pytest.mark.xfail( + reason="res_v2 below is incorrect, needs to use the " + "commented-out version with tz_localize. " + "But with that fix in place, hypothesis then " + "has errors in timezone generation." 
+) +@given(gen_yqm_offset, gen_date_range) +def test_apply_index_implementations(offset, rng): + # offset.apply_index(dti)[i] should match dti[i] + offset + assume(offset.n != 0) # TODO: test for that case separately + + # rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') + ser = pd.Series(rng) + + res = rng + offset + res_v2 = offset.apply_index(rng) + # res_v2 = offset.apply_index(rng.tz_localize(None)).tz_localize(rng.tz) + assert (res == res_v2).all() + + assert res[0] == rng[0] + offset + assert res[-1] == rng[-1] + offset + res2 = ser + offset + # apply_index is only for indexes, not series, so no res2_v2 + assert res2.iloc[0] == ser.iloc[0] + offset + assert res2.iloc[-1] == ser.iloc[-1] + offset + # TODO: Check randomly assorted entries, not just first/last + + +@pytest.mark.xfail # TODO: reason? +@given(gen_yqm_offset) +def test_shift_across_dst(offset): + # GH#18319 check that 1) timezone is correctly normalized and + # 2) that hour is not incorrectly changed by this normalization + # Note that dti includes a transition across DST boundary + dti = pd.date_range( + start="2017-10-30 12:00:00", end="2017-11-06", freq="D", tz="US/Eastern" + ) + assert (dti.hour == 12).all() # we haven't screwed up yet + + res = dti + offset + assert (res.hour == 12).all() diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_ticks.py b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_ticks.py new file mode 100644 index 0000000..297e5c3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_ticks.py @@ -0,0 +1,322 @@ +""" +Tests for offsets.Tick and subclasses +""" +from datetime import datetime, timedelta + +from hypothesis import assume, example, given, settings, strategies as st +import numpy as np +import pytest + +from pandas import Timedelta, Timestamp +import pandas._testing as tm + +from pandas.tseries import offsets +from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second + +from .common import assert_offset_equal + +# --------------------------------------------------------------------- +# Test Helpers + +tick_classes = [Hour, Minute, Second, Milli, Micro, Nano] + + +# --------------------------------------------------------------------- + + +def test_apply_ticks(): + result = offsets.Hour(3).apply(offsets.Hour(4)) + exp = offsets.Hour(7) + assert result == exp + + +def test_delta_to_tick(): + delta = timedelta(3) + + tick = offsets._delta_to_tick(delta) + assert tick == offsets.Day(3) + + td = Timedelta(nanoseconds=5) + tick = offsets._delta_to_tick(td) + assert tick == Nano(5) + + +@pytest.mark.parametrize("cls", tick_classes) +@settings(deadline=None) # GH 24641 +@example(n=2, m=3) +@example(n=800, m=300) +@example(n=1000, m=5) +@given(n=st.integers(-999, 999), m=st.integers(-999, 999)) +def test_tick_add_sub(cls, n, m): + # For all Tick subclasses and all integers n, m, we should have + # tick(n) + tick(m) == tick(n+m) + # tick(n) - tick(m) == tick(n-m) + left = cls(n) + right = cls(m) + expected = cls(n + m) + + assert left + right == expected + assert left.apply(right) == expected + + expected = cls(n - m) + assert left - right == expected + + +@pytest.mark.parametrize("cls", tick_classes) +@settings(deadline=None) +@example(n=2, m=3) +@given(n=st.integers(-999, 999), m=st.integers(-999, 999)) +def test_tick_equality(cls, n, m): + assume(m != n) + # tick == tock iff tick.n == tock.n + left = cls(n) + right = cls(m) + assert left != right + assert not (left == right) + + right = cls(n) + assert left == 
right + assert not (left != right) + + if n != 0: + assert cls(n) != cls(-n) + + +# --------------------------------------------------------------------- + + +def test_Hour(): + assert_offset_equal(Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) + assert_offset_equal(Hour(-1), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) + assert_offset_equal(-1 * Hour(), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + + assert Hour(3) + Hour(2) == Hour(5) + assert Hour(3) - Hour(2) == Hour() + + assert Hour(4) != Hour(1) + + +def test_Minute(): + assert_offset_equal(Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) + assert_offset_equal(Minute(-1), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) + assert_offset_equal(-1 * Minute(), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + + assert Minute(3) + Minute(2) == Minute(5) + assert Minute(3) - Minute(2) == Minute() + assert Minute(5) != Minute() + + +def test_Second(): + assert_offset_equal(Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 1)) + assert_offset_equal(Second(-1), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal( + 2 * Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 2) + ) + assert_offset_equal( + -1 * Second(), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1) + ) + + assert Second(3) + Second(2) == Second(5) + assert Second(3) - Second(2) == Second() + + +def test_Millisecond(): + assert_offset_equal( + Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1000) + ) + assert_offset_equal( + Milli(-1), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) + assert_offset_equal( + Milli(2), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + 2 * Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + -1 * Milli(), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) + + assert Milli(3) + Milli(2) == Milli(5) + assert Milli(3) - Milli(2) == Milli() + + +def test_MillisecondTimestampArithmetic(): + assert_offset_equal( + Milli(), Timestamp("2010-01-01"), Timestamp("2010-01-01 00:00:00.001") + ) + assert_offset_equal( + Milli(-1), Timestamp("2010-01-01 00:00:00.001"), Timestamp("2010-01-01") + ) + + +def test_Microsecond(): + assert_offset_equal(Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1)) + assert_offset_equal( + Micro(-1), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) + + assert_offset_equal( + 2 * Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2) + ) + assert_offset_equal( + -1 * Micro(), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) + + assert Micro(3) + Micro(2) == Micro(5) + assert Micro(3) - Micro(2) == Micro() + + +def test_NanosecondGeneric(): + timestamp = Timestamp(datetime(2010, 1, 1)) + assert timestamp.nanosecond == 0 + + result = timestamp + Nano(10) + assert result.nanosecond == 10 + + reverse_result = Nano(10) + timestamp + assert reverse_result.nanosecond == 10 + + +def test_Nanosecond(): + timestamp = Timestamp(datetime(2010, 1, 1)) + assert_offset_equal(Nano(), timestamp, timestamp + np.timedelta64(1, "ns")) + assert_offset_equal(Nano(-1), timestamp + np.timedelta64(1, "ns"), timestamp) + assert_offset_equal(2 * Nano(), timestamp, timestamp + np.timedelta64(2, "ns")) + assert_offset_equal(-1 * Nano(), timestamp + np.timedelta64(1, 
"ns"), timestamp) + + assert Nano(3) + Nano(2) == Nano(5) + assert Nano(3) - Nano(2) == Nano() + + # GH9284 + assert Nano(1) + Nano(10) == Nano(11) + assert Nano(5) + Micro(1) == Nano(1005) + assert Micro(5) + Nano(1) == Nano(5001) + + +@pytest.mark.parametrize( + "kls, expected", + [ + (Hour, Timedelta(hours=5)), + (Minute, Timedelta(hours=2, minutes=3)), + (Second, Timedelta(hours=2, seconds=3)), + (Milli, Timedelta(hours=2, milliseconds=3)), + (Micro, Timedelta(hours=2, microseconds=3)), + (Nano, Timedelta(hours=2, nanoseconds=3)), + ], +) +def test_tick_addition(kls, expected): + offset = kls(3) + result = offset + Timedelta(hours=2) + assert isinstance(result, Timedelta) + assert result == expected + + +@pytest.mark.parametrize("cls", tick_classes) +def test_tick_division(cls): + off = cls(10) + + assert off / cls(5) == 2 + assert off / 2 == cls(5) + assert off / 2.0 == cls(5) + + assert off / off.delta == 1 + assert off / off.delta.to_timedelta64() == 1 + + assert off / Nano(1) == off.delta / Nano(1).delta + + if cls is not Nano: + # A case where we end up with a smaller class + result = off / 1000 + assert isinstance(result, offsets.Tick) + assert not isinstance(result, cls) + assert result.delta == off.delta / 1000 + + if cls._inc < Timedelta(seconds=1): + # Case where we end up with a bigger class + result = off / 0.001 + assert isinstance(result, offsets.Tick) + assert not isinstance(result, cls) + assert result.delta == off.delta / 0.001 + + +@pytest.mark.parametrize("cls", tick_classes) +def test_tick_rdiv(cls): + off = cls(10) + delta = off.delta + td64 = delta.to_timedelta64() + + with pytest.raises(TypeError): + 2 / off + with pytest.raises(TypeError): + 2.0 / off + + assert (td64 * 2.5) / off == 2.5 + + if cls is not Nano: + # skip pytimedelta for Nano since it gets dropped + assert (delta.to_pytimedelta() * 2) / off == 2 + + result = np.array([2 * td64, td64]) / off + expected = np.array([2.0, 1.0]) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("cls1", tick_classes) +@pytest.mark.parametrize("cls2", tick_classes) +def test_tick_zero(cls1, cls2): + assert cls1(0) == cls2(0) + assert cls1(0) + cls2(0) == cls1(0) + + if cls1 is not Nano: + assert cls1(2) + cls2(0) == cls1(2) + + if cls1 is Nano: + assert cls1(2) + Nano(0) == cls1(2) + + +@pytest.mark.parametrize("cls", tick_classes) +def test_tick_equalities(cls): + assert cls() == cls(1) + + +@pytest.mark.parametrize("cls", tick_classes) +def test_tick_offset(cls): + assert not cls().is_anchored() + + +@pytest.mark.parametrize("cls", tick_classes) +def test_compare_ticks(cls): + three = cls(3) + four = cls(4) + + assert three < cls(4) + assert cls(3) < four + assert four > cls(3) + assert cls(4) > three + assert cls(3) == cls(3) + assert cls(3) != cls(4) + + +@pytest.mark.parametrize("cls", tick_classes) +def test_compare_ticks_to_strs(cls): + # GH#23524 + off = cls(19) + + # These tests should work with any strings, but we particularly are + # interested in "infer" as that comparison is convenient to make in + # Datetime/Timedelta Array/Index constructors + assert not off == "infer" + assert not "foo" == off + + for left, right in [("infer", off), (off, "infer")]: + with pytest.raises(TypeError): + left < right + with pytest.raises(TypeError): + left <= right + with pytest.raises(TypeError): + left > right + with pytest.raises(TypeError): + left >= right diff --git a/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_yqm_offsets.py 
b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_yqm_offsets.py new file mode 100644 index 0000000..79a0e0f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -0,0 +1,1464 @@ +""" +Tests for Year, Quarter, and Month-based DateOffset subclasses +""" +from datetime import datetime + +import pytest + +import pandas as pd +from pandas import Timestamp + +from pandas.tseries.offsets import ( + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) + +from .common import assert_is_on_offset, assert_offset_equal +from .test_offsets import Base + +# -------------------------------------------------------------------- +# Misc + + +def test_quarterly_dont_normalize(): + date = datetime(2012, 3, 31, 5, 30) + + offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) + + for klass in offsets: + result = date + klass() + assert result.time() == date.time() + + +@pytest.mark.parametrize("n", [-2, 1]) +@pytest.mark.parametrize( + "cls", + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], +) +def test_apply_index(cls, n): + offset = cls(n=n) + rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") + ser = pd.Series(rng) + + res = rng + offset + res_v2 = offset.apply_index(rng) + assert (res == res_v2).all() + assert res[0] == rng[0] + offset + assert res[-1] == rng[-1] + offset + res2 = ser + offset + # apply_index is only for indexes, not series, so no res2_v2 + assert res2.iloc[0] == ser.iloc[0] + offset + assert res2.iloc[-1] == ser.iloc[-1] + offset + + +@pytest.mark.parametrize( + "offset", [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()] +) +def test_on_offset(offset): + dates = [ + datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31) + ] + for date in dates: + res = offset.is_on_offset(date) + slow_version = date == (date + offset) - offset + assert res == slow_version + + +# -------------------------------------------------------------------- +# Months + + +class TestMonthBegin(Base): + _offset = MonthBegin + + offset_cases = [] + # NOTE: I'm not entirely happy with the logic here for Begin -ss + # see thread 'offset conventions' on the ML + offset_cases.append( + ( + MonthBegin(), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(0), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(2), + { + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 
29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + +class TestMonthEnd(Base): + _offset = MonthEnd + + def test_day_of_month(self): + dt = datetime(2007, 1, 1) + offset = MonthEnd() + + result = dt + offset + assert result == Timestamp(2007, 1, 31) + + result = result + offset + assert result == Timestamp(2007, 2, 28) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + MonthEnd(normalize=True) + expected = dt.replace(hour=0) + MonthEnd() + assert result == expected + + offset_cases = [] + offset_cases.append( + ( + MonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBMonthBegin(Base): + _offset = BMonthBegin + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthBegin() + offset2 = BMonthBegin() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append( + ( + BMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2), + }, + ) + ) + + 
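# [Editor's note] Spot-check, not part of the diff: the BMonthBegin(0) cases
# above encode "roll forward only if needed" -- a date already on the first
# business day of its month is left unchanged, anything else snaps forward.
# Both values mirror entries from the dict above.
from datetime import datetime
from pandas.tseries.offsets import BMonthBegin

assert datetime(2008, 1, 1) + BMonthBegin(0) == datetime(2008, 1, 1)    # already on offset
assert datetime(2008, 1, 31) + BMonthBegin(0) == datetime(2008, 2, 1)   # rolled to next business month start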
offset_cases.append( + ( + BMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBMonthEnd(Base): + _offset = BMonthEnd + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + BMonthEnd(normalize=True) + expected = dt.replace(hour=0) + BMonthEnd() + assert result == expected + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthEnd() + offset2 = BMonthEnd() + assert not offset1 != offset2 + + offset_cases = [] + offset_cases.append( + ( + BMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + 
assert_is_on_offset(offset, dt, expected) + + +# -------------------------------------------------------------------- +# Quarters + + +class TestQuarterBegin(Base): + def test_repr(self): + expected = "" + assert repr(QuarterBegin()) == expected + expected = "" + assert repr(QuarterBegin(startingMonth=3)) == expected + expected = "" + assert repr(QuarterBegin(startingMonth=1)) == expected + + def test_is_anchored(self): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() + + def test_offset_corner_case(self): + # corner + offset = QuarterBegin(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) + + offset_cases = [] + offset_cases.append( + ( + QuarterBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + +class TestQuarterEnd(Base): + _offset = QuarterEnd + + def test_repr(self): + expected = "" + assert repr(QuarterEnd()) == expected + expected = "" + assert repr(QuarterEnd(startingMonth=3)) == expected + expected = "" + assert repr(QuarterEnd(startingMonth=1)) == 
expected + + def test_is_anchored(self): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() + + def test_offset_corner_case(self): + # corner + offset = QuarterEnd(n=-1, startingMonth=1) + assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) + + offset_cases = [] + offset_cases.append( + ( + QuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), 
False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBQuarterBegin(Base): + _offset = BQuarterBegin + + def test_repr(self): + expected = "" + assert repr(BQuarterBegin()) == expected + expected = "" + assert repr(BQuarterBegin(startingMonth=3)) == expected + expected = "" + assert repr(BQuarterBegin(startingMonth=1)) == expected + + def test_is_anchored(self): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() + + def test_offset_corner_case(self): + # corner + offset = BQuarterBegin(n=-1, startingMonth=1) + assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2) + + offset_cases = [] + offset_cases.append( + ( + BQuarterBegin(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + 
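# [Editor's note] Spot-check, not part of the diff: BQuarterBegin advances to
# the first *business* day of the quarter's starting month, which is why the
# startingMonth=1 cases above land on 2007-04-02 (April 1, 2007 is a Sunday)
# but on 2008-04-01 (a Tuesday). Both values mirror entries from the dict above.
from datetime import datetime
from pandas.tseries.offsets import BQuarterBegin

assert datetime(2008, 1, 1) + BQuarterBegin(startingMonth=1) == datetime(2008, 4, 1)
assert datetime(2007, 1, 1) + BQuarterBegin(startingMonth=1) == datetime(2007, 4, 2)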
offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + +class TestBQuarterEnd(Base): + _offset = BQuarterEnd + + def test_repr(self): + expected = "" + assert repr(BQuarterEnd()) == expected + expected = "" + assert repr(BQuarterEnd(startingMonth=3)) == expected + expected = "" + assert repr(BQuarterEnd(startingMonth=1)) == expected + + def test_is_anchored(self): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() + + def test_offset_corner_case(self): + # corner + offset = BQuarterEnd(n=-1, startingMonth=1) + assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29) + + offset_cases = [] + offset_cases.append( + ( + BQuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): 
datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +# -------------------------------------------------------------------- +# Years + + +class TestYearBegin(Base): + _offset = YearBegin + + def 
test_misspecified(self): + with pytest.raises(ValueError, match="Month must go from 1 to 12"): + YearBegin(month=13) + + offset_cases = [] + offset_cases.append( + ( + YearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(3), + { + datetime(2008, 1, 1): datetime(2011, 1, 1), + datetime(2008, 6, 30): datetime(2011, 1, 1), + datetime(2008, 12, 31): datetime(2011, 1, 1), + datetime(2005, 12, 30): datetime(2008, 1, 1), + datetime(2005, 12, 31): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(month=4), + { + datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0, month=4), + { + datetime(2007, 4, 1): datetime(2007, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(4, month=4), + { + datetime(2007, 4, 1): datetime(2011, 4, 1), + datetime(2007, 4, 15): datetime(2011, 4, 1), + datetime(2007, 3, 1): datetime(2010, 4, 1), + datetime(2007, 12, 15): datetime(2011, 4, 1), + datetime(2012, 1, 31): datetime(2015, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1, month=4), + { + datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-3, month=4), + { + datetime(2007, 4, 1): datetime(2004, 4, 1), + datetime(2007, 3, 1): datetime(2004, 4, 1), + datetime(2007, 12, 15): datetime(2005, 4, 1), + datetime(2012, 1, 31): datetime(2009, 4, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + 
assert_is_on_offset(offset, dt, expected) + + +class TestYearEnd(Base): + _offset = YearEnd + + def test_misspecified(self): + with pytest.raises(ValueError, match="Month must go from 1 to 12"): + YearEnd(month=13) + + offset_cases = [] + offset_cases.append( + ( + YearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestYearEndDiffMonth(Base): + offset_cases = [] + offset_cases.append( + ( + YearEnd(month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0, month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-1, month=3), + { + datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-2, month=3), + { + datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), 
datetime(2006, 3, 29), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBYearBegin(Base): + _offset = BYearBegin + + def test_misspecified(self): + msg = "Month must go from 1 to 12" + with pytest.raises(ValueError, match=msg): + BYearBegin(month=13) + with pytest.raises(ValueError, match=msg): + BYearEnd(month=13) + + offset_cases = [] + offset_cases.append( + ( + BYearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 3), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + +class TestBYearEnd(Base): + _offset = BYearEnd + + offset_cases = [] + offset_cases.append( + ( + BYearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + on_offset_cases = [ + (BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), 
False), + (BYearEnd(), datetime(2006, 12, 29), True), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) + + +class TestBYearEndLagged(Base): + _offset = BYearEnd + + def test_bad_month_fail(self): + msg = "Month must go from 1 to 12" + with pytest.raises(ValueError, match=msg): + BYearEnd(month=13) + with pytest.raises(ValueError, match=msg): + BYearEnd(month=0) + + offset_cases = [] + offset_cases.append( + ( + BYearEnd(month=6), + { + datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(n=-1, month=6), + { + datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) + def test_offset(self, case): + offset, cases = case + for base, expected in cases.items(): + assert_offset_equal(offset, base, expected) + + def test_roll(self): + offset = BYearEnd(month=6) + date = datetime(2009, 11, 30) + + assert offset.rollforward(date) == datetime(2010, 6, 30) + assert offset.rollback(date) == datetime(2009, 6, 30) + + on_offset_cases = [ + (BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False), + ] + + @pytest.mark.parametrize("case", on_offset_cases) + def test_is_on_offset(self, case): + offset, dt, expected = case + assert_is_on_offset(offset, dt, expected) diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/__init__.py b/venv/Lib/site-packages/pandas/tests/tslibs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_api.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_api.py new file mode 100644 index 0000000..7a8a6d5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_api.py @@ -0,0 +1,47 @@ +"""Tests that the tslibs API is locked down""" + +from pandas._libs import tslibs + + +def test_namespace(): + + submodules = [ + "c_timestamp", + "ccalendar", + "conversion", + "fields", + "frequencies", + "nattype", + "np_datetime", + "offsets", + "parsing", + "period", + "resolution", + "strptime", + "timedeltas", + "timestamps", + "timezones", + "tzconversion", + ] + + api = [ + "NaT", + "NaTType", + "iNaT", + "is_null_datetimelike", + "NullFrequencyError", + "OutOfBoundsDatetime", + "Period", + "IncompatibleFrequency", + "Timedelta", + "Timestamp", + "delta_to_nanoseconds", + "ints_to_pytimedelta", + "localize_pydatetime", + "normalize_date", + "tz_convert_single", + ] + + expected = set(submodules + api) + names = [x for x in dir(tslibs) if not x.startswith("__")] + assert set(names) == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_array_to_datetime.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_array_to_datetime.py new file mode 100644 index 0000000..a40fcd7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_array_to_datetime.py @@ -0,0 +1,197 @@ +from datetime import date, datetime + +from dateutil.tz.tz import tzoffset +import numpy as np +import pytest +import pytz + +from pandas._libs import iNaT, tslib +from pandas.compat.numpy import np_array_datetime64_compat + +from pandas import Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + ["01-01-2013", "01-02-2013"], + [ + "2013-01-01T00:00:00.000000000-0000", + "2013-01-02T00:00:00.000000000-0000", + ], + ), + ( + ["Mon 
Sep 16 2013", "Tue Sep 17 2013"], + [ + "2013-09-16T00:00:00.000000000-0000", + "2013-09-17T00:00:00.000000000-0000", + ], + ), + ], +) +def test_parsing_valid_dates(data, expected): + arr = np.array(data, dtype=object) + result, _ = tslib.array_to_datetime(arr) + + expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "dt_string, expected_tz", + [ + ["01-01-2013 08:00:00+08:00", 480], + ["2013-01-01T08:00:00.000000000+0800", 480], + ["2012-12-31T16:00:00.000000000-0800", -480], + ["12-31-2012 23:00:00-01:00", -60], + ], +) +def test_parsing_timezone_offsets(dt_string, expected_tz): + # All of these datetime strings with offsets are equivalent + # to the same datetime after the timezone offset is added. + arr = np.array(["01-01-2013 00:00:00"], dtype=object) + expected, _ = tslib.array_to_datetime(arr) + + arr = np.array([dt_string], dtype=object) + result, result_tz = tslib.array_to_datetime(arr) + + tm.assert_numpy_array_equal(result, expected) + assert result_tz is pytz.FixedOffset(expected_tz) + + +def test_parsing_non_iso_timezone_offset(): + dt_string = "01-01-2013T00:00:00.000000000+0000" + arr = np.array([dt_string], dtype=object) + + result, result_tz = tslib.array_to_datetime(arr) + expected = np.array([np.datetime64("2013-01-01 00:00:00.000000000")]) + + tm.assert_numpy_array_equal(result, expected) + assert result_tz is pytz.FixedOffset(0) + + +def test_parsing_different_timezone_offsets(): + # see gh-17697 + data = ["2015-11-18 15:30:00+05:30", "2015-11-18 15:30:00+06:30"] + data = np.array(data, dtype=object) + + result, result_tz = tslib.array_to_datetime(data) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 23400)), + ], + dtype=object, + ) + + tm.assert_numpy_array_equal(result, expected) + assert result_tz is None + + +@pytest.mark.parametrize( + "data", [["-352.737091", "183.575577"], ["1", "2", "3", "4", "5"]] +) +def test_number_looking_strings_not_into_datetime(data): + # see gh-4601 + # + # These strings don't look like datetimes, so + # they shouldn't be attempted to be converted. + arr = np.array(data, dtype=object) + result, _ = tslib.array_to_datetime(arr, errors="ignore") + + tm.assert_numpy_array_equal(result, arr) + + +@pytest.mark.parametrize( + "invalid_date", + [ + date(1000, 1, 1), + datetime(1000, 1, 1), + "1000-01-01", + "Jan 1, 1000", + np.datetime64("1000-01-01"), + ], +) +@pytest.mark.parametrize("errors", ["coerce", "raise"]) +def test_coerce_outside_ns_bounds(invalid_date, errors): + arr = np.array([invalid_date], dtype="object") + kwargs = dict(values=arr, errors=errors) + + if errors == "raise": + msg = "Out of bounds nanosecond timestamp" + + with pytest.raises(ValueError, match=msg): + tslib.array_to_datetime(**kwargs) + else: # coerce. 
+ result, _ = tslib.array_to_datetime(**kwargs) + expected = np.array([iNaT], dtype="M8[ns]") + + tm.assert_numpy_array_equal(result, expected) + + +def test_coerce_outside_ns_bounds_one_valid(): + arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) + result, _ = tslib.array_to_datetime(arr, errors="coerce") + + expected = [iNaT, "2000-01-01T00:00:00.000000000-0000"] + expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("errors", ["ignore", "coerce"]) +def test_coerce_of_invalid_datetimes(errors): + arr = np.array(["01-01-2013", "not_a_date", "1"], dtype=object) + kwargs = dict(values=arr, errors=errors) + + if errors == "ignore": + # Without coercing, the presence of any invalid + # dates prevents any values from being converted. + result, _ = tslib.array_to_datetime(**kwargs) + tm.assert_numpy_array_equal(result, arr) + else: # coerce. + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000-0000", iNaT, iNaT] + + tm.assert_numpy_array_equal( + result, np_array_datetime64_compat(expected, dtype="M8[ns]") + ) + + +def test_to_datetime_barely_out_of_bounds(): + # see gh-19382, gh-19529 + # + # Close enough to bounds that dropping nanos + # would result in an in-bounds datetime. + arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) + msg = "Out of bounds nanosecond timestamp: 2262-04-11 23:47:16" + + with pytest.raises(tslib.OutOfBoundsDatetime, match=msg): + tslib.array_to_datetime(arr) + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "data,expected", + [ + ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ], +) +def test_datetime_subclass(data, expected): + # GH 25851 + # ensure that subclassed datetime works with + # array_to_datetime + + arr = np.array(data, dtype=object) + result, _ = tslib.array_to_datetime(arr) + + expected = np_array_datetime64_compat(expected, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_ccalendar.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_ccalendar.py new file mode 100644 index 0000000..6f6e324 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_ccalendar.py @@ -0,0 +1,27 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import ccalendar + + +@pytest.mark.parametrize( + "date_tuple,expected", + [ + ((2001, 3, 1), 60), + ((2004, 3, 1), 61), + ((1907, 12, 31), 365), # End-of-year, non-leap year. + ((2004, 12, 31), 366), # End-of-year, leap year. 
+ ], +) +def test_get_day_of_year_numeric(date_tuple, expected): + assert ccalendar.get_day_of_year(*date_tuple) == expected + + +def test_get_day_of_year_dt(): + dt = datetime.fromordinal(1 + np.random.randint(365 * 4000)) + result = ccalendar.get_day_of_year(dt.year, dt.month, dt.day) + + expected = (dt - dt.replace(month=1, day=1)).days + 1 + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_conversion.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_conversion.py new file mode 100644 index 0000000..2beeae8 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_conversion.py @@ -0,0 +1,100 @@ +from datetime import datetime + +import numpy as np +import pytest +from pytz import UTC + +from pandas._libs.tslib import iNaT +from pandas._libs.tslibs import conversion, timezones, tzconversion + +from pandas import Timestamp, date_range +import pandas._testing as tm + + +def _compare_utc_to_local(tz_didx): + def f(x): + return conversion.tz_convert_single(x, UTC, tz_didx.tz) + + result = tzconversion.tz_convert(tz_didx.asi8, UTC, tz_didx.tz) + expected = np.vectorize(f)(tz_didx.asi8) + + tm.assert_numpy_array_equal(result, expected) + + +def _compare_local_to_utc(tz_didx, utc_didx): + def f(x): + return conversion.tz_convert_single(x, tz_didx.tz, UTC) + + result = tzconversion.tz_convert(utc_didx.asi8, tz_didx.tz, UTC) + expected = np.vectorize(f)(utc_didx.asi8) + + tm.assert_numpy_array_equal(result, expected) + + +def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture): + tz = tz_aware_fixture + tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz) + utc_didx = date_range("2014-03-01", "2015-01-10", freq="H") + + _compare_utc_to_local(tz_didx) + _compare_local_to_utc(tz_didx, utc_didx) + + +@pytest.mark.parametrize("freq", ["D", "A"]) +def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): + tz = tz_aware_fixture + tz_didx = date_range("2000-01-01", "2020-01-01", freq=freq, tz=tz) + utc_didx = date_range("2000-01-01", "2020-01-01", freq=freq) + + _compare_utc_to_local(tz_didx) + _compare_local_to_utc(tz_didx, utc_didx) + + +@pytest.mark.parametrize( + "arr", + [ + pytest.param(np.array([], dtype=np.int64), id="empty"), + pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat"), + ], +) +def test_tz_convert_corner(arr): + result = tzconversion.tz_convert( + arr, timezones.maybe_get_tz("US/Eastern"), timezones.maybe_get_tz("Asia/Tokyo") + ) + tm.assert_numpy_array_equal(result, arr) + + +@pytest.mark.parametrize("copy", [True, False]) +@pytest.mark.parametrize("dtype", ["M8[ns]", "M8[s]"]) +def test_length_zero_copy(dtype, copy): + arr = np.array([], dtype=dtype) + result = conversion.ensure_datetime64ns(arr, copy=copy) + assert result.base is (None if copy else arr) + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "dt, expected", + [ + pytest.param( + Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + ), + pytest.param( + datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + ), + pytest.param( + SubDatetime(2000, 1, 1), + SubDatetime(2000, 1, 1, tzinfo=UTC), + id="subclassed_datetime", + ), + ], +) +def test_localize_pydatetime_dt_types(dt, expected): + # GH 25851 + # ensure that subclassed datetime works with + # localize_pydatetime + result = conversion.localize_pydatetime(dt, UTC) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_fields.py 
b/venv/Lib/site-packages/pandas/tests/tslibs/test_fields.py new file mode 100644 index 0000000..943f420 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_fields.py @@ -0,0 +1,31 @@ +import numpy as np + +from pandas._libs.tslibs import fields + +import pandas._testing as tm + + +def test_fields_readonly(): + # https://github.com/vaexio/vaex/issues/357 + # fields functions should't raise when we pass read-only data + dtindex = np.arange(5, dtype=np.int64) * 10 ** 9 * 3600 * 24 * 32 + dtindex.flags.writeable = False + + result = fields.get_date_name_field(dtindex, "month_name") + expected = np.array( + ["January", "February", "March", "April", "May"], dtype=np.object + ) + tm.assert_numpy_array_equal(result, expected) + + result = fields.get_date_field(dtindex, "Y") + expected = np.array([1970, 1970, 1970, 1970, 1970], dtype=np.int32) + tm.assert_numpy_array_equal(result, expected) + + result = fields.get_start_end_field(dtindex, "is_month_start", None) + expected = np.array([True, False, False, False, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + # treat dtindex as timedeltas for this next one + result = fields.get_timedelta_field(dtindex, "days") + expected = np.arange(5, dtype=np.int32) * 32 + tm.assert_numpy_array_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_libfrequencies.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_libfrequencies.py new file mode 100644 index 0000000..5810c7e --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_libfrequencies.py @@ -0,0 +1,104 @@ +import pytest + +from pandas._libs.tslibs.frequencies import ( + INVALID_FREQ_ERR_MSG, + _period_str_to_code, + get_rule_month, + is_subperiod, + is_superperiod, +) + +from pandas.tseries import offsets + + +@pytest.mark.parametrize( + "obj,expected", + [ + ("W", "DEC"), + (offsets.Week(), "DEC"), + ("D", "DEC"), + (offsets.Day(), "DEC"), + ("Q", "DEC"), + (offsets.QuarterEnd(startingMonth=12), "DEC"), + ("Q-JAN", "JAN"), + (offsets.QuarterEnd(startingMonth=1), "JAN"), + ("A-DEC", "DEC"), + ("Y-DEC", "DEC"), + (offsets.YearEnd(), "DEC"), + ("A-MAY", "MAY"), + ("Y-MAY", "MAY"), + (offsets.YearEnd(month=5), "MAY"), + ], +) +def test_get_rule_month(obj, expected): + result = get_rule_month(obj) + assert result == expected + + +@pytest.mark.parametrize( + "obj,expected", + [ + ("A", 1000), + ("A-DEC", 1000), + ("A-JAN", 1001), + ("Y", 1000), + ("Y-DEC", 1000), + ("Y-JAN", 1001), + ("Q", 2000), + ("Q-DEC", 2000), + ("Q-FEB", 2002), + ("W", 4000), + ("W-SUN", 4000), + ("W-FRI", 4005), + ("Min", 8000), + ("ms", 10000), + ("US", 11000), + ("NS", 12000), + ], +) +def test_period_str_to_code(obj, expected): + assert _period_str_to_code(obj) == expected + + +@pytest.mark.parametrize( + "p1,p2,expected", + [ + # Input validation. 
+ (offsets.MonthEnd(), None, False), + (offsets.YearEnd(), None, False), + (None, offsets.YearEnd(), False), + (None, offsets.MonthEnd(), False), + (None, None, False), + (offsets.YearEnd(), offsets.MonthEnd(), True), + (offsets.Hour(), offsets.Minute(), True), + (offsets.Second(), offsets.Milli(), True), + (offsets.Milli(), offsets.Micro(), True), + (offsets.Micro(), offsets.Nano(), True), + ], +) +def test_super_sub_symmetry(p1, p2, expected): + assert is_superperiod(p1, p2) is expected + assert is_subperiod(p2, p1) is expected + + +@pytest.mark.parametrize( + "freq,expected,aliases", + [ + ("D", 6000, ["DAY", "DLY", "DAILY"]), + ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), + ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), + ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), + ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), + ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]), + ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), + ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), + ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), + ], +) +def test_assert_aliases_deprecated(freq, expected, aliases): + assert isinstance(aliases, list) + assert _period_str_to_code(freq) == expected + + for alias in aliases: + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + _period_str_to_code(alias) diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_liboffsets.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_liboffsets.py new file mode 100644 index 0000000..6ff2ae6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_liboffsets.py @@ -0,0 +1,169 @@ +""" +Tests for helper functions in the cython tslibs.offsets +""" +from datetime import datetime + +import pytest + +import pandas._libs.tslibs.offsets as liboffsets +from pandas._libs.tslibs.offsets import roll_qtrday + +from pandas import Timestamp + + +@pytest.fixture(params=["start", "end", "business_start", "business_end"]) +def day_opt(request): + return request.param + + +@pytest.mark.parametrize( + "dt,exp_week_day,exp_last_day", + [ + (datetime(2017, 11, 30), 3, 30), # Business day. + (datetime(1993, 10, 31), 6, 29), # Non-business day. + ], +) +def test_get_last_bday(dt, exp_week_day, exp_last_day): + assert dt.weekday() == exp_week_day + assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day + + +@pytest.mark.parametrize( + "dt,exp_week_day,exp_first_day", + [ + (datetime(2017, 4, 1), 5, 3), # Non-weekday. + (datetime(1993, 10, 1), 4, 1), # Business day. 
+ ], +) +def test_get_first_bday(dt, exp_week_day, exp_first_day): + assert dt.weekday() == exp_week_day + assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day + + +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (0, 15, datetime(2017, 11, 15)), + (0, None, datetime(2017, 11, 30)), + (1, "start", datetime(2017, 12, 1)), + (-145, "end", datetime(2005, 10, 31)), + (0, "business_end", datetime(2017, 11, 30)), + (0, "business_start", datetime(2017, 11, 1)), + ], +) +def test_shift_month_dt(months, day_opt, expected): + dt = datetime(2017, 11, 30) + assert liboffsets.shift_month(dt, months, day_opt=day_opt) == expected + + +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (1, "start", Timestamp("1929-06-01")), + (-3, "end", Timestamp("1929-02-28")), + (25, None, Timestamp("1931-06-5")), + (-1, 31, Timestamp("1929-04-30")), + ], +) +def test_shift_month_ts(months, day_opt, expected): + ts = Timestamp("1929-05-05") + assert liboffsets.shift_month(ts, months, day_opt=day_opt) == expected + + +def test_shift_month_error(): + dt = datetime(2017, 11, 15) + day_opt = "this should raise" + + with pytest.raises(ValueError, match=day_opt): + liboffsets.shift_month(dt, 3, day_opt=day_opt) + + +@pytest.mark.parametrize( + "other,expected", + [ + # Before March 1. + (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}), + # After March 1. + (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1}), + ], +) +@pytest.mark.parametrize("n", [2, -7, 0]) +def test_roll_yearday(other, expected, n): + month = 3 + day_opt = "start" # `other` will be compared to March 1. + + assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] + + +@pytest.mark.parametrize( + "other,expected", + [ + # Before June 30. + (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}), + # After June 30. + (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1}), + ], +) +@pytest.mark.parametrize("n", [5, -7, 0]) +def test_roll_yearday2(other, expected, n): + month = 6 + day_opt = "end" # `other` will be compared to June 30. + + assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] + + +def test_get_day_of_month_error(): + # get_day_of_month is not directly exposed. + # We test it via roll_yearday. + dt = datetime(2017, 11, 15) + day_opt = "foo" + + with pytest.raises(ValueError, match=day_opt): + # To hit the raising case we need month == dt.month and n > 0. + liboffsets.roll_yearday(dt, n=3, month=11, day_opt=day_opt) + + +@pytest.mark.parametrize( + "month", + [3, 5], # (other.month % 3) < (month % 3) # (other.month % 3) > (month % 3) +) +@pytest.mark.parametrize("n", [4, -3]) +def test_roll_qtr_day_not_mod_unequal(day_opt, month, n): + expected = {3: {-3: -2, 4: 4}, 5: {-3: -3, 4: 3}} + + other = Timestamp(2072, 10, 1, 6, 17, 18) # Saturday. + assert roll_qtrday(other, n, month, day_opt, modby=3) == expected[month][n] + + +@pytest.mark.parametrize( + "other,month,exp_dict", + [ + # Monday. + (datetime(1999, 5, 31), 2, {-1: {"start": 0, "business_start": 0}}), + # Saturday. + ( + Timestamp(2072, 10, 1, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1, "business_start": 1}}, + ), + # First business day. + ( + Timestamp(2072, 10, 3, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1}, -1: {"start": 0}}, + ), + ], +) +@pytest.mark.parametrize("n", [2, -1]) +def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt): + # All cases have (other.month % 3) == (month % 3). 
+ expected = exp_dict.get(n, {}).get(day_opt, n) + assert roll_qtrday(other, n, month, day_opt, modby=3) == expected + + +@pytest.mark.parametrize( + "n,expected", [(42, {29: 42, 1: 42, 31: 41}), (-4, {29: -4, 1: -3, 31: -4})] +) +@pytest.mark.parametrize("compare", [29, 1, 31]) +def test_roll_convention(n, expected, compare): + assert liboffsets.roll_convention(29, n, compare) == expected[compare] diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_normalize_date.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_normalize_date.py new file mode 100644 index 0000000..2a41836 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_normalize_date.py @@ -0,0 +1,41 @@ +"""Tests for functions from pandas._libs.tslibs""" + +from datetime import date, datetime + +import pytest + +from pandas._libs import tslibs +from pandas._libs.tslibs.timestamps import Timestamp + + +@pytest.mark.parametrize( + "value,expected", + [ + (date(2012, 9, 7), datetime(2012, 9, 7)), + (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)), + (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1)), + ], +) +def test_normalize_date(value, expected): + result = tslibs.normalize_date(value) + assert result == expected + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "dt, expected", + [ + (Timestamp(2000, 1, 1, 1), Timestamp(2000, 1, 1, 0)), + (datetime(2000, 1, 1, 1), datetime(2000, 1, 1, 0)), + (SubDatetime(2000, 1, 1, 1), SubDatetime(2000, 1, 1, 0)), + ], +) +def test_normalize_date_sub_types(dt, expected): + # GH 25851 + # ensure that subclassed datetime works with + # normalize_date + result = tslibs.normalize_date(dt) + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_parse_iso8601.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_parse_iso8601.py new file mode 100644 index 0000000..a58f227 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_parse_iso8601.py @@ -0,0 +1,72 @@ +from datetime import datetime + +import pytest + +from pandas._libs import tslib + + +@pytest.mark.parametrize( + "date_str, exp", + [ + ("2011-01-02", datetime(2011, 1, 2)), + ("2011-1-2", datetime(2011, 1, 2)), + ("2011-01", datetime(2011, 1, 1)), + ("2011-1", datetime(2011, 1, 1)), + ("2011 01 02", datetime(2011, 1, 2)), + ("2011.01.02", datetime(2011, 1, 2)), + ("2011/01/02", datetime(2011, 1, 2)), + ("2011\\01\\02", datetime(2011, 1, 2)), + ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)), + ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)), + ], +) +def test_parsers_iso8601(date_str, exp): + # see gh-12060 + # + # Test only the ISO parser - flexibility to + # different separators and leading zero's. + actual = tslib._test_parse_iso8601(date_str) + assert actual == exp + + +@pytest.mark.parametrize( + "date_str", + [ + "2011-01/02", + "2011=11=11", + "201401", + "201111", + "200101", + # Mixed separated and unseparated. + "2005-0101", + "200501-01", + "20010101 12:3456", + "20010101 1234:56", + # HHMMSS must have two digits in + # each component if unseparated. 
+ "20010101 1", + "20010101 123", + "20010101 12345", + "20010101 12345Z", + ], +) +def test_parsers_iso8601_invalid(date_str): + msg = 'Error parsing datetime string "{s}"'.format(s=date_str) + + with pytest.raises(ValueError, match=msg): + tslib._test_parse_iso8601(date_str) + + +def test_parsers_iso8601_invalid_offset_invalid(): + date_str = "2001-01-01 12-34-56" + msg = f'Timezone hours offset out of range in datetime string "{date_str}"' + + with pytest.raises(ValueError, match=msg): + tslib._test_parse_iso8601(date_str) + + +def test_parsers_iso8601_leading_space(): + # GH#25895 make sure isoparser doesn't overflow with long input + date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) + actual = tslib._test_parse_iso8601(" " * 200 + date_str) + assert actual == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_parsing.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_parsing.py new file mode 100644 index 0000000..36f7ada --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_parsing.py @@ -0,0 +1,227 @@ +""" +Tests for Timestamp parsing, aimed at pandas/_libs/tslibs/parsing.pyx +""" +from datetime import datetime + +from dateutil.parser import parse +import numpy as np +import pytest + +from pandas._libs.tslibs import parsing +from pandas._libs.tslibs.parsing import parse_time_string +import pandas.util._test_decorators as td + +import pandas._testing as tm + + +def test_parse_time_string(): + (date, parsed, reso) = parse_time_string("4Q1984") + (date_lower, parsed_lower, reso_lower) = parse_time_string("4q1984") + + assert date == date_lower + assert reso == reso_lower + assert parsed == parsed_lower + + +def test_parse_time_string_invalid_type(): + # Raise on invalid input, don't just return it + with pytest.raises(TypeError): + parse_time_string((4, 5)) + + +@pytest.mark.parametrize( + "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")] +) +def test_parse_time_quarter_with_dash(dashed, normal): + # see gh-9688 + (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) + (date, parsed, reso) = parse_time_string(normal) + + assert date_dash == date + assert parsed_dash == parsed + assert reso_dash == reso + + +@pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"]) +def test_parse_time_quarter_with_dash_error(dashed): + msg = "Unknown datetime string format, unable to parse: {dashed}" + + with pytest.raises(parsing.DateParseError, match=msg.format(dashed=dashed)): + parse_time_string(dashed) + + +@pytest.mark.parametrize( + "date_string,expected", + [ + ("123.1234", False), + ("-50000", False), + ("999", False), + ("m", False), + ("T", False), + ("Mon Sep 16, 2013", True), + ("2012-01-01", True), + ("01/01/2012", True), + ("01012012", True), + ("0101", True), + ("1-1", True), + ], +) +def test_does_not_convert_mixed_integer(date_string, expected): + assert parsing._does_string_look_like_datetime(date_string) is expected + + +@pytest.mark.parametrize( + "date_str,kwargs,msg", + [ + ( + "2013Q5", + dict(), + ( + "Incorrect quarterly string is given, " + "quarter must be between 1 and 4: 2013Q5" + ), + ), + # see gh-5418 + ( + "2013Q1", + dict(freq="INVLD-L-DEC-SAT"), + ( + "Unable to retrieve month information " + "from given freq: INVLD-L-DEC-SAT" + ), + ), + ], +) +def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): + with pytest.raises(parsing.DateParseError, match=msg): + parsing.parse_time_string(date_str, **kwargs) + + +@pytest.mark.parametrize( + "date_str,freq,expected", + [ 
+ ("2013Q2", None, datetime(2013, 4, 1)), + ("2013Q2", "A-APR", datetime(2012, 8, 1)), + ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ], +) +def test_parsers_quarterly_with_freq(date_str, freq, expected): + result, _, _ = parsing.parse_time_string(date_str, freq=freq) + assert result == expected + + +@pytest.mark.parametrize( + "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] +) +def test_parsers_quarter_invalid(date_str): + if date_str == "6Q-20": + msg = ( + "Incorrect quarterly string is given, quarter " + "must be between 1 and 4: {date_str}" + ) + else: + msg = "Unknown datetime string format, unable to parse: {date_str}" + + with pytest.raises(ValueError, match=msg.format(date_str=date_str)): + parsing.parse_time_string(date_str) + + +@pytest.mark.parametrize( + "date_str,expected", + [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], +) +def test_parsers_month_freq(date_str, expected): + result, _, _ = parsing.parse_time_string(date_str, freq="M") + assert result == expected + + +@td.skip_if_not_us_locale +@pytest.mark.parametrize( + "string,fmt", + [ + ("20111230", "%Y%m%d"), + ("2011-12-30", "%Y-%m-%d"), + ("30-12-2011", "%d-%m-%Y"), + ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), + ], +) +def test_guess_datetime_format_with_parseable_formats(string, fmt): + result = parsing._guess_datetime_format(string) + assert result == fmt + + +@pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")]) +def test_guess_datetime_format_with_dayfirst(dayfirst, expected): + ambiguous_string = "01/01/2011" + result = parsing._guess_datetime_format(ambiguous_string, dayfirst=dayfirst) + assert result == expected + + +@td.skip_if_has_locale +@pytest.mark.parametrize( + "string,fmt", + [ + ("30/Dec/2011", "%d/%b/%Y"), + ("30/December/2011", "%d/%B/%Y"), + ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S"), + ], +) +def test_guess_datetime_format_with_locale_specific_formats(string, fmt): + result = parsing._guess_datetime_format(string) + assert result == fmt + + +@pytest.mark.parametrize( + "invalid_dt", + [ + "2013", + "01/2013", + "12:00:00", + "1/1/1/1", + "this_is_not_a_datetime", + "51a", + 9, + datetime(2011, 1, 1), + ], +) +def test_guess_datetime_format_invalid_inputs(invalid_dt): + # A datetime string must include a year, month and a day for it to be + # guessable, in addition to being a string that looks like a datetime. 
+ assert parsing._guess_datetime_format(invalid_dt) is None + + +@pytest.mark.parametrize( + "string,fmt", + [ + ("2011-1-1", "%Y-%m-%d"), + ("1/1/2011", "%m/%d/%Y"), + ("30-1-2011", "%d-%m-%Y"), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ], +) +def test_guess_datetime_format_no_padding(string, fmt): + # see gh-11142 + result = parsing._guess_datetime_format(string) + assert result == fmt + + +def test_try_parse_dates(): + arr = np.array(["5/1/2000", "6/1/2000", "7/1/2000"], dtype=object) + result = parsing.try_parse_dates(arr, dayfirst=True) + + expected = np.array([parse(d, dayfirst=True) for d in arr]) + tm.assert_numpy_array_equal(result, expected) + + +def test_parse_time_string_check_instance_type_raise_exception(): + # issue 20684 + with pytest.raises(TypeError): + parse_time_string((1, 2, 3)) + + result = parse_time_string("2019") + expected = (datetime(2019, 1, 1), datetime(2019, 1, 1), "year") + assert result == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_period_asfreq.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_period_asfreq.py new file mode 100644 index 0000000..5497cb6 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_period_asfreq.py @@ -0,0 +1,78 @@ +import pytest + +from pandas._libs.tslibs.frequencies import get_freq +from pandas._libs.tslibs.period import period_asfreq, period_ordinal + + +@pytest.mark.parametrize( + "freq1,freq2,expected", + [ + ("D", "H", 24), + ("D", "T", 1440), + ("D", "S", 86400), + ("D", "L", 86400000), + ("D", "U", 86400000000), + ("D", "N", 86400000000000), + ("H", "T", 60), + ("H", "S", 3600), + ("H", "L", 3600000), + ("H", "U", 3600000000), + ("H", "N", 3600000000000), + ("T", "S", 60), + ("T", "L", 60000), + ("T", "U", 60000000), + ("T", "N", 60000000000), + ("S", "L", 1000), + ("S", "U", 1000000), + ("S", "N", 1000000000), + ("L", "U", 1000), + ("L", "N", 1000000), + ("U", "N", 1000), + ], +) +def test_intra_day_conversion_factors(freq1, freq2, expected): + assert period_asfreq(1, get_freq(freq1), get_freq(freq2), False) == expected + + +@pytest.mark.parametrize( + "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] +) +def test_period_ordinal_start_values(freq, expected): + # information for Jan. 1, 1970. + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq(freq)) == expected + + +@pytest.mark.parametrize( + "dt,expected", + [ + ((1970, 1, 4, 0, 0, 0, 0, 0), 1), + ((1970, 1, 5, 0, 0, 0, 0, 0), 2), + ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), + ((2013, 10, 7, 0, 0, 0, 0, 0), 2285), + ], +) +def test_period_ordinal_week(dt, expected): + args = dt + (get_freq("W"),) + assert period_ordinal(*args) == expected + + +@pytest.mark.parametrize( + "day,expected", + [ + # Thursday (Oct. 3, 2013). + (3, 11415), + # Friday (Oct. 4, 2013). + (4, 11416), + # Saturday (Oct. 5, 2013). + (5, 11417), + # Sunday (Oct. 6, 2013). + (6, 11417), + # Monday (Oct. 7, 2013). + (7, 11417), + # Tuesday (Oct. 8, 2013). 
+ (8, 11418), + ], +) +def test_period_ordinal_business_day(day, expected): + args = (2013, 10, day, 0, 0, 0, 0, 0, get_freq("B")) + assert period_ordinal(*args) == expected diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_timedeltas.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_timedeltas.py new file mode 100644 index 0000000..86d5cc7 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_timedeltas.py @@ -0,0 +1,30 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds + +from pandas import Timedelta, offsets + + +@pytest.mark.parametrize( + "obj,expected", + [ + (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9), + (Timedelta(minutes=-7), -7 * 60 * 1e9), + (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9), + (offsets.Nano(125), 125), + (1, 1), + (np.int64(2), 2), + (np.int32(3), 3), + ], +) +def test_delta_to_nanoseconds(obj, expected): + result = delta_to_nanoseconds(obj) + assert result == expected + + +def test_delta_to_nanoseconds_error(): + obj = np.array([123456789], dtype="m8[ns]") + + with pytest.raises(TypeError, match=""): + delta_to_nanoseconds(obj) diff --git a/venv/Lib/site-packages/pandas/tests/tslibs/test_timezones.py b/venv/Lib/site-packages/pandas/tests/tslibs/test_timezones.py new file mode 100644 index 0000000..03cc8fc --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/tslibs/test_timezones.py @@ -0,0 +1,108 @@ +from datetime import datetime + +import dateutil.tz +import pytest +import pytz + +from pandas._libs.tslibs import conversion, timezones + +from pandas import Timestamp + + +@pytest.mark.parametrize("tz_name", list(pytz.common_timezones)) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): + if tz_name == "UTC": + pytest.skip("UTC: special case in dateutil") + + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz("dateutil/" + tz_name) + + if tz_d is None: + pytest.skip(tz_name + ": dateutil does not know about this one") + + assert timezones._p_tz_cache_key(tz_p) != timezones._p_tz_cache_key(tz_d) + + +def test_tzlocal_repr(): + # see gh-13583 + ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()) + assert ts.tz == dateutil.tz.tzlocal() + assert "tz='tzlocal()')" in repr(ts) + + +def test_tzlocal_maybe_get_tz(): + # see gh-13583 + tz = timezones.maybe_get_tz("tzlocal()") + assert tz == dateutil.tz.tzlocal() + + +def test_tzlocal_offset(): + # see gh-13583 + # + # Get offset using normal datetime for test. 
+ ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()) + + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = offset.total_seconds() * 1000000000 + + assert ts.value + offset == Timestamp("2011-01-01").value + + +@pytest.fixture( + params=[ + (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), + ] +) +def infer_setup(request): + eastern, localize = request.param + + start_naive = datetime(2001, 1, 1) + end_naive = datetime(2009, 1, 1) + + start = localize(eastern, start_naive) + end = localize(eastern, end_naive) + + return eastern, localize, start, end, start_naive, end_naive + + +def test_infer_tz_compat(infer_setup): + eastern, _, start, end, start_naive, end_naive = infer_setup + + assert ( + timezones.infer_tzinfo(start, end) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(start, None) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(None, end) + is conversion.localize_pydatetime(end_naive, eastern).tzinfo + ) + + +def test_infer_tz_utc_localize(infer_setup): + _, _, start, end, start_naive, end_naive = infer_setup + utc = pytz.utc + + start = utc.localize(start_naive) + end = utc.localize(end_naive) + + assert timezones.infer_tzinfo(start, end) is utc + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_infer_tz_mismatch(infer_setup, ordered): + eastern, _, _, _, start_naive, end_naive = infer_setup + msg = "Inputs must both have the same timezone" + + utc = pytz.utc + start = utc.localize(start_naive) + end = conversion.localize_pydatetime(end_naive, eastern) + + args = (start, end) if ordered else (end, start) + + with pytest.raises(AssertionError, match=msg): + timezones.infer_tzinfo(*args) diff --git a/venv/Lib/site-packages/pandas/tests/util/__init__.py b/venv/Lib/site-packages/pandas/tests/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/util/conftest.py b/venv/Lib/site-packages/pandas/tests/util/conftest.py new file mode 100644 index 0000000..5eff49a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/conftest.py @@ -0,0 +1,26 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def check_dtype(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_exact(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_index_type(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_less_precise(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def check_categorical(request): + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_almost_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_almost_equal.py new file mode 100644 index 0000000..b804889 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_almost_equal.py @@ -0,0 +1,361 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series, Timestamp +import pandas._testing as tm + + +def _assert_almost_equal_both(a, b, **kwargs): + """ + Check that two objects are approximately equal. + + This check is performed commutatively. + + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `tm.assert_almost_equal`. 
+ """ + tm.assert_almost_equal(a, b, **kwargs) + tm.assert_almost_equal(b, a, **kwargs) + + +def _assert_not_almost_equal(a, b, **kwargs): + """ + Check that two objects are not approximately equal. + + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `tm.assert_almost_equal`. + """ + try: + tm.assert_almost_equal(a, b, **kwargs) + msg = f"{a} and {b} were approximately equal when they shouldn't have been" + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_almost_equal_both(a, b, **kwargs): + """ + Check that two objects are not approximately equal. + + This check is performed commutatively. + + Parameters + ---------- + a : object + The first object to compare. + b : object + The second object to compare. + kwargs : dict + The arguments passed to `tm.assert_almost_equal`. + """ + _assert_not_almost_equal(a, b, **kwargs) + _assert_not_almost_equal(b, a, **kwargs) + + +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1.1), + (1.1, 1.100001), + (np.int16(1), 1.000001), + (np.float64(1.1), 1.1), + (np.uint32(5), 5), + ], +) +def test_assert_almost_equal_numbers(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [(1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1))]) +def test_assert_not_almost_equal_numbers(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0)]) +def test_assert_almost_equal_numbers_with_zeros(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [(0.001, 0), (1, 0)]) +def test_assert_not_almost_equal_numbers_with_zeros(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("a,b", [(1, "abc"), (1, [1]), (1, object())]) +def test_assert_not_almost_equal_numbers_with_mixed(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize( + "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) +@pytest.mark.parametrize( + "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) +def test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype): + # Empty compare. 
+ _assert_almost_equal_both( + np.array([], dtype=left_dtype), + np.array([], dtype=right_dtype), + check_dtype=False, + ) + + +def test_assert_almost_equal_dicts(): + _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2}) + + +@pytest.mark.parametrize( + "a,b", + [ + ({"a": 1, "b": 2}, {"a": 1, "b": 3}), + ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), + ({"a": 1}, 1), + ({"a": 1}, "abc"), + ({"a": 1}, [1]), + ], +) +def test_assert_not_almost_equal_dicts(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize("val", [1, 2]) +def test_assert_almost_equal_dict_like_object(val): + dict_val = 1 + real_dict = dict(a=val) + + class DictLikeObj: + def keys(self): + return ("a",) + + def __getitem__(self, item): + if item == "a": + return dict_val + + func = ( + _assert_almost_equal_both if val == dict_val else _assert_not_almost_equal_both + ) + func(real_dict, DictLikeObj(), check_dtype=False) + + +def test_assert_almost_equal_strings(): + _assert_almost_equal_both("abc", "abc") + + +@pytest.mark.parametrize( + "a,b", [("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1])] +) +def test_assert_not_almost_equal_strings(a, b): + _assert_not_almost_equal_both(a, b) + + +@pytest.mark.parametrize( + "a,b", [([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3]))] +) +def test_assert_almost_equal_iterables(a, b): + _assert_almost_equal_both(a, b) + + +@pytest.mark.parametrize( + "a,b", + [ + # Class is different. + (np.array([1, 2, 3]), [1, 2, 3]), + # Dtype is different. + (np.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])), + # Can't compare generators. + (iter([1, 2, 3]), [1, 2, 3]), + ([1, 2, 3], [1, 2, 4]), + ([1, 2, 3], [1, 2, 3, 4]), + ([1, 2, 3], 1), + ], +) +def test_assert_not_almost_equal_iterables(a, b): + _assert_not_almost_equal(a, b) + + +def test_assert_almost_equal_null(): + _assert_almost_equal_both(None, None) + + +@pytest.mark.parametrize("a,b", [(None, np.NaN), (None, 0), (np.NaN, 0)]) +def test_assert_not_almost_equal_null(a, b): + _assert_not_almost_equal(a, b) + + +@pytest.mark.parametrize( + "a,b", + [ + (np.inf, np.inf), + (np.inf, float("inf")), + (np.array([np.inf, np.nan, -np.inf]), np.array([np.inf, np.nan, -np.inf])), + ( + np.array([np.inf, None, -np.inf], dtype=np.object_), + np.array([np.inf, np.nan, -np.inf], dtype=np.object_), + ), + ], +) +def test_assert_almost_equal_inf(a, b): + _assert_almost_equal_both(a, b) + + +def test_assert_not_almost_equal_inf(): + _assert_not_almost_equal_both(np.inf, 0) + + +@pytest.mark.parametrize( + "a,b", + [ + (Index([1.0, 1.1]), Index([1.0, 1.100001])), + (Series([1.0, 1.1]), Series([1.0, 1.100001])), + (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), + (DataFrame({"a": [1.0, 1.1]}), DataFrame({"a": [1.0, 1.100001]})), + ], +) +def test_assert_almost_equal_pandas(a, b): + _assert_almost_equal_both(a, b) + + +def test_assert_almost_equal_object(): + a = [Timestamp("2011-01-01"), Timestamp("2011-01-01")] + b = [Timestamp("2011-01-01"), Timestamp("2011-01-01")] + _assert_almost_equal_both(a, b) + + +def test_assert_almost_equal_value_mismatch(): + msg = "expected 2\\.00000 but got 1\\.00000, with decimal 5" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(1, 2) + + +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) +def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): + + msg = f"""numpy array are different + +numpy array classes are different +\\[left\\]: {klass1} 
+\\[right\\]: {klass2}""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(a, b) + + +def test_assert_almost_equal_value_mismatch1(): + msg = """numpy array are different + +numpy array values are different \\(66\\.66667 %\\) +\\[left\\]: \\[nan, 2\\.0, 3\\.0\\] +\\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + + +def test_assert_almost_equal_value_mismatch2(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(np.array([1, 2]), np.array([1, 3])) + + +def test_assert_almost_equal_value_mismatch3(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) + + +def test_assert_almost_equal_value_mismatch4(): + msg = """numpy array are different + +numpy array values are different \\(25\\.0 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) + + +def test_assert_almost_equal_shape_mismatch_override(): + msg = """Index are different + +Index shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") + + +def test_assert_almost_equal_unicode(): + # see gh-20503 + msg = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])) + + +def test_assert_almost_equal_timestamp(): + a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")]) + b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\] +\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal(a, b) + + +def test_assert_almost_equal_iterable_length_mismatch(): + msg = """Iterable are different + +Iterable length are different +\\[left\\]: 2 +\\[right\\]: 3""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal([1, 2], [3, 4, 5]) + + +def test_assert_almost_equal_iterable_values_mismatch(): + msg = """Iterable are different + +Iterable values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_almost_equal([1, 2], [1, 3]) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_categorical_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_categorical_equal.py new file mode 100644 index 0000000..8957e7a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_categorical_equal.py @@ -0,0 +1,90 @@ +import pytest + +from pandas 
import Categorical +import pandas._testing as tm + + +@pytest.mark.parametrize( + "c", + [Categorical([1, 2, 3, 4]), Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5])], +) +def test_categorical_equal(c): + tm.assert_categorical_equal(c, c) + + +@pytest.mark.parametrize("check_category_order", [True, False]) +def test_categorical_equal_order_mismatch(check_category_order): + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1]) + kwargs = dict(check_category_order=check_category_order) + + if check_category_order: + msg = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(100\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[4, 3, 2, 1\\], dtype='int64'\\)""" + with pytest.raises(AssertionError, match=msg): + tm.assert_categorical_equal(c1, c2, **kwargs) + else: + tm.assert_categorical_equal(c1, c2, **kwargs) + + +def test_categorical_equal_categories_mismatch(): + msg = """Categorical\\.categories are different + +Categorical\\.categories values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)""" + + c1 = Categorical([1, 2, 3, 4]) + c2 = Categorical([1, 2, 3, 5]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_categorical_equal(c1, c2) + + +def test_categorical_equal_codes_mismatch(): + categories = [1, 2, 3, 4] + msg = """Categorical\\.codes are different + +Categorical\\.codes values are different \\(50\\.0 %\\) +\\[left\\]: \\[0, 1, 3, 2\\] +\\[right\\]: \\[0, 1, 2, 3\\]""" + + c1 = Categorical([1, 2, 4, 3], categories=categories) + c2 = Categorical([1, 2, 3, 4], categories=categories) + + with pytest.raises(AssertionError, match=msg): + tm.assert_categorical_equal(c1, c2) + + +def test_categorical_equal_ordered_mismatch(): + data = [1, 2, 3, 4] + msg = """Categorical are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""" + + c1 = Categorical(data, ordered=False) + c2 = Categorical(data, ordered=True) + + with pytest.raises(AssertionError, match=msg): + tm.assert_categorical_equal(c1, c2) + + +@pytest.mark.parametrize("obj", ["index", "foo", "pandas"]) +def test_categorical_equal_object_override(obj): + data = [1, 2, 3, 4] + msg = f"""{obj} are different + +Attribute "ordered" are different +\\[left\\]: False +\\[right\\]: True""" + + c1 = Categorical(data, ordered=False) + c2 = Categorical(data, ordered=True) + + with pytest.raises(AssertionError, match=msg): + tm.assert_categorical_equal(c1, c2, obj=obj) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_extension_array_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_extension_array_equal.py new file mode 100644 index 0000000..0547323 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_extension_array_equal.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(), # Default is check_exact=False + dict(check_exact=False), + dict(check_exact=True), + ], +) +def test_assert_extension_array_equal_not_exact(kwargs): + # see gh-23709 + arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936]) + arr2 = SparseArray([-0.17387645482451206, 0.3414148016424937]) + + if kwargs.get("check_exact", False): + msg = """\ +ExtensionArray are different + 
+ExtensionArray values are different \\(50\\.0 %\\) +\\[left\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\] +\\[right\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + else: + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + + +@pytest.mark.parametrize( + "check_less_precise", [True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] +) +def test_assert_extension_array_equal_less_precise(check_less_precise): + arr1 = SparseArray([0.5, 0.123456]) + arr2 = SparseArray([0.5, 0.123457]) + + kwargs = dict(check_less_precise=check_less_precise) + + if check_less_precise is False or check_less_precise >= 5: + msg = """\ +ExtensionArray are different + +ExtensionArray values are different \\(50\\.0 %\\) +\\[left\\]: \\[0\\.5, 0\\.123456\\] +\\[right\\]: \\[0\\.5, 0\\.123457\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + else: + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + + +def test_assert_extension_array_equal_dtype_mismatch(check_dtype): + end = 5 + kwargs = dict(check_dtype=check_dtype) + + arr1 = SparseArray(np.arange(end, dtype="int64")) + arr2 = SparseArray(np.arange(end, dtype="int32")) + + if check_dtype: + msg = """\ +ExtensionArray are different + +Attribute "dtype" are different +\\[left\\]: Sparse\\[int64, 0\\] +\\[right\\]: Sparse\\[int32, 0\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + else: + tm.assert_extension_array_equal(arr1, arr2, **kwargs) + + +def test_assert_extension_array_equal_missing_values(): + arr1 = SparseArray([np.nan, 1, 2, np.nan]) + arr2 = SparseArray([np.nan, 1, 2, 3]) + + msg = """\ +ExtensionArray NA mask are different + +ExtensionArray NA mask values are different \\(25\\.0 %\\) +\\[left\\]: \\[True, False, False, True\\] +\\[right\\]: \\[True, False, False, False\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_extension_array_equal(arr1, arr2) + + +@pytest.mark.parametrize("side", ["left", "right"]) +def test_assert_extension_array_equal_non_extension_array(side): + numpy_array = np.arange(5) + extension_array = SparseArray(numpy_array) + + msg = f"{side} is not an ExtensionArray" + args = ( + (numpy_array, extension_array) + if side == "left" + else (extension_array, numpy_array) + ) + + with pytest.raises(AssertionError, match=msg): + tm.assert_extension_array_equal(*args) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_frame_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_frame_equal.py new file mode 100644 index 0000000..23c845f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_frame_equal.py @@ -0,0 +1,224 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def by_blocks_fixture(request): + return request.param + + +@pytest.fixture(params=["DataFrame", "Series"]) +def obj_fixture(request): + return request.param + + +def _assert_frame_equal_both(a, b, **kwargs): + """ + Check that two DataFrame equal. + + This check is performed commutatively. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `tm.assert_frame_equal`. 
+ """ + tm.assert_frame_equal(a, b, **kwargs) + tm.assert_frame_equal(b, a, **kwargs) + + +def _assert_not_frame_equal(a, b, **kwargs): + """ + Check that two DataFrame are not equal. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `tm.assert_frame_equal`. + """ + try: + tm.assert_frame_equal(a, b, **kwargs) + msg = "The two DataFrames were equal when they shouldn't have been" + + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_frame_equal_both(a, b, **kwargs): + """ + Check that two DataFrame are not equal. + + This check is performed commutatively. + + Parameters + ---------- + a : DataFrame + The first DataFrame to compare. + b : DataFrame + The second DataFrame to compare. + kwargs : dict + The arguments passed to `tm.assert_frame_equal`. + """ + _assert_not_frame_equal(a, b, **kwargs) + _assert_not_frame_equal(b, a, **kwargs) + + +@pytest.mark.parametrize("check_like", [True, False]) +def test_frame_equal_row_order_mismatch(check_like, obj_fixture): + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) + + if not check_like: # Do not ignore row-column orderings. + msg = f"{obj_fixture}.index are different" + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) + else: + _assert_frame_equal_both(df1, df2, check_like=check_like, obj=obj_fixture) + + +@pytest.mark.parametrize( + "df1,df2", + [ + (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), + (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), + ], +) +def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): + msg = f"{obj_fixture} are different" + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, obj=obj_fixture) + + +@pytest.mark.parametrize( + "df1,df2,msg", + [ + # Index + ( + DataFrame.from_records({"a": [1, 2], "c": ["l1", "l2"]}, index=["a"]), + DataFrame.from_records({"a": [1.0, 2.0], "c": ["l1", "l2"]}, index=["a"]), + "DataFrame\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + "MultiIndex level \\[0\\] are different", + ), + ], +) +def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type): + kwargs = dict(check_index_type=check_index_type) + + if check_index_type: + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, **kwargs) + else: + tm.assert_frame_equal(df1, df2, **kwargs) + + +def test_empty_dtypes(check_dtype): + columns = ["col1", "col2"] + df1 = DataFrame(columns=columns) + df2 = DataFrame(columns=columns) + + kwargs = dict(check_dtype=check_dtype) + df1["col1"] = df1["col1"].astype("int64") + + if check_dtype: + msg = r"Attributes of DataFrame\..* are different" + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, **kwargs) + else: + tm.assert_frame_equal(df1, df2, **kwargs) + + +def test_frame_equal_index_mismatch(obj_fixture): + msg = f"""{obj_fixture}\\.index are different + +{obj_fixture}\\.index values are different \\(33\\.33333 %\\) +\\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], 
dtype='object'\\)""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, obj=obj_fixture) + + +def test_frame_equal_columns_mismatch(obj_fixture): + msg = f"""{obj_fixture}\\.columns are different + +{obj_fixture}\\.columns values are different \\(50\\.0 %\\) +\\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, obj=obj_fixture) + + +def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): + obj = obj_fixture + msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different + +{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) +\\[left\\]: \\[4, 5, 6\\] +\\[right\\]: \\[4, 5, 7\\]""" + + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) + + +@pytest.mark.parametrize( + "df1,df2,msg", + [ + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), + """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different + +{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) +\\[left\\]: \\[é, è, ë\\] +\\[right\\]: \\[é, è, e̊\\]""", + ), + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), + """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different + +{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[a, a, a\\]""", + ), + ], +) +def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): + # see gh-20503 + # + # Test ensures that `tm.assert_frame_equals` raises the right exception + # when comparing DataFrames containing differing unicode objects. 
+ msg = msg.format(obj=obj_fixture) + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_index_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_index_equal.py new file mode 100644 index 0000000..bbbeebc --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_index_equal.py @@ -0,0 +1,172 @@ +import numpy as np +import pytest + +from pandas import Categorical, Index, MultiIndex, NaT +import pandas._testing as tm + + +def test_index_equal_levels_mismatch(): + msg = """Index are different + +Index levels are different +\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 2, MultiIndex\\(\\[\\('A', 1\\), + \\('A', 2\\), + \\('B', 3\\), + \\('B', 4\\)\\], + \\)""" + + idx1 = Index([1, 2, 3]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, exact=False) + + +def test_index_equal_values_mismatch(check_exact): + msg = """MultiIndex level \\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_length_mismatch(check_exact): + msg = """Index are different + +Index length are different +\\[left\\]: 3, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: 4, Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + idx1 = Index([1, 2, 3]) + idx2 = Index([1, 2, 3, 4]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_class_mismatch(check_exact): + msg = """Index are different + +Index classes are different +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Float64Index\\(\\[1\\.0, 2\\.0, 3\\.0\\], dtype='float64'\\)""" + + idx1 = Index([1, 2, 3]) + idx2 = Index([1, 2, 3.0]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact) + + +def test_index_equal_values_close(check_exact): + idx1 = Index([1, 2, 3.0]) + idx2 = Index([1, 2, 3.0000000001]) + + if check_exact: + msg = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, check_exact=check_exact) + else: + tm.assert_index_equal(idx1, idx2, check_exact=check_exact) + + +def test_index_equal_values_less_close(check_exact, check_less_precise): + idx1 = Index([1, 2, 3.0]) + idx2 = Index([1, 2, 3.0001]) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + + if check_exact or not check_less_precise: + msg = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Float64Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\) +\\[right\\]: Float64Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)""" + + with pytest.raises(AssertionError, match=msg): + 
tm.assert_index_equal(idx1, idx2, **kwargs) + else: + tm.assert_index_equal(idx1, idx2, **kwargs) + + +def test_index_equal_values_too_far(check_exact, check_less_precise): + idx1 = Index([1, 2, 3]) + idx2 = Index([1, 2, 4]) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + + msg = """Index are different + +Index values are different \\(33\\.33333 %\\) +\\[left\\]: Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, **kwargs) + + +def test_index_equal_level_values_mismatch(check_exact, check_less_precise): + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + + msg = """MultiIndex level \\[1\\] are different + +MultiIndex level \\[1\\] values are different \\(25\\.0 %\\) +\\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, **kwargs) + + +@pytest.mark.parametrize( + "name1,name2", + [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], +) +def test_index_equal_names(name1, name2): + + idx1 = Index([1, 2, 3], name=name1) + idx2 = Index([1, 2, 3], name=name2) + + if name1 == name2 or name1 is name2: + tm.assert_index_equal(idx1, idx2) + else: + name1 = "'x'" if name1 == "x" else name1 + name2 = "'x'" if name2 == "x" else name2 + msg = f"""Index are different + +Attribute "names" are different +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2) + + +def test_index_equal_category_mismatch(check_categorical): + msg = """Index are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ +ordered=False\\)""" + + idx1 = Index(Categorical(["a", "b"])) + idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"])) + + if check_categorical: + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical) + else: + tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_interval_array_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_interval_array_equal.py new file mode 100644 index 0000000..96f2973 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_interval_array_equal.py @@ -0,0 +1,81 @@ +import pytest + +from pandas import interval_range +import pandas._testing as tm + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(start=0, periods=4), + dict(start=1, periods=5), + dict(start=5, end=10, closed="left"), + ], +) +def test_interval_array_equal(kwargs): + arr = interval_range(**kwargs).values + tm.assert_interval_array_equal(arr, arr) + + +def test_interval_array_equal_closed_mismatch(): + kwargs = dict(start=0, periods=5) + arr1 = interval_range(closed="left", **kwargs).values + arr2 = interval_range(closed="right", **kwargs).values + + msg = """\ +IntervalArray are different + +Attribute "closed" are different +\\[left\\]: left +\\[right\\]: right""" + + with 
pytest.raises(AssertionError, match=msg): + tm.assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_periods_mismatch(): + kwargs = dict(start=0) + arr1 = interval_range(periods=5, **kwargs).values + arr2 = interval_range(periods=6, **kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left length are different +\\[left\\]: 5, Int64Index\\(\\[0, 1, 2, 3, 4\\], dtype='int64'\\) +\\[right\\]: 6, Int64Index\\(\\[0, 1, 2, 3, 4, 5\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_end_mismatch(): + kwargs = dict(start=0, periods=5) + arr1 = interval_range(end=10, **kwargs).values + arr2 = interval_range(end=20, **kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left values are different \\(80.0 %\\) +\\[left\\]: Int64Index\\(\\[0, 2, 4, 6, 8\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[0, 4, 8, 12, 16\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_interval_array_equal(arr1, arr2) + + +def test_interval_array_equal_start_mismatch(): + kwargs = dict(periods=4) + arr1 = interval_range(start=0, **kwargs).values + arr2 = interval_range(start=1, **kwargs).values + + msg = """\ +IntervalArray.left are different + +IntervalArray.left values are different \\(100.0 %\\) +\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) +\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_interval_array_equal(arr1, arr2) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_numpy_array_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_numpy_array_equal.py new file mode 100644 index 0000000..c8ae9eb --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_numpy_array_equal.py @@ -0,0 +1,177 @@ +import numpy as np +import pytest + +from pandas import Timestamp +import pandas._testing as tm + + +def test_assert_numpy_array_equal_shape_mismatch(): + msg = """numpy array are different + +numpy array shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5])) + + +def test_assert_numpy_array_equal_bad_type(): + expected = "Expected type" + + with pytest.raises(AssertionError, match=expected): + tm.assert_numpy_array_equal(1, 2) + + +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) +def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): + msg = f"""numpy array are different + +numpy array classes are different +\\[left\\]: {klass1} +\\[right\\]: {klass2}""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(a, b) + + +def test_assert_numpy_array_equal_value_mismatch1(): + msg = """numpy array are different + +numpy array values are different \\(66\\.66667 %\\) +\\[left\\]: \\[nan, 2\\.0, 3\\.0\\] +\\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) + + +def test_assert_numpy_array_equal_value_mismatch2(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1, 2\\] +\\[right\\]: \\[1, 3\\]""" + + with pytest.raises(AssertionError, match=msg): + 
tm.assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3])) + + +def test_assert_numpy_array_equal_value_mismatch3(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) + + +def test_assert_numpy_array_equal_value_mismatch4(): + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[1\\.1, 2\\.000001\\] +\\[right\\]: \\[1\\.1, 2.0\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) + + +def test_assert_numpy_array_equal_value_mismatch5(): + msg = """numpy array are different + +numpy array values are different \\(16\\.66667 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) + + +def test_assert_numpy_array_equal_value_mismatch6(): + msg = """numpy array are different + +numpy array values are different \\(25\\.0 %\\) +\\[left\\]: \\[\\[1, 2\\], \\[3, 4\\]\\] +\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal( + np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]]) + ) + + +def test_assert_numpy_array_equal_shape_mismatch_override(): + msg = """Index are different + +Index shapes are different +\\[left\\]: \\(2L*,\\) +\\[right\\]: \\(3L*,\\)""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") + + +def test_numpy_array_equal_unicode(): + # see gh-20503 + # + # Test ensures that `tm.assert_numpy_array_equals` raises the right + # exception when comparing np.arrays containing differing unicode objects. 
+ msg = """numpy array are different + +numpy array values are different \\(33\\.33333 %\\) +\\[left\\]: \\[á, à, ä\\] +\\[right\\]: \\[á, à, å\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal( + np.array(["á", "à", "ä"]), np.array(["á", "à", "å"]) + ) + + +def test_numpy_array_equal_object(): + a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")]) + b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + + msg = """numpy array are different + +numpy array values are different \\(50\\.0 %\\) +\\[left\\]: \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\] +\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]""" + + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(a, b) + + +@pytest.mark.parametrize("other_type", ["same", "copy"]) +@pytest.mark.parametrize("check_same", ["same", "copy"]) +def test_numpy_array_equal_copy_flag(other_type, check_same): + a = np.array([1, 2, 3]) + msg = None + + if other_type == "same": + other = a.view() + else: + other = a.copy() + + if check_same != other_type: + msg = ( + r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" + if check_same == "same" + else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)" + ) + + if msg is not None: + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(a, other, check_same=check_same) + else: + tm.assert_numpy_array_equal(a, other, check_same=check_same) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_produces_warning.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_produces_warning.py new file mode 100644 index 0000000..87765c9 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_produces_warning.py @@ -0,0 +1,22 @@ +import warnings + +import pytest + +import pandas._testing as tm + + +def f(): + warnings.warn("f1", FutureWarning) + warnings.warn("f2", RuntimeWarning) + + +@pytest.mark.filterwarnings("ignore:f1:FutureWarning") +def test_assert_produces_warning_honors_filter(): + # Raise by default. + msg = r"Caused unexpected warning\(s\)" + with pytest.raises(AssertionError, match=msg): + with tm.assert_produces_warning(RuntimeWarning): + f() + + with tm.assert_produces_warning(RuntimeWarning, raise_on_extra_warnings=False): + f() diff --git a/venv/Lib/site-packages/pandas/tests/util/test_assert_series_equal.py b/venv/Lib/site-packages/pandas/tests/util/test_assert_series_equal.py new file mode 100644 index 0000000..eaf0824 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_assert_series_equal.py @@ -0,0 +1,196 @@ +import pytest + +from pandas import Categorical, DataFrame, Series +import pandas._testing as tm + + +def _assert_series_equal_both(a, b, **kwargs): + """ + Check that two Series equal. + + This check is performed commutatively. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `tm.assert_series_equal`. + """ + tm.assert_series_equal(a, b, **kwargs) + tm.assert_series_equal(b, a, **kwargs) + + +def _assert_not_series_equal(a, b, **kwargs): + """ + Check that two Series are not equal. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `tm.assert_series_equal`. 
+ """ + try: + tm.assert_series_equal(a, b, **kwargs) + msg = "The two Series were equal when they shouldn't have been" + + pytest.fail(msg=msg) + except AssertionError: + pass + + +def _assert_not_series_equal_both(a, b, **kwargs): + """ + Check that two Series are not equal. + + This check is performed commutatively. + + Parameters + ---------- + a : Series + The first Series to compare. + b : Series + The second Series to compare. + kwargs : dict + The arguments passed to `tm.assert_series_equal`. + """ + _assert_not_series_equal(a, b, **kwargs) + _assert_not_series_equal(b, a, **kwargs) + + +@pytest.mark.parametrize("data", [range(3), list("abc"), list("áàä")]) +def test_series_equal(data): + _assert_series_equal_both(Series(data), Series(data)) + + +@pytest.mark.parametrize( + "data1,data2", + [ + (range(3), range(1, 4)), + (list("abc"), list("xyz")), + (list("áàä"), list("éèë")), + (list("áàä"), list(b"aaa")), + (range(3), range(4)), + ], +) +def test_series_not_equal_value_mismatch(data1, data2): + _assert_not_series_equal_both(Series(data1), Series(data2)) + + +@pytest.mark.parametrize( + "kwargs", + [ + dict(dtype="float64"), # dtype mismatch + dict(index=[1, 2, 4]), # index mismatch + dict(name="foo"), # name mismatch + ], +) +def test_series_not_equal_metadata_mismatch(kwargs): + data = range(3) + s1 = Series(data) + + s2 = Series(data, **kwargs) + _assert_not_series_equal_both(s1, s2) + + +@pytest.mark.parametrize("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)]) +@pytest.mark.parametrize("dtype", ["float32", "float64"]) +@pytest.mark.parametrize("check_less_precise", [False, True, 0, 1, 2, 3, 10]) +def test_less_precise(data1, data2, dtype, check_less_precise): + s1 = Series([data1], dtype=dtype) + s2 = Series([data2], dtype=dtype) + + kwargs = dict(check_less_precise=check_less_precise) + + if (check_less_precise is False or check_less_precise == 10) or ( + (check_less_precise is True or check_less_precise >= 3) + and abs(data1 - data2) >= 0.0001 + ): + msg = "Series values are different" + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(s1, s2, **kwargs) + else: + _assert_series_equal_both(s1, s2, **kwargs) + + +@pytest.mark.parametrize( + "s1,s2,msg", + [ + # Index + ( + Series(["l1", "l2"], index=[1, 2]), + Series(["l1", "l2"], index=[1.0, 2.0]), + "Series\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + "MultiIndex level \\[0\\] are different", + ), + ], +) +def test_series_equal_index_dtype(s1, s2, msg, check_index_type): + kwargs = dict(check_index_type=check_index_type) + + if check_index_type: + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(s1, s2, **kwargs) + else: + tm.assert_series_equal(s1, s2, **kwargs) + + +def test_series_equal_length_mismatch(check_less_precise): + msg = """Series are different + +Series length are different +\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) +\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" + + s1 = Series([1, 2, 3]) + s2 = Series([1, 2, 3, 4]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + + +def test_series_equal_values_mismatch(check_less_precise): + msg = """Series are different + +Series values are different \\(33\\.33333 %\\) +\\[left\\]: \\[1, 2, 3\\] +\\[right\\]: 
\\[1, 2, 4\\]""" + + s1 = Series([1, 2, 3]) + s2 = Series([1, 2, 4]) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + + +def test_series_equal_categorical_mismatch(check_categorical): + msg = """Attributes of Series are different + +Attribute "dtype" are different +\\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) +\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ +ordered=False\\)""" + + s1 = Series(Categorical(["a", "b"])) + s2 = Series(Categorical(["a", "b"], categories=list("abc"))) + + if check_categorical: + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(s1, s2, check_categorical=check_categorical) + else: + _assert_series_equal_both(s1, s2, check_categorical=check_categorical) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_deprecate.py b/venv/Lib/site-packages/pandas/tests/util/test_deprecate.py new file mode 100644 index 0000000..ee4f7e3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_deprecate.py @@ -0,0 +1,64 @@ +from textwrap import dedent + +import pytest + +from pandas.util._decorators import deprecate + +import pandas._testing as tm + + +def new_func(): + """ + This is the summary. The deprecate directive goes next. + + This is the extended summary. The deprecate directive goes before this. + """ + return "new_func called" + + +def new_func_no_docstring(): + return "new_func_no_docstring called" + + +def new_func_wrong_docstring(): + """Summary should be in the next line.""" + return "new_func_wrong_docstring called" + + +def new_func_with_deprecation(): + """ + This is the summary. The deprecate directive goes next. + + .. deprecated:: 1.0 + Use new_func instead. + + This is the extended summary. The deprecate directive goes before this. + """ + pass + + +def test_deprecate_ok(): + depr_func = deprecate("depr_func", new_func, "1.0", msg="Use new_func instead.") + + with tm.assert_produces_warning(FutureWarning): + result = depr_func() + + assert result == "new_func called" + assert depr_func.__doc__ == dedent(new_func_with_deprecation.__doc__) + + +def test_deprecate_no_docstring(): + depr_func = deprecate( + "depr_func", new_func_no_docstring, "1.0", msg="Use new_func instead." + ) + with tm.assert_produces_warning(FutureWarning): + result = depr_func() + assert result == "new_func_no_docstring called" + + +def test_deprecate_wrong_docstring(): + msg = "deprecate needs a correctly formatted docstring" + with pytest.raises(AssertionError, match=msg): + deprecate( + "depr_func", new_func_wrong_docstring, "1.0", msg="Use new_func instead." 
+ ) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_deprecate_kwarg.py b/venv/Lib/site-packages/pandas/tests/util/test_deprecate_kwarg.py new file mode 100644 index 0000000..b165e9f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_deprecate_kwarg.py @@ -0,0 +1,90 @@ +import pytest + +from pandas.util._decorators import deprecate_kwarg + +import pandas._testing as tm + + +@deprecate_kwarg("old", "new") +def _f1(new=False): + return new + + +_f2_mappings = {"yes": True, "no": False} + + +@deprecate_kwarg("old", "new", _f2_mappings) +def _f2(new=False): + return new + + +def _f3_mapping(x): + return x + 1 + + +@deprecate_kwarg("old", "new", _f3_mapping) +def _f3(new=0): + return new + + +@pytest.mark.parametrize("key,klass", [("old", FutureWarning), ("new", None)]) +def test_deprecate_kwarg(key, klass): + x = 78 + + with tm.assert_produces_warning(klass): + assert _f1(**{key: x}) == x + + +@pytest.mark.parametrize("key", list(_f2_mappings.keys())) +def test_dict_deprecate_kwarg(key): + with tm.assert_produces_warning(FutureWarning): + assert _f2(old=key) == _f2_mappings[key] + + +@pytest.mark.parametrize("key", ["bogus", 12345, -1.23]) +def test_missing_deprecate_kwarg(key): + with tm.assert_produces_warning(FutureWarning): + assert _f2(old=key) == key + + +@pytest.mark.parametrize("x", [1, -1.4, 0]) +def test_callable_deprecate_kwarg(x): + with tm.assert_produces_warning(FutureWarning): + assert _f3(old=x) == _f3_mapping(x) + + +def test_callable_deprecate_kwarg_fail(): + msg = "((can only|cannot) concatenate)|(must be str)|(Can't convert)" + + with pytest.raises(TypeError, match=msg): + _f3(old="hello") + + +def test_bad_deprecate_kwarg(): + msg = "mapping from old to new argument values must be dict or callable!" + + with pytest.raises(TypeError, match=msg): + + @deprecate_kwarg("old", "new", 0) + def f4(new=None): + return new + + +@deprecate_kwarg("old", None) +def _f4(old=True, unchanged=True): + return old, unchanged + + +@pytest.mark.parametrize("key", ["old", "unchanged"]) +def test_deprecate_keyword(key): + x = 9 + + if key == "old": + klass = FutureWarning + expected = (x, True) + else: + klass = None + expected = (True, x) + + with tm.assert_produces_warning(klass): + assert _f4(**{key: x}) == expected diff --git a/venv/Lib/site-packages/pandas/tests/util/test_hashing.py b/venv/Lib/site-packages/pandas/tests/util/test_hashing.py new file mode 100644 index 0000000..c856585 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_hashing.py @@ -0,0 +1,383 @@ +import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm +from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples +from pandas.util import hash_array, hash_pandas_object + + +@pytest.fixture( + params=[ + Series([1, 2, 3] * 3, dtype="int32"), + Series([None, 2.5, 3.5] * 3, dtype="float32"), + Series(["a", "b", "c"] * 3, dtype="category"), + Series(["d", "e", "f"] * 3), + Series([True, False, True] * 3), + Series(pd.date_range("20130101", periods=9)), + Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), + Series(pd.timedelta_range("2000", periods=9)), + ] +) +def series(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def index(request): + return request.param + + +def _check_equal(obj, **kwargs): + """ + Check that hashing an objects produces the same value each time. 
+ + Parameters + ---------- + obj : object + The object to hash. + kwargs : kwargs + Keyword arguments to pass to the hashing function. + """ + a = hash_pandas_object(obj, **kwargs) + b = hash_pandas_object(obj, **kwargs) + tm.assert_series_equal(a, b) + + +def _check_not_equal_with_index(obj): + """ + Check the hash of an object with and without its index is not the same. + + Parameters + ---------- + obj : object + The object to hash. + """ + if not isinstance(obj, Index): + a = hash_pandas_object(obj, index=True) + b = hash_pandas_object(obj, index=False) + + if len(obj): + assert not (a == b).all() + + +def test_consistency(): + # Check that our hash doesn't change because of a mistake + # in the actual code; this is the ground truth. + result = hash_pandas_object(Index(["foo", "bar", "baz"])) + expected = Series( + np.array( + [3600424527151052760, 1374399572096150070, 477881037637427054], + dtype="uint64", + ), + index=["foo", "bar", "baz"], + ) + tm.assert_series_equal(result, expected) + + +def test_hash_array(series): + arr = series.values + tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) + + +@pytest.mark.parametrize( + "arr2", [np.array([3, 4, "All"]), np.array([3, 4, "All"], dtype=object)] +) +def test_hash_array_mixed(arr2): + result1 = hash_array(np.array(["3", "4", "All"])) + result2 = hash_array(arr2) + + tm.assert_numpy_array_equal(result1, result2) + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_array_errors(val): + msg = "must pass a ndarray-like" + with pytest.raises(TypeError, match=msg): + hash_array(val) + + +def test_hash_tuples(): + tuples = [(1, "one"), (1, "two"), (2, "one")] + result = hash_tuples(tuples) + + expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values + tm.assert_numpy_array_equal(result, expected) + + result = hash_tuples(tuples[0]) + assert result == expected[0] + + +@pytest.mark.parametrize( + "tup", + [(1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), ("A", pd.Timestamp("2012-01-01"))], +) +def test_hash_tuple(tup): + # Test equivalence between + # hash_tuples and hash_tuple. + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + + assert result == expected + + +@pytest.mark.parametrize( + "val", + [ + 1, + 1.4, + "A", + b"A", + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz="Europe/Brussels"), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), + pd.Timedelta("1 days"), + datetime.timedelta(1), + pd.Period("2012-01-01", freq="D"), + pd.Interval(0, 1), + np.nan, + pd.NaT, + None, + ], +) +def test_hash_scalar(val): + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), categorize=True) + + assert result[0] == expected[0] + + +@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")]) +def test_hash_tuples_err(val): + msg = "must be convertible to a list-of-tuples" + with pytest.raises(TypeError, match=msg): + hash_tuples(val) + + +def test_multiindex_unique(): + mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) + assert mi.is_unique is True + + result = hash_pandas_object(mi) + assert result.is_unique is True + + +def test_multiindex_objects(): + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) + recons = mi._sort_levels_monotonic() + + # These are equal. 
+ assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # _hashed_values and hash_pandas_object(..., index=False) equivalency. + expected = hash_pandas_object(mi, index=False).values + result = mi._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = hash_pandas_object(recons, index=False).values + result = recons._hashed_values + + tm.assert_numpy_array_equal(result, expected) + + expected = mi._hashed_values + result = recons._hashed_values + + # Values should match, but in different order. + tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) + + +@pytest.mark.parametrize( + "obj", + [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + Series(dtype=object), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + MultiIndex.from_product( + [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] + ), + MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]), + ], +) +def test_hash_pandas_object(obj, index): + _check_equal(obj, index=index) + _check_not_equal_with_index(obj) + + +def test_hash_pandas_object2(series, index): + _check_equal(series, index=index) + _check_not_equal_with_index(series) + + +@pytest.mark.parametrize( + "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])] +) +def test_hash_pandas_empty_object(obj, index): + # These are by-definition the same with + # or without the index as the data is empty. + _check_equal(obj, index=index) + + +@pytest.mark.parametrize( + "s1", + [ + Series(["a", "b", "c", "d"]), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4)), + ], +) +@pytest.mark.parametrize("categorize", [True, False]) +def test_categorical_consistency(s1, categorize): + # see gh-15143 + # + # Check that categoricals hash consistent with their values, + # not codes. This should work for categoricals of any dtype. + s2 = s1.astype("category").cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + + # These should all hash identically. + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + + +def test_categorical_with_nan_consistency(): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") + ) + expected = hash_array(c, categorize=False) + + c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) + result = hash_array(c, categorize=False) + + assert result[0] in expected + assert result[1] in expected + + +@pytest.mark.parametrize("obj", [pd.Timestamp("20130101")]) +def test_pandas_errors(obj): + msg = "Unexpected type for hashing" + with pytest.raises(TypeError, match=msg): + hash_pandas_object(obj) + + +def test_hash_keys(): + # Using different hash keys, should have + # different hashes for the same data. + # + # This only matters for object dtypes. 
+ obj = Series(list("abc")) + + a = hash_pandas_object(obj, hash_key="9876543210123456") + b = hash_pandas_object(obj, hash_key="9876543210123465") + + assert (a != b).all() + + +def test_invalid_key(): + # This only matters for object dtypes. + msg = "key should be a 16-byte string encoded" + + with pytest.raises(ValueError, match=msg): + hash_pandas_object(Series(list("abc")), hash_key="foo") + + +def test_already_encoded(index): + # If already encoded, then ok. + obj = Series(list("abc")).str.encode("utf8") + _check_equal(obj, index=index) + + +def test_alternate_encoding(index): + obj = Series(list("abc")) + _check_equal(obj, index=index, encoding="ascii") + + +@pytest.mark.parametrize("l_exp", range(8)) +@pytest.mark.parametrize("l_add", [0, 1]) +def test_same_len_hash_collisions(l_exp, l_add): + length = 2 ** (l_exp + 8) + l_add + s = tm.rands_array(length, 2) + + result = hash_array(s, "utf8") + assert not result[0] == result[1] + + +def test_hash_collisions(): + # Hash collisions are bad. + # + # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 + hashes = [ + "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa: E501 + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", # noqa: E501 + ] + + # These should be different. 
+ result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") + expected1 = np.array([14963968704024874985], dtype=np.uint64) + tm.assert_numpy_array_equal(result1, expected1) + + result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8") + expected2 = np.array([16428432627716348016], dtype=np.uint64) + tm.assert_numpy_array_equal(result2, expected2) + + result = hash_array(np.asarray(hashes, dtype=object), "utf8") + tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3) + + +def test_hash_object_none_key(): + # https://github.com/pandas-dev/pandas/issues/30887 + result = pd.util.hash_pandas_object(pd.Series(["a", "b"]), hash_key=None) + expected = pd.Series([4578374827886788867, 17338122309987883691], dtype="uint64") + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_safe_import.py b/venv/Lib/site-packages/pandas/tests/util/test_safe_import.py new file mode 100644 index 0000000..bd07bea --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_safe_import.py @@ -0,0 +1,39 @@ +import sys +import types + +import pytest + +import pandas.util._test_decorators as td + + +@pytest.mark.parametrize("name", ["foo", "hello123"]) +def test_safe_import_non_existent(name): + assert not td.safe_import(name) + + +def test_safe_import_exists(): + assert td.safe_import("pandas") + + +@pytest.mark.parametrize("min_version,valid", [("0.0.0", True), ("99.99.99", False)]) +def test_safe_import_versions(min_version, valid): + result = td.safe_import("pandas", min_version=min_version) + result = result if valid else not result + assert result + + +@pytest.mark.parametrize( + "min_version,valid", [(None, False), ("1.0", True), ("2.0", False)] +) +def test_safe_import_dummy(monkeypatch, min_version, valid): + mod_name = "hello123" + + mod = types.ModuleType(mod_name) + mod.__version__ = "1.5" + + if min_version is not None: + monkeypatch.setitem(sys.modules, mod_name, mod) + + result = td.safe_import(mod_name, min_version=min_version) + result = result if valid else not result + assert result diff --git a/venv/Lib/site-packages/pandas/tests/util/test_util.py b/venv/Lib/site-packages/pandas/tests/util/test_util.py new file mode 100644 index 0000000..6a19ade --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_util.py @@ -0,0 +1,78 @@ +import os + +import pytest + +import pandas.compat as compat + +import pandas._testing as tm + + +def test_rands(): + r = tm.rands(10) + assert len(r) == 10 + + +def test_rands_array_1d(): + arr = tm.rands_array(5, size=10) + assert arr.shape == (10,) + assert len(arr[0]) == 5 + + +def test_rands_array_2d(): + arr = tm.rands_array(7, size=(10, 
10)) + assert arr.shape == (10, 10) + assert len(arr[1, 1]) == 7 + + +def test_numpy_err_state_is_default(): + expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"} + import numpy as np + + # The error state should be unchanged after that import. + assert np.geterr() == expected + + +def test_convert_rows_list_to_csv_str(): + rows_list = ["aaa", "bbb", "ccc"] + ret = tm.convert_rows_list_to_csv_str(rows_list) + + if compat.is_platform_windows(): + expected = "aaa\r\nbbb\r\nccc\r\n" + else: + expected = "aaa\nbbb\nccc\n" + + assert ret == expected + + +def test_create_temp_directory(): + with tm.ensure_clean_dir() as path: + assert os.path.exists(path) + assert os.path.isdir(path) + assert not os.path.exists(path) + + +@pytest.mark.parametrize("strict_data_files", [True, False]) +def test_datapath_missing(datapath): + with pytest.raises(ValueError, match="Could not find file"): + datapath("not_a_file") + + +def test_datapath(datapath): + args = ("data", "iris.csv") + + result = datapath(*args) + expected = os.path.join(os.path.dirname(os.path.dirname(__file__)), *args) + + assert result == expected + + +def test_rng_context(): + import numpy as np + + expected0 = 1.764052345967664 + expected1 = 1.6243453636632417 + + with tm.RNGContext(0): + with tm.RNGContext(1): + assert np.random.randn() == expected1 + assert np.random.randn() == expected0 diff --git a/venv/Lib/site-packages/pandas/tests/util/test_validate_args.py b/venv/Lib/site-packages/pandas/tests/util/test_validate_args.py new file mode 100644 index 0000000..746d859 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_validate_args.py @@ -0,0 +1,67 @@ +import pytest + +from pandas.util._validators import validate_args + +_fname = "func" + + +def test_bad_min_fname_arg_count(): + msg = "'max_fname_arg_count' must be non-negative" + + with pytest.raises(ValueError, match=msg): + validate_args(_fname, (None,), -1, "foo") + + +def test_bad_arg_length_max_value_single(): + args = (None, None) + compat_args = ("foo",) + + min_fname_arg_count = 0 + max_length = len(compat_args) + min_fname_arg_count + actual_length = len(args) + min_fname_arg_count + msg = ( + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" + ) + + with pytest.raises(TypeError, match=msg): + validate_args(_fname, args, min_fname_arg_count, compat_args) + + +def test_bad_arg_length_max_value_multiple(): + args = (None, None) + compat_args = dict(foo=None) + + min_fname_arg_count = 2 + max_length = len(compat_args) + min_fname_arg_count + actual_length = len(args) + min_fname_arg_count + msg = ( + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" + ) + + with pytest.raises(TypeError, match=msg): + validate_args(_fname, args, min_fname_arg_count, compat_args) + + +@pytest.mark.parametrize("i", range(1, 3)) +def test_not_all_defaults(i): + bad_arg = "foo" + msg = ( + f"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" + ) + + compat_args = {"foo": 2, "bar": -1, "baz": 3} + arg_vals = (1, -1, 3) + + with pytest.raises(ValueError, match=msg): + validate_args(_fname, arg_vals[:i], 2, compat_args) + + +def test_validation(): + # No exceptions should be raised. 
+ validate_args(_fname, (None,), 2, dict(out=None)) + + compat_args = {"axis": 1, "out": None} + validate_args(_fname, (1, None), 2, compat_args) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_validate_args_and_kwargs.py b/venv/Lib/site-packages/pandas/tests/util/test_validate_args_and_kwargs.py new file mode 100644 index 0000000..941ba86 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_validate_args_and_kwargs.py @@ -0,0 +1,81 @@ +import pytest + +from pandas.util._validators import validate_args_and_kwargs + +_fname = "func" + + +def test_invalid_total_length_max_length_one(): + compat_args = ("foo",) + kwargs = {"foo": "FOO"} + args = ("FoO", "BaZ") + + min_fname_arg_count = 0 + max_length = len(compat_args) + min_fname_arg_count + actual_length = len(kwargs) + len(args) + min_fname_arg_count + + msg = ( + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" + ) + + with pytest.raises(TypeError, match=msg): + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) + + +def test_invalid_total_length_max_length_multiple(): + compat_args = ("foo", "bar", "baz") + kwargs = {"foo": "FOO", "bar": "BAR"} + args = ("FoO", "BaZ") + + min_fname_arg_count = 2 + max_length = len(compat_args) + min_fname_arg_count + actual_length = len(kwargs) + len(args) + min_fname_arg_count + + msg = ( + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" + ) + + with pytest.raises(TypeError, match=msg): + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) + + +@pytest.mark.parametrize("args,kwargs", [((), {"foo": -5, "bar": 2}), ((-5, 2), {})]) +def test_missing_args_or_kwargs(args, kwargs): + bad_arg = "bar" + min_fname_arg_count = 2 + + compat_args = {"foo": -5, bad_arg: 1} + + msg = ( + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" + ) + + with pytest.raises(ValueError, match=msg): + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) + + +def test_duplicate_argument(): + min_fname_arg_count = 2 + + compat_args = {"foo": None, "bar": None, "baz": None} + kwargs = {"foo": None, "bar": None} + args = (None,) # duplicate value for "foo" + + msg = fr"{_fname}\(\) got multiple values for keyword argument 'foo'" + + with pytest.raises(TypeError, match=msg): + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) + + +def test_validation(): + # No exceptions should be raised. 
+ compat_args = {"foo": 1, "bar": None, "baz": -2} + kwargs = {"baz": -2} + + args = (1, None) + min_fname_arg_count = 2 + + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) diff --git a/venv/Lib/site-packages/pandas/tests/util/test_validate_kwargs.py b/venv/Lib/site-packages/pandas/tests/util/test_validate_kwargs.py new file mode 100644 index 0000000..a7b6d8f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/util/test_validate_kwargs.py @@ -0,0 +1,63 @@ +import pytest + +from pandas.util._validators import validate_bool_kwarg, validate_kwargs + +_fname = "func" + + +def test_bad_kwarg(): + good_arg = "f" + bad_arg = good_arg + "o" + + compat_args = {good_arg: "foo", bad_arg + "o": "bar"} + kwargs = {good_arg: "foo", bad_arg: "bar"} + + msg = fr"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'" + + with pytest.raises(TypeError, match=msg): + validate_kwargs(_fname, kwargs, compat_args) + + +@pytest.mark.parametrize("i", range(1, 3)) +def test_not_all_none(i): + bad_arg = "foo" + msg = ( + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" + ) + + compat_args = {"foo": 1, "bar": "s", "baz": None} + + kwarg_keys = ("foo", "bar", "baz") + kwarg_vals = (2, "s", None) + + kwargs = dict(zip(kwarg_keys[:i], kwarg_vals[:i])) + + with pytest.raises(ValueError, match=msg): + validate_kwargs(_fname, kwargs, compat_args) + + +def test_validation(): + # No exceptions should be raised. + compat_args = {"f": None, "b": 1, "ba": "s"} + + kwargs = dict(f=None, b=1) + validate_kwargs(_fname, kwargs, compat_args) + + +@pytest.mark.parametrize("name", ["inplace", "copy"]) +@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) +def test_validate_bool_kwarg_fail(name, value): + msg = ( + f'For argument "{name}" expected type bool,' + f" received type {type(value).__name__}" + ) + + with pytest.raises(ValueError, match=msg): + validate_bool_kwarg(value, name) + + +@pytest.mark.parametrize("name", ["inplace", "copy"]) +@pytest.mark.parametrize("value", [True, False, None]) +def test_validate_bool_kwarg(name, value): + assert validate_bool_kwarg(value, name) == value diff --git a/venv/Lib/site-packages/pandas/tests/window/__init__.py b/venv/Lib/site-packages/pandas/tests/window/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tests/window/common.py b/venv/Lib/site-packages/pandas/tests/window/common.py new file mode 100644 index 0000000..6aeada3 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/common.py @@ -0,0 +1,386 @@ +from datetime import datetime + +import numpy as np +from numpy.random import randn + +from pandas import DataFrame, Series, bdate_range, notna +import pandas._testing as tm + +N, K = 100, 10 + + +class Base: + + _nan_locs = np.arange(20, 40) + _inf_locs = np.array([]) + + def _create_data(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = bdate_range(datetime(2009, 1, 1), periods=N) + self.series = Series(arr.copy(), index=self.rng) + self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(dtype=object), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 
5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # data is a tuple(object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +class ConsistencyBase(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def _test_moments_consistency_mock_mean(self, mean, mock_mean): + for (x, is_constant, no_nans) in self.data: + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr): + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else 
x.max().max() + + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + def _test_moments_consistency_var_debiasing_factors( + self, var_biased=None, var_unbiased=None, var_debiasing_factors=None + ): + for (x, is_constant, no_nans) in self.data: + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + ): + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) + + +def ew_func(A, B, com, name, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + +def check_binary_ew(name, A, B): + + result = ew_func(A=A, B=B, com=20, name=name, 
min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + +def check_binary_ew_min_periods(name, min_periods, A, B): + # GH 7898 + result = ew_func(A, B, 20, name=name, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = ew_func(empty, empty, 50, name=name, min_periods=min_periods) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = ew_func( + Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods + ) + tm.assert_series_equal(result, Series([np.NaN])) diff --git a/venv/Lib/site-packages/pandas/tests/window/conftest.py b/venv/Lib/site-packages/pandas/tests/window/conftest.py new file mode 100644 index 0000000..fb46ca5 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/conftest.py @@ -0,0 +1,89 @@ +import pytest + +import pandas.util._test_decorators as td + + +@pytest.fixture(params=[True, False]) +def raw(request): + return request.param + + +@pytest.fixture( + params=[ + "triang", + "blackman", + "hamming", + "bartlett", + "bohman", + "blackmanharris", + "nuttall", + "barthann", + ] +) +def win_types(request): + return request.param + + +@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"]) +def win_types_special(request): + return request.param + + +@pytest.fixture( + params=["sum", "mean", "median", "max", "min", "var", "std", "kurt", "skew"] +) +def arithmetic_win_operators(request): + return request.param + + +@pytest.fixture(params=["right", "left", "both", "neither"]) +def closed(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def center(request): + return request.param + + +@pytest.fixture(params=[None, 1]) +def min_periods(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + """parallel keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nogil(request): + """nogil keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nopython(request): + """nopython keyword argument for numba.jit""" + return request.param + + +@pytest.fixture( + params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] +) +def engine(request): + """engine keyword argument for rolling.apply""" + return request.param + + +@pytest.fixture( + params=[ + pytest.param(("numba", True), marks=td.skip_if_no("numba", "0.46.0")), + ("cython", True), + ("cython", False), + ] +) +def engine_and_raw(request): + """engine and raw keyword arguments for rolling.apply""" + return request.param diff --git a/venv/Lib/site-packages/pandas/tests/window/test_api.py b/venv/Lib/site-packages/pandas/tests/window/test_api.py new file mode 100644 index 0000000..680237d --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_api.py @@ -0,0 +1,344 @@ +from collections import OrderedDict + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, Series, Timestamp, concat +import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.tests.window.common import Base + + +class TestApi(Base): + def setup_method(self, method): + self._create_data() + + def 
test_getitem(self): + + r = self.frame.rolling(window=5) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns) + + r = self.frame.rolling(window=5)[1] + assert r._selected_obj.name == self.frame.columns[1] + + # technically this is allowed + r = self.frame.rolling(window=5)[1, 3] + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) + + r = self.frame.rolling(window=5)[[1, 3]] + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) + + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=["A", "B"]) + g = df.rolling(window=5) + with pytest.raises(KeyError, match="Columns not found: 'C'"): + g[["C"]] + with pytest.raises(KeyError, match="^[^A]+$"): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! + g[["A", "C"]] + + def test_attribute_access(self): + + df = DataFrame([[1, 2]], columns=["A", "B"]) + r = df.rolling(window=5) + tm.assert_series_equal(r.A.sum(), r["A"].sum()) + msg = "'Rolling' object has no attribute 'F'" + with pytest.raises(AttributeError, match=msg): + r.F + + def tests_skip_nuisance(self): + + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) + r = df.rolling(window=3) + result = r[["A", "B"]].sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) + tm.assert_frame_equal(result, expected) + + def test_skip_sum_object_raises(self): + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) + r = df.rolling(window=3) + result = r.sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) + tm.assert_frame_equal(result, expected) + + def test_agg(self): + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + + r = df.rolling(window=3) + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + + result = r.aggregate([np.mean, np.std]) + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + tm.assert_frame_equal(result, expected) + + result = r.aggregate({"A": np.mean, "B": np.std}) + + expected = concat([a_mean, b_std], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + result = r.aggregate({"A": ["mean", "std"]}) + expected = concat([a_mean, a_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) + tm.assert_frame_equal(result, expected) + + result = r["A"].aggregate(["mean", "sum"]) + expected = concat([a_mean, a_sum], axis=1) + expected.columns = ["mean", "sum"] + tm.assert_frame_equal(result, expected) + + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + # using a dict with renaming + r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) + + with pytest.raises(SpecificationError, match=msg): + r.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) + + result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + + exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_agg_apply(self, raw): + + # passed lambda + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + + r = 
df.rolling(window=3) + a_sum = r["A"].sum() + + result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw) + expected = concat([a_sum, rcustom], axis=1) + tm.assert_frame_equal(result, expected, check_like=True) + + def test_agg_consistency(self): + + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + r = df.rolling(window=3) + + result = r.agg([np.sum, np.mean]).columns + expected = pd.MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + tm.assert_index_equal(result, expected) + + result = r["A"].agg([np.sum, np.mean]).columns + expected = Index(["sum", "mean"]) + tm.assert_index_equal(result, expected) + + result = r.agg({"A": [np.sum, np.mean]}).columns + expected = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "mean")]) + tm.assert_index_equal(result, expected) + + def test_agg_nested_dicts(self): + + # API change for disallowing these types of nested dicts + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) + r = df.rolling(window=3) + + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) + + expected = concat( + [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) + with pytest.raises(SpecificationError, match=msg): + r[["A", "B"]].agg( + {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} + ) + + with pytest.raises(SpecificationError, match=msg): + r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) + + def test_count_nonnumeric_types(self): + # GH12541 + cols = [ + "int", + "float", + "string", + "datetime", + "timedelta", + "periods", + "fl_inf", + "fl_nan", + "str_nan", + "dt_nat", + "periods_nat", + ] + + df = DataFrame( + { + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "datetime": pd.date_range("20170101", periods=3), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + "periods": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period("2012-03"), + ], + "fl_inf": [1.0, 2.0, np.Inf], + "fl_nan": [1.0, 2.0, np.NaN], + "str_nan": ["aa", "bb", np.NaN], + "dt_nat": [ + Timestamp("20170101"), + Timestamp("20170203"), + Timestamp(None), + ], + "periods_nat": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period(None), + ], + }, + columns=cols, + ) + + expected = DataFrame( + { + "int": [1.0, 2.0, 2.0], + "float": [1.0, 2.0, 2.0], + "string": [1.0, 2.0, 2.0], + "datetime": [1.0, 2.0, 2.0], + "timedelta": [1.0, 2.0, 2.0], + "periods": [1.0, 2.0, 2.0], + "fl_inf": [1.0, 2.0, 2.0], + "fl_nan": [1.0, 2.0, 1.0], + "str_nan": [1.0, 2.0, 1.0], + "dt_nat": [1.0, 2.0, 1.0], + "periods_nat": [1.0, 2.0, 1.0], + }, + columns=cols, + ) + + result = df.rolling(window=2, min_periods=0).count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(1, min_periods=0).count() + expected = df.notna().astype(float) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") + def test_window_with_args(self): + # make sure that we are aggregating window functions correctly with arg + r = Series(np.random.randn(100)).rolling( + window=10, min_periods=1, win_type="gaussian" + ) + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["", ""] + result = r.aggregate([lambda x: 
x.mean(std=10), lambda x: x.mean(std=0.01)]) + tm.assert_frame_equal(result, expected) + + def a(x): + return x.mean(std=10) + + def b(x): + return x.mean(std=0.01) + + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["a", "b"] + result = r.aggregate([a, b]) + tm.assert_frame_equal(result, expected) + + def test_preserve_metadata(self): + # GH 10565 + s = Series(np.arange(100), name="foo") + + s2 = s.rolling(30).sum() + s3 = s.rolling(20).sum() + assert s2.name == "foo" + assert s3.name == "foo" + + @pytest.mark.parametrize( + "func,window_size,expected_vals", + [ + ( + "rolling", + 2, + [ + [np.nan, np.nan, np.nan, np.nan], + [15.0, 20.0, 25.0, 20.0], + [25.0, 30.0, 35.0, 30.0], + [np.nan, np.nan, np.nan, np.nan], + [20.0, 30.0, 35.0, 30.0], + [35.0, 40.0, 60.0, 40.0], + [60.0, 80.0, 85.0, 80], + ], + ), + ( + "expanding", + None, + [ + [10.0, 10.0, 20.0, 20.0], + [15.0, 20.0, 25.0, 20.0], + [20.0, 30.0, 30.0, 20.0], + [10.0, 10.0, 30.0, 30.0], + [20.0, 30.0, 35.0, 30.0], + [26.666667, 40.0, 50.0, 30.0], + [40.0, 80.0, 60.0, 30.0], + ], + ), + ], + ) + def test_multiple_agg_funcs(self, func, window_size, expected_vals): + # GH 15072 + df = pd.DataFrame( + [ + ["A", 10, 20], + ["A", 20, 30], + ["A", 30, 40], + ["B", 10, 30], + ["B", 30, 40], + ["B", 40, 80], + ["B", 80, 90], + ], + columns=["stock", "low", "high"], + ) + + f = getattr(df.groupby("stock"), func) + if window_size: + window = f(window_size) + else: + window = f() + + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)], + names=["stock", None], + ) + columns = pd.MultiIndex.from_tuples( + [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] + ) + expected = pd.DataFrame(expected_vals, index=index, columns=columns) + + result = window.agg( + OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) + ) + + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_apply.py b/venv/Lib/site-packages/pandas/tests/window/test_apply.py new file mode 100644 index 0000000..7132e64 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_apply.py @@ -0,0 +1,140 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +@pytest.mark.parametrize("bad_raw", [None, 1, 0]) +def test_rolling_apply_invalid_raw(bad_raw): + with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): + Series(range(3)).rolling(1).apply(len, raw=bad_raw) + + +def test_rolling_apply_out_of_bounds(engine_and_raw): + # gh-1850 + engine, raw = engine_and_raw + + vals = Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw) + expected = Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("window", [2, "2s"]) +def test_rolling_apply_with_pandas_objects(window): + # 5071 + df = DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = 
df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + +def test_rolling_apply(engine_and_raw): + engine, raw = engine_and_raw + + expected = Series([], dtype="float64") + result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) + expected = Series([1.0, 2.0, 2.0]) + tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + +def test_all_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = ( + DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + * 2 + ) + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, engine=engine, raw=raw) + expected = er.apply(lambda x: 1, engine=engine, raw=raw) + tm.assert_frame_equal(result, expected) + + +def test_ragged_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = DataFrame({"B": range(5)}) + df.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + +def test_invalid_engine(): + with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + +def test_invalid_engine_kwargs_cython(): + with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + +def test_invalid_raw_numba(): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + +@td.skip_if_no("numba") +def test_invalid_kwargs_nopython(): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_base_indexer.py b/venv/Lib/site-packages/pandas/tests/window/test_base_indexer.py new file mode 100644 index 0000000..606520c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_base_indexer.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.api.indexers import BaseIndexer +from pandas.core.window.indexers import ExpandingIndexer + + +def test_bad_get_window_bounds_signature(): + class BadIndexer(BaseIndexer): + def get_window_bounds(self): + return None + + indexer = BadIndexer() + with pytest.raises(ValueError, match="BadIndexer does not 
implement"): + Series(range(5)).rolling(indexer) + + +def test_expanding_indexer(): + s = Series(range(10)) + indexer = ExpandingIndexer() + result = s.rolling(indexer).mean() + expected = s.expanding().mean() + tm.assert_series_equal(result, expected) + + +def test_indexer_constructor_arg(): + # Example found in computation.rst + use_expanding = [True, False, True, False, True] + df = DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = i + 1 + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + result = df.rolling(indexer).sum() + expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]}) + tm.assert_frame_equal(result, expected) + + +def test_indexer_accepts_rolling_args(): + df = DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if center and min_periods == 1 and closed == "both" and i == 2: + start[i] = 0 + end[i] = num_values + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1) + result = df.rolling(indexer, center=True, min_periods=1, closed="both").sum() + expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + +def test_win_type_not_implemented(): + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([0, 1]), np.array([1, 2]) + + df = DataFrame({"values": range(2)}) + indexer = CustomIndexer() + with pytest.raises(NotImplementedError, match="BaseIndexer subclasses not"): + df.rolling(indexer, win_type="boxcar") diff --git a/venv/Lib/site-packages/pandas/tests/window/test_dtypes.py b/venv/Lib/site-packages/pandas/tests/window/test_dtypes.py new file mode 100644 index 0000000..b1c9b66 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_dtypes.py @@ -0,0 +1,242 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.base import DataError + +# gh-12373 : rolling functions error on float32 data +# make sure rolling functions works for different dtypes +# +# NOTE that these are yielded tests and so _create_data +# is explicitly called. 
+# +# further note that we are only checking rolling for fully dtype +# compliance (though both expanding and ewm inherit) + + +class Dtype: + window = 2 + + funcs = { + "count": lambda v: v.count(), + "max": lambda v: v.max(), + "min": lambda v: v.min(), + "sum": lambda v: v.sum(), + "mean": lambda v: v.mean(), + "std": lambda v: v.std(), + "var": lambda v: v.var(), + "median": lambda v: v.median(), + } + + def get_expects(self): + expects = { + "sr1": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), + "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), + "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), + "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + }, + "sr2": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), + "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), + "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), + "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), + "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + }, + "sr3": { + "count": Series([1, 2, 2, 1, 1], dtype="float64"), + "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), + "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), + "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), + "std": Series( + [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, dtype="float64" + ), + "var": Series([np.nan, 0.5, 0.5, np.nan, np.nan], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), + }, + "df": { + "count": DataFrame( + {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, + dtype="float64", + ), + "max": DataFrame( + {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, + dtype="float64", + ), + "min": DataFrame( + {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, + dtype="float64", + ), + "sum": DataFrame( + { + 0: Series([np.nan, 2, 6, 10, 14]), + 1: Series([np.nan, 4, 8, 12, 16]), + }, + dtype="float64", + ), + "mean": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + "std": DataFrame( + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), + }, + dtype="float64", + ), + "var": DataFrame( + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + dtype="float64", + ), + "median": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + }, + } + return expects + + def _create_dtype_data(self, dtype): + sr1 = Series(np.arange(5), dtype=dtype) + sr2 = Series(np.arange(10, 0, -2), dtype=dtype) + sr3 = sr1.copy() + sr3[3] = np.NaN + df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) + + data = {"sr1": sr1, "sr2": sr2, "sr3": sr3, "df": df} + + return data + + def _create_data(self): + self.data = self._create_dtype_data(self.dtype) + self.expects = self.get_expects() + + def test_dtypes(self): + self._create_data() + for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + + f = self.funcs[f_name] + d = 
self.data[d_name] + exp = self.expects[d_name][f_name] + self.check_dtypes(f, f_name, d, d_name, exp) + + def check_dtypes(self, f, f_name, d, d_name, exp): + roll = d.rolling(window=self.window) + result = f(roll) + tm.assert_almost_equal(result, exp) + + +class TestDtype_object(Dtype): + dtype = object + + +class Dtype_integer(Dtype): + pass + + +class TestDtype_int8(Dtype_integer): + dtype = np.int8 + + +class TestDtype_int16(Dtype_integer): + dtype = np.int16 + + +class TestDtype_int32(Dtype_integer): + dtype = np.int32 + + +class TestDtype_int64(Dtype_integer): + dtype = np.int64 + + +class Dtype_uinteger(Dtype): + pass + + +class TestDtype_uint8(Dtype_uinteger): + dtype = np.uint8 + + +class TestDtype_uint16(Dtype_uinteger): + dtype = np.uint16 + + +class TestDtype_uint32(Dtype_uinteger): + dtype = np.uint32 + + +class TestDtype_uint64(Dtype_uinteger): + dtype = np.uint64 + + +class Dtype_float(Dtype): + pass + + +class TestDtype_float16(Dtype_float): + dtype = np.float16 + + +class TestDtype_float32(Dtype_float): + dtype = np.float32 + + +class TestDtype_float64(Dtype_float): + dtype = np.float64 + + +class TestDtype_category(Dtype): + dtype = "category" + include_df = False + + def _create_dtype_data(self, dtype): + sr1 = Series(range(5), dtype=dtype) + sr2 = Series(range(10, 0, -2), dtype=dtype) + + data = {"sr1": sr1, "sr2": sr2} + + return data + + +class DatetimeLike(Dtype): + def check_dtypes(self, f, f_name, d, d_name, exp): + + roll = d.rolling(window=self.window) + if f_name == "count": + result = f(roll) + tm.assert_almost_equal(result, exp) + + else: + with pytest.raises(DataError): + f(roll) + + +class TestDtype_timedelta(DatetimeLike): + dtype = np.dtype("m8[ns]") + + +class TestDtype_datetime(DatetimeLike): + dtype = np.dtype("M8[ns]") + + +class TestDtype_datetime64UTC(DatetimeLike): + dtype = "datetime64[ns, UTC]" + + def _create_data(self): + pytest.skip( + "direct creation of extension dtype " + "datetime64[ns, UTC] is not supported ATM" + ) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_ewm.py b/venv/Lib/site-packages/pandas/tests/window/test_ewm.py new file mode 100644 index 0000000..1683fda --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_ewm.py @@ -0,0 +1,70 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall + +from pandas import DataFrame, Series +from pandas.core.window import EWM +from pandas.tests.window.common import Base + + +class TestEWM(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + o = getattr(self, which) + c = o.ewm + + # valid + c(com=0.5) + c(span=1.5) + c(alpha=0.5) + c(halflife=0.75) + c(com=0.5, span=None) + c(alpha=0.5, com=None) + c(halflife=0.75, alpha=None) + + # not valid: mutually exclusive + with pytest.raises(ValueError): + c(com=0.5, alpha=0.5) + with pytest.raises(ValueError): + c(span=1.5, halflife=0.75) + with pytest.raises(ValueError): + c(alpha=0.5, span=1.5) + + # not valid: com < 0 + with pytest.raises(ValueError): + c(com=-0.5) + + # not valid: span < 1 + with pytest.raises(ValueError): + c(span=0.5) + + # not valid: halflife <= 0 + with pytest.raises(ValueError): + c(halflife=0) + + # not valid: alpha <= 0 or alpha > 1 + for alpha in (-0.5, 1.5): + with pytest.raises(ValueError): + c(alpha=alpha) + + @pytest.mark.parametrize("method", 
["std", "mean", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + e = EWM(Series([2, 4, 6]), alpha=0.5) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(dtype=np.float64) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_expanding.py b/venv/Lib/site-packages/pandas/tests/window/test_expanding.py new file mode 100644 index 0000000..6b6367f --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_expanding.py @@ -0,0 +1,134 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.window import Expanding +from pandas.tests.window.common import Base + + +class TestExpanding(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + # GH 12669 + + o = getattr(self, which) + c = o.expanding + + # valid + c(min_periods=1) + c(min_periods=1, center=True) + c(min_periods=1, center=False) + + # not valid + for w in [2.0, "foo", np.array([2])]: + with pytest.raises(ValueError): + c(min_periods=w) + with pytest.raises(ValueError): + c(min_periods=1, center=w) + + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + e = Expanding(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(e, method)(dtype=np.float64) + + @pytest.mark.parametrize( + "expander", + [ + 1, + pytest.param( + "ls", + marks=pytest.mark.xfail( + reason="GH#16425 expanding with offset not supported" + ), + ), + ], + ) + def test_empty_df_expanding(self, expander): + # GH 15819 Verifies that datetime and integer expanding windows can be + # applied to empty DataFrames + + expected = DataFrame() + result = DataFrame().expanding(expander).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer expanding windows can be applied + # to empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).expanding(expander).sum() + tm.assert_frame_equal(result, expected) + + def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 + # minp=0 + x = pd.Series([np.nan]) + result = x.expanding(min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.expanding(min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) + def test_iter_raises(self, klass): + # https://github.com/pandas-dev/pandas/issues/11704 + # Iteration over a Window + obj = klass([1, 2, 3, 4]) + with pytest.raises(NotImplementedError): + iter(obj.expanding(2)) + + def test_expanding_axis(self, axis_frame): + # see gh-23372. 
+ df = DataFrame(np.ones((10, 20))) + axis = df._get_axis_number(axis_frame) + + if axis == 0: + expected = DataFrame( + {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} + ) + else: + # axis == 1 + expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) + + result = df.expanding(3, axis=axis_frame).sum() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_expanding_count_with_min_periods(constructor): + # GH 26996 + result = constructor(range(5)).expanding(min_periods=3).count() + expected = constructor([np.nan, np.nan, 3.0, 4.0, 5.0]) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_expanding_count_default_min_periods_with_null_values(constructor): + # GH 26996 + values = [1, 2, 3, np.nan, 4, 5, 6] + expected_counts = [1.0, 2.0, 3.0, 3.0, 4.0, 5.0, 6.0] + + result = constructor(values).expanding().count() + expected = constructor(expected_counts) + tm.assert_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_grouper.py b/venv/Lib/site-packages/pandas/tests/window/test_grouper.py new file mode 100644 index 0000000..355ef3a --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_grouper.py @@ -0,0 +1,192 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.groupby.groupby import get_groupby + + +class TestGrouperGrouping: + def setup_method(self, method): + self.series = Series(np.arange(10)) + self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) + + def test_mutated(self): + + msg = r"groupby\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): + self.frame.groupby("A", foo=1) + + g = self.frame.groupby("A") + assert not g.mutated + g = get_groupby(self.frame, by="A", mutated=True) + assert g.mutated + + def test_getitem(self): + g = self.frame.groupby("A") + g_mutated = get_groupby(self.frame, by="A", mutated=True) + + expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) + + result = g.rolling(2).mean().B + tm.assert_series_equal(result, expected) + + result = g.rolling(2).B.mean() + tm.assert_series_equal(result, expected) + + result = g.B.rolling(2).mean() + tm.assert_series_equal(result, expected) + + result = self.frame.B.groupby(self.frame.A).rolling(2).mean() + tm.assert_series_equal(result, expected) + + def test_getitem_multiple(self): + + # GH 13174 + g = self.frame.groupby("A") + r = g.rolling(2) + g_mutated = get_groupby(self.frame, by="A", mutated=True) + expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + + result = r.B.count() + tm.assert_series_equal(result, expected) + + result = r.B.count() + tm.assert_series_equal(result, expected) + + def test_rolling(self): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + tm.assert_frame_equal(result, expected) + + for f in ["std", "var"]: + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_rolling_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.rolling(window=4) + 
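# Illustrative aside (not part of the committed diff): the groupby/rolling behaviour
# asserted by the grouper tests above - windows are computed per group and the result
# is indexed by (group key, original row label). A minimal sketch, assuming current
# groupby().rolling() semantics:
import pandas as pd

df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": [0, 1, 2, 3, 4]})
out = df.groupby("A").rolling(2).B.sum()
# a window never crosses group boundaries, so the first row of each group is NaN:
# group 1 -> NaN, 1.0, 3.0 ; group 2 -> NaN, 7.0
print(out)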
result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) + tm.assert_frame_equal(result, expected) + + def test_rolling_corr_cov(self): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + for f in ["corr", "cov"]: + result = getattr(r, f)(self.frame) + + def func(x): + return getattr(x.rolling(4), f)(self.frame) + + expected = g.apply(func) + tm.assert_frame_equal(result, expected) + + result = getattr(r.B, f)(pairwise=True) + + def func(x): + return getattr(x.B.rolling(4), f)(pairwise=True) + + expected = g.apply(func) + tm.assert_series_equal(result, expected) + + def test_rolling_apply(self, raw): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + # reduction + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + tm.assert_frame_equal(result, expected) + + def test_rolling_apply_mutability(self): + # GH 14013 + df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) + g = df.groupby("A") + + mi = pd.MultiIndex.from_tuples( + [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)] + ) + + mi.names = ["A", None] + # Grouped column should not be a part of the output + expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) + + result = g.rolling(window=2).sum() + tm.assert_frame_equal(result, expected) + + # Call an arbitrary function on the groupby + g.sum() + + # Make sure nothing has been mutated + result = g.rolling(window=2).sum() + tm.assert_frame_equal(result, expected) + + def test_expanding(self): + g = self.frame.groupby("A") + r = g.expanding() + + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: + + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.expanding(), f)()) + tm.assert_frame_equal(result, expected) + + for f in ["std", "var"]: + result = getattr(r, f)(ddof=0) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_expanding_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.expanding() + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr_cov(self): + g = self.frame.groupby("A") + r = g.expanding() + + for f in ["corr", "cov"]: + result = getattr(r, f)(self.frame) + + def func(x): + return getattr(x.expanding(), f)(self.frame) + + expected = g.apply(func) + tm.assert_frame_equal(result, expected) + + result = getattr(r.B, f)(pairwise=True) + + def func(x): + return getattr(x.B.expanding(), f)(pairwise=True) + + expected = g.apply(func) + tm.assert_series_equal(result, expected) + + def test_expanding_apply(self, raw): + g = self.frame.groupby("A") + r = g.expanding() + + # reduction + result = r.apply(lambda x: x.sum(), raw=raw) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_numba.py b/venv/Lib/site-packages/pandas/tests/window/test_numba.py new file mode 100644 index 0000000..cc8aef1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_numba.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import 
pandas.util._test_decorators as td + +from pandas import Series +import pandas._testing as tm + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.filterwarnings("ignore:\\nThe keyword argument") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +class TestApply: + @pytest.mark.parametrize("jit", [True, False]) + def test_numba_vs_cython(self, jit, nogil, parallel, nopython): + def f(x, *args): + arg_sum = 0 + for arg in args: + arg_sum += arg + return np.mean(x) + arg_sum + + if jit: + import numba + + f = numba.jit(f) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + args = (2,) + + s = Series(range(10)) + result = s.rolling(2).apply( + f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("jit", [True, False]) + def test_cache(self, jit, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + def func_1(x): + return np.mean(x) + 4 + + def func_2(x): + return np.std(x) * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + roll = Series(range(10)).rolling(2) + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + + # func_1 should be in the cache now + assert func_1 in roll._numba_func_cache + + result = roll.apply( + func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_2, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + # This run should use the cached func_1 + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_pairwise.py b/venv/Lib/site-packages/pandas/tests/window/test_pairwise.py new file mode 100644 index 0000000..717273c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_pairwise.py @@ -0,0 +1,183 @@ +import warnings + +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.core.algorithms import safe_sort + + +class TestPairwise: + + # GH 7738 + df1s = [ + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), + DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), + DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), + ] + df2 = DataFrame( + [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], + columns=["Y", "Z", "X"], + ) + s = Series([1, 1, 3, 8]) + + def compare(self, result, expected): + + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = 
expected.dropna().values + + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) + def test_no_flex(self, f): + + # DataFrame methods (which do not call _flex_binary_moment()) + + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True), + ], + ) + def test_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=True + # note that we may construct the 1st level of the MI + # in a non-monotonic way, so compare accordingly + results = [] + for i, df in enumerate(self.df1s): + result = f(df) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) + ) + tm.assert_index_equal(result.columns, df.columns) + results.append(df) + + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), + ], + ) + def test_no_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=False + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), + ], + ) + def test_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=True + results = [f(df, self.df2) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) + ) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), + ], + ) + def test_no_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=False + results = [ + f(df, self.df2) if 
df.columns.is_unique else None for df in self.df1s + ] + for (df, result) in zip(self.df1s, results): + if result is not None: + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + # we can have int and str columns + expected_index = df.index.union(self.df2.index) + expected_columns = df.columns.union(self.df2.columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) + else: + with pytest.raises(ValueError, match="'arg1' columns are not unique"): + f(df, self.df2) + with pytest.raises(ValueError, match="'arg2' columns are not unique"): + f(self.df2, df) + + @pytest.mark.parametrize( + "f", + [ + lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), + ], + ) + def test_pairwise_with_series(self, f): + + # DataFrame with a Series + results = [f(df, self.s) for df in self.df1s] + [ + f(self.s, df) for df in self.df1s + ] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_rolling.py b/venv/Lib/site-packages/pandas/tests/window/test_rolling.py new file mode 100644 index 0000000..80a732c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_rolling.py @@ -0,0 +1,447 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm +from pandas.core.window import Rolling +from pandas.tests.window.common import Base + + +class TestRolling(Base): + def setup_method(self, method): + self._create_data() + + def test_doc_string(self): + + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + # GH 12669 + + o = getattr(self, which) + c = o.rolling + + # valid + c(window=2) + c(window=2, min_periods=1) + c(window=2, min_periods=1, center=True) + c(window=2, min_periods=1, center=False) + + # GH 13383 + with pytest.raises(ValueError): + c(0) + c(-1) + + # not valid + for w in [2.0, "foo", np.array([2])]: + with pytest.raises(ValueError): + c(window=w) + with pytest.raises(ValueError): + c(window=2, min_periods=w) + with pytest.raises(ValueError): + c(window=2, min_periods=1, center=w) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor_with_win_type(self, which): + # GH 13383 + o = getattr(self, which) + c = o.rolling + with pytest.raises(ValueError): + c(-1, win_type="boxcar") + + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) + def test_constructor_with_timedelta_window(self, window): + # GH 15440 + n = 10 + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) + + result = df.rolling(window=window).sum() + expected = DataFrame( + {"value": expected_data}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + 
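# Illustrative aside (not part of the committed diff): the equivalence these
# constructor tests rely on - a timedelta / offset-alias window over a regularly
# spaced DatetimeIndex behaves like a fixed integer window with min_periods=1.
# A small sketch, assuming a daily-frequency index:
import numpy as np
import pandas as pd

ts = pd.Series(np.arange(4.0), index=pd.date_range("2020-01-01", periods=4, freq="D"))
by_offset = ts.rolling("2D").sum()             # window measured in time
by_count = ts.rolling(2, min_periods=1).sum()  # window measured in rows
pd.testing.assert_series_equal(by_offset, by_count)  # identical on an evenly spaced index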
tm.assert_frame_equal(result, expected) + expected = df.rolling("3D").sum() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3), "3D"]) + def test_constructor_timedelta_window_and_minperiods(self, window, raw): + # GH 15305 + n = 10 + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) + expected = DataFrame( + {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) + result_roll_sum = df.rolling(window=window, min_periods=2).sum() + result_roll_generic = df.rolling(window=window, min_periods=2).apply( + sum, raw=raw + ) + tm.assert_frame_equal(result_roll_sum, expected) + tm.assert_frame_equal(result_roll_generic, expected) + + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) + def test_numpy_compat(self, method): + # see gh-12811 + r = Rolling(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, method)(dtype=np.float64) + + def test_closed(self): + df = DataFrame({"A": [0, 1, 2, 3, 4]}) + # closed only allowed for datetimelike + with pytest.raises(ValueError): + df.rolling(window=3, closed="neither") + + @pytest.mark.parametrize("closed", ["neither", "left"]) + def test_closed_empty(self, closed, arithmetic_win_operators): + # GH 26005 + func_name = arithmetic_win_operators + ser = pd.Series( + data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") + ) + roll = ser.rolling("1D", closed=closed) + + result = getattr(roll, func_name)() + expected = pd.Series([np.nan] * 5, index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["min", "max"]) + def test_closed_one_entry(self, func): + # GH24718 + ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) + result = getattr(ser.rolling("10D", closed="left"), func)() + tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) + + @pytest.mark.parametrize("func", ["min", "max"]) + def test_closed_one_entry_groupby(self, func): + # GH24718 + ser = pd.DataFrame( + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=pd.date_range("2000", periods=3), + ) + result = getattr( + ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func + )() + exp_idx = pd.MultiIndex.from_arrays( + arrays=[[1, 1, 2], ser.index], names=("A", None) + ) + expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_dtype", ["int", "float"]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ], + ) + def test_closed_min_max_datetime(self, input_dtype, func, closed, expected): + # see gh-21704 + ser = pd.Series( + data=np.arange(10).astype(input_dtype), + index=pd.date_range("2000", periods=10), + ) + + result = getattr(ser.rolling("3D", 
closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + def test_closed_uneven(self): + # see gh-21704 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + + # uneven + ser = ser.drop(index=ser.index[[1, 5]]) + result = ser.rolling("3D", closed="left").min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]), + ], + ) + def test_closed_min_max_minp(self, func, closed, expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + ser[ser.index[-3:]] = np.nan + result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "closed,expected", + [ + ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]), + ], + ) + def test_closed_median_quantile(self, closed, expected): + # GH 26005 + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + roll = ser.rolling("3D", closed=closed) + expected = pd.Series(expected, index=ser.index) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.5) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("roller", ["1s", 1]) + def tests_empty_df_rolling(self, roller): + # GH 15819 Verifies that datetime and integer rolling windows can be + # applied to empty DataFrames + expected = DataFrame() + result = DataFrame().rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer rolling windows can be applied to + # empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + def test_empty_window_median_quantile(self): + # GH 26005 + expected = pd.Series([np.nan, np.nan, np.nan]) + roll = pd.Series(np.arange(3)).rolling(0) + + result = roll.median() + tm.assert_series_equal(result, expected) + + result = roll.quantile(0.1) + tm.assert_series_equal(result, expected) + + def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 + # minp=0 + x = pd.Series([np.nan]) + result = x.rolling(1, min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.rolling(1, min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + + def test_missing_minp_zero_variable(self): + # https://github.com/pandas-dev/pandas/pull/18921 + x = pd.Series( + [np.nan] * 4, + index=pd.DatetimeIndex( + ["2017-01-01", 
"2017-01-04", "2017-01-06", "2017-01-07"] + ), + ) + result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() + expected = pd.Series(0.0, index=x.index) + tm.assert_series_equal(result, expected) + + def test_multi_index_names(self): + + # GH 16789, 16825 + cols = pd.MultiIndex.from_product( + [["A", "B"], ["C", "D", "E"]], names=["1", "2"] + ) + df = DataFrame(np.ones((10, 6)), columns=cols) + result = df.rolling(3).cov() + + tm.assert_index_equal(result.columns, df.columns) + assert result.index.names == [None, "1", "2"] + + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) + def test_iter_raises(self, klass): + # https://github.com/pandas-dev/pandas/issues/11704 + # Iteration over a Window + obj = klass([1, 2, 3, 4]) + with pytest.raises(NotImplementedError): + iter(obj.rolling(2)) + + def test_rolling_axis_sum(self, axis_frame): + # see gh-23372. + df = DataFrame(np.ones((10, 20))) + axis = df._get_axis_number(axis_frame) + + if axis == 0: + expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) + else: + # axis == 1 + expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) + + result = df.rolling(3, axis=axis_frame).sum() + tm.assert_frame_equal(result, expected) + + def test_rolling_axis_count(self, axis_frame): + # see gh-26055 + df = DataFrame({"x": range(3), "y": range(3)}) + + axis = df._get_axis_number(axis_frame) + + if axis in [0, "index"]: + expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) + else: + expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) + + result = df.rolling(2, axis=axis_frame, min_periods=0).count() + tm.assert_frame_equal(result, expected) + + def test_readonly_array(self): + # GH-27766 + arr = np.array([1, 3, np.nan, 3, 5]) + arr.setflags(write=False) + result = pd.Series(arr).rolling(2).mean() + expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) + tm.assert_series_equal(result, expected) + + def test_rolling_datetime(self, axis_frame, tz_naive_fixture): + # GH-28192 + tz = tz_naive_fixture + df = pd.DataFrame( + { + i: [1] * 2 + for i in pd.date_range("2019-8-01", "2019-08-03", freq="D", tz=tz) + } + ) + if axis_frame in [0, "index"]: + result = df.T.rolling("2D", axis=axis_frame).sum().T + else: + result = df.rolling("2D", axis=axis_frame).sum() + expected = pd.DataFrame( + { + **{ + i: [1.0] * 2 + for i in pd.date_range("2019-8-01", periods=1, freq="D", tz=tz) + }, + **{ + i: [2.0] * 2 + for i in pd.date_range("2019-8-02", "2019-8-03", freq="D", tz=tz) + }, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_rolling_window_as_string(): + # see gh-22590 + date_today = datetime.now() + days = pd.date_range(date_today, date_today + timedelta(365), freq="D") + + npr = np.random.RandomState(seed=421) + + data = npr.randint(1, high=100, size=len(days)) + df = DataFrame({"DateCol": days, "metric": data}) + + df.set_index("DateCol", inplace=True) + result = df.rolling(window="21D", min_periods=2, closed="left")["metric"].agg("max") + + expData = ( + [np.nan] * 2 + + [88.0] * 16 + + [97.0] * 9 + + [98.0] + + [99.0] * 21 + + [95.0] * 16 + + [93.0] * 5 + + [89.0] * 5 + + [96.0] * 21 + + [94.0] * 14 + + [90.0] * 13 + + [88.0] * 2 + + [90.0] * 9 + + [96.0] * 21 + + [95.0] * 6 + + [91.0] + + [87.0] * 6 + + [92.0] * 21 + + [83.0] * 2 + + [86.0] * 10 + + [87.0] * 5 + + [98.0] * 21 + + [97.0] * 14 + + [93.0] * 7 + + [87.0] * 4 + + [86.0] * 4 + + [95.0] * 21 + + [85.0] * 14 + + [83.0] * 2 + + [76.0] * 5 + + [81.0] * 2 + + [98.0] * 21 + + [95.0] * 14 + + [91.0] * 7 + + [86.0] + + [93.0] * 3 + + 
[95.0] * 20 + ) + + expected = Series(expData, index=Index(days, name="DateCol"), name="metric") + tm.assert_series_equal(result, expected) + + +def test_min_periods1(): + # GH#6795 + df = pd.DataFrame([0, 1, 2, 1, 0], columns=["a"]) + result = df["a"].rolling(3, center=True, min_periods=1).max() + expected = pd.Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_rolling_count_with_min_periods(constructor): + # GH 26996 + result = constructor(range(5)).rolling(3, min_periods=3).count() + expected = constructor([np.nan, np.nan, 3.0, 3.0, 3.0]) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_rolling_count_default_min_periods_with_null_values(constructor): + # GH 26996 + values = [1, 2, 3, np.nan, 4, 5, 6] + expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0] + + result = constructor(values).rolling(3).count() + expected = constructor(expected_counts) + tm.assert_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_timeseries_window.py b/venv/Lib/site-packages/pandas/tests/window/test_timeseries_window.py new file mode 100644 index 0000000..5f5e10b --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_timeseries_window.py @@ -0,0 +1,740 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + to_datetime, +) +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +class TestRollingTS: + + # rolling time-series friendly + # xref GH13327 + + def setup_method(self, method): + + self.regular = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + + self.ragged = DataFrame({"B": range(5)}) + self.ragged.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + def test_doc_string(self): + + df = DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=[ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ], + ) + df + df.rolling("2s").sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with pytest.raises(ValueError): + df.rolling(window="foobar") + + # not a datetimelike index + with pytest.raises(ValueError): + df.reset_index().rolling(window="foobar") + + # non-fixed freqs + for freq in ["2MS", offsets.MonthBegin(2)]: + with pytest.raises(ValueError): + df.rolling(window=freq) + + for freq in ["1D", offsets.Day(2), "2ms"]: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, "foo", np.array([1, 2, 3])]: + with pytest.raises(ValueError): + df.rolling(window="1D", min_periods=minp) + + # center is not implemented + with pytest.raises(NotImplementedError): + df.rolling(window="1D", center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with pytest.raises(ValueError): + df.rolling(window="2s", on="foobar") + + # column is valid + df = df.copy() + df["C"] = date_range("20130101", periods=len(df)) + df.rolling(window="2d", on="C").sum() + + # invalid columns + with pytest.raises(ValueError): + df.rolling(window="2d", on="B") + + # ok even though on non-selected + df.rolling(window="2d", on="C").B.sum() + + def 
test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) + + assert df.A.is_monotonic + df.rolling("2s", on="A").sum() + + df = df.set_index("A") + assert df.index.is_monotonic + df.rolling("2s").sum() + + def test_non_monotonic_on(self): + # GH 19248 + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) + df = df.set_index("A") + non_monotonic_index = df.index.to_list() + non_monotonic_index[0] = non_monotonic_index[3] + df.index = non_monotonic_index + + assert not df.index.is_monotonic + + with pytest.raises(ValueError): + df.rolling("2s").sum() + + df = df.reset_index() + with pytest.raises(ValueError): + df.rolling("2s", on="A").sum() + + def test_frame_on(self): + + df = DataFrame( + {"B": range(5), "C": date_range("20130101 09:00:00", periods=5, freq="3s")} + ) + + df["A"] = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + # we are doing simulating using 'on' + expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True) + + result = df.rolling("2s", on="A").B.sum() + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and resetting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = ( + df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]] + ) + + result = df.rolling("2s", on="A")[["B"]].sum() + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame( + { + "A": [0, 1, 2, 3, 4], + "B": [0, 1, 2, np.nan, 4], + "C": Index( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + ), + }, + columns=["A", "C", "B"], + ) + + expected1 = DataFrame( + {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]}, + columns=["A", "C", "B"], + ) + + result = df.rolling("2s", on="C").sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name="B") + result = df.rolling("2s", on="C").B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[["A", "B", "C"]] + result = df.rolling("2s", on="C")[["A", "B", "C"]].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = date_range("20130101", periods=5, freq="D") + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="1D").sum() + tm.assert_frame_equal(result, expected) + + df.index = date_range("20130101", periods=5, freq="2D") + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window="2D").sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, 
min_periods=1).sum() + result = df.rolling("2s").sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling("2s", min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_closed(self): + + # xref GH13965 + + df = DataFrame( + {"A": [1] * 5}, + index=[ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:04"), + Timestamp("20130101 09:00:06"), + ], + ) + + # closed must be 'right', 'left', 'both', 'neither' + with pytest.raises(ValueError): + self.regular.rolling(window="2s", closed="blabla") + + expected = df.copy() + expected["A"] = [1.0, 2, 2, 2, 1] + result = df.rolling("2s", closed="right").sum() + tm.assert_frame_equal(result, expected) + + # default should be 'right' + result = df.rolling("2s").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [1.0, 2, 3, 3, 2] + result = df.rolling("2s", closed="both").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [np.nan, 1.0, 2, 2, 1] + result = df.rolling("2s", closed="left").sum() + tm.assert_frame_equal(result, expected) + + expected = df.copy() + expected["A"] = [np.nan, 1.0, 1, 1, np.nan] + result = df.rolling("2s", closed="neither").sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=2).sum() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s").sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="4s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="4s", min_periods=3).sum() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).sum() + expected = df.copy() + expected["B"] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).mean() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).mean() + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).median() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).median() + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = 
df.rolling(window="1s", min_periods=1).quantile(0.5) + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).quantile(0.5) + expected = df.copy() + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).std(ddof=0) + expected = df.copy() + expected["B"] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="1s", min_periods=1).std(ddof=1) + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).std(ddof=0) + expected = df.copy() + expected["B"] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).std(ddof=1) + expected = df.copy() + expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).var(ddof=0) + expected = df.copy() + expected["B"] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="1s", min_periods=1).var(ddof=1) + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="3s", min_periods=1).var(ddof=0) + expected = df.copy() + expected["B"] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).var(ddof=1) + expected = df.copy() + expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window="3s", min_periods=1).skew() + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).skew() + expected = df.copy() + expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window="3s", min_periods=1).kurt() + expected = df.copy() + expected["B"] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).kurt() + expected = df.copy() + expected["B"] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window="1s", min_periods=1).count() + expected = df.copy() + expected["B"] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window="1s").count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).count() + expected = df.copy() + expected["B"] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=2).count() + expected = df.copy() + expected["B"] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": [0.0, 1, 2, 3, 4]} + ).set_index("A") + result = df.rolling("1s").min() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]} + 
).set_index("A") + + tm.assert_frame_equal(result, expected) + result = df.rolling("2s").min() + expected = df.copy() + expected["B"] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling("5s").min() + expected = df.copy() + expected["B"] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = df.rolling(window="1s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).min() + expected = df.copy() + expected["B"] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame( + {"B": np.random.randn(N)}, index=date_range("20130101", periods=N, freq="s") + ) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling("2s").min() + assert ((result - expected) < 0.01).all().bool() + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling("200s").min() + assert ((result - expected) < 0.01).all().bool() + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window="1s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).max() + expected = df.copy() + expected["B"] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "freq, op, result_data", + [ + ("ms", "min", [0.0] * 10), + ("ms", "mean", [0.0] * 9 + [2.0 / 9]), + ("ms", "max", [0.0] * 9 + [2.0]), + ("s", "min", [0.0] * 10), + ("s", "mean", [0.0] * 9 + [2.0 / 9]), + ("s", "max", [0.0] * 9 + [2.0]), + ("min", "min", [0.0] * 10), + ("min", "mean", [0.0] * 9 + [2.0 / 9]), + ("min", "max", [0.0] * 9 + [2.0]), + ("h", "min", [0.0] * 10), + ("h", "mean", [0.0] * 9 + [2.0 / 9]), + ("h", "max", [0.0] * 9 + [2.0]), + ("D", "min", [0.0] * 10), + ("D", "mean", [0.0] * 9 + [2.0 / 9]), + ("D", "max", [0.0] * 9 + [2.0]), + ], + ) + def test_freqs_ops(self, freq, op, result_data): + # GH 21096 + index = date_range(start="2018-1-1 01:00:00", freq=f"1{freq}", periods=10) + s = Series(data=0, index=index) + s.iloc[1] = np.nan + s.iloc[-1] = 2 + result = getattr(s.rolling(window=f"10{freq}"), op)() + expected = Series(data=result_data, index=index) + + tm.assert_series_equal(result, expected) + + def test_all(self): + + # simple comparison of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window="1s") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparison of integer vs. 
+ # time-based windowing + df = DataFrame( + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window="5H") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + + expected = ( + df.groupby(df.index.day) + .apply(agg_by_day) + .reset_index(level=0, drop=True) + ) + + tm.assert_frame_equal(result, expected) + + def test_groupby_monotonic(self): + + # GH 15130 + # we don't need to validate monotonicity when grouping + + data = [ + ["David", "1/1/2015", 100], + ["David", "1/5/2015", 500], + ["David", "5/30/2015", 50], + ["David", "7/25/2015", 50], + ["Ryan", "1/4/2014", 100], + ["Ryan", "1/19/2015", 500], + ["Ryan", "3/31/2016", 50], + ["Joe", "7/1/2015", 100], + ["Joe", "9/9/2015", 500], + ["Joe", "10/15/2015", 50], + ] + + df = DataFrame(data=data, columns=["name", "date", "amount"]) + df["date"] = to_datetime(df["date"]) + + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() + tm.assert_series_equal(result, expected) + + def test_non_monotonic(self): + # GH 13966 (similar to #15130, closed by #15175) + + dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") + df = DataFrame( + { + "A": [1] * 20 + [2] * 12 + [3] * 8, + "B": np.concatenate((dates, dates)), + "C": np.arange(40), + } + ) + + result = df.groupby("A").rolling("4s", on="B").C.mean() + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) + tm.assert_series_equal(result, expected) + + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() + tm.assert_series_equal(result, expected) + + def test_rolling_cov_offset(self): + # GH16058 + + idx = date_range("2017-01-01", periods=24, freq="1h") + ss = Series(np.arange(len(idx)), index=idx) + + result = ss.rolling("2h").cov() + expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) + tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(2, min_periods=1).cov() + tm.assert_series_equal(result, expected2) + + result = ss.rolling("3h").cov() + expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) + tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(3, min_periods=1).cov() + tm.assert_series_equal(result, expected2) + + def test_rolling_on_decreasing_index(self): + # GH-19248 + index = [ + Timestamp("20190101 09:00:00"), + Timestamp("20190101 09:00:02"), + Timestamp("20190101 09:00:03"), + Timestamp("20190101 09:00:05"), + Timestamp("20190101 09:00:06"), + ] + + df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) + result = df.rolling("2s").min() + expected = DataFrame( + {"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index) + ) + tm.assert_frame_equal(result, expected) + + def test_rolling_on_multi_index_level(self): + # GH-15584 + df = DataFrame( + {"column": range(6)}, + index=MultiIndex.from_product( + [date_range("20190101", periods=3), range(2)], names=["date", "seq"] + ), + ) + result = df.rolling("10d", 
on=df.index.get_level_values("date")).sum() + expected = DataFrame( + {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index + ) + tm.assert_frame_equal(result, expected) diff --git a/venv/Lib/site-packages/pandas/tests/window/test_window.py b/venv/Lib/site-packages/pandas/tests/window/test_window.py new file mode 100644 index 0000000..cc29ab4 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tests/window/test_window.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import Series +from pandas.core.window import Window +from pandas.tests.window.common import Base + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestWindow(Base): + def setup_method(self, method): + self._create_data() + + @td.skip_if_no_scipy + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor(self, which): + # GH 12669 + + o = getattr(self, which) + c = o.rolling + + # valid + c(win_type="boxcar", window=2, min_periods=1) + c(win_type="boxcar", window=2, min_periods=1, center=True) + c(win_type="boxcar", window=2, min_periods=1, center=False) + + # not valid + for w in [2.0, "foo", np.array([2])]: + with pytest.raises(ValueError): + c(win_type="boxcar", window=2, min_periods=w) + with pytest.raises(ValueError): + c(win_type="boxcar", window=2, min_periods=1, center=w) + + for wt in ["foobar", 1]: + with pytest.raises(ValueError): + c(win_type=wt, window=2) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("which", ["series", "frame"]) + def test_constructor_with_win_type(self, which, win_types): + # GH 12669 + o = getattr(self, which) + c = o.rolling + c(win_type=win_types, window=2) + + @pytest.mark.parametrize("method", ["sum", "mean"]) + def test_numpy_compat(self, method): + # see gh-12811 + w = Window(Series([2, 4, 6]), window=[0, 2]) + + msg = "numpy operations are not valid with window objects" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(w, method)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(w, method)(dtype=np.float64) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("arg", ["median", "kurt", "skew"]) + def test_agg_function_support(self, arg): + df = pd.DataFrame({"A": np.arange(5)}) + roll = df.rolling(2, win_type="triang") + + msg = f"'{arg}' is not a valid function for 'Window' object" + with pytest.raises(AttributeError, match=msg): + roll.agg(arg) + + with pytest.raises(AttributeError, match=msg): + roll.agg([arg]) + + with pytest.raises(AttributeError, match=msg): + roll.agg({"A": arg}) diff --git a/venv/Lib/site-packages/pandas/tseries/__init__.py b/venv/Lib/site-packages/pandas/tseries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/venv/Lib/site-packages/pandas/tseries/api.py b/venv/Lib/site-packages/pandas/tseries/api.py new file mode 100644 index 0000000..2094791 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tseries/api.py @@ -0,0 +1,8 @@ +""" +Timeseries API +""" + +# flake8: noqa + +from pandas.tseries.frequencies import infer_freq +import pandas.tseries.offsets as offsets diff --git a/venv/Lib/site-packages/pandas/tseries/frequencies.py b/venv/Lib/site-packages/pandas/tseries/frequencies.py new file mode 100644 index 0000000..e2d007c --- /dev/null +++ b/venv/Lib/site-packages/pandas/tseries/frequencies.py @@ -0,0 +1,538 @@ +from datetime import timedelta +import re +from typing import Dict, Optional +import 
warnings + +import numpy as np +from pytz import AmbiguousTimeError + +from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import Timedelta, Timestamp +from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday +from pandas._libs.tslibs.fields import build_field_sarray +import pandas._libs.tslibs.frequencies as libfreqs +from pandas._libs.tslibs.offsets import _offset_to_period_map +import pandas._libs.tslibs.resolution as libresolution +from pandas._libs.tslibs.resolution import Resolution +from pandas._libs.tslibs.timezones import UTC +from pandas._libs.tslibs.tzconversion import tz_convert +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_period_arraylike, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.algorithms import unique + +from pandas.tseries.offsets import ( + DateOffset, + Day, + Hour, + Micro, + Milli, + Minute, + Nano, + Second, + prefix_mapping, +) + +_ONE_MICRO = 1000 +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR + +# --------------------------------------------------------------------- +# Offset names ("time rules") and related functions + +#: cache of previously seen offsets +_offset_map: Dict[str, DateOffset] = {} + + +def get_period_alias(offset_str: str) -> Optional[str]: + """ + Alias to closest period strings BQ->Q etc. + """ + return _offset_to_period_map.get(offset_str, None) + + +_name_to_offset_map = { + "days": Day(1), + "hours": Hour(1), + "minutes": Minute(1), + "seconds": Second(1), + "milliseconds": Milli(1), + "microseconds": Micro(1), + "nanoseconds": Nano(1), +} + + +def to_offset(freq) -> Optional[DateOffset]: + """ + Return DateOffset object from string or tuple representation + or datetime.timedelta object. + + Parameters + ---------- + freq : str, tuple, datetime.timedelta, DateOffset or None + + Returns + ------- + DateOffset + None if freq is None. 
+ + Raises + ------ + ValueError + If freq is an invalid frequency + + See Also + -------- + DateOffset + + Examples + -------- + >>> to_offset('5min') + <5 * Minutes> + + >>> to_offset('1D1H') + <25 * Hours> + + >>> to_offset(('W', 2)) + <2 * Weeks: weekday=6> + + >>> to_offset((2, 'B')) + <2 * BusinessDays> + + >>> to_offset(datetime.timedelta(days=1)) + <Day> + + >>> to_offset(Hour()) + <Hour> + """ + if freq is None: + return None + + if isinstance(freq, DateOffset): + return freq + + if isinstance(freq, tuple): + name = freq[0] + stride = freq[1] + if isinstance(stride, str): + name, stride = stride, name + name, _ = libfreqs._base_and_stride(name) + delta = _get_offset(name) * stride + + elif isinstance(freq, timedelta): + delta = None + freq = Timedelta(freq) + try: + for name in freq.components._fields: + offset = _name_to_offset_map[name] + stride = getattr(freq.components, name) + if stride != 0: + offset = stride * offset + if delta is None: + delta = offset + else: + delta = delta + offset + except ValueError: + raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq)) + + else: + delta = None + stride_sign = None + try: + splitted = re.split(libfreqs.opattern, freq) + if splitted[-1] != "" and not splitted[-1].isspace(): + # the last element must be blank + raise ValueError("last element must be blank") + for sep, stride, name in zip( + splitted[0::4], splitted[1::4], splitted[2::4] + ): + if sep != "" and not sep.isspace(): + raise ValueError("separator must be spaces") + prefix = libfreqs._lite_rule_alias.get(name) or name + if stride_sign is None: + stride_sign = -1 if stride.startswith("-") else 1 + if not stride: + stride = 1 + if prefix in Resolution._reso_str_bump_map.keys(): + stride, name = Resolution.get_stride_from_decimal( + float(stride), prefix + ) + stride = int(stride) + offset = _get_offset(name) + offset = offset * int(np.fabs(stride) * stride_sign) + if delta is None: + delta = offset + else: + delta = delta + offset + except (ValueError, TypeError): + raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq)) + + if delta is None: + raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq)) + + return delta + + +def get_offset(name: str) -> DateOffset: + """ + Return DateOffset object associated with rule name. + + .. deprecated:: 1.0.0 + + Examples + -------- + get_offset('EOM') --> BMonthEnd(1) + """ + warnings.warn( + "get_offset is deprecated and will be removed in a future version, " + "use to_offset instead", + FutureWarning, + stacklevel=2, + ) + return _get_offset(name) + + +def _get_offset(name: str) -> DateOffset: + """ + Return DateOffset object associated with rule name. 
+ + Examples + -------- + _get_offset('EOM') --> BMonthEnd(1) + """ + if name not in libfreqs._dont_uppercase: + name = name.upper() + name = libfreqs._lite_rule_alias.get(name, name) + name = libfreqs._lite_rule_alias.get(name.lower(), name) + else: + name = libfreqs._lite_rule_alias.get(name, name) + + if name not in _offset_map: + try: + split = name.split("-") + klass = prefix_mapping[split[0]] + # handles case where there's no suffix (and will TypeError if too + # many '-') + offset = klass._from_name(*split[1:]) + except (ValueError, TypeError, KeyError): + # bad prefix or suffix + raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name)) + # cache + _offset_map[name] = offset + + return _offset_map[name] + + +# --------------------------------------------------------------------- +# Period codes + + +def infer_freq(index, warn: bool = True) -> Optional[str]: + """ + Infer the most likely frequency given the input index. If the frequency is + uncertain, a warning will be printed. + + Parameters + ---------- + index : DatetimeIndex or TimedeltaIndex + If passed a Series will use the values of the series (NOT THE INDEX). + warn : bool, default True + + Returns + ------- + str or None + None if no discernible frequency + TypeError if the index is not datetime-like + ValueError if there are less than three values. + """ + import pandas as pd + + if isinstance(index, ABCSeries): + values = index._values + if not ( + is_datetime64_dtype(values) + or is_timedelta64_dtype(values) + or values.dtype == object + ): + raise TypeError( + "cannot infer freq from a non-convertible dtype " + f"on a Series of {index.dtype}" + ) + index = values + + inferer: _FrequencyInferer + if is_period_arraylike(index): + raise TypeError( + "PeriodIndex given. Check the `freq` attribute " + "instead of using infer_freq." 
+ ) + elif is_timedelta64_dtype(index): + # Allow TimedeltaIndex and TimedeltaArray + inferer = _TimedeltaFrequencyInferer(index, warn=warn) + return inferer.get_freq() + + if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): + if isinstance(index, (pd.Int64Index, pd.Float64Index)): + raise TypeError( + f"cannot infer freq from a non-convertible index type {type(index)}" + ) + index = index.values + + if not isinstance(index, pd.DatetimeIndex): + try: + index = pd.DatetimeIndex(index) + except AmbiguousTimeError: + index = pd.DatetimeIndex(index.asi8) + + inferer = _FrequencyInferer(index, warn=warn) + return inferer.get_freq() + + +class _FrequencyInferer: + """ + Not sure if I can avoid the state machine here + """ + + def __init__(self, index, warn: bool = True): + self.index = index + self.values = index.asi8 + + # This moves the values, which are implicitly in UTC, to the + # the timezone so they are in local time + if hasattr(index, "tz"): + if index.tz is not None: + self.values = tz_convert(self.values, UTC, index.tz) + + self.warn = warn + + if len(index) < 3: + raise ValueError("Need at least 3 dates to infer frequency") + + self.is_monotonic = ( + self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing + ) + + @cache_readonly + def deltas(self): + return unique_deltas(self.values) + + @cache_readonly + def deltas_asi8(self): + return unique_deltas(self.index.asi8) + + @cache_readonly + def is_unique(self) -> bool: + return len(self.deltas) == 1 + + @cache_readonly + def is_unique_asi8(self): + return len(self.deltas_asi8) == 1 + + def get_freq(self) -> Optional[str]: + """ + Find the appropriate frequency string to describe the inferred + frequency of self.values + + Returns + ------- + str or None + """ + if not self.is_monotonic or not self.index._is_unique: + return None + + delta = self.deltas[0] + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + + # Business hourly, maybe. 17: one day / 65: one weekend + if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): + return "BH" + # Possibly intraday frequency. Here we use the + # original .asi8 values as the modified values + # will not work around DST transitions. 
See #8772 + elif not self.is_unique_asi8: + return None + + delta = self.deltas_asi8[0] + if _is_multiple(delta, _ONE_HOUR): + # Hours + return _maybe_add_count("H", delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + # Minutes + return _maybe_add_count("T", delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + # Seconds + return _maybe_add_count("S", delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + # Milliseconds + return _maybe_add_count("L", delta / _ONE_MILLI) + elif _is_multiple(delta, _ONE_MICRO): + # Microseconds + return _maybe_add_count("U", delta / _ONE_MICRO) + else: + # Nanoseconds + return _maybe_add_count("N", delta) + + @cache_readonly + def day_deltas(self): + return [x / _ONE_DAY for x in self.deltas] + + @cache_readonly + def hour_deltas(self): + return [x / _ONE_HOUR for x in self.deltas] + + @cache_readonly + def fields(self): + return build_field_sarray(self.values) + + @cache_readonly + def rep_stamp(self): + return Timestamp(self.values[0]) + + def month_position_check(self): + return libresolution.month_position_check(self.fields, self.index.dayofweek) + + @cache_readonly + def mdiffs(self): + nmonths = self.fields["Y"] * 12 + self.fields["M"] + return unique_deltas(nmonths.astype("i8")) + + @cache_readonly + def ydiffs(self): + return unique_deltas(self.fields["Y"].astype("i8")) + + def _infer_daily_rule(self) -> Optional[str]: + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.ydiffs[0] + month = MONTH_ALIASES[self.rep_stamp.month] + alias = f"{annual_rule}-{month}" + return _maybe_add_count(alias, nyears) + + quarterly_rule = self._get_quarterly_rule() + if quarterly_rule: + nquarters = self.mdiffs[0] / 3 + mod_dict = {0: 12, 2: 11, 1: 10} + month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] + alias = f"{quarterly_rule}-{month}" + return _maybe_add_count(alias, nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return _maybe_add_count(monthly_rule, self.mdiffs[0]) + + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + day = int_to_weekday[self.rep_stamp.weekday()] + return _maybe_add_count(f"W-{day}", days / 7) + else: + return _maybe_add_count("D", days) + + if self._is_business_daily(): + return "B" + + wom_rule = self._get_wom_rule() + if wom_rule: + return wom_rule + + return None + + def _get_annual_rule(self) -> Optional[str]: + if len(self.ydiffs) > 1: + return None + + if len(unique(self.fields["M"])) > 1: + return None + + pos_check = self.month_position_check() + return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) + + def _get_quarterly_rule(self) -> Optional[str]: + if len(self.mdiffs) > 1: + return None + + if not self.mdiffs[0] % 3 == 0: + return None + + pos_check = self.month_position_check() + return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check) + + def _get_monthly_rule(self) -> Optional[str]: + if len(self.mdiffs) > 1: + return None + pos_check = self.month_position_check() + return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) + + def _is_business_daily(self) -> bool: + # quick check: cannot be business daily + if self.day_deltas != [1, 3]: + return False + + # probably business daily, but need to confirm + first_weekday = self.index[0].weekday() + shifts = np.diff(self.index.asi8) + shifts = np.floor_divide(shifts, _ONE_DAY) + weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) + return np.all( + ((weekdays == 0) & (shifts == 3)) + | ((weekdays > 0) & (weekdays 
<= 4) & (shifts == 1)) + ) + + def _get_wom_rule(self) -> Optional[str]: + # wdiffs = unique(np.diff(self.index.week)) + # We also need -47, -49, -48 to catch index spanning year boundary + # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): + # return None + + weekdays = unique(self.index.weekday) + if len(weekdays) > 1: + return None + + week_of_months = unique((self.index.day - 1) // 7) + # Only attempt to infer up to WOM-4. See #9425 + week_of_months = week_of_months[week_of_months < 4] + if len(week_of_months) == 0 or len(week_of_months) > 1: + return None + + # get which week + week = week_of_months[0] + 1 + wd = int_to_weekday[weekdays[0]] + + return f"WOM-{week}{wd}" + + +class _TimedeltaFrequencyInferer(_FrequencyInferer): + def _infer_daily_rule(self): + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + wd = int_to_weekday[self.rep_stamp.weekday()] + alias = f"W-{wd}" + return _maybe_add_count(alias, days / 7) + else: + return _maybe_add_count("D", days) + + +def _is_multiple(us, mult: int) -> bool: + return us % mult == 0 + + +def _maybe_add_count(base: str, count: float) -> str: + if count != 1: + assert count == int(count) + count = int(count) + return f"{count}{base}" + else: + return base diff --git a/venv/Lib/site-packages/pandas/tseries/holiday.py b/venv/Lib/site-packages/pandas/tseries/holiday.py new file mode 100644 index 0000000..62d7c26 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tseries/holiday.py @@ -0,0 +1,534 @@ +from datetime import datetime, timedelta +from typing import List +import warnings + +from dateutil.relativedelta import FR, MO, SA, SU, TH, TU, WE # noqa +import numpy as np + +from pandas.errors import PerformanceWarning + +from pandas import DateOffset, Series, Timestamp, date_range + +from pandas.tseries.offsets import Day, Easter + + +def next_monday(dt): + """ + If holiday falls on Saturday, use following Monday instead; + if holiday falls on Sunday, use Monday instead + """ + if dt.weekday() == 5: + return dt + timedelta(2) + elif dt.weekday() == 6: + return dt + timedelta(1) + return dt + + +def next_monday_or_tuesday(dt): + """ + For second holiday of two adjacent ones! + If holiday falls on Saturday, use following Monday instead; + if holiday falls on Sunday or Monday, use following Tuesday instead + (because Monday is already taken by adjacent holiday on the day before) + """ + dow = dt.weekday() + if dow == 5 or dow == 6: + return dt + timedelta(2) + elif dow == 0: + return dt + timedelta(1) + return dt + + +def previous_friday(dt): + """ + If holiday falls on Saturday or Sunday, use previous Friday instead. + """ + if dt.weekday() == 5: + return dt - timedelta(1) + elif dt.weekday() == 6: + return dt - timedelta(2) + return dt + + +def sunday_to_monday(dt): + """ + If holiday falls on Sunday, use day thereafter (Monday) instead. + """ + if dt.weekday() == 6: + return dt + timedelta(1) + return dt + + +def weekend_to_monday(dt): + """ + If holiday falls on Sunday or Saturday, + use day thereafter (Monday) instead. + Needed for holidays such as Christmas observation in Europe + """ + if dt.weekday() == 6: + return dt + timedelta(1) + elif dt.weekday() == 5: + return dt + timedelta(2) + return dt + + +def nearest_workday(dt): + """ + If holiday falls on Saturday, use day before (Friday) instead; + if holiday falls on Sunday, use day thereafter (Monday) instead. 
+ """ + if dt.weekday() == 5: + return dt - timedelta(1) + elif dt.weekday() == 6: + return dt + timedelta(1) + return dt + + +def next_workday(dt): + """ + returns next weekday used for observances + """ + dt += timedelta(days=1) + while dt.weekday() > 4: + # Mon-Fri are 0-4 + dt += timedelta(days=1) + return dt + + +def previous_workday(dt): + """ + returns previous weekday used for observances + """ + dt -= timedelta(days=1) + while dt.weekday() > 4: + # Mon-Fri are 0-4 + dt -= timedelta(days=1) + return dt + + +def before_nearest_workday(dt): + """ + returns previous workday after nearest workday + """ + return previous_workday(nearest_workday(dt)) + + +def after_nearest_workday(dt): + """ + returns next workday after nearest workday + needed for Boxing day or multiple holidays in a series + """ + return next_workday(nearest_workday(dt)) + + +class Holiday: + """ + Class that defines a holiday with start/end dates and rules + for observance. + """ + + def __init__( + self, + name, + year=None, + month=None, + day=None, + offset=None, + observance=None, + start_date=None, + end_date=None, + days_of_week=None, + ): + """ + Parameters + ---------- + name : str + Name of the holiday , defaults to class name + offset : array of pandas.tseries.offsets or + class from pandas.tseries.offsets + computes offset from date + observance: function + computes when holiday is given a pandas Timestamp + days_of_week: + provide a tuple of days e.g (0,1,2,3,) for Monday Through Thursday + Monday=0,..,Sunday=6 + + Examples + -------- + >>> from pandas.tseries.holiday import Holiday, nearest_workday + >>> from dateutil.relativedelta import MO + >>> USMemorialDay = Holiday('Memorial Day', month=5, day=31, + offset=pd.DateOffset(weekday=MO(-1))) + >>> USLaborDay = Holiday('Labor Day', month=9, day=1, + offset=pd.DateOffset(weekday=MO(1))) + >>> July3rd = Holiday('July 3rd', month=7, day=3,) + >>> NewYears = Holiday('New Years Day', month=1, day=1, + observance=nearest_workday), + >>> July3rd = Holiday('July 3rd', month=7, day=3, + days_of_week=(0, 1, 2, 3)) + """ + if offset is not None and observance is not None: + raise NotImplementedError("Cannot use both offset and observance.") + + self.name = name + self.year = year + self.month = month + self.day = day + self.offset = offset + self.start_date = ( + Timestamp(start_date) if start_date is not None else start_date + ) + self.end_date = Timestamp(end_date) if end_date is not None else end_date + self.observance = observance + assert days_of_week is None or type(days_of_week) == tuple + self.days_of_week = days_of_week + + def __repr__(self) -> str: + info = "" + if self.year is not None: + info += f"year={self.year}, " + info += f"month={self.month}, day={self.day}, " + + if self.offset is not None: + info += f"offset={self.offset}" + + if self.observance is not None: + info += f"observance={self.observance}" + + repr = f"Holiday: {self.name} ({info})" + return repr + + def dates(self, start_date, end_date, return_name=False): + """ + Calculate holidays observed between start date and end date + + Parameters + ---------- + start_date : starting date, datetime-like, optional + end_date : ending date, datetime-like, optional + return_name : bool, optional, default=False + If True, return a series that has dates and holiday names. + False will only return dates. 
+ """ + start_date = Timestamp(start_date) + end_date = Timestamp(end_date) + + filter_start_date = start_date + filter_end_date = end_date + + if self.year is not None: + dt = Timestamp(datetime(self.year, self.month, self.day)) + if return_name: + return Series(self.name, index=[dt]) + else: + return [dt] + + dates = self._reference_dates(start_date, end_date) + holiday_dates = self._apply_rule(dates) + if self.days_of_week is not None: + holiday_dates = holiday_dates[ + np.in1d(holiday_dates.dayofweek, self.days_of_week) + ] + + if self.start_date is not None: + filter_start_date = max( + self.start_date.tz_localize(filter_start_date.tz), filter_start_date + ) + if self.end_date is not None: + filter_end_date = min( + self.end_date.tz_localize(filter_end_date.tz), filter_end_date + ) + holiday_dates = holiday_dates[ + (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date) + ] + if return_name: + return Series(self.name, index=holiday_dates) + return holiday_dates + + def _reference_dates(self, start_date, end_date): + """ + Get reference dates for the holiday. + + Return reference dates for the holiday also returning the year + prior to the start_date and year following the end_date. This ensures + that any offsets to be applied will yield the holidays within + the passed in dates. + """ + if self.start_date is not None: + start_date = self.start_date.tz_localize(start_date.tz) + + if self.end_date is not None: + end_date = self.end_date.tz_localize(start_date.tz) + + year_offset = DateOffset(years=1) + reference_start_date = Timestamp( + datetime(start_date.year - 1, self.month, self.day) + ) + + reference_end_date = Timestamp( + datetime(end_date.year + 1, self.month, self.day) + ) + # Don't process unnecessary holidays + dates = date_range( + start=reference_start_date, + end=reference_end_date, + freq=year_offset, + tz=start_date.tz, + ) + + return dates + + def _apply_rule(self, dates): + """ + Apply the given offset/observance to a DatetimeIndex of dates. + + Parameters + ---------- + dates : DatetimeIndex + Dates to apply the given offset/observance rule + + Returns + ------- + Dates with rules applied + """ + if self.observance is not None: + return dates.map(lambda d: self.observance(d)) + + if self.offset is not None: + if not isinstance(self.offset, list): + offsets = [self.offset] + else: + offsets = self.offset + for offset in offsets: + + # if we are adding a non-vectorized value + # ignore the PerformanceWarnings: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PerformanceWarning) + dates += offset + return dates + + +holiday_calendars = {} + + +def register(cls): + try: + name = cls.name + except AttributeError: + name = cls.__name__ + holiday_calendars[name] = cls + + +def get_calendar(name): + """ + Return an instance of a calendar based on its name. + + Parameters + ---------- + name : str + Calendar name to return an instance of + """ + return holiday_calendars[name]() + + +class HolidayCalendarMetaClass(type): + def __new__(cls, clsname, bases, attrs): + calendar_class = super().__new__(cls, clsname, bases, attrs) + register(calendar_class) + return calendar_class + + +class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): + """ + Abstract interface to create holidays following certain rules. 
+ """ + + rules: List[Holiday] = [] + start_date = Timestamp(datetime(1970, 1, 1)) + end_date = Timestamp(datetime(2200, 12, 31)) + _cache = None + + def __init__(self, name=None, rules=None): + """ + Initializes holiday object with a given set a rules. Normally + classes just have the rules defined within them. + + Parameters + ---------- + name : str + Name of the holiday calendar, defaults to class name + rules : array of Holiday objects + A set of rules used to create the holidays. + """ + super().__init__() + if name is None: + name = type(self).__name__ + self.name = name + + if rules is not None: + self.rules = rules + + def rule_from_name(self, name): + for rule in self.rules: + if rule.name == name: + return rule + + return None + + def holidays(self, start=None, end=None, return_name=False): + """ + Returns a curve with holidays between start_date and end_date + + Parameters + ---------- + start : starting date, datetime-like, optional + end : ending date, datetime-like, optional + return_name : bool, optional + If True, return a series that has dates and holiday names. + False will only return a DatetimeIndex of dates. + + Returns + ------- + DatetimeIndex of holidays + """ + if self.rules is None: + raise Exception( + f"Holiday Calendar {self.name} does not have any rules specified" + ) + + if start is None: + start = AbstractHolidayCalendar.start_date + + if end is None: + end = AbstractHolidayCalendar.end_date + + start = Timestamp(start) + end = Timestamp(end) + + holidays = None + # If we don't have a cache or the dates are outside the prior cache, we + # get them again + if self._cache is None or start < self._cache[0] or end > self._cache[1]: + for rule in self.rules: + rule_holidays = rule.dates(start, end, return_name=True) + + if holidays is None: + holidays = rule_holidays + else: + holidays = holidays.append(rule_holidays) + + self._cache = (start, end, holidays.sort_index()) + + holidays = self._cache[2] + holidays = holidays[start:end] + + if return_name: + return holidays + else: + return holidays.index + + @staticmethod + def merge_class(base, other): + """ + Merge holiday calendars together. The base calendar + will take precedence to other. The merge will be done + based on each holiday's name. + + Parameters + ---------- + base : AbstractHolidayCalendar + instance/subclass or array of Holiday objects + other : AbstractHolidayCalendar + instance/subclass or array of Holiday objects + """ + try: + other = other.rules + except AttributeError: + pass + + if not isinstance(other, list): + other = [other] + other_holidays = {holiday.name: holiday for holiday in other} + + try: + base = base.rules + except AttributeError: + pass + + if not isinstance(base, list): + base = [base] + base_holidays = {holiday.name: holiday for holiday in base} + + other_holidays.update(base_holidays) + return list(other_holidays.values()) + + def merge(self, other, inplace=False): + """ + Merge holiday calendars together. The caller's class + rules take precedence. The merge will be done + based on each holiday's name. 
+ + Parameters + ---------- + other : holiday calendar + inplace : bool (default=False) + If True set rule_table to holidays, else return array of Holidays + """ + holidays = self.merge_class(self, other) + if inplace: + self.rules = holidays + else: + return holidays + + +USMemorialDay = Holiday( + "Memorial Day", month=5, day=31, offset=DateOffset(weekday=MO(-1)) +) +USLaborDay = Holiday("Labor Day", month=9, day=1, offset=DateOffset(weekday=MO(1))) +USColumbusDay = Holiday( + "Columbus Day", month=10, day=1, offset=DateOffset(weekday=MO(2)) +) +USThanksgivingDay = Holiday( + "Thanksgiving", month=11, day=1, offset=DateOffset(weekday=TH(4)) +) +USMartinLutherKingJr = Holiday( + "Martin Luther King Jr. Day", + start_date=datetime(1986, 1, 1), + month=1, + day=1, + offset=DateOffset(weekday=MO(3)), +) +USPresidentsDay = Holiday( + "Presidents Day", month=2, day=1, offset=DateOffset(weekday=MO(3)) +) +GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)]) + +EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)]) + + +class USFederalHolidayCalendar(AbstractHolidayCalendar): + """ + US Federal Government Holiday Calendar based on rules specified by: + https://www.opm.gov/policy-data-oversight/ + snow-dismissal-procedures/federal-holidays/ + """ + + rules = [ + Holiday("New Years Day", month=1, day=1, observance=nearest_workday), + USMartinLutherKingJr, + USPresidentsDay, + USMemorialDay, + Holiday("July 4th", month=7, day=4, observance=nearest_workday), + USLaborDay, + USColumbusDay, + Holiday("Veterans Day", month=11, day=11, observance=nearest_workday), + USThanksgivingDay, + Holiday("Christmas", month=12, day=25, observance=nearest_workday), + ] + + +def HolidayCalendarFactory(name, base, other, base_class=AbstractHolidayCalendar): + rules = AbstractHolidayCalendar.merge_class(base, other) + calendar_class = type(name, (base_class,), {"rules": rules, "name": name}) + return calendar_class diff --git a/venv/Lib/site-packages/pandas/tseries/offsets.py b/venv/Lib/site-packages/pandas/tseries/offsets.py new file mode 100644 index 0000000..8bb98a2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/tseries/offsets.py @@ -0,0 +1,2838 @@ +from datetime import date, datetime, timedelta +import functools +import operator +from typing import Any, Optional +import warnings + +from dateutil.easter import easter +import numpy as np + +from pandas._libs.tslibs import ( + NaT, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + ccalendar, + conversion, + delta_to_nanoseconds, + frequencies as libfrequencies, + normalize_date, + offsets as liboffsets, + timezones, +) +from pandas._libs.tslibs.offsets import ( + ApplyTypeError, + BaseOffset, + _get_calendar, + _is_normalized, + _to_dt64, + apply_index_wraps, + as_datetime, + roll_yearday, + shift_month, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly + +from pandas.core.dtypes.inference import is_list_like + +__all__ = [ + "Day", + "BusinessDay", + "BDay", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "MonthBegin", + "BMonthBegin", + "MonthEnd", + "BMonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + "BusinessHour", + "CustomBusinessHour", + "YearBegin", + "BYearBegin", + "YearEnd", + "BYearEnd", + "QuarterBegin", + "BQuarterBegin", + "QuarterEnd", + "BQuarterEnd", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + "Week", + "WeekOfMonth", + "Easter", + "Hour", + "Minute", + "Second", + "Milli", 
+ "Micro", + "Nano", + "DateOffset", +] + +# convert to/from datetime/timestamp to allow invalid Timestamp ranges to +# pass thru + + +def as_timestamp(obj): + if isinstance(obj, Timestamp): + return obj + try: + return Timestamp(obj) + except (OutOfBoundsDatetime): + pass + return obj + + +def apply_wraps(func): + @functools.wraps(func) + def wrapper(self, other): + if other is NaT: + return NaT + elif isinstance(other, (timedelta, Tick, DateOffset)): + # timedelta path + return func(self, other) + elif isinstance(other, (np.datetime64, datetime, date)): + other = as_timestamp(other) + + tz = getattr(other, "tzinfo", None) + nano = getattr(other, "nanosecond", 0) + + try: + if self._adjust_dst and isinstance(other, Timestamp): + other = other.tz_localize(None) + + result = func(self, other) + + if self._adjust_dst: + result = conversion.localize_pydatetime(result, tz) + + result = Timestamp(result) + if self.normalize: + result = result.normalize() + + # nanosecond may be deleted depending on offset process + if not self.normalize and nano != 0: + if not isinstance(self, Nano) and result.nanosecond != nano: + if result.tz is not None: + # convert to UTC + value = conversion.tz_convert_single( + result.value, timezones.UTC, result.tz + ) + else: + value = result.value + result = Timestamp(value + nano) + + if tz is not None and result.tzinfo is None: + result = conversion.localize_pydatetime(result, tz) + + except OutOfBoundsDatetime: + result = func(self, as_datetime(other)) + + if self.normalize: + # normalize_date returns normal datetime + result = normalize_date(result) + + if tz is not None and result.tzinfo is None: + result = conversion.localize_pydatetime(result, tz) + + result = Timestamp(result) + + return result + + return wrapper + + +# --------------------------------------------------------------------- +# DateOffset + + +class DateOffset(BaseOffset): + """ + Standard kind of date increment used for a date range. + + Works exactly like relativedelta in terms of the keyword args you + pass in, use of the keyword n is discouraged-- you would be better + off specifying n in the keywords you use, but regardless it is + there for you. n is needed for DateOffset subclasses. + + DateOffset work as follows. Each offset specify a set of dates + that conform to the DateOffset. For example, Bday defines this + set to be the set of dates that are weekdays (M-F). To test if a + date is in the set of a DateOffset dateOffset we can use the + is_on_offset method: dateOffset.is_on_offset(date). + + If a date is not on a valid date, the rollback and rollforward + methods can be used to roll the date to the nearest valid date + before/after the date. + + DateOffsets can be created to move dates forward a given number of + valid dates. For example, Bday(2) can be added to a date to move + it two business days forward. If the date does not start on a + valid date, first it is moved to a valid date. Thus pseudo code + is: + + def __add__(date): + date = rollback(date) # does nothing if date is valid + return date + + + When a date offset is created for a negative number of periods, + the date is first rolled forward. The pseudo code is: + + def __add__(date): + date = rollforward(date) # does nothing is date is valid + return date + + + Zero presents a problem. Should it roll forward or back? We + arbitrarily have it rollforward: + + date + BDay(0) == BDay.rollforward(date) + + Since 0 is a bit weird, we suggest avoiding its use. 
+ + Parameters + ---------- + n : int, default 1 + The number of time periods the offset represents. + normalize : bool, default False + Whether to round the result of a DateOffset addition down to the + previous midnight. + **kwds + Temporal parameter that add to or replace the offset value. + + Parameters that **add** to the offset (like Timedelta): + + - years + - months + - weeks + - days + - hours + - minutes + - seconds + - microseconds + - nanoseconds + + Parameters that **replace** the offset value: + + - year + - month + - day + - weekday + - hour + - minute + - second + - microsecond + - nanosecond. + + See Also + -------- + dateutil.relativedelta.relativedelta : The relativedelta type is designed + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. + + Examples + -------- + >>> from pandas.tseries.offsets import DateOffset + >>> ts = pd.Timestamp('2017-01-01 09:10:11') + >>> ts + DateOffset(months=3) + Timestamp('2017-04-01 09:10:11') + + >>> ts = pd.Timestamp('2017-01-01 09:10:11') + >>> ts + DateOffset(months=2) + Timestamp('2017-03-01 09:10:11') + """ + + _params = cache_readonly(BaseOffset._params.fget) + _use_relativedelta = False + _adjust_dst = False + _attributes = frozenset(["n", "normalize"] + list(liboffsets.relativedelta_kwds)) + _deprecations = frozenset(["isAnchored", "onOffset"]) + + # default for prior pickles + normalize = False + + def __init__(self, n=1, normalize=False, **kwds): + BaseOffset.__init__(self, n, normalize) + + off, use_rd = liboffsets._determine_offset(kwds) + object.__setattr__(self, "_offset", off) + object.__setattr__(self, "_use_relativedelta", use_rd) + for key in kwds: + val = kwds[key] + object.__setattr__(self, key, val) + + @apply_wraps + def apply(self, other): + if self._use_relativedelta: + other = as_datetime(other) + + if len(self.kwds) > 0: + tzinfo = getattr(other, "tzinfo", None) + if tzinfo is not None and self._use_relativedelta: + # perform calculation in UTC + other = other.replace(tzinfo=None) + + if self.n > 0: + for i in range(self.n): + other = other + self._offset + else: + for i in range(-self.n): + other = other - self._offset + + if tzinfo is not None and self._use_relativedelta: + # bring tz back from UTC calculation + other = conversion.localize_pydatetime(other, tzinfo) + + return as_timestamp(other) + else: + return other + timedelta(self.n) + + @apply_index_wraps + def apply_index(self, i): + """ + Vectorized apply of DateOffset to DatetimeIndex, + raises NotImplentedError for offsets without a + vectorized implementation. 
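+
+        For the plain ``DateOffset`` case the addition is vectorized; as an
+        illustrative sketch (the dates are example values):
+
+        >>> import pandas as pd
+        >>> from pandas.tseries.offsets import DateOffset
+        >>> dti = pd.date_range("2020-01-01", periods=3, freq="D")
+        >>> (dti + DateOffset(months=1)).strftime("%Y-%m-%d").tolist()
+        ['2020-02-01', '2020-02-02', '2020-02-03']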
+ + Parameters + ---------- + i : DatetimeIndex + + Returns + ------- + y : DatetimeIndex + """ + + if type(self) is not DateOffset: + raise NotImplementedError( + f"DateOffset subclass {type(self).__name__} " + "does not have a vectorized implementation" + ) + kwds = self.kwds + relativedelta_fast = { + "years", + "months", + "weeks", + "days", + "hours", + "minutes", + "seconds", + "microseconds", + } + # relativedelta/_offset path only valid for base DateOffset + if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + + months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n + if months: + shifted = liboffsets.shift_months(i.asi8, months) + i = type(i)(shifted, dtype=i.dtype) + + weeks = (kwds.get("weeks", 0)) * self.n + if weeks: + # integer addition on PeriodIndex is deprecated, + # so we directly use _time_shift instead + asper = i.to_period("W") + if not isinstance(asper._data, np.ndarray): + # unwrap PeriodIndex --> PeriodArray + asper = asper._data + shifted = asper._time_shift(weeks) + i = shifted.to_timestamp() + i.to_perioddelta("W") + + timedelta_kwds = { + k: v + for k, v in kwds.items() + if k in ["days", "hours", "minutes", "seconds", "microseconds"] + } + if timedelta_kwds: + delta = Timedelta(**timedelta_kwds) + i = i + (self.n * delta) + return i + elif not self._use_relativedelta and hasattr(self, "_offset"): + # timedelta + return i + (self._offset * self.n) + else: + # relativedelta with other keywords + kwd = set(kwds) - relativedelta_fast + raise NotImplementedError( + "DateOffset with relativedelta " + f"keyword(s) {kwd} not able to be " + "applied vectorized" + ) + + def is_anchored(self): + # TODO: Does this make sense for the general case? It would help + # if there were a canonical docstring for what is_anchored means. + return self.n == 1 + + def onOffset(self, dt): + warnings.warn( + "onOffset is a deprecated, use is_on_offset instead", + FutureWarning, + stacklevel=2, + ) + return self.is_on_offset(dt) + + def isAnchored(self): + warnings.warn( + "isAnchored is a deprecated, use is_anchored instead", + FutureWarning, + stacklevel=2, + ) + return self.is_anchored() + + # TODO: Combine this with BusinessMixin version by defining a whitelisted + # set of attributes on each object rather than the existing behavior of + # iterating over internal ``__dict__`` + def _repr_attrs(self): + exclude = {"n", "inc", "normalize"} + attrs = [] + for attr in sorted(self.__dict__): + if attr.startswith("_") or attr == "kwds": + continue + elif attr not in exclude: + value = getattr(self, attr) + attrs.append(f"{attr}={value}") + + out = "" + if attrs: + out += ": " + ", ".join(attrs) + return out + + @property + def name(self): + return self.rule_code + + def rollback(self, dt): + """ + Roll provided date backward to next offset only if not on offset. + + Returns + ------- + TimeStamp + Rolled timestamp if not on offset, otherwise unchanged timestamp. + """ + dt = as_timestamp(dt) + if not self.is_on_offset(dt): + dt = dt - type(self)(1, normalize=self.normalize, **self.kwds) + return dt + + def rollforward(self, dt): + """ + Roll provided date forward to next offset only if not on offset. + + Returns + ------- + TimeStamp + Rolled timestamp if not on offset, otherwise unchanged timestamp. 
+ """ + dt = as_timestamp(dt) + if not self.is_on_offset(dt): + dt = dt + type(self)(1, normalize=self.normalize, **self.kwds) + return dt + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + # XXX, see #1395 + if type(self) == DateOffset or isinstance(self, Tick): + return True + + # Default (slow) method for determining if some date is a member of the + # date range generated by this offset. Subclasses may have this + # re-implemented in a nicer way. + a = dt + b = (dt + self) - self + return a == b + + # way to get around weirdness with rule_code + @property + def _prefix(self): + raise NotImplementedError("Prefix not defined") + + @property + def rule_code(self): + return self._prefix + + @cache_readonly + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = f"{self.n}{code}" + else: + fstr = code + + try: + if self._offset: + fstr += self._offset_str() + except AttributeError: + # TODO: standardize `_offset` vs `offset` naming convention + pass + + return fstr + + def _offset_str(self): + return "" + + @property + def nanos(self): + raise ValueError(f"{self} is a non-fixed frequency") + + +class SingleConstructorOffset(DateOffset): + @classmethod + def _from_name(cls, suffix=None): + # default _from_name calls cls with no args + if suffix: + raise ValueError(f"Bad freq suffix {suffix}") + return cls() + + +class _CustomMixin: + """ + Mixin for classes that define and validate calendar, holidays, + and weekdays attributes. + """ + + def __init__(self, weekmask, holidays, calendar): + calendar, holidays = _get_calendar( + weekmask=weekmask, holidays=holidays, calendar=calendar + ) + # Custom offset instances are identified by the + # following two attributes. See DateOffset._params() + # holidays, weekmask + + object.__setattr__(self, "weekmask", weekmask) + object.__setattr__(self, "holidays", holidays) + object.__setattr__(self, "calendar", calendar) + + +class BusinessMixin: + """ + Mixin to business types to provide related functions. + """ + + @property + def offset(self): + """ + Alias for self._offset. + """ + # Alias for backward compat + return self._offset + + def _repr_attrs(self): + if self.offset: + attrs = [f"offset={repr(self.offset)}"] + else: + attrs = None + out = "" + if attrs: + out += ": " + ", ".join(attrs) + return out + + +class BusinessDay(BusinessMixin, SingleConstructorOffset): + """ + DateOffset subclass representing possibly n business days. 
+ """ + + _prefix = "B" + _adjust_dst = True + _attributes = frozenset(["n", "normalize", "offset"]) + + def __init__(self, n=1, normalize=False, offset=timedelta(0)): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) + + def _offset_str(self): + def get_str(td): + off_str = "" + if td.days > 0: + off_str += str(td.days) + "D" + if td.seconds > 0: + s = td.seconds + hrs = int(s / 3600) + if hrs != 0: + off_str += str(hrs) + "H" + s -= hrs * 3600 + mts = int(s / 60) + if mts != 0: + off_str += str(mts) + "Min" + s -= mts * 60 + if s != 0: + off_str += str(s) + "s" + if td.microseconds > 0: + off_str += str(td.microseconds) + "us" + return off_str + + if isinstance(self.offset, timedelta): + zero = timedelta(0, 0, 0) + if self.offset >= zero: + off_str = "+" + get_str(self.offset) + else: + off_str = "-" + get_str(-self.offset) + return off_str + else: + return "+" + repr(self.offset) + + @apply_wraps + def apply(self, other): + if isinstance(other, datetime): + n = self.n + wday = other.weekday() + + # avoid slowness below by operating on weeks first + weeks = n // 5 + if n <= 0 and wday > 4: + # roll forward + n += 1 + + n -= 5 * weeks + + # n is always >= 0 at this point + if n == 0 and wday > 4: + # roll back + days = 4 - wday + elif wday > 4: + # roll forward + days = (7 - wday) + (n - 1) + elif wday + n <= 4: + # shift by n days without leaving the current week + days = n + else: + # shift by n days plus 2 to get past the weekend + days = n + 2 + + result = other + timedelta(days=7 * weeks + days) + if self.offset: + result = result + self.offset + return result + + elif isinstance(other, (timedelta, Tick)): + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) + else: + raise ApplyTypeError( + "Only know how to combine business day with datetime or timedelta." 
+ ) + + @apply_index_wraps + def apply_index(self, i): + time = i.to_perioddelta("D") + # to_period rolls forward to next BDay; track and + # reduce n where it does when rolling forward + asper = i.to_period("B") + if not isinstance(asper._data, np.ndarray): + # unwrap PeriodIndex --> PeriodArray + asper = asper._data + + if self.n > 0: + shifted = (i.to_perioddelta("B") - time).asi8 != 0 + + # Integer-array addition is deprecated, so we use + # _time_shift directly + roll = np.where(shifted, self.n - 1, self.n) + shifted = asper._addsub_int_array(roll, operator.add) + else: + # Integer addition is deprecated, so we use _time_shift directly + roll = self.n + shifted = asper._time_shift(roll) + + result = shifted.to_timestamp() + time + return result + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.weekday() < 5 + + +class BusinessHourMixin(BusinessMixin): + def __init__(self, start="09:00", end="17:00", offset=timedelta(0)): + # must be validated here to equality check + if not is_list_like(start): + start = [start] + if not len(start): + raise ValueError("Must include at least 1 start time") + + if not is_list_like(end): + end = [end] + if not len(end): + raise ValueError("Must include at least 1 end time") + + start = np.array([liboffsets._validate_business_time(x) for x in start]) + end = np.array([liboffsets._validate_business_time(x) for x in end]) + + # Validation of input + if len(start) != len(end): + raise ValueError("number of starting time and ending time must be the same") + num_openings = len(start) + + # sort starting and ending time by starting time + index = np.argsort(start) + + # convert to tuple so that start and end are hashable + start = tuple(start[index]) + end = tuple(end[index]) + + total_secs = 0 + for i in range(num_openings): + total_secs += self._get_business_hours_by_sec(start[i], end[i]) + total_secs += self._get_business_hours_by_sec( + end[i], start[(i + 1) % num_openings] + ) + if total_secs != 24 * 60 * 60: + raise ValueError( + "invalid starting and ending time(s): " + "opening hours should not touch or overlap with " + "one another" + ) + + object.__setattr__(self, "start", start) + object.__setattr__(self, "end", end) + object.__setattr__(self, "_offset", offset) + + @cache_readonly + def next_bday(self): + """ + Used for moving to next business day. + """ + if self.n >= 0: + nb_offset = 1 + else: + nb_offset = -1 + if self._prefix.startswith("C"): + # CustomBusinessHour + return CustomBusinessDay( + n=nb_offset, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) + else: + return BusinessDay(n=nb_offset) + + def _next_opening_time(self, other, sign=1): + """ + If self.n and sign have the same sign, return the earliest opening time + later than or equal to current time. + Otherwise the latest opening time earlier than or equal to current + time. + + Opening time always locates on BusinessDay. + However, closing time may not if business hour extends over midnight. + + Parameters + ---------- + other : datetime + Current time. + sign : int, default 1. + Either 1 or -1. Going forward in time if it has the same sign as + self.n. Going backward in time otherwise. + + Returns + ------- + result : datetime + Next opening time. 
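+
+        Illustrative sketch via the public ``rollforward`` (example values;
+        2020-01-31 18:00 is a Friday evening, after the default 09:00-17:00
+        business hours, so the next opening time is Monday morning):
+
+        >>> import pandas as pd
+        >>> from pandas.tseries.offsets import BusinessHour
+        >>> BusinessHour().rollforward(pd.Timestamp("2020-01-31 18:00"))
+        Timestamp('2020-02-03 09:00:00')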
+ """ + earliest_start = self.start[0] + latest_start = self.start[-1] + + if not self.next_bday.is_on_offset(other): + # today is not business day + other = other + sign * self.next_bday + if self.n * sign >= 0: + hour, minute = earliest_start.hour, earliest_start.minute + else: + hour, minute = latest_start.hour, latest_start.minute + else: + if self.n * sign >= 0: + if latest_start < other.time(): + # current time is after latest starting time in today + other = other + sign * self.next_bday + hour, minute = earliest_start.hour, earliest_start.minute + else: + # find earliest starting time no earlier than current time + for st in self.start: + if other.time() <= st: + hour, minute = st.hour, st.minute + break + else: + if other.time() < earliest_start: + # current time is before earliest starting time in today + other = other + sign * self.next_bday + hour, minute = latest_start.hour, latest_start.minute + else: + # find latest starting time no later than current time + for st in reversed(self.start): + if other.time() >= st: + hour, minute = st.hour, st.minute + break + + return datetime(other.year, other.month, other.day, hour, minute) + + def _prev_opening_time(self, other): + """ + If n is positive, return the latest opening time earlier than or equal + to current time. + Otherwise the earliest opening time later than or equal to current + time. + + Parameters + ---------- + other : datetime + Current time. + + Returns + ------- + result : datetime + Previous opening time. + """ + return self._next_opening_time(other, sign=-1) + + def _get_business_hours_by_sec(self, start, end): + """ + Return business hours in a day by seconds. + """ + # create dummy datetime to calculate businesshours in a day + dtstart = datetime(2014, 4, 1, start.hour, start.minute) + day = 1 if start < end else 2 + until = datetime(2014, 4, day, end.hour, end.minute) + return int((until - dtstart).total_seconds()) + + @apply_wraps + def rollback(self, dt): + """ + Roll provided date backward to next offset only if not on offset. + """ + if not self.is_on_offset(dt): + if self.n >= 0: + dt = self._prev_opening_time(dt) + else: + dt = self._next_opening_time(dt) + return self._get_closing_time(dt) + return dt + + @apply_wraps + def rollforward(self, dt): + """ + Roll provided date forward to next offset only if not on offset. + """ + if not self.is_on_offset(dt): + if self.n >= 0: + return self._next_opening_time(dt) + else: + return self._prev_opening_time(dt) + return dt + + def _get_closing_time(self, dt): + """ + Get the closing time of a business hour interval by its opening time. + + Parameters + ---------- + dt : datetime + Opening time of a business hour interval. + + Returns + ------- + result : datetime + Corresponding closing time. 
+ """ + for i, st in enumerate(self.start): + if st.hour == dt.hour and st.minute == dt.minute: + return dt + timedelta( + seconds=self._get_business_hours_by_sec(st, self.end[i]) + ) + assert False + + @apply_wraps + def apply(self, other): + if isinstance(other, datetime): + # used for detecting edge condition + nanosecond = getattr(other, "nanosecond", 0) + # reset timezone and nanosecond + # other may be a Timestamp, thus not use replace + other = datetime( + other.year, + other.month, + other.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) + n = self.n + + # adjust other to reduce number of cases to handle + if n >= 0: + if other.time() in self.end or not self._is_on_offset(other): + other = self._next_opening_time(other) + else: + if other.time() in self.start: + # adjustment to move to previous business day + other = other - timedelta(seconds=1) + if not self._is_on_offset(other): + other = self._next_opening_time(other) + other = self._get_closing_time(other) + + # get total business hours by sec in one business day + businesshours = sum( + self._get_business_hours_by_sec(st, en) + for st, en in zip(self.start, self.end) + ) + + bd, r = divmod(abs(n * 60), businesshours // 60) + if n < 0: + bd, r = -bd, -r + + # adjust by business days first + if bd != 0: + if isinstance(self, _CustomMixin): # GH 30593 + skip_bd = CustomBusinessDay( + n=bd, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) + else: + skip_bd = BusinessDay(n=bd) + # midnight business hour may not on BusinessDay + if not self.next_bday.is_on_offset(other): + prev_open = self._prev_opening_time(other) + remain = other - prev_open + other = prev_open + skip_bd + remain + else: + other = other + skip_bd + + # remaining business hours to adjust + bhour_remain = timedelta(minutes=r) + + if n >= 0: + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = ( + self._get_closing_time(self._prev_opening_time(other)) - other + ) + if bhour_remain < bhour: + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) + else: + # go to next business time interval + bhour_remain -= bhour + other = self._next_opening_time(other + bhour) + else: + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = self._next_opening_time(other) - other + if ( + bhour_remain > bhour + or bhour_remain == bhour + and nanosecond != 0 + ): + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) + else: + # go to next business time interval + bhour_remain -= bhour + other = self._get_closing_time( + self._next_opening_time( + other + bhour - timedelta(seconds=1) + ) + ) + + return other + else: + raise ApplyTypeError("Only know how to combine business hour with datetime") + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + + if dt.tzinfo is not None: + dt = datetime( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond + ) + # Valid BH can be on the different BusinessDay during midnight + # Distinguish by the time spent from previous opening time + return self._is_on_offset(dt) + + def _is_on_offset(self, dt): + """ + Slight speedups using calculated values. 
+ """ + # if self.normalize and not _is_normalized(dt): + # return False + # Valid BH can be on the different BusinessDay during midnight + # Distinguish by the time spent from previous opening time + if self.n >= 0: + op = self._prev_opening_time(dt) + else: + op = self._next_opening_time(dt) + span = (dt - op).total_seconds() + businesshours = 0 + for i, st in enumerate(self.start): + if op.hour == st.hour and op.minute == st.minute: + businesshours = self._get_business_hours_by_sec(st, self.end[i]) + if span <= businesshours: + return True + else: + return False + + def _repr_attrs(self): + out = super()._repr_attrs() + hours = ",".join( + f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' + for st, en in zip(self.start, self.end) + ) + attrs = [f"{self._prefix}={hours}"] + out += ": " + ", ".join(attrs) + return out + + +class BusinessHour(BusinessHourMixin, SingleConstructorOffset): + """ + DateOffset subclass representing possibly n business hours. + """ + + _prefix = "BH" + _anchor = 0 + _attributes = frozenset(["n", "normalize", "start", "end", "offset"]) + + def __init__( + self, n=1, normalize=False, start="09:00", end="17:00", offset=timedelta(0) + ): + BaseOffset.__init__(self, n, normalize) + super().__init__(start=start, end=end, offset=offset) + + +class CustomBusinessDay(_CustomMixin, BusinessDay): + """ + DateOffset subclass representing possibly n custom business days, + excluding holidays. + + Parameters + ---------- + n : int, default 1 + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : pd.HolidayCalendar or np.busdaycalendar + offset : timedelta, default timedelta(0) + """ + + _prefix = "C" + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) + + _CustomMixin.__init__(self, weekmask, holidays, calendar) + + @apply_wraps + def apply(self, other): + if self.n <= 0: + roll = "forward" + else: + roll = "backward" + + if isinstance(other, datetime): + date_in = other + np_dt = np.datetime64(date_in.date()) + + np_incr_dt = np.busday_offset( + np_dt, self.n, roll=roll, busdaycal=self.calendar + ) + + dt_date = np_incr_dt.astype(datetime) + result = datetime.combine(dt_date, date_in.time()) + + if self.offset: + result = result + self.offset + return result + + elif isinstance(other, (timedelta, Tick)): + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) + else: + raise ApplyTypeError( + "Only know how to combine trading day with " + "datetime, datetime64 or timedelta." + ) + + def apply_index(self, i): + raise NotImplementedError + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + day64 = _to_dt64(dt, "datetime64[D]") + return np.is_busday(day64, busdaycal=self.calendar) + + +class CustomBusinessHour(_CustomMixin, BusinessHourMixin, SingleConstructorOffset): + """ + DateOffset subclass representing possibly n custom business days. 
+ """ + + _prefix = "CBH" + _anchor = 0 + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + start="09:00", + end="17:00", + offset=timedelta(0), + ): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) + + _CustomMixin.__init__(self, weekmask, holidays, calendar) + BusinessHourMixin.__init__(self, start=start, end=end, offset=offset) + + +# --------------------------------------------------------------------- +# Month-Based Offset Classes + + +class MonthOffset(SingleConstructorOffset): + _adjust_dst = True + _attributes = frozenset(["n", "normalize"]) + + __init__ = BaseOffset.__init__ + + @property + def name(self): + if self.is_anchored: + return self.rule_code + else: + month = ccalendar.MONTH_ALIASES[self.n] + return f"{self.code_rule}-{month}" + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day == self._get_offset_day(dt) + + @apply_wraps + def apply(self, other): + compare_day = self._get_offset_day(other) + n = liboffsets.roll_convention(other.day, self.n, compare_day) + return shift_month(other, n, self._day_opt) + + @apply_index_wraps + def apply_index(self, i): + shifted = liboffsets.shift_months(i.asi8, self.n, self._day_opt) + # TODO: going through __new__ raises on call to _validate_frequency; + # are we passing incorrect freq? + return type(i)._simple_new(shifted, freq=i.freq, dtype=i.dtype) + + +class MonthEnd(MonthOffset): + """ + DateOffset of one month end. + """ + + _prefix = "M" + _day_opt = "end" + + +class MonthBegin(MonthOffset): + """ + DateOffset of one month at beginning. + """ + + _prefix = "MS" + _day_opt = "start" + + +class BusinessMonthEnd(MonthOffset): + """ + DateOffset increments between business EOM dates. + """ + + _prefix = "BM" + _day_opt = "business_end" + + +class BusinessMonthBegin(MonthOffset): + """ + DateOffset of one business month at beginning. + """ + + _prefix = "BMS" + _day_opt = "business_start" + + +class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): + """ + DateOffset subclass representing custom business month(s). + + Increments between %(bound)s of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : pd.HolidayCalendar or np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. 
+ """ + + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + is_on_offset = DateOffset.is_on_offset # override MonthOffset method + apply_index = DateOffset.apply_index # override MonthOffset method + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "_offset", offset) + + _CustomMixin.__init__(self, weekmask, holidays, calendar) + + @cache_readonly + def cbday_roll(self): + """ + Define default roll function to be called in apply method. + """ + cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds) + + if self._prefix.endswith("S"): + # MonthBegin + roll_func = cbday.rollforward + else: + # MonthEnd + roll_func = cbday.rollback + return roll_func + + @cache_readonly + def m_offset(self): + if self._prefix.endswith("S"): + # MonthBegin + moff = MonthBegin(n=1, normalize=False) + else: + # MonthEnd + moff = MonthEnd(n=1, normalize=False) + return moff + + @cache_readonly + def month_roll(self): + """ + Define default roll function to be called in apply method. + """ + if self._prefix.endswith("S"): + # MonthBegin + roll_func = self.m_offset.rollback + else: + # MonthEnd + roll_func = self.m_offset.rollforward + return roll_func + + @apply_wraps + def apply(self, other): + # First move to month offset + cur_month_offset_date = self.month_roll(other) + + # Find this custom month offset + compare_date = self.cbday_roll(cur_month_offset_date) + n = liboffsets.roll_convention(other.day, self.n, compare_date.day) + + new = cur_month_offset_date + n * self.m_offset + result = self.cbday_roll(new) + return result + + +@Substitution(bound="end") +@Appender(_CustomBusinessMonth.__doc__) +class CustomBusinessMonthEnd(_CustomBusinessMonth): + _prefix = "CBM" + + +@Substitution(bound="beginning") +@Appender(_CustomBusinessMonth.__doc__) +class CustomBusinessMonthBegin(_CustomBusinessMonth): + _prefix = "CBMS" + + +# --------------------------------------------------------------------- +# Semi-Month Based Offset Classes + + +class SemiMonthOffset(DateOffset): + _adjust_dst = True + _default_day_of_month = 15 + _min_day_of_month = 2 + _attributes = frozenset(["n", "normalize", "day_of_month"]) + + def __init__(self, n=1, normalize=False, day_of_month=None): + BaseOffset.__init__(self, n, normalize) + + if day_of_month is None: + object.__setattr__(self, "day_of_month", self._default_day_of_month) + else: + object.__setattr__(self, "day_of_month", int(day_of_month)) + if not self._min_day_of_month <= self.day_of_month <= 27: + raise ValueError( + "day_of_month must be " + f"{self._min_day_of_month}<=day_of_month<=27, " + f"got {self.day_of_month}" + ) + + @classmethod + def _from_name(cls, suffix=None): + return cls(day_of_month=suffix) + + @property + def rule_code(self): + suffix = f"-{self.day_of_month}" + return self._prefix + suffix + + @apply_wraps + def apply(self, other): + # shift `other` to self.day_of_month, incrementing `n` if necessary + n = liboffsets.roll_convention(other.day, self.n, self.day_of_month) + + days_in_month = ccalendar.get_days_in_month(other.year, other.month) + + # For SemiMonthBegin on other.day == 1 and + # SemiMonthEnd on other.day == days_in_month, + # shifting `other` to `self.day_of_month` _always_ requires + # incrementing/decrementing `n`, regardless of whether it is + # initially positive. 
+ if type(self) is SemiMonthBegin and (self.n <= 0 and other.day == 1): + n -= 1 + elif type(self) is SemiMonthEnd and (self.n > 0 and other.day == days_in_month): + n += 1 + + return self._apply(n, other) + + def _apply(self, n, other): + """ + Handle specific apply logic for child classes. + """ + raise AbstractMethodError(self) + + @apply_index_wraps + def apply_index(self, i): + # determine how many days away from the 1st of the month we are + dti = i + days_from_start = i.to_perioddelta("M").asi8 + delta = Timedelta(days=self.day_of_month - 1).value + + # get boolean array for each element before the day_of_month + before_day_of_month = days_from_start < delta + + # get boolean array for each element after the day_of_month + after_day_of_month = days_from_start > delta + + # determine the correct n for each date in i + roll = self._get_roll(i, before_day_of_month, after_day_of_month) + + # isolate the time since it will be striped away one the next line + time = i.to_perioddelta("D") + + # apply the correct number of months + + # integer-array addition on PeriodIndex is deprecated, + # so we use _addsub_int_array directly + asper = i.to_period("M") + if not isinstance(asper._data, np.ndarray): + # unwrap PeriodIndex --> PeriodArray + asper = asper._data + + shifted = asper._addsub_int_array(roll // 2, operator.add) + i = type(dti)(shifted.to_timestamp()) + + # apply the correct day + i = self._apply_index_days(i, roll) + + return i + time + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + """ + Return an array with the correct n for each date in i. + + The roll array is based on the fact that i gets rolled back to + the first day of the month. + """ + raise AbstractMethodError(self) + + def _apply_index_days(self, i, roll): + """ + Apply the correct day for each date in i. + """ + raise AbstractMethodError(self) + + +class SemiMonthEnd(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the last + day of the month and day_of_month. + + Parameters + ---------- + n : int + normalize : bool, default False + day_of_month : int, {1, 3,...,27}, default 15 + """ + + _prefix = "SM" + _min_day_of_month = 1 + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) + return dt.day in (self.day_of_month, days_in_month) + + def _apply(self, n, other): + months = n // 2 + day = 31 if n % 2 else self.day_of_month + return shift_month(other, months, day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_end = i.is_month_end + if n > 0: + roll_end = np.where(is_month_end, 1, 0) + roll_before = np.where(before_day_of_month, n, n + 1) + roll = roll_end + roll_before + elif n == 0: + roll_after = np.where(after_day_of_month, 2, 0) + roll_before = np.where(~after_day_of_month, 1, 0) + roll = roll_before + roll_after + else: + roll = np.where(after_day_of_month, n + 2, n + 1) + return roll + + def _apply_index_days(self, i, roll): + """ + Add days portion of offset to DatetimeIndex i. + + Parameters + ---------- + i : DatetimeIndex + roll : ndarray[int64_t] + + Returns + ------- + result : DatetimeIndex + """ + nanos = (roll % 2) * Timedelta(days=self.day_of_month).value + i += nanos.astype("timedelta64[ns]") + return i + Timedelta(days=-1) + + +class SemiMonthBegin(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the first + day of the month and day_of_month. 
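+
+    Illustrative sketch of the twice-monthly anchors (example values; the
+    default ``day_of_month`` is 15):
+
+    >>> import pandas as pd
+    >>> from pandas.tseries.offsets import SemiMonthEnd
+    >>> pd.Timestamp("2020-01-14") + SemiMonthEnd()
+    Timestamp('2020-01-15 00:00:00')
+    >>> pd.Timestamp("2020-01-15") + SemiMonthEnd()
+    Timestamp('2020-01-31 00:00:00')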
+ + Parameters + ---------- + n : int + normalize : bool, default False + day_of_month : int, {2, 3,...,27}, default 15 + """ + + _prefix = "SMS" + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day in (1, self.day_of_month) + + def _apply(self, n, other): + months = n // 2 + n % 2 + day = 1 if n % 2 else self.day_of_month + return shift_month(other, months, day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_start = i.is_month_start + if n > 0: + roll = np.where(before_day_of_month, n, n + 1) + elif n == 0: + roll_start = np.where(is_month_start, 0, 1) + roll_after = np.where(after_day_of_month, 1, 0) + roll = roll_start + roll_after + else: + roll_after = np.where(after_day_of_month, n + 2, n + 1) + roll_start = np.where(is_month_start, -1, 0) + roll = roll_after + roll_start + return roll + + def _apply_index_days(self, i, roll): + """ + Add days portion of offset to DatetimeIndex i. + + Parameters + ---------- + i : DatetimeIndex + roll : ndarray[int64_t] + + Returns + ------- + result : DatetimeIndex + """ + nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value + return i + nanos.astype("timedelta64[ns]") + + +# --------------------------------------------------------------------- +# Week-Based Offset Classes + + +class Week(DateOffset): + """ + Weekly offset. + + Parameters + ---------- + weekday : int, default None + Always generate specific day of week. 0 for Monday. + """ + + _adjust_dst = True + _inc = timedelta(weeks=1) + _prefix = "W" + _attributes = frozenset(["n", "normalize", "weekday"]) + + def __init__(self, n=1, normalize=False, weekday=None): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) + + if self.weekday is not None: + if self.weekday < 0 or self.weekday > 6: + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") + + def is_anchored(self): + return self.n == 1 and self.weekday is not None + + @apply_wraps + def apply(self, other): + if self.weekday is None: + return other + self.n * self._inc + + if not isinstance(other, datetime): + raise TypeError( + f"Cannot add {type(other).__name__} to {type(self).__name__}" + ) + + k = self.n + otherDay = other.weekday() + if otherDay != self.weekday: + other = other + timedelta((self.weekday - otherDay) % 7) + if k > 0: + k -= 1 + + return other + timedelta(weeks=k) + + @apply_index_wraps + def apply_index(self, i): + if self.weekday is None: + # integer addition on PeriodIndex is deprecated, + # so we use _time_shift directly + asper = i.to_period("W") + if not isinstance(asper._data, np.ndarray): + # unwrap PeriodIndex --> PeriodArray + asper = asper._data + + shifted = asper._time_shift(self.n) + return shifted.to_timestamp() + i.to_perioddelta("W") + else: + return self._end_apply_index(i) + + def _end_apply_index(self, dtindex): + """ + Add self to the given DatetimeIndex, specialized for case where + self.weekday is non-null. 
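+
+        For the scalar case, an illustrative sketch (example values;
+        2020-01-31 is a Friday and ``weekday=0`` anchors on Mondays):
+
+        >>> import pandas as pd
+        >>> from pandas.tseries.offsets import Week
+        >>> pd.Timestamp("2020-01-31") + Week(weekday=0)
+        Timestamp('2020-02-03 00:00:00')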
+ + Parameters + ---------- + dtindex : DatetimeIndex + + Returns + ------- + result : DatetimeIndex + """ + off = dtindex.to_perioddelta("D") + + base, mult = libfrequencies.get_freq_code(self.freqstr) + base_period = dtindex.to_period(base) + if not isinstance(base_period._data, np.ndarray): + # unwrap PeriodIndex --> PeriodArray + base_period = base_period._data + + if self.n > 0: + # when adding, dates on end roll to next + normed = dtindex - off + Timedelta(1, "D") - Timedelta(1, "ns") + roll = np.where( + base_period.to_timestamp(how="end") == normed, self.n, self.n - 1 + ) + # integer-array addition on PeriodIndex is deprecated, + # so we use _addsub_int_array directly + shifted = base_period._addsub_int_array(roll, operator.add) + base = shifted.to_timestamp(how="end") + else: + # integer addition on PeriodIndex is deprecated, + # so we use _time_shift directly + roll = self.n + base = base_period._time_shift(roll).to_timestamp(how="end") + + return base + off + Timedelta(1, "ns") - Timedelta(1, "D") + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + elif self.weekday is None: + return True + return dt.weekday() == self.weekday + + @property + def rule_code(self): + suffix = "" + if self.weekday is not None: + weekday = ccalendar.int_to_weekday[self.weekday] + suffix = f"-{weekday}" + return self._prefix + suffix + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + weekday = None + else: + weekday = ccalendar.weekday_to_int[suffix] + return cls(weekday=weekday) + + +class _WeekOfMonthMixin: + """ + Mixin for methods common to WeekOfMonth and LastWeekOfMonth. + """ + + @apply_wraps + def apply(self, other): + compare_day = self._get_offset_day(other) + + months = self.n + if months > 0 and compare_day > other.day: + months -= 1 + elif months <= 0 and compare_day < other.day: + months += 1 + + shifted = shift_month(other, months, "start") + to_day = self._get_offset_day(shifted) + return liboffsets.shift_day(shifted, to_day - shifted.day) + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day == self._get_offset_day(dt) + + +class WeekOfMonth(_WeekOfMonthMixin, DateOffset): + """ + Describes monthly dates like "the Tuesday of the 2nd week of each month". + + Parameters + ---------- + n : int + week : int {0, 1, 2, 3, ...}, default 0 + A specific integer for the week of the month. + e.g. 0 is 1st week of month, 1 is the 2nd week, etc. + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday. + """ + + _prefix = "WOM" + _adjust_dst = True + _attributes = frozenset(["n", "normalize", "week", "weekday"]) + + def __init__(self, n=1, normalize=False, week=0, weekday=0): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) + object.__setattr__(self, "week", week) + + if self.weekday < 0 or self.weekday > 6: + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") + if self.week < 0 or self.week > 3: + raise ValueError(f"Week must be 0<=week<=3, got {self.week}") + + def _get_offset_day(self, other): + """ + Find the day in the same month as other that has the same + weekday as self.weekday and is the self.week'th such day in the month. 
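+
+        Illustrative sketch (example values; ``week=1, weekday=1`` means the
+        second Tuesday, which is 2020-01-14 for January 2020):
+
+        >>> import pandas as pd
+        >>> from pandas.tseries.offsets import WeekOfMonth
+        >>> pd.Timestamp("2020-01-01") + WeekOfMonth(week=1, weekday=1)
+        Timestamp('2020-01-14 00:00:00')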
+ + Parameters + ---------- + other : datetime + + Returns + ------- + day : int + """ + mstart = datetime(other.year, other.month, 1) + wday = mstart.weekday() + shift_days = (self.weekday - wday) % 7 + return 1 + shift_days + self.week * 7 + + @property + def rule_code(self): + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return f"{self._prefix}-{self.week + 1}{weekday}" + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.") + # TODO: handle n here... + # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) + week = int(suffix[0]) - 1 + weekday = ccalendar.weekday_to_int[suffix[1:]] + return cls(week=week, weekday=weekday) + + +class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): + """ + Describes monthly dates in last week of month like "the last Tuesday of + each month". + + Parameters + ---------- + n : int, default 1 + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday. + """ + + _prefix = "LWOM" + _adjust_dst = True + _attributes = frozenset(["n", "normalize", "weekday"]) + + def __init__(self, n=1, normalize=False, weekday=0): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "weekday", weekday) + + if self.n == 0: + raise ValueError("N cannot be 0") + + if self.weekday < 0 or self.weekday > 6: + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") + + def _get_offset_day(self, other): + """ + Find the day in the same month as other that has the same + weekday as self.weekday and is the last such day in the month. + + Parameters + ---------- + other: datetime + + Returns + ------- + day: int + """ + dim = ccalendar.get_days_in_month(other.year, other.month) + mend = datetime(other.year, other.month, dim) + wday = mend.weekday() + shift_days = (wday - self.weekday) % 7 + return dim - shift_days + + @property + def rule_code(self): + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return f"{self._prefix}-{weekday}" + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.") + # TODO: handle n here... + weekday = ccalendar.weekday_to_int[suffix] + return cls(weekday=weekday) + + +# --------------------------------------------------------------------- +# Quarter-Based Offset Classes + + +class QuarterOffset(DateOffset): + """ + Quarter representation - doesn't call super. + """ + + _default_startingMonth: Optional[int] = None + _from_name_startingMonth: Optional[int] = None + _adjust_dst = True + _attributes = frozenset(["n", "normalize", "startingMonth"]) + # TODO: Consider combining QuarterOffset and YearOffset __init__ at some + # point. 
Also apply_index, is_on_offset, rule_code if + # startingMonth vs month attr names are resolved + + def __init__(self, n=1, normalize=False, startingMonth=None): + BaseOffset.__init__(self, n, normalize) + + if startingMonth is None: + startingMonth = self._default_startingMonth + object.__setattr__(self, "startingMonth", startingMonth) + + def is_anchored(self): + return self.n == 1 and self.startingMonth is not None + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs["startingMonth"] = ccalendar.MONTH_TO_CAL_NUM[suffix] + else: + if cls._from_name_startingMonth is not None: + kwargs["startingMonth"] = cls._from_name_startingMonth + return cls(**kwargs) + + @property + def rule_code(self): + month = ccalendar.MONTH_ALIASES[self.startingMonth] + return f"{self._prefix}-{month}" + + @apply_wraps + def apply(self, other): + # months_since: find the calendar quarter containing other.month, + # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep]. + # Then find the month in that quarter containing an is_on_offset date for + # self. `months_since` is the number of months to shift other.month + # to get to this on-offset month. + months_since = other.month % 3 - self.startingMonth % 3 + qtrs = liboffsets.roll_qtrday( + other, self.n, self.startingMonth, day_opt=self._day_opt, modby=3 + ) + months = qtrs * 3 - months_since + return shift_month(other, months, self._day_opt) + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + mod_month = (dt.month - self.startingMonth) % 3 + return mod_month == 0 and dt.day == self._get_offset_day(dt) + + @apply_index_wraps + def apply_index(self, dtindex): + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.startingMonth, self._day_opt + ) + # TODO: going through __new__ raises on call to _validate_frequency; + # are we passing incorrect freq? + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) + + +class BQuarterEnd(QuarterOffset): + """ + DateOffset increments between business Quarter dates. + + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... + """ + + _outputName = "BusinessQuarterEnd" + _default_startingMonth = 3 + _from_name_startingMonth = 12 + _prefix = "BQ" + _day_opt = "business_end" + + +# TODO: This is basically the same as BQuarterEnd +class BQuarterBegin(QuarterOffset): + _outputName = "BusinessQuarterBegin" + # I suspect this is wrong for *all* of them. + _default_startingMonth = 3 + _from_name_startingMonth = 1 + _prefix = "BQS" + _day_opt = "business_start" + + +class QuarterEnd(QuarterOffset): + """ + DateOffset increments between business Quarter dates. + + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... + """ + + _outputName = "QuarterEnd" + _default_startingMonth = 3 + _prefix = "Q" + _day_opt = "end" + + +class QuarterBegin(QuarterOffset): + _outputName = "QuarterBegin" + _default_startingMonth = 3 + _from_name_startingMonth = 1 + _prefix = "QS" + _day_opt = "start" + + +# --------------------------------------------------------------------- +# Year-Based Offset Classes + + +class YearOffset(DateOffset): + """ + DateOffset that just needs a month. 
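+
+    Illustrative sketch of the quarter and year anchoring above (example
+    values; both offsets use their default anchor months):
+
+    >>> import pandas as pd
+    >>> from pandas.tseries.offsets import QuarterEnd, YearEnd
+    >>> pd.Timestamp("2020-02-14") + QuarterEnd()
+    Timestamp('2020-03-31 00:00:00')
+    >>> pd.Timestamp("2020-02-14") + YearEnd()
+    Timestamp('2020-12-31 00:00:00')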
+ """ + + _adjust_dst = True + _attributes = frozenset(["n", "normalize", "month"]) + + def _get_offset_day(self, other): + # override BaseOffset method to use self.month instead of other.month + # TODO: there may be a more performant way to do this + return liboffsets.get_day_of_month( + other.replace(month=self.month), self._day_opt + ) + + @apply_wraps + def apply(self, other): + years = roll_yearday(other, self.n, self.month, self._day_opt) + months = years * 12 + (self.month - other.month) + return shift_month(other, months, self._day_opt) + + @apply_index_wraps + def apply_index(self, dtindex): + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.month, self._day_opt, modby=12 + ) + # TODO: going through __new__ raises on call to _validate_frequency; + # are we passing incorrect freq? + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.month == self.month and dt.day == self._get_offset_day(dt) + + def __init__(self, n=1, normalize=False, month=None): + BaseOffset.__init__(self, n, normalize) + + month = month if month is not None else self._default_month + object.__setattr__(self, "month", month) + + if self.month < 1 or self.month > 12: + raise ValueError("Month must go from 1 to 12") + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs["month"] = ccalendar.MONTH_TO_CAL_NUM[suffix] + return cls(**kwargs) + + @property + def rule_code(self): + month = ccalendar.MONTH_ALIASES[self.month] + return f"{self._prefix}-{month}" + + +class BYearEnd(YearOffset): + """ + DateOffset increments between business EOM dates. + """ + + _outputName = "BusinessYearEnd" + _default_month = 12 + _prefix = "BA" + _day_opt = "business_end" + + +class BYearBegin(YearOffset): + """ + DateOffset increments between business year begin dates. + """ + + _outputName = "BusinessYearBegin" + _default_month = 1 + _prefix = "BAS" + _day_opt = "business_start" + + +class YearEnd(YearOffset): + """ + DateOffset increments between calendar year ends. + """ + + _default_month = 12 + _prefix = "A" + _day_opt = "end" + + +class YearBegin(YearOffset): + """ + DateOffset increments between calendar year begin dates. + """ + + _default_month = 1 + _prefix = "AS" + _day_opt = "start" + + +# --------------------------------------------------------------------- +# Special Offset Classes + + +class FY5253(DateOffset): + """ + Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. + + It is used by companies that desire that their + fiscal year always end on the same day of the week. + + It is a method of managing accounting periods. + It is a common calendar structure for some industries, + such as retail, manufacturing and parking industry. + + For more information see: + http://en.wikipedia.org/wiki/4-4-5_calendar + + The year may either: + + - end on the last X day of the Y month. + - end on the last X day closest to the last day of the Y month. + + X is a specific day of the week. + Y is a certain month of the year + + Parameters + ---------- + n : int + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday. + + startingMonth : int {1, 2, ... 12}, default 1 + The month in which the fiscal year ends. 
+ + variation : str, default "nearest" + Method of employing 4-4-5 calendar. + + There are two options: + + - "nearest" means year end is **weekday** closest to last day of month in year. + - "last" means year end is final **weekday** of the final month in fiscal year. + """ + + _prefix = "RE" + _adjust_dst = True + _attributes = frozenset(["weekday", "startingMonth", "variation"]) + + def __init__( + self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest" + ): + BaseOffset.__init__(self, n, normalize) + object.__setattr__(self, "startingMonth", startingMonth) + object.__setattr__(self, "weekday", weekday) + + object.__setattr__(self, "variation", variation) + + if self.n == 0: + raise ValueError("N cannot be 0") + + if self.variation not in ["nearest", "last"]: + raise ValueError(f"{self.variation} is not a valid variation") + + def is_anchored(self): + return ( + self.n == 1 and self.startingMonth is not None and self.weekday is not None + ) + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + dt = datetime(dt.year, dt.month, dt.day) + year_end = self.get_year_end(dt) + + if self.variation == "nearest": + # We have to check the year end of "this" cal year AND the previous + return year_end == dt or self.get_year_end(shift_month(dt, -1, None)) == dt + else: + return year_end == dt + + @apply_wraps + def apply(self, other): + norm = Timestamp(other).normalize() + + n = self.n + prev_year = self.get_year_end(datetime(other.year - 1, self.startingMonth, 1)) + cur_year = self.get_year_end(datetime(other.year, self.startingMonth, 1)) + next_year = self.get_year_end(datetime(other.year + 1, self.startingMonth, 1)) + + prev_year = conversion.localize_pydatetime(prev_year, other.tzinfo) + cur_year = conversion.localize_pydatetime(cur_year, other.tzinfo) + next_year = conversion.localize_pydatetime(next_year, other.tzinfo) + + # Note: next_year.year == other.year + 1, so we will always + # have other < next_year + if norm == prev_year: + n -= 1 + elif norm == cur_year: + pass + elif n > 0: + if norm < prev_year: + n -= 2 + elif prev_year < norm < cur_year: + n -= 1 + elif cur_year < norm < next_year: + pass + else: + if cur_year < norm < next_year: + n += 1 + elif prev_year < norm < cur_year: + pass + elif ( + norm.year == prev_year.year + and norm < prev_year + and prev_year - norm <= timedelta(6) + ): + # GH#14774, error when next_year.year == cur_year.year + # e.g. 
prev_year == datetime(2004, 1, 3), + # other == datetime(2004, 1, 1) + n -= 1 + else: + assert False + + shifted = datetime(other.year + n, self.startingMonth, 1) + result = self.get_year_end(shifted) + result = datetime( + result.year, + result.month, + result.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) + return result + + def get_year_end(self, dt): + assert dt.tzinfo is None + + dim = ccalendar.get_days_in_month(dt.year, self.startingMonth) + target_date = datetime(dt.year, self.startingMonth, dim) + wkday_diff = self.weekday - target_date.weekday() + if wkday_diff == 0: + # year_end is the same for "last" and "nearest" cases + return target_date + + if self.variation == "last": + days_forward = (wkday_diff % 7) - 7 + + # days_forward is always negative, so we always end up + # in the same year as dt + return target_date + timedelta(days=days_forward) + else: + # variation == "nearest": + days_forward = wkday_diff % 7 + if days_forward <= 3: + # The upcoming self.weekday is closer than the previous one + return target_date + timedelta(days_forward) + else: + # The previous self.weekday is closer than the upcoming one + return target_date + timedelta(days_forward - 7) + + @property + def rule_code(self): + prefix = self._prefix + suffix = self.get_rule_code_suffix() + return f"{prefix}-{suffix}" + + def _get_suffix_prefix(self): + if self.variation == "nearest": + return "N" + else: + return "L" + + def get_rule_code_suffix(self): + prefix = self._get_suffix_prefix() + month = ccalendar.MONTH_ALIASES[self.startingMonth] + weekday = ccalendar.int_to_weekday[self.weekday] + return f"{prefix}-{month}-{weekday}" + + @classmethod + def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): + if varion_code == "N": + variation = "nearest" + elif varion_code == "L": + variation = "last" + else: + raise ValueError(f"Unable to parse varion_code: {varion_code}") + + startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] + weekday = ccalendar.weekday_to_int[weekday_code] + + return { + "weekday": weekday, + "startingMonth": startingMonth, + "variation": variation, + } + + @classmethod + def _from_name(cls, *args): + return cls(**cls._parse_suffix(*args)) + + +class FY5253Quarter(DateOffset): + """ + DateOffset increments between business quarter dates + for 52-53 week fiscal year (also known as a 4-4-5 calendar). + + It is used by companies that desire that their + fiscal year always end on the same day of the week. + + It is a method of managing accounting periods. + It is a common calendar structure for some industries, + such as retail, manufacturing and parking industry. + + For more information see: + http://en.wikipedia.org/wiki/4-4-5_calendar + + The year may either: + + - end on the last X day of the Y month. + - end on the last X day closest to the last day of the Y month. + + X is a specific day of the week. + Y is a certain month of the year + + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... + + Parameters + ---------- + n : int + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday. + + startingMonth : int {1, 2, ..., 12}, default 1 + The month in which fiscal years end. 
+ + qtr_with_extra_week : int {1, 2, 3, 4}, default 1 + The quarter number that has the leap or 14 week when needed. + + variation : str, default "nearest" + Method of employing 4-4-5 calendar. + + There are two options: + + - "nearest" means year end is **weekday** closest to last day of month in year. + - "last" means year end is final **weekday** of the final month in fiscal year. + """ + + _prefix = "REQ" + _adjust_dst = True + _attributes = frozenset( + ["weekday", "startingMonth", "qtr_with_extra_week", "variation"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekday=0, + startingMonth=1, + qtr_with_extra_week=1, + variation="nearest", + ): + BaseOffset.__init__(self, n, normalize) + + object.__setattr__(self, "startingMonth", startingMonth) + object.__setattr__(self, "weekday", weekday) + object.__setattr__(self, "qtr_with_extra_week", qtr_with_extra_week) + object.__setattr__(self, "variation", variation) + + if self.n == 0: + raise ValueError("N cannot be 0") + + @cache_readonly + def _offset(self): + return FY5253( + startingMonth=self.startingMonth, + weekday=self.weekday, + variation=self.variation, + ) + + def is_anchored(self): + return self.n == 1 and self._offset.is_anchored() + + def _rollback_to_year(self, other): + """ + Roll `other` back to the most recent date that was on a fiscal year + end. + + Return the date of that year-end, the number of full quarters + elapsed between that year-end and other, and the remaining Timedelta + since the most recent quarter-end. + + Parameters + ---------- + other : datetime or Timestamp + + Returns + ------- + tuple of + prev_year_end : Timestamp giving most recent fiscal year end + num_qtrs : int + tdelta : Timedelta + """ + num_qtrs = 0 + + norm = Timestamp(other).tz_localize(None) + start = self._offset.rollback(norm) + # Note: start <= norm and self._offset.is_on_offset(start) + + if start < norm: + # roll adjustment + qtr_lens = self.get_weeks(norm) + + # check thet qtr_lens is consistent with self._offset addition + end = liboffsets.shift_day(start, days=7 * sum(qtr_lens)) + assert self._offset.is_on_offset(end), (start, end, qtr_lens) + + tdelta = norm - start + for qlen in qtr_lens: + if qlen * 7 <= tdelta.days: + num_qtrs += 1 + tdelta -= Timedelta(days=qlen * 7) + else: + break + else: + tdelta = Timedelta(0) + + # Note: we always have tdelta.value >= 0 + return start, num_qtrs, tdelta + + @apply_wraps + def apply(self, other): + # Note: self.n == 0 is not allowed. + n = self.n + + prev_year_end, num_qtrs, tdelta = self._rollback_to_year(other) + res = prev_year_end + n += num_qtrs + if self.n <= 0 and tdelta.value > 0: + n += 1 + + # Possible speedup by handling years first. + years = n // 4 + if years: + res += self._offset * years + n -= years * 4 + + # Add an extra day to make *sure* we are getting the quarter lengths + # for the upcoming year, not the previous year + qtr_lens = self.get_weeks(res + Timedelta(days=1)) + + # Note: we always have 0 <= n < 4 + weeks = sum(qtr_lens[:n]) + if weeks: + res = liboffsets.shift_day(res, days=weeks * 7) + + return res + + def get_weeks(self, dt): + ret = [13] * 4 + + year_has_extra_week = self.year_has_extra_week(dt) + + if year_has_extra_week: + ret[self.qtr_with_extra_week - 1] = 14 + + return ret + + def year_has_extra_week(self, dt): + # Avoid round-down errors --> normalize to get + # e.g. 
'370D' instead of '360D23H' + norm = Timestamp(dt).normalize().tz_localize(None) + + next_year_end = self._offset.rollforward(norm) + prev_year_end = norm - self._offset + weeks_in_year = (next_year_end - prev_year_end).days / 7 + assert weeks_in_year in [52, 53], weeks_in_year + return weeks_in_year == 53 + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + if self._offset.is_on_offset(dt): + return True + + next_year_end = dt - self._offset + + qtr_lens = self.get_weeks(dt) + + current = next_year_end + for qtr_len in qtr_lens: + current = liboffsets.shift_day(current, days=qtr_len * 7) + if dt == current: + return True + return False + + @property + def rule_code(self): + suffix = self._offset.get_rule_code_suffix() + qtr = self.qtr_with_extra_week + return f"{self._prefix}-{suffix}-{qtr}" + + @classmethod + def _from_name(cls, *args): + return cls( + **dict(FY5253._parse_suffix(*args[:-1]), qtr_with_extra_week=int(args[-1])) + ) + + +class Easter(DateOffset): + """ + DateOffset for the Easter holiday using logic defined in dateutil. + + Right now uses the revised method which is valid in years 1583-4099. + """ + + _adjust_dst = True + _attributes = frozenset(["n", "normalize"]) + + __init__ = BaseOffset.__init__ + + @apply_wraps + def apply(self, other): + current_easter = easter(other.year) + current_easter = datetime( + current_easter.year, current_easter.month, current_easter.day + ) + current_easter = conversion.localize_pydatetime(current_easter, other.tzinfo) + + n = self.n + if n >= 0 and other < current_easter: + n -= 1 + elif n < 0 and other > current_easter: + n += 1 + # TODO: Why does this handle the 0 case the opposite of others? + + # NOTE: easter returns a datetime.date so we have to convert to type of + # other + new = easter(other.year + n) + new = datetime( + new.year, + new.month, + new.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) + return new + + def is_on_offset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return date(dt.year, dt.month, dt.day) == easter(dt.year) + + +# --------------------------------------------------------------------- +# Ticks + + +def _tick_comp(op): + assert op not in [operator.eq, operator.ne] + + def f(self, other): + try: + return op(self.delta, other.delta) + except AttributeError: + # comparing with a non-Tick object + raise TypeError( + f"Invalid comparison between {type(self).__name__} " + f"and {type(other).__name__}" + ) + + f.__name__ = f"__{op.__name__}__" + return f + + +class Tick(liboffsets._Tick, SingleConstructorOffset): + _inc = Timedelta(microseconds=1000) + _prefix = "undefined" + _attributes = frozenset(["n", "normalize"]) + + def __init__(self, n=1, normalize=False): + BaseOffset.__init__(self, n, normalize) + if normalize: + raise ValueError( + "Tick offset with `normalize=True` are not allowed." 
+ ) # GH#21427 + + __gt__ = _tick_comp(operator.gt) + __ge__ = _tick_comp(operator.ge) + __lt__ = _tick_comp(operator.lt) + __le__ = _tick_comp(operator.le) + + def __add__(self, other): + if isinstance(other, Tick): + if type(self) == type(other): + return type(self)(self.n + other.n) + else: + return _delta_to_tick(self.delta + other.delta) + elif isinstance(other, Period): + return other + self + try: + return self.apply(other) + except ApplyTypeError: + return NotImplemented + except OverflowError: + raise OverflowError( + f"the add operation between {self} and {other} will overflow" + ) + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str): + from pandas.tseries.frequencies import to_offset + + try: + # GH#23524 if to_offset fails, we are dealing with an + # incomparable type so == is False and != is True + other = to_offset(other) + except ValueError: + # e.g. "infer" + return False + + if isinstance(other, Tick): + return self.delta == other.delta + else: + return False + + # This is identical to DateOffset.__hash__, but has to be redefined here + # for Python 3, because we've redefined __eq__. + def __hash__(self): + return hash(self._params) + + def __ne__(self, other): + if isinstance(other, str): + from pandas.tseries.frequencies import to_offset + + try: + # GH#23524 if to_offset fails, we are dealing with an + # incomparable type so == is False and != is True + other = to_offset(other) + except ValueError: + # e.g. "infer" + return True + + if isinstance(other, Tick): + return self.delta != other.delta + else: + return True + + @property + def delta(self): + return self.n * self._inc + + @property + def nanos(self): + return delta_to_nanoseconds(self.delta) + + # TODO: Should Tick have its own apply_index? + def apply(self, other): + # Timestamp can handle tz and nano sec, thus no need to use apply_wraps + if isinstance(other, Timestamp): + + # GH 15126 + # in order to avoid a recursive + # call of __add__ and __radd__ if there is + # an exception, when we call using the + operator, + # we directly call the known method + result = other.__add__(self) + if result is NotImplemented: + raise OverflowError + return result + elif isinstance(other, (datetime, np.datetime64, date)): + return as_timestamp(other) + self + + if isinstance(other, timedelta): + return other + self.delta + elif isinstance(other, type(self)): + return type(self)(self.n + other.n) + + raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") + + def is_anchored(self): + return False + + +def _delta_to_tick(delta): + if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: + # nanoseconds only for pd.Timedelta + if delta.seconds == 0: + return Day(delta.days) + else: + seconds = delta.days * 86400 + delta.seconds + if seconds % 3600 == 0: + return Hour(seconds / 3600) + elif seconds % 60 == 0: + return Minute(seconds / 60) + else: + return Second(seconds) + else: + nanos = delta_to_nanoseconds(delta) + if nanos % 1000000 == 0: + return Milli(nanos // 1000000) + elif nanos % 1000 == 0: + return Micro(nanos // 1000) + else: # pragma: no cover + return Nano(nanos) + + +class Day(Tick): + _inc = Timedelta(days=1) + _prefix = "D" + + +class Hour(Tick): + _inc = Timedelta(hours=1) + _prefix = "H" + + +class Minute(Tick): + _inc = Timedelta(minutes=1) + _prefix = "T" + + +class Second(Tick): + _inc = Timedelta(seconds=1) + _prefix = "S" + + +class Milli(Tick): + _inc = Timedelta(milliseconds=1) + _prefix = "L" + + +class Micro(Tick): + _inc = Timedelta(microseconds=1) + _prefix = 
"U" + + +class Nano(Tick): + _inc = Timedelta(nanoseconds=1) + _prefix = "N" + + +BDay = BusinessDay +BMonthEnd = BusinessMonthEnd +BMonthBegin = BusinessMonthBegin +CBMonthEnd = CustomBusinessMonthEnd +CBMonthBegin = CustomBusinessMonthBegin +CDay = CustomBusinessDay + +# --------------------------------------------------------------------- + + +def generate_range(start=None, end=None, periods=None, offset=BDay()): + """ + Generates a sequence of dates corresponding to the specified time + offset. Similar to dateutil.rrule except uses pandas DateOffset + objects to represent time increments. + + Parameters + ---------- + start : datetime, (default None) + end : datetime, (default None) + periods : int, (default None) + offset : DateOffset, (default BDay()) + + Notes + ----- + * This method is faster for generating weekdays than dateutil.rrule + * At least two of (start, end, periods) must be specified. + * If both start and end are specified, the returned dates will + satisfy start <= date <= end. + + Returns + ------- + dates : generator object + """ + from pandas.tseries.frequencies import to_offset + + offset = to_offset(offset) + + start = Timestamp(start) + start = start if start is not NaT else None + end = Timestamp(end) + end = end if end is not NaT else None + + if start and not offset.is_on_offset(start): + start = offset.rollforward(start) + + elif end and not offset.is_on_offset(end): + end = offset.rollback(end) + + if periods is None and end < start and offset.n >= 0: + end = None + periods = 0 + + if end is None: + end = start + (periods - 1) * offset + + if start is None: + start = end - (periods - 1) * offset + + cur = start + if offset.n >= 0: + while cur <= end: + yield cur + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + next_date = offset.apply(cur) + if next_date <= cur: + raise ValueError(f"Offset {offset} did not increment date") + cur = next_date + else: + while cur >= end: + yield cur + + if cur == end: + # GH#24252 avoid overflows by not performing the addition + # in offset.apply unless we have to + break + + # faster than cur + offset + next_date = offset.apply(cur) + if next_date >= cur: + raise ValueError(f"Offset {offset} did not decrement date") + cur = next_date + + +prefix_mapping = { + offset._prefix: offset + for offset in [ + YearBegin, # 'AS' + YearEnd, # 'A' + BYearBegin, # 'BAS' + BYearEnd, # 'BA' + BusinessDay, # 'B' + BusinessMonthBegin, # 'BMS' + BusinessMonthEnd, # 'BM' + BQuarterEnd, # 'BQ' + BQuarterBegin, # 'BQS' + BusinessHour, # 'BH' + CustomBusinessDay, # 'C' + CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthBegin, # 'CBMS' + CustomBusinessHour, # 'CBH' + MonthEnd, # 'M' + MonthBegin, # 'MS' + Nano, # 'N' + SemiMonthEnd, # 'SM' + SemiMonthBegin, # 'SMS' + Week, # 'W' + Second, # 'S' + Minute, # 'T' + Micro, # 'U' + QuarterEnd, # 'Q' + QuarterBegin, # 'QS' + Milli, # 'L' + Hour, # 'H' + Day, # 'D' + WeekOfMonth, # 'WOM' + FY5253, + FY5253Quarter, + ] +} diff --git a/venv/Lib/site-packages/pandas/util/__init__.py b/venv/Lib/site-packages/pandas/util/__init__.py new file mode 100644 index 0000000..b5271db --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/__init__.py @@ -0,0 +1,30 @@ +from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa + +from pandas import compat +from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa + +# compatibility for import pandas; pandas.util.testing 
+ +if compat.PY37: + + def __getattr__(name): + if name == "testing": + import pandas.util.testing + + return pandas.util.testing + else: + raise AttributeError(f"module 'pandas.util' has no attribute '{name}'") + + +else: + + class _testing: + def __getattr__(self, item): + import pandas.util.testing + + return getattr(pandas.util.testing, item) + + testing = _testing() + + +del compat diff --git a/venv/Lib/site-packages/pandas/util/_decorators.py b/venv/Lib/site-packages/pandas/util/_decorators.py new file mode 100644 index 0000000..d10d3a1 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_decorators.py @@ -0,0 +1,342 @@ +from functools import wraps +import inspect +from textwrap import dedent +from typing import ( + Any, + Callable, + List, + Mapping, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) +import warnings + +from pandas._libs.properties import cache_readonly # noqa + +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + + +def deprecate( + name: str, + alternative: Callable[..., Any], + version: str, + alt_name: Optional[str] = None, + klass: Optional[Type[Warning]] = None, + stacklevel: int = 2, + msg: Optional[str] = None, +) -> Callable[..., Any]: + """ + Return a new function that emits a deprecation warning on use. + + To use this method for a deprecated function, another function + `alternative` with the same signature must exist. The deprecated + function will emit a deprecation warning, and in the docstring + it will contain the deprecation directive with the provided version + so it can be detected for future removal. + + Parameters + ---------- + name : str + Name of function to deprecate. + alternative : func + Function to use instead. + version : str + Version of pandas in which the method has been deprecated. + alt_name : str, optional + Name to use in preference of alternative.__name__. + klass : Warning, default FutureWarning + stacklevel : int, default 2 + msg : str + The message to display in the warning. + Default is '{name} is deprecated. Use {alt_name} instead.' + """ + + alt_name = alt_name or alternative.__name__ + klass = klass or FutureWarning + warning_msg = msg or f"{name} is deprecated, use {alt_name} instead" + + @wraps(alternative) + def wrapper(*args, **kwargs) -> Callable[..., Any]: + warnings.warn(warning_msg, klass, stacklevel=stacklevel) + return alternative(*args, **kwargs) + + # adding deprecated directive to the docstring + msg = msg or f"Use `{alt_name}` instead." + doc_error_msg = ( + "deprecate needs a correctly formatted docstring in " + "the target function (should have a one liner short " + "summary, and opening quotes should be in their own " + f"line). Found:\n{alternative.__doc__}" + ) + + # when python is running in optimized mode (i.e. `-OO`), docstrings are + # removed, so we check that a docstring with correct formatting is used + # but we allow empty docstrings + if alternative.__doc__: + if alternative.__doc__.count("\n") < 3: + raise AssertionError(doc_error_msg) + empty1, summary, empty2, doc = alternative.__doc__.split("\n", 3) + if empty1 or empty2 and not summary: + raise AssertionError(doc_error_msg) + wrapper.__doc__ = dedent( + f""" + {summary.strip()} + + .. 
deprecated:: {version} + {msg} + + {dedent(doc)}""" + ) + + return wrapper + + +def deprecate_kwarg( + old_arg_name: str, + new_arg_name: Optional[str], + mapping: Optional[Union[Mapping[Any, Any], Callable[[Any], Any]]] = None, + stacklevel: int = 2, +) -> Callable[..., Any]: + """ + Decorator to deprecate a keyword argument of a function. + + Parameters + ---------- + old_arg_name : str + Name of argument in function to deprecate + new_arg_name : str or None + Name of preferred argument in function. Use None to raise warning that + ``old_arg_name`` keyword is deprecated. + mapping : dict or callable + If mapping is present, use it to translate old arguments to + new arguments. A callable must do its own value checking; + values not found in a dict will be forwarded unchanged. + + Examples + -------- + The following deprecates 'cols', using 'columns' instead + + >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') + ... def f(columns=''): + ... print(columns) + ... + >>> f(columns='should work ok') + should work ok + + >>> f(cols='should raise warning') + FutureWarning: cols is deprecated, use columns instead + warnings.warn(msg, FutureWarning) + should raise warning + + >>> f(cols='should error', columns="can\'t pass do both") + TypeError: Can only specify 'cols' or 'columns', not both + + >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False}) + ... def f(new=False): + ... print('yes!' if new else 'no!') + ... + >>> f(old='yes') + FutureWarning: old='yes' is deprecated, use new=True instead + warnings.warn(msg, FutureWarning) + yes! + + To raise a warning that a keyword will be removed entirely in the future + + >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name=None) + ... def f(cols='', another_param=''): + ... print(cols) + ... + >>> f(cols='should raise warning') + FutureWarning: the 'cols' keyword is deprecated and will be removed in a + future version please takes steps to stop use of 'cols' + should raise warning + >>> f(another_param='should not raise warning') + should not raise warning + + >>> f(cols='should raise warning', another_param='') + FutureWarning: the 'cols' keyword is deprecated and will be removed in a + future version please takes steps to stop use of 'cols' + should raise warning + """ + + if mapping is not None and not hasattr(mapping, "get") and not callable(mapping): + raise TypeError( + "mapping from old to new argument values must be dict or callable!" + ) + + def _deprecate_kwarg(func: F) -> F: + @wraps(func) + def wrapper(*args, **kwargs) -> Callable[..., Any]: + old_arg_value = kwargs.pop(old_arg_name, None) + + if old_arg_value is not None: + if new_arg_name is None: + msg = ( + f"the {repr(old_arg_name)} keyword is deprecated and " + "will be removed in a future version. 
Please take " + f"steps to stop the use of {repr(old_arg_name)}" + ) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + kwargs[old_arg_name] = old_arg_value + return func(*args, **kwargs) + + elif mapping is not None: + if callable(mapping): + new_arg_value = mapping(old_arg_value) + else: + new_arg_value = mapping.get(old_arg_value, old_arg_value) + msg = ( + f"the {old_arg_name}={repr(old_arg_value)} keyword is " + "deprecated, use " + f"{new_arg_name}={repr(new_arg_value)} instead" + ) + else: + new_arg_value = old_arg_value + msg = ( + f"the {repr(old_arg_name)}' keyword is deprecated, " + f"use {repr(new_arg_name)} instead" + ) + + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + if kwargs.get(new_arg_name) is not None: + msg = ( + f"Can only specify {repr(old_arg_name)} " + f"or {repr(new_arg_name)}, not both" + ) + raise TypeError(msg) + else: + kwargs[new_arg_name] = new_arg_value + return func(*args, **kwargs) + + return cast(F, wrapper) + + return _deprecate_kwarg + + +def rewrite_axis_style_signature( + name: str, extra_params: List[Tuple[str, Any]] +) -> Callable[..., Any]: + def decorate(func: F) -> F: + @wraps(func) + def wrapper(*args, **kwargs) -> Callable[..., Any]: + return func(*args, **kwargs) + + kind = inspect.Parameter.POSITIONAL_OR_KEYWORD + params = [ + inspect.Parameter("self", kind), + inspect.Parameter(name, kind, default=None), + inspect.Parameter("index", kind, default=None), + inspect.Parameter("columns", kind, default=None), + inspect.Parameter("axis", kind, default=None), + ] + + for pname, default in extra_params: + params.append(inspect.Parameter(pname, kind, default=default)) + + sig = inspect.Signature(params) + + # https://github.com/python/typing/issues/598 + func.__signature__ = sig # type: ignore + return cast(F, wrapper) + + return decorate + + +# Substitution and Appender are derived from matplotlib.docstring (1.1.0) +# module http://matplotlib.org/users/license.html + + +class Substitution: + """ + A decorator to take a function's docstring and perform string + substitution on it. + + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter) + + Usage: construct a docstring.Substitution with a sequence or + dictionary suitable for performing substitution; then + decorate a suitable function with the constructed object. e.g. + + sub_author_name = Substitution(author='Jason') + + @sub_author_name + def some_function(x): + "%(author)s wrote this function" + + # note that some_function.__doc__ is now "Jason wrote this function" + + One can also use positional arguments. + + sub_first_last_names = Substitution('Edgar Allen', 'Poe') + + @sub_first_last_names + def some_function(x): + "%s %s wrote the Raven" + """ + + def __init__(self, *args, **kwargs): + if args and kwargs: + raise AssertionError("Only positional or keyword args are allowed") + + self.params = args or kwargs + + def __call__(self, func: F) -> F: + func.__doc__ = func.__doc__ and func.__doc__ % self.params + return func + + def update(self, *args, **kwargs) -> None: + """ + Update self.params with supplied args. + """ + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) + + +class Appender: + """ + A function decorator that will append an addendum to the docstring + of the target function. + + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter). 
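# ---------------------------------------------------------------------------
# Illustrative sketch (editor's aside, not part of the vendored file): the
# Substitution and Appender decorators described here are typically stacked,
# with Substitution filling docstring placeholders and Appender adding an
# addendum afterwards. The function and author names below are hypothetical.
from pandas.util._decorators import Appender, Substitution

@Appender("\n.. note:: extra text added by Appender")
@Substitution(author="Jane")
def example(x):
    """%(author)s wrote this function."""
    return x

print(example.__doc__)  # "Jane wrote this function." plus the appended note
# ---------------------------------------------------------------------------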
+ + Usage: construct a docstring.Appender with a string to be joined to + the original docstring. An optional 'join' parameter may be supplied + which will be used to join the docstring and addendum. e.g. + + add_copyright = Appender("Copyright (c) 2009", join='\n') + + @add_copyright + def my_dog(has='fleas'): + "This docstring will have a copyright below" + pass + """ + + addendum: Optional[str] + + def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): + if indents > 0: + self.addendum = indent(addendum, indents=indents) + else: + self.addendum = addendum + self.join = join + + def __call__(self, func: F) -> F: + func.__doc__ = func.__doc__ if func.__doc__ else "" + self.addendum = self.addendum if self.addendum else "" + docitems = [func.__doc__, self.addendum] + func.__doc__ = dedent(self.join.join(docitems)) + return func + + +def indent(text: Optional[str], indents: int = 1) -> str: + if not text or not isinstance(text, str): + return "" + jointext = "".join(["\n"] + [" "] * indents) + return jointext.join(text.split("\n")) diff --git a/venv/Lib/site-packages/pandas/util/_depr_module.py b/venv/Lib/site-packages/pandas/util/_depr_module.py new file mode 100644 index 0000000..5694ca2 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_depr_module.py @@ -0,0 +1,107 @@ +""" +This module houses a utility class for mocking deprecated modules. +It is for internal use only and should not be used beyond this purpose. +""" + +import importlib +from typing import Iterable +import warnings + + +class _DeprecatedModule: + """ + Class for mocking deprecated modules. + + Parameters + ---------- + deprmod : name of module to be deprecated. + deprmodto : name of module as a replacement, optional. + If not given, the __module__ attribute will + be used when needed. + removals : objects or methods in module that will no longer be + accessible once module is removed. + moved : dict, optional + dictionary of function name -> new location for moved + objects + """ + + def __init__(self, deprmod, deprmodto=None, removals=None, moved=None): + self.deprmod = deprmod + self.deprmodto = deprmodto + self.removals = removals + if self.removals is not None: + self.removals = frozenset(self.removals) + self.moved = moved + + # For introspection purposes. 
+ self.self_dir = frozenset(dir(type(self))) + + def __dir__(self) -> Iterable[str]: + deprmodule = self._import_deprmod() + return dir(deprmodule) + + def __repr__(self) -> str: + deprmodule = self._import_deprmod() + return repr(deprmodule) + + __str__ = __repr__ + + def __getattr__(self, name: str): + if name in self.self_dir: + return object.__getattribute__(self, name) + + try: + deprmodule = self._import_deprmod(self.deprmod) + except ImportError: + if self.deprmodto is None: + raise + + # a rename + deprmodule = self._import_deprmod(self.deprmodto) + + obj = getattr(deprmodule, name) + + if self.removals is not None and name in self.removals: + warnings.warn( + f"{self.deprmod}.{name} is deprecated and will be removed in " + "a future version.", + FutureWarning, + stacklevel=2, + ) + elif self.moved is not None and name in self.moved: + warnings.warn( + f"{self.deprmod} is deprecated and will be removed in " + f"a future version.\nYou can access {name} as {self.moved[name]}", + FutureWarning, + stacklevel=2, + ) + else: + deprmodto = self.deprmodto + if deprmodto is False: + warnings.warn( + f"{self.deprmod}.{name} is deprecated and will be removed in " + "a future version.", + FutureWarning, + stacklevel=2, + ) + else: + if deprmodto is None: + deprmodto = obj.__module__ + # The object is actually located in another module. + warnings.warn( + f"{self.deprmod}.{name} is deprecated. Please use " + f"{deprmodto}.{name} instead.", + FutureWarning, + stacklevel=2, + ) + + return obj + + def _import_deprmod(self, mod=None): + if mod is None: + mod = self.deprmod + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + deprmodule = importlib.import_module(mod) + return deprmodule diff --git a/venv/Lib/site-packages/pandas/util/_doctools.py b/venv/Lib/site-packages/pandas/util/_doctools.py new file mode 100644 index 0000000..8fd4566 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_doctools.py @@ -0,0 +1,193 @@ +from typing import Optional, Tuple + +import numpy as np + +import pandas as pd + + +class TablePlotter: + """ + Layout some DataFrames in vertical/horizontal layout for explanation. + Used in merging.rst + """ + + def __init__( + self, + cell_width: float = 0.37, + cell_height: float = 0.25, + font_size: float = 7.5, + ): + self.cell_width = cell_width + self.cell_height = cell_height + self.font_size = font_size + + def _shape(self, df: pd.DataFrame) -> Tuple[int, int]: + """ + Calculate table chape considering index levels. + """ + row, col = df.shape + return row + df.columns.nlevels, col + df.index.nlevels + + def _get_cells(self, left, right, vertical) -> Tuple[int, int]: + """ + Calculate appropriate figure size based on left and right data. + """ + if vertical: + # calculate required number of cells + vcells = max(sum(self._shape(l)[0] for l in left), self._shape(right)[0]) + hcells = max(self._shape(l)[1] for l in left) + self._shape(right)[1] + else: + vcells = max([self._shape(l)[0] for l in left] + [self._shape(right)[0]]) + hcells = sum([self._shape(l)[1] for l in left] + [self._shape(right)[1]]) + return hcells, vcells + + def plot(self, left, right, labels=None, vertical: bool = True): + """ + Plot left / right DataFrames in specified layout. + + Parameters + ---------- + left : list of DataFrames before operation is applied + right : DataFrame of operation result + labels : list of str to be drawn as titles of left DataFrames + vertical : bool, default True + If True, use vertical layout. 
If False, use horizontal layout. + """ + import matplotlib.pyplot as plt + import matplotlib.gridspec as gridspec + + if not isinstance(left, list): + left = [left] + left = [self._conv(l) for l in left] + right = self._conv(right) + + hcells, vcells = self._get_cells(left, right, vertical) + + if vertical: + figsize = self.cell_width * hcells, self.cell_height * vcells + else: + # include margin for titles + figsize = self.cell_width * hcells, self.cell_height * vcells + fig = plt.figure(figsize=figsize) + + if vertical: + gs = gridspec.GridSpec(len(left), hcells) + # left + max_left_cols = max(self._shape(l)[1] for l in left) + max_left_rows = max(self._shape(l)[0] for l in left) + for i, (l, label) in enumerate(zip(left, labels)): + ax = fig.add_subplot(gs[i, 0:max_left_cols]) + self._make_table(ax, l, title=label, height=1.0 / max_left_rows) + # right + ax = plt.subplot(gs[:, max_left_cols:]) + self._make_table(ax, right, title="Result", height=1.05 / vcells) + fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95) + else: + max_rows = max(self._shape(df)[0] for df in left + [right]) + height = 1.0 / np.max(max_rows) + gs = gridspec.GridSpec(1, hcells) + # left + i = 0 + for l, label in zip(left, labels): + sp = self._shape(l) + ax = fig.add_subplot(gs[0, i : i + sp[1]]) + self._make_table(ax, l, title=label, height=height) + i += sp[1] + # right + ax = plt.subplot(gs[0, i:]) + self._make_table(ax, right, title="Result", height=height) + fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95) + + return fig + + def _conv(self, data): + """ + Convert each input to appropriate for table outplot. + """ + if isinstance(data, pd.Series): + if data.name is None: + data = data.to_frame(name="") + else: + data = data.to_frame() + data = data.fillna("NaN") + return data + + def _insert_index(self, data): + # insert is destructive + data = data.copy() + idx_nlevels = data.index.nlevels + if idx_nlevels == 1: + data.insert(0, "Index", data.index) + else: + for i in range(idx_nlevels): + data.insert(i, f"Index{i}", data.index._get_level_values(i)) + + col_nlevels = data.columns.nlevels + if col_nlevels > 1: + col = data.columns._get_level_values(0) + values = [ + data.columns._get_level_values(i).values for i in range(1, col_nlevels) + ] + col_df = pd.DataFrame(values) + data.columns = col_df.columns + data = pd.concat([col_df, data]) + data.columns = col + return data + + def _make_table(self, ax, df, title: str, height: Optional[float] = None): + if df is None: + ax.set_visible(False) + return + + import pandas.plotting as plotting + + idx_nlevels = df.index.nlevels + col_nlevels = df.columns.nlevels + # must be convert here to get index levels for colorization + df = self._insert_index(df) + tb = plotting.table(ax, df, loc=9) + tb.set_fontsize(self.font_size) + + if height is None: + height = 1.0 / (len(df) + 1) + + props = tb.properties() + for (r, c), cell in props["celld"].items(): + if c == -1: + cell.set_visible(False) + elif r < col_nlevels and c < idx_nlevels: + cell.set_visible(False) + elif r < col_nlevels or c < idx_nlevels: + cell.set_facecolor("#AAAAAA") + cell.set_height(height) + + ax.set_title(title, size=self.font_size) + ax.axis("off") + + +if __name__ == "__main__": + import matplotlib.pyplot as plt + + p = TablePlotter() + + df1 = pd.DataFrame({"A": [10, 11, 12], "B": [20, 21, 22], "C": [30, 31, 32]}) + df2 = pd.DataFrame({"A": [10, 12], "C": [30, 32]}) + + p.plot([df1, df2], pd.concat([df1, df2]), labels=["df1", "df2"], vertical=True) + plt.show() + + df3 
= pd.DataFrame({"X": [10, 12], "Z": [30, 32]}) + + p.plot( + [df1, df3], pd.concat([df1, df3], axis=1), labels=["df1", "df2"], vertical=False + ) + plt.show() + + idx = pd.MultiIndex.from_tuples( + [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")] + ) + col = pd.MultiIndex.from_tuples([(1, "A"), (1, "B")]) + df3 = pd.DataFrame({"v1": [1, 2, 3, 4, 5, 6], "v2": [5, 6, 7, 8, 9, 10]}, index=idx) + df3.columns = col + p.plot(df3, df3, labels=["df3"]) + plt.show() diff --git a/venv/Lib/site-packages/pandas/util/_exceptions.py b/venv/Lib/site-packages/pandas/util/_exceptions.py new file mode 100644 index 0000000..0723a37 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_exceptions.py @@ -0,0 +1,19 @@ +import contextlib +from typing import Tuple + + +@contextlib.contextmanager +def rewrite_exception(old_name: str, new_name: str): + """ + Rewrite the message of an exception. + """ + try: + yield + except Exception as err: + msg = err.args[0] + msg = msg.replace(old_name, new_name) + args: Tuple[str, ...] = (msg,) + if len(err.args) > 1: + args = args + err.args[1:] + err.args = args + raise diff --git a/venv/Lib/site-packages/pandas/util/_print_versions.py b/venv/Lib/site-packages/pandas/util/_print_versions.py new file mode 100644 index 0000000..2801a2b --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_print_versions.py @@ -0,0 +1,150 @@ +import codecs +import json +import locale +import os +import platform +import struct +import subprocess +import sys +from typing import List, Optional, Tuple, Union + +from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency + + +def get_sys_info() -> List[Tuple[str, Optional[Union[str, int]]]]: + """ + Returns system information as a list + """ + blob: List[Tuple[str, Optional[Union[str, int]]]] = [] + + # get full commit hash + commit = None + if os.path.isdir(".git") and os.path.isdir("pandas"): + try: + pipe = subprocess.Popen( + 'git log --format="%H" -n 1'.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + so, serr = pipe.communicate() + except (OSError, ValueError): + pass + else: + if pipe.returncode == 0: + commit = so.decode("utf-8").strip().strip('"') + + blob.append(("commit", commit)) + + try: + (sysname, nodename, release, version, machine, processor) = platform.uname() + blob.extend( + [ + ("python", ".".join(map(str, sys.version_info))), + ("python-bits", struct.calcsize("P") * 8), + ("OS", f"{sysname}"), + ("OS-release", f"{release}"), + # ("Version", "{version}".format(version=version)), + ("machine", f"{machine}"), + ("processor", f"{processor}"), + ("byteorder", f"{sys.byteorder}"), + ("LC_ALL", f"{os.environ.get('LC_ALL', 'None')}"), + ("LANG", f"{os.environ.get('LANG', 'None')}"), + ("LOCALE", ".".join(map(str, locale.getlocale()))), + ] + ) + except (KeyError, ValueError): + pass + + return blob + + +def show_versions(as_json=False): + sys_info = get_sys_info() + deps = [ + "pandas", + # required + "numpy", + "pytz", + "dateutil", + # install / build, + "pip", + "setuptools", + "Cython", + # test + "pytest", + "hypothesis", + # docs + "sphinx", + # Other, need a min version + "blosc", + "feather", + "xlsxwriter", + "lxml.etree", + "html5lib", + "pymysql", + "psycopg2", + "jinja2", + # Other, not imported. 
+ "IPython", + "pandas_datareader", + ] + + deps.extend(list(VERSIONS)) + deps_blob = [] + + for modname in deps: + mod = import_optional_dependency( + modname, raise_on_missing=False, on_version="ignore" + ) + ver: Optional[str] + if mod: + ver = _get_version(mod) + else: + ver = None + deps_blob.append((modname, ver)) + + if as_json: + j = dict(system=dict(sys_info), dependencies=dict(deps_blob)) + + if as_json is True: + print(j) + else: + with codecs.open(as_json, "wb", encoding="utf8") as f: + json.dump(j, f, indent=2) + + else: + maxlen = max(len(x) for x in deps) + tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen) + print("\nINSTALLED VERSIONS") + print("------------------") + for k, stat in sys_info: + print(tpl.format(k=k, stat=stat)) + print("") + for k, stat in deps_blob: + print(tpl.format(k=k, stat=stat)) + + +def main() -> int: + from optparse import OptionParser + + parser = OptionParser() + parser.add_option( + "-j", + "--json", + metavar="FILE", + nargs=1, + help="Save output as JSON into file, pass in '-' to output to stdout", + ) + + (options, args) = parser.parse_args() + + if options.json == "-": + options.json = True + + show_versions(as_json=options.json) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/venv/Lib/site-packages/pandas/util/_test_decorators.py b/venv/Lib/site-packages/pandas/util/_test_decorators.py new file mode 100644 index 0000000..d880499 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_test_decorators.py @@ -0,0 +1,264 @@ +""" +This module provides decorator functions which can be applied to test objects +in order to skip those objects when certain conditions occur. A sample use case +is to detect if the platform is missing ``matplotlib``. If so, any test objects +which require ``matplotlib`` and decorated with ``@td.skip_if_no_mpl`` will be +skipped by ``pytest`` during the execution of the test suite. + +To illustrate, after importing this module: + +import pandas.util._test_decorators as td + +The decorators can be applied to classes: + +@td.skip_if_some_reason +class Foo: + ... + +Or individual functions: + +@td.skip_if_some_reason +def test_foo(): + ... + +For more information, refer to the ``pytest`` documentation on ``skipif``. 
+""" +from distutils.version import LooseVersion +from functools import wraps +import locale +from typing import Callable, Optional + +import numpy as np +import pytest + +from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import _np_version + +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR + + +def safe_import(mod_name: str, min_version: Optional[str] = None): + """ + Parameters: + ----------- + mod_name : str + Name of the module to be imported + min_version : str, default None + Minimum required version of the specified mod_name + + Returns: + -------- + object + The imported module if successful, or False + """ + try: + mod = __import__(mod_name) + except ImportError: + return False + + if not min_version: + return mod + else: + import sys + + try: + version = getattr(sys.modules[mod_name], "__version__") + except AttributeError: + # xlrd uses a capitalized attribute name + version = getattr(sys.modules[mod_name], "__VERSION__") + if version: + from distutils.version import LooseVersion + + if LooseVersion(version) >= LooseVersion(min_version): + return mod + + return False + + +# TODO: +# remove when gh-24839 is fixed; this affects numpy 1.16 +# and pytables 3.4.4 +tables = safe_import("tables") +xfail_non_writeable = pytest.mark.xfail( + tables + and LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. pytables needs a " + "release beyong 3.4.4 to support numpy 1.16x" + ), +) + + +def _skip_if_no_mpl(): + mod = safe_import("matplotlib") + if mod: + mod.use("Agg", warn=True) + else: + return True + + +def _skip_if_has_locale(): + lang, _ = locale.getlocale() + if lang is not None: + return True + + +def _skip_if_not_us_locale(): + lang, _ = locale.getlocale() + if lang != "en_US": + return True + + +def _skip_if_no_scipy() -> bool: + return not ( + safe_import("scipy.stats") + and safe_import("scipy.sparse") + and safe_import("scipy.interpolate") + and safe_import("scipy.signal") + ) + + +def skip_if_installed(package: str) -> Callable: + """ + Skip a test if a package is installed. + + Parameters + ---------- + package : str + The name of the package. + """ + return pytest.mark.skipif( + safe_import(package), reason=f"Skipping because {package} is installed." + ) + + +def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: + """ + Generic function to help skip tests when required packages are not + present on the testing system. + + This function returns a pytest mark with a skip condition that will be + evaluated during test collection. An attempt will be made to import the + specified ``package`` and optionally ensure it meets the ``min_version`` + + The mark can be used as either a decorator for a test function or to be + applied to parameters in pytest.mark.parametrize calls or parametrized + fixtures. + + If the import and version check are unsuccessful, then the test function + (or test case when used in conjunction with parametrization) will be + skipped. + + Parameters + ---------- + package: str + The name of the required package. + min_version: str or None, default None + Optional minimum version of the package. + + Returns + ------- + _pytest.mark.structures.MarkDecorator + a pytest.mark.skipif to use as either a test decorator or a + parametrization mark. 
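# ---------------------------------------------------------------------------
# Illustrative sketch (editor's aside, not part of the vendored file): the
# mark returned by skip_if_no works both as a plain decorator and inside
# pytest.param, as the docstring above describes. The test names and the
# packages referenced are examples only.
import pytest

import pandas.util._test_decorators as td

@td.skip_if_no("scipy")
def test_needs_scipy():
    import scipy  # noqa: F401

@pytest.mark.parametrize(
    "engine",
    ["python", pytest.param("numexpr", marks=td.skip_if_no("numexpr"))],
)
def test_engines(engine):
    assert engine in ("python", "numexpr")
# ---------------------------------------------------------------------------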
+ """ + msg = f"Could not import '{package}'" + if min_version: + msg += f" satisfying a min_version of {min_version}" + return pytest.mark.skipif( + not safe_import(package, min_version=min_version), reason=msg + ) + + +skip_if_no_mpl = pytest.mark.skipif( + _skip_if_no_mpl(), reason="Missing matplotlib dependency" +) +skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") +skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit") +skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") +skip_if_windows_python_3 = pytest.mark.skipif( + is_platform_windows(), reason="not used on win32" +) +skip_if_has_locale = pytest.mark.skipif( + _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", +) +skip_if_not_us_locale = pytest.mark.skipif( + _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", +) +skip_if_no_scipy = pytest.mark.skipif( + _skip_if_no_scipy(), reason="Missing SciPy requirement" +) +skip_if_no_ne = pytest.mark.skipif( + not _USE_NUMEXPR, + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", +) + + +def skip_if_np_lt( + ver_str: str, reason: Optional[str] = None, *args, **kwds +) -> Callable: + if reason is None: + reason = f"NumPy {ver_str} or greater required" + return pytest.mark.skipif( + _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds + ) + + +def parametrize_fixture_doc(*args): + """ + Intended for use as a decorator for parametrized fixture, + this function will wrap the decorated function with a pytest + ``parametrize_fixture_doc`` mark. That mark will format + initial fixture docstring by replacing placeholders {0}, {1} etc + with parameters passed as arguments. + + Parameters + ---------- + args: iterable + Positional arguments for docstring. + + Returns + ------- + function + The decorated function wrapped within a pytest + ``parametrize_fixture_doc`` mark + """ + + def documented_fixture(fixture): + fixture.__doc__ = fixture.__doc__.format(*args) + return fixture + + return documented_fixture + + +def check_file_leaks(func) -> Callable: + """ + Decorate a test function tot check that we are not leaking file descriptors. + """ + psutil = safe_import("psutil") + if not psutil: + return func + + @wraps(func) + def new_func(*args, **kwargs): + proc = psutil.Process() + flist = proc.open_files() + + func(*args, **kwargs) + + flist2 = proc.open_files() + assert flist2 == flist + + return new_func + + +def async_mark(): + try: + import_optional_dependency("pytest_asyncio") + async_mark = pytest.mark.asyncio + except ImportError: + async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") + + return async_mark diff --git a/venv/Lib/site-packages/pandas/util/_tester.py b/venv/Lib/site-packages/pandas/util/_tester.py new file mode 100644 index 0000000..b299f37 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_tester.py @@ -0,0 +1,30 @@ +""" +Entrypoint for testing from the top-level namespace. 
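# ---------------------------------------------------------------------------
# Illustrative usage (editor's aside, not part of the vendored file): test()
# below is exposed as pandas.test; passing extra_args replaces the default
# "--skip-slow --skip-network --skip-db" flags built in the function body.
# The pytest arguments shown are examples only.
import pandas as pd

if __name__ == "__main__":
    pd.test(extra_args=["-m", "not slow"])
# ---------------------------------------------------------------------------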
+""" +import os +import sys + +PKG = os.path.dirname(os.path.dirname(__file__)) + + +def test(extra_args=None): + try: + import pytest + except ImportError: + raise ImportError("Need pytest>=5.0.1 to run tests") + try: + import hypothesis # noqa + except ImportError: + raise ImportError("Need hypothesis>=3.58 to run tests") + cmd = ["--skip-slow", "--skip-network", "--skip-db"] + if extra_args: + if not isinstance(extra_args, list): + extra_args = [extra_args] + cmd = extra_args + cmd += [PKG] + joined = " ".join(cmd) + print(f"running: pytest {joined}") + sys.exit(pytest.main(cmd)) + + +__all__ = ["test"] diff --git a/venv/Lib/site-packages/pandas/util/_validators.py b/venv/Lib/site-packages/pandas/util/_validators.py new file mode 100644 index 0000000..b69c974 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/_validators.py @@ -0,0 +1,378 @@ +""" +Module that contains many useful utilities +for validating data or function arguments +""" +from typing import Iterable, Union +import warnings + +import numpy as np + +from pandas.core.dtypes.common import is_bool + + +def _check_arg_length(fname, args, max_fname_arg_count, compat_args): + """ + Checks whether 'args' has length of at most 'compat_args'. Raises + a TypeError if that is not the case, similar to in Python when a + function is called with too many arguments. + """ + if max_fname_arg_count < 0: + raise ValueError("'max_fname_arg_count' must be non-negative") + + if len(args) > len(compat_args): + max_arg_count = len(compat_args) + max_fname_arg_count + actual_arg_count = len(args) + max_fname_arg_count + argument = "argument" if max_arg_count == 1 else "arguments" + + raise TypeError( + f"{fname}() takes at most {max_arg_count} {argument} " + f"({actual_arg_count} given)" + ) + + +def _check_for_default_values(fname, arg_val_dict, compat_args): + """ + Check that the keys in `arg_val_dict` are mapped to their + default values as specified in `compat_args`. + + Note that this function is to be called only when it has been + checked that arg_val_dict.keys() is a subset of compat_args + """ + for key in arg_val_dict: + # try checking equality directly with '=' operator, + # as comparison may have been overridden for the left + # hand object + try: + v1 = arg_val_dict[key] + v2 = compat_args[key] + + # check for None-ness otherwise we could end up + # comparing a numpy array vs None + if (v1 is not None and v2 is None) or (v1 is None and v2 is not None): + match = False + else: + match = v1 == v2 + + if not is_bool(match): + raise ValueError("'match' is not a boolean") + + # could not compare them directly, so try comparison + # using the 'is' operator + except ValueError: + match = arg_val_dict[key] is compat_args[key] + + if not match: + raise ValueError( + f"the '{key}' parameter is not supported in " + f"the pandas implementation of {fname}()" + ) + + +def validate_args(fname, args, max_fname_arg_count, compat_args): + """ + Checks whether the length of the `*args` argument passed into a function + has at most `len(compat_args)` arguments and whether or not all of these + elements in `args` are set to their default values. + + Parameters + ---------- + fname : str + The name of the function being passed the `*args` parameter + args : tuple + The `*args` parameter passed into a function + max_fname_arg_count : int + The maximum number of arguments that the function `fname` + can accept, excluding those in `args`. Used for displaying + appropriate error messages. Must be non-negative. 
+ compat_args : dict + A dictionary of keys and their associated default values. + In order to accommodate buggy behaviour in some versions of `numpy`, + where a signature displayed keyword arguments but then passed those + arguments **positionally** internally when calling downstream + implementations, a dict ensures that the original + order of the keyword arguments is enforced. + Raises + ------ + TypeError + If `args` contains more values than there are `compat_args` + ValueError + If `args` contains values that do not correspond to those + of the default values specified in `compat_args` + """ + _check_arg_length(fname, args, max_fname_arg_count, compat_args) + + # We do this so that we can provide a more informative + # error message about the parameters that we are not + # supporting in the pandas implementation of 'fname' + kwargs = dict(zip(compat_args, args)) + _check_for_default_values(fname, kwargs, compat_args) + + +def _check_for_invalid_keys(fname, kwargs, compat_args): + """ + Checks whether 'kwargs' contains any keys that are not + in 'compat_args' and raises a TypeError if there is one. + """ + # set(dict) --> set of the dictionary's keys + diff = set(kwargs) - set(compat_args) + + if diff: + bad_arg = list(diff)[0] + raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'") + + +def validate_kwargs(fname, kwargs, compat_args): + """ + Checks whether parameters passed to the **kwargs argument in a + function `fname` are valid parameters as specified in `*compat_args` + and whether or not they are set to their default values. + + Parameters + ---------- + fname : str + The name of the function being passed the `**kwargs` parameter + kwargs : dict + The `**kwargs` parameter passed into `fname` + compat_args: dict + A dictionary of keys that `kwargs` is allowed to have and their + associated default values + + Raises + ------ + TypeError if `kwargs` contains keys not in `compat_args` + ValueError if `kwargs` contains keys in `compat_args` that do not + map to the default values specified in `compat_args` + """ + kwds = kwargs.copy() + _check_for_invalid_keys(fname, kwargs, compat_args) + _check_for_default_values(fname, kwds, compat_args) + + +def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_args): + """ + Checks whether parameters passed to the *args and **kwargs argument in a + function `fname` are valid parameters as specified in `*compat_args` + and whether or not they are set to their default values. + + Parameters + ---------- + fname: str + The name of the function being passed the `**kwargs` parameter + args: tuple + The `*args` parameter passed into a function + kwargs: dict + The `**kwargs` parameter passed into `fname` + max_fname_arg_count: int + The minimum number of arguments that the function `fname` + requires, excluding those in `args`. Used for displaying + appropriate error messages. Must be non-negative. + compat_args: dict + A dictionary of keys that `kwargs` is allowed to + have and their associated default values. + + Raises + ------ + TypeError if `args` contains more values than there are + `compat_args` OR `kwargs` contains keys not in `compat_args` + ValueError if `args` contains values not at the default value (`None`) + `kwargs` contains keys in `compat_args` that do not map to the default + value as specified in `compat_args` + + See Also + -------- + validate_args : Purely args validation. + validate_kwargs : Purely kwargs validation. 
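# ---------------------------------------------------------------------------
# Illustrative sketch (editor's aside, not part of the vendored file): a
# minimal use of validate_args_and_kwargs to expose a numpy-compatible
# signature while accepting extra arguments only at their default values.
# The function name and compat_args mapping are hypothetical.
from pandas.util._validators import validate_args_and_kwargs

MY_SUM_DEFAULTS = {"axis": None, "dtype": None, "out": None}

def my_sum(values, *args, **kwargs):
    # Reject any non-default numpy-style argument with an informative error.
    validate_args_and_kwargs(
        "my_sum", args, kwargs, max_fname_arg_count=1, compat_args=MY_SUM_DEFAULTS
    )
    return sum(values)

my_sum([1, 2, 3], axis=None)    # fine: matches the declared default
# my_sum([1, 2, 3], out="buf")  # ValueError: the 'out' parameter is not supported
# my_sum([1, 2, 3], bad_kw=1)   # TypeError: unexpected keyword argument 'bad_kw'
# ---------------------------------------------------------------------------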
+ + """ + # Check that the total number of arguments passed in (i.e. + # args and kwargs) does not exceed the length of compat_args + _check_arg_length( + fname, args + tuple(kwargs.values()), max_fname_arg_count, compat_args + ) + + # Check there is no overlap with the positional and keyword + # arguments, similar to what is done in actual Python functions + args_dict = dict(zip(compat_args, args)) + + for key in args_dict: + if key in kwargs: + raise TypeError( + f"{fname}() got multiple values for keyword argument '{key}'" + ) + + kwargs.update(args_dict) + validate_kwargs(fname, kwargs, compat_args) + + +def validate_bool_kwarg(value, arg_name): + """ Ensures that argument passed in arg_name is of type bool. """ + if not (is_bool(value) or value is None): + raise ValueError( + f'For argument "{arg_name}" expected type bool, received ' + f"type {type(value).__name__}." + ) + return value + + +def validate_axis_style_args(data, args, kwargs, arg_name, method_name): + """Argument handler for mixed index, columns / axis functions + + In an attempt to handle both `.method(index, columns)`, and + `.method(arg, axis=.)`, we have to do some bad things to argument + parsing. This translates all arguments to `{index=., columns=.}` style. + + Parameters + ---------- + data : DataFrame + args : tuple + All positional arguments from the user + kwargs : dict + All keyword arguments from the user + arg_name, method_name : str + Used for better error messages + + Returns + ------- + kwargs : dict + A dictionary of keyword arguments. Doesn't modify ``kwargs`` + inplace, so update them with the return value here. + + Examples + -------- + >>> df._validate_axis_style_args((str.upper,), {'columns': id}, + ... 'mapper', 'rename') + {'columns': , 'index': } + + This emits a warning + >>> df._validate_axis_style_args((str.upper, id), {}, + ... 'mapper', 'rename') + {'columns': , 'index': } + """ + # TODO: Change to keyword-only args and remove all this + + out = {} + # Goal: fill 'out' with index/columns-style arguments + # like out = {'index': foo, 'columns': bar} + + # Start by validating for consistency + if "axis" in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS): + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'." + raise TypeError(msg) + + # First fill with explicit values provided by the user... + if arg_name in kwargs: + if args: + msg = f"{method_name} got multiple values for argument '{arg_name}'" + raise TypeError(msg) + + axis = data._get_axis_name(kwargs.get("axis", 0)) + out[axis] = kwargs[arg_name] + + # More user-provided arguments, now from kwargs + for k, v in kwargs.items(): + try: + ax = data._get_axis_name(k) + except ValueError: + pass + else: + out[ax] = v + + # All user-provided kwargs have been handled now. + # Now we supplement with positional arguments, emitting warnings + # when there's ambiguity and raising when there's conflicts + + if len(args) == 0: + pass # It's up to the function to decide if this is valid + elif len(args) == 1: + axis = data._get_axis_name(kwargs.get("axis", 0)) + out[axis] = args[0] + elif len(args) == 2: + if "axis" in kwargs: + # Unambiguously wrong + msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" + raise TypeError(msg) + + msg = ( + "Interpreting call\n\t'.{method_name}(a, b)' as " + "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " + "arguments to remove any ambiguity. In the future, using " + "positional arguments for 'index' or 'columns' will raise " + " a 'TypeError'." 
+ ) + warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) + out[data._AXIS_NAMES[0]] = args[0] + out[data._AXIS_NAMES[1]] = args[1] + else: + msg = f"Cannot specify all of '{arg_name}', 'index', 'columns'." + raise TypeError(msg) + return out + + +def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): + """Validate the keyword arguments to 'fillna'. + + This checks that exactly one of 'value' and 'method' is specified. + If 'method' is specified, this validates that it's a valid method. + + Parameters + ---------- + value, method : object + The 'value' and 'method' keyword arguments for 'fillna'. + validate_scalar_dict_value : bool, default True + Whether to validate that 'value' is a scalar or dict. Specifically, + validate that it is not a list or tuple. + + Returns + ------- + value, method : object + """ + from pandas.core.missing import clean_fill_method + + if value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + elif value is None and method is not None: + method = clean_fill_method(method) + + elif value is not None and method is None: + if validate_scalar_dict_value and isinstance(value, (list, tuple)): + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + f'you passed a "{type(value).__name__}"' + ) + + elif value is not None and method is not None: + raise ValueError("Cannot specify both 'value' and 'method'.") + + return value, method + + +def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: + """ + Validate percentiles (used by describe and quantile). + + This function checks if the given float oriterable of floats is a valid percentile + otherwise raises a ValueError. + + Parameters + ---------- + q: float or iterable of floats + A single percentile or an iterable of percentiles. + + Returns + ------- + ndarray + An ndarray of the percentiles if valid. + + Raises + ------ + ValueError if percentiles are not in given interval([0, 1]). + """ + msg = "percentiles should all be in the interval [0, 1]. Try {0} instead." + q_arr = np.asarray(q) + if q_arr.ndim == 0: + if not 0 <= q_arr <= 1: + raise ValueError(msg.format(q_arr / 100.0)) + else: + if not all(0 <= qs <= 1 for qs in q_arr): + raise ValueError(msg.format(q_arr / 100.0)) + return q_arr diff --git a/venv/Lib/site-packages/pandas/util/testing.py b/venv/Lib/site-packages/pandas/util/testing.py new file mode 100644 index 0000000..af9fe48 --- /dev/null +++ b/venv/Lib/site-packages/pandas/util/testing.py @@ -0,0 +1,12 @@ +import warnings + +from pandas._testing import * # noqa + +warnings.warn( + ( + "pandas.util.testing is deprecated. Use the functions in the " + "public API at pandas.testing instead." + ), + FutureWarning, + stacklevel=2, +) diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/DESCRIPTION.rst b/venv/Lib/site-packages/pytz-2019.3.dist-info/DESCRIPTION.rst new file mode 100644 index 0000000..bea2c3b --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/DESCRIPTION.rst @@ -0,0 +1,584 @@ +pytz - World Timezone Definitions for Python +============================================ + +:Author: Stuart Bishop + +Introduction +~~~~~~~~~~~~ + +pytz brings the Olson tz database into Python. This library allows +accurate and cross platform timezone calculations using Python 2.4 +or higher. It also solves the issue of ambiguous times at the end +of daylight saving time, which you can read more about in the Python +Library Reference (``datetime.tzinfo``). 
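An illustrative aside (not part of the original pytz text; the exact zone
lists depend on the installed database): timezone objects are looked up by
name, and the available zone names can be inspected programmatically.

>>> import pytz
>>> pytz.timezone('Europe/Amsterdam').zone
'Europe/Amsterdam'
>>> 'UTC' in pytz.all_timezones
True
>>> pytz.country_timezones('nz')
['Pacific/Auckland', 'Pacific/Chatham']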
+ +Almost all of the Olson timezones are supported. + +.. note:: + + This library differs from the documented Python API for + tzinfo implementations; if you want to create local wallclock + times you need to use the ``localize()`` method documented in this + document. In addition, if you perform date arithmetic on local + times that cross DST boundaries, the result may be in an incorrect + timezone (ie. subtract 1 minute from 2002-10-27 1:00 EST and you get + 2002-10-27 0:59 EST instead of the correct 2002-10-27 1:59 EDT). A + ``normalize()`` method is provided to correct this. Unfortunately these + issues cannot be resolved without modifying the Python datetime + implementation (see PEP-431). + + +Installation +~~~~~~~~~~~~ + +This package can either be installed using ``pip`` or from a tarball using the +standard Python distutils. + +If you are installing using ``pip``, you don't need to download anything as the +latest version will be downloaded for you from PyPI:: + + pip install pytz + +If you are installing from a tarball, run the following command as an +administrative user:: + + python setup.py install + + +Example & Usage +~~~~~~~~~~~~~~~ + +Localized times and date arithmetic +----------------------------------- + +>>> from datetime import datetime, timedelta +>>> from pytz import timezone +>>> import pytz +>>> utc = pytz.utc +>>> utc.zone +'UTC' +>>> eastern = timezone('US/Eastern') +>>> eastern.zone +'US/Eastern' +>>> amsterdam = timezone('Europe/Amsterdam') +>>> fmt = '%Y-%m-%d %H:%M:%S %Z%z' + +This library only supports two ways of building a localized time. The +first is to use the ``localize()`` method provided by the pytz library. +This is used to localize a naive datetime (datetime with no timezone +information): + +>>> loc_dt = eastern.localize(datetime(2002, 10, 27, 6, 0, 0)) +>>> print(loc_dt.strftime(fmt)) +2002-10-27 06:00:00 EST-0500 + +The second way of building a localized time is by converting an existing +localized time using the standard ``astimezone()`` method: + +>>> ams_dt = loc_dt.astimezone(amsterdam) +>>> ams_dt.strftime(fmt) +'2002-10-27 12:00:00 CET+0100' + +Unfortunately using the tzinfo argument of the standard datetime +constructors ''does not work'' with pytz for many timezones. + +>>> datetime(2002, 10, 27, 12, 0, 0, tzinfo=amsterdam).strftime(fmt) # /!\ Does not work this way! +'2002-10-27 12:00:00 LMT+0020' + +It is safe for timezones without daylight saving transitions though, such +as UTC: + +>>> datetime(2002, 10, 27, 12, 0, 0, tzinfo=pytz.utc).strftime(fmt) # /!\ Not recommended except for UTC +'2002-10-27 12:00:00 UTC+0000' + +The preferred way of dealing with times is to always work in UTC, +converting to localtime only when generating output to be read +by humans. + +>>> utc_dt = datetime(2002, 10, 27, 6, 0, 0, tzinfo=utc) +>>> loc_dt = utc_dt.astimezone(eastern) +>>> loc_dt.strftime(fmt) +'2002-10-27 01:00:00 EST-0500' + +This library also allows you to do date arithmetic using local +times, although it is more complicated than working in UTC as you +need to use the ``normalize()`` method to handle daylight saving time +and other timezone transitions. In this example, ``loc_dt`` is set +to the instant when daylight saving time ends in the US/Eastern +timezone. 
+ +>>> before = loc_dt - timedelta(minutes=10) +>>> before.strftime(fmt) +'2002-10-27 00:50:00 EST-0500' +>>> eastern.normalize(before).strftime(fmt) +'2002-10-27 01:50:00 EDT-0400' +>>> after = eastern.normalize(before + timedelta(minutes=20)) +>>> after.strftime(fmt) +'2002-10-27 01:10:00 EST-0500' + +Creating local times is also tricky, and the reason why working with +local times is not recommended. Unfortunately, you cannot just pass +a ``tzinfo`` argument when constructing a datetime (see the next +section for more details) + +>>> dt = datetime(2002, 10, 27, 1, 30, 0) +>>> dt1 = eastern.localize(dt, is_dst=True) +>>> dt1.strftime(fmt) +'2002-10-27 01:30:00 EDT-0400' +>>> dt2 = eastern.localize(dt, is_dst=False) +>>> dt2.strftime(fmt) +'2002-10-27 01:30:00 EST-0500' + +Converting between timezones is more easily done, using the +standard astimezone method. + +>>> utc_dt = utc.localize(datetime.utcfromtimestamp(1143408899)) +>>> utc_dt.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> au_tz = timezone('Australia/Sydney') +>>> au_dt = utc_dt.astimezone(au_tz) +>>> au_dt.strftime(fmt) +'2006-03-27 08:34:59 AEDT+1100' +>>> utc_dt2 = au_dt.astimezone(utc) +>>> utc_dt2.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> utc_dt == utc_dt2 +True + +You can take shortcuts when dealing with the UTC side of timezone +conversions. ``normalize()`` and ``localize()`` are not really +necessary when there are no daylight saving time transitions to +deal with. + +>>> utc_dt = datetime.utcfromtimestamp(1143408899).replace(tzinfo=utc) +>>> utc_dt.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> au_tz = timezone('Australia/Sydney') +>>> au_dt = au_tz.normalize(utc_dt.astimezone(au_tz)) +>>> au_dt.strftime(fmt) +'2006-03-27 08:34:59 AEDT+1100' +>>> utc_dt2 = au_dt.astimezone(utc) +>>> utc_dt2.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' + + +``tzinfo`` API +-------------- + +The ``tzinfo`` instances returned by the ``timezone()`` function have +been extended to cope with ambiguous times by adding an ``is_dst`` +parameter to the ``utcoffset()``, ``dst()`` && ``tzname()`` methods. + +>>> tz = timezone('America/St_Johns') + +>>> normal = datetime(2009, 9, 1) +>>> ambiguous = datetime(2009, 10, 31, 23, 30) + +The ``is_dst`` parameter is ignored for most timestamps. It is only used +during DST transition ambiguous periods to resolve that ambiguity. + +>>> print(tz.utcoffset(normal, is_dst=True)) +-1 day, 21:30:00 +>>> print(tz.dst(normal, is_dst=True)) +1:00:00 +>>> tz.tzname(normal, is_dst=True) +'NDT' + +>>> print(tz.utcoffset(ambiguous, is_dst=True)) +-1 day, 21:30:00 +>>> print(tz.dst(ambiguous, is_dst=True)) +1:00:00 +>>> tz.tzname(ambiguous, is_dst=True) +'NDT' + +>>> print(tz.utcoffset(normal, is_dst=False)) +-1 day, 21:30:00 +>>> tz.dst(normal, is_dst=False) +datetime.timedelta(0, 3600) +>>> tz.tzname(normal, is_dst=False) +'NDT' + +>>> print(tz.utcoffset(ambiguous, is_dst=False)) +-1 day, 20:30:00 +>>> tz.dst(ambiguous, is_dst=False) +datetime.timedelta(0) +>>> tz.tzname(ambiguous, is_dst=False) +'NST' + +If ``is_dst`` is not specified, ambiguous timestamps will raise +an ``pytz.exceptions.AmbiguousTimeError`` exception. + +>>> print(tz.utcoffset(normal)) +-1 day, 21:30:00 +>>> print(tz.dst(normal)) +1:00:00 +>>> tz.tzname(normal) +'NDT' + +>>> import pytz.exceptions +>>> try: +... tz.utcoffset(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 +>>> try: +... 
tz.dst(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 +>>> try: +... tz.tzname(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 + + +Problems with Localtime +~~~~~~~~~~~~~~~~~~~~~~~ + +The major problem we have to deal with is that certain datetimes +may occur twice in a year. For example, in the US/Eastern timezone +on the last Sunday morning in October, the following sequence +happens: + + - 01:00 EDT occurs + - 1 hour later, instead of 2:00am the clock is turned back 1 hour + and 01:00 happens again (this time 01:00 EST) + +In fact, every instant between 01:00 and 02:00 occurs twice. This means +that if you try and create a time in the 'US/Eastern' timezone +the standard datetime syntax, there is no way to specify if you meant +before of after the end-of-daylight-saving-time transition. Using the +pytz custom syntax, the best you can do is make an educated guess: + +>>> loc_dt = eastern.localize(datetime(2002, 10, 27, 1, 30, 00)) +>>> loc_dt.strftime(fmt) +'2002-10-27 01:30:00 EST-0500' + +As you can see, the system has chosen one for you and there is a 50% +chance of it being out by one hour. For some applications, this does +not matter. However, if you are trying to schedule meetings with people +in different timezones or analyze log files it is not acceptable. + +The best and simplest solution is to stick with using UTC. The pytz +package encourages using UTC for internal timezone representation by +including a special UTC implementation based on the standard Python +reference implementation in the Python documentation. + +The UTC timezone unpickles to be the same instance, and pickles to a +smaller size than other pytz tzinfo instances. The UTC implementation +can be obtained as pytz.utc, pytz.UTC, or pytz.timezone('UTC'). + +>>> import pickle, pytz +>>> dt = datetime(2005, 3, 1, 14, 13, 21, tzinfo=utc) +>>> naive = dt.replace(tzinfo=None) +>>> p = pickle.dumps(dt, 1) +>>> naive_p = pickle.dumps(naive, 1) +>>> len(p) - len(naive_p) +17 +>>> new = pickle.loads(p) +>>> new == dt +True +>>> new is dt +False +>>> new.tzinfo is dt.tzinfo +True +>>> pytz.utc is pytz.UTC is pytz.timezone('UTC') +True + +Note that some other timezones are commonly thought of as the same (GMT, +Greenwich, Universal, etc.). The definition of UTC is distinct from these +other timezones, and they are not equivalent. For this reason, they will +not compare the same in Python. + +>>> utc == pytz.timezone('GMT') +False + +See the section `What is UTC`_, below. + +If you insist on working with local times, this library provides a +facility for constructing them unambiguously: + +>>> loc_dt = datetime(2002, 10, 27, 1, 30, 00) +>>> est_dt = eastern.localize(loc_dt, is_dst=True) +>>> edt_dt = eastern.localize(loc_dt, is_dst=False) +>>> print(est_dt.strftime(fmt) + ' / ' + edt_dt.strftime(fmt)) +2002-10-27 01:30:00 EDT-0400 / 2002-10-27 01:30:00 EST-0500 + +If you pass None as the is_dst flag to localize(), pytz will refuse to +guess and raise exceptions if you try to build ambiguous or non-existent +times. + +For example, 1:30am on 27th Oct 2002 happened twice in the US/Eastern +timezone when the clocks where put back at the end of Daylight Saving +Time: + +>>> dt = datetime(2002, 10, 27, 1, 30, 00) +>>> try: +... eastern.localize(dt, is_dst=None) +... 
except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % dt) +pytz.exceptions.AmbiguousTimeError: 2002-10-27 01:30:00 + +Similarly, 2:30am on 7th April 2002 never happened at all in the +US/Eastern timezone, as the clocks where put forward at 2:00am skipping +the entire hour: + +>>> dt = datetime(2002, 4, 7, 2, 30, 00) +>>> try: +... eastern.localize(dt, is_dst=None) +... except pytz.exceptions.NonExistentTimeError: +... print('pytz.exceptions.NonExistentTimeError: %s' % dt) +pytz.exceptions.NonExistentTimeError: 2002-04-07 02:30:00 + +Both of these exceptions share a common base class to make error handling +easier: + +>>> isinstance(pytz.AmbiguousTimeError(), pytz.InvalidTimeError) +True +>>> isinstance(pytz.NonExistentTimeError(), pytz.InvalidTimeError) +True + + +A special case is where countries change their timezone definitions +with no daylight savings time switch. For example, in 1915 Warsaw +switched from Warsaw time to Central European time with no daylight savings +transition. So at the stroke of midnight on August 5th 1915 the clocks +were wound back 24 minutes creating an ambiguous time period that cannot +be specified without referring to the timezone abbreviation or the +actual UTC offset. In this case midnight happened twice, neither time +during a daylight saving time period. pytz handles this transition by +treating the ambiguous period before the switch as daylight savings +time, and the ambiguous period after as standard time. + + +>>> warsaw = pytz.timezone('Europe/Warsaw') +>>> amb_dt1 = warsaw.localize(datetime(1915, 8, 4, 23, 59, 59), is_dst=True) +>>> amb_dt1.strftime(fmt) +'1915-08-04 23:59:59 WMT+0124' +>>> amb_dt2 = warsaw.localize(datetime(1915, 8, 4, 23, 59, 59), is_dst=False) +>>> amb_dt2.strftime(fmt) +'1915-08-04 23:59:59 CET+0100' +>>> switch_dt = warsaw.localize(datetime(1915, 8, 5, 00, 00, 00), is_dst=False) +>>> switch_dt.strftime(fmt) +'1915-08-05 00:00:00 CET+0100' +>>> str(switch_dt - amb_dt1) +'0:24:01' +>>> str(switch_dt - amb_dt2) +'0:00:01' + +The best way of creating a time during an ambiguous time period is +by converting from another timezone such as UTC: + +>>> utc_dt = datetime(1915, 8, 4, 22, 36, tzinfo=pytz.utc) +>>> utc_dt.astimezone(warsaw).strftime(fmt) +'1915-08-04 23:36:00 CET+0100' + +The standard Python way of handling all these ambiguities is not to +handle them, such as demonstrated in this example using the US/Eastern +timezone definition from the Python documentation (Note that this +implementation only works for dates between 1987 and 2006 - it is +included for tests only!): + +>>> from pytz.reference import Eastern # pytz.reference only for tests +>>> dt = datetime(2002, 10, 27, 0, 30, tzinfo=Eastern) +>>> str(dt) +'2002-10-27 00:30:00-04:00' +>>> str(dt + timedelta(hours=1)) +'2002-10-27 01:30:00-05:00' +>>> str(dt + timedelta(hours=2)) +'2002-10-27 02:30:00-05:00' +>>> str(dt + timedelta(hours=3)) +'2002-10-27 03:30:00-05:00' + +Notice the first two results? At first glance you might think they are +correct, but taking the UTC offset into account you find that they are +actually two hours appart instead of the 1 hour we asked for. 
+ +>>> from pytz.reference import UTC # pytz.reference only for tests +>>> str(dt.astimezone(UTC)) +'2002-10-27 04:30:00+00:00' +>>> str((dt + timedelta(hours=1)).astimezone(UTC)) +'2002-10-27 06:30:00+00:00' + + +Country Information +~~~~~~~~~~~~~~~~~~~ + +A mechanism is provided to access the timezones commonly in use +for a particular country, looked up using the ISO 3166 country code. +It returns a list of strings that can be used to retrieve the relevant +tzinfo instance using ``pytz.timezone()``: + +>>> print(' '.join(pytz.country_timezones['nz'])) +Pacific/Auckland Pacific/Chatham + +The Olson database comes with a ISO 3166 country code to English country +name mapping that pytz exposes as a dictionary: + +>>> print(pytz.country_names['nz']) +New Zealand + + +What is UTC +~~~~~~~~~~~ + +'UTC' is `Coordinated Universal Time`_. It is a successor to, but distinct +from, Greenwich Mean Time (GMT) and the various definitions of Universal +Time. UTC is now the worldwide standard for regulating clocks and time +measurement. + +All other timezones are defined relative to UTC, and include offsets like +UTC+0800 - hours to add or subtract from UTC to derive the local time. No +daylight saving time occurs in UTC, making it a useful timezone to perform +date arithmetic without worrying about the confusion and ambiguities caused +by daylight saving time transitions, your country changing its timezone, or +mobile computers that roam through multiple timezones. + +.. _Coordinated Universal Time: https://en.wikipedia.org/wiki/Coordinated_Universal_Time + + +Helpers +~~~~~~~ + +There are two lists of timezones provided. + +``all_timezones`` is the exhaustive list of the timezone names that can +be used. + +>>> from pytz import all_timezones +>>> len(all_timezones) >= 500 +True +>>> 'Etc/Greenwich' in all_timezones +True + +``common_timezones`` is a list of useful, current timezones. It doesn't +contain deprecated zones or historical zones, except for a few I've +deemed in common usage, such as US/Eastern (open a bug report if you +think other timezones are deserving of being included here). It is also +a sequence of strings. + +>>> from pytz import common_timezones +>>> len(common_timezones) < len(all_timezones) +True +>>> 'Etc/Greenwich' in common_timezones +False +>>> 'Australia/Melbourne' in common_timezones +True +>>> 'US/Eastern' in common_timezones +True +>>> 'Canada/Eastern' in common_timezones +True +>>> 'Australia/Yancowinna' in all_timezones +True +>>> 'Australia/Yancowinna' in common_timezones +False + +Both ``common_timezones`` and ``all_timezones`` are alphabetically +sorted: + +>>> common_timezones_dupe = common_timezones[:] +>>> common_timezones_dupe.sort() +>>> common_timezones == common_timezones_dupe +True +>>> all_timezones_dupe = all_timezones[:] +>>> all_timezones_dupe.sort() +>>> all_timezones == all_timezones_dupe +True + +``all_timezones`` and ``common_timezones`` are also available as sets. + +>>> from pytz import all_timezones_set, common_timezones_set +>>> 'US/Eastern' in all_timezones_set +True +>>> 'US/Eastern' in common_timezones_set +True +>>> 'Australia/Victoria' in common_timezones_set +False + +You can also retrieve lists of timezones used by particular countries +using the ``country_timezones()`` function. It requires an ISO-3166 +two letter country code. 
+ +>>> from pytz import country_timezones +>>> print(' '.join(country_timezones('ch'))) +Europe/Zurich +>>> print(' '.join(country_timezones('CH'))) +Europe/Zurich + + +Internationalization - i18n/l10n +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pytz is an interface to the IANA database, which uses ASCII names. The `Unicode Consortium's Unicode Locales (CLDR) `_ +project provides translations. Thomas Khyn's +`l18n `_ package can be used to access +these translations from Python. + + +License +~~~~~~~ + +MIT license. + +This code is also available as part of Zope 3 under the Zope Public +License, Version 2.1 (ZPL). + +I'm happy to relicense this code if necessary for inclusion in other +open source projects. + + +Latest Versions +~~~~~~~~~~~~~~~ + +This package will be updated after releases of the Olson timezone +database. The latest version can be downloaded from the `Python Package +Index `_. The code that is used +to generate this distribution is hosted on launchpad.net and available +using git:: + + git clone https://git.launchpad.net/pytz + +A mirror on github is also available at https://github.com/stub42/pytz + +Announcements of new releases are made on +`Launchpad `_, and the +`Atom feed `_ +hosted there. + + +Bugs, Feature Requests & Patches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Bugs can be reported using `Launchpad `__. + + +Issues & Limitations +~~~~~~~~~~~~~~~~~~~~ + +- Offsets from UTC are rounded to the nearest whole minute, so timezones + such as Europe/Amsterdam pre 1937 will be up to 30 seconds out. This + is a limitation of the Python datetime library. + +- If you think a timezone definition is incorrect, I probably can't fix + it. pytz is a direct translation of the Olson timezone database, and + changes to the timezone definitions need to be made to this source. + If you find errors they should be reported to the time zone mailing + list, linked from http://www.iana.org/time-zones. + + +Further Reading +~~~~~~~~~~~~~~~ + +More info than you want to know about timezones: +http://www.twinsun.com/tz/tz-link.htm + + +Contact +~~~~~~~ + +Stuart Bishop + + + + diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/INSTALLER b/venv/Lib/site-packages/pytz-2019.3.dist-info/INSTALLER new file mode 100644 index 0000000..a1b589e --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/LICENSE.txt b/venv/Lib/site-packages/pytz-2019.3.dist-info/LICENSE.txt new file mode 100644 index 0000000..7c901fd --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/LICENSE.txt @@ -0,0 +1,19 @@ +Copyright (c) 2003-2018 Stuart Bishop + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/METADATA b/venv/Lib/site-packages/pytz-2019.3.dist-info/METADATA new file mode 100644 index 0000000..b01a352 --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/METADATA @@ -0,0 +1,618 @@ +Metadata-Version: 2.0 +Name: pytz +Version: 2019.3 +Summary: World timezone definitions, modern and historical +Home-page: http://pythonhosted.org/pytz +Author: Stuart Bishop +Author-email: stuart@stuartbishop.net +Maintainer: Stuart Bishop +Maintainer-email: stuart@stuartbishop.net +License: MIT +Download-URL: https://pypi.org/project/pytz/ +Keywords: timezone,tzinfo,datetime,olson,time +Platform: Independent +Classifier: Development Status :: 6 - Mature +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Natural Language :: English +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.4 +Classifier: Programming Language :: Python :: 2.5 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.0 +Classifier: Programming Language :: Python :: 3.1 +Classifier: Programming Language :: Python :: 3.2 +Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Topic :: Software Development :: Libraries :: Python Modules + +pytz - World Timezone Definitions for Python +============================================ + +:Author: Stuart Bishop + +Introduction +~~~~~~~~~~~~ + +pytz brings the Olson tz database into Python. This library allows +accurate and cross platform timezone calculations using Python 2.4 +or higher. It also solves the issue of ambiguous times at the end +of daylight saving time, which you can read more about in the Python +Library Reference (``datetime.tzinfo``). + +Almost all of the Olson timezones are supported. + +.. note:: + + This library differs from the documented Python API for + tzinfo implementations; if you want to create local wallclock + times you need to use the ``localize()`` method documented in this + document. In addition, if you perform date arithmetic on local + times that cross DST boundaries, the result may be in an incorrect + timezone (ie. subtract 1 minute from 2002-10-27 1:00 EST and you get + 2002-10-27 0:59 EST instead of the correct 2002-10-27 1:59 EDT). A + ``normalize()`` method is provided to correct this. Unfortunately these + issues cannot be resolved without modifying the Python datetime + implementation (see PEP-431). + + +Installation +~~~~~~~~~~~~ + +This package can either be installed using ``pip`` or from a tarball using the +standard Python distutils. 
+ +If you are installing using ``pip``, you don't need to download anything as the +latest version will be downloaded for you from PyPI:: + + pip install pytz + +If you are installing from a tarball, run the following command as an +administrative user:: + + python setup.py install + + +Example & Usage +~~~~~~~~~~~~~~~ + +Localized times and date arithmetic +----------------------------------- + +>>> from datetime import datetime, timedelta +>>> from pytz import timezone +>>> import pytz +>>> utc = pytz.utc +>>> utc.zone +'UTC' +>>> eastern = timezone('US/Eastern') +>>> eastern.zone +'US/Eastern' +>>> amsterdam = timezone('Europe/Amsterdam') +>>> fmt = '%Y-%m-%d %H:%M:%S %Z%z' + +This library only supports two ways of building a localized time. The +first is to use the ``localize()`` method provided by the pytz library. +This is used to localize a naive datetime (datetime with no timezone +information): + +>>> loc_dt = eastern.localize(datetime(2002, 10, 27, 6, 0, 0)) +>>> print(loc_dt.strftime(fmt)) +2002-10-27 06:00:00 EST-0500 + +The second way of building a localized time is by converting an existing +localized time using the standard ``astimezone()`` method: + +>>> ams_dt = loc_dt.astimezone(amsterdam) +>>> ams_dt.strftime(fmt) +'2002-10-27 12:00:00 CET+0100' + +Unfortunately using the tzinfo argument of the standard datetime +constructors ''does not work'' with pytz for many timezones. + +>>> datetime(2002, 10, 27, 12, 0, 0, tzinfo=amsterdam).strftime(fmt) # /!\ Does not work this way! +'2002-10-27 12:00:00 LMT+0020' + +It is safe for timezones without daylight saving transitions though, such +as UTC: + +>>> datetime(2002, 10, 27, 12, 0, 0, tzinfo=pytz.utc).strftime(fmt) # /!\ Not recommended except for UTC +'2002-10-27 12:00:00 UTC+0000' + +The preferred way of dealing with times is to always work in UTC, +converting to localtime only when generating output to be read +by humans. + +>>> utc_dt = datetime(2002, 10, 27, 6, 0, 0, tzinfo=utc) +>>> loc_dt = utc_dt.astimezone(eastern) +>>> loc_dt.strftime(fmt) +'2002-10-27 01:00:00 EST-0500' + +This library also allows you to do date arithmetic using local +times, although it is more complicated than working in UTC as you +need to use the ``normalize()`` method to handle daylight saving time +and other timezone transitions. In this example, ``loc_dt`` is set +to the instant when daylight saving time ends in the US/Eastern +timezone. + +>>> before = loc_dt - timedelta(minutes=10) +>>> before.strftime(fmt) +'2002-10-27 00:50:00 EST-0500' +>>> eastern.normalize(before).strftime(fmt) +'2002-10-27 01:50:00 EDT-0400' +>>> after = eastern.normalize(before + timedelta(minutes=20)) +>>> after.strftime(fmt) +'2002-10-27 01:10:00 EST-0500' + +Creating local times is also tricky, and the reason why working with +local times is not recommended. Unfortunately, you cannot just pass +a ``tzinfo`` argument when constructing a datetime (see the next +section for more details) + +>>> dt = datetime(2002, 10, 27, 1, 30, 0) +>>> dt1 = eastern.localize(dt, is_dst=True) +>>> dt1.strftime(fmt) +'2002-10-27 01:30:00 EDT-0400' +>>> dt2 = eastern.localize(dt, is_dst=False) +>>> dt2.strftime(fmt) +'2002-10-27 01:30:00 EST-0500' + +Converting between timezones is more easily done, using the +standard astimezone method. 
+ +>>> utc_dt = utc.localize(datetime.utcfromtimestamp(1143408899)) +>>> utc_dt.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> au_tz = timezone('Australia/Sydney') +>>> au_dt = utc_dt.astimezone(au_tz) +>>> au_dt.strftime(fmt) +'2006-03-27 08:34:59 AEDT+1100' +>>> utc_dt2 = au_dt.astimezone(utc) +>>> utc_dt2.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> utc_dt == utc_dt2 +True + +You can take shortcuts when dealing with the UTC side of timezone +conversions. ``normalize()`` and ``localize()`` are not really +necessary when there are no daylight saving time transitions to +deal with. + +>>> utc_dt = datetime.utcfromtimestamp(1143408899).replace(tzinfo=utc) +>>> utc_dt.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' +>>> au_tz = timezone('Australia/Sydney') +>>> au_dt = au_tz.normalize(utc_dt.astimezone(au_tz)) +>>> au_dt.strftime(fmt) +'2006-03-27 08:34:59 AEDT+1100' +>>> utc_dt2 = au_dt.astimezone(utc) +>>> utc_dt2.strftime(fmt) +'2006-03-26 21:34:59 UTC+0000' + + +``tzinfo`` API +-------------- + +The ``tzinfo`` instances returned by the ``timezone()`` function have +been extended to cope with ambiguous times by adding an ``is_dst`` +parameter to the ``utcoffset()``, ``dst()`` && ``tzname()`` methods. + +>>> tz = timezone('America/St_Johns') + +>>> normal = datetime(2009, 9, 1) +>>> ambiguous = datetime(2009, 10, 31, 23, 30) + +The ``is_dst`` parameter is ignored for most timestamps. It is only used +during DST transition ambiguous periods to resolve that ambiguity. + +>>> print(tz.utcoffset(normal, is_dst=True)) +-1 day, 21:30:00 +>>> print(tz.dst(normal, is_dst=True)) +1:00:00 +>>> tz.tzname(normal, is_dst=True) +'NDT' + +>>> print(tz.utcoffset(ambiguous, is_dst=True)) +-1 day, 21:30:00 +>>> print(tz.dst(ambiguous, is_dst=True)) +1:00:00 +>>> tz.tzname(ambiguous, is_dst=True) +'NDT' + +>>> print(tz.utcoffset(normal, is_dst=False)) +-1 day, 21:30:00 +>>> tz.dst(normal, is_dst=False) +datetime.timedelta(0, 3600) +>>> tz.tzname(normal, is_dst=False) +'NDT' + +>>> print(tz.utcoffset(ambiguous, is_dst=False)) +-1 day, 20:30:00 +>>> tz.dst(ambiguous, is_dst=False) +datetime.timedelta(0) +>>> tz.tzname(ambiguous, is_dst=False) +'NST' + +If ``is_dst`` is not specified, ambiguous timestamps will raise +an ``pytz.exceptions.AmbiguousTimeError`` exception. + +>>> print(tz.utcoffset(normal)) +-1 day, 21:30:00 +>>> print(tz.dst(normal)) +1:00:00 +>>> tz.tzname(normal) +'NDT' + +>>> import pytz.exceptions +>>> try: +... tz.utcoffset(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 +>>> try: +... tz.dst(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 +>>> try: +... tz.tzname(ambiguous) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % ambiguous) +pytz.exceptions.AmbiguousTimeError: 2009-10-31 23:30:00 + + +Problems with Localtime +~~~~~~~~~~~~~~~~~~~~~~~ + +The major problem we have to deal with is that certain datetimes +may occur twice in a year. For example, in the US/Eastern timezone +on the last Sunday morning in October, the following sequence +happens: + + - 01:00 EDT occurs + - 1 hour later, instead of 2:00am the clock is turned back 1 hour + and 01:00 happens again (this time 01:00 EST) + +In fact, every instant between 01:00 and 02:00 occurs twice. 
This means +that if you try and create a time in the 'US/Eastern' timezone +the standard datetime syntax, there is no way to specify if you meant +before of after the end-of-daylight-saving-time transition. Using the +pytz custom syntax, the best you can do is make an educated guess: + +>>> loc_dt = eastern.localize(datetime(2002, 10, 27, 1, 30, 00)) +>>> loc_dt.strftime(fmt) +'2002-10-27 01:30:00 EST-0500' + +As you can see, the system has chosen one for you and there is a 50% +chance of it being out by one hour. For some applications, this does +not matter. However, if you are trying to schedule meetings with people +in different timezones or analyze log files it is not acceptable. + +The best and simplest solution is to stick with using UTC. The pytz +package encourages using UTC for internal timezone representation by +including a special UTC implementation based on the standard Python +reference implementation in the Python documentation. + +The UTC timezone unpickles to be the same instance, and pickles to a +smaller size than other pytz tzinfo instances. The UTC implementation +can be obtained as pytz.utc, pytz.UTC, or pytz.timezone('UTC'). + +>>> import pickle, pytz +>>> dt = datetime(2005, 3, 1, 14, 13, 21, tzinfo=utc) +>>> naive = dt.replace(tzinfo=None) +>>> p = pickle.dumps(dt, 1) +>>> naive_p = pickle.dumps(naive, 1) +>>> len(p) - len(naive_p) +17 +>>> new = pickle.loads(p) +>>> new == dt +True +>>> new is dt +False +>>> new.tzinfo is dt.tzinfo +True +>>> pytz.utc is pytz.UTC is pytz.timezone('UTC') +True + +Note that some other timezones are commonly thought of as the same (GMT, +Greenwich, Universal, etc.). The definition of UTC is distinct from these +other timezones, and they are not equivalent. For this reason, they will +not compare the same in Python. + +>>> utc == pytz.timezone('GMT') +False + +See the section `What is UTC`_, below. + +If you insist on working with local times, this library provides a +facility for constructing them unambiguously: + +>>> loc_dt = datetime(2002, 10, 27, 1, 30, 00) +>>> est_dt = eastern.localize(loc_dt, is_dst=True) +>>> edt_dt = eastern.localize(loc_dt, is_dst=False) +>>> print(est_dt.strftime(fmt) + ' / ' + edt_dt.strftime(fmt)) +2002-10-27 01:30:00 EDT-0400 / 2002-10-27 01:30:00 EST-0500 + +If you pass None as the is_dst flag to localize(), pytz will refuse to +guess and raise exceptions if you try to build ambiguous or non-existent +times. + +For example, 1:30am on 27th Oct 2002 happened twice in the US/Eastern +timezone when the clocks where put back at the end of Daylight Saving +Time: + +>>> dt = datetime(2002, 10, 27, 1, 30, 00) +>>> try: +... eastern.localize(dt, is_dst=None) +... except pytz.exceptions.AmbiguousTimeError: +... print('pytz.exceptions.AmbiguousTimeError: %s' % dt) +pytz.exceptions.AmbiguousTimeError: 2002-10-27 01:30:00 + +Similarly, 2:30am on 7th April 2002 never happened at all in the +US/Eastern timezone, as the clocks where put forward at 2:00am skipping +the entire hour: + +>>> dt = datetime(2002, 4, 7, 2, 30, 00) +>>> try: +... eastern.localize(dt, is_dst=None) +... except pytz.exceptions.NonExistentTimeError: +... 
print('pytz.exceptions.NonExistentTimeError: %s' % dt) +pytz.exceptions.NonExistentTimeError: 2002-04-07 02:30:00 + +Both of these exceptions share a common base class to make error handling +easier: + +>>> isinstance(pytz.AmbiguousTimeError(), pytz.InvalidTimeError) +True +>>> isinstance(pytz.NonExistentTimeError(), pytz.InvalidTimeError) +True + + +A special case is where countries change their timezone definitions +with no daylight savings time switch. For example, in 1915 Warsaw +switched from Warsaw time to Central European time with no daylight savings +transition. So at the stroke of midnight on August 5th 1915 the clocks +were wound back 24 minutes creating an ambiguous time period that cannot +be specified without referring to the timezone abbreviation or the +actual UTC offset. In this case midnight happened twice, neither time +during a daylight saving time period. pytz handles this transition by +treating the ambiguous period before the switch as daylight savings +time, and the ambiguous period after as standard time. + + +>>> warsaw = pytz.timezone('Europe/Warsaw') +>>> amb_dt1 = warsaw.localize(datetime(1915, 8, 4, 23, 59, 59), is_dst=True) +>>> amb_dt1.strftime(fmt) +'1915-08-04 23:59:59 WMT+0124' +>>> amb_dt2 = warsaw.localize(datetime(1915, 8, 4, 23, 59, 59), is_dst=False) +>>> amb_dt2.strftime(fmt) +'1915-08-04 23:59:59 CET+0100' +>>> switch_dt = warsaw.localize(datetime(1915, 8, 5, 00, 00, 00), is_dst=False) +>>> switch_dt.strftime(fmt) +'1915-08-05 00:00:00 CET+0100' +>>> str(switch_dt - amb_dt1) +'0:24:01' +>>> str(switch_dt - amb_dt2) +'0:00:01' + +The best way of creating a time during an ambiguous time period is +by converting from another timezone such as UTC: + +>>> utc_dt = datetime(1915, 8, 4, 22, 36, tzinfo=pytz.utc) +>>> utc_dt.astimezone(warsaw).strftime(fmt) +'1915-08-04 23:36:00 CET+0100' + +The standard Python way of handling all these ambiguities is not to +handle them, such as demonstrated in this example using the US/Eastern +timezone definition from the Python documentation (Note that this +implementation only works for dates between 1987 and 2006 - it is +included for tests only!): + +>>> from pytz.reference import Eastern # pytz.reference only for tests +>>> dt = datetime(2002, 10, 27, 0, 30, tzinfo=Eastern) +>>> str(dt) +'2002-10-27 00:30:00-04:00' +>>> str(dt + timedelta(hours=1)) +'2002-10-27 01:30:00-05:00' +>>> str(dt + timedelta(hours=2)) +'2002-10-27 02:30:00-05:00' +>>> str(dt + timedelta(hours=3)) +'2002-10-27 03:30:00-05:00' + +Notice the first two results? At first glance you might think they are +correct, but taking the UTC offset into account you find that they are +actually two hours appart instead of the 1 hour we asked for. + +>>> from pytz.reference import UTC # pytz.reference only for tests +>>> str(dt.astimezone(UTC)) +'2002-10-27 04:30:00+00:00' +>>> str((dt + timedelta(hours=1)).astimezone(UTC)) +'2002-10-27 06:30:00+00:00' + + +Country Information +~~~~~~~~~~~~~~~~~~~ + +A mechanism is provided to access the timezones commonly in use +for a particular country, looked up using the ISO 3166 country code. 
+It returns a list of strings that can be used to retrieve the relevant +tzinfo instance using ``pytz.timezone()``: + +>>> print(' '.join(pytz.country_timezones['nz'])) +Pacific/Auckland Pacific/Chatham + +The Olson database comes with a ISO 3166 country code to English country +name mapping that pytz exposes as a dictionary: + +>>> print(pytz.country_names['nz']) +New Zealand + + +What is UTC +~~~~~~~~~~~ + +'UTC' is `Coordinated Universal Time`_. It is a successor to, but distinct +from, Greenwich Mean Time (GMT) and the various definitions of Universal +Time. UTC is now the worldwide standard for regulating clocks and time +measurement. + +All other timezones are defined relative to UTC, and include offsets like +UTC+0800 - hours to add or subtract from UTC to derive the local time. No +daylight saving time occurs in UTC, making it a useful timezone to perform +date arithmetic without worrying about the confusion and ambiguities caused +by daylight saving time transitions, your country changing its timezone, or +mobile computers that roam through multiple timezones. + +.. _Coordinated Universal Time: https://en.wikipedia.org/wiki/Coordinated_Universal_Time + + +Helpers +~~~~~~~ + +There are two lists of timezones provided. + +``all_timezones`` is the exhaustive list of the timezone names that can +be used. + +>>> from pytz import all_timezones +>>> len(all_timezones) >= 500 +True +>>> 'Etc/Greenwich' in all_timezones +True + +``common_timezones`` is a list of useful, current timezones. It doesn't +contain deprecated zones or historical zones, except for a few I've +deemed in common usage, such as US/Eastern (open a bug report if you +think other timezones are deserving of being included here). It is also +a sequence of strings. + +>>> from pytz import common_timezones +>>> len(common_timezones) < len(all_timezones) +True +>>> 'Etc/Greenwich' in common_timezones +False +>>> 'Australia/Melbourne' in common_timezones +True +>>> 'US/Eastern' in common_timezones +True +>>> 'Canada/Eastern' in common_timezones +True +>>> 'Australia/Yancowinna' in all_timezones +True +>>> 'Australia/Yancowinna' in common_timezones +False + +Both ``common_timezones`` and ``all_timezones`` are alphabetically +sorted: + +>>> common_timezones_dupe = common_timezones[:] +>>> common_timezones_dupe.sort() +>>> common_timezones == common_timezones_dupe +True +>>> all_timezones_dupe = all_timezones[:] +>>> all_timezones_dupe.sort() +>>> all_timezones == all_timezones_dupe +True + +``all_timezones`` and ``common_timezones`` are also available as sets. + +>>> from pytz import all_timezones_set, common_timezones_set +>>> 'US/Eastern' in all_timezones_set +True +>>> 'US/Eastern' in common_timezones_set +True +>>> 'Australia/Victoria' in common_timezones_set +False + +You can also retrieve lists of timezones used by particular countries +using the ``country_timezones()`` function. It requires an ISO-3166 +two letter country code. + +>>> from pytz import country_timezones +>>> print(' '.join(country_timezones('ch'))) +Europe/Zurich +>>> print(' '.join(country_timezones('CH'))) +Europe/Zurich + + +Internationalization - i18n/l10n +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pytz is an interface to the IANA database, which uses ASCII names. The `Unicode Consortium's Unicode Locales (CLDR) `_ +project provides translations. Thomas Khyn's +`l18n `_ package can be used to access +these translations from Python. + + +License +~~~~~~~ + +MIT license. 
+ +This code is also available as part of Zope 3 under the Zope Public +License, Version 2.1 (ZPL). + +I'm happy to relicense this code if necessary for inclusion in other +open source projects. + + +Latest Versions +~~~~~~~~~~~~~~~ + +This package will be updated after releases of the Olson timezone +database. The latest version can be downloaded from the `Python Package +Index `_. The code that is used +to generate this distribution is hosted on launchpad.net and available +using git:: + + git clone https://git.launchpad.net/pytz + +A mirror on github is also available at https://github.com/stub42/pytz + +Announcements of new releases are made on +`Launchpad `_, and the +`Atom feed `_ +hosted there. + + +Bugs, Feature Requests & Patches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Bugs can be reported using `Launchpad `__. + + +Issues & Limitations +~~~~~~~~~~~~~~~~~~~~ + +- Offsets from UTC are rounded to the nearest whole minute, so timezones + such as Europe/Amsterdam pre 1937 will be up to 30 seconds out. This + is a limitation of the Python datetime library. + +- If you think a timezone definition is incorrect, I probably can't fix + it. pytz is a direct translation of the Olson timezone database, and + changes to the timezone definitions need to be made to this source. + If you find errors they should be reported to the time zone mailing + list, linked from http://www.iana.org/time-zones. + + +Further Reading +~~~~~~~~~~~~~~~ + +More info than you want to know about timezones: +http://www.twinsun.com/tz/tz-link.htm + + +Contact +~~~~~~~ + +Stuart Bishop + + + + diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/RECORD b/venv/Lib/site-packages/pytz-2019.3.dist-info/RECORD new file mode 100644 index 0000000..858c8df --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/RECORD @@ -0,0 +1,620 @@ +pytz/__init__.py,sha256=Hb9yJWhEFtAjr_1VMQg3g3bEq33EA6kNLmrLgDnEItI,34765 +pytz/exceptions.py,sha256=_GCDPHpBk2r-CQIg3Kcyw8RCsLm2teJdnzT85bl5VsM,1329 +pytz/lazy.py,sha256=toeR5uDWKBj6ezsUZ4elNP6CEMtK7CO2jS9A30nsFbo,5404 +pytz/reference.py,sha256=zUtCki7JFEmrzrjNsfMD7YL0lWDxynKc1Ubo4iXSs74,3778 +pytz/tzfile.py,sha256=g2CMhXZ1PX2slgg5_Kk9TvmIkVKeOjbuONHEfZP6jMk,4745 +pytz/tzinfo.py,sha256=-5UjW-yqHbtO5NtSaWope7EbSdf2oTES26Kdlxjqdk0,19272 +pytz/zoneinfo/CET,sha256=o4omkrM_IsITxooUo8krM921XfBdvRs9JhwGXGd-Ypg,2094 +pytz/zoneinfo/CST6CDT,sha256=WGbtZ1FwjRX6Jeo_TCXKsfeDs4V9uhXGJfcnLJhk3s0,2310 +pytz/zoneinfo/Cuba,sha256=HUQeAuKBsEkI5SLZjqynXICOUVOajkKzKH5r-Ov5Odc,2416 +pytz/zoneinfo/EET,sha256=gGVsW5-qnI7ty8vqVK1ADWhunrvAT8kUC79GUf-_7G8,1908 +pytz/zoneinfo/EST,sha256=uKE_VPKfxGyYEsyqV_DdE2MW55vs_qUioOdIn5Goobc,114 +pytz/zoneinfo/EST5EDT,sha256=fwzEMT1jgnY2dDjd0EqDl26_7LC-oF48Bd4ng5311H0,2310 +pytz/zoneinfo/Egypt,sha256=L6zLQLnQtLkEELOGfm6USaHY33qAEPgGV822-iU1vxc,1955 +pytz/zoneinfo/Eire,sha256=-JSA3vsi44F1DE8supVjSppH2Vpp12WjJI0_COtAmqU,3492 +pytz/zoneinfo/Factory,sha256=aFFlKx93HXoJoF4SSuTlD8cZtJA-ne5oKzAa6eX2V4k,116 +pytz/zoneinfo/GB,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/GB-Eire,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/GMT,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/GMT+0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/GMT-0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/GMT0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Greenwich,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 
+pytz/zoneinfo/HST,sha256=1YkCncvgL9Z5CmUo4Vk8VbQmgA7ZAQ0PtE37j1yOli8,115 +pytz/zoneinfo/Hongkong,sha256=UcnFEc9S8hMWl9giVXni4TAhLPWX0H12XvwSt4AJHew,1203 +pytz/zoneinfo/Iceland,sha256=mSkaRBGZLeUrm88EeHcaWnEd35Wn-Ag2G10HtI3G2fg,1162 +pytz/zoneinfo/Iran,sha256=ATT50Q0hK6uSba5_WnOE3Px0OWxIwxaqK5Oi10P2A-M,2582 +pytz/zoneinfo/Israel,sha256=xpEJ_vI7aMV0iFD5BN9sq71vlYUZj5Q613TUUwZl1Ww,2288 +pytz/zoneinfo/Jamaica,sha256=wlagieUPRf5-beie-h7QsONbNzjGsm8vMs8uf28pw28,482 +pytz/zoneinfo/Japan,sha256=oCueZgRNxcNcX3ZGdif9y6Su4cyVhga4XHdwlcrYLOs,309 +pytz/zoneinfo/Kwajalein,sha256=L4nH3qxv5EBKVRxYt67b9IfZfBzg5KJk19iu7x3oBMk,316 +pytz/zoneinfo/Libya,sha256=W1dptGD70T7ppGoo0fczFQeDiIp0nultLNPV66MwB2c,625 +pytz/zoneinfo/MET,sha256=i3CKSuP4N_PAj7o-Cbk8zPEdFs0CWWBCAfg2JXDx5V8,2094 +pytz/zoneinfo/MST,sha256=6IQwvtT12Bz1pTiqFuoVxNY-4ViS7ZrYHo5nPWwzKPw,114 +pytz/zoneinfo/MST7MDT,sha256=910Ek32FKoSyZWY_H19VHaVvqb-JsvnWTOOHvhrKsE0,2310 +pytz/zoneinfo/NZ,sha256=gADjoyPo_QISQU6UJrAgcHp3HDaMoOFRdH-d23uBSyc,2437 +pytz/zoneinfo/NZ-CHAT,sha256=lkVqaSF1WWpv_B2K-k2uJp2setRVK6XbjsQ38gDGVEg,2068 +pytz/zoneinfo/Navajo,sha256=6_yPo1_mvnt9DgpPzr0QdHsjdsfUG6ALnagQLML1DSM,2444 +pytz/zoneinfo/PRC,sha256=-fm6S1oS3D2M1qZpgZBlGQnyQrEwixXmz4NsHzmDzWU,533 +pytz/zoneinfo/PST8PDT,sha256=Q7TCLkE69a6g7mPoPAkqhg-0dStyiAC0jVlM72KG_R8,2310 +pytz/zoneinfo/Poland,sha256=TiLDPbeVF0ckgLVEkaSeDaKZ8wctdJDOl_HE_Wd5rKs,2654 +pytz/zoneinfo/Portugal,sha256=L6n3snx6pNHHJIL6JOLFOAlYkQ2J5uB_y5MG_Ic_PDU,3469 +pytz/zoneinfo/ROC,sha256=DMmQwOpPql25ue3Nf8vAKKT4em06D1Z9rHbLIitxixk,761 +pytz/zoneinfo/ROK,sha256=LI9LsV3XcJC0l-KoQf8zI-y7rk-du57erS-N2Ptdi7Q,617 +pytz/zoneinfo/Singapore,sha256=hIgr_LHMTWh3GgeG-MmLHBp-9anUxQcfMlKFtX8WvmU,383 +pytz/zoneinfo/Turkey,sha256=2S0A_f7VxvyErJMMCPqK33AChA29IVkMr1o-SpMtMxk,1947 +pytz/zoneinfo/UCT,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/UTC,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/Universal,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/W-SU,sha256=KmkofRcj6T8Ph28PJChm8JVp13uRvef6TZ0GuPzUiDw,1535 +pytz/zoneinfo/WET,sha256=Sc0l03EfVs_aIi17I4KyZJFkwiAHat5BgpjuuFDhgQ0,1905 +pytz/zoneinfo/Zulu,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/iso3166.tab,sha256=BMh_yY7MXp8DMEy71jarFX3IJSNpwuEyIjIo2HKUXD4,4463 +pytz/zoneinfo/leapseconds,sha256=Fat01V08akMNzRV3QFeMpJ9CIrjLiBNCeA30VvlkEBs,2892 +pytz/zoneinfo/posixrules,sha256=7AoiEGjr3wV4P7C4Qs35COZqwr2mjNDq7ocpsSPFOM8,3536 +pytz/zoneinfo/tzdata.zi,sha256=T2kpL5-JhnhFRoH1fAjJf0F5C3eUpwKyM6-l0QPkvNI,111561 +pytz/zoneinfo/zone.tab,sha256=Bt0UDBVS6uQBo53kgXo9bhEQUlfyp0Cxp8YRNNbLOfw,19424 +pytz/zoneinfo/zone1970.tab,sha256=7ahFA6nJ-GkeE6mzgJxMLyfAeDekvh9SvF1NSs1Xi5E,17938 +pytz/zoneinfo/Africa/Abidjan,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Accra,sha256=Wu58Zep-DvaYnW1ahhz2jCDsTtOixcWpUWpYfa1NPqc,816 +pytz/zoneinfo/Africa/Addis_Ababa,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Algiers,sha256=k56yX_gxM4qFb7dez04soVIV-qD66Thd2PJxaS-ChHc,735 +pytz/zoneinfo/Africa/Asmara,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Asmera,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Bamako,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Bangui,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Banjul,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 
+pytz/zoneinfo/Africa/Bissau,sha256=IjuxDP6EZiDHFvl_bHS6NN7sdRxLKXllooBC829poak,194 +pytz/zoneinfo/Africa/Blantyre,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Brazzaville,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Bujumbura,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Cairo,sha256=L6zLQLnQtLkEELOGfm6USaHY33qAEPgGV822-iU1vxc,1955 +pytz/zoneinfo/Africa/Casablanca,sha256=CZnOkI88s0v9mwk2CtPTVVWip-bSwKryAHOnkJslHZ8,2429 +pytz/zoneinfo/Africa/Ceuta,sha256=jp7xqONgZ3NPnElHzJEVusHKM9rxDK1nxJm4-i7Ln8o,2036 +pytz/zoneinfo/Africa/Conakry,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Dakar,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Dar_es_Salaam,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Djibouti,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Douala,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/El_Aaiun,sha256=n-Z3vtT7eq64AbHbItUQUazpNwGfgASM3MAHsrz87rU,2295 +pytz/zoneinfo/Africa/Freetown,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Gaborone,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Harare,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Johannesburg,sha256=bBvMdSZo53WFowiuhUO9C8zY6BOGViboCb-U8_49l34,246 +pytz/zoneinfo/Africa/Juba,sha256=TC4SaGEzvtDtdyU6lfxdqVQqDsNklvVokhqHZt_YteU,653 +pytz/zoneinfo/Africa/Kampala,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Khartoum,sha256=MYWDoJ3AcCItZdApoeOgtWWDDxquwTon5v5TOGP70-o,679 +pytz/zoneinfo/Africa/Kigali,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Kinshasa,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Lagos,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Libreville,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Lome,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Luanda,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Lubumbashi,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Lusaka,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Malabo,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Maputo,sha256=k_GelVHViGiuWCB1LSyTpIYSTDZEY9yclInQRY-LxoI,149 +pytz/zoneinfo/Africa/Maseru,sha256=bBvMdSZo53WFowiuhUO9C8zY6BOGViboCb-U8_49l34,246 +pytz/zoneinfo/Africa/Mbabane,sha256=bBvMdSZo53WFowiuhUO9C8zY6BOGViboCb-U8_49l34,246 +pytz/zoneinfo/Africa/Mogadishu,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Monrovia,sha256=-VsJW5cU4KdvfgYaQVv4lcuzmaKIVFMd42nO6RXOBdU,208 +pytz/zoneinfo/Africa/Nairobi,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Africa/Ndjamena,sha256=8T3A0Zm9Gj0Bvm6rd88t3GAXKiKdGUfHlIqYlkYI0KM,199 +pytz/zoneinfo/Africa/Niamey,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Nouakchott,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Ouagadougou,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Porto-Novo,sha256=xqLNGqbjHZ1JuIHsEXP9ttXSb3v-GWp98SJ14pL6sUw,149 +pytz/zoneinfo/Africa/Sao_Tome,sha256=MdjxpQ268uzJ7Zx1ZroFUtRUwqsJ6F_yY3AYV9FXw1I,254 
+pytz/zoneinfo/Africa/Timbuktu,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Africa/Tripoli,sha256=W1dptGD70T7ppGoo0fczFQeDiIp0nultLNPV66MwB2c,625 +pytz/zoneinfo/Africa/Tunis,sha256=OFVMEM4eYT2Ez0beuhEUCTSIpcFldWxsV2uEoTZIUNI,689 +pytz/zoneinfo/Africa/Windhoek,sha256=xuhvudrMH4alnVmouSTQI8YL8F_HbgsF2EQ7AZKzuHs,955 +pytz/zoneinfo/America/Adak,sha256=IB1DhwJQAKbhPJ9jHLf8zW5Dad7HIkBS-dhv64E1OlM,2356 +pytz/zoneinfo/America/Anchorage,sha256=oZA1NSPS2BWdymYpnCHFO8BlYVS-ll5KLg2Ez9CbETs,2371 +pytz/zoneinfo/America/Anguilla,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Antigua,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Araguaina,sha256=kppiiytmSQeesflyNGYM3r8NVUl1C-ggu08s9_Tt-co,884 +pytz/zoneinfo/America/Aruba,sha256=ZGEIylAZ5iy_rIBsXREtH_ZfWRIkLI9dQjP_EIyn3sY,186 +pytz/zoneinfo/America/Asuncion,sha256=FTLtFk6MjJoh5VIDgJ2Sf4B_iNeCDxrV0MWwQL-sOVM,2044 +pytz/zoneinfo/America/Atikokan,sha256=4a94GtPHUdQ-2sdz9WinsKn9V_QiM4XmFj48FTPMeSA,336 +pytz/zoneinfo/America/Atka,sha256=IB1DhwJQAKbhPJ9jHLf8zW5Dad7HIkBS-dhv64E1OlM,2356 +pytz/zoneinfo/America/Bahia,sha256=cmLkSAAzINlzYGXBqADEU3uPgA9S5nt-p1AV3Zy86VY,1024 +pytz/zoneinfo/America/Bahia_Banderas,sha256=BNjbcHSlPsJ4UpJx-gs1hpIyx2ScBieh1nyDuGb0PcE,1546 +pytz/zoneinfo/America/Barbados,sha256=Y0XwBAv5eSAZHNixVIhRvPxNWqP2a1FCH2_z2Vrd-sc,314 +pytz/zoneinfo/America/Belem,sha256=_258hQZLCEXBX8xRLyQSw-AE-jiDmjVwJX32mN5UUEk,576 +pytz/zoneinfo/America/Belize,sha256=4i5I2QwrrKxujvw_Siu4Nvmb-hySGAjx-Kpkfyt7WGE,948 +pytz/zoneinfo/America/Blanc-Sablon,sha256=tVN5ZPmIO3vc3_ayowg6qbvjheg4OJtDFT9y8IuW334,298 +pytz/zoneinfo/America/Boa_Vista,sha256=V4VVOkrFUV1qUfVp9E974IOJFmA5QxQrctatTBEb-hs,632 +pytz/zoneinfo/America/Bogota,sha256=ZaQKTZi35AMdlROs0vjEDA_phR8ztJOnjA8aLJZ5tHw,246 +pytz/zoneinfo/America/Boise,sha256=Yv4AXa2nSH_oVo3FZqZCR7V7z7c6WnQgKIUyNUpzGXA,2394 +pytz/zoneinfo/America/Buenos_Aires,sha256=ntn_GFHadbrFJ4ZuhU6h2uzbFwmDyS9mXV5S28pkGF8,1076 +pytz/zoneinfo/America/Cambridge_Bay,sha256=Nanl8yH4SshljhEjDe-PZCYEXbUuuZGmkbAAt2dB-bk,2084 +pytz/zoneinfo/America/Campo_Grande,sha256=5BBENR3_8gJp4F_Uj2RRknvRc4JJWNRPnZU9E7tb8QI,1444 +pytz/zoneinfo/America/Cancun,sha256=YR2U5T6mDGd5xm8EVA_TM1NwSRMYPNYWvV7wuthnX0I,782 +pytz/zoneinfo/America/Caracas,sha256=2NpwXPEtQkI82WCZuQWHXf66VCADcawMpfhKTsuA0x4,264 +pytz/zoneinfo/America/Catamarca,sha256=diH1f96kbbY-7gJYQnSCNHs3n9dwHJqUhSdGNx1L7I0,1076 +pytz/zoneinfo/America/Cayenne,sha256=atVbW5ChJiKQ_q-3kFs-DLTTZa9ptkiHkmJlq4AXoY4,198 +pytz/zoneinfo/America/Cayman,sha256=kayA_pdpMcSQ0FjIzotdcf-m1JYfbKE-qcFT8LC8zqA,182 +pytz/zoneinfo/America/Chicago,sha256=4aZFw-svkMyXmSpNufqzK-xveos-oVJDpEyI8Yu9HQE,3576 +pytz/zoneinfo/America/Chihuahua,sha256=cewXJyEw4KCoz33yl8o2tUJZmugBWH4R0Aovdmuqf-o,1484 +pytz/zoneinfo/America/Coral_Harbour,sha256=4a94GtPHUdQ-2sdz9WinsKn9V_QiM4XmFj48FTPMeSA,336 +pytz/zoneinfo/America/Cordoba,sha256=1XqIP8Qo2bPR7909hrAI-qAttybmwEW4ms7FjZA5Yfw,1076 +pytz/zoneinfo/America/Costa_Rica,sha256=74rYa6lrgIkyls9PkHo8SCYl9oOqiuG5S7MWdnJelP4,316 +pytz/zoneinfo/America/Creston,sha256=dNOa71QgQ2d5uh7cl-xZme-8u3nMR9GJ7PSktWIDORQ,208 +pytz/zoneinfo/America/Cuiaba,sha256=M0FsR8T9s4jFSuzD8Qi6pqtb6Rf2NTzyVHKGZrn56n4,1416 +pytz/zoneinfo/America/Curacao,sha256=ZGEIylAZ5iy_rIBsXREtH_ZfWRIkLI9dQjP_EIyn3sY,186 +pytz/zoneinfo/America/Danmarkshavn,sha256=YRZAfUCoVtaL1L-MYMYMH1wyOaVQnfUo_gFnvMXSuzw,698 +pytz/zoneinfo/America/Dawson,sha256=E6UmlysBR0hdkve_79tpRe2z1DORY2hwqKzE--G4ZGs,2084 
+pytz/zoneinfo/America/Dawson_Creek,sha256=aJXCyP4j3ggE4wGCN-LrS9hpD_5zWHzQTeSAKTWEPUM,1050 +pytz/zoneinfo/America/Denver,sha256=6_yPo1_mvnt9DgpPzr0QdHsjdsfUG6ALnagQLML1DSM,2444 +pytz/zoneinfo/America/Detroit,sha256=hecz8yqY2Cj5B61G3gLZdAVZvRgK9l0P90c_gN-uD5g,2230 +pytz/zoneinfo/America/Dominica,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Edmonton,sha256=-TkIfc3QlvaCf0p8COZ43Y1HRBAl-nARUi-JdXeK1vE,2332 +pytz/zoneinfo/America/Eirunepe,sha256=pS90HZzRwH4Tf8ugmKHfiphX7zCPqZkh_0CNb-fEMAM,656 +pytz/zoneinfo/America/El_Salvador,sha256=gvGN8Lkj-sGm2_rs8OUjAMf1oMtKp2Xes6UfWT0WqgU,224 +pytz/zoneinfo/America/Ensenada,sha256=OHHtvy3J70z6wvKBHgPqMEnGs6SXp8fkf0WX9ZiOODk,2342 +pytz/zoneinfo/America/Fort_Nelson,sha256=erfODr3DrSpz65kAdO7Ts2dGbZxvddEP6gx4BX3y2J0,2240 +pytz/zoneinfo/America/Fort_Wayne,sha256=GrNub1_3Um5Qh67wOx58_TEAz4fwAeAlk2AlMTVA_sI,1666 +pytz/zoneinfo/America/Fortaleza,sha256=mITuMrRLRTWyoiF04Oy_UZ8gxZofTpXDblM8t7ch7Sg,716 +pytz/zoneinfo/America/Glace_Bay,sha256=G8DGLGCapH_aYCF_OhaL5Qonf7FOAgAPwelO5htCWBc,2192 +pytz/zoneinfo/America/Godthab,sha256=FtlXWP_hBNuwBHkI2b1yne_tSUJpwLtWLyTHZoFZkmM,1878 +pytz/zoneinfo/America/Goose_Bay,sha256=JgaLueghSvX2g725FOfIgpgvsqxZGykWOhAZWGpQZRY,3210 +pytz/zoneinfo/America/Grand_Turk,sha256=ds3WfGxsBby0eTHik_PEHmmwS0uF-Rd5YdCcb_SAnLw,1848 +pytz/zoneinfo/America/Grenada,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Guadeloupe,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Guatemala,sha256=dugUgCd6QY52yHkHuUP4jRWzo5x439IQigaYCvEF46Q,280 +pytz/zoneinfo/America/Guayaquil,sha256=PbcF4bvGAm-aFwdtGPotJy3kb4NwoyWwxgwL98BeUWA,246 +pytz/zoneinfo/America/Guyana,sha256=mDyb0FtGOpwGPq864vAHX22LY0Pxex94f1wVMyo36d0,236 +pytz/zoneinfo/America/Halifax,sha256=TZpmc5PwWoLfTfQoQ_b3U17BE2iVKSeNkR0Ho8mbTn8,3424 +pytz/zoneinfo/America/Havana,sha256=HUQeAuKBsEkI5SLZjqynXICOUVOajkKzKH5r-Ov5Odc,2416 +pytz/zoneinfo/America/Hermosillo,sha256=9Ij30JYmMscC1XHi4o9v-uSXoUuE8V9zhGz2iV5hVFI,416 +pytz/zoneinfo/America/Indianapolis,sha256=GrNub1_3Um5Qh67wOx58_TEAz4fwAeAlk2AlMTVA_sI,1666 +pytz/zoneinfo/America/Inuvik,sha256=MU_oDiidQaijt1KV0B5h9LqHoCrJ8ieldD9tsiJiX5o,1894 +pytz/zoneinfo/America/Iqaluit,sha256=6PitEMSFWcSb-Io8fvm4oQ_7v39G_qANc6reTjXoZJ0,2032 +pytz/zoneinfo/America/Jamaica,sha256=wlagieUPRf5-beie-h7QsONbNzjGsm8vMs8uf28pw28,482 +pytz/zoneinfo/America/Jujuy,sha256=5HR0TlZFifwJ5nLTmg7yWXgCTx9mRhahfs4_Wq70wOY,1048 +pytz/zoneinfo/America/Juneau,sha256=k7hxb0aGRnfnE-DBi3LkcjAzRPyAf0_Hw0vVFfjGeb0,2353 +pytz/zoneinfo/America/Knox_IN,sha256=BiALShjiOLg1o8mMRWJ1jyTlJkgvwzte7B9WSOvTUNg,2428 +pytz/zoneinfo/America/Kralendijk,sha256=ZGEIylAZ5iy_rIBsXREtH_ZfWRIkLI9dQjP_EIyn3sY,186 +pytz/zoneinfo/America/La_Paz,sha256=PAGF2VU_QOw2xT1Cqdp2P8Aj9hXMVWlCByV7cvfIQ_k,232 +pytz/zoneinfo/America/Lima,sha256=JHDCg95uw6BEu4a4Gfyikm1s8rm8AsYPG8dJxQQNZFs,406 +pytz/zoneinfo/America/Los_Angeles,sha256=VOy1PikdjiVdJ7lukVGzwl8uDxV_KYqznkTm5BLEiDM,2836 +pytz/zoneinfo/America/Louisville,sha256=-yqgeeHZdq6oP3_WzVvYOmqV9HQv8y7ZWmc9bzHvJAY,2772 +pytz/zoneinfo/America/Lower_Princes,sha256=ZGEIylAZ5iy_rIBsXREtH_ZfWRIkLI9dQjP_EIyn3sY,186 +pytz/zoneinfo/America/Maceio,sha256=pzjNghmeHhvF4aI3cDq2G_5t71BSNGIbRAF5NmJyDmw,744 +pytz/zoneinfo/America/Managua,sha256=xBzF01AHn2E2fD8Qdy-DHFe36UqoeNpKPfChduBKWdk,430 +pytz/zoneinfo/America/Manaus,sha256=lp6RlkcXJQ7mSsKqnEgC8svJVrFDJk_16xxvfpNSpK4,604 +pytz/zoneinfo/America/Marigot,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 
+pytz/zoneinfo/America/Martinique,sha256=fMs80kOU2YFvC0f9y2eje97JeAtTYBamXrnlTunNLzQ,232 +pytz/zoneinfo/America/Matamoros,sha256=RlEMOT_zvCLQ8s7TNvRE2PnC4H9JrxO7MGxmfu5xPPI,1390 +pytz/zoneinfo/America/Mazatlan,sha256=aIyre-8trAXSHtqxbuu6gDDkWCUjI_SdAKPIjz74M2E,1526 +pytz/zoneinfo/America/Mendoza,sha256=5DJiYYeQpcLBR_IoIJtk43IswJeGYawx5GykszuJ-Nw,1076 +pytz/zoneinfo/America/Menominee,sha256=Arv9WLbfhNcpRsUjHDU757BEdwlp08Gt30AixG3gZ04,2274 +pytz/zoneinfo/America/Merida,sha256=BJQ5mzAT-akb_EA7WqGdNheCorDqLBnDS_4X3YJz0rc,1422 +pytz/zoneinfo/America/Metlakatla,sha256=twmieGTVY2V-U8nFxqvx7asYv8GVjeWdLtrOI7UApVI,1423 +pytz/zoneinfo/America/Mexico_City,sha256=DSpTe5TT0KBsxGx79Rs7ah-zJpiGOJKwPjztovRN0b4,1584 +pytz/zoneinfo/America/Miquelon,sha256=LNbkN87EnZUa41Xizko5VIN55EyQvf5Kk5b5AfNQG8Q,1666 +pytz/zoneinfo/America/Moncton,sha256=Wmv-bk9aKKcWWzOpc1UFu67HOfwaIk2Wmh3LgqGctys,3154 +pytz/zoneinfo/America/Monterrey,sha256=HA4yn9jQHk9i0PqiB7fSoFdzXtB1DT1cheGRPXrQNdQ,1390 +pytz/zoneinfo/America/Montevideo,sha256=4jcgTegK5X8F0yNYzk-3oySZ4U9XQ09UbTJ_mlu8N70,1510 +pytz/zoneinfo/America/Montreal,sha256=ggOSzbHkmfgu9wTQzP0MUKsrKMbgveuAeThh1eFl1a0,3494 +pytz/zoneinfo/America/Montserrat,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Nassau,sha256=TLJ7tbx0ZIFwvAYgF97DWv9eJ4hmeb5n3RJPWI-uOyM,2258 +pytz/zoneinfo/America/New_York,sha256=7AoiEGjr3wV4P7C4Qs35COZqwr2mjNDq7ocpsSPFOM8,3536 +pytz/zoneinfo/America/Nipigon,sha256=EGPXcOin8mfzFTkYJm4ICpY7fyE24I2pXg4ejafSMyU,2122 +pytz/zoneinfo/America/Nome,sha256=2izM3-P-PqJ9za6MdhzFfMvPFNq7Gim69tAvEwPeY2s,2367 +pytz/zoneinfo/America/Noronha,sha256=3R4lLV8jg5SljhC5OVVCk51Y77Efjo6zCe-oppg_FFo,716 +pytz/zoneinfo/America/Ojinaga,sha256=cO3V-x_1Q-mpbJgKNd6-WTfxDEHBV1aqS4wzVl5A0Q4,1484 +pytz/zoneinfo/America/Panama,sha256=kayA_pdpMcSQ0FjIzotdcf-m1JYfbKE-qcFT8LC8zqA,182 +pytz/zoneinfo/America/Pangnirtung,sha256=P9Kw_I-NxcUYJIr1j40jTn9q7F8TPAE_FqXsfLYF86A,2094 +pytz/zoneinfo/America/Paramaribo,sha256=Hm5tDwUmnoTrTUPEO4WArfSF74ZjywVEocy4kL51FzA,262 +pytz/zoneinfo/America/Phoenix,sha256=nEOwYOnGxENw9zW8m50PGxbtVfTrX3QYAo4x4LgOLfI,328 +pytz/zoneinfo/America/Port-au-Prince,sha256=09ZAJd4IOiMpfdpUuF1U44R_hRt6BvpAkFXOnYO9yOM,1434 +pytz/zoneinfo/America/Port_of_Spain,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Porto_Acre,sha256=17onkm8P_VgMkErjK9rr0qwNni7qp9tgcUZ93g3ltOs,628 +pytz/zoneinfo/America/Porto_Velho,sha256=ZRfzgGEu26hnl3JPtiZLOSFGj_WBSbOKdiLC1xIyc5c,576 +pytz/zoneinfo/America/Puerto_Rico,sha256=hJHlV_-AGoMGUWuMpZRv9fLmghrzFHfrR9fRkcxaZJc,246 +pytz/zoneinfo/America/Punta_Arenas,sha256=kpqStczF3X0yK0lwOcxmwbQM8ZV9MrNktm7orJF-EJc,1902 +pytz/zoneinfo/America/Rainy_River,sha256=r6kx6lD2IzCdygkj-DKyL2tPSn7k0Zil7PSHCBFKOa0,2122 +pytz/zoneinfo/America/Rankin_Inlet,sha256=KpQX97-EuF4MNyxQrtOKP616CK_vjniM-lo14WGVz0c,1892 +pytz/zoneinfo/America/Recife,sha256=ijFN2ZzZe5oBYdl8Ag3SwmGjj2JeVYYX2Vo767g2s6I,716 +pytz/zoneinfo/America/Regina,sha256=yjqT08pHbICYe83H8JmtaDBvCFqRv7Tfze3Y8xuXukw,980 +pytz/zoneinfo/America/Resolute,sha256=VP_u5XsepfSwx7Ou9zjGw2p5Qi10AIA54sP1J2DkppM,1892 +pytz/zoneinfo/America/Rio_Branco,sha256=17onkm8P_VgMkErjK9rr0qwNni7qp9tgcUZ93g3ltOs,628 +pytz/zoneinfo/America/Rosario,sha256=1XqIP8Qo2bPR7909hrAI-qAttybmwEW4ms7FjZA5Yfw,1076 +pytz/zoneinfo/America/Santa_Isabel,sha256=OHHtvy3J70z6wvKBHgPqMEnGs6SXp8fkf0WX9ZiOODk,2342 +pytz/zoneinfo/America/Santarem,sha256=Gl_lI3pPZ57UIYXWcmaTpFqWDA5re6bHh1nWs_Z0-Nc,602 +pytz/zoneinfo/America/Santiago,sha256=GB14PW0xABV283dXc8qL-nnDW-ViFUR3bne7sg0Aido,2529 
+pytz/zoneinfo/America/Santo_Domingo,sha256=DKtaEj8fQ92ybITTWU4Bm160S9pzJmUVbjaWRnenxU4,458 +pytz/zoneinfo/America/Sao_Paulo,sha256=cO3VGekMGdSf1y4f_UgkpDMRes26-l1oGUoDglIiUQg,1444 +pytz/zoneinfo/America/Scoresbysund,sha256=dfHb86egoiNykb3bR3OHXpGFPm_Apck8BLiVTCqVAVc,1916 +pytz/zoneinfo/America/Shiprock,sha256=6_yPo1_mvnt9DgpPzr0QdHsjdsfUG6ALnagQLML1DSM,2444 +pytz/zoneinfo/America/Sitka,sha256=aiS7Fk37hZpzZ9VkeJQeF-BqTLRC1QOTCgMAJwT8UxA,2329 +pytz/zoneinfo/America/St_Barthelemy,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/St_Johns,sha256=r1-17uKv27eZ3JsVkw_DLZQbo6wvjuuVu7C2pDsmOgI,3655 +pytz/zoneinfo/America/St_Kitts,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/St_Lucia,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/St_Thomas,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/St_Vincent,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Swift_Current,sha256=RRKOF7vZC8VvYxD8PP4J1_hUPayKBP7Lu80avRkfPDY,560 +pytz/zoneinfo/America/Tegucigalpa,sha256=EzOz7ntTlreMq69JZ2CcAb8Ps98V9bUMN480tpPIyw4,252 +pytz/zoneinfo/America/Thule,sha256=8xuPRaZU8RgO5ECqFYHYmnHioc81sBOailkVu8Y02i8,1502 +pytz/zoneinfo/America/Thunder_Bay,sha256=cJ9lcf2mDZttEx_ttYYoZAJfuGhSsDgNV2PI-ggWdPE,2202 +pytz/zoneinfo/America/Tijuana,sha256=OHHtvy3J70z6wvKBHgPqMEnGs6SXp8fkf0WX9ZiOODk,2342 +pytz/zoneinfo/America/Toronto,sha256=ggOSzbHkmfgu9wTQzP0MUKsrKMbgveuAeThh1eFl1a0,3494 +pytz/zoneinfo/America/Tortola,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Vancouver,sha256=sknKH0jSPWam-DHfM35qXs8Nam7d5TFlkUI9Sgxryyg,2892 +pytz/zoneinfo/America/Virgin,sha256=17gT2eOVMFKJF_sypwDPudkFwGEijrRfkBU-aK3FL60,148 +pytz/zoneinfo/America/Whitehorse,sha256=agbpCco506MSV46rKLEkJd7_RTjinyaBbScQIUDZM00,2084 +pytz/zoneinfo/America/Winnipeg,sha256=7P-_YQrneFcon7QKSTOnkiGjEppFDn3Z48MJ1qq8VBw,2868 +pytz/zoneinfo/America/Yakutat,sha256=tFwnKbvwhyyn4LNTAn5ye_JWDdxjCerNDt7oOwUwO2M,2305 +pytz/zoneinfo/America/Yellowknife,sha256=pfFvC8NEy373KbO6r6ec-Gw_O0D2h64mXU1X1AsUDgE,1966 +pytz/zoneinfo/America/Argentina/Buenos_Aires,sha256=ntn_GFHadbrFJ4ZuhU6h2uzbFwmDyS9mXV5S28pkGF8,1076 +pytz/zoneinfo/America/Argentina/Catamarca,sha256=diH1f96kbbY-7gJYQnSCNHs3n9dwHJqUhSdGNx1L7I0,1076 +pytz/zoneinfo/America/Argentina/ComodRivadavia,sha256=diH1f96kbbY-7gJYQnSCNHs3n9dwHJqUhSdGNx1L7I0,1076 +pytz/zoneinfo/America/Argentina/Cordoba,sha256=1XqIP8Qo2bPR7909hrAI-qAttybmwEW4ms7FjZA5Yfw,1076 +pytz/zoneinfo/America/Argentina/Jujuy,sha256=5HR0TlZFifwJ5nLTmg7yWXgCTx9mRhahfs4_Wq70wOY,1048 +pytz/zoneinfo/America/Argentina/La_Rioja,sha256=Zf_E3akFE1YUt9MZ4xxbRnOrp2bH1D-Bjsc0SLFfRyU,1090 +pytz/zoneinfo/America/Argentina/Mendoza,sha256=5DJiYYeQpcLBR_IoIJtk43IswJeGYawx5GykszuJ-Nw,1076 +pytz/zoneinfo/America/Argentina/Rio_Gallegos,sha256=T97WADwva6JbxICviNQUt_7iw9c-nloI4QJCscENSck,1076 +pytz/zoneinfo/America/Argentina/Salta,sha256=ATw0uR6szWKPs6jzdn6revS7UxCXD26ORK6jlmsjL18,1048 +pytz/zoneinfo/America/Argentina/San_Juan,sha256=qlW693a0Tnofy-RdcVBuWY3DvTTGxWwcYdKU3Y98pX8,1090 +pytz/zoneinfo/America/Argentina/San_Luis,sha256=WYdcro5-Fe-N6LkQsKwx_1tVozmnBp58DO1-BJs2suo,1102 +pytz/zoneinfo/America/Argentina/Tucuman,sha256=wsjg1a5AM1dP2gjr112k3vt54trcOOM_StF74xzvBJc,1104 +pytz/zoneinfo/America/Argentina/Ushuaia,sha256=9548Vvq_kpw_NX5s65vYuIbqvwGV-PBxqwmcrflLI0U,1076 +pytz/zoneinfo/America/Indiana/Indianapolis,sha256=GrNub1_3Um5Qh67wOx58_TEAz4fwAeAlk2AlMTVA_sI,1666 
+pytz/zoneinfo/America/Indiana/Knox,sha256=BiALShjiOLg1o8mMRWJ1jyTlJkgvwzte7B9WSOvTUNg,2428 +pytz/zoneinfo/America/Indiana/Marengo,sha256=CPYY3XgJFNEzONxei7x04wOGI_b86RAn4jBPewi1HZw,1722 +pytz/zoneinfo/America/Indiana/Petersburg,sha256=axot1SloP27ZWjezmo7kldu9qA2frEtPVqWngcXtft0,1904 +pytz/zoneinfo/America/Indiana/Tell_City,sha256=GrWNjb1i4sbIYlJ8fU0viJ2Q5JmrlvLgcLQILnk3El8,1684 +pytz/zoneinfo/America/Indiana/Vevay,sha256=GGosHbQUoIDOKPZxdal42X40veEITMmrnlKOnLUhb-c,1414 +pytz/zoneinfo/America/Indiana/Vincennes,sha256=gh7LAbHbMD92eo9C_c5IiwQ1fJvxhdJN402Q_4YJdLg,1694 +pytz/zoneinfo/America/Indiana/Winamac,sha256=yS-_aKSC4crd0WdNutkHRHxUjmBCU56QVQcqy7kYpbQ,1778 +pytz/zoneinfo/America/Kentucky/Louisville,sha256=-yqgeeHZdq6oP3_WzVvYOmqV9HQv8y7ZWmc9bzHvJAY,2772 +pytz/zoneinfo/America/Kentucky/Monticello,sha256=NJMKjG7jjlRzZhndMPw51bYW0D3jviW2Qbl70YcU0Gg,2352 +pytz/zoneinfo/America/North_Dakota/Beulah,sha256=PHlzEk3wsNXYsfMZZSio7ZfdnyxPFpOhK3dS-1AJKGg,2380 +pytz/zoneinfo/America/North_Dakota/Center,sha256=PaM52_JOVMEpVdw5qiOlhkp3qA0xp0d6Z9neOatmLKo,2380 +pytz/zoneinfo/America/North_Dakota/New_Salem,sha256=o0xmH1FUh3lVFLtP5Lb9c0PfSyaPTsRvQSQYwnn_yls,2380 +pytz/zoneinfo/Antarctica/Casey,sha256=vIZdw_xcBjOYXXzwLw6XP_juvNbNf4Jd-cEWhwH7OCY,297 +pytz/zoneinfo/Antarctica/Davis,sha256=6PokyOaaISRTN13sisuGgdt5vG5A2YqNooJpfLTb5SQ,297 +pytz/zoneinfo/Antarctica/DumontDUrville,sha256=g8HQLY-aN3p6az-04KdHOdZYFnN__-8ltHRuY9eQX-I,194 +pytz/zoneinfo/Antarctica/Macquarie,sha256=gMOjOx2dSBH4KxwGKivcaorkLUvw6ZIHFBPKCPCz9eg,1520 +pytz/zoneinfo/Antarctica/Mawson,sha256=9TW1g_z0tk5EfeB7K69VJo8agO7-K9ZxWbiqNKnUZNE,199 +pytz/zoneinfo/Antarctica/McMurdo,sha256=gADjoyPo_QISQU6UJrAgcHp3HDaMoOFRdH-d23uBSyc,2437 +pytz/zoneinfo/Antarctica/Palmer,sha256=DW_DXByXg5MnMZ-w1bNdu8b0lKOYD_EgrPRd5EcyEm4,1418 +pytz/zoneinfo/Antarctica/Rothera,sha256=QQI1m1IN4_2e6Bb0z-rOYaOwxp4XjMJDOKM9SFDUPKg,164 +pytz/zoneinfo/Antarctica/South_Pole,sha256=gADjoyPo_QISQU6UJrAgcHp3HDaMoOFRdH-d23uBSyc,2437 +pytz/zoneinfo/Antarctica/Syowa,sha256=VnmdVypdJUhsBw1XuXZEcEQIFmoiqoYcdpl8ht37QgY,165 +pytz/zoneinfo/Antarctica/Troll,sha256=3zrh-P_jMCss9GGwHJJHkypZZydq4mkgo_TDqctn3c4,1162 +pytz/zoneinfo/Antarctica/Vostok,sha256=6tx86WD3MVGJBCbOJUCoA6YlGwCn2BT4B85Zss0vz4Y,165 +pytz/zoneinfo/Arctic/Longyearbyen,sha256=UdCERhj1JYpx3ojmilaRoyVoR4qMA1-PEv6hGwnpsJA,2228 +pytz/zoneinfo/Asia/Aden,sha256=rq9KPj8l0FBnnKn93WkMeA1IngNtTzk5_oV4sEZhc4w,165 +pytz/zoneinfo/Asia/Almaty,sha256=rBIl_pqZNmKZabjEa4mcsLahl9PbAdZJpQMQLVmcfBU,997 +pytz/zoneinfo/Asia/Amman,sha256=bvwhc1hPCGvQMqWzaoCHrCA_y78n3H-Z2t4wHSocuAw,1853 +pytz/zoneinfo/Asia/Anadyr,sha256=hDDTly45ejoVVP9Al07TmKpTACNGJaIPlcXLRbsG_4g,1188 +pytz/zoneinfo/Asia/Aqtau,sha256=A5exZN256JagFJTcasgdCrQ8giOqZ2EFMRVYBWTaqZA,983 +pytz/zoneinfo/Asia/Aqtobe,sha256=LQ7P5LEEe7jbWbjqvzmM79c0o6AdZeCExQS-fOWp8yw,1011 +pytz/zoneinfo/Asia/Ashgabat,sha256=L4DYV2mZWycsYeHIypXzO6ZNY3tD8wjgxfPR2ZPW26c,619 +pytz/zoneinfo/Asia/Ashkhabad,sha256=L4DYV2mZWycsYeHIypXzO6ZNY3tD8wjgxfPR2ZPW26c,619 +pytz/zoneinfo/Asia/Atyrau,sha256=3uEo89ORyDJqQ_TtaQdIf9UPaB8WqIRQVi0geeY9gVE,991 +pytz/zoneinfo/Asia/Baghdad,sha256=lQMSUnOuijbcoTaCqMNnYhnvKtS2IVP_kXFAzePVNDU,983 +pytz/zoneinfo/Asia/Bahrain,sha256=V0rFJdLHIrToJ5Wl28VzVowwCVZoY8ZZSeNp-7kOvjY,199 +pytz/zoneinfo/Asia/Baku,sha256=vhHnliaOdRyNudl0sFJFdLynEg0Hc0I-IiZNfbDeCbM,1227 +pytz/zoneinfo/Asia/Bangkok,sha256=eYq0vh89N1j069URoQvtBu0ndEal6FPrtbF8WCKKpDw,199 +pytz/zoneinfo/Asia/Barnaul,sha256=2c1Cq8XYlBgybRQMP8w0NCf7kaLDrPZtGn4M5iJZbJo,1221 
+pytz/zoneinfo/Asia/Beirut,sha256=_Z_2ZAg_iL9vU51JDB8CB04uXBDrf1kLIis-JnXaS2o,2154 +pytz/zoneinfo/Asia/Bishkek,sha256=do_4ki1JvSKupUrvlz9jRkHspDhdvk1D2IkByFskjJM,983 +pytz/zoneinfo/Asia/Brunei,sha256=BMMjwEmZ9rMoNpWfg8IrlLhRbMKbdW48padRF-FGolc,203 +pytz/zoneinfo/Asia/Calcutta,sha256=6Qw0EDbLcgMgDik8s7UTJn4QSjmllPNeGVJU5rwKF88,285 +pytz/zoneinfo/Asia/Chita,sha256=4ICOcAVAEWnP-cdf_YJu1_kCYnYPG2_vYfSbuNI-VwI,1221 +pytz/zoneinfo/Asia/Choibalsan,sha256=sJQAAjiT9VyG73dYhpYkq4tcmfITcPpiAa8YXsDlKag,949 +pytz/zoneinfo/Asia/Chongqing,sha256=-fm6S1oS3D2M1qZpgZBlGQnyQrEwixXmz4NsHzmDzWU,533 +pytz/zoneinfo/Asia/Chungking,sha256=-fm6S1oS3D2M1qZpgZBlGQnyQrEwixXmz4NsHzmDzWU,533 +pytz/zoneinfo/Asia/Colombo,sha256=HGea9jswIIgz7k20LTzbKtQyUun67IP5HvsZrmAJZJY,372 +pytz/zoneinfo/Asia/Dacca,sha256=3K5llGhcpCdZMMcJuomICVv7lZlDRpU4PUb5DtFx8l4,337 +pytz/zoneinfo/Asia/Damascus,sha256=6mcB6bxH1KsLqzb_LmJUT3tUDnq9_ScLFKoMFkcZy3A,2294 +pytz/zoneinfo/Asia/Dhaka,sha256=3K5llGhcpCdZMMcJuomICVv7lZlDRpU4PUb5DtFx8l4,337 +pytz/zoneinfo/Asia/Dili,sha256=ptjbacc9JK0pv2JpD-gHMglrwYNj9LMMIua0U0ZTMUc,227 +pytz/zoneinfo/Asia/Dubai,sha256=-ga0m3ua9Y6kSWREz2_VdtcVAkq83WrW3vxjBI7WNGs,165 +pytz/zoneinfo/Asia/Dushanbe,sha256=FUk9Tt_GimfRulcWamEvuOvA7FQ52YfZqQ2w88qMx6M,591 +pytz/zoneinfo/Asia/Famagusta,sha256=CFrcygd8ude5x6OEtfM_Dw0KYHoxpPPzq46KoHVxjjc,2028 +pytz/zoneinfo/Asia/Gaza,sha256=VQl9UGxRirFOqw91pba2Ukner-_lKv1lu_OZ77g17kc,2316 +pytz/zoneinfo/Asia/Harbin,sha256=-fm6S1oS3D2M1qZpgZBlGQnyQrEwixXmz4NsHzmDzWU,533 +pytz/zoneinfo/Asia/Hebron,sha256=biIIT_LY-UdgdNsF-inW4A8CPMGtyggJFc4iaq_0xsM,2344 +pytz/zoneinfo/Asia/Ho_Chi_Minh,sha256=L5TXNg6-odIIn-JAyLTR8fKFiUFBNFwy0HzwZchbnm4,351 +pytz/zoneinfo/Asia/Hong_Kong,sha256=UcnFEc9S8hMWl9giVXni4TAhLPWX0H12XvwSt4AJHew,1203 +pytz/zoneinfo/Asia/Hovd,sha256=JUnOos7PNTi2VRKxD6XnaVR3NpuhsX_Pi18rIzVe1xw,891 +pytz/zoneinfo/Asia/Irkutsk,sha256=iUJZCVBjpfB4rNKJOr6g0zUZtccYYk_Gk0wTklx8Yj0,1243 +pytz/zoneinfo/Asia/Istanbul,sha256=2S0A_f7VxvyErJMMCPqK33AChA29IVkMr1o-SpMtMxk,1947 +pytz/zoneinfo/Asia/Jakarta,sha256=_WRgz6Zb6wxIXtMwpKjG4w4PJtDRzkhdrw-3a4NCBFA,355 +pytz/zoneinfo/Asia/Jayapura,sha256=ihzUd-L8HUVqG-Na10MyPE-YYwjVFj-xerqjTN4EJZs,221 +pytz/zoneinfo/Asia/Jerusalem,sha256=xpEJ_vI7aMV0iFD5BN9sq71vlYUZj5Q613TUUwZl1Ww,2288 +pytz/zoneinfo/Asia/Kabul,sha256=ial7SvweHTQXDl79MnXm6QHtiw2i7Zt1e5urLXU8Sq8,208 +pytz/zoneinfo/Asia/Kamchatka,sha256=pBA0RbynKTKsMCmf2hJMZ_hgVUPemms-VceMMJ7QC64,1166 +pytz/zoneinfo/Asia/Karachi,sha256=iB-mWMTXUyfBwAkZdz8_UmEw0xsgxIub-KNI7akzhkk,379 +pytz/zoneinfo/Asia/Kashgar,sha256=AEXDJ5PxQOhePZZw1QZl98moDNa-bW3I3WVNQZHBPYA,165 +pytz/zoneinfo/Asia/Kathmandu,sha256=TUeW7rDSifOTSsNxvo9igIYZfGITEZUf-0EjglyRDWs,212 +pytz/zoneinfo/Asia/Katmandu,sha256=TUeW7rDSifOTSsNxvo9igIYZfGITEZUf-0EjglyRDWs,212 +pytz/zoneinfo/Asia/Khandyga,sha256=XYzE2tsE5Say9pg0cHDQkEE9aTuy2piFSLAGx_d-dmM,1271 +pytz/zoneinfo/Asia/Kolkata,sha256=6Qw0EDbLcgMgDik8s7UTJn4QSjmllPNeGVJU5rwKF88,285 +pytz/zoneinfo/Asia/Krasnoyarsk,sha256=nzRw4PI2AiK_Ge854b8U7TSDw0LGQy3ca5YuOOU2XwI,1207 +pytz/zoneinfo/Asia/Kuala_Lumpur,sha256=RfiIYo6sMEkSA8m5iUmyOyJzKZrgRs8ehGuDZwoq88k,383 +pytz/zoneinfo/Asia/Kuching,sha256=KsAtQ0aocINozixwW7CkorY-1PTLlsj7UUnQGQMEYTQ,483 +pytz/zoneinfo/Asia/Kuwait,sha256=rq9KPj8l0FBnnKn93WkMeA1IngNtTzk5_oV4sEZhc4w,165 +pytz/zoneinfo/Asia/Macao,sha256=MvAkRyRsrA2r052ItlyF5bh2FheRjI0jPwg0uIiH2Yk,1227 +pytz/zoneinfo/Asia/Macau,sha256=MvAkRyRsrA2r052ItlyF5bh2FheRjI0jPwg0uIiH2Yk,1227 +pytz/zoneinfo/Asia/Magadan,sha256=cqwjKQt8TlznM1w2DezAZuz1EjeOfLxPeSY19i9zkfQ,1222 
+pytz/zoneinfo/Asia/Makassar,sha256=OhJtCqSTEU-u5n0opBVO5Bu-wQzcYPy9S_6aAhJXgOw,254 +pytz/zoneinfo/Asia/Manila,sha256=ujfq0kl1EhxcYSOrG-FS750aNaYUt1TT4bFuK4EcL_c,328 +pytz/zoneinfo/Asia/Muscat,sha256=-ga0m3ua9Y6kSWREz2_VdtcVAkq83WrW3vxjBI7WNGs,165 +pytz/zoneinfo/Asia/Nicosia,sha256=0Unm0IFT7HyGeQ7F3vTa_-klfysCgrulqFO6BD1plZU,2002 +pytz/zoneinfo/Asia/Novokuznetsk,sha256=vQGcqKdmYmWDdl73QPZTcyadnope1RPJ4oBgZelQu90,1165 +pytz/zoneinfo/Asia/Novosibirsk,sha256=ApL3s20HX2eIAno03HCa2RXdlLotVb9JvnZl7W1sM00,1221 +pytz/zoneinfo/Asia/Omsk,sha256=wxbEesfe7dJOkNPffqTwT6wuTSSTM6E9f0uFMAyzMCM,1207 +pytz/zoneinfo/Asia/Oral,sha256=iMjqD4LvDgyxN15v7CqyEdBDyBFaOlChwX1wHz2JiVQ,1005 +pytz/zoneinfo/Asia/Phnom_Penh,sha256=eYq0vh89N1j069URoQvtBu0ndEal6FPrtbF8WCKKpDw,199 +pytz/zoneinfo/Asia/Pontianak,sha256=inOXwuKtjKv1z_eliPZSIqjSt6whtuxhPeG1YpjU_BQ,353 +pytz/zoneinfo/Asia/Pyongyang,sha256=_-g3GnDAtfDX4XAktXH9jFouLUDmOovnjoOfvRpUDsE,237 +pytz/zoneinfo/Asia/Qatar,sha256=V0rFJdLHIrToJ5Wl28VzVowwCVZoY8ZZSeNp-7kOvjY,199 +pytz/zoneinfo/Asia/Qostanay,sha256=UGYEvmZfAAS9D6EMGd0n6-r_Az_zgTDSWLPeHzFLfu0,1011 +pytz/zoneinfo/Asia/Qyzylorda,sha256=aiSRxwoUbQ-TBHf2wcyaOhQb86j3jQpXwcQaSPnAtwU,1025 +pytz/zoneinfo/Asia/Rangoon,sha256=ZHuX-XVHr8dGJjrPQ5cW7b8jQUv3ihyd-VzN545mlMA,268 +pytz/zoneinfo/Asia/Riyadh,sha256=rq9KPj8l0FBnnKn93WkMeA1IngNtTzk5_oV4sEZhc4w,165 +pytz/zoneinfo/Asia/Saigon,sha256=L5TXNg6-odIIn-JAyLTR8fKFiUFBNFwy0HzwZchbnm4,351 +pytz/zoneinfo/Asia/Sakhalin,sha256=95AdPwOgSe0g9wdx67kKLDbjvY3FtpeVBoAWbJVco0w,1202 +pytz/zoneinfo/Asia/Samarkand,sha256=BBe6Gg_KlSQuS5hAyvvhZWmClcLJaFjnCNGC391HHQM,577 +pytz/zoneinfo/Asia/Seoul,sha256=LI9LsV3XcJC0l-KoQf8zI-y7rk-du57erS-N2Ptdi7Q,617 +pytz/zoneinfo/Asia/Shanghai,sha256=-fm6S1oS3D2M1qZpgZBlGQnyQrEwixXmz4NsHzmDzWU,533 +pytz/zoneinfo/Asia/Singapore,sha256=hIgr_LHMTWh3GgeG-MmLHBp-9anUxQcfMlKFtX8WvmU,383 +pytz/zoneinfo/Asia/Srednekolymsk,sha256=0DllW8q5VgXEMV5c_nLJElZsNpauvNhNACQpcgdqEl0,1208 +pytz/zoneinfo/Asia/Taipei,sha256=DMmQwOpPql25ue3Nf8vAKKT4em06D1Z9rHbLIitxixk,761 +pytz/zoneinfo/Asia/Tashkent,sha256=LS-yTxh0v1vmJoQ9I6fY-IERk7ukPmovVx2Ut_-b-Ig,591 +pytz/zoneinfo/Asia/Tbilisi,sha256=w6UNxgyn4BVVTF5WkAtxo_u7nnIY26makKQ5nRgifds,1035 +pytz/zoneinfo/Asia/Tehran,sha256=ATT50Q0hK6uSba5_WnOE3Px0OWxIwxaqK5Oi10P2A-M,2582 +pytz/zoneinfo/Asia/Tel_Aviv,sha256=xpEJ_vI7aMV0iFD5BN9sq71vlYUZj5Q613TUUwZl1Ww,2288 +pytz/zoneinfo/Asia/Thimbu,sha256=uia8or5dtDkxVUZrcLwkjbTz9C7ZhLq0T4jlE4YvuvQ,203 +pytz/zoneinfo/Asia/Thimphu,sha256=uia8or5dtDkxVUZrcLwkjbTz9C7ZhLq0T4jlE4YvuvQ,203 +pytz/zoneinfo/Asia/Tokyo,sha256=oCueZgRNxcNcX3ZGdif9y6Su4cyVhga4XHdwlcrYLOs,309 +pytz/zoneinfo/Asia/Tomsk,sha256=77YgdJLxETRKjQjnaHHf54xBAqNywTDwQQmZ5v6Aq28,1221 +pytz/zoneinfo/Asia/Ujung_Pandang,sha256=OhJtCqSTEU-u5n0opBVO5Bu-wQzcYPy9S_6aAhJXgOw,254 +pytz/zoneinfo/Asia/Ulaanbaatar,sha256=uyQSzIBl0f2TXHrmUm3VPs1C9ro013hYmAlx6yUjh3Y,891 +pytz/zoneinfo/Asia/Ulan_Bator,sha256=uyQSzIBl0f2TXHrmUm3VPs1C9ro013hYmAlx6yUjh3Y,891 +pytz/zoneinfo/Asia/Urumqi,sha256=AEXDJ5PxQOhePZZw1QZl98moDNa-bW3I3WVNQZHBPYA,165 +pytz/zoneinfo/Asia/Ust-Nera,sha256=JAZhRAPdbOL9AL-WHOL8aZjxdZxLmGDNBGMCw9TKtR8,1252 +pytz/zoneinfo/Asia/Vientiane,sha256=eYq0vh89N1j069URoQvtBu0ndEal6FPrtbF8WCKKpDw,199 +pytz/zoneinfo/Asia/Vladivostok,sha256=Wokhgtj2nwUj992h7SyfB_fRNHAKfPNzhsf_oZpim8c,1208 +pytz/zoneinfo/Asia/Yakutsk,sha256=RVCIl52EvMrp2RG2hg2cjDSr9QhsscaAT-NV81xw7zc,1207 +pytz/zoneinfo/Asia/Yangon,sha256=ZHuX-XVHr8dGJjrPQ5cW7b8jQUv3ihyd-VzN545mlMA,268 +pytz/zoneinfo/Asia/Yekaterinburg,sha256=NzVc2DiPeyw0FdMHwSPQJF9k3tvWdtrETZiN58pyxLk,1243 
+pytz/zoneinfo/Asia/Yerevan,sha256=k0WHtWQW_cBCjcEv8nP01cVPeTVDlf18lQ0_u6cin1o,1151 +pytz/zoneinfo/Atlantic/Azores,sha256=ut7TdE-xiQNjRybg56Tt5b7Zo5zqbuF5IFci2aDMs1Q,3484 +pytz/zoneinfo/Atlantic/Bermuda,sha256=164Ap4_hmYOtKX9guV1DrlhSH9LSnMiHEo6vpS8faSw,1978 +pytz/zoneinfo/Atlantic/Canary,sha256=ymK9ufqphvNjDK3hzikN4GfkcR3QeCBiPKyVc6FjlbA,1897 +pytz/zoneinfo/Atlantic/Cape_Verde,sha256=ESQvE3deMI-lx9mG0yJLEsFX5KRl-7c6gD5O2h0Zm9Q,270 +pytz/zoneinfo/Atlantic/Faeroe,sha256=NibdZPZtapnYR_myIZnMdTaSKGsOBGgujj0_T2NvAzs,1815 +pytz/zoneinfo/Atlantic/Faroe,sha256=NibdZPZtapnYR_myIZnMdTaSKGsOBGgujj0_T2NvAzs,1815 +pytz/zoneinfo/Atlantic/Jan_Mayen,sha256=UdCERhj1JYpx3ojmilaRoyVoR4qMA1-PEv6hGwnpsJA,2228 +pytz/zoneinfo/Atlantic/Madeira,sha256=e1K2l8ykd8xpznQNs3SSuIZ1ZfVx2Y69EXrhvYV3P14,3475 +pytz/zoneinfo/Atlantic/Reykjavik,sha256=mSkaRBGZLeUrm88EeHcaWnEd35Wn-Ag2G10HtI3G2fg,1162 +pytz/zoneinfo/Atlantic/South_Georgia,sha256=QZ72fRKp6Kgvy7DfyHGht1MVnzGgSPujLQd4XMjNrrc,164 +pytz/zoneinfo/Atlantic/St_Helena,sha256=0u-sTl8j2IyV1ywdtCgHFw9S9D3ZiiBa9akqkbny2Zc,148 +pytz/zoneinfo/Atlantic/Stanley,sha256=exKMLw-P952wS1FTxVjnUU1mkD2OvKUDwtDt8IGgf8w,1214 +pytz/zoneinfo/Australia/ACT,sha256=lXwgrHWvytX_hJuWsHPSFPNObHReMZ-36D3e_QXMVhk,2204 +pytz/zoneinfo/Australia/Adelaide,sha256=96rGNE9D_GsPI9WH0M8eH4AYG__5B5ZlgQk_rLOfY80,2222 +pytz/zoneinfo/Australia/Brisbane,sha256=r_riK-wbS5fRi0eZxkUQre1nHTQ_q7dMwZ5CRVhS9vI,433 +pytz/zoneinfo/Australia/Broken_Hill,sha256=kdigsOjUvUIi4Tyq2lcZnvSHGoWTLh6J3-PeE5MRaQc,2243 +pytz/zoneinfo/Australia/Canberra,sha256=lXwgrHWvytX_hJuWsHPSFPNObHReMZ-36D3e_QXMVhk,2204 +pytz/zoneinfo/Australia/Currie,sha256=R0oL8IXO87BcbO57nR5d-H21I2ndxQSPsgMYlgR-qhc,2204 +pytz/zoneinfo/Australia/Darwin,sha256=rOoBtYLkk7aeoshCOihPeseuxM184yp7uhSHbdT0FoU,304 +pytz/zoneinfo/Australia/Eucla,sha256=MmcY-HkzU0mccRVN0GSXSZ072x7NanzSS3dDdIjLRl4,484 +pytz/zoneinfo/Australia/Hobart,sha256=Rw57MtxlcRbn-ZszuSjgxMHmpSLLQThAXeqL9l9TvXw,2316 +pytz/zoneinfo/Australia/LHI,sha256=Luf0Lx_iJHuh3kZd4LxRjf36tLF5-wW2UFMVNKNT7gg,1860 +pytz/zoneinfo/Australia/Lindeman,sha256=jkXejV1-5ZLpMTj450TAwKcMPZtuaoKLcSugLsunqBs,489 +pytz/zoneinfo/Australia/Lord_Howe,sha256=Luf0Lx_iJHuh3kZd4LxRjf36tLF5-wW2UFMVNKNT7gg,1860 +pytz/zoneinfo/Australia/Melbourne,sha256=MimH3imrwSUOLJNjIsfQbc5I_6kU6H-VEL8humwNHFk,2204 +pytz/zoneinfo/Australia/NSW,sha256=lXwgrHWvytX_hJuWsHPSFPNObHReMZ-36D3e_QXMVhk,2204 +pytz/zoneinfo/Australia/North,sha256=rOoBtYLkk7aeoshCOihPeseuxM184yp7uhSHbdT0FoU,304 +pytz/zoneinfo/Australia/Perth,sha256=d0oXb77ElK6sKmU7Q-Lsmff0bz6Uk7X3hFMverH2InM,460 +pytz/zoneinfo/Australia/Queensland,sha256=r_riK-wbS5fRi0eZxkUQre1nHTQ_q7dMwZ5CRVhS9vI,433 +pytz/zoneinfo/Australia/South,sha256=96rGNE9D_GsPI9WH0M8eH4AYG__5B5ZlgQk_rLOfY80,2222 +pytz/zoneinfo/Australia/Sydney,sha256=lXwgrHWvytX_hJuWsHPSFPNObHReMZ-36D3e_QXMVhk,2204 +pytz/zoneinfo/Australia/Tasmania,sha256=Rw57MtxlcRbn-ZszuSjgxMHmpSLLQThAXeqL9l9TvXw,2316 +pytz/zoneinfo/Australia/Victoria,sha256=MimH3imrwSUOLJNjIsfQbc5I_6kU6H-VEL8humwNHFk,2204 +pytz/zoneinfo/Australia/West,sha256=d0oXb77ElK6sKmU7Q-Lsmff0bz6Uk7X3hFMverH2InM,460 +pytz/zoneinfo/Australia/Yancowinna,sha256=kdigsOjUvUIi4Tyq2lcZnvSHGoWTLh6J3-PeE5MRaQc,2243 +pytz/zoneinfo/Brazil/Acre,sha256=17onkm8P_VgMkErjK9rr0qwNni7qp9tgcUZ93g3ltOs,628 +pytz/zoneinfo/Brazil/DeNoronha,sha256=3R4lLV8jg5SljhC5OVVCk51Y77Efjo6zCe-oppg_FFo,716 +pytz/zoneinfo/Brazil/East,sha256=cO3VGekMGdSf1y4f_UgkpDMRes26-l1oGUoDglIiUQg,1444 +pytz/zoneinfo/Brazil/West,sha256=lp6RlkcXJQ7mSsKqnEgC8svJVrFDJk_16xxvfpNSpK4,604 
+pytz/zoneinfo/Canada/Atlantic,sha256=TZpmc5PwWoLfTfQoQ_b3U17BE2iVKSeNkR0Ho8mbTn8,3424 +pytz/zoneinfo/Canada/Central,sha256=7P-_YQrneFcon7QKSTOnkiGjEppFDn3Z48MJ1qq8VBw,2868 +pytz/zoneinfo/Canada/Eastern,sha256=ggOSzbHkmfgu9wTQzP0MUKsrKMbgveuAeThh1eFl1a0,3494 +pytz/zoneinfo/Canada/Mountain,sha256=-TkIfc3QlvaCf0p8COZ43Y1HRBAl-nARUi-JdXeK1vE,2332 +pytz/zoneinfo/Canada/Newfoundland,sha256=r1-17uKv27eZ3JsVkw_DLZQbo6wvjuuVu7C2pDsmOgI,3655 +pytz/zoneinfo/Canada/Pacific,sha256=sknKH0jSPWam-DHfM35qXs8Nam7d5TFlkUI9Sgxryyg,2892 +pytz/zoneinfo/Canada/Saskatchewan,sha256=yjqT08pHbICYe83H8JmtaDBvCFqRv7Tfze3Y8xuXukw,980 +pytz/zoneinfo/Canada/Yukon,sha256=agbpCco506MSV46rKLEkJd7_RTjinyaBbScQIUDZM00,2084 +pytz/zoneinfo/Chile/Continental,sha256=GB14PW0xABV283dXc8qL-nnDW-ViFUR3bne7sg0Aido,2529 +pytz/zoneinfo/Chile/EasterIsland,sha256=paHp1QRXIa02kgd0-4V6vWXdqcwheow-hJQD9VqacfQ,2233 +pytz/zoneinfo/Etc/GMT,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Etc/GMT+0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Etc/GMT+1,sha256=1Qzl2X9rQ_RXEf11yH09wQZCr_ph6UdFP7E0yu9s-IQ,116 +pytz/zoneinfo/Etc/GMT+10,sha256=JEQyQyQlkC0o6ZTdeVjZhCIOh6cK5TF7H00Pkls-sUI,117 +pytz/zoneinfo/Etc/GMT+11,sha256=tWvcvYMFCaE60nJVvDrrov7stJvs1KQYOyrhl3dzcUs,117 +pytz/zoneinfo/Etc/GMT+12,sha256=b70HEhErq8IJmq8x7cOZy4eR__3fq5uHHpjvPBEHqMA,117 +pytz/zoneinfo/Etc/GMT+2,sha256=T6Ep5zhslBKbYaECFUB6gUKh3iTZPyMoW1kjhonxrUo,116 +pytz/zoneinfo/Etc/GMT+3,sha256=QGoYrE04bUJ-OzL37dt2MZT5FxWNLpJDPVXgJbstYZA,116 +pytz/zoneinfo/Etc/GMT+4,sha256=RWrkNki-wV7X-coe0VvufBe6LrWVpkPJgia5QQYEnBo,116 +pytz/zoneinfo/Etc/GMT+5,sha256=oRmeC41dgYXT-zzyZIRKXN9IvdL2Da5nTuwmG2_prIA,116 +pytz/zoneinfo/Etc/GMT+6,sha256=d6dAnwiejyFI2n7AzFlFW0aFAT6zYNEjBIEG0uu0sbQ,116 +pytz/zoneinfo/Etc/GMT+7,sha256=TqjYbzd0YHpx1wisFg08J19wTpg6ztJLLongZY_lozs,116 +pytz/zoneinfo/Etc/GMT+8,sha256=th_8bIMmYgRPCesBrbmBhRr0jQO7whd70LiY9HfwJyk,116 +pytz/zoneinfo/Etc/GMT+9,sha256=Qq5E6iUS7JMJIymT7YoqlI8MtqtVy0mr9t6zWFtWc9Y,116 +pytz/zoneinfo/Etc/GMT-0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Etc/GMT-1,sha256=73F1eU8uAQGP3mcoB2q99CjfManGFHk3fefljp9pYC4,117 +pytz/zoneinfo/Etc/GMT-10,sha256=fKWWNwLBOp1OkKjtc1w9LIXJR1mTTD-JdvYflRy1IrU,118 +pytz/zoneinfo/Etc/GMT-11,sha256=D2S79n6psa9t9_2vj5wIrFpHH2OJLcCKP6vtwzFZINY,118 +pytz/zoneinfo/Etc/GMT-12,sha256=me4V6lmWI8gSr8H7N41WAD0Eww1anh_EF34Qr9UoSnI,118 +pytz/zoneinfo/Etc/GMT-13,sha256=xbmbG1BQA6Dlpa_iUwEGyJxW4a3t6lmawdPKAE8vbR8,118 +pytz/zoneinfo/Etc/GMT-14,sha256=PpXoREBh02qFpvxVMj2pV9IAzSQvBE7XPvnN9qSZ-Kc,118 +pytz/zoneinfo/Etc/GMT-2,sha256=ve6hWLdeuiLhqagaWLqMD6HNybS1chRwjudfTZ2bYBE,117 +pytz/zoneinfo/Etc/GMT-3,sha256=N77jILanuLDVkLsdujXZSu-dsHiwN5MIpwh7fMUifso,117 +pytz/zoneinfo/Etc/GMT-4,sha256=LSko5fVHqPl5zfwjGqkbMa_OFnvtpT6o_4xYxNz9n5o,117 +pytz/zoneinfo/Etc/GMT-5,sha256=uLaSR5Mb18HRTsAA5SveY9PAJ97dO8QzIWqNXe3wZb4,117 +pytz/zoneinfo/Etc/GMT-6,sha256=JSN-RUAphJ50fpIv7cYC6unrtrz9S1Wma-piDHlGe7c,117 +pytz/zoneinfo/Etc/GMT-7,sha256=vVAOF8xU9T9ESnw68c0SFXpcvkoopaiwTR0zbefHHSU,117 +pytz/zoneinfo/Etc/GMT-8,sha256=S7xFQbFMpiDZy4v5L4D9fCrjRIzzoLC5p8Se23xi7us,117 +pytz/zoneinfo/Etc/GMT-9,sha256=I5vHNmUK-Yyg_S1skFN44VGVzBgktjFgVQiDIKO4aMI,117 +pytz/zoneinfo/Etc/GMT0,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Etc/Greenwich,sha256=bZ83iIPAefhsA4elVHqSxEmGnYBuB94QCEqwTwJJAY0,114 +pytz/zoneinfo/Etc/UCT,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/Etc/UTC,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 
+pytz/zoneinfo/Etc/Universal,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/Etc/Zulu,sha256=i4WEZ5GrLIpUY8g6W-PAQ-JXDXRIQ01BOYlp7Ufj5vI,114 +pytz/zoneinfo/Europe/Amsterdam,sha256=pw8HngVt3bU5QrRzu70qOmf69TIyklkglvVUte9ntKo,2910 +pytz/zoneinfo/Europe/Andorra,sha256=gTB5jCQmvIw3JJi1_vAcOYuhtzPBR6RXUx9gVV6p6ug,1742 +pytz/zoneinfo/Europe/Astrakhan,sha256=ywtzL92KVfoybOmAhE9eHqmMcvJZm5b0js5GDdWIJEQ,1165 +pytz/zoneinfo/Europe/Athens,sha256=XDY-FBUddRyQHN8GxQLZ4awjuOlWlzlUdjv7OdXFNzA,2262 +pytz/zoneinfo/Europe/Belfast,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/Europe/Belgrade,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/Berlin,sha256=XuR19xoPwaMvrrhJ-MOcbnqmbW1B7HQrl7OnQ2s7BwE,2298 +pytz/zoneinfo/Europe/Bratislava,sha256=G9fdhUXmzx651BnyZ6V7AOYIV9EV5aMJMm44eJaLLZw,2301 +pytz/zoneinfo/Europe/Brussels,sha256=gS9Vrrbozend9HhuFetCVrIegs9fXSjaG60X2UVwysA,2933 +pytz/zoneinfo/Europe/Bucharest,sha256=nfg6-bU2D6DMEWb9EMIBR5kxnNsbDSx0UKfHH_ZzqFc,2184 +pytz/zoneinfo/Europe/Budapest,sha256=J2tBUmArS5llMhfeILd3UGELv1Bup7DMsh4lX3qhqy4,2368 +pytz/zoneinfo/Europe/Busingen,sha256=K5QY7Ujj2VUchKR4bhhb0hgdAJhmwED71ykXDQOGKe8,1909 +pytz/zoneinfo/Europe/Chisinau,sha256=p1J_rqFE13pL8cpBRrEFe-teCI8f0fKK4uTUy_4diF4,2390 +pytz/zoneinfo/Europe/Copenhagen,sha256=q7iAbkd7y9QvbAi6XGZEUOTwNDCRYWRu9VQCxUrZ01U,2137 +pytz/zoneinfo/Europe/Dublin,sha256=-JSA3vsi44F1DE8supVjSppH2Vpp12WjJI0_COtAmqU,3492 +pytz/zoneinfo/Europe/Gibraltar,sha256=egOcazf2u1njGZ0tDj-f1NzZT_K5rpUKSqtShxO7U6c,3052 +pytz/zoneinfo/Europe/Guernsey,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/Europe/Helsinki,sha256=GEkB7LsVhmegt7YuuWheCDvDGC7b7Nw9bTdDGS9qkJc,1900 +pytz/zoneinfo/Europe/Isle_of_Man,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/Europe/Istanbul,sha256=2S0A_f7VxvyErJMMCPqK33AChA29IVkMr1o-SpMtMxk,1947 +pytz/zoneinfo/Europe/Jersey,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/Europe/Kaliningrad,sha256=s7GXSe1YvMcs7AiUhHNTA6I4nAOQn_Kmz_ZqJYO-LMM,1493 +pytz/zoneinfo/Europe/Kiev,sha256=iVkTPFkl2tADYapa1HASlaV3tT2VsJpTPTTJC_9HtAk,2088 +pytz/zoneinfo/Europe/Kirov,sha256=Sr4HEUwk3tPTXioeCLhvlgKbCAFU7Gy2UB3f--uWLDc,1153 +pytz/zoneinfo/Europe/Lisbon,sha256=L6n3snx6pNHHJIL6JOLFOAlYkQ2J5uB_y5MG_Ic_PDU,3469 +pytz/zoneinfo/Europe/Ljubljana,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/London,sha256=xp08wV44TZMmAdBqppttDChQAb8tRN03GcEht99RYtY,3648 +pytz/zoneinfo/Europe/Luxembourg,sha256=974Dvf_X1QISKG1zIiTJJIfGavobO21HUVS-HfysOcY,2946 +pytz/zoneinfo/Europe/Madrid,sha256=MTTMnrbDDtexRikd72-FbQEpCZjc63_UtBIiDomD95c,2614 +pytz/zoneinfo/Europe/Malta,sha256=xRwBfrV8hOihGtqcek5_B6l5hjc206g3yfbEWXIaUis,2620 +pytz/zoneinfo/Europe/Mariehamn,sha256=GEkB7LsVhmegt7YuuWheCDvDGC7b7Nw9bTdDGS9qkJc,1900 +pytz/zoneinfo/Europe/Minsk,sha256=mn86zdrNWpJYDfE51Iy9n1-Zi2piTyb9EPaS2A-uGJQ,1321 +pytz/zoneinfo/Europe/Monaco,sha256=8DHr1ymf4c5sZKAzLBd4GhXsTZUXMOYUKhhVmmGRdrs,2944 +pytz/zoneinfo/Europe/Moscow,sha256=KmkofRcj6T8Ph28PJChm8JVp13uRvef6TZ0GuPzUiDw,1535 +pytz/zoneinfo/Europe/Nicosia,sha256=0Unm0IFT7HyGeQ7F3vTa_-klfysCgrulqFO6BD1plZU,2002 +pytz/zoneinfo/Europe/Oslo,sha256=UdCERhj1JYpx3ojmilaRoyVoR4qMA1-PEv6hGwnpsJA,2228 +pytz/zoneinfo/Europe/Paris,sha256=wMOaHPLy0KwYfPxMbNq_B8U21RctvO0go5jhc0TlXCQ,2962 +pytz/zoneinfo/Europe/Podgorica,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/Prague,sha256=G9fdhUXmzx651BnyZ6V7AOYIV9EV5aMJMm44eJaLLZw,2301 
+pytz/zoneinfo/Europe/Riga,sha256=hJ2_0m1taW9IuA-hMyP5n-WX7YOrR0heKszJhgljRWk,2198 +pytz/zoneinfo/Europe/Rome,sha256=-X5F_d3Dz0kBRWiUTXUN-fgeCHbUEHLaaHIwEPZEdUQ,2641 +pytz/zoneinfo/Europe/Samara,sha256=z2innqSZ8_lkEy8cIyF9JM_FfnO2sWZaqeFqOh8pD7M,1215 +pytz/zoneinfo/Europe/San_Marino,sha256=-X5F_d3Dz0kBRWiUTXUN-fgeCHbUEHLaaHIwEPZEdUQ,2641 +pytz/zoneinfo/Europe/Sarajevo,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/Saratov,sha256=BMej49HlQG24CWCh5VOENrB3jPuJPScPszRtb7MrJ3I,1183 +pytz/zoneinfo/Europe/Simferopol,sha256=_M6LXB5Rqh932nKIJotGjT8YNszAOb7RjHN5ng-uW1Y,1453 +pytz/zoneinfo/Europe/Skopje,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/Sofia,sha256=hCQKXfMNrnA5xHNw_uzTjKzVw4-Bvsq5oGO4yUCv5tY,2077 +pytz/zoneinfo/Europe/Stockholm,sha256=Xgp4GSh8-pzdeJeP8TQ20jWDDUj17R69h6RYTbLYd2g,1909 +pytz/zoneinfo/Europe/Tallinn,sha256=4a6JC0aIpMzqIV7O35zoG0LLJwkQq5AoXZ2ivkic6-w,2148 +pytz/zoneinfo/Europe/Tirane,sha256=ztlZyCS9WCXeVW8nBun3Tyi5HUY0EtFbiBbEc1gucuw,2084 +pytz/zoneinfo/Europe/Tiraspol,sha256=p1J_rqFE13pL8cpBRrEFe-teCI8f0fKK4uTUy_4diF4,2390 +pytz/zoneinfo/Europe/Ulyanovsk,sha256=nFsgcVTmTiiFzHtyJDRnO-3H4GRAfAeceb6b2jFHLUQ,1267 +pytz/zoneinfo/Europe/Uzhgorod,sha256=TIG1rC4QR7nz-vO1VtmN9mDMVjKPDKi7mEB9KpfJOBA,2050 +pytz/zoneinfo/Europe/Vaduz,sha256=K5QY7Ujj2VUchKR4bhhb0hgdAJhmwED71ykXDQOGKe8,1909 +pytz/zoneinfo/Europe/Vatican,sha256=-X5F_d3Dz0kBRWiUTXUN-fgeCHbUEHLaaHIwEPZEdUQ,2641 +pytz/zoneinfo/Europe/Vienna,sha256=ZmI3kADE6bnrJEccqh73XXBY36L1G4DkpiTQImtNrUk,2200 +pytz/zoneinfo/Europe/Vilnius,sha256=UFzRX3orCTB8d9IzlxJPy5eUA2oBPuCu1UJl-2D7C3U,2162 +pytz/zoneinfo/Europe/Volgograd,sha256=f93XuSISl0e0ULHzMnywgQk1NA5cnb4Il-XlYIpamy4,1165 +pytz/zoneinfo/Europe/Warsaw,sha256=TiLDPbeVF0ckgLVEkaSeDaKZ8wctdJDOl_HE_Wd5rKs,2654 +pytz/zoneinfo/Europe/Zagreb,sha256=OpWtsGFWBE_S-mYoQcAmjCta9HwbGQANnSmVY9OHCTo,1920 +pytz/zoneinfo/Europe/Zaporozhye,sha256=V0dhGl3gET8OftMezf8CVy-W00Z7FtuEev5TjI2Rnyw,2106 +pytz/zoneinfo/Europe/Zurich,sha256=K5QY7Ujj2VUchKR4bhhb0hgdAJhmwED71ykXDQOGKe8,1909 +pytz/zoneinfo/Indian/Antananarivo,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Indian/Chagos,sha256=23B26pwwK0gxW7TP76GltyY-RU_o6RGGSrF93pF7S1E,199 +pytz/zoneinfo/Indian/Christmas,sha256=J4I0WDX_LYAJxsx2vU0EdxFJQKRE-rRL1UvNQv09pCs,165 +pytz/zoneinfo/Indian/Cocos,sha256=PX-k8JpghajjvhljtBjWozaiu9NhUSpVeoACy2cAxN8,174 +pytz/zoneinfo/Indian/Comoro,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Indian/Kerguelen,sha256=oIvd6bmQFMLUefoBn4c1fQTOAawGcrPcmge2jU7BsYo,165 +pytz/zoneinfo/Indian/Mahe,sha256=fFZ8A-WddgCX1zpcNg3qiGYNeKov8azY57WrPT_d8nM,165 +pytz/zoneinfo/Indian/Maldives,sha256=dUQBbrmoB3odWsMt3K1YUnB447A6nkW3aR1aHzdLF7M,199 +pytz/zoneinfo/Indian/Mauritius,sha256=k6vWUVcfU3gS1K12e_aMw6BeSdMvdLyCJRCAL7CD0go,241 +pytz/zoneinfo/Indian/Mayotte,sha256=Uum8ISzpRaDh830iNkfRva-Rn6NTuuGHNWjig5C29Zo,251 +pytz/zoneinfo/Indian/Reunion,sha256=lHnSVh7CYCuDBEM4dYsWDk006BSAznkCPxjiTtL_WiI,165 +pytz/zoneinfo/Mexico/BajaNorte,sha256=OHHtvy3J70z6wvKBHgPqMEnGs6SXp8fkf0WX9ZiOODk,2342 +pytz/zoneinfo/Mexico/BajaSur,sha256=aIyre-8trAXSHtqxbuu6gDDkWCUjI_SdAKPIjz74M2E,1526 +pytz/zoneinfo/Mexico/General,sha256=DSpTe5TT0KBsxGx79Rs7ah-zJpiGOJKwPjztovRN0b4,1584 +pytz/zoneinfo/Pacific/Apia,sha256=p1vFsjfezDCHmPOnmgG47q7wTPM5feosoWN3ucgGnrw,1097 +pytz/zoneinfo/Pacific/Auckland,sha256=gADjoyPo_QISQU6UJrAgcHp3HDaMoOFRdH-d23uBSyc,2437 +pytz/zoneinfo/Pacific/Bougainville,sha256=ZKDa_S_2gSlmOWizV1DqxH3wbE58rfK1vKZHZqrrtjI,268 
+pytz/zoneinfo/Pacific/Chatham,sha256=lkVqaSF1WWpv_B2K-k2uJp2setRVK6XbjsQ38gDGVEg,2068 +pytz/zoneinfo/Pacific/Chuuk,sha256=6IYDKViuRDC_RVx1AJOxazVET6cZtdv_LFE6xbtGItI,269 +pytz/zoneinfo/Pacific/Easter,sha256=paHp1QRXIa02kgd0-4V6vWXdqcwheow-hJQD9VqacfQ,2233 +pytz/zoneinfo/Pacific/Efate,sha256=qMpQfM1DMNCg67In4d2-qmMLDANPbHTypP86XOtINuE,466 +pytz/zoneinfo/Pacific/Enderbury,sha256=zqW7qAC_6FTcgrGEMhpIsl1oV9I46gY2nH3pwadll68,234 +pytz/zoneinfo/Pacific/Fakaofo,sha256=gow-SgE5r5c8J_Ag5nvJ5SUPDg6yH8pth_a-QLDcPv8,200 +pytz/zoneinfo/Pacific/Fiji,sha256=zIiVOEMXBs3GcPwqTiaG2QL0egPB8nltIU7ZyoqfaSk,1077 +pytz/zoneinfo/Pacific/Funafuti,sha256=P-XYwlWQpWvS3Q_TYFe37BrgxKJy5tg7PHEQNCDGv5U,166 +pytz/zoneinfo/Pacific/Galapagos,sha256=MdtlC-ffp8reICzDxsQ8tWMsTkq5ZcN-j3OyyhjokV8,238 +pytz/zoneinfo/Pacific/Gambier,sha256=z6eYF8sszLjkfpqmWnbBBAUB-ibaR5nodKaAYbvXOe0,164 +pytz/zoneinfo/Pacific/Guadalcanal,sha256=6GX-XpxcCyA64qUMdxJMFMq4sPk0ZjhexqGbryzfgjE,166 +pytz/zoneinfo/Pacific/Guam,sha256=Ex9znmf6rNfGze6gNpZJCMr1TT4rkl2SnrhecrdJufI,494 +pytz/zoneinfo/Pacific/Honolulu,sha256=fwPRv1Jk56sCOi75uZfd_Iy2k2aSQHx3B2K5xUlSPzM,329 +pytz/zoneinfo/Pacific/Johnston,sha256=fwPRv1Jk56sCOi75uZfd_Iy2k2aSQHx3B2K5xUlSPzM,329 +pytz/zoneinfo/Pacific/Kiritimati,sha256=VHR3iuwiv3tx65WtitVHCoQEg3VJd812VZ5djuSyUxc,238 +pytz/zoneinfo/Pacific/Kosrae,sha256=Vm5AKI6NvuYSz58s8922WNIiWoqPcix2JOJOix1mlSU,351 +pytz/zoneinfo/Pacific/Kwajalein,sha256=L4nH3qxv5EBKVRxYt67b9IfZfBzg5KJk19iu7x3oBMk,316 +pytz/zoneinfo/Pacific/Majuro,sha256=Dwqh7gXoz7Duwu1n7XF8yEjhM4ULEs42LSQyy7F-qzQ,310 +pytz/zoneinfo/Pacific/Marquesas,sha256=uzsjVolutGRXp_FRnvXoU0ApDEb4ZaYoz_r60D7jufg,173 +pytz/zoneinfo/Pacific/Midway,sha256=fCYrYphYY6rUfxOw712y5cyRe104AC3pouqD3bCINFg,175 +pytz/zoneinfo/Pacific/Nauru,sha256=oGxocYsqssZ_EeQHf3cUP5cg0qtqzx1BzoEjVWjE_7g,252 +pytz/zoneinfo/Pacific/Niue,sha256=lSsVlJJ458vNuIgjZESQyatsJV3LpWGyHqbYXMXPjZ4,241 +pytz/zoneinfo/Pacific/Norfolk,sha256=CdEXM9SKYC9Wn7aMxD2sV5i8zE88NQo25Z_L874JthI,880 +pytz/zoneinfo/Pacific/Noumea,sha256=FSanpAOCE7WHQeiop4QErKV9ZC3Tzu2GxkH8-tIXsHY,304 +pytz/zoneinfo/Pacific/Pago_Pago,sha256=fCYrYphYY6rUfxOw712y5cyRe104AC3pouqD3bCINFg,175 +pytz/zoneinfo/Pacific/Palau,sha256=CRW__McXPlOaxo2S9kHMHaBdjv7u59ZWEwYuJConzmQ,180 +pytz/zoneinfo/Pacific/Pitcairn,sha256=O65Ed1FOCF_0rEjpYPAquDwtAF3hxyJNiujgpgZV0kc,202 +pytz/zoneinfo/Pacific/Pohnpei,sha256=YqXrKwjhUnxWyV6PFg1L6_zu84MfPW82dypf0S7pHtQ,303 +pytz/zoneinfo/Pacific/Ponape,sha256=YqXrKwjhUnxWyV6PFg1L6_zu84MfPW82dypf0S7pHtQ,303 +pytz/zoneinfo/Pacific/Port_Moresby,sha256=ei_XjmiRDLh-RU94uvz9CCIIRFH1r0X7WL-sB-6DF60,186 +pytz/zoneinfo/Pacific/Rarotonga,sha256=UfUhlaG0u7yOlzoKnHE9pRiHqQ2N_M9n5WHaCCwtbV4,577 +pytz/zoneinfo/Pacific/Saipan,sha256=Ex9znmf6rNfGze6gNpZJCMr1TT4rkl2SnrhecrdJufI,494 +pytz/zoneinfo/Pacific/Samoa,sha256=fCYrYphYY6rUfxOw712y5cyRe104AC3pouqD3bCINFg,175 +pytz/zoneinfo/Pacific/Tahiti,sha256=9iozXRFYDhBOLijmDk2mRS4Mb-LXWW1u7n790jBNKxM,165 +pytz/zoneinfo/Pacific/Tarawa,sha256=vT6UxW7KeGptdh80Fj9ASATGmLx8Wai630lML4mwg80,166 +pytz/zoneinfo/Pacific/Tongatapu,sha256=ht8ZhdveQXJqsxYtSEcqmRTzXA3OtqYoi4WVBvOPGhw,372 +pytz/zoneinfo/Pacific/Truk,sha256=6IYDKViuRDC_RVx1AJOxazVET6cZtdv_LFE6xbtGItI,269 +pytz/zoneinfo/Pacific/Wake,sha256=dTJxldgcad-kGrODwo4cAHGRSsS-K3fjeZ62WEUhmFk,166 +pytz/zoneinfo/Pacific/Wallis,sha256=CAlw1H5gkER5lkvtmHY-ppoGL3hNmYxfMaXQpI0fTOE,166 +pytz/zoneinfo/Pacific/Yap,sha256=6IYDKViuRDC_RVx1AJOxazVET6cZtdv_LFE6xbtGItI,269 +pytz/zoneinfo/US/Alaska,sha256=oZA1NSPS2BWdymYpnCHFO8BlYVS-ll5KLg2Ez9CbETs,2371 
+pytz/zoneinfo/US/Aleutian,sha256=IB1DhwJQAKbhPJ9jHLf8zW5Dad7HIkBS-dhv64E1OlM,2356 +pytz/zoneinfo/US/Arizona,sha256=nEOwYOnGxENw9zW8m50PGxbtVfTrX3QYAo4x4LgOLfI,328 +pytz/zoneinfo/US/Central,sha256=4aZFw-svkMyXmSpNufqzK-xveos-oVJDpEyI8Yu9HQE,3576 +pytz/zoneinfo/US/East-Indiana,sha256=GrNub1_3Um5Qh67wOx58_TEAz4fwAeAlk2AlMTVA_sI,1666 +pytz/zoneinfo/US/Eastern,sha256=7AoiEGjr3wV4P7C4Qs35COZqwr2mjNDq7ocpsSPFOM8,3536 +pytz/zoneinfo/US/Hawaii,sha256=fwPRv1Jk56sCOi75uZfd_Iy2k2aSQHx3B2K5xUlSPzM,329 +pytz/zoneinfo/US/Indiana-Starke,sha256=BiALShjiOLg1o8mMRWJ1jyTlJkgvwzte7B9WSOvTUNg,2428 +pytz/zoneinfo/US/Michigan,sha256=hecz8yqY2Cj5B61G3gLZdAVZvRgK9l0P90c_gN-uD5g,2230 +pytz/zoneinfo/US/Mountain,sha256=6_yPo1_mvnt9DgpPzr0QdHsjdsfUG6ALnagQLML1DSM,2444 +pytz/zoneinfo/US/Pacific,sha256=VOy1PikdjiVdJ7lukVGzwl8uDxV_KYqznkTm5BLEiDM,2836 +pytz/zoneinfo/US/Samoa,sha256=fCYrYphYY6rUfxOw712y5cyRe104AC3pouqD3bCINFg,175 +pytz-2019.3.dist-info/DESCRIPTION.rst,sha256=xI1KevtOms0GULVNhelE82mnz9bO-KkIEPTS37yF-HM,19243 +pytz-2019.3.dist-info/LICENSE.txt,sha256=OfB8cqG_2jScvSe6ybyx5vjFtOXMP631aQBAbozAt5I,1088 +pytz-2019.3.dist-info/METADATA,sha256=oNmEh_Tvwm0rAlV2ocUXR8jDQzPRkdIaP_QsbeSavwo,20635 +pytz-2019.3.dist-info/RECORD,, +pytz-2019.3.dist-info/WHEEL,sha256=kdsN-5OJAZIiHN-iO4Rhl82KyS0bDWf4uBwMbkNafr8,110 +pytz-2019.3.dist-info/metadata.json,sha256=n7e14T2ogy_KLwtqxidvKwHafqXM3GOddSu4Ip3wQn8,1505 +pytz-2019.3.dist-info/top_level.txt,sha256=6xRYlt934v1yHb1JIrXgHyGxn3cqACvd-yE8ski_kcc,5 +pytz-2019.3.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1 +pytz-2019.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +pytz/__pycache__/exceptions.cpython-37.pyc,, +pytz/__pycache__/lazy.cpython-37.pyc,, +pytz/__pycache__/reference.cpython-37.pyc,, +pytz/__pycache__/tzfile.cpython-37.pyc,, +pytz/__pycache__/tzinfo.cpython-37.pyc,, +pytz/__pycache__/__init__.cpython-37.pyc,, diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/WHEEL b/venv/Lib/site-packages/pytz-2019.3.dist-info/WHEEL new file mode 100644 index 0000000..7332a41 --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.30.0) +Root-Is-Purelib: true +Tag: py2-none-any +Tag: py3-none-any + diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/metadata.json b/venv/Lib/site-packages/pytz-2019.3.dist-info/metadata.json new file mode 100644 index 0000000..e0c5a77 --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/metadata.json @@ -0,0 +1 @@ +{"classifiers": ["Development Status :: 6 - Mature", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.4", "Programming Language :: Python :: 2.5", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.0", "Programming Language :: Python :: 3.1", "Programming Language :: Python :: 3.2", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Topic :: Software Development :: Libraries :: Python Modules"], "download_url": "https://pypi.org/project/pytz/", "extensions": {"python.details": {"contacts": [{"email": "stuart@stuartbishop.net", "name": "Stuart Bishop", "role": 
"author"}, {"email": "stuart@stuartbishop.net", "name": "Stuart Bishop", "role": "maintainer"}], "document_names": {"description": "DESCRIPTION.rst", "license": "LICENSE.txt"}, "project_urls": {"Home": "http://pythonhosted.org/pytz"}}}, "generator": "bdist_wheel (0.30.0)", "keywords": ["timezone", "tzinfo", "datetime", "olson", "time"], "license": "MIT", "metadata_version": "2.0", "name": "pytz", "platform": "Independent", "summary": "World timezone definitions, modern and historical", "version": "2019.3"} \ No newline at end of file diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/top_level.txt b/venv/Lib/site-packages/pytz-2019.3.dist-info/top_level.txt new file mode 100644 index 0000000..af44f19 --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/top_level.txt @@ -0,0 +1 @@ +pytz diff --git a/venv/Lib/site-packages/pytz-2019.3.dist-info/zip-safe b/venv/Lib/site-packages/pytz-2019.3.dist-info/zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/venv/Lib/site-packages/pytz-2019.3.dist-info/zip-safe @@ -0,0 +1 @@ + diff --git a/venv/Lib/site-packages/pytz/__init__.py b/venv/Lib/site-packages/pytz/__init__.py new file mode 100644 index 0000000..8b6bd64 --- /dev/null +++ b/venv/Lib/site-packages/pytz/__init__.py @@ -0,0 +1,1551 @@ +''' +datetime.tzinfo timezone definitions generated from the +Olson timezone database: + + ftp://elsie.nci.nih.gov/pub/tz*.tar.gz + +See the datetime section of the Python Library Reference for information +on how to use these modules. +''' + +import sys +import datetime +import os.path + +from pytz.exceptions import AmbiguousTimeError +from pytz.exceptions import InvalidTimeError +from pytz.exceptions import NonExistentTimeError +from pytz.exceptions import UnknownTimeZoneError +from pytz.lazy import LazyDict, LazyList, LazySet # noqa +from pytz.tzinfo import unpickler, BaseTzInfo +from pytz.tzfile import build_tzinfo + + +# The IANA (nee Olson) database is updated several times a year. +OLSON_VERSION = '2019c' +VERSION = '2019.3' # pip compatible version number. +__version__ = VERSION + +OLSEN_VERSION = OLSON_VERSION # Old releases had this misspelling + +__all__ = [ + 'timezone', 'utc', 'country_timezones', 'country_names', + 'AmbiguousTimeError', 'InvalidTimeError', + 'NonExistentTimeError', 'UnknownTimeZoneError', + 'all_timezones', 'all_timezones_set', + 'common_timezones', 'common_timezones_set', + 'BaseTzInfo', +] + + +if sys.version_info[0] > 2: # Python 3.x + + # Python 3.x doesn't have unicode(), making writing code + # for Python 2.3 and Python 3.x a pain. + unicode = str + + def ascii(s): + r""" + >>> ascii('Hello') + 'Hello' + >>> ascii('\N{TRADE MARK SIGN}') #doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + UnicodeEncodeError: ... + """ + if type(s) == bytes: + s = s.decode('ASCII') + else: + s.encode('ASCII') # Raise an exception if not ASCII + return s # But the string - not a byte string. + +else: # Python 2.x + + def ascii(s): + r""" + >>> ascii('Hello') + 'Hello' + >>> ascii(u'Hello') + 'Hello' + >>> ascii(u'\N{TRADE MARK SIGN}') #doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + UnicodeEncodeError: ... + """ + return s.encode('ASCII') + + +def open_resource(name): + """Open a resource from the zoneinfo subdir for reading. + + Uses the pkg_resources module if available and no standard file + found at the calculated location. + + It is possible to specify different location for zoneinfo + subdir by using the PYTZ_TZDATADIR environment variable. 
+ """ + name_parts = name.lstrip('/').split('/') + for part in name_parts: + if part == os.path.pardir or os.path.sep in part: + raise ValueError('Bad path segment: %r' % part) + zoneinfo_dir = os.environ.get('PYTZ_TZDATADIR', None) + if zoneinfo_dir is not None: + filename = os.path.join(zoneinfo_dir, *name_parts) + else: + filename = os.path.join(os.path.dirname(__file__), + 'zoneinfo', *name_parts) + if not os.path.exists(filename): + # http://bugs.launchpad.net/bugs/383171 - we avoid using this + # unless absolutely necessary to help when a broken version of + # pkg_resources is installed. + try: + from pkg_resources import resource_stream + except ImportError: + resource_stream = None + + if resource_stream is not None: + return resource_stream(__name__, 'zoneinfo/' + name) + return open(filename, 'rb') + + +def resource_exists(name): + """Return true if the given resource exists""" + try: + open_resource(name).close() + return True + except IOError: + return False + + +_tzinfo_cache = {} + + +def timezone(zone): + r''' Return a datetime.tzinfo implementation for the given timezone + + >>> from datetime import datetime, timedelta + >>> utc = timezone('UTC') + >>> eastern = timezone('US/Eastern') + >>> eastern.zone + 'US/Eastern' + >>> timezone(unicode('US/Eastern')) is eastern + True + >>> utc_dt = datetime(2002, 10, 27, 6, 0, 0, tzinfo=utc) + >>> loc_dt = utc_dt.astimezone(eastern) + >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)' + >>> loc_dt.strftime(fmt) + '2002-10-27 01:00:00 EST (-0500)' + >>> (loc_dt - timedelta(minutes=10)).strftime(fmt) + '2002-10-27 00:50:00 EST (-0500)' + >>> eastern.normalize(loc_dt - timedelta(minutes=10)).strftime(fmt) + '2002-10-27 01:50:00 EDT (-0400)' + >>> (loc_dt + timedelta(minutes=10)).strftime(fmt) + '2002-10-27 01:10:00 EST (-0500)' + + Raises UnknownTimeZoneError if passed an unknown zone. + + >>> try: + ... timezone('Asia/Shangri-La') + ... except UnknownTimeZoneError: + ... print('Unknown') + Unknown + + >>> try: + ... timezone(unicode('\N{TRADE MARK SIGN}')) + ... except UnknownTimeZoneError: + ... print('Unknown') + Unknown + + ''' + if zone is None: + raise UnknownTimeZoneError(None) + + if zone.upper() == 'UTC': + return utc + + try: + zone = ascii(zone) + except UnicodeEncodeError: + # All valid timezones are ASCII + raise UnknownTimeZoneError(zone) + + zone = _case_insensitive_zone_lookup(_unmunge_zone(zone)) + if zone not in _tzinfo_cache: + if zone in all_timezones_set: # noqa + fp = open_resource(zone) + try: + _tzinfo_cache[zone] = build_tzinfo(zone, fp) + finally: + fp.close() + else: + raise UnknownTimeZoneError(zone) + + return _tzinfo_cache[zone] + + +def _unmunge_zone(zone): + """Undo the time zone name munging done by older versions of pytz.""" + return zone.replace('_plus_', '+').replace('_minus_', '-') + + +_all_timezones_lower_to_standard = None + + +def _case_insensitive_zone_lookup(zone): + """case-insensitively matching timezone, else return zone unchanged""" + global _all_timezones_lower_to_standard + if _all_timezones_lower_to_standard is None: + _all_timezones_lower_to_standard = dict((tz.lower(), tz) for tz in all_timezones) # noqa + return _all_timezones_lower_to_standard.get(zone.lower()) or zone # noqa + + +ZERO = datetime.timedelta(0) +HOUR = datetime.timedelta(hours=1) + + +class UTC(BaseTzInfo): + """UTC + + Optimized UTC implementation. It unpickles using the single module global + instance defined beneath this class declaration. 
+ """ + zone = "UTC" + + _utcoffset = ZERO + _dst = ZERO + _tzname = zone + + def fromutc(self, dt): + if dt.tzinfo is None: + return self.localize(dt) + return super(utc.__class__, self).fromutc(dt) + + def utcoffset(self, dt): + return ZERO + + def tzname(self, dt): + return "UTC" + + def dst(self, dt): + return ZERO + + def __reduce__(self): + return _UTC, () + + def localize(self, dt, is_dst=False): + '''Convert naive time to local time''' + if dt.tzinfo is not None: + raise ValueError('Not naive datetime (tzinfo is already set)') + return dt.replace(tzinfo=self) + + def normalize(self, dt, is_dst=False): + '''Correct the timezone information on the given datetime''' + if dt.tzinfo is self: + return dt + if dt.tzinfo is None: + raise ValueError('Naive time - no tzinfo set') + return dt.astimezone(self) + + def __repr__(self): + return "" + + def __str__(self): + return "UTC" + + +UTC = utc = UTC() # UTC is a singleton + + +def _UTC(): + """Factory function for utc unpickling. + + Makes sure that unpickling a utc instance always returns the same + module global. + + These examples belong in the UTC class above, but it is obscured; or in + the README.txt, but we are not depending on Python 2.4 so integrating + the README.txt examples with the unit tests is not trivial. + + >>> import datetime, pickle + >>> dt = datetime.datetime(2005, 3, 1, 14, 13, 21, tzinfo=utc) + >>> naive = dt.replace(tzinfo=None) + >>> p = pickle.dumps(dt, 1) + >>> naive_p = pickle.dumps(naive, 1) + >>> len(p) - len(naive_p) + 17 + >>> new = pickle.loads(p) + >>> new == dt + True + >>> new is dt + False + >>> new.tzinfo is dt.tzinfo + True + >>> utc is UTC is timezone('UTC') + True + >>> utc is timezone('GMT') + False + """ + return utc + + +_UTC.__safe_for_unpickling__ = True + + +def _p(*args): + """Factory function for unpickling pytz tzinfo instances. + + Just a wrapper around tzinfo.unpickler to save a few bytes in each pickle + by shortening the path. + """ + return unpickler(*args) + + +_p.__safe_for_unpickling__ = True + + +class _CountryTimezoneDict(LazyDict): + """Map ISO 3166 country code to a list of timezone names commonly used + in that country. + + iso3166_code is the two letter code used to identify the country. + + >>> def print_list(list_of_strings): + ... 'We use a helper so doctests work under Python 2.3 -> 3.x' + ... for s in list_of_strings: + ... print(s) + + >>> print_list(country_timezones['nz']) + Pacific/Auckland + Pacific/Chatham + >>> print_list(country_timezones['ch']) + Europe/Zurich + >>> print_list(country_timezones['CH']) + Europe/Zurich + >>> print_list(country_timezones[unicode('ch')]) + Europe/Zurich + >>> print_list(country_timezones['XXX']) + Traceback (most recent call last): + ... + KeyError: 'XXX' + + Previously, this information was exposed as a function rather than a + dictionary. 
This is still supported:: + + >>> print_list(country_timezones('nz')) + Pacific/Auckland + Pacific/Chatham + """ + def __call__(self, iso3166_code): + """Backwards compatibility.""" + return self[iso3166_code] + + def _fill(self): + data = {} + zone_tab = open_resource('zone.tab') + try: + for line in zone_tab: + line = line.decode('UTF-8') + if line.startswith('#'): + continue + code, coordinates, zone = line.split(None, 4)[:3] + if zone not in all_timezones_set: # noqa + continue + try: + data[code].append(zone) + except KeyError: + data[code] = [zone] + self.data = data + finally: + zone_tab.close() + + +country_timezones = _CountryTimezoneDict() + + +class _CountryNameDict(LazyDict): + '''Dictionary proving ISO3166 code -> English name. + + >>> print(country_names['au']) + Australia + ''' + def _fill(self): + data = {} + zone_tab = open_resource('iso3166.tab') + try: + for line in zone_tab.readlines(): + line = line.decode('UTF-8') + if line.startswith('#'): + continue + code, name = line.split(None, 1) + data[code] = name.strip() + self.data = data + finally: + zone_tab.close() + + +country_names = _CountryNameDict() + + +# Time-zone info based solely on fixed offsets + +class _FixedOffset(datetime.tzinfo): + + zone = None # to match the standard pytz API + + def __init__(self, minutes): + if abs(minutes) >= 1440: + raise ValueError("absolute offset is too large", minutes) + self._minutes = minutes + self._offset = datetime.timedelta(minutes=minutes) + + def utcoffset(self, dt): + return self._offset + + def __reduce__(self): + return FixedOffset, (self._minutes, ) + + def dst(self, dt): + return ZERO + + def tzname(self, dt): + return None + + def __repr__(self): + return 'pytz.FixedOffset(%d)' % self._minutes + + def localize(self, dt, is_dst=False): + '''Convert naive time to local time''' + if dt.tzinfo is not None: + raise ValueError('Not naive datetime (tzinfo is already set)') + return dt.replace(tzinfo=self) + + def normalize(self, dt, is_dst=False): + '''Correct the timezone information on the given datetime''' + if dt.tzinfo is self: + return dt + if dt.tzinfo is None: + raise ValueError('Naive time - no tzinfo set') + return dt.astimezone(self) + + +def FixedOffset(offset, _tzinfos={}): + """return a fixed-offset timezone based off a number of minutes. + + >>> one = FixedOffset(-330) + >>> one + pytz.FixedOffset(-330) + >>> str(one.utcoffset(datetime.datetime.now())) + '-1 day, 18:30:00' + >>> str(one.dst(datetime.datetime.now())) + '0:00:00' + + >>> two = FixedOffset(1380) + >>> two + pytz.FixedOffset(1380) + >>> str(two.utcoffset(datetime.datetime.now())) + '23:00:00' + >>> str(two.dst(datetime.datetime.now())) + '0:00:00' + + The datetime.timedelta must be between the range of -1 and 1 day, + non-inclusive. + + >>> FixedOffset(1440) + Traceback (most recent call last): + ... + ValueError: ('absolute offset is too large', 1440) + + >>> FixedOffset(-1440) + Traceback (most recent call last): + ... + ValueError: ('absolute offset is too large', -1440) + + An offset of 0 is special-cased to return UTC. + + >>> FixedOffset(0) is UTC + True + + There should always be only one instance of a FixedOffset per timedelta. + This should be true for multiple creation calls. + + >>> FixedOffset(-330) is one + True + >>> FixedOffset(1380) is two + True + + It should also be true for pickling. 
+ + >>> import pickle + >>> pickle.loads(pickle.dumps(one)) is one + True + >>> pickle.loads(pickle.dumps(two)) is two + True + """ + if offset == 0: + return UTC + + info = _tzinfos.get(offset) + if info is None: + # We haven't seen this one before. we need to save it. + + # Use setdefault to avoid a race condition and make sure we have + # only one + info = _tzinfos.setdefault(offset, _FixedOffset(offset)) + + return info + + +FixedOffset.__safe_for_unpickling__ = True + + +def _test(): + import doctest + sys.path.insert(0, os.pardir) + import pytz + return doctest.testmod(pytz) + + +if __name__ == '__main__': + _test() +all_timezones = \ +['Africa/Abidjan', + 'Africa/Accra', + 'Africa/Addis_Ababa', + 'Africa/Algiers', + 'Africa/Asmara', + 'Africa/Asmera', + 'Africa/Bamako', + 'Africa/Bangui', + 'Africa/Banjul', + 'Africa/Bissau', + 'Africa/Blantyre', + 'Africa/Brazzaville', + 'Africa/Bujumbura', + 'Africa/Cairo', + 'Africa/Casablanca', + 'Africa/Ceuta', + 'Africa/Conakry', + 'Africa/Dakar', + 'Africa/Dar_es_Salaam', + 'Africa/Djibouti', + 'Africa/Douala', + 'Africa/El_Aaiun', + 'Africa/Freetown', + 'Africa/Gaborone', + 'Africa/Harare', + 'Africa/Johannesburg', + 'Africa/Juba', + 'Africa/Kampala', + 'Africa/Khartoum', + 'Africa/Kigali', + 'Africa/Kinshasa', + 'Africa/Lagos', + 'Africa/Libreville', + 'Africa/Lome', + 'Africa/Luanda', + 'Africa/Lubumbashi', + 'Africa/Lusaka', + 'Africa/Malabo', + 'Africa/Maputo', + 'Africa/Maseru', + 'Africa/Mbabane', + 'Africa/Mogadishu', + 'Africa/Monrovia', + 'Africa/Nairobi', + 'Africa/Ndjamena', + 'Africa/Niamey', + 'Africa/Nouakchott', + 'Africa/Ouagadougou', + 'Africa/Porto-Novo', + 'Africa/Sao_Tome', + 'Africa/Timbuktu', + 'Africa/Tripoli', + 'Africa/Tunis', + 'Africa/Windhoek', + 'America/Adak', + 'America/Anchorage', + 'America/Anguilla', + 'America/Antigua', + 'America/Araguaina', + 'America/Argentina/Buenos_Aires', + 'America/Argentina/Catamarca', + 'America/Argentina/ComodRivadavia', + 'America/Argentina/Cordoba', + 'America/Argentina/Jujuy', + 'America/Argentina/La_Rioja', + 'America/Argentina/Mendoza', + 'America/Argentina/Rio_Gallegos', + 'America/Argentina/Salta', + 'America/Argentina/San_Juan', + 'America/Argentina/San_Luis', + 'America/Argentina/Tucuman', + 'America/Argentina/Ushuaia', + 'America/Aruba', + 'America/Asuncion', + 'America/Atikokan', + 'America/Atka', + 'America/Bahia', + 'America/Bahia_Banderas', + 'America/Barbados', + 'America/Belem', + 'America/Belize', + 'America/Blanc-Sablon', + 'America/Boa_Vista', + 'America/Bogota', + 'America/Boise', + 'America/Buenos_Aires', + 'America/Cambridge_Bay', + 'America/Campo_Grande', + 'America/Cancun', + 'America/Caracas', + 'America/Catamarca', + 'America/Cayenne', + 'America/Cayman', + 'America/Chicago', + 'America/Chihuahua', + 'America/Coral_Harbour', + 'America/Cordoba', + 'America/Costa_Rica', + 'America/Creston', + 'America/Cuiaba', + 'America/Curacao', + 'America/Danmarkshavn', + 'America/Dawson', + 'America/Dawson_Creek', + 'America/Denver', + 'America/Detroit', + 'America/Dominica', + 'America/Edmonton', + 'America/Eirunepe', + 'America/El_Salvador', + 'America/Ensenada', + 'America/Fort_Nelson', + 'America/Fort_Wayne', + 'America/Fortaleza', + 'America/Glace_Bay', + 'America/Godthab', + 'America/Goose_Bay', + 'America/Grand_Turk', + 'America/Grenada', + 'America/Guadeloupe', + 'America/Guatemala', + 'America/Guayaquil', + 'America/Guyana', + 'America/Halifax', + 'America/Havana', + 'America/Hermosillo', + 'America/Indiana/Indianapolis', + 'America/Indiana/Knox', + 
'America/Indiana/Marengo', + 'America/Indiana/Petersburg', + 'America/Indiana/Tell_City', + 'America/Indiana/Vevay', + 'America/Indiana/Vincennes', + 'America/Indiana/Winamac', + 'America/Indianapolis', + 'America/Inuvik', + 'America/Iqaluit', + 'America/Jamaica', + 'America/Jujuy', + 'America/Juneau', + 'America/Kentucky/Louisville', + 'America/Kentucky/Monticello', + 'America/Knox_IN', + 'America/Kralendijk', + 'America/La_Paz', + 'America/Lima', + 'America/Los_Angeles', + 'America/Louisville', + 'America/Lower_Princes', + 'America/Maceio', + 'America/Managua', + 'America/Manaus', + 'America/Marigot', + 'America/Martinique', + 'America/Matamoros', + 'America/Mazatlan', + 'America/Mendoza', + 'America/Menominee', + 'America/Merida', + 'America/Metlakatla', + 'America/Mexico_City', + 'America/Miquelon', + 'America/Moncton', + 'America/Monterrey', + 'America/Montevideo', + 'America/Montreal', + 'America/Montserrat', + 'America/Nassau', + 'America/New_York', + 'America/Nipigon', + 'America/Nome', + 'America/Noronha', + 'America/North_Dakota/Beulah', + 'America/North_Dakota/Center', + 'America/North_Dakota/New_Salem', + 'America/Ojinaga', + 'America/Panama', + 'America/Pangnirtung', + 'America/Paramaribo', + 'America/Phoenix', + 'America/Port-au-Prince', + 'America/Port_of_Spain', + 'America/Porto_Acre', + 'America/Porto_Velho', + 'America/Puerto_Rico', + 'America/Punta_Arenas', + 'America/Rainy_River', + 'America/Rankin_Inlet', + 'America/Recife', + 'America/Regina', + 'America/Resolute', + 'America/Rio_Branco', + 'America/Rosario', + 'America/Santa_Isabel', + 'America/Santarem', + 'America/Santiago', + 'America/Santo_Domingo', + 'America/Sao_Paulo', + 'America/Scoresbysund', + 'America/Shiprock', + 'America/Sitka', + 'America/St_Barthelemy', + 'America/St_Johns', + 'America/St_Kitts', + 'America/St_Lucia', + 'America/St_Thomas', + 'America/St_Vincent', + 'America/Swift_Current', + 'America/Tegucigalpa', + 'America/Thule', + 'America/Thunder_Bay', + 'America/Tijuana', + 'America/Toronto', + 'America/Tortola', + 'America/Vancouver', + 'America/Virgin', + 'America/Whitehorse', + 'America/Winnipeg', + 'America/Yakutat', + 'America/Yellowknife', + 'Antarctica/Casey', + 'Antarctica/Davis', + 'Antarctica/DumontDUrville', + 'Antarctica/Macquarie', + 'Antarctica/Mawson', + 'Antarctica/McMurdo', + 'Antarctica/Palmer', + 'Antarctica/Rothera', + 'Antarctica/South_Pole', + 'Antarctica/Syowa', + 'Antarctica/Troll', + 'Antarctica/Vostok', + 'Arctic/Longyearbyen', + 'Asia/Aden', + 'Asia/Almaty', + 'Asia/Amman', + 'Asia/Anadyr', + 'Asia/Aqtau', + 'Asia/Aqtobe', + 'Asia/Ashgabat', + 'Asia/Ashkhabad', + 'Asia/Atyrau', + 'Asia/Baghdad', + 'Asia/Bahrain', + 'Asia/Baku', + 'Asia/Bangkok', + 'Asia/Barnaul', + 'Asia/Beirut', + 'Asia/Bishkek', + 'Asia/Brunei', + 'Asia/Calcutta', + 'Asia/Chita', + 'Asia/Choibalsan', + 'Asia/Chongqing', + 'Asia/Chungking', + 'Asia/Colombo', + 'Asia/Dacca', + 'Asia/Damascus', + 'Asia/Dhaka', + 'Asia/Dili', + 'Asia/Dubai', + 'Asia/Dushanbe', + 'Asia/Famagusta', + 'Asia/Gaza', + 'Asia/Harbin', + 'Asia/Hebron', + 'Asia/Ho_Chi_Minh', + 'Asia/Hong_Kong', + 'Asia/Hovd', + 'Asia/Irkutsk', + 'Asia/Istanbul', + 'Asia/Jakarta', + 'Asia/Jayapura', + 'Asia/Jerusalem', + 'Asia/Kabul', + 'Asia/Kamchatka', + 'Asia/Karachi', + 'Asia/Kashgar', + 'Asia/Kathmandu', + 'Asia/Katmandu', + 'Asia/Khandyga', + 'Asia/Kolkata', + 'Asia/Krasnoyarsk', + 'Asia/Kuala_Lumpur', + 'Asia/Kuching', + 'Asia/Kuwait', + 'Asia/Macao', + 'Asia/Macau', + 'Asia/Magadan', + 'Asia/Makassar', + 'Asia/Manila', + 'Asia/Muscat', + 
'Asia/Nicosia', + 'Asia/Novokuznetsk', + 'Asia/Novosibirsk', + 'Asia/Omsk', + 'Asia/Oral', + 'Asia/Phnom_Penh', + 'Asia/Pontianak', + 'Asia/Pyongyang', + 'Asia/Qatar', + 'Asia/Qostanay', + 'Asia/Qyzylorda', + 'Asia/Rangoon', + 'Asia/Riyadh', + 'Asia/Saigon', + 'Asia/Sakhalin', + 'Asia/Samarkand', + 'Asia/Seoul', + 'Asia/Shanghai', + 'Asia/Singapore', + 'Asia/Srednekolymsk', + 'Asia/Taipei', + 'Asia/Tashkent', + 'Asia/Tbilisi', + 'Asia/Tehran', + 'Asia/Tel_Aviv', + 'Asia/Thimbu', + 'Asia/Thimphu', + 'Asia/Tokyo', + 'Asia/Tomsk', + 'Asia/Ujung_Pandang', + 'Asia/Ulaanbaatar', + 'Asia/Ulan_Bator', + 'Asia/Urumqi', + 'Asia/Ust-Nera', + 'Asia/Vientiane', + 'Asia/Vladivostok', + 'Asia/Yakutsk', + 'Asia/Yangon', + 'Asia/Yekaterinburg', + 'Asia/Yerevan', + 'Atlantic/Azores', + 'Atlantic/Bermuda', + 'Atlantic/Canary', + 'Atlantic/Cape_Verde', + 'Atlantic/Faeroe', + 'Atlantic/Faroe', + 'Atlantic/Jan_Mayen', + 'Atlantic/Madeira', + 'Atlantic/Reykjavik', + 'Atlantic/South_Georgia', + 'Atlantic/St_Helena', + 'Atlantic/Stanley', + 'Australia/ACT', + 'Australia/Adelaide', + 'Australia/Brisbane', + 'Australia/Broken_Hill', + 'Australia/Canberra', + 'Australia/Currie', + 'Australia/Darwin', + 'Australia/Eucla', + 'Australia/Hobart', + 'Australia/LHI', + 'Australia/Lindeman', + 'Australia/Lord_Howe', + 'Australia/Melbourne', + 'Australia/NSW', + 'Australia/North', + 'Australia/Perth', + 'Australia/Queensland', + 'Australia/South', + 'Australia/Sydney', + 'Australia/Tasmania', + 'Australia/Victoria', + 'Australia/West', + 'Australia/Yancowinna', + 'Brazil/Acre', + 'Brazil/DeNoronha', + 'Brazil/East', + 'Brazil/West', + 'CET', + 'CST6CDT', + 'Canada/Atlantic', + 'Canada/Central', + 'Canada/Eastern', + 'Canada/Mountain', + 'Canada/Newfoundland', + 'Canada/Pacific', + 'Canada/Saskatchewan', + 'Canada/Yukon', + 'Chile/Continental', + 'Chile/EasterIsland', + 'Cuba', + 'EET', + 'EST', + 'EST5EDT', + 'Egypt', + 'Eire', + 'Etc/GMT', + 'Etc/GMT+0', + 'Etc/GMT+1', + 'Etc/GMT+10', + 'Etc/GMT+11', + 'Etc/GMT+12', + 'Etc/GMT+2', + 'Etc/GMT+3', + 'Etc/GMT+4', + 'Etc/GMT+5', + 'Etc/GMT+6', + 'Etc/GMT+7', + 'Etc/GMT+8', + 'Etc/GMT+9', + 'Etc/GMT-0', + 'Etc/GMT-1', + 'Etc/GMT-10', + 'Etc/GMT-11', + 'Etc/GMT-12', + 'Etc/GMT-13', + 'Etc/GMT-14', + 'Etc/GMT-2', + 'Etc/GMT-3', + 'Etc/GMT-4', + 'Etc/GMT-5', + 'Etc/GMT-6', + 'Etc/GMT-7', + 'Etc/GMT-8', + 'Etc/GMT-9', + 'Etc/GMT0', + 'Etc/Greenwich', + 'Etc/UCT', + 'Etc/UTC', + 'Etc/Universal', + 'Etc/Zulu', + 'Europe/Amsterdam', + 'Europe/Andorra', + 'Europe/Astrakhan', + 'Europe/Athens', + 'Europe/Belfast', + 'Europe/Belgrade', + 'Europe/Berlin', + 'Europe/Bratislava', + 'Europe/Brussels', + 'Europe/Bucharest', + 'Europe/Budapest', + 'Europe/Busingen', + 'Europe/Chisinau', + 'Europe/Copenhagen', + 'Europe/Dublin', + 'Europe/Gibraltar', + 'Europe/Guernsey', + 'Europe/Helsinki', + 'Europe/Isle_of_Man', + 'Europe/Istanbul', + 'Europe/Jersey', + 'Europe/Kaliningrad', + 'Europe/Kiev', + 'Europe/Kirov', + 'Europe/Lisbon', + 'Europe/Ljubljana', + 'Europe/London', + 'Europe/Luxembourg', + 'Europe/Madrid', + 'Europe/Malta', + 'Europe/Mariehamn', + 'Europe/Minsk', + 'Europe/Monaco', + 'Europe/Moscow', + 'Europe/Nicosia', + 'Europe/Oslo', + 'Europe/Paris', + 'Europe/Podgorica', + 'Europe/Prague', + 'Europe/Riga', + 'Europe/Rome', + 'Europe/Samara', + 'Europe/San_Marino', + 'Europe/Sarajevo', + 'Europe/Saratov', + 'Europe/Simferopol', + 'Europe/Skopje', + 'Europe/Sofia', + 'Europe/Stockholm', + 'Europe/Tallinn', + 'Europe/Tirane', + 'Europe/Tiraspol', + 'Europe/Ulyanovsk', + 
'Europe/Uzhgorod', + 'Europe/Vaduz', + 'Europe/Vatican', + 'Europe/Vienna', + 'Europe/Vilnius', + 'Europe/Volgograd', + 'Europe/Warsaw', + 'Europe/Zagreb', + 'Europe/Zaporozhye', + 'Europe/Zurich', + 'GB', + 'GB-Eire', + 'GMT', + 'GMT+0', + 'GMT-0', + 'GMT0', + 'Greenwich', + 'HST', + 'Hongkong', + 'Iceland', + 'Indian/Antananarivo', + 'Indian/Chagos', + 'Indian/Christmas', + 'Indian/Cocos', + 'Indian/Comoro', + 'Indian/Kerguelen', + 'Indian/Mahe', + 'Indian/Maldives', + 'Indian/Mauritius', + 'Indian/Mayotte', + 'Indian/Reunion', + 'Iran', + 'Israel', + 'Jamaica', + 'Japan', + 'Kwajalein', + 'Libya', + 'MET', + 'MST', + 'MST7MDT', + 'Mexico/BajaNorte', + 'Mexico/BajaSur', + 'Mexico/General', + 'NZ', + 'NZ-CHAT', + 'Navajo', + 'PRC', + 'PST8PDT', + 'Pacific/Apia', + 'Pacific/Auckland', + 'Pacific/Bougainville', + 'Pacific/Chatham', + 'Pacific/Chuuk', + 'Pacific/Easter', + 'Pacific/Efate', + 'Pacific/Enderbury', + 'Pacific/Fakaofo', + 'Pacific/Fiji', + 'Pacific/Funafuti', + 'Pacific/Galapagos', + 'Pacific/Gambier', + 'Pacific/Guadalcanal', + 'Pacific/Guam', + 'Pacific/Honolulu', + 'Pacific/Johnston', + 'Pacific/Kiritimati', + 'Pacific/Kosrae', + 'Pacific/Kwajalein', + 'Pacific/Majuro', + 'Pacific/Marquesas', + 'Pacific/Midway', + 'Pacific/Nauru', + 'Pacific/Niue', + 'Pacific/Norfolk', + 'Pacific/Noumea', + 'Pacific/Pago_Pago', + 'Pacific/Palau', + 'Pacific/Pitcairn', + 'Pacific/Pohnpei', + 'Pacific/Ponape', + 'Pacific/Port_Moresby', + 'Pacific/Rarotonga', + 'Pacific/Saipan', + 'Pacific/Samoa', + 'Pacific/Tahiti', + 'Pacific/Tarawa', + 'Pacific/Tongatapu', + 'Pacific/Truk', + 'Pacific/Wake', + 'Pacific/Wallis', + 'Pacific/Yap', + 'Poland', + 'Portugal', + 'ROC', + 'ROK', + 'Singapore', + 'Turkey', + 'UCT', + 'US/Alaska', + 'US/Aleutian', + 'US/Arizona', + 'US/Central', + 'US/East-Indiana', + 'US/Eastern', + 'US/Hawaii', + 'US/Indiana-Starke', + 'US/Michigan', + 'US/Mountain', + 'US/Pacific', + 'US/Samoa', + 'UTC', + 'Universal', + 'W-SU', + 'WET', + 'Zulu'] +all_timezones = LazyList( + tz for tz in all_timezones if resource_exists(tz)) + +all_timezones_set = LazySet(all_timezones) +common_timezones = \ +['Africa/Abidjan', + 'Africa/Accra', + 'Africa/Addis_Ababa', + 'Africa/Algiers', + 'Africa/Asmara', + 'Africa/Bamako', + 'Africa/Bangui', + 'Africa/Banjul', + 'Africa/Bissau', + 'Africa/Blantyre', + 'Africa/Brazzaville', + 'Africa/Bujumbura', + 'Africa/Cairo', + 'Africa/Casablanca', + 'Africa/Ceuta', + 'Africa/Conakry', + 'Africa/Dakar', + 'Africa/Dar_es_Salaam', + 'Africa/Djibouti', + 'Africa/Douala', + 'Africa/El_Aaiun', + 'Africa/Freetown', + 'Africa/Gaborone', + 'Africa/Harare', + 'Africa/Johannesburg', + 'Africa/Juba', + 'Africa/Kampala', + 'Africa/Khartoum', + 'Africa/Kigali', + 'Africa/Kinshasa', + 'Africa/Lagos', + 'Africa/Libreville', + 'Africa/Lome', + 'Africa/Luanda', + 'Africa/Lubumbashi', + 'Africa/Lusaka', + 'Africa/Malabo', + 'Africa/Maputo', + 'Africa/Maseru', + 'Africa/Mbabane', + 'Africa/Mogadishu', + 'Africa/Monrovia', + 'Africa/Nairobi', + 'Africa/Ndjamena', + 'Africa/Niamey', + 'Africa/Nouakchott', + 'Africa/Ouagadougou', + 'Africa/Porto-Novo', + 'Africa/Sao_Tome', + 'Africa/Tripoli', + 'Africa/Tunis', + 'Africa/Windhoek', + 'America/Adak', + 'America/Anchorage', + 'America/Anguilla', + 'America/Antigua', + 'America/Araguaina', + 'America/Argentina/Buenos_Aires', + 'America/Argentina/Catamarca', + 'America/Argentina/Cordoba', + 'America/Argentina/Jujuy', + 'America/Argentina/La_Rioja', + 'America/Argentina/Mendoza', + 'America/Argentina/Rio_Gallegos', + 
'America/Argentina/Salta', + 'America/Argentina/San_Juan', + 'America/Argentina/San_Luis', + 'America/Argentina/Tucuman', + 'America/Argentina/Ushuaia', + 'America/Aruba', + 'America/Asuncion', + 'America/Atikokan', + 'America/Bahia', + 'America/Bahia_Banderas', + 'America/Barbados', + 'America/Belem', + 'America/Belize', + 'America/Blanc-Sablon', + 'America/Boa_Vista', + 'America/Bogota', + 'America/Boise', + 'America/Cambridge_Bay', + 'America/Campo_Grande', + 'America/Cancun', + 'America/Caracas', + 'America/Cayenne', + 'America/Cayman', + 'America/Chicago', + 'America/Chihuahua', + 'America/Costa_Rica', + 'America/Creston', + 'America/Cuiaba', + 'America/Curacao', + 'America/Danmarkshavn', + 'America/Dawson', + 'America/Dawson_Creek', + 'America/Denver', + 'America/Detroit', + 'America/Dominica', + 'America/Edmonton', + 'America/Eirunepe', + 'America/El_Salvador', + 'America/Fort_Nelson', + 'America/Fortaleza', + 'America/Glace_Bay', + 'America/Godthab', + 'America/Goose_Bay', + 'America/Grand_Turk', + 'America/Grenada', + 'America/Guadeloupe', + 'America/Guatemala', + 'America/Guayaquil', + 'America/Guyana', + 'America/Halifax', + 'America/Havana', + 'America/Hermosillo', + 'America/Indiana/Indianapolis', + 'America/Indiana/Knox', + 'America/Indiana/Marengo', + 'America/Indiana/Petersburg', + 'America/Indiana/Tell_City', + 'America/Indiana/Vevay', + 'America/Indiana/Vincennes', + 'America/Indiana/Winamac', + 'America/Inuvik', + 'America/Iqaluit', + 'America/Jamaica', + 'America/Juneau', + 'America/Kentucky/Louisville', + 'America/Kentucky/Monticello', + 'America/Kralendijk', + 'America/La_Paz', + 'America/Lima', + 'America/Los_Angeles', + 'America/Lower_Princes', + 'America/Maceio', + 'America/Managua', + 'America/Manaus', + 'America/Marigot', + 'America/Martinique', + 'America/Matamoros', + 'America/Mazatlan', + 'America/Menominee', + 'America/Merida', + 'America/Metlakatla', + 'America/Mexico_City', + 'America/Miquelon', + 'America/Moncton', + 'America/Monterrey', + 'America/Montevideo', + 'America/Montserrat', + 'America/Nassau', + 'America/New_York', + 'America/Nipigon', + 'America/Nome', + 'America/Noronha', + 'America/North_Dakota/Beulah', + 'America/North_Dakota/Center', + 'America/North_Dakota/New_Salem', + 'America/Ojinaga', + 'America/Panama', + 'America/Pangnirtung', + 'America/Paramaribo', + 'America/Phoenix', + 'America/Port-au-Prince', + 'America/Port_of_Spain', + 'America/Porto_Velho', + 'America/Puerto_Rico', + 'America/Punta_Arenas', + 'America/Rainy_River', + 'America/Rankin_Inlet', + 'America/Recife', + 'America/Regina', + 'America/Resolute', + 'America/Rio_Branco', + 'America/Santarem', + 'America/Santiago', + 'America/Santo_Domingo', + 'America/Sao_Paulo', + 'America/Scoresbysund', + 'America/Sitka', + 'America/St_Barthelemy', + 'America/St_Johns', + 'America/St_Kitts', + 'America/St_Lucia', + 'America/St_Thomas', + 'America/St_Vincent', + 'America/Swift_Current', + 'America/Tegucigalpa', + 'America/Thule', + 'America/Thunder_Bay', + 'America/Tijuana', + 'America/Toronto', + 'America/Tortola', + 'America/Vancouver', + 'America/Whitehorse', + 'America/Winnipeg', + 'America/Yakutat', + 'America/Yellowknife', + 'Antarctica/Casey', + 'Antarctica/Davis', + 'Antarctica/DumontDUrville', + 'Antarctica/Macquarie', + 'Antarctica/Mawson', + 'Antarctica/McMurdo', + 'Antarctica/Palmer', + 'Antarctica/Rothera', + 'Antarctica/Syowa', + 'Antarctica/Troll', + 'Antarctica/Vostok', + 'Arctic/Longyearbyen', + 'Asia/Aden', + 'Asia/Almaty', + 'Asia/Amman', + 'Asia/Anadyr', + 
'Asia/Aqtau', + 'Asia/Aqtobe', + 'Asia/Ashgabat', + 'Asia/Atyrau', + 'Asia/Baghdad', + 'Asia/Bahrain', + 'Asia/Baku', + 'Asia/Bangkok', + 'Asia/Barnaul', + 'Asia/Beirut', + 'Asia/Bishkek', + 'Asia/Brunei', + 'Asia/Chita', + 'Asia/Choibalsan', + 'Asia/Colombo', + 'Asia/Damascus', + 'Asia/Dhaka', + 'Asia/Dili', + 'Asia/Dubai', + 'Asia/Dushanbe', + 'Asia/Famagusta', + 'Asia/Gaza', + 'Asia/Hebron', + 'Asia/Ho_Chi_Minh', + 'Asia/Hong_Kong', + 'Asia/Hovd', + 'Asia/Irkutsk', + 'Asia/Jakarta', + 'Asia/Jayapura', + 'Asia/Jerusalem', + 'Asia/Kabul', + 'Asia/Kamchatka', + 'Asia/Karachi', + 'Asia/Kathmandu', + 'Asia/Khandyga', + 'Asia/Kolkata', + 'Asia/Krasnoyarsk', + 'Asia/Kuala_Lumpur', + 'Asia/Kuching', + 'Asia/Kuwait', + 'Asia/Macau', + 'Asia/Magadan', + 'Asia/Makassar', + 'Asia/Manila', + 'Asia/Muscat', + 'Asia/Nicosia', + 'Asia/Novokuznetsk', + 'Asia/Novosibirsk', + 'Asia/Omsk', + 'Asia/Oral', + 'Asia/Phnom_Penh', + 'Asia/Pontianak', + 'Asia/Pyongyang', + 'Asia/Qatar', + 'Asia/Qostanay', + 'Asia/Qyzylorda', + 'Asia/Riyadh', + 'Asia/Sakhalin', + 'Asia/Samarkand', + 'Asia/Seoul', + 'Asia/Shanghai', + 'Asia/Singapore', + 'Asia/Srednekolymsk', + 'Asia/Taipei', + 'Asia/Tashkent', + 'Asia/Tbilisi', + 'Asia/Tehran', + 'Asia/Thimphu', + 'Asia/Tokyo', + 'Asia/Tomsk', + 'Asia/Ulaanbaatar', + 'Asia/Urumqi', + 'Asia/Ust-Nera', + 'Asia/Vientiane', + 'Asia/Vladivostok', + 'Asia/Yakutsk', + 'Asia/Yangon', + 'Asia/Yekaterinburg', + 'Asia/Yerevan', + 'Atlantic/Azores', + 'Atlantic/Bermuda', + 'Atlantic/Canary', + 'Atlantic/Cape_Verde', + 'Atlantic/Faroe', + 'Atlantic/Madeira', + 'Atlantic/Reykjavik', + 'Atlantic/South_Georgia', + 'Atlantic/St_Helena', + 'Atlantic/Stanley', + 'Australia/Adelaide', + 'Australia/Brisbane', + 'Australia/Broken_Hill', + 'Australia/Currie', + 'Australia/Darwin', + 'Australia/Eucla', + 'Australia/Hobart', + 'Australia/Lindeman', + 'Australia/Lord_Howe', + 'Australia/Melbourne', + 'Australia/Perth', + 'Australia/Sydney', + 'Canada/Atlantic', + 'Canada/Central', + 'Canada/Eastern', + 'Canada/Mountain', + 'Canada/Newfoundland', + 'Canada/Pacific', + 'Europe/Amsterdam', + 'Europe/Andorra', + 'Europe/Astrakhan', + 'Europe/Athens', + 'Europe/Belgrade', + 'Europe/Berlin', + 'Europe/Bratislava', + 'Europe/Brussels', + 'Europe/Bucharest', + 'Europe/Budapest', + 'Europe/Busingen', + 'Europe/Chisinau', + 'Europe/Copenhagen', + 'Europe/Dublin', + 'Europe/Gibraltar', + 'Europe/Guernsey', + 'Europe/Helsinki', + 'Europe/Isle_of_Man', + 'Europe/Istanbul', + 'Europe/Jersey', + 'Europe/Kaliningrad', + 'Europe/Kiev', + 'Europe/Kirov', + 'Europe/Lisbon', + 'Europe/Ljubljana', + 'Europe/London', + 'Europe/Luxembourg', + 'Europe/Madrid', + 'Europe/Malta', + 'Europe/Mariehamn', + 'Europe/Minsk', + 'Europe/Monaco', + 'Europe/Moscow', + 'Europe/Oslo', + 'Europe/Paris', + 'Europe/Podgorica', + 'Europe/Prague', + 'Europe/Riga', + 'Europe/Rome', + 'Europe/Samara', + 'Europe/San_Marino', + 'Europe/Sarajevo', + 'Europe/Saratov', + 'Europe/Simferopol', + 'Europe/Skopje', + 'Europe/Sofia', + 'Europe/Stockholm', + 'Europe/Tallinn', + 'Europe/Tirane', + 'Europe/Ulyanovsk', + 'Europe/Uzhgorod', + 'Europe/Vaduz', + 'Europe/Vatican', + 'Europe/Vienna', + 'Europe/Vilnius', + 'Europe/Volgograd', + 'Europe/Warsaw', + 'Europe/Zagreb', + 'Europe/Zaporozhye', + 'Europe/Zurich', + 'GMT', + 'Indian/Antananarivo', + 'Indian/Chagos', + 'Indian/Christmas', + 'Indian/Cocos', + 'Indian/Comoro', + 'Indian/Kerguelen', + 'Indian/Mahe', + 'Indian/Maldives', + 'Indian/Mauritius', + 'Indian/Mayotte', + 'Indian/Reunion', + 'Pacific/Apia', + 
'Pacific/Auckland', + 'Pacific/Bougainville', + 'Pacific/Chatham', + 'Pacific/Chuuk', + 'Pacific/Easter', + 'Pacific/Efate', + 'Pacific/Enderbury', + 'Pacific/Fakaofo', + 'Pacific/Fiji', + 'Pacific/Funafuti', + 'Pacific/Galapagos', + 'Pacific/Gambier', + 'Pacific/Guadalcanal', + 'Pacific/Guam', + 'Pacific/Honolulu', + 'Pacific/Kiritimati', + 'Pacific/Kosrae', + 'Pacific/Kwajalein', + 'Pacific/Majuro', + 'Pacific/Marquesas', + 'Pacific/Midway', + 'Pacific/Nauru', + 'Pacific/Niue', + 'Pacific/Norfolk', + 'Pacific/Noumea', + 'Pacific/Pago_Pago', + 'Pacific/Palau', + 'Pacific/Pitcairn', + 'Pacific/Pohnpei', + 'Pacific/Port_Moresby', + 'Pacific/Rarotonga', + 'Pacific/Saipan', + 'Pacific/Tahiti', + 'Pacific/Tarawa', + 'Pacific/Tongatapu', + 'Pacific/Wake', + 'Pacific/Wallis', + 'US/Alaska', + 'US/Arizona', + 'US/Central', + 'US/Eastern', + 'US/Hawaii', + 'US/Mountain', + 'US/Pacific', + 'UTC'] +common_timezones = LazyList( + tz for tz in common_timezones if tz in all_timezones) + +common_timezones_set = LazySet(common_timezones) diff --git a/venv/Lib/site-packages/pytz/exceptions.py b/venv/Lib/site-packages/pytz/exceptions.py new file mode 100644 index 0000000..18df33e --- /dev/null +++ b/venv/Lib/site-packages/pytz/exceptions.py @@ -0,0 +1,48 @@ +''' +Custom exceptions raised by pytz. +''' + +__all__ = [ + 'UnknownTimeZoneError', 'InvalidTimeError', 'AmbiguousTimeError', + 'NonExistentTimeError', +] + + +class UnknownTimeZoneError(KeyError): + '''Exception raised when pytz is passed an unknown timezone. + + >>> isinstance(UnknownTimeZoneError(), LookupError) + True + + This class is actually a subclass of KeyError to provide backwards + compatibility with code relying on the undocumented behavior of earlier + pytz releases. + + >>> isinstance(UnknownTimeZoneError(), KeyError) + True + ''' + pass + + +class InvalidTimeError(Exception): + '''Base class for invalid time exceptions.''' + + +class AmbiguousTimeError(InvalidTimeError): + '''Exception raised when attempting to create an ambiguous wallclock time. + + At the end of a DST transition period, a particular wallclock time will + occur twice (once before the clocks are set back, once after). Both + possibilities may be correct, unless further information is supplied. + + See DstTzInfo.normalize() for more info + ''' + + +class NonExistentTimeError(InvalidTimeError): + '''Exception raised when attempting to create a wallclock time that + cannot exist. + + At the start of a DST transition period, the wallclock time jumps forward. + The instants jumped over never occur. + ''' diff --git a/venv/Lib/site-packages/pytz/lazy.py b/venv/Lib/site-packages/pytz/lazy.py new file mode 100644 index 0000000..39344fc --- /dev/null +++ b/venv/Lib/site-packages/pytz/lazy.py @@ -0,0 +1,172 @@ +from threading import RLock +try: + from collections.abc import Mapping as DictMixin +except ImportError: # Python < 3.3 + try: + from UserDict import DictMixin # Python 2 + except ImportError: # Python 3.0-3.3 + from collections import Mapping as DictMixin + + +# With lazy loading, we might end up with multiple threads triggering +# it at the same time. We need a lock. 
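For context on the hunks above (pytz's exception classes and its lazily built zone-name collections all_timezones / common_timezones and their *_set counterparts), a minimal usage sketch follows; it assumes this vendored pytz is importable and the zone names are only illustrative:

import pytz
from pytz.exceptions import UnknownTimeZoneError

# Membership tests should go through the lazily built sets, not the lists.
print('Asia/Shanghai' in pytz.common_timezones_set)  # True
print('Asia/Harbin' in pytz.common_timezones_set)    # False - listed only in all_timezones
print('Asia/Harbin' in pytz.all_timezones_set)       # True

try:
    pytz.timezone('Asia/Nowhere')                    # hypothetical, invalid zone name
except UnknownTimeZoneError:                         # a KeyError/LookupError subclass, per the hunk above
    print('unknown zone')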
+_fill_lock = RLock() + + +class LazyDict(DictMixin): + """Dictionary populated on first use.""" + data = None + + def __getitem__(self, key): + if self.data is None: + _fill_lock.acquire() + try: + if self.data is None: + self._fill() + finally: + _fill_lock.release() + return self.data[key.upper()] + + def __contains__(self, key): + if self.data is None: + _fill_lock.acquire() + try: + if self.data is None: + self._fill() + finally: + _fill_lock.release() + return key in self.data + + def __iter__(self): + if self.data is None: + _fill_lock.acquire() + try: + if self.data is None: + self._fill() + finally: + _fill_lock.release() + return iter(self.data) + + def __len__(self): + if self.data is None: + _fill_lock.acquire() + try: + if self.data is None: + self._fill() + finally: + _fill_lock.release() + return len(self.data) + + def keys(self): + if self.data is None: + _fill_lock.acquire() + try: + if self.data is None: + self._fill() + finally: + _fill_lock.release() + return self.data.keys() + + +class LazyList(list): + """List populated on first use.""" + + _props = [ + '__str__', '__repr__', '__unicode__', + '__hash__', '__sizeof__', '__cmp__', + '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', + 'append', 'count', 'index', 'extend', 'insert', 'pop', 'remove', + 'reverse', 'sort', '__add__', '__radd__', '__iadd__', '__mul__', + '__rmul__', '__imul__', '__contains__', '__len__', '__nonzero__', + '__getitem__', '__setitem__', '__delitem__', '__iter__', + '__reversed__', '__getslice__', '__setslice__', '__delslice__'] + + def __new__(cls, fill_iter=None): + + if fill_iter is None: + return list() + + # We need a new class as we will be dynamically messing with its + # methods. + class LazyList(list): + pass + + fill_iter = [fill_iter] + + def lazy(name): + def _lazy(self, *args, **kw): + _fill_lock.acquire() + try: + if len(fill_iter) > 0: + list.extend(self, fill_iter.pop()) + for method_name in cls._props: + delattr(LazyList, method_name) + finally: + _fill_lock.release() + return getattr(list, name)(self, *args, **kw) + return _lazy + + for name in cls._props: + setattr(LazyList, name, lazy(name)) + + new_list = LazyList() + return new_list + +# Not all versions of Python declare the same magic methods. +# Filter out properties that don't exist in this version of Python +# from the list. 
+LazyList._props = [prop for prop in LazyList._props if hasattr(list, prop)] + + +class LazySet(set): + """Set populated on first use.""" + + _props = ( + '__str__', '__repr__', '__unicode__', + '__hash__', '__sizeof__', '__cmp__', + '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', + '__contains__', '__len__', '__nonzero__', + '__getitem__', '__setitem__', '__delitem__', '__iter__', + '__sub__', '__and__', '__xor__', '__or__', + '__rsub__', '__rand__', '__rxor__', '__ror__', + '__isub__', '__iand__', '__ixor__', '__ior__', + 'add', 'clear', 'copy', 'difference', 'difference_update', + 'discard', 'intersection', 'intersection_update', 'isdisjoint', + 'issubset', 'issuperset', 'pop', 'remove', + 'symmetric_difference', 'symmetric_difference_update', + 'union', 'update') + + def __new__(cls, fill_iter=None): + + if fill_iter is None: + return set() + + class LazySet(set): + pass + + fill_iter = [fill_iter] + + def lazy(name): + def _lazy(self, *args, **kw): + _fill_lock.acquire() + try: + if len(fill_iter) > 0: + for i in fill_iter.pop(): + set.add(self, i) + for method_name in cls._props: + delattr(LazySet, method_name) + finally: + _fill_lock.release() + return getattr(set, name)(self, *args, **kw) + return _lazy + + for name in cls._props: + setattr(LazySet, name, lazy(name)) + + new_set = LazySet() + return new_set + +# Not all versions of Python declare the same magic methods. +# Filter out properties that don't exist in this version of Python +# from the list. +LazySet._props = [prop for prop in LazySet._props if hasattr(set, prop)] diff --git a/venv/Lib/site-packages/pytz/reference.py b/venv/Lib/site-packages/pytz/reference.py new file mode 100644 index 0000000..f765ca0 --- /dev/null +++ b/venv/Lib/site-packages/pytz/reference.py @@ -0,0 +1,140 @@ +''' +Reference tzinfo implementations from the Python docs. +Used for testing against as they are only correct for the years +1987 to 2006. Do not use these for real code. +''' + +from datetime import tzinfo, timedelta, datetime +from pytz import HOUR, ZERO, UTC + +__all__ = [ + 'FixedOffset', + 'LocalTimezone', + 'USTimeZone', + 'Eastern', + 'Central', + 'Mountain', + 'Pacific', + 'UTC' +] + + +# A class building tzinfo objects for fixed-offset time zones. +# Note that FixedOffset(0, "UTC") is a different way to build a +# UTC tzinfo object. +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name): + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return ZERO + + +import time as _time + +STDOFFSET = timedelta(seconds=-_time.timezone) +if _time.daylight: + DSTOFFSET = timedelta(seconds=-_time.altzone) +else: + DSTOFFSET = STDOFFSET + +DSTDIFF = DSTOFFSET - STDOFFSET + + +# A class capturing the platform's idea of local time. 
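The LazyList / LazySet classes added in pytz/lazy.py above defer building their contents until the first real access, guarded by _fill_lock. A small sketch of that deferred-fill behaviour; the generate() helper and the zone names are illustrative:

from pytz.lazy import LazyList, LazySet

def generate():
    print('filling now')                  # shows when the iterable is actually consumed
    for name in ('UTC', 'GMT', 'Asia/Shanghai'):
        yield name

zones = LazyList(generate())              # nothing printed yet: the generator is stored, not consumed
print(len(zones))                         # prints 'filling now', then 3 - first access fills the list
print('UTC' in LazySet(['UTC', 'GMT']))   # True; LazySet fills itself the same way on first use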
+class LocalTimezone(tzinfo): + + def utcoffset(self, dt): + if self._isdst(dt): + return DSTOFFSET + else: + return STDOFFSET + + def dst(self, dt): + if self._isdst(dt): + return DSTDIFF + else: + return ZERO + + def tzname(self, dt): + return _time.tzname[self._isdst(dt)] + + def _isdst(self, dt): + tt = (dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + dt.weekday(), 0, -1) + stamp = _time.mktime(tt) + tt = _time.localtime(stamp) + return tt.tm_isdst > 0 + +Local = LocalTimezone() + + +def first_sunday_on_or_after(dt): + days_to_go = 6 - dt.weekday() + if days_to_go: + dt += timedelta(days_to_go) + return dt + + +# In the US, DST starts at 2am (standard time) on the first Sunday in April. +DSTSTART = datetime(1, 4, 1, 2) +# and ends at 2am (DST time; 1am standard time) on the last Sunday of Oct. +# which is the first Sunday on or after Oct 25. +DSTEND = datetime(1, 10, 25, 1) + + +# A complete implementation of current DST rules for major US time zones. +class USTimeZone(tzinfo): + + def __init__(self, hours, reprname, stdname, dstname): + self.stdoffset = timedelta(hours=hours) + self.reprname = reprname + self.stdname = stdname + self.dstname = dstname + + def __repr__(self): + return self.reprname + + def tzname(self, dt): + if self.dst(dt): + return self.dstname + else: + return self.stdname + + def utcoffset(self, dt): + return self.stdoffset + self.dst(dt) + + def dst(self, dt): + if dt is None or dt.tzinfo is None: + # An exception may be sensible here, in one or both cases. + # It depends on how you want to treat them. The default + # fromutc() implementation (called by the default astimezone() + # implementation) passes a datetime with dt.tzinfo is self. + return ZERO + assert dt.tzinfo is self + + # Find first Sunday in April & the last in October. + start = first_sunday_on_or_after(DSTSTART.replace(year=dt.year)) + end = first_sunday_on_or_after(DSTEND.replace(year=dt.year)) + + # Can't compare naive to aware objects, so strip the timezone from + # dt first. + if start <= dt.replace(tzinfo=None) < end: + return HOUR + else: + return ZERO + +Eastern = USTimeZone(-5, "Eastern", "EST", "EDT") +Central = USTimeZone(-6, "Central", "CST", "CDT") +Mountain = USTimeZone(-7, "Mountain", "MST", "MDT") +Pacific = USTimeZone(-8, "Pacific", "PST", "PDT") diff --git a/venv/Lib/site-packages/pytz/tzfile.py b/venv/Lib/site-packages/pytz/tzfile.py new file mode 100644 index 0000000..25117f3 --- /dev/null +++ b/venv/Lib/site-packages/pytz/tzfile.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +''' +$Id: tzfile.py,v 1.8 2004/06/03 00:15:24 zenzen Exp $ +''' + +from datetime import datetime +from struct import unpack, calcsize + +from pytz.tzinfo import StaticTzInfo, DstTzInfo, memorized_ttinfo +from pytz.tzinfo import memorized_datetime, memorized_timedelta + + +def _byte_string(s): + """Cast a string or byte string to an ASCII byte string.""" + return s.encode('ASCII') + +_NULL = _byte_string('\0') + + +def _std_string(s): + """Cast a string or byte string to an ASCII string.""" + return str(s.decode('ASCII')) + + +def build_tzinfo(zone, fp): + head_fmt = '>4s c 15x 6l' + head_size = calcsize(head_fmt) + (magic, format, ttisgmtcnt, ttisstdcnt, leapcnt, timecnt, + typecnt, charcnt) = unpack(head_fmt, fp.read(head_size)) + + # Make sure it is a tzfile(5) file + assert magic == _byte_string('TZif'), 'Got magic %s' % repr(magic) + + # Read out the transition times, localtime indices and ttinfo structures. 
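pytz/reference.py above is explicitly test-only: its USTimeZone objects encode the pre-2007 US DST rules, so they are only correct for roughly 1987-2006. A quick sketch of how those reference objects behave, with illustrative dates:

from datetime import datetime
from pytz.reference import Eastern, FixedOffset

summer = datetime(2002, 7, 4, 12, 0, tzinfo=Eastern)
winter = datetime(2002, 1, 4, 12, 0, tzinfo=Eastern)
print(summer.tzname(), winter.tzname())         # EDT EST - DST applies only between April and October
print(FixedOffset(330, 'IST').utcoffset(None))  # 5:30:00 - constant offset, dst() is always zero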
+ data_fmt = '>%(timecnt)dl %(timecnt)dB %(ttinfo)s %(charcnt)ds' % dict( + timecnt=timecnt, ttinfo='lBB' * typecnt, charcnt=charcnt) + data_size = calcsize(data_fmt) + data = unpack(data_fmt, fp.read(data_size)) + + # make sure we unpacked the right number of values + assert len(data) == 2 * timecnt + 3 * typecnt + 1 + transitions = [memorized_datetime(trans) + for trans in data[:timecnt]] + lindexes = list(data[timecnt:2 * timecnt]) + ttinfo_raw = data[2 * timecnt:-1] + tznames_raw = data[-1] + del data + + # Process ttinfo into separate structs + ttinfo = [] + tznames = {} + i = 0 + while i < len(ttinfo_raw): + # have we looked up this timezone name yet? + tzname_offset = ttinfo_raw[i + 2] + if tzname_offset not in tznames: + nul = tznames_raw.find(_NULL, tzname_offset) + if nul < 0: + nul = len(tznames_raw) + tznames[tzname_offset] = _std_string( + tznames_raw[tzname_offset:nul]) + ttinfo.append((ttinfo_raw[i], + bool(ttinfo_raw[i + 1]), + tznames[tzname_offset])) + i += 3 + + # Now build the timezone object + if len(ttinfo) == 1 or len(transitions) == 0: + ttinfo[0][0], ttinfo[0][2] + cls = type(zone, (StaticTzInfo,), dict( + zone=zone, + _utcoffset=memorized_timedelta(ttinfo[0][0]), + _tzname=ttinfo[0][2])) + else: + # Early dates use the first standard time ttinfo + i = 0 + while ttinfo[i][1]: + i += 1 + if ttinfo[i] == ttinfo[lindexes[0]]: + transitions[0] = datetime.min + else: + transitions.insert(0, datetime.min) + lindexes.insert(0, i) + + # calculate transition info + transition_info = [] + for i in range(len(transitions)): + inf = ttinfo[lindexes[i]] + utcoffset = inf[0] + if not inf[1]: + dst = 0 + else: + for j in range(i - 1, -1, -1): + prev_inf = ttinfo[lindexes[j]] + if not prev_inf[1]: + break + dst = inf[0] - prev_inf[0] # dst offset + + # Bad dst? Look further. DST > 24 hours happens when + # a timzone has moved across the international dateline. + if dst <= 0 or dst > 3600 * 3: + for j in range(i + 1, len(transitions)): + stdinf = ttinfo[lindexes[j]] + if not stdinf[1]: + dst = inf[0] - stdinf[0] + if dst > 0: + break # Found a useful std time. + + tzname = inf[2] + + # Round utcoffset and dst to the nearest minute or the + # datetime library will complain. Conversions to these timezones + # might be up to plus or minus 30 seconds out, but it is + # the best we can do. 
+ utcoffset = int((utcoffset + 30) // 60) * 60 + dst = int((dst + 30) // 60) * 60 + transition_info.append(memorized_ttinfo(utcoffset, dst, tzname)) + + cls = type(zone, (DstTzInfo,), dict( + zone=zone, + _utc_transition_times=transitions, + _transition_info=transition_info)) + + return cls() + +if __name__ == '__main__': + import os.path + from pprint import pprint + base = os.path.join(os.path.dirname(__file__), 'zoneinfo') + tz = build_tzinfo('Australia/Melbourne', + open(os.path.join(base, 'Australia', 'Melbourne'), 'rb')) + tz = build_tzinfo('US/Eastern', + open(os.path.join(base, 'US', 'Eastern'), 'rb')) + pprint(tz._utc_transition_times) diff --git a/venv/Lib/site-packages/pytz/tzinfo.py b/venv/Lib/site-packages/pytz/tzinfo.py new file mode 100644 index 0000000..725978d --- /dev/null +++ b/venv/Lib/site-packages/pytz/tzinfo.py @@ -0,0 +1,577 @@ +'''Base classes and helpers for building zone specific tzinfo classes''' + +from datetime import datetime, timedelta, tzinfo +from bisect import bisect_right +try: + set +except NameError: + from sets import Set as set + +import pytz +from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError + +__all__ = [] + +_timedelta_cache = {} + + +def memorized_timedelta(seconds): + '''Create only one instance of each distinct timedelta''' + try: + return _timedelta_cache[seconds] + except KeyError: + delta = timedelta(seconds=seconds) + _timedelta_cache[seconds] = delta + return delta + +_epoch = datetime.utcfromtimestamp(0) +_datetime_cache = {0: _epoch} + + +def memorized_datetime(seconds): + '''Create only one instance of each distinct datetime''' + try: + return _datetime_cache[seconds] + except KeyError: + # NB. We can't just do datetime.utcfromtimestamp(seconds) as this + # fails with negative values under Windows (Bug #90096) + dt = _epoch + timedelta(seconds=seconds) + _datetime_cache[seconds] = dt + return dt + +_ttinfo_cache = {} + + +def memorized_ttinfo(*args): + '''Create only one instance of each distinct tuple''' + try: + return _ttinfo_cache[args] + except KeyError: + ttinfo = ( + memorized_timedelta(args[0]), + memorized_timedelta(args[1]), + args[2] + ) + _ttinfo_cache[args] = ttinfo + return ttinfo + +_notime = memorized_timedelta(0) + + +def _to_seconds(td): + '''Convert a timedelta to seconds''' + return td.seconds + td.days * 24 * 60 * 60 + + +class BaseTzInfo(tzinfo): + # Overridden in subclass + _utcoffset = None + _tzname = None + zone = None + + def __str__(self): + return self.zone + + +class StaticTzInfo(BaseTzInfo): + '''A timezone that has a constant offset from UTC + + These timezones are rare, as most locations have changed their + offset at some point in their history + ''' + def fromutc(self, dt): + '''See datetime.tzinfo.fromutc''' + if dt.tzinfo is not None and dt.tzinfo is not self: + raise ValueError('fromutc: dt.tzinfo is not self') + return (dt + self._utcoffset).replace(tzinfo=self) + + def utcoffset(self, dt, is_dst=None): + '''See datetime.tzinfo.utcoffset + + is_dst is ignored for StaticTzInfo, and exists only to + retain compatibility with DstTzInfo. + ''' + return self._utcoffset + + def dst(self, dt, is_dst=None): + '''See datetime.tzinfo.dst + + is_dst is ignored for StaticTzInfo, and exists only to + retain compatibility with DstTzInfo. + ''' + return _notime + + def tzname(self, dt, is_dst=None): + '''See datetime.tzinfo.tzname + + is_dst is ignored for StaticTzInfo, and exists only to + retain compatibility with DstTzInfo. 
+ ''' + return self._tzname + + def localize(self, dt, is_dst=False): + '''Convert naive time to local time''' + if dt.tzinfo is not None: + raise ValueError('Not naive datetime (tzinfo is already set)') + return dt.replace(tzinfo=self) + + def normalize(self, dt, is_dst=False): + '''Correct the timezone information on the given datetime. + + This is normally a no-op, as StaticTzInfo timezones never have + ambiguous cases to correct: + + >>> from pytz import timezone + >>> gmt = timezone('GMT') + >>> isinstance(gmt, StaticTzInfo) + True + >>> dt = datetime(2011, 5, 8, 1, 2, 3, tzinfo=gmt) + >>> gmt.normalize(dt) is dt + True + + The supported method of converting between timezones is to use + datetime.astimezone(). Currently normalize() also works: + + >>> la = timezone('America/Los_Angeles') + >>> dt = la.localize(datetime(2011, 5, 7, 1, 2, 3)) + >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)' + >>> gmt.normalize(dt).strftime(fmt) + '2011-05-07 08:02:03 GMT (+0000)' + ''' + if dt.tzinfo is self: + return dt + if dt.tzinfo is None: + raise ValueError('Naive time - no tzinfo set') + return dt.astimezone(self) + + def __repr__(self): + return '' % (self.zone,) + + def __reduce__(self): + # Special pickle to zone remains a singleton and to cope with + # database changes. + return pytz._p, (self.zone,) + + +class DstTzInfo(BaseTzInfo): + '''A timezone that has a variable offset from UTC + + The offset might change if daylight saving time comes into effect, + or at a point in history when the region decides to change their + timezone definition. + ''' + # Overridden in subclass + + # Sorted list of DST transition times, UTC + _utc_transition_times = None + + # [(utcoffset, dstoffset, tzname)] corresponding to + # _utc_transition_times entries + _transition_info = None + + zone = None + + # Set in __init__ + + _tzinfos = None + _dst = None # DST offset + + def __init__(self, _inf=None, _tzinfos=None): + if _inf: + self._tzinfos = _tzinfos + self._utcoffset, self._dst, self._tzname = _inf + else: + _tzinfos = {} + self._tzinfos = _tzinfos + self._utcoffset, self._dst, self._tzname = ( + self._transition_info[0]) + _tzinfos[self._transition_info[0]] = self + for inf in self._transition_info[1:]: + if inf not in _tzinfos: + _tzinfos[inf] = self.__class__(inf, _tzinfos) + + def fromutc(self, dt): + '''See datetime.tzinfo.fromutc''' + if (dt.tzinfo is not None and + getattr(dt.tzinfo, '_tzinfos', None) is not self._tzinfos): + raise ValueError('fromutc: dt.tzinfo is not self') + dt = dt.replace(tzinfo=None) + idx = max(0, bisect_right(self._utc_transition_times, dt) - 1) + inf = self._transition_info[idx] + return (dt + inf[0]).replace(tzinfo=self._tzinfos[inf]) + + def normalize(self, dt): + '''Correct the timezone information on the given datetime + + If date arithmetic crosses DST boundaries, the tzinfo + is not magically adjusted. This method normalizes the + tzinfo to the correct one. + + To test, first we need to do some setup + + >>> from pytz import timezone + >>> utc = timezone('UTC') + >>> eastern = timezone('US/Eastern') + >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)' + + We next create a datetime right on an end-of-DST transition point, + the instant when the wallclocks are wound back one hour. + + >>> utc_dt = datetime(2002, 10, 27, 6, 0, 0, tzinfo=utc) + >>> loc_dt = utc_dt.astimezone(eastern) + >>> loc_dt.strftime(fmt) + '2002-10-27 01:00:00 EST (-0500)' + + Now, if we subtract a few minutes from it, note that the timezone + information has not changed. 
+ + >>> before = loc_dt - timedelta(minutes=10) + >>> before.strftime(fmt) + '2002-10-27 00:50:00 EST (-0500)' + + But we can fix that by calling the normalize method + + >>> before = eastern.normalize(before) + >>> before.strftime(fmt) + '2002-10-27 01:50:00 EDT (-0400)' + + The supported method of converting between timezones is to use + datetime.astimezone(). Currently, normalize() also works: + + >>> th = timezone('Asia/Bangkok') + >>> am = timezone('Europe/Amsterdam') + >>> dt = th.localize(datetime(2011, 5, 7, 1, 2, 3)) + >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)' + >>> am.normalize(dt).strftime(fmt) + '2011-05-06 20:02:03 CEST (+0200)' + ''' + if dt.tzinfo is None: + raise ValueError('Naive time - no tzinfo set') + + # Convert dt in localtime to UTC + offset = dt.tzinfo._utcoffset + dt = dt.replace(tzinfo=None) + dt = dt - offset + # convert it back, and return it + return self.fromutc(dt) + + def localize(self, dt, is_dst=False): + '''Convert naive time to local time. + + This method should be used to construct localtimes, rather + than passing a tzinfo argument to a datetime constructor. + + is_dst is used to determine the correct timezone in the ambigous + period at the end of daylight saving time. + + >>> from pytz import timezone + >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)' + >>> amdam = timezone('Europe/Amsterdam') + >>> dt = datetime(2004, 10, 31, 2, 0, 0) + >>> loc_dt1 = amdam.localize(dt, is_dst=True) + >>> loc_dt2 = amdam.localize(dt, is_dst=False) + >>> loc_dt1.strftime(fmt) + '2004-10-31 02:00:00 CEST (+0200)' + >>> loc_dt2.strftime(fmt) + '2004-10-31 02:00:00 CET (+0100)' + >>> str(loc_dt2 - loc_dt1) + '1:00:00' + + Use is_dst=None to raise an AmbiguousTimeError for ambiguous + times at the end of daylight saving time + + >>> try: + ... loc_dt1 = amdam.localize(dt, is_dst=None) + ... except AmbiguousTimeError: + ... print('Ambiguous') + Ambiguous + + is_dst defaults to False + + >>> amdam.localize(dt) == amdam.localize(dt, False) + True + + is_dst is also used to determine the correct timezone in the + wallclock times jumped over at the start of daylight saving time. + + >>> pacific = timezone('US/Pacific') + >>> dt = datetime(2008, 3, 9, 2, 0, 0) + >>> ploc_dt1 = pacific.localize(dt, is_dst=True) + >>> ploc_dt2 = pacific.localize(dt, is_dst=False) + >>> ploc_dt1.strftime(fmt) + '2008-03-09 02:00:00 PDT (-0700)' + >>> ploc_dt2.strftime(fmt) + '2008-03-09 02:00:00 PST (-0800)' + >>> str(ploc_dt2 - ploc_dt1) + '1:00:00' + + Use is_dst=None to raise a NonExistentTimeError for these skipped + times. + + >>> try: + ... loc_dt1 = pacific.localize(dt, is_dst=None) + ... except NonExistentTimeError: + ... print('Non-existent') + Non-existent + ''' + if dt.tzinfo is not None: + raise ValueError('Not naive datetime (tzinfo is already set)') + + # Find the two best possibilities. + possible_loc_dt = set() + for delta in [timedelta(days=-1), timedelta(days=1)]: + loc_dt = dt + delta + idx = max(0, bisect_right( + self._utc_transition_times, loc_dt) - 1) + inf = self._transition_info[idx] + tzinfo = self._tzinfos[inf] + loc_dt = tzinfo.normalize(dt.replace(tzinfo=tzinfo)) + if loc_dt.replace(tzinfo=None) == dt: + possible_loc_dt.add(loc_dt) + + if len(possible_loc_dt) == 1: + return possible_loc_dt.pop() + + # If there are no possibly correct timezones, we are attempting + # to convert a time that never happened - the time period jumped + # during the start-of-DST transition period. + if len(possible_loc_dt) == 0: + # If we refuse to guess, raise an exception. 
+ if is_dst is None: + raise NonExistentTimeError(dt) + + # If we are forcing the pre-DST side of the DST transition, we + # obtain the correct timezone by winding the clock forward a few + # hours. + elif is_dst: + return self.localize( + dt + timedelta(hours=6), is_dst=True) - timedelta(hours=6) + + # If we are forcing the post-DST side of the DST transition, we + # obtain the correct timezone by winding the clock back. + else: + return self.localize( + dt - timedelta(hours=6), + is_dst=False) + timedelta(hours=6) + + # If we get this far, we have multiple possible timezones - this + # is an ambiguous case occuring during the end-of-DST transition. + + # If told to be strict, raise an exception since we have an + # ambiguous case + if is_dst is None: + raise AmbiguousTimeError(dt) + + # Filter out the possiblilities that don't match the requested + # is_dst + filtered_possible_loc_dt = [ + p for p in possible_loc_dt if bool(p.tzinfo._dst) == is_dst + ] + + # Hopefully we only have one possibility left. Return it. + if len(filtered_possible_loc_dt) == 1: + return filtered_possible_loc_dt[0] + + if len(filtered_possible_loc_dt) == 0: + filtered_possible_loc_dt = list(possible_loc_dt) + + # If we get this far, we have in a wierd timezone transition + # where the clocks have been wound back but is_dst is the same + # in both (eg. Europe/Warsaw 1915 when they switched to CET). + # At this point, we just have to guess unless we allow more + # hints to be passed in (such as the UTC offset or abbreviation), + # but that is just getting silly. + # + # Choose the earliest (by UTC) applicable timezone if is_dst=True + # Choose the latest (by UTC) applicable timezone if is_dst=False + # i.e., behave like end-of-DST transition + dates = {} # utc -> local + for local_dt in filtered_possible_loc_dt: + utc_time = ( + local_dt.replace(tzinfo=None) - local_dt.tzinfo._utcoffset) + assert utc_time not in dates + dates[utc_time] = local_dt + return dates[[min, max][not is_dst](dates)] + + def utcoffset(self, dt, is_dst=None): + '''See datetime.tzinfo.utcoffset + + The is_dst parameter may be used to remove ambiguity during DST + transitions. + + >>> from pytz import timezone + >>> tz = timezone('America/St_Johns') + >>> ambiguous = datetime(2009, 10, 31, 23, 30) + + >>> str(tz.utcoffset(ambiguous, is_dst=False)) + '-1 day, 20:30:00' + + >>> str(tz.utcoffset(ambiguous, is_dst=True)) + '-1 day, 21:30:00' + + >>> try: + ... tz.utcoffset(ambiguous) + ... except AmbiguousTimeError: + ... print('Ambiguous') + Ambiguous + + ''' + if dt is None: + return None + elif dt.tzinfo is not self: + dt = self.localize(dt, is_dst) + return dt.tzinfo._utcoffset + else: + return self._utcoffset + + def dst(self, dt, is_dst=None): + '''See datetime.tzinfo.dst + + The is_dst parameter may be used to remove ambiguity during DST + transitions. + + >>> from pytz import timezone + >>> tz = timezone('America/St_Johns') + + >>> normal = datetime(2009, 9, 1) + + >>> str(tz.dst(normal)) + '1:00:00' + >>> str(tz.dst(normal, is_dst=False)) + '1:00:00' + >>> str(tz.dst(normal, is_dst=True)) + '1:00:00' + + >>> ambiguous = datetime(2009, 10, 31, 23, 30) + + >>> str(tz.dst(ambiguous, is_dst=False)) + '0:00:00' + >>> str(tz.dst(ambiguous, is_dst=True)) + '1:00:00' + >>> try: + ... tz.dst(ambiguous) + ... except AmbiguousTimeError: + ... 
print('Ambiguous') + Ambiguous + + ''' + if dt is None: + return None + elif dt.tzinfo is not self: + dt = self.localize(dt, is_dst) + return dt.tzinfo._dst + else: + return self._dst + + def tzname(self, dt, is_dst=None): + '''See datetime.tzinfo.tzname + + The is_dst parameter may be used to remove ambiguity during DST + transitions. + + >>> from pytz import timezone + >>> tz = timezone('America/St_Johns') + + >>> normal = datetime(2009, 9, 1) + + >>> tz.tzname(normal) + 'NDT' + >>> tz.tzname(normal, is_dst=False) + 'NDT' + >>> tz.tzname(normal, is_dst=True) + 'NDT' + + >>> ambiguous = datetime(2009, 10, 31, 23, 30) + + >>> tz.tzname(ambiguous, is_dst=False) + 'NST' + >>> tz.tzname(ambiguous, is_dst=True) + 'NDT' + >>> try: + ... tz.tzname(ambiguous) + ... except AmbiguousTimeError: + ... print('Ambiguous') + Ambiguous + ''' + if dt is None: + return self.zone + elif dt.tzinfo is not self: + dt = self.localize(dt, is_dst) + return dt.tzinfo._tzname + else: + return self._tzname + + def __repr__(self): + if self._dst: + dst = 'DST' + else: + dst = 'STD' + if self._utcoffset > _notime: + return '' % ( + self.zone, self._tzname, self._utcoffset, dst + ) + else: + return '' % ( + self.zone, self._tzname, self._utcoffset, dst + ) + + def __reduce__(self): + # Special pickle to zone remains a singleton and to cope with + # database changes. + return pytz._p, ( + self.zone, + _to_seconds(self._utcoffset), + _to_seconds(self._dst), + self._tzname + ) + + +def unpickler(zone, utcoffset=None, dstoffset=None, tzname=None): + """Factory function for unpickling pytz tzinfo instances. + + This is shared for both StaticTzInfo and DstTzInfo instances, because + database changes could cause a zones implementation to switch between + these two base classes and we can't break pickles on a pytz version + upgrade. + """ + # Raises a KeyError if zone no longer exists, which should never happen + # and would be a bug. + tz = pytz.timezone(zone) + + # A StaticTzInfo - just return it + if utcoffset is None: + return tz + + # This pickle was created from a DstTzInfo. We need to + # determine which of the list of tzinfo instances for this zone + # to use in order to restore the state of any datetime instances using + # it correctly. + utcoffset = memorized_timedelta(utcoffset) + dstoffset = memorized_timedelta(dstoffset) + try: + return tz._tzinfos[(utcoffset, dstoffset, tzname)] + except KeyError: + # The particular state requested in this timezone no longer exists. + # This indicates a corrupt pickle, or the timezone database has been + # corrected violently enough to make this particular + # (utcoffset,dstoffset) no longer exist in the zone, or the + # abbreviation has been changed. + pass + + # See if we can find an entry differing only by tzname. Abbreviations + # get changed from the initial guess by the database maintainers to + # match reality when this information is discovered. + for localized_tz in tz._tzinfos.values(): + if (localized_tz._utcoffset == utcoffset and + localized_tz._dst == dstoffset): + return localized_tz + + # This (utcoffset, dstoffset) information has been removed from the + # zone. Add it back. This might occur when the database maintainers have + # corrected incorrect information. datetime instances using this + # incorrect information will continue to do so, exactly as they were + # before being pickled. This is purely an overly paranoid safety net - I + # doubt this will ever been needed in real life. 
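Taken together, the DstTzInfo methods above define the usual pytz workflow: attach a zone to a naive datetime with localize(), convert with astimezone(), and call normalize() after datetime arithmetic that may cross a DST boundary. A minimal sketch with illustrative timestamps:

import pytz
from datetime import datetime, timedelta

cst = pytz.timezone('Asia/Shanghai')                  # a DstTzInfo instance built from the zoneinfo data
local = cst.localize(datetime(2020, 1, 31, 17, 18))   # naive wallclock -> aware local time
print(local.astimezone(pytz.utc).strftime('%Y-%m-%d %H:%M %Z'))  # 2020-01-31 09:18 UTC (+08:00 offset)

eastern = pytz.timezone('US/Eastern')
edge = eastern.localize(datetime(2002, 10, 26, 23, 0))        # shortly before the end-of-DST transition
print(eastern.normalize(edge + timedelta(hours=4)).tzname())  # EST - normalize() corrects the tzinfo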
+ inf = (utcoffset, dstoffset, tzname) + tz._tzinfos[inf] = tz.__class__(inf, tz._tzinfos) + return tz._tzinfos[inf] diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Abidjan b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Abidjan new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Abidjan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Accra b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Accra new file mode 100644 index 0000000..697b993 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Accra differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Addis_Ababa b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Addis_Ababa new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Addis_Ababa differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Algiers b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Algiers new file mode 100644 index 0000000..ae04342 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Algiers differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmara b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmara new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmara differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmera b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmera new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Asmera differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bamako b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bamako new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bamako differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bangui b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bangui new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bangui differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Banjul b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Banjul new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Banjul differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bissau b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bissau new file mode 100644 index 0000000..82ea5aa Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bissau differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Blantyre b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Blantyre new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Blantyre differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Brazzaville b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Brazzaville new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Brazzaville differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bujumbura b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bujumbura new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Bujumbura differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Cairo b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Cairo new file mode 100644 index 0000000..d3f8196 Binary 
files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Cairo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Casablanca b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Casablanca new file mode 100644 index 0000000..245f4eb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Casablanca differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Ceuta b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Ceuta new file mode 100644 index 0000000..850c8f0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Ceuta differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Conakry b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Conakry new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Conakry differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dakar b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dakar new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dakar differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dar_es_Salaam b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dar_es_Salaam new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Dar_es_Salaam differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Djibouti b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Djibouti new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Djibouti differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Douala b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Douala new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Douala differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/El_Aaiun b/venv/Lib/site-packages/pytz/zoneinfo/Africa/El_Aaiun new file mode 100644 index 0000000..a91f65f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/El_Aaiun differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Freetown b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Freetown new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Freetown differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Gaborone b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Gaborone new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Gaborone differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Harare b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Harare new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Harare differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Johannesburg b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Johannesburg new file mode 100644 index 0000000..b1c425d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Johannesburg differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Juba b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Juba new file mode 100644 index 0000000..625b1ac Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Juba differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kampala b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kampala new file mode 100644 index 0000000..9a2918f Binary files /dev/null 
and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kampala differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Khartoum b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Khartoum new file mode 100644 index 0000000..8ee8cb9 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Khartoum differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kigali b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kigali new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kigali differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kinshasa b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kinshasa new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Kinshasa differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lagos b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lagos new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lagos differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Libreville b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Libreville new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Libreville differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lome b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lome new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lome differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Luanda b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Luanda new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Luanda differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lubumbashi b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lubumbashi new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lubumbashi differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lusaka b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lusaka new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Lusaka differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Malabo b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Malabo new file mode 100644 index 0000000..0c80137 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Malabo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maputo b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maputo new file mode 100644 index 0000000..52753c0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maputo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maseru b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maseru new file mode 100644 index 0000000..b1c425d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Maseru differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mbabane b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mbabane new file mode 100644 index 0000000..b1c425d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mbabane differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mogadishu b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mogadishu new file mode 100644 index 0000000..9a2918f Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/Africa/Mogadishu differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Asia/Yerevan
b/venv/Lib/site-packages/pytz/zoneinfo/Asia/Yerevan new file mode 100644 index 0000000..250bfe0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Asia/Yerevan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Azores b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Azores new file mode 100644 index 0000000..56593db Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Azores differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Bermuda b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Bermuda new file mode 100644 index 0000000..419c660 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Bermuda differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Canary b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Canary new file mode 100644 index 0000000..f319215 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Canary differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Cape_Verde b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Cape_Verde new file mode 100644 index 0000000..e2a49d2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Cape_Verde differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faeroe b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faeroe new file mode 100644 index 0000000..4dab7ef Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faeroe differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faroe b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faroe new file mode 100644 index 0000000..4dab7ef Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Faroe differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Jan_Mayen b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Jan_Mayen new file mode 100644 index 0000000..15a34c3 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Jan_Mayen differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Madeira b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Madeira new file mode 100644 index 0000000..5213761 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Madeira differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Reykjavik b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Reykjavik new file mode 100644 index 0000000..10e0fc8 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Reykjavik differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/South_Georgia b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/South_Georgia new file mode 100644 index 0000000..4466608 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/South_Georgia differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/St_Helena b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/St_Helena new file mode 100644 index 0000000..28b32ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/St_Helena differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Stanley b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Stanley new file mode 100644 index 0000000..88077f1 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Atlantic/Stanley differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/ACT b/venv/Lib/site-packages/pytz/zoneinfo/Australia/ACT new file mode 100644 index 0000000..7636592 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/ACT differ diff --git 
a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Adelaide b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Adelaide new file mode 100644 index 0000000..0b1252a Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Adelaide differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Brisbane b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Brisbane new file mode 100644 index 0000000..3021bdb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Brisbane differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Broken_Hill b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Broken_Hill new file mode 100644 index 0000000..1ac3fc8 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Broken_Hill differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Canberra b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Canberra new file mode 100644 index 0000000..7636592 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Canberra differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Currie b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Currie new file mode 100644 index 0000000..f65a990 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Currie differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Darwin b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Darwin new file mode 100644 index 0000000..1cf5029 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Darwin differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Eucla b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Eucla new file mode 100644 index 0000000..98ae557 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Eucla differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Hobart b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Hobart new file mode 100644 index 0000000..02b07ca Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Hobart differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/LHI b/venv/Lib/site-packages/pytz/zoneinfo/Australia/LHI new file mode 100644 index 0000000..9e04a80 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/LHI differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lindeman b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lindeman new file mode 100644 index 0000000..eab0fb9 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lindeman differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lord_Howe b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lord_Howe new file mode 100644 index 0000000..9e04a80 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Lord_Howe differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Melbourne b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Melbourne new file mode 100644 index 0000000..ba45733 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Melbourne differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/NSW b/venv/Lib/site-packages/pytz/zoneinfo/Australia/NSW new file mode 100644 index 0000000..7636592 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/NSW differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/North b/venv/Lib/site-packages/pytz/zoneinfo/Australia/North new file mode 100644 index 0000000..1cf5029 Binary files /dev/null 
and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/North differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Perth b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Perth new file mode 100644 index 0000000..a876b9e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Perth differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Queensland b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Queensland new file mode 100644 index 0000000..3021bdb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Queensland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/South b/venv/Lib/site-packages/pytz/zoneinfo/Australia/South new file mode 100644 index 0000000..0b1252a Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/South differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Sydney b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Sydney new file mode 100644 index 0000000..7636592 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Sydney differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Tasmania b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Tasmania new file mode 100644 index 0000000..02b07ca Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Tasmania differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Victoria b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Victoria new file mode 100644 index 0000000..ba45733 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Victoria differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/West b/venv/Lib/site-packages/pytz/zoneinfo/Australia/West new file mode 100644 index 0000000..a876b9e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/West differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Australia/Yancowinna b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Yancowinna new file mode 100644 index 0000000..1ac3fc8 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Australia/Yancowinna differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Brazil/Acre b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/Acre new file mode 100644 index 0000000..a374cb4 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/Acre differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Brazil/DeNoronha b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/DeNoronha new file mode 100644 index 0000000..f140726 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/DeNoronha differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Brazil/East b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/East new file mode 100644 index 0000000..13ff083 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/East differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Brazil/West b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/West new file mode 100644 index 0000000..63d58f8 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Brazil/West differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/CET b/venv/Lib/site-packages/pytz/zoneinfo/CET new file mode 100644 index 0000000..122e934 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/CET differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/CST6CDT b/venv/Lib/site-packages/pytz/zoneinfo/CST6CDT new file mode 100644 index 0000000..ca67929 Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/CST6CDT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Atlantic b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Atlantic new file mode 100644 index 0000000..756099a Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Atlantic differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Central b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Central new file mode 100644 index 0000000..ac40299 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Central differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Eastern b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Eastern new file mode 100644 index 0000000..6752c5b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Eastern differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Mountain b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Mountain new file mode 100644 index 0000000..cd78a6f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Mountain differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Newfoundland b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Newfoundland new file mode 100644 index 0000000..65a5b0c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Newfoundland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Pacific b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Pacific new file mode 100644 index 0000000..bb60cbc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Pacific differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Saskatchewan b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Saskatchewan new file mode 100644 index 0000000..20c9c84 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Saskatchewan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Canada/Yukon b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Yukon new file mode 100644 index 0000000..fb3cd71 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Canada/Yukon differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Chile/Continental b/venv/Lib/site-packages/pytz/zoneinfo/Chile/Continental new file mode 100644 index 0000000..816a042 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Chile/Continental differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Chile/EasterIsland b/venv/Lib/site-packages/pytz/zoneinfo/Chile/EasterIsland new file mode 100644 index 0000000..cae3744 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Chile/EasterIsland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Cuba b/venv/Lib/site-packages/pytz/zoneinfo/Cuba new file mode 100644 index 0000000..b69ac45 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Cuba differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/EET b/venv/Lib/site-packages/pytz/zoneinfo/EET new file mode 100644 index 0000000..cbdb71d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/EET differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/EST b/venv/Lib/site-packages/pytz/zoneinfo/EST new file mode 100644 index 0000000..21ebc00 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/EST differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/EST5EDT b/venv/Lib/site-packages/pytz/zoneinfo/EST5EDT new file mode 100644 index 0000000..9bce500 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/EST5EDT differ diff --git 
a/venv/Lib/site-packages/pytz/zoneinfo/Egypt b/venv/Lib/site-packages/pytz/zoneinfo/Egypt new file mode 100644 index 0000000..d3f8196 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Egypt differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Eire b/venv/Lib/site-packages/pytz/zoneinfo/Eire new file mode 100644 index 0000000..1d99490 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Eire differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+0 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+1 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+1 new file mode 100644 index 0000000..4dab6f9 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+1 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+10 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+10 new file mode 100644 index 0000000..c749290 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+10 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+11 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+11 new file mode 100644 index 0000000..d969982 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+11 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+12 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+12 new file mode 100644 index 0000000..cdeec90 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+12 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+2 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+2 new file mode 100644 index 0000000..fbd2a94 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+2 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+3 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+3 new file mode 100644 index 0000000..ee246ef Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+3 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+4 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+4 new file mode 100644 index 0000000..5a25ff2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+4 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+5 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+5 new file mode 100644 index 0000000..c0b745f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+5 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+6 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+6 new file mode 100644 index 0000000..06e777d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+6 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+7 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+7 new file mode 100644 index 0000000..4e0b53a Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+7 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+8 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+8 new file mode 100644 index 0000000..714b0c5 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+8 differ diff --git 
a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+9 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+9 new file mode 100644 index 0000000..78b9daa Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT+9 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-0 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-1 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-1 new file mode 100644 index 0000000..a838beb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-1 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-10 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-10 new file mode 100644 index 0000000..68ff77d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-10 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-11 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-11 new file mode 100644 index 0000000..66af5a4 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-11 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-12 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-12 new file mode 100644 index 0000000..17ba505 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-12 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-13 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-13 new file mode 100644 index 0000000..5f3706c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-13 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-14 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-14 new file mode 100644 index 0000000..7e9f9c4 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-14 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-2 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-2 new file mode 100644 index 0000000..fcef6d9 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-2 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-3 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-3 new file mode 100644 index 0000000..27973bc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-3 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-4 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-4 new file mode 100644 index 0000000..1efd841 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-4 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-5 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-5 new file mode 100644 index 0000000..1f76184 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-5 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-6 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-6 new file mode 100644 index 0000000..952681e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-6 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-7 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-7 new file mode 100644 index 0000000..cefc912 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-7 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-8 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-8 new file mode 100644 index 0000000..afb093d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-8 differ diff --git 
a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-9 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-9 new file mode 100644 index 0000000..9265fb7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT-9 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT0 b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/GMT0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/Greenwich b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Greenwich new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Greenwich differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/UCT b/venv/Lib/site-packages/pytz/zoneinfo/Etc/UCT new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/UCT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/UTC b/venv/Lib/site-packages/pytz/zoneinfo/Etc/UTC new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/UTC differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/Universal b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Universal new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Universal differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Etc/Zulu b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Zulu new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Etc/Zulu differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Amsterdam b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Amsterdam new file mode 100644 index 0000000..c3ff07b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Amsterdam differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Andorra b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Andorra new file mode 100644 index 0000000..5962550 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Andorra differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Astrakhan b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Astrakhan new file mode 100644 index 0000000..73a4d01 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Astrakhan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Athens b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Athens new file mode 100644 index 0000000..9f3a067 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Athens differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belfast b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belfast new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belfast differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belgrade b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belgrade new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Belgrade differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Berlin b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Berlin new file mode 100644 index 0000000..7f6d958 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Berlin differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bratislava b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bratislava new file mode 
100644 index 0000000..ce8f433 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bratislava differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Brussels b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Brussels new file mode 100644 index 0000000..40d7124 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Brussels differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bucharest b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bucharest new file mode 100644 index 0000000..4303b90 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Bucharest differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Budapest b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Budapest new file mode 100644 index 0000000..6b94a4f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Budapest differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Busingen b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Busingen new file mode 100644 index 0000000..ad6cf59 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Busingen differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Chisinau b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Chisinau new file mode 100644 index 0000000..5ee23fe Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Chisinau differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Copenhagen b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Copenhagen new file mode 100644 index 0000000..776be6e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Copenhagen differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Dublin b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Dublin new file mode 100644 index 0000000..1d99490 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Dublin differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Gibraltar b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Gibraltar new file mode 100644 index 0000000..117aadb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Gibraltar differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Guernsey b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Guernsey new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Guernsey differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Helsinki b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Helsinki new file mode 100644 index 0000000..b4f8f9c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Helsinki differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Isle_of_Man b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Isle_of_Man new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Isle_of_Man differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Istanbul b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Istanbul new file mode 100644 index 0000000..508446b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Istanbul differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Jersey b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Jersey new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Jersey differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kaliningrad 
b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kaliningrad new file mode 100644 index 0000000..cc99bea Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kaliningrad differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kiev b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kiev new file mode 100644 index 0000000..9337c9e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kiev differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kirov b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kirov new file mode 100644 index 0000000..a3b5320 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Kirov differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Lisbon b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Lisbon new file mode 100644 index 0000000..355817b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Lisbon differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ljubljana b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ljubljana new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ljubljana differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/London b/venv/Lib/site-packages/pytz/zoneinfo/Europe/London new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/London differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Luxembourg b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Luxembourg new file mode 100644 index 0000000..c4ca733 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Luxembourg differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Madrid b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Madrid new file mode 100644 index 0000000..16f6420 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Madrid differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Malta b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Malta new file mode 100644 index 0000000..bf2452d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Malta differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Mariehamn b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Mariehamn new file mode 100644 index 0000000..b4f8f9c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Mariehamn differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Minsk b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Minsk new file mode 100644 index 0000000..453306c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Minsk differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Monaco b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Monaco new file mode 100644 index 0000000..686ae88 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Monaco differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Moscow b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Moscow new file mode 100644 index 0000000..ddb3f4e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Moscow differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Nicosia b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Nicosia new file mode 100644 index 0000000..f7f10ab Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Nicosia differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Oslo b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Oslo new 
file mode 100644 index 0000000..15a34c3 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Oslo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Paris b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Paris new file mode 100644 index 0000000..ca85435 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Paris differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Podgorica b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Podgorica new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Podgorica differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Prague b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Prague new file mode 100644 index 0000000..ce8f433 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Prague differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Riga b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Riga new file mode 100644 index 0000000..8db477d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Riga differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Rome b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Rome new file mode 100644 index 0000000..ac4c163 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Rome differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Samara b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Samara new file mode 100644 index 0000000..97d5dd9 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Samara differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/San_Marino b/venv/Lib/site-packages/pytz/zoneinfo/Europe/San_Marino new file mode 100644 index 0000000..ac4c163 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/San_Marino differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sarajevo b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sarajevo new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sarajevo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Saratov b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Saratov new file mode 100644 index 0000000..8fd5f6d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Saratov differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Simferopol b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Simferopol new file mode 100644 index 0000000..432e831 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Simferopol differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Skopje b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Skopje new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Skopje differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sofia b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sofia new file mode 100644 index 0000000..0e4d879 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Sofia differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Stockholm b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Stockholm new file mode 100644 index 0000000..f3e0c7f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Stockholm differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tallinn b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tallinn new file mode 100644 index 0000000..b5acca3 Binary 
files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tallinn differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tirane b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tirane new file mode 100644 index 0000000..0b86017 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tirane differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tiraspol b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tiraspol new file mode 100644 index 0000000..5ee23fe Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Tiraspol differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ulyanovsk b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ulyanovsk new file mode 100644 index 0000000..7b61bdc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Ulyanovsk differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Uzhgorod b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Uzhgorod new file mode 100644 index 0000000..66ae8d6 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Uzhgorod differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vaduz b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vaduz new file mode 100644 index 0000000..ad6cf59 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vaduz differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vatican b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vatican new file mode 100644 index 0000000..ac4c163 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vatican differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vienna b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vienna new file mode 100644 index 0000000..3582bb1 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vienna differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vilnius b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vilnius new file mode 100644 index 0000000..7abd63f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Vilnius differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Volgograd b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Volgograd new file mode 100644 index 0000000..d1cfac0 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Volgograd differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Warsaw b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Warsaw new file mode 100644 index 0000000..e33cf67 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Warsaw differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zagreb b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zagreb new file mode 100644 index 0000000..27de456 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zagreb differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zaporozhye b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zaporozhye new file mode 100644 index 0000000..e42edfc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zaporozhye differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zurich b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zurich new file mode 100644 index 0000000..ad6cf59 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Europe/Zurich differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Factory b/venv/Lib/site-packages/pytz/zoneinfo/Factory new file mode 100644 index 0000000..60aa2a0 Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/Factory differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GB b/venv/Lib/site-packages/pytz/zoneinfo/GB new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GB differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GB-Eire b/venv/Lib/site-packages/pytz/zoneinfo/GB-Eire new file mode 100644 index 0000000..ac02a81 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GB-Eire differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GMT b/venv/Lib/site-packages/pytz/zoneinfo/GMT new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GMT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GMT+0 b/venv/Lib/site-packages/pytz/zoneinfo/GMT+0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GMT+0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GMT-0 b/venv/Lib/site-packages/pytz/zoneinfo/GMT-0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GMT-0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/GMT0 b/venv/Lib/site-packages/pytz/zoneinfo/GMT0 new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/GMT0 differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Greenwich b/venv/Lib/site-packages/pytz/zoneinfo/Greenwich new file mode 100644 index 0000000..c634746 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Greenwich differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/HST b/venv/Lib/site-packages/pytz/zoneinfo/HST new file mode 100644 index 0000000..cccd45e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/HST differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Hongkong b/venv/Lib/site-packages/pytz/zoneinfo/Hongkong new file mode 100644 index 0000000..23d0375 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Hongkong differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Iceland b/venv/Lib/site-packages/pytz/zoneinfo/Iceland new file mode 100644 index 0000000..10e0fc8 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Iceland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Antananarivo b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Antananarivo new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Antananarivo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Chagos b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Chagos new file mode 100644 index 0000000..93d6dda Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Chagos differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Christmas b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Christmas new file mode 100644 index 0000000..d18c381 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Christmas differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Cocos b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Cocos new file mode 100644 index 0000000..f8116e7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Cocos differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Comoro b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Comoro new file mode 100644 index 0000000..9a2918f Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Comoro differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Kerguelen b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Kerguelen new file mode 100644 index 0000000..cde4cf7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Kerguelen differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mahe b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mahe new file mode 100644 index 0000000..cba7dfe Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mahe differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Maldives b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Maldives new file mode 100644 index 0000000..7c839cf Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Maldives differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mauritius b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mauritius new file mode 100644 index 0000000..17f2616 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mauritius differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mayotte b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mayotte new file mode 100644 index 0000000..9a2918f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Mayotte differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Indian/Reunion b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Reunion new file mode 100644 index 0000000..dfe0831 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Indian/Reunion differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Iran b/venv/Lib/site-packages/pytz/zoneinfo/Iran new file mode 100644 index 0000000..8cec5ad Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Iran differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Israel b/venv/Lib/site-packages/pytz/zoneinfo/Israel new file mode 100644 index 0000000..440ef06 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Israel differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Jamaica b/venv/Lib/site-packages/pytz/zoneinfo/Jamaica new file mode 100644 index 0000000..2a9b7fd Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Jamaica differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Japan b/venv/Lib/site-packages/pytz/zoneinfo/Japan new file mode 100644 index 0000000..26f4d34 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Japan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Kwajalein b/venv/Lib/site-packages/pytz/zoneinfo/Kwajalein new file mode 100644 index 0000000..1a7975f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Kwajalein differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Libya b/venv/Lib/site-packages/pytz/zoneinfo/Libya new file mode 100644 index 0000000..07b393b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Libya differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/MET b/venv/Lib/site-packages/pytz/zoneinfo/MET new file mode 100644 index 0000000..4a826bb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/MET differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/MST b/venv/Lib/site-packages/pytz/zoneinfo/MST new file mode 100644 index 0000000..c93a58e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/MST differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/MST7MDT b/venv/Lib/site-packages/pytz/zoneinfo/MST7MDT new file mode 100644 index 0000000..4506a6e Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/MST7MDT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaNorte b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaNorte new file mode 100644 index 0000000..ada6bf7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaNorte differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaSur b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaSur new file mode 100644 index 0000000..e4a7857 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/BajaSur differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Mexico/General b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/General new file mode 100644 index 0000000..e7fb6f2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Mexico/General differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/NZ b/venv/Lib/site-packages/pytz/zoneinfo/NZ new file mode 100644 index 0000000..6575fdc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/NZ differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/NZ-CHAT b/venv/Lib/site-packages/pytz/zoneinfo/NZ-CHAT new file mode 100644 index 0000000..c004109 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/NZ-CHAT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Navajo b/venv/Lib/site-packages/pytz/zoneinfo/Navajo new file mode 100644 index 0000000..5fbe26b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Navajo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/PRC b/venv/Lib/site-packages/pytz/zoneinfo/PRC new file mode 100644 index 0000000..3c0bef2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/PRC differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/PST8PDT b/venv/Lib/site-packages/pytz/zoneinfo/PST8PDT new file mode 100644 index 0000000..99d246b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/PST8PDT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Apia b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Apia new file mode 100644 index 0000000..dab1f3f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Apia differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Auckland b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Auckland new file mode 100644 index 0000000..6575fdc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Auckland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Bougainville b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Bougainville new file mode 100644 index 0000000..2892d26 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Bougainville differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chatham b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chatham new file mode 100644 index 0000000..c004109 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chatham differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chuuk b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chuuk new file mode 100644 index 0000000..07c84b7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Chuuk differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Easter b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Easter new file mode 100644 index 0000000..cae3744 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Easter differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Efate 
b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Efate new file mode 100644 index 0000000..6015017 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Efate differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Enderbury b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Enderbury new file mode 100644 index 0000000..f0b8252 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Enderbury differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fakaofo b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fakaofo new file mode 100644 index 0000000..e40307f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fakaofo differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fiji b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fiji new file mode 100644 index 0000000..d39bf53 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Fiji differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Funafuti b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Funafuti new file mode 100644 index 0000000..ea72863 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Funafuti differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Galapagos b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Galapagos new file mode 100644 index 0000000..31f0921 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Galapagos differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Gambier b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Gambier new file mode 100644 index 0000000..e1fc3da Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Gambier differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guadalcanal b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guadalcanal new file mode 100644 index 0000000..7e9d10a Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guadalcanal differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guam b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guam new file mode 100644 index 0000000..66490d2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Guam differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Honolulu b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Honolulu new file mode 100644 index 0000000..c7cd060 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Honolulu differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Johnston b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Johnston new file mode 100644 index 0000000..c7cd060 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Johnston differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kiritimati b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kiritimati new file mode 100644 index 0000000..7cae0cb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kiritimati differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kosrae b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kosrae new file mode 100644 index 0000000..a584aae Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kosrae differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kwajalein b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kwajalein new file mode 100644 index 0000000..1a7975f Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Kwajalein differ diff --git 
a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Majuro b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Majuro new file mode 100644 index 0000000..9ef8374 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Majuro differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Marquesas b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Marquesas new file mode 100644 index 0000000..74d6792 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Marquesas differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Midway b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Midway new file mode 100644 index 0000000..cb56709 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Midway differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Nauru b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Nauru new file mode 100644 index 0000000..acec042 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Nauru differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Niue b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Niue new file mode 100644 index 0000000..684b010 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Niue differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Norfolk b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Norfolk new file mode 100644 index 0000000..53c1aad Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Norfolk differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Noumea b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Noumea new file mode 100644 index 0000000..931a1a3 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Noumea differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pago_Pago b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pago_Pago new file mode 100644 index 0000000..cb56709 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pago_Pago differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Palau b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Palau new file mode 100644 index 0000000..146b351 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Palau differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pitcairn b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pitcairn new file mode 100644 index 0000000..ef91b06 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pitcairn differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pohnpei b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pohnpei new file mode 100644 index 0000000..c298ddd Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Pohnpei differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Ponape b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Ponape new file mode 100644 index 0000000..c298ddd Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Ponape differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Port_Moresby b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Port_Moresby new file mode 100644 index 0000000..920ad27 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Port_Moresby differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Rarotonga b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Rarotonga new file mode 100644 index 0000000..da6b0fa Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Rarotonga differ diff 
--git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Saipan b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Saipan new file mode 100644 index 0000000..66490d2 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Saipan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Samoa b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Samoa new file mode 100644 index 0000000..cb56709 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Samoa differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tahiti b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tahiti new file mode 100644 index 0000000..442b8eb Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tahiti differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tarawa b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tarawa new file mode 100644 index 0000000..3db6c75 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tarawa differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tongatapu b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tongatapu new file mode 100644 index 0000000..5553c60 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Tongatapu differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Truk b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Truk new file mode 100644 index 0000000..07c84b7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Truk differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wake b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wake new file mode 100644 index 0000000..c9e3106 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wake differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wallis b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wallis new file mode 100644 index 0000000..b35344b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Wallis differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Yap b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Yap new file mode 100644 index 0000000..07c84b7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Pacific/Yap differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Poland b/venv/Lib/site-packages/pytz/zoneinfo/Poland new file mode 100644 index 0000000..e33cf67 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Poland differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Portugal b/venv/Lib/site-packages/pytz/zoneinfo/Portugal new file mode 100644 index 0000000..355817b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Portugal differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/ROC b/venv/Lib/site-packages/pytz/zoneinfo/ROC new file mode 100644 index 0000000..24c4344 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/ROC differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/ROK b/venv/Lib/site-packages/pytz/zoneinfo/ROK new file mode 100644 index 0000000..96199e7 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/ROK differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Singapore b/venv/Lib/site-packages/pytz/zoneinfo/Singapore new file mode 100644 index 0000000..2364b21 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Singapore differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Turkey b/venv/Lib/site-packages/pytz/zoneinfo/Turkey new file mode 100644 index 0000000..508446b Binary files /dev/null and 
b/venv/Lib/site-packages/pytz/zoneinfo/Turkey differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/UCT b/venv/Lib/site-packages/pytz/zoneinfo/UCT new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/UCT differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Alaska b/venv/Lib/site-packages/pytz/zoneinfo/US/Alaska new file mode 100644 index 0000000..9bbb2fd Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Alaska differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Aleutian b/venv/Lib/site-packages/pytz/zoneinfo/US/Aleutian new file mode 100644 index 0000000..4323649 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Aleutian differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Arizona b/venv/Lib/site-packages/pytz/zoneinfo/US/Arizona new file mode 100644 index 0000000..ac6bb0c Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Arizona differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Central b/venv/Lib/site-packages/pytz/zoneinfo/US/Central new file mode 100644 index 0000000..a5b1617 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Central differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/East-Indiana b/venv/Lib/site-packages/pytz/zoneinfo/US/East-Indiana new file mode 100644 index 0000000..09511cc Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/East-Indiana differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Eastern b/venv/Lib/site-packages/pytz/zoneinfo/US/Eastern new file mode 100644 index 0000000..2f75480 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Eastern differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Hawaii b/venv/Lib/site-packages/pytz/zoneinfo/US/Hawaii new file mode 100644 index 0000000..c7cd060 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Hawaii differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Indiana-Starke b/venv/Lib/site-packages/pytz/zoneinfo/US/Indiana-Starke new file mode 100644 index 0000000..fcd408d Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Indiana-Starke differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Michigan b/venv/Lib/site-packages/pytz/zoneinfo/US/Michigan new file mode 100644 index 0000000..e104faa Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Michigan differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Mountain b/venv/Lib/site-packages/pytz/zoneinfo/US/Mountain new file mode 100644 index 0000000..5fbe26b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Mountain differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Pacific b/venv/Lib/site-packages/pytz/zoneinfo/US/Pacific new file mode 100644 index 0000000..9dad4f4 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Pacific differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/US/Samoa b/venv/Lib/site-packages/pytz/zoneinfo/US/Samoa new file mode 100644 index 0000000..cb56709 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/US/Samoa differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/UTC b/venv/Lib/site-packages/pytz/zoneinfo/UTC new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/UTC differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Universal b/venv/Lib/site-packages/pytz/zoneinfo/Universal new file mode 100644 index 0000000..91558be Binary 
files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Universal differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/W-SU b/venv/Lib/site-packages/pytz/zoneinfo/W-SU new file mode 100644 index 0000000..ddb3f4e Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/W-SU differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/WET b/venv/Lib/site-packages/pytz/zoneinfo/WET new file mode 100644 index 0000000..c27390b Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/WET differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/Zulu b/venv/Lib/site-packages/pytz/zoneinfo/Zulu new file mode 100644 index 0000000..91558be Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/Zulu differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/iso3166.tab b/venv/Lib/site-packages/pytz/zoneinfo/iso3166.tab new file mode 100644 index 0000000..a4ff61a --- /dev/null +++ b/venv/Lib/site-packages/pytz/zoneinfo/iso3166.tab @@ -0,0 +1,274 @@ +# ISO 3166 alpha-2 country codes +# +# This file is in the public domain, so clarified as of +# 2009-05-17 by Arthur David Olson. +# +# From Paul Eggert (2015-05-02): +# This file contains a table of two-letter country codes. Columns are +# separated by a single tab. Lines beginning with '#' are comments. +# All text uses UTF-8 encoding. The columns of the table are as follows: +# +# 1. ISO 3166-1 alpha-2 country code, current as of +# ISO 3166-1 N976 (2018-11-06). See: Updates on ISO 3166-1 +# https://isotc.iso.org/livelink/livelink/Open/16944257 +# 2. The usual English name for the coded region, +# chosen so that alphabetic sorting of subsets produces helpful lists. +# This is not the same as the English name in the ISO 3166 tables. +# +# The table is sorted by country code. +# +# This table is intended as an aid for users, to help them select time +# zone data appropriate for their practical needs. It is not intended +# to take or endorse any position on legal or territorial claims. +# +#country- +#code name of country, territory, area, or subdivision +AD Andorra +AE United Arab Emirates +AF Afghanistan +AG Antigua & Barbuda +AI Anguilla +AL Albania +AM Armenia +AO Angola +AQ Antarctica +AR Argentina +AS Samoa (American) +AT Austria +AU Australia +AW Aruba +AX Åland Islands +AZ Azerbaijan +BA Bosnia & Herzegovina +BB Barbados +BD Bangladesh +BE Belgium +BF Burkina Faso +BG Bulgaria +BH Bahrain +BI Burundi +BJ Benin +BL St Barthelemy +BM Bermuda +BN Brunei +BO Bolivia +BQ Caribbean NL +BR Brazil +BS Bahamas +BT Bhutan +BV Bouvet Island +BW Botswana +BY Belarus +BZ Belize +CA Canada +CC Cocos (Keeling) Islands +CD Congo (Dem. Rep.) +CF Central African Rep. +CG Congo (Rep.) 
+CH Switzerland +CI Côte d'Ivoire +CK Cook Islands +CL Chile +CM Cameroon +CN China +CO Colombia +CR Costa Rica +CU Cuba +CV Cape Verde +CW Curaçao +CX Christmas Island +CY Cyprus +CZ Czech Republic +DE Germany +DJ Djibouti +DK Denmark +DM Dominica +DO Dominican Republic +DZ Algeria +EC Ecuador +EE Estonia +EG Egypt +EH Western Sahara +ER Eritrea +ES Spain +ET Ethiopia +FI Finland +FJ Fiji +FK Falkland Islands +FM Micronesia +FO Faroe Islands +FR France +GA Gabon +GB Britain (UK) +GD Grenada +GE Georgia +GF French Guiana +GG Guernsey +GH Ghana +GI Gibraltar +GL Greenland +GM Gambia +GN Guinea +GP Guadeloupe +GQ Equatorial Guinea +GR Greece +GS South Georgia & the South Sandwich Islands +GT Guatemala +GU Guam +GW Guinea-Bissau +GY Guyana +HK Hong Kong +HM Heard Island & McDonald Islands +HN Honduras +HR Croatia +HT Haiti +HU Hungary +ID Indonesia +IE Ireland +IL Israel +IM Isle of Man +IN India +IO British Indian Ocean Territory +IQ Iraq +IR Iran +IS Iceland +IT Italy +JE Jersey +JM Jamaica +JO Jordan +JP Japan +KE Kenya +KG Kyrgyzstan +KH Cambodia +KI Kiribati +KM Comoros +KN St Kitts & Nevis +KP Korea (North) +KR Korea (South) +KW Kuwait +KY Cayman Islands +KZ Kazakhstan +LA Laos +LB Lebanon +LC St Lucia +LI Liechtenstein +LK Sri Lanka +LR Liberia +LS Lesotho +LT Lithuania +LU Luxembourg +LV Latvia +LY Libya +MA Morocco +MC Monaco +MD Moldova +ME Montenegro +MF St Martin (French) +MG Madagascar +MH Marshall Islands +MK North Macedonia +ML Mali +MM Myanmar (Burma) +MN Mongolia +MO Macau +MP Northern Mariana Islands +MQ Martinique +MR Mauritania +MS Montserrat +MT Malta +MU Mauritius +MV Maldives +MW Malawi +MX Mexico +MY Malaysia +MZ Mozambique +NA Namibia +NC New Caledonia +NE Niger +NF Norfolk Island +NG Nigeria +NI Nicaragua +NL Netherlands +NO Norway +NP Nepal +NR Nauru +NU Niue +NZ New Zealand +OM Oman +PA Panama +PE Peru +PF French Polynesia +PG Papua New Guinea +PH Philippines +PK Pakistan +PL Poland +PM St Pierre & Miquelon +PN Pitcairn +PR Puerto Rico +PS Palestine +PT Portugal +PW Palau +PY Paraguay +QA Qatar +RE Réunion +RO Romania +RS Serbia +RU Russia +RW Rwanda +SA Saudi Arabia +SB Solomon Islands +SC Seychelles +SD Sudan +SE Sweden +SG Singapore +SH St Helena +SI Slovenia +SJ Svalbard & Jan Mayen +SK Slovakia +SL Sierra Leone +SM San Marino +SN Senegal +SO Somalia +SR Suriname +SS South Sudan +ST Sao Tome & Principe +SV El Salvador +SX St Maarten (Dutch) +SY Syria +SZ Eswatini (Swaziland) +TC Turks & Caicos Is +TD Chad +TF French Southern & Antarctic Lands +TG Togo +TH Thailand +TJ Tajikistan +TK Tokelau +TL East Timor +TM Turkmenistan +TN Tunisia +TO Tonga +TR Turkey +TT Trinidad & Tobago +TV Tuvalu +TW Taiwan +TZ Tanzania +UA Ukraine +UG Uganda +UM US minor outlying islands +US United States +UY Uruguay +UZ Uzbekistan +VA Vatican City +VC St Vincent +VE Venezuela +VG Virgin Islands (UK) +VI Virgin Islands (US) +VN Vietnam +VU Vanuatu +WF Wallis & Futuna +WS Samoa (western) +YE Yemen +YT Mayotte +ZA South Africa +ZM Zambia +ZW Zimbabwe diff --git a/venv/Lib/site-packages/pytz/zoneinfo/leapseconds b/venv/Lib/site-packages/pytz/zoneinfo/leapseconds new file mode 100644 index 0000000..7808df8 --- /dev/null +++ b/venv/Lib/site-packages/pytz/zoneinfo/leapseconds @@ -0,0 +1,72 @@ +# Allowance for leap seconds added to each time zone file. + +# This file is in the public domain. + +# This file is generated automatically from the data in the public-domain +# NIST format leap-seconds.list file, which can be copied from +# +# or . 
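Note: the iso3166.tab header above describes a simple format — UTF-8 text, '#' comment lines, and two tab-separated columns (ISO 3166-1 alpha-2 code, then an English name). A minimal sketch of reading such a file into a dict; the path and the helper name `load_country_names` are illustrative only, and pytz already exposes the same data as `pytz.country_names`:

```python
# Minimal sketch: parse an iso3166.tab-style file
# (UTF-8, '#' comment lines, two tab-separated columns: alpha-2 code, English name).
def load_country_names(path='venv/Lib/site-packages/pytz/zoneinfo/iso3166.tab'):
    names = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line or line.startswith('#'):
                continue  # skip comments and blank lines
            code, name = line.split('\t', 1)
            names[code] = name
    return names

if __name__ == '__main__':
    print(load_country_names().get('CN'))  # expected: 'China'
```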
+# For more about leap-seconds.list, please see +# The NTP Timescale and Leap Seconds +# . + +# The rules for leap seconds are specified in Annex 1 (Time scales) of: +# Standard-frequency and time-signal emissions. +# International Telecommunication Union - Radiocommunication Sector +# (ITU-R) Recommendation TF.460-6 (02/2002) +# . +# The International Earth Rotation and Reference Systems Service (IERS) +# periodically uses leap seconds to keep UTC to within 0.9 s of UT1 +# (a proxy for Earth's angle in space as measured by astronomers) +# and publishes leap second data in a copyrighted file +# . +# See: Levine J. Coordinated Universal Time and the leap second. +# URSI Radio Sci Bull. 2016;89(4):30-6. doi:10.23919/URSIRSB.2016.7909995 +# . + +# There were no leap seconds before 1972, as no official mechanism +# accounted for the discrepancy between atomic time (TAI) and the earth's +# rotation. The first ("1 Jan 1972") data line in leap-seconds.list +# does not denote a leap second; it denotes the start of the current definition +# of UTC. + +# All leap-seconds are Stationary (S) at the given UTC time. +# The correction (+ or -) is made at the given time, so in the unlikely +# event of a negative leap second, a line would look like this: +# Leap YEAR MON DAY 23:59:59 - S +# Typical lines look like this: +# Leap YEAR MON DAY 23:59:60 + S +Leap 1972 Jun 30 23:59:60 + S +Leap 1972 Dec 31 23:59:60 + S +Leap 1973 Dec 31 23:59:60 + S +Leap 1974 Dec 31 23:59:60 + S +Leap 1975 Dec 31 23:59:60 + S +Leap 1976 Dec 31 23:59:60 + S +Leap 1977 Dec 31 23:59:60 + S +Leap 1978 Dec 31 23:59:60 + S +Leap 1979 Dec 31 23:59:60 + S +Leap 1981 Jun 30 23:59:60 + S +Leap 1982 Jun 30 23:59:60 + S +Leap 1983 Jun 30 23:59:60 + S +Leap 1985 Jun 30 23:59:60 + S +Leap 1987 Dec 31 23:59:60 + S +Leap 1989 Dec 31 23:59:60 + S +Leap 1990 Dec 31 23:59:60 + S +Leap 1992 Jun 30 23:59:60 + S +Leap 1993 Jun 30 23:59:60 + S +Leap 1994 Jun 30 23:59:60 + S +Leap 1995 Dec 31 23:59:60 + S +Leap 1997 Jun 30 23:59:60 + S +Leap 1998 Dec 31 23:59:60 + S +Leap 2005 Dec 31 23:59:60 + S +Leap 2008 Dec 31 23:59:60 + S +Leap 2012 Jun 30 23:59:60 + S +Leap 2015 Jun 30 23:59:60 + S +Leap 2016 Dec 31 23:59:60 + S + +# POSIX timestamps for the data in this file: +#updated 1467936000 (2016-07-08 00:00:00 UTC) +#expires 1593302400 (2020-06-28 00:00:00 UTC) + +# Updated through IERS Bulletin C58 +# File expires on: 28 June 2020 diff --git a/venv/Lib/site-packages/pytz/zoneinfo/posixrules b/venv/Lib/site-packages/pytz/zoneinfo/posixrules new file mode 100644 index 0000000..2f75480 Binary files /dev/null and b/venv/Lib/site-packages/pytz/zoneinfo/posixrules differ diff --git a/venv/Lib/site-packages/pytz/zoneinfo/tzdata.zi b/venv/Lib/site-packages/pytz/zoneinfo/tzdata.zi new file mode 100644 index 0000000..4229d99 --- /dev/null +++ b/venv/Lib/site-packages/pytz/zoneinfo/tzdata.zi @@ -0,0 +1,4405 @@ +# version unknown +# This zic input file is in the public domain. 
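Note: the leapseconds header above spells out the record format — "Leap YEAR MON DAY 23:59:60 + S" for an inserted second, with "-" for the (so far unused) negative case; every other line is a comment. A minimal sketch of extracting those records; the path and the helper name `read_leap_seconds` are illustrative, and the header's own note that this copy expires on 2020-06-28 means a maintained source should be preferred in real code:

```python
# Minimal sketch: extract 'Leap YEAR MON DAY HH:MM:SS +/- S' records
# from a leapseconds-style file; comment and blank lines are skipped.
def read_leap_seconds(path='venv/Lib/site-packages/pytz/zoneinfo/leapseconds'):
    leaps = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.startswith('Leap'):
                continue
            _, year, month, day, hms, sign, kind = line.split()
            leaps.append((int(year), month, int(day), hms, sign, kind))
    return leaps

if __name__ == '__main__':
    entries = read_leap_seconds()
    print(len(entries))  # 27 leap seconds listed in the copy above
    print(entries[-1])   # (2016, 'Dec', 31, '23:59:60', '+', 'S')
```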
+R d 1916 o - Jun 14 23s 1 S +R d 1916 1919 - O Su>=1 23s 0 - +R d 1917 o - Mar 24 23s 1 S +R d 1918 o - Mar 9 23s 1 S +R d 1919 o - Mar 1 23s 1 S +R d 1920 o - F 14 23s 1 S +R d 1920 o - O 23 23s 0 - +R d 1921 o - Mar 14 23s 1 S +R d 1921 o - Jun 21 23s 0 - +R d 1939 o - S 11 23s 1 S +R d 1939 o - N 19 1 0 - +R d 1944 1945 - Ap M>=1 2 1 S +R d 1944 o - O 8 2 0 - +R d 1945 o - S 16 1 0 - +R d 1971 o - Ap 25 23s 1 S +R d 1971 o - S 26 23s 0 - +R d 1977 o - May 6 0 1 S +R d 1977 o - O 21 0 0 - +R d 1978 o - Mar 24 1 1 S +R d 1978 o - S 22 3 0 - +R d 1980 o - Ap 25 0 1 S +R d 1980 o - O 31 2 0 - +Z Africa/Algiers 0:12:12 - LMT 1891 Mar 15 0:1 +0:9:21 - PMT 1911 Mar 11 +0 d WE%sT 1940 F 25 2 +1 d CE%sT 1946 O 7 +0 - WET 1956 Ja 29 +1 - CET 1963 Ap 14 +0 d WE%sT 1977 O 21 +1 d CE%sT 1979 O 26 +0 d WE%sT 1981 May +1 - CET +Z Atlantic/Cape_Verde -1:34:4 - LMT 1912 Ja 1 2u +-2 - -02 1942 S +-2 1 -01 1945 O 15 +-2 - -02 1975 N 25 2 +-1 - -01 +Z Africa/Ndjamena 1:0:12 - LMT 1912 +1 - WAT 1979 O 14 +1 1 WAST 1980 Mar 8 +1 - WAT +Z Africa/Abidjan -0:16:8 - LMT 1912 +0 - GMT +L Africa/Abidjan Africa/Bamako +L Africa/Abidjan Africa/Banjul +L Africa/Abidjan Africa/Conakry +L Africa/Abidjan Africa/Dakar +L Africa/Abidjan Africa/Freetown +L Africa/Abidjan Africa/Lome +L Africa/Abidjan Africa/Nouakchott +L Africa/Abidjan Africa/Ouagadougou +L Africa/Abidjan Atlantic/St_Helena +R K 1940 o - Jul 15 0 1 S +R K 1940 o - O 1 0 0 - +R K 1941 o - Ap 15 0 1 S +R K 1941 o - S 16 0 0 - +R K 1942 1944 - Ap 1 0 1 S +R K 1942 o - O 27 0 0 - +R K 1943 1945 - N 1 0 0 - +R K 1945 o - Ap 16 0 1 S +R K 1957 o - May 10 0 1 S +R K 1957 1958 - O 1 0 0 - +R K 1958 o - May 1 0 1 S +R K 1959 1981 - May 1 1 1 S +R K 1959 1965 - S 30 3 0 - +R K 1966 1994 - O 1 3 0 - +R K 1982 o - Jul 25 1 1 S +R K 1983 o - Jul 12 1 1 S +R K 1984 1988 - May 1 1 1 S +R K 1989 o - May 6 1 1 S +R K 1990 1994 - May 1 1 1 S +R K 1995 2010 - Ap lastF 0s 1 S +R K 1995 2005 - S lastTh 24 0 - +R K 2006 o - S 21 24 0 - +R K 2007 o - S Th>=1 24 0 - +R K 2008 o - Au lastTh 24 0 - +R K 2009 o - Au 20 24 0 - +R K 2010 o - Au 10 24 0 - +R K 2010 o - S 9 24 1 S +R K 2010 o - S lastTh 24 0 - +R K 2014 o - May 15 24 1 S +R K 2014 o - Jun 26 24 0 - +R K 2014 o - Jul 31 24 1 S +R K 2014 o - S lastTh 24 0 - +Z Africa/Cairo 2:5:9 - LMT 1900 O +2 K EE%sT +R GH 1920 1942 - S 1 0 0:20 - +R GH 1920 1942 - D 31 0 0 - +Z Africa/Accra -0:0:52 - LMT 1918 +0 GH GMT/+0020 +Z Africa/Bissau -1:2:20 - LMT 1912 Ja 1 1u +-1 - -01 1975 +0 - GMT +Z Africa/Nairobi 2:27:16 - LMT 1928 Jul +3 - EAT 1930 +2:30 - +0230 1940 +2:45 - +0245 1960 +3 - EAT +L Africa/Nairobi Africa/Addis_Ababa +L Africa/Nairobi Africa/Asmara +L Africa/Nairobi Africa/Dar_es_Salaam +L Africa/Nairobi Africa/Djibouti +L Africa/Nairobi Africa/Kampala +L Africa/Nairobi Africa/Mogadishu +L Africa/Nairobi Indian/Antananarivo +L Africa/Nairobi Indian/Comoro +L Africa/Nairobi Indian/Mayotte +Z Africa/Monrovia -0:43:8 - LMT 1882 +-0:43:8 - MMT 1919 Mar +-0:44:30 - MMT 1972 Ja 7 +0 - GMT +R L 1951 o - O 14 2 1 S +R L 1952 o - Ja 1 0 0 - +R L 1953 o - O 9 2 1 S +R L 1954 o - Ja 1 0 0 - +R L 1955 o - S 30 0 1 S +R L 1956 o - Ja 1 0 0 - +R L 1982 1984 - Ap 1 0 1 S +R L 1982 1985 - O 1 0 0 - +R L 1985 o - Ap 6 0 1 S +R L 1986 o - Ap 4 0 1 S +R L 1986 o - O 3 0 0 - +R L 1987 1989 - Ap 1 0 1 S +R L 1987 1989 - O 1 0 0 - +R L 1997 o - Ap 4 0 1 S +R L 1997 o - O 4 0 0 - +R L 2013 o - Mar lastF 1 1 S +R L 2013 o - O lastF 2 0 - +Z Africa/Tripoli 0:52:44 - LMT 1920 +1 L CE%sT 1959 +2 - EET 1982 +1 L CE%sT 1990 May 4 +2 - EET 1996 S 30 +1 
L CE%sT 1997 O 4 +2 - EET 2012 N 10 2 +1 L CE%sT 2013 O 25 2 +2 - EET +R MU 1982 o - O 10 0 1 - +R MU 1983 o - Mar 21 0 0 - +R MU 2008 o - O lastSu 2 1 - +R MU 2009 o - Mar lastSu 2 0 - +Z Indian/Mauritius 3:50 - LMT 1907 +4 MU +04/+05 +R M 1939 o - S 12 0 1 - +R M 1939 o - N 19 0 0 - +R M 1940 o - F 25 0 1 - +R M 1945 o - N 18 0 0 - +R M 1950 o - Jun 11 0 1 - +R M 1950 o - O 29 0 0 - +R M 1967 o - Jun 3 12 1 - +R M 1967 o - O 1 0 0 - +R M 1974 o - Jun 24 0 1 - +R M 1974 o - S 1 0 0 - +R M 1976 1977 - May 1 0 1 - +R M 1976 o - Au 1 0 0 - +R M 1977 o - S 28 0 0 - +R M 1978 o - Jun 1 0 1 - +R M 1978 o - Au 4 0 0 - +R M 2008 o - Jun 1 0 1 - +R M 2008 o - S 1 0 0 - +R M 2009 o - Jun 1 0 1 - +R M 2009 o - Au 21 0 0 - +R M 2010 o - May 2 0 1 - +R M 2010 o - Au 8 0 0 - +R M 2011 o - Ap 3 0 1 - +R M 2011 o - Jul 31 0 0 - +R M 2012 2013 - Ap lastSu 2 1 - +R M 2012 o - Jul 20 3 0 - +R M 2012 o - Au 20 2 1 - +R M 2012 o - S 30 3 0 - +R M 2013 o - Jul 7 3 0 - +R M 2013 o - Au 10 2 1 - +R M 2013 2018 - O lastSu 3 0 - +R M 2014 2018 - Mar lastSu 2 1 - +R M 2014 o - Jun 28 3 0 - +R M 2014 o - Au 2 2 1 - +R M 2015 o - Jun 14 3 0 - +R M 2015 o - Jul 19 2 1 - +R M 2016 o - Jun 5 3 0 - +R M 2016 o - Jul 10 2 1 - +R M 2017 o - May 21 3 0 - +R M 2017 o - Jul 2 2 1 - +R M 2018 o - May 13 3 0 - +R M 2018 o - Jun 17 2 1 - +R M 2019 o - May 5 3 -1 - +R M 2019 o - Jun 9 2 0 - +R M 2020 o - Ap 19 3 -1 - +R M 2020 o - May 24 2 0 - +R M 2021 o - Ap 11 3 -1 - +R M 2021 o - May 16 2 0 - +R M 2022 o - Mar 27 3 -1 - +R M 2022 o - May 8 2 0 - +R M 2023 o - Mar 19 3 -1 - +R M 2023 o - Ap 23 2 0 - +R M 2024 o - Mar 10 3 -1 - +R M 2024 o - Ap 14 2 0 - +R M 2025 o - F 23 3 -1 - +R M 2025 o - Ap 6 2 0 - +R M 2026 o - F 15 3 -1 - +R M 2026 o - Mar 22 2 0 - +R M 2027 o - F 7 3 -1 - +R M 2027 o - Mar 14 2 0 - +R M 2028 o - Ja 23 3 -1 - +R M 2028 o - F 27 2 0 - +R M 2029 o - Ja 14 3 -1 - +R M 2029 o - F 18 2 0 - +R M 2029 o - D 30 3 -1 - +R M 2030 o - F 10 2 0 - +R M 2030 o - D 22 3 -1 - +R M 2031 o - Ja 26 2 0 - +R M 2031 o - D 14 3 -1 - +R M 2032 o - Ja 18 2 0 - +R M 2032 o - N 28 3 -1 - +R M 2033 o - Ja 9 2 0 - +R M 2033 o - N 20 3 -1 - +R M 2033 o - D 25 2 0 - +R M 2034 o - N 5 3 -1 - +R M 2034 o - D 17 2 0 - +R M 2035 o - O 28 3 -1 - +R M 2035 o - D 2 2 0 - +R M 2036 o - O 19 3 -1 - +R M 2036 o - N 23 2 0 - +R M 2037 o - O 4 3 -1 - +R M 2037 o - N 15 2 0 - +R M 2038 o - S 26 3 -1 - +R M 2038 o - O 31 2 0 - +R M 2039 o - S 18 3 -1 - +R M 2039 o - O 23 2 0 - +R M 2040 o - S 2 3 -1 - +R M 2040 o - O 14 2 0 - +R M 2041 o - Au 25 3 -1 - +R M 2041 o - S 29 2 0 - +R M 2042 o - Au 10 3 -1 - +R M 2042 o - S 21 2 0 - +R M 2043 o - Au 2 3 -1 - +R M 2043 o - S 6 2 0 - +R M 2044 o - Jul 24 3 -1 - +R M 2044 o - Au 28 2 0 - +R M 2045 o - Jul 9 3 -1 - +R M 2045 o - Au 20 2 0 - +R M 2046 o - Jul 1 3 -1 - +R M 2046 o - Au 5 2 0 - +R M 2047 o - Jun 23 3 -1 - +R M 2047 o - Jul 28 2 0 - +R M 2048 o - Jun 7 3 -1 - +R M 2048 o - Jul 19 2 0 - +R M 2049 o - May 30 3 -1 - +R M 2049 o - Jul 4 2 0 - +R M 2050 o - May 15 3 -1 - +R M 2050 o - Jun 26 2 0 - +R M 2051 o - May 7 3 -1 - +R M 2051 o - Jun 11 2 0 - +R M 2052 o - Ap 28 3 -1 - +R M 2052 o - Jun 2 2 0 - +R M 2053 o - Ap 13 3 -1 - +R M 2053 o - May 25 2 0 - +R M 2054 o - Ap 5 3 -1 - +R M 2054 o - May 10 2 0 - +R M 2055 o - Mar 28 3 -1 - +R M 2055 o - May 2 2 0 - +R M 2056 o - Mar 12 3 -1 - +R M 2056 o - Ap 23 2 0 - +R M 2057 o - Mar 4 3 -1 - +R M 2057 o - Ap 8 2 0 - +R M 2058 o - F 17 3 -1 - +R M 2058 o - Mar 31 2 0 - +R M 2059 o - F 9 3 -1 - +R M 2059 o - Mar 16 2 0 - +R M 2060 o - F 1 3 -1 - +R M 
2060 o - Mar 7 2 0 - +R M 2061 o - Ja 16 3 -1 - +R M 2061 o - F 27 2 0 - +R M 2062 o - Ja 8 3 -1 - +R M 2062 o - F 12 2 0 - +R M 2062 o - D 31 3 -1 - +R M 2063 o - F 4 2 0 - +R M 2063 o - D 16 3 -1 - +R M 2064 o - Ja 20 2 0 - +R M 2064 o - D 7 3 -1 - +R M 2065 o - Ja 11 2 0 - +R M 2065 o - N 22 3 -1 - +R M 2066 o - Ja 3 2 0 - +R M 2066 o - N 14 3 -1 - +R M 2066 o - D 19 2 0 - +R M 2067 o - N 6 3 -1 - +R M 2067 o - D 11 2 0 - +R M 2068 o - O 21 3 -1 - +R M 2068 o - D 2 2 0 - +R M 2069 o - O 13 3 -1 - +R M 2069 o - N 17 2 0 - +R M 2070 o - O 5 3 -1 - +R M 2070 o - N 9 2 0 - +R M 2071 o - S 20 3 -1 - +R M 2071 o - O 25 2 0 - +R M 2072 o - S 11 3 -1 - +R M 2072 o - O 16 2 0 - +R M 2073 o - Au 27 3 -1 - +R M 2073 o - O 8 2 0 - +R M 2074 o - Au 19 3 -1 - +R M 2074 o - S 23 2 0 - +R M 2075 o - Au 11 3 -1 - +R M 2075 o - S 15 2 0 - +R M 2076 o - Jul 26 3 -1 - +R M 2076 o - S 6 2 0 - +R M 2077 o - Jul 18 3 -1 - +R M 2077 o - Au 22 2 0 - +R M 2078 o - Jul 10 3 -1 - +R M 2078 o - Au 14 2 0 - +R M 2079 o - Jun 25 3 -1 - +R M 2079 o - Jul 30 2 0 - +R M 2080 o - Jun 16 3 -1 - +R M 2080 o - Jul 21 2 0 - +R M 2081 o - Jun 1 3 -1 - +R M 2081 o - Jul 13 2 0 - +R M 2082 o - May 24 3 -1 - +R M 2082 o - Jun 28 2 0 - +R M 2083 o - May 16 3 -1 - +R M 2083 o - Jun 20 2 0 - +R M 2084 o - Ap 30 3 -1 - +R M 2084 o - Jun 11 2 0 - +R M 2085 o - Ap 22 3 -1 - +R M 2085 o - May 27 2 0 - +R M 2086 o - Ap 14 3 -1 - +R M 2086 o - May 19 2 0 - +R M 2087 o - Mar 30 3 -1 - +R M 2087 o - May 4 2 0 - +Z Africa/Casablanca -0:30:20 - LMT 1913 O 26 +0 M +00/+01 1984 Mar 16 +1 - +01 1986 +0 M +00/+01 2018 O 28 3 +1 M +01/+00 +Z Africa/El_Aaiun -0:52:48 - LMT 1934 +-1 - -01 1976 Ap 14 +0 M +00/+01 2018 O 28 3 +1 M +01/+00 +Z Africa/Maputo 2:10:20 - LMT 1903 Mar +2 - CAT +L Africa/Maputo Africa/Blantyre +L Africa/Maputo Africa/Bujumbura +L Africa/Maputo Africa/Gaborone +L Africa/Maputo Africa/Harare +L Africa/Maputo Africa/Kigali +L Africa/Maputo Africa/Lubumbashi +L Africa/Maputo Africa/Lusaka +R NA 1994 o - Mar 21 0 -1 WAT +R NA 1994 2017 - S Su>=1 2 0 CAT +R NA 1995 2017 - Ap Su>=1 2 -1 WAT +Z Africa/Windhoek 1:8:24 - LMT 1892 F 8 +1:30 - +0130 1903 Mar +2 - SAST 1942 S 20 2 +2 1 SAST 1943 Mar 21 2 +2 - SAST 1990 Mar 21 +2 NA %s +Z Africa/Lagos 0:13:36 - LMT 1919 S +1 - WAT +L Africa/Lagos Africa/Bangui +L Africa/Lagos Africa/Brazzaville +L Africa/Lagos Africa/Douala +L Africa/Lagos Africa/Kinshasa +L Africa/Lagos Africa/Libreville +L Africa/Lagos Africa/Luanda +L Africa/Lagos Africa/Malabo +L Africa/Lagos Africa/Niamey +L Africa/Lagos Africa/Porto-Novo +Z Indian/Reunion 3:41:52 - LMT 1911 Jun +4 - +04 +Z Africa/Sao_Tome 0:26:56 - LMT 1884 +-0:36:45 - LMT 1912 Ja 1 0u +0 - GMT 2018 Ja 1 1 +1 - WAT 2019 Ja 1 2 +0 - GMT +Z Indian/Mahe 3:41:48 - LMT 1906 Jun +4 - +04 +R SA 1942 1943 - S Su>=15 2 1 - +R SA 1943 1944 - Mar Su>=15 2 0 - +Z Africa/Johannesburg 1:52 - LMT 1892 F 8 +1:30 - SAST 1903 Mar +2 SA SAST +L Africa/Johannesburg Africa/Maseru +L Africa/Johannesburg Africa/Mbabane +R SD 1970 o - May 1 0 1 S +R SD 1970 1985 - O 15 0 0 - +R SD 1971 o - Ap 30 0 1 S +R SD 1972 1985 - Ap lastSu 0 1 S +Z Africa/Khartoum 2:10:8 - LMT 1931 +2 SD CA%sT 2000 Ja 15 12 +3 - EAT 2017 N +2 - CAT +Z Africa/Juba 2:6:28 - LMT 1931 +2 SD CA%sT 2000 Ja 15 12 +3 - EAT +R n 1939 o - Ap 15 23s 1 S +R n 1939 o - N 18 23s 0 - +R n 1940 o - F 25 23s 1 S +R n 1941 o - O 6 0 0 - +R n 1942 o - Mar 9 0 1 S +R n 1942 o - N 2 3 0 - +R n 1943 o - Mar 29 2 1 S +R n 1943 o - Ap 17 2 0 - +R n 1943 o - Ap 25 2 1 S +R n 1943 o - O 4 2 0 - +R n 1944 1945 - Ap M>=1 2 1 S 
+R n 1944 o - O 8 0 0 - +R n 1945 o - S 16 0 0 - +R n 1977 o - Ap 30 0s 1 S +R n 1977 o - S 24 0s 0 - +R n 1978 o - May 1 0s 1 S +R n 1978 o - O 1 0s 0 - +R n 1988 o - Jun 1 0s 1 S +R n 1988 1990 - S lastSu 0s 0 - +R n 1989 o - Mar 26 0s 1 S +R n 1990 o - May 1 0s 1 S +R n 2005 o - May 1 0s 1 S +R n 2005 o - S 30 1s 0 - +R n 2006 2008 - Mar lastSu 2s 1 S +R n 2006 2008 - O lastSu 2s 0 - +Z Africa/Tunis 0:40:44 - LMT 1881 May 12 +0:9:21 - PMT 1911 Mar 11 +1 n CE%sT +Z Antarctica/Casey 0 - -00 1969 +8 - +08 2009 O 18 2 +11 - +11 2010 Mar 5 2 +8 - +08 2011 O 28 2 +11 - +11 2012 F 21 17u +8 - +08 2016 O 22 +11 - +11 2018 Mar 11 4 +8 - +08 +Z Antarctica/Davis 0 - -00 1957 Ja 13 +7 - +07 1964 N +0 - -00 1969 F +7 - +07 2009 O 18 2 +5 - +05 2010 Mar 10 20u +7 - +07 2011 O 28 2 +5 - +05 2012 F 21 20u +7 - +07 +Z Antarctica/Mawson 0 - -00 1954 F 13 +6 - +06 2009 O 18 2 +5 - +05 +Z Indian/Kerguelen 0 - -00 1950 +5 - +05 +Z Antarctica/DumontDUrville 0 - -00 1947 +10 - +10 1952 Ja 14 +0 - -00 1956 N +10 - +10 +Z Antarctica/Syowa 0 - -00 1957 Ja 29 +3 - +03 +R Tr 2005 ma - Mar lastSu 1u 2 +02 +R Tr 2004 ma - O lastSu 1u 0 +00 +Z Antarctica/Troll 0 - -00 2005 F 12 +0 Tr %s +Z Antarctica/Vostok 0 - -00 1957 D 16 +6 - +06 +Z Antarctica/Rothera 0 - -00 1976 D +-3 - -03 +Z Asia/Kabul 4:36:48 - LMT 1890 +4 - +04 1945 +4:30 - +0430 +R AM 2011 o - Mar lastSu 2s 1 - +R AM 2011 o - O lastSu 2s 0 - +Z Asia/Yerevan 2:58 - LMT 1924 May 2 +3 - +03 1957 Mar +4 R +04/+05 1991 Mar 31 2s +3 R +03/+04 1995 S 24 2s +4 - +04 1997 +4 R +04/+05 2011 +4 AM +04/+05 +R AZ 1997 2015 - Mar lastSu 4 1 - +R AZ 1997 2015 - O lastSu 5 0 - +Z Asia/Baku 3:19:24 - LMT 1924 May 2 +3 - +03 1957 Mar +4 R +04/+05 1991 Mar 31 2s +3 R +03/+04 1992 S lastSu 2s +4 - +04 1996 +4 E +04/+05 1997 +4 AZ +04/+05 +R BD 2009 o - Jun 19 23 1 - +R BD 2009 o - D 31 24 0 - +Z Asia/Dhaka 6:1:40 - LMT 1890 +5:53:20 - HMT 1941 O +6:30 - +0630 1942 May 15 +5:30 - +0530 1942 S +6:30 - +0630 1951 S 30 +6 - +06 2009 +6 BD +06/+07 +Z Asia/Thimphu 5:58:36 - LMT 1947 Au 15 +5:30 - +0530 1987 O +6 - +06 +Z Indian/Chagos 4:49:40 - LMT 1907 +5 - +05 1996 +6 - +06 +Z Asia/Brunei 7:39:40 - LMT 1926 Mar +7:30 - +0730 1933 +8 - +08 +Z Asia/Yangon 6:24:47 - LMT 1880 +6:24:47 - RMT 1920 +6:30 - +0630 1942 May +9 - +09 1945 May 3 +6:30 - +0630 +R Sh 1940 o - Jun 1 0 1 D +R Sh 1940 o - O 12 24 0 S +R Sh 1941 o - Mar 15 0 1 D +R Sh 1941 o - N 1 24 0 S +R Sh 1942 o - Ja 31 0 1 D +R Sh 1945 o - S 1 24 0 S +R Sh 1946 o - May 15 0 1 D +R Sh 1946 o - S 30 24 0 S +R Sh 1947 o - Ap 15 0 1 D +R Sh 1947 o - O 31 24 0 S +R Sh 1948 1949 - May 1 0 1 D +R Sh 1948 1949 - S 30 24 0 S +R CN 1986 o - May 4 2 1 D +R CN 1986 1991 - S Su>=11 2 0 S +R CN 1987 1991 - Ap Su>=11 2 1 D +Z Asia/Shanghai 8:5:43 - LMT 1901 +8 Sh C%sT 1949 May 28 +8 CN C%sT +Z Asia/Urumqi 5:50:20 - LMT 1928 +6 - +06 +R HK 1946 o - Ap 21 0 1 S +R HK 1946 o - D 1 3:30s 0 - +R HK 1947 o - Ap 13 3:30s 1 S +R HK 1947 o - N 30 3:30s 0 - +R HK 1948 o - May 2 3:30s 1 S +R HK 1948 1952 - O Su>=28 3:30s 0 - +R HK 1949 1953 - Ap Su>=1 3:30 1 S +R HK 1953 1964 - O Su>=31 3:30 0 - +R HK 1954 1964 - Mar Su>=18 3:30 1 S +R HK 1965 1976 - Ap Su>=16 3:30 1 S +R HK 1965 1976 - O Su>=16 3:30 0 - +R HK 1973 o - D 30 3:30 1 S +R HK 1979 o - May 13 3:30 1 S +R HK 1979 o - O 21 3:30 0 - +Z Asia/Hong_Kong 7:36:42 - LMT 1904 O 30 0:36:42 +8 - HKT 1941 Jun 15 3 +8 1 HKST 1941 O 1 4 +8 0:30 HKWT 1941 D 25 +9 - JST 1945 N 18 2 +8 HK HK%sT +R f 1946 o - May 15 0 1 D +R f 1946 o - O 1 0 0 S +R f 1947 o - Ap 15 0 1 D +R f 1947 o - N 1 0 0 S +R f 1948 
1951 - May 1 0 1 D +R f 1948 1951 - O 1 0 0 S +R f 1952 o - Mar 1 0 1 D +R f 1952 1954 - N 1 0 0 S +R f 1953 1959 - Ap 1 0 1 D +R f 1955 1961 - O 1 0 0 S +R f 1960 1961 - Jun 1 0 1 D +R f 1974 1975 - Ap 1 0 1 D +R f 1974 1975 - O 1 0 0 S +R f 1979 o - Jul 1 0 1 D +R f 1979 o - O 1 0 0 S +Z Asia/Taipei 8:6 - LMT 1896 +8 - CST 1937 O +9 - JST 1945 S 21 1 +8 f C%sT +R _ 1942 1943 - Ap 30 23 1 - +R _ 1942 o - N 17 23 0 - +R _ 1943 o - S 30 23 0 S +R _ 1946 o - Ap 30 23s 1 D +R _ 1946 o - S 30 23s 0 S +R _ 1947 o - Ap 19 23s 1 D +R _ 1947 o - N 30 23s 0 S +R _ 1948 o - May 2 23s 1 D +R _ 1948 o - O 31 23s 0 S +R _ 1949 1950 - Ap Sa>=1 23s 1 D +R _ 1949 1950 - O lastSa 23s 0 S +R _ 1951 o - Mar 31 23s 1 D +R _ 1951 o - O 28 23s 0 S +R _ 1952 1953 - Ap Sa>=1 23s 1 D +R _ 1952 o - N 1 23s 0 S +R _ 1953 1954 - O lastSa 23s 0 S +R _ 1954 1956 - Mar Sa>=17 23s 1 D +R _ 1955 o - N 5 23s 0 S +R _ 1956 1964 - N Su>=1 3:30 0 S +R _ 1957 1964 - Mar Su>=18 3:30 1 D +R _ 1965 1973 - Ap Su>=16 3:30 1 D +R _ 1965 1966 - O Su>=16 2:30 0 S +R _ 1967 1976 - O Su>=16 3:30 0 S +R _ 1973 o - D 30 3:30 1 D +R _ 1975 1976 - Ap Su>=16 3:30 1 D +R _ 1979 o - May 13 3:30 1 D +R _ 1979 o - O Su>=16 3:30 0 S +Z Asia/Macau 7:34:10 - LMT 1904 O 30 +8 - CST 1941 D 21 23 +9 _ +09/+10 1945 S 30 24 +8 _ C%sT +R CY 1975 o - Ap 13 0 1 S +R CY 1975 o - O 12 0 0 - +R CY 1976 o - May 15 0 1 S +R CY 1976 o - O 11 0 0 - +R CY 1977 1980 - Ap Su>=1 0 1 S +R CY 1977 o - S 25 0 0 - +R CY 1978 o - O 2 0 0 - +R CY 1979 1997 - S lastSu 0 0 - +R CY 1981 1998 - Mar lastSu 0 1 S +Z Asia/Nicosia 2:13:28 - LMT 1921 N 14 +2 CY EE%sT 1998 S +2 E EE%sT +Z Asia/Famagusta 2:15:48 - LMT 1921 N 14 +2 CY EE%sT 1998 S +2 E EE%sT 2016 S 8 +3 - +03 2017 O 29 1u +2 E EE%sT +L Asia/Nicosia Europe/Nicosia +Z Asia/Tbilisi 2:59:11 - LMT 1880 +2:59:11 - TBMT 1924 May 2 +3 - +03 1957 Mar +4 R +04/+05 1991 Mar 31 2s +3 R +03/+04 1992 +3 e +03/+04 1994 S lastSu +4 e +04/+05 1996 O lastSu +4 1 +05 1997 Mar lastSu +4 e +04/+05 2004 Jun 27 +3 R +03/+04 2005 Mar lastSu 2 +4 - +04 +Z Asia/Dili 8:22:20 - LMT 1912 +8 - +08 1942 F 21 23 +9 - +09 1976 May 3 +8 - +08 2000 S 17 +9 - +09 +Z Asia/Kolkata 5:53:28 - LMT 1854 Jun 28 +5:53:20 - HMT 1870 +5:21:10 - MMT 1906 +5:30 - IST 1941 O +5:30 1 +0630 1942 May 15 +5:30 - IST 1942 S +5:30 1 +0630 1945 O 15 +5:30 - IST +Z Asia/Jakarta 7:7:12 - LMT 1867 Au 10 +7:7:12 - BMT 1923 D 31 23:47:12 +7:20 - +0720 1932 N +7:30 - +0730 1942 Mar 23 +9 - +09 1945 S 23 +7:30 - +0730 1948 May +8 - +08 1950 May +7:30 - +0730 1964 +7 - WIB +Z Asia/Pontianak 7:17:20 - LMT 1908 May +7:17:20 - PMT 1932 N +7:30 - +0730 1942 Ja 29 +9 - +09 1945 S 23 +7:30 - +0730 1948 May +8 - +08 1950 May +7:30 - +0730 1964 +8 - WITA 1988 +7 - WIB +Z Asia/Makassar 7:57:36 - LMT 1920 +7:57:36 - MMT 1932 N +8 - +08 1942 F 9 +9 - +09 1945 S 23 +8 - WITA +Z Asia/Jayapura 9:22:48 - LMT 1932 N +9 - +09 1944 S +9:30 - +0930 1964 +9 - WIT +R i 1978 1980 - Mar 20 24 1 - +R i 1978 o - O 20 24 0 - +R i 1979 o - S 18 24 0 - +R i 1980 o - S 22 24 0 - +R i 1991 o - May 2 24 1 - +R i 1992 1995 - Mar 21 24 1 - +R i 1991 1995 - S 21 24 0 - +R i 1996 o - Mar 20 24 1 - +R i 1996 o - S 20 24 0 - +R i 1997 1999 - Mar 21 24 1 - +R i 1997 1999 - S 21 24 0 - +R i 2000 o - Mar 20 24 1 - +R i 2000 o - S 20 24 0 - +R i 2001 2003 - Mar 21 24 1 - +R i 2001 2003 - S 21 24 0 - +R i 2004 o - Mar 20 24 1 - +R i 2004 o - S 20 24 0 - +R i 2005 o - Mar 21 24 1 - +R i 2005 o - S 21 24 0 - +R i 2008 o - Mar 20 24 1 - +R i 2008 o - S 20 24 0 - +R i 2009 2011 - Mar 21 24 1 - +R i 2009 2011 - S 21 24 0 - +R 
i 2012 o - Mar 20 24 1 - +R i 2012 o - S 20 24 0 - +R i 2013 2015 - Mar 21 24 1 - +R i 2013 2015 - S 21 24 0 - +R i 2016 o - Mar 20 24 1 - +R i 2016 o - S 20 24 0 - +R i 2017 2019 - Mar 21 24 1 - +R i 2017 2019 - S 21 24 0 - +R i 2020 o - Mar 20 24 1 - +R i 2020 o - S 20 24 0 - +R i 2021 2023 - Mar 21 24 1 - +R i 2021 2023 - S 21 24 0 - +R i 2024 o - Mar 20 24 1 - +R i 2024 o - S 20 24 0 - +R i 2025 2027 - Mar 21 24 1 - +R i 2025 2027 - S 21 24 0 - +R i 2028 2029 - Mar 20 24 1 - +R i 2028 2029 - S 20 24 0 - +R i 2030 2031 - Mar 21 24 1 - +R i 2030 2031 - S 21 24 0 - +R i 2032 2033 - Mar 20 24 1 - +R i 2032 2033 - S 20 24 0 - +R i 2034 2035 - Mar 21 24 1 - +R i 2034 2035 - S 21 24 0 - +R i 2036 2037 - Mar 20 24 1 - +R i 2036 2037 - S 20 24 0 - +R i 2038 2039 - Mar 21 24 1 - +R i 2038 2039 - S 21 24 0 - +R i 2040 2041 - Mar 20 24 1 - +R i 2040 2041 - S 20 24 0 - +R i 2042 2043 - Mar 21 24 1 - +R i 2042 2043 - S 21 24 0 - +R i 2044 2045 - Mar 20 24 1 - +R i 2044 2045 - S 20 24 0 - +R i 2046 2047 - Mar 21 24 1 - +R i 2046 2047 - S 21 24 0 - +R i 2048 2049 - Mar 20 24 1 - +R i 2048 2049 - S 20 24 0 - +R i 2050 2051 - Mar 21 24 1 - +R i 2050 2051 - S 21 24 0 - +R i 2052 2053 - Mar 20 24 1 - +R i 2052 2053 - S 20 24 0 - +R i 2054 2055 - Mar 21 24 1 - +R i 2054 2055 - S 21 24 0 - +R i 2056 2057 - Mar 20 24 1 - +R i 2056 2057 - S 20 24 0 - +R i 2058 2059 - Mar 21 24 1 - +R i 2058 2059 - S 21 24 0 - +R i 2060 2062 - Mar 20 24 1 - +R i 2060 2062 - S 20 24 0 - +R i 2063 o - Mar 21 24 1 - +R i 2063 o - S 21 24 0 - +R i 2064 2066 - Mar 20 24 1 - +R i 2064 2066 - S 20 24 0 - +R i 2067 o - Mar 21 24 1 - +R i 2067 o - S 21 24 0 - +R i 2068 2070 - Mar 20 24 1 - +R i 2068 2070 - S 20 24 0 - +R i 2071 o - Mar 21 24 1 - +R i 2071 o - S 21 24 0 - +R i 2072 2074 - Mar 20 24 1 - +R i 2072 2074 - S 20 24 0 - +R i 2075 o - Mar 21 24 1 - +R i 2075 o - S 21 24 0 - +R i 2076 2078 - Mar 20 24 1 - +R i 2076 2078 - S 20 24 0 - +R i 2079 o - Mar 21 24 1 - +R i 2079 o - S 21 24 0 - +R i 2080 2082 - Mar 20 24 1 - +R i 2080 2082 - S 20 24 0 - +R i 2083 o - Mar 21 24 1 - +R i 2083 o - S 21 24 0 - +R i 2084 2086 - Mar 20 24 1 - +R i 2084 2086 - S 20 24 0 - +R i 2087 o - Mar 21 24 1 - +R i 2087 o - S 21 24 0 - +R i 2088 ma - Mar 20 24 1 - +R i 2088 ma - S 20 24 0 - +Z Asia/Tehran 3:25:44 - LMT 1916 +3:25:44 - TMT 1946 +3:30 - +0330 1977 N +4 i +04/+05 1979 +3:30 i +0330/+0430 +R IQ 1982 o - May 1 0 1 - +R IQ 1982 1984 - O 1 0 0 - +R IQ 1983 o - Mar 31 0 1 - +R IQ 1984 1985 - Ap 1 0 1 - +R IQ 1985 1990 - S lastSu 1s 0 - +R IQ 1986 1990 - Mar lastSu 1s 1 - +R IQ 1991 2007 - Ap 1 3s 1 - +R IQ 1991 2007 - O 1 3s 0 - +Z Asia/Baghdad 2:57:40 - LMT 1890 +2:57:36 - BMT 1918 +3 - +03 1982 May +3 IQ +03/+04 +R Z 1940 o - Jun 1 0 1 D +R Z 1942 1944 - N 1 0 0 S +R Z 1943 o - Ap 1 2 1 D +R Z 1944 o - Ap 1 0 1 D +R Z 1945 o - Ap 16 0 1 D +R Z 1945 o - N 1 2 0 S +R Z 1946 o - Ap 16 2 1 D +R Z 1946 o - N 1 0 0 S +R Z 1948 o - May 23 0 2 DD +R Z 1948 o - S 1 0 1 D +R Z 1948 1949 - N 1 2 0 S +R Z 1949 o - May 1 0 1 D +R Z 1950 o - Ap 16 0 1 D +R Z 1950 o - S 15 3 0 S +R Z 1951 o - Ap 1 0 1 D +R Z 1951 o - N 11 3 0 S +R Z 1952 o - Ap 20 2 1 D +R Z 1952 o - O 19 3 0 S +R Z 1953 o - Ap 12 2 1 D +R Z 1953 o - S 13 3 0 S +R Z 1954 o - Jun 13 0 1 D +R Z 1954 o - S 12 0 0 S +R Z 1955 o - Jun 11 2 1 D +R Z 1955 o - S 11 0 0 S +R Z 1956 o - Jun 3 0 1 D +R Z 1956 o - S 30 3 0 S +R Z 1957 o - Ap 29 2 1 D +R Z 1957 o - S 22 0 0 S +R Z 1974 o - Jul 7 0 1 D +R Z 1974 o - O 13 0 0 S +R Z 1975 o - Ap 20 0 1 D +R Z 1975 o - Au 31 0 0 S +R Z 1980 o - Au 2 0 1 D 
+R Z 1980 o - S 13 1 0 S +R Z 1984 o - May 5 0 1 D +R Z 1984 o - Au 25 1 0 S +R Z 1985 o - Ap 14 0 1 D +R Z 1985 o - S 15 0 0 S +R Z 1986 o - May 18 0 1 D +R Z 1986 o - S 7 0 0 S +R Z 1987 o - Ap 15 0 1 D +R Z 1987 o - S 13 0 0 S +R Z 1988 o - Ap 10 0 1 D +R Z 1988 o - S 4 0 0 S +R Z 1989 o - Ap 30 0 1 D +R Z 1989 o - S 3 0 0 S +R Z 1990 o - Mar 25 0 1 D +R Z 1990 o - Au 26 0 0 S +R Z 1991 o - Mar 24 0 1 D +R Z 1991 o - S 1 0 0 S +R Z 1992 o - Mar 29 0 1 D +R Z 1992 o - S 6 0 0 S +R Z 1993 o - Ap 2 0 1 D +R Z 1993 o - S 5 0 0 S +R Z 1994 o - Ap 1 0 1 D +R Z 1994 o - Au 28 0 0 S +R Z 1995 o - Mar 31 0 1 D +R Z 1995 o - S 3 0 0 S +R Z 1996 o - Mar 15 0 1 D +R Z 1996 o - S 16 0 0 S +R Z 1997 o - Mar 21 0 1 D +R Z 1997 o - S 14 0 0 S +R Z 1998 o - Mar 20 0 1 D +R Z 1998 o - S 6 0 0 S +R Z 1999 o - Ap 2 2 1 D +R Z 1999 o - S 3 2 0 S +R Z 2000 o - Ap 14 2 1 D +R Z 2000 o - O 6 1 0 S +R Z 2001 o - Ap 9 1 1 D +R Z 2001 o - S 24 1 0 S +R Z 2002 o - Mar 29 1 1 D +R Z 2002 o - O 7 1 0 S +R Z 2003 o - Mar 28 1 1 D +R Z 2003 o - O 3 1 0 S +R Z 2004 o - Ap 7 1 1 D +R Z 2004 o - S 22 1 0 S +R Z 2005 2012 - Ap F<=1 2 1 D +R Z 2005 o - O 9 2 0 S +R Z 2006 o - O 1 2 0 S +R Z 2007 o - S 16 2 0 S +R Z 2008 o - O 5 2 0 S +R Z 2009 o - S 27 2 0 S +R Z 2010 o - S 12 2 0 S +R Z 2011 o - O 2 2 0 S +R Z 2012 o - S 23 2 0 S +R Z 2013 ma - Mar F>=23 2 1 D +R Z 2013 ma - O lastSu 2 0 S +Z Asia/Jerusalem 2:20:54 - LMT 1880 +2:20:40 - JMT 1918 +2 Z I%sT +R JP 1948 o - May Sa>=1 24 1 D +R JP 1948 1951 - S Sa>=8 25 0 S +R JP 1949 o - Ap Sa>=1 24 1 D +R JP 1950 1951 - May Sa>=1 24 1 D +Z Asia/Tokyo 9:18:59 - LMT 1887 D 31 15u +9 JP J%sT +R J 1973 o - Jun 6 0 1 S +R J 1973 1975 - O 1 0 0 - +R J 1974 1977 - May 1 0 1 S +R J 1976 o - N 1 0 0 - +R J 1977 o - O 1 0 0 - +R J 1978 o - Ap 30 0 1 S +R J 1978 o - S 30 0 0 - +R J 1985 o - Ap 1 0 1 S +R J 1985 o - O 1 0 0 - +R J 1986 1988 - Ap F>=1 0 1 S +R J 1986 1990 - O F>=1 0 0 - +R J 1989 o - May 8 0 1 S +R J 1990 o - Ap 27 0 1 S +R J 1991 o - Ap 17 0 1 S +R J 1991 o - S 27 0 0 - +R J 1992 o - Ap 10 0 1 S +R J 1992 1993 - O F>=1 0 0 - +R J 1993 1998 - Ap F>=1 0 1 S +R J 1994 o - S F>=15 0 0 - +R J 1995 1998 - S F>=15 0s 0 - +R J 1999 o - Jul 1 0s 1 S +R J 1999 2002 - S lastF 0s 0 - +R J 2000 2001 - Mar lastTh 0s 1 S +R J 2002 2012 - Mar lastTh 24 1 S +R J 2003 o - O 24 0s 0 - +R J 2004 o - O 15 0s 0 - +R J 2005 o - S lastF 0s 0 - +R J 2006 2011 - O lastF 0s 0 - +R J 2013 o - D 20 0 0 - +R J 2014 ma - Mar lastTh 24 1 S +R J 2014 ma - O lastF 0s 0 - +Z Asia/Amman 2:23:44 - LMT 1931 +2 J EE%sT +Z Asia/Almaty 5:7:48 - LMT 1924 May 2 +5 - +05 1930 Jun 21 +6 R +06/+07 1991 Mar 31 2s +5 R +05/+06 1992 Ja 19 2s +6 R +06/+07 2004 O 31 2s +6 - +06 +Z Asia/Qyzylorda 4:21:52 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 - +05 1981 Ap +5 1 +06 1981 O +6 - +06 1982 Ap +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1991 S 29 2s +5 R +05/+06 1992 Ja 19 2s +6 R +06/+07 1992 Mar 29 2s +5 R +05/+06 2004 O 31 2s +6 - +06 2018 D 21 +5 - +05 +Z Asia/Qostanay 4:14:28 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 - +05 1981 Ap +5 1 +06 1981 O +6 - +06 1982 Ap +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 2004 O 31 2s +6 - +06 +Z Asia/Aqtobe 3:48:40 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 - +05 1981 Ap +5 1 +06 1981 O +6 - +06 1982 Ap +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 2004 O 31 2s +5 - +05 +Z Asia/Aqtau 3:21:4 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 - +05 1981 O +6 - +06 1982 Ap +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 1994 S 25 2s +4 R 
+04/+05 2004 O 31 2s +5 - +05 +Z Asia/Atyrau 3:27:44 - LMT 1924 May 2 +3 - +03 1930 Jun 21 +5 - +05 1981 O +6 - +06 1982 Ap +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 1999 Mar 28 2s +4 R +04/+05 2004 O 31 2s +5 - +05 +Z Asia/Oral 3:25:24 - LMT 1924 May 2 +3 - +03 1930 Jun 21 +5 - +05 1981 Ap +5 1 +06 1981 O +6 - +06 1982 Ap +5 R +05/+06 1989 Mar 26 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 1992 Mar 29 2s +4 R +04/+05 2004 O 31 2s +5 - +05 +R KG 1992 1996 - Ap Su>=7 0s 1 - +R KG 1992 1996 - S lastSu 0 0 - +R KG 1997 2005 - Mar lastSu 2:30 1 - +R KG 1997 2004 - O lastSu 2:30 0 - +Z Asia/Bishkek 4:58:24 - LMT 1924 May 2 +5 - +05 1930 Jun 21 +6 R +06/+07 1991 Mar 31 2s +5 R +05/+06 1991 Au 31 2 +5 KG +05/+06 2005 Au 12 +6 - +06 +R KR 1948 o - Jun 1 0 1 D +R KR 1948 o - S 12 24 0 S +R KR 1949 o - Ap 3 0 1 D +R KR 1949 1951 - S Sa>=7 24 0 S +R KR 1950 o - Ap 1 0 1 D +R KR 1951 o - May 6 0 1 D +R KR 1955 o - May 5 0 1 D +R KR 1955 o - S 8 24 0 S +R KR 1956 o - May 20 0 1 D +R KR 1956 o - S 29 24 0 S +R KR 1957 1960 - May Su>=1 0 1 D +R KR 1957 1960 - S Sa>=17 24 0 S +R KR 1987 1988 - May Su>=8 2 1 D +R KR 1987 1988 - O Su>=8 3 0 S +Z Asia/Seoul 8:27:52 - LMT 1908 Ap +8:30 - KST 1912 +9 - JST 1945 S 8 +9 KR K%sT 1954 Mar 21 +8:30 KR K%sT 1961 Au 10 +9 KR K%sT +Z Asia/Pyongyang 8:23 - LMT 1908 Ap +8:30 - KST 1912 +9 - JST 1945 Au 24 +9 - KST 2015 Au 15 +8:30 - KST 2018 May 4 23:30 +9 - KST +R l 1920 o - Mar 28 0 1 S +R l 1920 o - O 25 0 0 - +R l 1921 o - Ap 3 0 1 S +R l 1921 o - O 3 0 0 - +R l 1922 o - Mar 26 0 1 S +R l 1922 o - O 8 0 0 - +R l 1923 o - Ap 22 0 1 S +R l 1923 o - S 16 0 0 - +R l 1957 1961 - May 1 0 1 S +R l 1957 1961 - O 1 0 0 - +R l 1972 o - Jun 22 0 1 S +R l 1972 1977 - O 1 0 0 - +R l 1973 1977 - May 1 0 1 S +R l 1978 o - Ap 30 0 1 S +R l 1978 o - S 30 0 0 - +R l 1984 1987 - May 1 0 1 S +R l 1984 1991 - O 16 0 0 - +R l 1988 o - Jun 1 0 1 S +R l 1989 o - May 10 0 1 S +R l 1990 1992 - May 1 0 1 S +R l 1992 o - O 4 0 0 - +R l 1993 ma - Mar lastSu 0 1 S +R l 1993 1998 - S lastSu 0 0 - +R l 1999 ma - O lastSu 0 0 - +Z Asia/Beirut 2:22 - LMT 1880 +2 l EE%sT +R NB 1935 1941 - S 14 0 0:20 - +R NB 1935 1941 - D 14 0 0 - +Z Asia/Kuala_Lumpur 6:46:46 - LMT 1901 +6:55:25 - SMT 1905 Jun +7 - +07 1933 +7 0:20 +0720 1936 +7:20 - +0720 1941 S +7:30 - +0730 1942 F 16 +9 - +09 1945 S 12 +7:30 - +0730 1982 +8 - +08 +Z Asia/Kuching 7:21:20 - LMT 1926 Mar +7:30 - +0730 1933 +8 NB +08/+0820 1942 F 16 +9 - +09 1945 S 12 +8 - +08 +Z Indian/Maldives 4:54 - LMT 1880 +4:54 - MMT 1960 +5 - +05 +R X 1983 1984 - Ap 1 0 1 - +R X 1983 o - O 1 0 0 - +R X 1985 1998 - Mar lastSu 0 1 - +R X 1984 1998 - S lastSu 0 0 - +R X 2001 o - Ap lastSa 2 1 - +R X 2001 2006 - S lastSa 2 0 - +R X 2002 2006 - Mar lastSa 2 1 - +R X 2015 2016 - Mar lastSa 2 1 - +R X 2015 2016 - S lastSa 0 0 - +Z Asia/Hovd 6:6:36 - LMT 1905 Au +6 - +06 1978 +7 X +07/+08 +Z Asia/Ulaanbaatar 7:7:32 - LMT 1905 Au +7 - +07 1978 +8 X +08/+09 +Z Asia/Choibalsan 7:38 - LMT 1905 Au +7 - +07 1978 +8 - +08 1983 Ap +9 X +09/+10 2008 Mar 31 +8 X +08/+09 +Z Asia/Kathmandu 5:41:16 - LMT 1920 +5:30 - +0530 1986 +5:45 - +0545 +R PK 2002 o - Ap Su>=2 0 1 S +R PK 2002 o - O Su>=2 0 0 - +R PK 2008 o - Jun 1 0 1 S +R PK 2008 2009 - N 1 0 0 - +R PK 2009 o - Ap 15 0 1 S +Z Asia/Karachi 4:28:12 - LMT 1907 +5:30 - +0530 1942 S +5:30 1 +0630 1945 O 15 +5:30 - +0530 1951 S 30 +5 - +05 1971 Mar 26 +5 PK PK%sT +R P 1999 2005 - Ap F>=15 0 1 S +R P 1999 2003 - O F>=15 0 0 - +R P 2004 o - O 1 1 0 - +R P 2005 o - O 4 2 0 - +R P 2006 2007 - Ap 1 0 1 S +R P 
2006 o - S 22 0 0 - +R P 2007 o - S Th>=8 2 0 - +R P 2008 2009 - Mar lastF 0 1 S +R P 2008 o - S 1 0 0 - +R P 2009 o - S F>=1 1 0 - +R P 2010 o - Mar 26 0 1 S +R P 2010 o - Au 11 0 0 - +R P 2011 o - Ap 1 0:1 1 S +R P 2011 o - Au 1 0 0 - +R P 2011 o - Au 30 0 1 S +R P 2011 o - S 30 0 0 - +R P 2012 2014 - Mar lastTh 24 1 S +R P 2012 o - S 21 1 0 - +R P 2013 o - S F>=21 0 0 - +R P 2014 2015 - O F>=21 0 0 - +R P 2015 o - Mar lastF 24 1 S +R P 2016 2018 - Mar Sa>=24 1 1 S +R P 2016 ma - O lastSa 1 0 - +R P 2019 ma - Mar lastF 0 1 S +Z Asia/Gaza 2:17:52 - LMT 1900 O +2 Z EET/EEST 1948 May 15 +2 K EE%sT 1967 Jun 5 +2 Z I%sT 1996 +2 J EE%sT 1999 +2 P EE%sT 2008 Au 29 +2 - EET 2008 S +2 P EE%sT 2010 +2 - EET 2010 Mar 27 0:1 +2 P EE%sT 2011 Au +2 - EET 2012 +2 P EE%sT +Z Asia/Hebron 2:20:23 - LMT 1900 O +2 Z EET/EEST 1948 May 15 +2 K EE%sT 1967 Jun 5 +2 Z I%sT 1996 +2 J EE%sT 1999 +2 P EE%sT +R PH 1936 o - N 1 0 1 D +R PH 1937 o - F 1 0 0 S +R PH 1954 o - Ap 12 0 1 D +R PH 1954 o - Jul 1 0 0 S +R PH 1978 o - Mar 22 0 1 D +R PH 1978 o - S 21 0 0 S +Z Asia/Manila -15:56 - LMT 1844 D 31 +8:4 - LMT 1899 May 11 +8 PH P%sT 1942 May +9 - JST 1944 N +8 PH P%sT +Z Asia/Qatar 3:26:8 - LMT 1920 +4 - +04 1972 Jun +3 - +03 +L Asia/Qatar Asia/Bahrain +Z Asia/Riyadh 3:6:52 - LMT 1947 Mar 14 +3 - +03 +L Asia/Riyadh Asia/Aden +L Asia/Riyadh Asia/Kuwait +Z Asia/Singapore 6:55:25 - LMT 1901 +6:55:25 - SMT 1905 Jun +7 - +07 1933 +7 0:20 +0720 1936 +7:20 - +0720 1941 S +7:30 - +0730 1942 F 16 +9 - +09 1945 S 12 +7:30 - +0730 1982 +8 - +08 +Z Asia/Colombo 5:19:24 - LMT 1880 +5:19:32 - MMT 1906 +5:30 - +0530 1942 Ja 5 +5:30 0:30 +06 1942 S +5:30 1 +0630 1945 O 16 2 +5:30 - +0530 1996 May 25 +6:30 - +0630 1996 O 26 0:30 +6 - +06 2006 Ap 15 0:30 +5:30 - +0530 +R S 1920 1923 - Ap Su>=15 2 1 S +R S 1920 1923 - O Su>=1 2 0 - +R S 1962 o - Ap 29 2 1 S +R S 1962 o - O 1 2 0 - +R S 1963 1965 - May 1 2 1 S +R S 1963 o - S 30 2 0 - +R S 1964 o - O 1 2 0 - +R S 1965 o - S 30 2 0 - +R S 1966 o - Ap 24 2 1 S +R S 1966 1976 - O 1 2 0 - +R S 1967 1978 - May 1 2 1 S +R S 1977 1978 - S 1 2 0 - +R S 1983 1984 - Ap 9 2 1 S +R S 1983 1984 - O 1 2 0 - +R S 1986 o - F 16 2 1 S +R S 1986 o - O 9 2 0 - +R S 1987 o - Mar 1 2 1 S +R S 1987 1988 - O 31 2 0 - +R S 1988 o - Mar 15 2 1 S +R S 1989 o - Mar 31 2 1 S +R S 1989 o - O 1 2 0 - +R S 1990 o - Ap 1 2 1 S +R S 1990 o - S 30 2 0 - +R S 1991 o - Ap 1 0 1 S +R S 1991 1992 - O 1 0 0 - +R S 1992 o - Ap 8 0 1 S +R S 1993 o - Mar 26 0 1 S +R S 1993 o - S 25 0 0 - +R S 1994 1996 - Ap 1 0 1 S +R S 1994 2005 - O 1 0 0 - +R S 1997 1998 - Mar lastM 0 1 S +R S 1999 2006 - Ap 1 0 1 S +R S 2006 o - S 22 0 0 - +R S 2007 o - Mar lastF 0 1 S +R S 2007 o - N F>=1 0 0 - +R S 2008 o - Ap F>=1 0 1 S +R S 2008 o - N 1 0 0 - +R S 2009 o - Mar lastF 0 1 S +R S 2010 2011 - Ap F>=1 0 1 S +R S 2012 ma - Mar lastF 0 1 S +R S 2009 ma - O lastF 0 0 - +Z Asia/Damascus 2:25:12 - LMT 1920 +2 S EE%sT +Z Asia/Dushanbe 4:35:12 - LMT 1924 May 2 +5 - +05 1930 Jun 21 +6 R +06/+07 1991 Mar 31 2s +5 1 +05/+06 1991 S 9 2s +5 - +05 +Z Asia/Bangkok 6:42:4 - LMT 1880 +6:42:4 - BMT 1920 Ap +7 - +07 +L Asia/Bangkok Asia/Phnom_Penh +L Asia/Bangkok Asia/Vientiane +Z Asia/Ashgabat 3:53:32 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 R +05/+06 1991 Mar 31 2 +4 R +04/+05 1992 Ja 19 2 +5 - +05 +Z Asia/Dubai 3:41:12 - LMT 1920 +4 - +04 +L Asia/Dubai Asia/Muscat +Z Asia/Samarkand 4:27:53 - LMT 1924 May 2 +4 - +04 1930 Jun 21 +5 - +05 1981 Ap +5 1 +06 1981 O +6 - +06 1982 Ap +5 R +05/+06 1992 +5 - +05 +Z Asia/Tashkent 4:37:11 - LMT 1924 May 2 +5 - +05 
1930 Jun 21 +6 R +06/+07 1991 Mar 31 2 +5 R +05/+06 1992 +5 - +05 +Z Asia/Ho_Chi_Minh 7:6:40 - LMT 1906 Jul +7:6:30 - PLMT 1911 May +7 - +07 1942 D 31 23 +8 - +08 1945 Mar 14 23 +9 - +09 1945 S 2 +7 - +07 1947 Ap +8 - +08 1955 Jul +7 - +07 1959 D 31 23 +8 - +08 1975 Jun 13 +7 - +07 +R AU 1917 o - Ja 1 0:1 1 D +R AU 1917 o - Mar 25 2 0 S +R AU 1942 o - Ja 1 2 1 D +R AU 1942 o - Mar 29 2 0 S +R AU 1942 o - S 27 2 1 D +R AU 1943 1944 - Mar lastSu 2 0 S +R AU 1943 o - O 3 2 1 D +Z Australia/Darwin 8:43:20 - LMT 1895 F +9 - ACST 1899 May +9:30 AU AC%sT +R AW 1974 o - O lastSu 2s 1 D +R AW 1975 o - Mar Su>=1 2s 0 S +R AW 1983 o - O lastSu 2s 1 D +R AW 1984 o - Mar Su>=1 2s 0 S +R AW 1991 o - N 17 2s 1 D +R AW 1992 o - Mar Su>=1 2s 0 S +R AW 2006 o - D 3 2s 1 D +R AW 2007 2009 - Mar lastSu 2s 0 S +R AW 2007 2008 - O lastSu 2s 1 D +Z Australia/Perth 7:43:24 - LMT 1895 D +8 AU AW%sT 1943 Jul +8 AW AW%sT +Z Australia/Eucla 8:35:28 - LMT 1895 D +8:45 AU +0845/+0945 1943 Jul +8:45 AW +0845/+0945 +R AQ 1971 o - O lastSu 2s 1 D +R AQ 1972 o - F lastSu 2s 0 S +R AQ 1989 1991 - O lastSu 2s 1 D +R AQ 1990 1992 - Mar Su>=1 2s 0 S +R Ho 1992 1993 - O lastSu 2s 1 D +R Ho 1993 1994 - Mar Su>=1 2s 0 S +Z Australia/Brisbane 10:12:8 - LMT 1895 +10 AU AE%sT 1971 +10 AQ AE%sT +Z Australia/Lindeman 9:55:56 - LMT 1895 +10 AU AE%sT 1971 +10 AQ AE%sT 1992 Jul +10 Ho AE%sT +R AS 1971 1985 - O lastSu 2s 1 D +R AS 1986 o - O 19 2s 1 D +R AS 1987 2007 - O lastSu 2s 1 D +R AS 1972 o - F 27 2s 0 S +R AS 1973 1985 - Mar Su>=1 2s 0 S +R AS 1986 1990 - Mar Su>=15 2s 0 S +R AS 1991 o - Mar 3 2s 0 S +R AS 1992 o - Mar 22 2s 0 S +R AS 1993 o - Mar 7 2s 0 S +R AS 1994 o - Mar 20 2s 0 S +R AS 1995 2005 - Mar lastSu 2s 0 S +R AS 2006 o - Ap 2 2s 0 S +R AS 2007 o - Mar lastSu 2s 0 S +R AS 2008 ma - Ap Su>=1 2s 0 S +R AS 2008 ma - O Su>=1 2s 1 D +Z Australia/Adelaide 9:14:20 - LMT 1895 F +9 - ACST 1899 May +9:30 AU AC%sT 1971 +9:30 AS AC%sT +R AT 1967 o - O Su>=1 2s 1 D +R AT 1968 o - Mar lastSu 2s 0 S +R AT 1968 1985 - O lastSu 2s 1 D +R AT 1969 1971 - Mar Su>=8 2s 0 S +R AT 1972 o - F lastSu 2s 0 S +R AT 1973 1981 - Mar Su>=1 2s 0 S +R AT 1982 1983 - Mar lastSu 2s 0 S +R AT 1984 1986 - Mar Su>=1 2s 0 S +R AT 1986 o - O Su>=15 2s 1 D +R AT 1987 1990 - Mar Su>=15 2s 0 S +R AT 1987 o - O Su>=22 2s 1 D +R AT 1988 1990 - O lastSu 2s 1 D +R AT 1991 1999 - O Su>=1 2s 1 D +R AT 1991 2005 - Mar lastSu 2s 0 S +R AT 2000 o - Au lastSu 2s 1 D +R AT 2001 ma - O Su>=1 2s 1 D +R AT 2006 o - Ap Su>=1 2s 0 S +R AT 2007 o - Mar lastSu 2s 0 S +R AT 2008 ma - Ap Su>=1 2s 0 S +Z Australia/Hobart 9:49:16 - LMT 1895 S +10 - AEST 1916 O 1 2 +10 1 AEDT 1917 F +10 AU AE%sT 1967 +10 AT AE%sT +Z Australia/Currie 9:35:28 - LMT 1895 S +10 - AEST 1916 O 1 2 +10 1 AEDT 1917 F +10 AU AE%sT 1971 Jul +10 AT AE%sT +R AV 1971 1985 - O lastSu 2s 1 D +R AV 1972 o - F lastSu 2s 0 S +R AV 1973 1985 - Mar Su>=1 2s 0 S +R AV 1986 1990 - Mar Su>=15 2s 0 S +R AV 1986 1987 - O Su>=15 2s 1 D +R AV 1988 1999 - O lastSu 2s 1 D +R AV 1991 1994 - Mar Su>=1 2s 0 S +R AV 1995 2005 - Mar lastSu 2s 0 S +R AV 2000 o - Au lastSu 2s 1 D +R AV 2001 2007 - O lastSu 2s 1 D +R AV 2006 o - Ap Su>=1 2s 0 S +R AV 2007 o - Mar lastSu 2s 0 S +R AV 2008 ma - Ap Su>=1 2s 0 S +R AV 2008 ma - O Su>=1 2s 1 D +Z Australia/Melbourne 9:39:52 - LMT 1895 F +10 AU AE%sT 1971 +10 AV AE%sT +R AN 1971 1985 - O lastSu 2s 1 D +R AN 1972 o - F 27 2s 0 S +R AN 1973 1981 - Mar Su>=1 2s 0 S +R AN 1982 o - Ap Su>=1 2s 0 S +R AN 1983 1985 - Mar Su>=1 2s 0 S +R AN 1986 1989 - Mar Su>=15 2s 0 S +R AN 1986 o - O 19 2s 1 D 
+R AN 1987 1999 - O lastSu 2s 1 D +R AN 1990 1995 - Mar Su>=1 2s 0 S +R AN 1996 2005 - Mar lastSu 2s 0 S +R AN 2000 o - Au lastSu 2s 1 D +R AN 2001 2007 - O lastSu 2s 1 D +R AN 2006 o - Ap Su>=1 2s 0 S +R AN 2007 o - Mar lastSu 2s 0 S +R AN 2008 ma - Ap Su>=1 2s 0 S +R AN 2008 ma - O Su>=1 2s 1 D +Z Australia/Sydney 10:4:52 - LMT 1895 F +10 AU AE%sT 1971 +10 AN AE%sT +Z Australia/Broken_Hill 9:25:48 - LMT 1895 F +10 - AEST 1896 Au 23 +9 - ACST 1899 May +9:30 AU AC%sT 1971 +9:30 AN AC%sT 2000 +9:30 AS AC%sT +R LH 1981 1984 - O lastSu 2 1 - +R LH 1982 1985 - Mar Su>=1 2 0 - +R LH 1985 o - O lastSu 2 0:30 - +R LH 1986 1989 - Mar Su>=15 2 0 - +R LH 1986 o - O 19 2 0:30 - +R LH 1987 1999 - O lastSu 2 0:30 - +R LH 1990 1995 - Mar Su>=1 2 0 - +R LH 1996 2005 - Mar lastSu 2 0 - +R LH 2000 o - Au lastSu 2 0:30 - +R LH 2001 2007 - O lastSu 2 0:30 - +R LH 2006 o - Ap Su>=1 2 0 - +R LH 2007 o - Mar lastSu 2 0 - +R LH 2008 ma - Ap Su>=1 2 0 - +R LH 2008 ma - O Su>=1 2 0:30 - +Z Australia/Lord_Howe 10:36:20 - LMT 1895 F +10 - AEST 1981 Mar +10:30 LH +1030/+1130 1985 Jul +10:30 LH +1030/+11 +Z Antarctica/Macquarie 0 - -00 1899 N +10 - AEST 1916 O 1 2 +10 1 AEDT 1917 F +10 AU AE%sT 1919 Ap 1 0s +0 - -00 1948 Mar 25 +10 AU AE%sT 1967 +10 AT AE%sT 2010 Ap 4 3 +11 - +11 +Z Indian/Christmas 7:2:52 - LMT 1895 F +7 - +07 +Z Indian/Cocos 6:27:40 - LMT 1900 +6:30 - +0630 +R FJ 1998 1999 - N Su>=1 2 1 - +R FJ 1999 2000 - F lastSu 3 0 - +R FJ 2009 o - N 29 2 1 - +R FJ 2010 o - Mar lastSu 3 0 - +R FJ 2010 2013 - O Su>=21 2 1 - +R FJ 2011 o - Mar Su>=1 3 0 - +R FJ 2012 2013 - Ja Su>=18 3 0 - +R FJ 2014 o - Ja Su>=18 2 0 - +R FJ 2014 2018 - N Su>=1 2 1 - +R FJ 2015 ma - Ja Su>=12 3 0 - +R FJ 2019 ma - N Su>=8 2 1 - +Z Pacific/Fiji 11:55:44 - LMT 1915 O 26 +12 FJ +12/+13 +Z Pacific/Gambier -8:59:48 - LMT 1912 O +-9 - -09 +Z Pacific/Marquesas -9:18 - LMT 1912 O +-9:30 - -0930 +Z Pacific/Tahiti -9:58:16 - LMT 1912 O +-10 - -10 +R Gu 1959 o - Jun 27 2 1 D +R Gu 1961 o - Ja 29 2 0 S +R Gu 1967 o - S 1 2 1 D +R Gu 1969 o - Ja 26 0:1 0 S +R Gu 1969 o - Jun 22 2 1 D +R Gu 1969 o - Au 31 2 0 S +R Gu 1970 1971 - Ap lastSu 2 1 D +R Gu 1970 1971 - S Su>=1 2 0 S +R Gu 1973 o - D 16 2 1 D +R Gu 1974 o - F 24 2 0 S +R Gu 1976 o - May 26 2 1 D +R Gu 1976 o - Au 22 2:1 0 S +R Gu 1977 o - Ap 24 2 1 D +R Gu 1977 o - Au 28 2 0 S +Z Pacific/Guam -14:21 - LMT 1844 D 31 +9:39 - LMT 1901 +10 - GST 1941 D 10 +9 - +09 1944 Jul 31 +10 Gu G%sT 2000 D 23 +10 - ChST +L Pacific/Guam Pacific/Saipan +Z Pacific/Tarawa 11:32:4 - LMT 1901 +12 - +12 +Z Pacific/Enderbury -11:24:20 - LMT 1901 +-12 - -12 1979 O +-11 - -11 1994 D 31 +13 - +13 +Z Pacific/Kiritimati -10:29:20 - LMT 1901 +-10:40 - -1040 1979 O +-10 - -10 1994 D 31 +14 - +14 +Z Pacific/Majuro 11:24:48 - LMT 1901 +11 - +11 1914 O +9 - +09 1919 F +11 - +11 1937 +10 - +10 1941 Ap +9 - +09 1944 Ja 30 +11 - +11 1969 O +12 - +12 +Z Pacific/Kwajalein 11:9:20 - LMT 1901 +11 - +11 1937 +10 - +10 1941 Ap +9 - +09 1944 F 6 +11 - +11 1969 O +-12 - -12 1993 Au 20 24 +12 - +12 +Z Pacific/Chuuk -13:52:52 - LMT 1844 D 31 +10:7:8 - LMT 1901 +10 - +10 1914 O +9 - +09 1919 F +10 - +10 1941 Ap +9 - +09 1945 Au +10 - +10 +Z Pacific/Pohnpei -13:27:8 - LMT 1844 D 31 +10:32:52 - LMT 1901 +11 - +11 1914 O +9 - +09 1919 F +11 - +11 1937 +10 - +10 1941 Ap +9 - +09 1945 Au +11 - +11 +Z Pacific/Kosrae -13:8:4 - LMT 1844 D 31 +10:51:56 - LMT 1901 +11 - +11 1914 O +9 - +09 1919 F +11 - +11 1937 +10 - +10 1941 Ap +9 - +09 1945 Au +11 - +11 1969 O +12 - +12 1999 +11 - +11 +Z Pacific/Nauru 11:7:40 - LMT 1921 Ja 15 +11:30 - +1130 
1942 Au 29 +9 - +09 1945 S 8 +11:30 - +1130 1979 F 10 2 +12 - +12 +R NC 1977 1978 - D Su>=1 0 1 - +R NC 1978 1979 - F 27 0 0 - +R NC 1996 o - D 1 2s 1 - +R NC 1997 o - Mar 2 2s 0 - +Z Pacific/Noumea 11:5:48 - LMT 1912 Ja 13 +11 NC +11/+12 +R NZ 1927 o - N 6 2 1 S +R NZ 1928 o - Mar 4 2 0 M +R NZ 1928 1933 - O Su>=8 2 0:30 S +R NZ 1929 1933 - Mar Su>=15 2 0 M +R NZ 1934 1940 - Ap lastSu 2 0 M +R NZ 1934 1940 - S lastSu 2 0:30 S +R NZ 1946 o - Ja 1 0 0 S +R NZ 1974 o - N Su>=1 2s 1 D +R k 1974 o - N Su>=1 2:45s 1 - +R NZ 1975 o - F lastSu 2s 0 S +R k 1975 o - F lastSu 2:45s 0 - +R NZ 1975 1988 - O lastSu 2s 1 D +R k 1975 1988 - O lastSu 2:45s 1 - +R NZ 1976 1989 - Mar Su>=1 2s 0 S +R k 1976 1989 - Mar Su>=1 2:45s 0 - +R NZ 1989 o - O Su>=8 2s 1 D +R k 1989 o - O Su>=8 2:45s 1 - +R NZ 1990 2006 - O Su>=1 2s 1 D +R k 1990 2006 - O Su>=1 2:45s 1 - +R NZ 1990 2007 - Mar Su>=15 2s 0 S +R k 1990 2007 - Mar Su>=15 2:45s 0 - +R NZ 2007 ma - S lastSu 2s 1 D +R k 2007 ma - S lastSu 2:45s 1 - +R NZ 2008 ma - Ap Su>=1 2s 0 S +R k 2008 ma - Ap Su>=1 2:45s 0 - +Z Pacific/Auckland 11:39:4 - LMT 1868 N 2 +11:30 NZ NZ%sT 1946 +12 NZ NZ%sT +Z Pacific/Chatham 12:13:48 - LMT 1868 N 2 +12:15 - +1215 1946 +12:45 k +1245/+1345 +L Pacific/Auckland Antarctica/McMurdo +R CK 1978 o - N 12 0 0:30 - +R CK 1979 1991 - Mar Su>=1 0 0 - +R CK 1979 1990 - O lastSu 0 0:30 - +Z Pacific/Rarotonga -10:39:4 - LMT 1901 +-10:30 - -1030 1978 N 12 +-10 CK -10/-0930 +Z Pacific/Niue -11:19:40 - LMT 1901 +-11:20 - -1120 1951 +-11:30 - -1130 1978 O +-11 - -11 +Z Pacific/Norfolk 11:11:52 - LMT 1901 +11:12 - +1112 1951 +11:30 - +1130 1974 O 27 2s +11:30 1 +1230 1975 Mar 2 2s +11:30 - +1130 2015 O 4 2s +11 - +11 2019 Jul +11 AN +11/+12 +Z Pacific/Palau -15:2:4 - LMT 1844 D 31 +8:57:56 - LMT 1901 +9 - +09 +Z Pacific/Port_Moresby 9:48:40 - LMT 1880 +9:48:32 - PMMT 1895 +10 - +10 +Z Pacific/Bougainville 10:22:16 - LMT 1880 +9:48:32 - PMMT 1895 +10 - +10 1942 Jul +9 - +09 1945 Au 21 +10 - +10 2014 D 28 2 +11 - +11 +Z Pacific/Pitcairn -8:40:20 - LMT 1901 +-8:30 - -0830 1998 Ap 27 +-8 - -08 +Z Pacific/Pago_Pago 12:37:12 - LMT 1892 Jul 5 +-11:22:48 - LMT 1911 +-11 - SST +L Pacific/Pago_Pago Pacific/Midway +R WS 2010 o - S lastSu 0 1 - +R WS 2011 o - Ap Sa>=1 4 0 - +R WS 2011 o - S lastSa 3 1 - +R WS 2012 ma - Ap Su>=1 4 0 - +R WS 2012 ma - S lastSu 3 1 - +Z Pacific/Apia 12:33:4 - LMT 1892 Jul 5 +-11:26:56 - LMT 1911 +-11:30 - -1130 1950 +-11 WS -11/-10 2011 D 29 24 +13 WS +13/+14 +Z Pacific/Guadalcanal 10:39:48 - LMT 1912 O +11 - +11 +Z Pacific/Fakaofo -11:24:56 - LMT 1901 +-11 - -11 2011 D 30 +13 - +13 +R TO 1999 o - O 7 2s 1 - +R TO 2000 o - Mar 19 2s 0 - +R TO 2000 2001 - N Su>=1 2 1 - +R TO 2001 2002 - Ja lastSu 2 0 - +R TO 2016 o - N Su>=1 2 1 - +R TO 2017 o - Ja Su>=15 3 0 - +Z Pacific/Tongatapu 12:19:20 - LMT 1901 +12:20 - +1220 1941 +13 - +13 1999 +13 TO +13/+14 +Z Pacific/Funafuti 11:56:52 - LMT 1901 +12 - +12 +Z Pacific/Wake 11:6:28 - LMT 1901 +12 - +12 +R VU 1983 o - S 25 0 1 - +R VU 1984 1991 - Mar Su>=23 0 0 - +R VU 1984 o - O 23 0 1 - +R VU 1985 1991 - S Su>=23 0 1 - +R VU 1992 1993 - Ja Su>=23 0 0 - +R VU 1992 o - O Su>=23 0 1 - +Z Pacific/Efate 11:13:16 - LMT 1912 Ja 13 +11 VU +11/+12 +Z Pacific/Wallis 12:15:20 - LMT 1901 +12 - +12 +R G 1916 o - May 21 2s 1 BST +R G 1916 o - O 1 2s 0 GMT +R G 1917 o - Ap 8 2s 1 BST +R G 1917 o - S 17 2s 0 GMT +R G 1918 o - Mar 24 2s 1 BST +R G 1918 o - S 30 2s 0 GMT +R G 1919 o - Mar 30 2s 1 BST +R G 1919 o - S 29 2s 0 GMT +R G 1920 o - Mar 28 2s 1 BST +R G 1920 o - O 25 2s 0 GMT +R G 1921 o - Ap 3 
2s 1 BST +R G 1921 o - O 3 2s 0 GMT +R G 1922 o - Mar 26 2s 1 BST +R G 1922 o - O 8 2s 0 GMT +R G 1923 o - Ap Su>=16 2s 1 BST +R G 1923 1924 - S Su>=16 2s 0 GMT +R G 1924 o - Ap Su>=9 2s 1 BST +R G 1925 1926 - Ap Su>=16 2s 1 BST +R G 1925 1938 - O Su>=2 2s 0 GMT +R G 1927 o - Ap Su>=9 2s 1 BST +R G 1928 1929 - Ap Su>=16 2s 1 BST +R G 1930 o - Ap Su>=9 2s 1 BST +R G 1931 1932 - Ap Su>=16 2s 1 BST +R G 1933 o - Ap Su>=9 2s 1 BST +R G 1934 o - Ap Su>=16 2s 1 BST +R G 1935 o - Ap Su>=9 2s 1 BST +R G 1936 1937 - Ap Su>=16 2s 1 BST +R G 1938 o - Ap Su>=9 2s 1 BST +R G 1939 o - Ap Su>=16 2s 1 BST +R G 1939 o - N Su>=16 2s 0 GMT +R G 1940 o - F Su>=23 2s 1 BST +R G 1941 o - May Su>=2 1s 2 BDST +R G 1941 1943 - Au Su>=9 1s 1 BST +R G 1942 1944 - Ap Su>=2 1s 2 BDST +R G 1944 o - S Su>=16 1s 1 BST +R G 1945 o - Ap M>=2 1s 2 BDST +R G 1945 o - Jul Su>=9 1s 1 BST +R G 1945 1946 - O Su>=2 2s 0 GMT +R G 1946 o - Ap Su>=9 2s 1 BST +R G 1947 o - Mar 16 2s 1 BST +R G 1947 o - Ap 13 1s 2 BDST +R G 1947 o - Au 10 1s 1 BST +R G 1947 o - N 2 2s 0 GMT +R G 1948 o - Mar 14 2s 1 BST +R G 1948 o - O 31 2s 0 GMT +R G 1949 o - Ap 3 2s 1 BST +R G 1949 o - O 30 2s 0 GMT +R G 1950 1952 - Ap Su>=14 2s 1 BST +R G 1950 1952 - O Su>=21 2s 0 GMT +R G 1953 o - Ap Su>=16 2s 1 BST +R G 1953 1960 - O Su>=2 2s 0 GMT +R G 1954 o - Ap Su>=9 2s 1 BST +R G 1955 1956 - Ap Su>=16 2s 1 BST +R G 1957 o - Ap Su>=9 2s 1 BST +R G 1958 1959 - Ap Su>=16 2s 1 BST +R G 1960 o - Ap Su>=9 2s 1 BST +R G 1961 1963 - Mar lastSu 2s 1 BST +R G 1961 1968 - O Su>=23 2s 0 GMT +R G 1964 1967 - Mar Su>=19 2s 1 BST +R G 1968 o - F 18 2s 1 BST +R G 1972 1980 - Mar Su>=16 2s 1 BST +R G 1972 1980 - O Su>=23 2s 0 GMT +R G 1981 1995 - Mar lastSu 1u 1 BST +R G 1981 1989 - O Su>=23 1u 0 GMT +R G 1990 1995 - O Su>=22 1u 0 GMT +Z Europe/London -0:1:15 - LMT 1847 D 1 0s +0 G %s 1968 O 27 +1 - BST 1971 O 31 2u +0 G %s 1996 +0 E GMT/BST +L Europe/London Europe/Jersey +L Europe/London Europe/Guernsey +L Europe/London Europe/Isle_of_Man +R IE 1971 o - O 31 2u -1 - +R IE 1972 1980 - Mar Su>=16 2u 0 - +R IE 1972 1980 - O Su>=23 2u -1 - +R IE 1981 ma - Mar lastSu 1u 0 - +R IE 1981 1989 - O Su>=23 1u -1 - +R IE 1990 1995 - O Su>=22 1u -1 - +R IE 1996 ma - O lastSu 1u -1 - +Z Europe/Dublin -0:25 - LMT 1880 Au 2 +-0:25:21 - DMT 1916 May 21 2s +-0:25:21 1 IST 1916 O 1 2s +0 G %s 1921 D 6 +0 G GMT/IST 1940 F 25 2s +0 1 IST 1946 O 6 2s +0 - GMT 1947 Mar 16 2s +0 1 IST 1947 N 2 2s +0 - GMT 1948 Ap 18 2s +0 G GMT/IST 1968 O 27 +1 IE IST/GMT +R E 1977 1980 - Ap Su>=1 1u 1 S +R E 1977 o - S lastSu 1u 0 - +R E 1978 o - O 1 1u 0 - +R E 1979 1995 - S lastSu 1u 0 - +R E 1981 ma - Mar lastSu 1u 1 S +R E 1996 ma - O lastSu 1u 0 - +R W- 1977 1980 - Ap Su>=1 1s 1 S +R W- 1977 o - S lastSu 1s 0 - +R W- 1978 o - O 1 1s 0 - +R W- 1979 1995 - S lastSu 1s 0 - +R W- 1981 ma - Mar lastSu 1s 1 S +R W- 1996 ma - O lastSu 1s 0 - +R c 1916 o - Ap 30 23 1 S +R c 1916 o - O 1 1 0 - +R c 1917 1918 - Ap M>=15 2s 1 S +R c 1917 1918 - S M>=15 2s 0 - +R c 1940 o - Ap 1 2s 1 S +R c 1942 o - N 2 2s 0 - +R c 1943 o - Mar 29 2s 1 S +R c 1943 o - O 4 2s 0 - +R c 1944 1945 - Ap M>=1 2s 1 S +R c 1944 o - O 2 2s 0 - +R c 1945 o - S 16 2s 0 - +R c 1977 1980 - Ap Su>=1 2s 1 S +R c 1977 o - S lastSu 2s 0 - +R c 1978 o - O 1 2s 0 - +R c 1979 1995 - S lastSu 2s 0 - +R c 1981 ma - Mar lastSu 2s 1 S +R c 1996 ma - O lastSu 2s 0 - +R e 1977 1980 - Ap Su>=1 0 1 S +R e 1977 o - S lastSu 0 0 - +R e 1978 o - O 1 0 0 - +R e 1979 1995 - S lastSu 0 0 - +R e 1981 ma - Mar lastSu 0 1 S +R e 1996 ma - O lastSu 0 0 - +R R 1917 o - Jul 
1 23 1 MST +R R 1917 o - D 28 0 0 MMT +R R 1918 o - May 31 22 2 MDST +R R 1918 o - S 16 1 1 MST +R R 1919 o - May 31 23 2 MDST +R R 1919 o - Jul 1 0u 1 MSD +R R 1919 o - Au 16 0 0 MSK +R R 1921 o - F 14 23 1 MSD +R R 1921 o - Mar 20 23 2 +05 +R R 1921 o - S 1 0 1 MSD +R R 1921 o - O 1 0 0 - +R R 1981 1984 - Ap 1 0 1 S +R R 1981 1983 - O 1 0 0 - +R R 1984 1995 - S lastSu 2s 0 - +R R 1985 2010 - Mar lastSu 2s 1 S +R R 1996 2010 - O lastSu 2s 0 - +Z WET 0 E WE%sT +Z CET 1 c CE%sT +Z MET 1 c ME%sT +Z EET 2 E EE%sT +R q 1940 o - Jun 16 0 1 S +R q 1942 o - N 2 3 0 - +R q 1943 o - Mar 29 2 1 S +R q 1943 o - Ap 10 3 0 - +R q 1974 o - May 4 0 1 S +R q 1974 o - O 2 0 0 - +R q 1975 o - May 1 0 1 S +R q 1975 o - O 2 0 0 - +R q 1976 o - May 2 0 1 S +R q 1976 o - O 3 0 0 - +R q 1977 o - May 8 0 1 S +R q 1977 o - O 2 0 0 - +R q 1978 o - May 6 0 1 S +R q 1978 o - O 1 0 0 - +R q 1979 o - May 5 0 1 S +R q 1979 o - S 30 0 0 - +R q 1980 o - May 3 0 1 S +R q 1980 o - O 4 0 0 - +R q 1981 o - Ap 26 0 1 S +R q 1981 o - S 27 0 0 - +R q 1982 o - May 2 0 1 S +R q 1982 o - O 3 0 0 - +R q 1983 o - Ap 18 0 1 S +R q 1983 o - O 1 0 0 - +R q 1984 o - Ap 1 0 1 S +Z Europe/Tirane 1:19:20 - LMT 1914 +1 - CET 1940 Jun 16 +1 q CE%sT 1984 Jul +1 E CE%sT +Z Europe/Andorra 0:6:4 - LMT 1901 +0 - WET 1946 S 30 +1 - CET 1985 Mar 31 2 +1 E CE%sT +R a 1920 o - Ap 5 2s 1 S +R a 1920 o - S 13 2s 0 - +R a 1946 o - Ap 14 2s 1 S +R a 1946 o - O 7 2s 0 - +R a 1947 1948 - O Su>=1 2s 0 - +R a 1947 o - Ap 6 2s 1 S +R a 1948 o - Ap 18 2s 1 S +R a 1980 o - Ap 6 0 1 S +R a 1980 o - S 28 0 0 - +Z Europe/Vienna 1:5:21 - LMT 1893 Ap +1 c CE%sT 1920 +1 a CE%sT 1940 Ap 1 2s +1 c CE%sT 1945 Ap 2 2s +1 1 CEST 1945 Ap 12 2s +1 - CET 1946 +1 a CE%sT 1981 +1 E CE%sT +Z Europe/Minsk 1:50:16 - LMT 1880 +1:50 - MMT 1924 May 2 +2 - EET 1930 Jun 21 +3 - MSK 1941 Jun 28 +1 c CE%sT 1944 Jul 3 +3 R MSK/MSD 1990 +3 - MSK 1991 Mar 31 2s +2 R EE%sT 2011 Mar 27 2s +3 - +03 +R b 1918 o - Mar 9 0s 1 S +R b 1918 1919 - O Sa>=1 23s 0 - +R b 1919 o - Mar 1 23s 1 S +R b 1920 o - F 14 23s 1 S +R b 1920 o - O 23 23s 0 - +R b 1921 o - Mar 14 23s 1 S +R b 1921 o - O 25 23s 0 - +R b 1922 o - Mar 25 23s 1 S +R b 1922 1927 - O Sa>=1 23s 0 - +R b 1923 o - Ap 21 23s 1 S +R b 1924 o - Mar 29 23s 1 S +R b 1925 o - Ap 4 23s 1 S +R b 1926 o - Ap 17 23s 1 S +R b 1927 o - Ap 9 23s 1 S +R b 1928 o - Ap 14 23s 1 S +R b 1928 1938 - O Su>=2 2s 0 - +R b 1929 o - Ap 21 2s 1 S +R b 1930 o - Ap 13 2s 1 S +R b 1931 o - Ap 19 2s 1 S +R b 1932 o - Ap 3 2s 1 S +R b 1933 o - Mar 26 2s 1 S +R b 1934 o - Ap 8 2s 1 S +R b 1935 o - Mar 31 2s 1 S +R b 1936 o - Ap 19 2s 1 S +R b 1937 o - Ap 4 2s 1 S +R b 1938 o - Mar 27 2s 1 S +R b 1939 o - Ap 16 2s 1 S +R b 1939 o - N 19 2s 0 - +R b 1940 o - F 25 2s 1 S +R b 1944 o - S 17 2s 0 - +R b 1945 o - Ap 2 2s 1 S +R b 1945 o - S 16 2s 0 - +R b 1946 o - May 19 2s 1 S +R b 1946 o - O 7 2s 0 - +Z Europe/Brussels 0:17:30 - LMT 1880 +0:17:30 - BMT 1892 May 1 0:17:30 +0 - WET 1914 N 8 +1 - CET 1916 May +1 c CE%sT 1918 N 11 11u +0 b WE%sT 1940 May 20 2s +1 c CE%sT 1944 S 3 +1 b CE%sT 1977 +1 E CE%sT +R BG 1979 o - Mar 31 23 1 S +R BG 1979 o - O 1 1 0 - +R BG 1980 1982 - Ap Sa>=1 23 1 S +R BG 1980 o - S 29 1 0 - +R BG 1981 o - S 27 2 0 - +Z Europe/Sofia 1:33:16 - LMT 1880 +1:56:56 - IMT 1894 N 30 +2 - EET 1942 N 2 3 +1 c CE%sT 1945 +1 - CET 1945 Ap 2 3 +2 - EET 1979 Mar 31 23 +2 BG EE%sT 1982 S 26 3 +2 c EE%sT 1991 +2 e EE%sT 1997 +2 E EE%sT +R CZ 1945 o - Ap M>=1 2s 1 S +R CZ 1945 o - O 1 2s 0 - +R CZ 1946 o - May 6 2s 1 S +R CZ 1946 1949 - O Su>=1 2s 0 - +R CZ 1947 1948 
- Ap Su>=15 2s 1 S +R CZ 1949 o - Ap 9 2s 1 S +Z Europe/Prague 0:57:44 - LMT 1850 +0:57:44 - PMT 1891 O +1 c CE%sT 1945 May 9 +1 CZ CE%sT 1946 D 1 3 +1 -1 GMT 1947 F 23 2 +1 CZ CE%sT 1979 +1 E CE%sT +R D 1916 o - May 14 23 1 S +R D 1916 o - S 30 23 0 - +R D 1940 o - May 15 0 1 S +R D 1945 o - Ap 2 2s 1 S +R D 1945 o - Au 15 2s 0 - +R D 1946 o - May 1 2s 1 S +R D 1946 o - S 1 2s 0 - +R D 1947 o - May 4 2s 1 S +R D 1947 o - Au 10 2s 0 - +R D 1948 o - May 9 2s 1 S +R D 1948 o - Au 8 2s 0 - +Z Europe/Copenhagen 0:50:20 - LMT 1890 +0:50:20 - CMT 1894 +1 D CE%sT 1942 N 2 2s +1 c CE%sT 1945 Ap 2 2 +1 D CE%sT 1980 +1 E CE%sT +Z Atlantic/Faroe -0:27:4 - LMT 1908 Ja 11 +0 - WET 1981 +0 E WE%sT +R Th 1991 1992 - Mar lastSu 2 1 D +R Th 1991 1992 - S lastSu 2 0 S +R Th 1993 2006 - Ap Su>=1 2 1 D +R Th 1993 2006 - O lastSu 2 0 S +R Th 2007 ma - Mar Su>=8 2 1 D +R Th 2007 ma - N Su>=1 2 0 S +Z America/Danmarkshavn -1:14:40 - LMT 1916 Jul 28 +-3 - -03 1980 Ap 6 2 +-3 E -03/-02 1996 +0 - GMT +Z America/Scoresbysund -1:27:52 - LMT 1916 Jul 28 +-2 - -02 1980 Ap 6 2 +-2 c -02/-01 1981 Mar 29 +-1 E -01/+00 +Z America/Godthab -3:26:56 - LMT 1916 Jul 28 +-3 - -03 1980 Ap 6 2 +-3 E -03/-02 +Z America/Thule -4:35:8 - LMT 1916 Jul 28 +-4 Th A%sT +Z Europe/Tallinn 1:39 - LMT 1880 +1:39 - TMT 1918 F +1 c CE%sT 1919 Jul +1:39 - TMT 1921 May +2 - EET 1940 Au 6 +3 - MSK 1941 S 15 +1 c CE%sT 1944 S 22 +3 R MSK/MSD 1989 Mar 26 2s +2 1 EEST 1989 S 24 2s +2 c EE%sT 1998 S 22 +2 E EE%sT 1999 O 31 4 +2 - EET 2002 F 21 +2 E EE%sT +R FI 1942 o - Ap 2 24 1 S +R FI 1942 o - O 4 1 0 - +R FI 1981 1982 - Mar lastSu 2 1 S +R FI 1981 1982 - S lastSu 3 0 - +Z Europe/Helsinki 1:39:49 - LMT 1878 May 31 +1:39:49 - HMT 1921 May +2 FI EE%sT 1983 +2 E EE%sT +L Europe/Helsinki Europe/Mariehamn +R F 1916 o - Jun 14 23s 1 S +R F 1916 1919 - O Su>=1 23s 0 - +R F 1917 o - Mar 24 23s 1 S +R F 1918 o - Mar 9 23s 1 S +R F 1919 o - Mar 1 23s 1 S +R F 1920 o - F 14 23s 1 S +R F 1920 o - O 23 23s 0 - +R F 1921 o - Mar 14 23s 1 S +R F 1921 o - O 25 23s 0 - +R F 1922 o - Mar 25 23s 1 S +R F 1922 1938 - O Sa>=1 23s 0 - +R F 1923 o - May 26 23s 1 S +R F 1924 o - Mar 29 23s 1 S +R F 1925 o - Ap 4 23s 1 S +R F 1926 o - Ap 17 23s 1 S +R F 1927 o - Ap 9 23s 1 S +R F 1928 o - Ap 14 23s 1 S +R F 1929 o - Ap 20 23s 1 S +R F 1930 o - Ap 12 23s 1 S +R F 1931 o - Ap 18 23s 1 S +R F 1932 o - Ap 2 23s 1 S +R F 1933 o - Mar 25 23s 1 S +R F 1934 o - Ap 7 23s 1 S +R F 1935 o - Mar 30 23s 1 S +R F 1936 o - Ap 18 23s 1 S +R F 1937 o - Ap 3 23s 1 S +R F 1938 o - Mar 26 23s 1 S +R F 1939 o - Ap 15 23s 1 S +R F 1939 o - N 18 23s 0 - +R F 1940 o - F 25 2 1 S +R F 1941 o - May 5 0 2 M +R F 1941 o - O 6 0 1 S +R F 1942 o - Mar 9 0 2 M +R F 1942 o - N 2 3 1 S +R F 1943 o - Mar 29 2 2 M +R F 1943 o - O 4 3 1 S +R F 1944 o - Ap 3 2 2 M +R F 1944 o - O 8 1 1 S +R F 1945 o - Ap 2 2 2 M +R F 1945 o - S 16 3 0 - +R F 1976 o - Mar 28 1 1 S +R F 1976 o - S 26 1 0 - +Z Europe/Paris 0:9:21 - LMT 1891 Mar 15 0:1 +0:9:21 - PMT 1911 Mar 11 0:1 +0 F WE%sT 1940 Jun 14 23 +1 c CE%sT 1944 Au 25 +0 F WE%sT 1945 S 16 3 +1 F CE%sT 1977 +1 E CE%sT +R DE 1946 o - Ap 14 2s 1 S +R DE 1946 o - O 7 2s 0 - +R DE 1947 1949 - O Su>=1 2s 0 - +R DE 1947 o - Ap 6 3s 1 S +R DE 1947 o - May 11 2s 2 M +R DE 1947 o - Jun 29 3 1 S +R DE 1948 o - Ap 18 2s 1 S +R DE 1949 o - Ap 10 2s 1 S +R So 1945 o - May 24 2 2 M +R So 1945 o - S 24 3 1 S +R So 1945 o - N 18 2s 0 - +Z Europe/Berlin 0:53:28 - LMT 1893 Ap +1 c CE%sT 1945 May 24 2 +1 So CE%sT 1946 +1 DE CE%sT 1980 +1 E CE%sT +L Europe/Zurich Europe/Busingen +Z 
Europe/Gibraltar -0:21:24 - LMT 1880 Au 2 0s +0 G %s 1957 Ap 14 2 +1 - CET 1982 +1 E CE%sT +R g 1932 o - Jul 7 0 1 S +R g 1932 o - S 1 0 0 - +R g 1941 o - Ap 7 0 1 S +R g 1942 o - N 2 3 0 - +R g 1943 o - Mar 30 0 1 S +R g 1943 o - O 4 0 0 - +R g 1952 o - Jul 1 0 1 S +R g 1952 o - N 2 0 0 - +R g 1975 o - Ap 12 0s 1 S +R g 1975 o - N 26 0s 0 - +R g 1976 o - Ap 11 2s 1 S +R g 1976 o - O 10 2s 0 - +R g 1977 1978 - Ap Su>=1 2s 1 S +R g 1977 o - S 26 2s 0 - +R g 1978 o - S 24 4 0 - +R g 1979 o - Ap 1 9 1 S +R g 1979 o - S 29 2 0 - +R g 1980 o - Ap 1 0 1 S +R g 1980 o - S 28 0 0 - +Z Europe/Athens 1:34:52 - LMT 1895 S 14 +1:34:52 - AMT 1916 Jul 28 0:1 +2 g EE%sT 1941 Ap 30 +1 g CE%sT 1944 Ap 4 +2 g EE%sT 1981 +2 E EE%sT +R h 1918 o - Ap 1 3 1 S +R h 1918 o - S 16 3 0 - +R h 1919 o - Ap 15 3 1 S +R h 1919 o - N 24 3 0 - +R h 1945 o - May 1 23 1 S +R h 1945 o - N 1 0 0 - +R h 1946 o - Mar 31 2s 1 S +R h 1946 1949 - O Su>=1 2s 0 - +R h 1947 1949 - Ap Su>=4 2s 1 S +R h 1950 o - Ap 17 2s 1 S +R h 1950 o - O 23 2s 0 - +R h 1954 1955 - May 23 0 1 S +R h 1954 1955 - O 3 0 0 - +R h 1956 o - Jun Su>=1 0 1 S +R h 1956 o - S lastSu 0 0 - +R h 1957 o - Jun Su>=1 1 1 S +R h 1957 o - S lastSu 3 0 - +R h 1980 o - Ap 6 1 1 S +Z Europe/Budapest 1:16:20 - LMT 1890 O +1 c CE%sT 1918 +1 h CE%sT 1941 Ap 8 +1 c CE%sT 1945 +1 h CE%sT 1980 S 28 2s +1 E CE%sT +R w 1917 1919 - F 19 23 1 - +R w 1917 o - O 21 1 0 - +R w 1918 1919 - N 16 1 0 - +R w 1921 o - Mar 19 23 1 - +R w 1921 o - Jun 23 1 0 - +R w 1939 o - Ap 29 23 1 - +R w 1939 o - O 29 2 0 - +R w 1940 o - F 25 2 1 - +R w 1940 1941 - N Su>=2 1s 0 - +R w 1941 1942 - Mar Su>=2 1s 1 - +R w 1943 1946 - Mar Su>=1 1s 1 - +R w 1942 1948 - O Su>=22 1s 0 - +R w 1947 1967 - Ap Su>=1 1s 1 - +R w 1949 o - O 30 1s 0 - +R w 1950 1966 - O Su>=22 1s 0 - +R w 1967 o - O 29 1s 0 - +Z Atlantic/Reykjavik -1:28 - LMT 1908 +-1 w -01/+00 1968 Ap 7 1s +0 - GMT +R I 1916 o - Jun 3 24 1 S +R I 1916 1917 - S 30 24 0 - +R I 1917 o - Mar 31 24 1 S +R I 1918 o - Mar 9 24 1 S +R I 1918 o - O 6 24 0 - +R I 1919 o - Mar 1 24 1 S +R I 1919 o - O 4 24 0 - +R I 1920 o - Mar 20 24 1 S +R I 1920 o - S 18 24 0 - +R I 1940 o - Jun 14 24 1 S +R I 1942 o - N 2 2s 0 - +R I 1943 o - Mar 29 2s 1 S +R I 1943 o - O 4 2s 0 - +R I 1944 o - Ap 2 2s 1 S +R I 1944 o - S 17 2s 0 - +R I 1945 o - Ap 2 2 1 S +R I 1945 o - S 15 1 0 - +R I 1946 o - Mar 17 2s 1 S +R I 1946 o - O 6 2s 0 - +R I 1947 o - Mar 16 0s 1 S +R I 1947 o - O 5 0s 0 - +R I 1948 o - F 29 2s 1 S +R I 1948 o - O 3 2s 0 - +R I 1966 1968 - May Su>=22 0s 1 S +R I 1966 o - S 24 24 0 - +R I 1967 1969 - S Su>=22 0s 0 - +R I 1969 o - Jun 1 0s 1 S +R I 1970 o - May 31 0s 1 S +R I 1970 o - S lastSu 0s 0 - +R I 1971 1972 - May Su>=22 0s 1 S +R I 1971 o - S lastSu 0s 0 - +R I 1972 o - O 1 0s 0 - +R I 1973 o - Jun 3 0s 1 S +R I 1973 1974 - S lastSu 0s 0 - +R I 1974 o - May 26 0s 1 S +R I 1975 o - Jun 1 0s 1 S +R I 1975 1977 - S lastSu 0s 0 - +R I 1976 o - May 30 0s 1 S +R I 1977 1979 - May Su>=22 0s 1 S +R I 1978 o - O 1 0s 0 - +R I 1979 o - S 30 0s 0 - +Z Europe/Rome 0:49:56 - LMT 1866 D 12 +0:49:56 - RMT 1893 O 31 23:49:56 +1 I CE%sT 1943 S 10 +1 c CE%sT 1944 Jun 4 +1 I CE%sT 1980 +1 E CE%sT +L Europe/Rome Europe/Vatican +L Europe/Rome Europe/San_Marino +R LV 1989 1996 - Mar lastSu 2s 1 S +R LV 1989 1996 - S lastSu 2s 0 - +Z Europe/Riga 1:36:34 - LMT 1880 +1:36:34 - RMT 1918 Ap 15 2 +1:36:34 1 LST 1918 S 16 3 +1:36:34 - RMT 1919 Ap 1 2 +1:36:34 1 LST 1919 May 22 3 +1:36:34 - RMT 1926 May 11 +2 - EET 1940 Au 5 +3 - MSK 1941 Jul +1 c CE%sT 1944 O 13 +3 R MSK/MSD 1989 Mar 
lastSu 2s +2 1 EEST 1989 S lastSu 2s +2 LV EE%sT 1997 Ja 21 +2 E EE%sT 2000 F 29 +2 - EET 2001 Ja 2 +2 E EE%sT +L Europe/Zurich Europe/Vaduz +Z Europe/Vilnius 1:41:16 - LMT 1880 +1:24 - WMT 1917 +1:35:36 - KMT 1919 O 10 +1 - CET 1920 Jul 12 +2 - EET 1920 O 9 +1 - CET 1940 Au 3 +3 - MSK 1941 Jun 24 +1 c CE%sT 1944 Au +3 R MSK/MSD 1989 Mar 26 2s +2 R EE%sT 1991 S 29 2s +2 c EE%sT 1998 +2 - EET 1998 Mar 29 1u +1 E CE%sT 1999 O 31 1u +2 - EET 2003 +2 E EE%sT +R LX 1916 o - May 14 23 1 S +R LX 1916 o - O 1 1 0 - +R LX 1917 o - Ap 28 23 1 S +R LX 1917 o - S 17 1 0 - +R LX 1918 o - Ap M>=15 2s 1 S +R LX 1918 o - S M>=15 2s 0 - +R LX 1919 o - Mar 1 23 1 S +R LX 1919 o - O 5 3 0 - +R LX 1920 o - F 14 23 1 S +R LX 1920 o - O 24 2 0 - +R LX 1921 o - Mar 14 23 1 S +R LX 1921 o - O 26 2 0 - +R LX 1922 o - Mar 25 23 1 S +R LX 1922 o - O Su>=2 1 0 - +R LX 1923 o - Ap 21 23 1 S +R LX 1923 o - O Su>=2 2 0 - +R LX 1924 o - Mar 29 23 1 S +R LX 1924 1928 - O Su>=2 1 0 - +R LX 1925 o - Ap 5 23 1 S +R LX 1926 o - Ap 17 23 1 S +R LX 1927 o - Ap 9 23 1 S +R LX 1928 o - Ap 14 23 1 S +R LX 1929 o - Ap 20 23 1 S +Z Europe/Luxembourg 0:24:36 - LMT 1904 Jun +1 LX CE%sT 1918 N 25 +0 LX WE%sT 1929 O 6 2s +0 b WE%sT 1940 May 14 3 +1 c WE%sT 1944 S 18 3 +1 b CE%sT 1977 +1 E CE%sT +R MT 1973 o - Mar 31 0s 1 S +R MT 1973 o - S 29 0s 0 - +R MT 1974 o - Ap 21 0s 1 S +R MT 1974 o - S 16 0s 0 - +R MT 1975 1979 - Ap Su>=15 2 1 S +R MT 1975 1980 - S Su>=15 2 0 - +R MT 1980 o - Mar 31 2 1 S +Z Europe/Malta 0:58:4 - LMT 1893 N 2 0s +1 I CE%sT 1973 Mar 31 +1 MT CE%sT 1981 +1 E CE%sT +R MD 1997 ma - Mar lastSu 2 1 S +R MD 1997 ma - O lastSu 3 0 - +Z Europe/Chisinau 1:55:20 - LMT 1880 +1:55 - CMT 1918 F 15 +1:44:24 - BMT 1931 Jul 24 +2 z EE%sT 1940 Au 15 +2 1 EEST 1941 Jul 17 +1 c CE%sT 1944 Au 24 +3 R MSK/MSD 1990 May 6 2 +2 R EE%sT 1992 +2 e EE%sT 1997 +2 MD EE%sT +Z Europe/Monaco 0:29:32 - LMT 1891 Mar 15 +0:9:21 - PMT 1911 Mar 11 +0 F WE%sT 1945 S 16 3 +1 F CE%sT 1977 +1 E CE%sT +R N 1916 o - May 1 0 1 NST +R N 1916 o - O 1 0 0 AMT +R N 1917 o - Ap 16 2s 1 NST +R N 1917 o - S 17 2s 0 AMT +R N 1918 1921 - Ap M>=1 2s 1 NST +R N 1918 1921 - S lastM 2s 0 AMT +R N 1922 o - Mar lastSu 2s 1 NST +R N 1922 1936 - O Su>=2 2s 0 AMT +R N 1923 o - Jun F>=1 2s 1 NST +R N 1924 o - Mar lastSu 2s 1 NST +R N 1925 o - Jun F>=1 2s 1 NST +R N 1926 1931 - May 15 2s 1 NST +R N 1932 o - May 22 2s 1 NST +R N 1933 1936 - May 15 2s 1 NST +R N 1937 o - May 22 2s 1 NST +R N 1937 o - Jul 1 0 1 S +R N 1937 1939 - O Su>=2 2s 0 - +R N 1938 1939 - May 15 2s 1 S +R N 1945 o - Ap 2 2s 1 S +R N 1945 o - S 16 2s 0 - +Z Europe/Amsterdam 0:19:32 - LMT 1835 +0:19:32 N %s 1937 Jul +0:20 N +0020/+0120 1940 May 16 +1 c CE%sT 1945 Ap 2 2 +1 N CE%sT 1977 +1 E CE%sT +R NO 1916 o - May 22 1 1 S +R NO 1916 o - S 30 0 0 - +R NO 1945 o - Ap 2 2s 1 S +R NO 1945 o - O 1 2s 0 - +R NO 1959 1964 - Mar Su>=15 2s 1 S +R NO 1959 1965 - S Su>=15 2s 0 - +R NO 1965 o - Ap 25 2s 1 S +Z Europe/Oslo 0:43 - LMT 1895 +1 NO CE%sT 1940 Au 10 23 +1 c CE%sT 1945 Ap 2 2 +1 NO CE%sT 1980 +1 E CE%sT +L Europe/Oslo Arctic/Longyearbyen +R O 1918 1919 - S 16 2s 0 - +R O 1919 o - Ap 15 2s 1 S +R O 1944 o - Ap 3 2s 1 S +R O 1944 o - O 4 2 0 - +R O 1945 o - Ap 29 0 1 S +R O 1945 o - N 1 0 0 - +R O 1946 o - Ap 14 0s 1 S +R O 1946 o - O 7 2s 0 - +R O 1947 o - May 4 2s 1 S +R O 1947 1949 - O Su>=1 2s 0 - +R O 1948 o - Ap 18 2s 1 S +R O 1949 o - Ap 10 2s 1 S +R O 1957 o - Jun 2 1s 1 S +R O 1957 1958 - S lastSu 1s 0 - +R O 1958 o - Mar 30 1s 1 S +R O 1959 o - May 31 1s 1 S +R O 1959 1961 - O Su>=1 1s 0 - +R O 1960 
o - Ap 3 1s 1 S +R O 1961 1964 - May lastSu 1s 1 S +R O 1962 1964 - S lastSu 1s 0 - +Z Europe/Warsaw 1:24 - LMT 1880 +1:24 - WMT 1915 Au 5 +1 c CE%sT 1918 S 16 3 +2 O EE%sT 1922 Jun +1 O CE%sT 1940 Jun 23 2 +1 c CE%sT 1944 O +1 O CE%sT 1977 +1 W- CE%sT 1988 +1 E CE%sT +R p 1916 o - Jun 17 23 1 S +R p 1916 o - N 1 1 0 - +R p 1917 o - F 28 23s 1 S +R p 1917 1921 - O 14 23s 0 - +R p 1918 o - Mar 1 23s 1 S +R p 1919 o - F 28 23s 1 S +R p 1920 o - F 29 23s 1 S +R p 1921 o - F 28 23s 1 S +R p 1924 o - Ap 16 23s 1 S +R p 1924 o - O 14 23s 0 - +R p 1926 o - Ap 17 23s 1 S +R p 1926 1929 - O Sa>=1 23s 0 - +R p 1927 o - Ap 9 23s 1 S +R p 1928 o - Ap 14 23s 1 S +R p 1929 o - Ap 20 23s 1 S +R p 1931 o - Ap 18 23s 1 S +R p 1931 1932 - O Sa>=1 23s 0 - +R p 1932 o - Ap 2 23s 1 S +R p 1934 o - Ap 7 23s 1 S +R p 1934 1938 - O Sa>=1 23s 0 - +R p 1935 o - Mar 30 23s 1 S +R p 1936 o - Ap 18 23s 1 S +R p 1937 o - Ap 3 23s 1 S +R p 1938 o - Mar 26 23s 1 S +R p 1939 o - Ap 15 23s 1 S +R p 1939 o - N 18 23s 0 - +R p 1940 o - F 24 23s 1 S +R p 1940 1941 - O 5 23s 0 - +R p 1941 o - Ap 5 23s 1 S +R p 1942 1945 - Mar Sa>=8 23s 1 S +R p 1942 o - Ap 25 22s 2 M +R p 1942 o - Au 15 22s 1 S +R p 1942 1945 - O Sa>=24 23s 0 - +R p 1943 o - Ap 17 22s 2 M +R p 1943 1945 - Au Sa>=25 22s 1 S +R p 1944 1945 - Ap Sa>=21 22s 2 M +R p 1946 o - Ap Sa>=1 23s 1 S +R p 1946 o - O Sa>=1 23s 0 - +R p 1947 1949 - Ap Su>=1 2s 1 S +R p 1947 1949 - O Su>=1 2s 0 - +R p 1951 1965 - Ap Su>=1 2s 1 S +R p 1951 1965 - O Su>=1 2s 0 - +R p 1977 o - Mar 27 0s 1 S +R p 1977 o - S 25 0s 0 - +R p 1978 1979 - Ap Su>=1 0s 1 S +R p 1978 o - O 1 0s 0 - +R p 1979 1982 - S lastSu 1s 0 - +R p 1980 o - Mar lastSu 0s 1 S +R p 1981 1982 - Mar lastSu 1s 1 S +R p 1983 o - Mar lastSu 2s 1 S +Z Europe/Lisbon -0:36:45 - LMT 1884 +-0:36:45 - LMT 1912 Ja 1 0u +0 p WE%sT 1966 Ap 3 2 +1 - CET 1976 S 26 1 +0 p WE%sT 1983 S 25 1s +0 W- WE%sT 1992 S 27 1s +1 E CE%sT 1996 Mar 31 1u +0 E WE%sT +Z Atlantic/Azores -1:42:40 - LMT 1884 +-1:54:32 - HMT 1912 Ja 1 2u +-2 p -02/-01 1942 Ap 25 22s +-2 p +00 1942 Au 15 22s +-2 p -02/-01 1943 Ap 17 22s +-2 p +00 1943 Au 28 22s +-2 p -02/-01 1944 Ap 22 22s +-2 p +00 1944 Au 26 22s +-2 p -02/-01 1945 Ap 21 22s +-2 p +00 1945 Au 25 22s +-2 p -02/-01 1966 Ap 3 2 +-1 p -01/+00 1983 S 25 1s +-1 W- -01/+00 1992 S 27 1s +0 E WE%sT 1993 Mar 28 1u +-1 E -01/+00 +Z Atlantic/Madeira -1:7:36 - LMT 1884 +-1:7:36 - FMT 1912 Ja 1 1u +-1 p -01/+00 1942 Ap 25 22s +-1 p +01 1942 Au 15 22s +-1 p -01/+00 1943 Ap 17 22s +-1 p +01 1943 Au 28 22s +-1 p -01/+00 1944 Ap 22 22s +-1 p +01 1944 Au 26 22s +-1 p -01/+00 1945 Ap 21 22s +-1 p +01 1945 Au 25 22s +-1 p -01/+00 1966 Ap 3 2 +0 p WE%sT 1983 S 25 1s +0 E WE%sT +R z 1932 o - May 21 0s 1 S +R z 1932 1939 - O Su>=1 0s 0 - +R z 1933 1939 - Ap Su>=2 0s 1 S +R z 1979 o - May 27 0 1 S +R z 1979 o - S lastSu 0 0 - +R z 1980 o - Ap 5 23 1 S +R z 1980 o - S lastSu 1 0 - +R z 1991 1993 - Mar lastSu 0s 1 S +R z 1991 1993 - S lastSu 0s 0 - +Z Europe/Bucharest 1:44:24 - LMT 1891 O +1:44:24 - BMT 1931 Jul 24 +2 z EE%sT 1981 Mar 29 2s +2 c EE%sT 1991 +2 z EE%sT 1994 +2 e EE%sT 1997 +2 E EE%sT +Z Europe/Kaliningrad 1:22 - LMT 1893 Ap +1 c CE%sT 1945 Ap 10 +2 O EE%sT 1946 Ap 7 +3 R MSK/MSD 1989 Mar 26 2s +2 R EE%sT 2011 Mar 27 2s +3 - +03 2014 O 26 2s +2 - EET +Z Europe/Moscow 2:30:17 - LMT 1880 +2:30:17 - MMT 1916 Jul 3 +2:31:19 R %s 1919 Jul 1 0u +3 R %s 1921 O +3 R MSK/MSD 1922 O +2 - EET 1930 Jun 21 +3 R MSK/MSD 1991 Mar 31 2s +2 R EE%sT 1992 Ja 19 2s +3 R MSK/MSD 2011 Mar 27 2s +4 - MSK 2014 O 26 2s +3 - MSK +Z 
Europe/Simferopol 2:16:24 - LMT 1880 +2:16 - SMT 1924 May 2 +2 - EET 1930 Jun 21 +3 - MSK 1941 N +1 c CE%sT 1944 Ap 13 +3 R MSK/MSD 1990 +3 - MSK 1990 Jul 1 2 +2 - EET 1992 +2 e EE%sT 1994 May +3 e MSK/MSD 1996 Mar 31 0s +3 1 MSD 1996 O 27 3s +3 R MSK/MSD 1997 +3 - MSK 1997 Mar lastSu 1u +2 E EE%sT 2014 Mar 30 2 +4 - MSK 2014 O 26 2s +3 - MSK +Z Europe/Astrakhan 3:12:12 - LMT 1924 May +3 - +03 1930 Jun 21 +4 R +04/+05 1989 Mar 26 2s +3 R +03/+04 1991 Mar 31 2s +4 - +04 1992 Mar 29 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 2014 O 26 2s +3 - +03 2016 Mar 27 2s +4 - +04 +Z Europe/Volgograd 2:57:40 - LMT 1920 Ja 3 +3 - +03 1930 Jun 21 +4 - +04 1961 N 11 +4 R +04/+05 1988 Mar 27 2s +3 R +03/+04 1991 Mar 31 2s +4 - +04 1992 Mar 29 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 2014 O 26 2s +3 - +03 2018 O 28 2s +4 - +04 +Z Europe/Saratov 3:4:18 - LMT 1919 Jul 1 0u +3 - +03 1930 Jun 21 +4 R +04/+05 1988 Mar 27 2s +3 R +03/+04 1991 Mar 31 2s +4 - +04 1992 Mar 29 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 2014 O 26 2s +3 - +03 2016 D 4 2s +4 - +04 +Z Europe/Kirov 3:18:48 - LMT 1919 Jul 1 0u +3 - +03 1930 Jun 21 +4 R +04/+05 1989 Mar 26 2s +3 R +03/+04 1991 Mar 31 2s +4 - +04 1992 Mar 29 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 2014 O 26 2s +3 - +03 +Z Europe/Samara 3:20:20 - LMT 1919 Jul 1 0u +3 - +03 1930 Jun 21 +4 - +04 1935 Ja 27 +4 R +04/+05 1989 Mar 26 2s +3 R +03/+04 1991 Mar 31 2s +2 R +02/+03 1991 S 29 2s +3 - +03 1991 O 20 3 +4 R +04/+05 2010 Mar 28 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 +Z Europe/Ulyanovsk 3:13:36 - LMT 1919 Jul 1 0u +3 - +03 1930 Jun 21 +4 R +04/+05 1989 Mar 26 2s +3 R +03/+04 1991 Mar 31 2s +2 R +02/+03 1992 Ja 19 2s +3 R +03/+04 2011 Mar 27 2s +4 - +04 2014 O 26 2s +3 - +03 2016 Mar 27 2s +4 - +04 +Z Asia/Yekaterinburg 4:2:33 - LMT 1916 Jul 3 +3:45:5 - PMT 1919 Jul 15 4 +4 - +04 1930 Jun 21 +5 R +05/+06 1991 Mar 31 2s +4 R +04/+05 1992 Ja 19 2s +5 R +05/+06 2011 Mar 27 2s +6 - +06 2014 O 26 2s +5 - +05 +Z Asia/Omsk 4:53:30 - LMT 1919 N 14 +5 - +05 1930 Jun 21 +6 R +06/+07 1991 Mar 31 2s +5 R +05/+06 1992 Ja 19 2s +6 R +06/+07 2011 Mar 27 2s +7 - +07 2014 O 26 2s +6 - +06 +Z Asia/Barnaul 5:35 - LMT 1919 D 10 +6 - +06 1930 Jun 21 +7 R +07/+08 1991 Mar 31 2s +6 R +06/+07 1992 Ja 19 2s +7 R +07/+08 1995 May 28 +6 R +06/+07 2011 Mar 27 2s +7 - +07 2014 O 26 2s +6 - +06 2016 Mar 27 2s +7 - +07 +Z Asia/Novosibirsk 5:31:40 - LMT 1919 D 14 6 +6 - +06 1930 Jun 21 +7 R +07/+08 1991 Mar 31 2s +6 R +06/+07 1992 Ja 19 2s +7 R +07/+08 1993 May 23 +6 R +06/+07 2011 Mar 27 2s +7 - +07 2014 O 26 2s +6 - +06 2016 Jul 24 2s +7 - +07 +Z Asia/Tomsk 5:39:51 - LMT 1919 D 22 +6 - +06 1930 Jun 21 +7 R +07/+08 1991 Mar 31 2s +6 R +06/+07 1992 Ja 19 2s +7 R +07/+08 2002 May 1 3 +6 R +06/+07 2011 Mar 27 2s +7 - +07 2014 O 26 2s +6 - +06 2016 May 29 2s +7 - +07 +Z Asia/Novokuznetsk 5:48:48 - LMT 1924 May +6 - +06 1930 Jun 21 +7 R +07/+08 1991 Mar 31 2s +6 R +06/+07 1992 Ja 19 2s +7 R +07/+08 2010 Mar 28 2s +6 R +06/+07 2011 Mar 27 2s +7 - +07 +Z Asia/Krasnoyarsk 6:11:26 - LMT 1920 Ja 6 +6 - +06 1930 Jun 21 +7 R +07/+08 1991 Mar 31 2s +6 R +06/+07 1992 Ja 19 2s +7 R +07/+08 2011 Mar 27 2s +8 - +08 2014 O 26 2s +7 - +07 +Z Asia/Irkutsk 6:57:5 - LMT 1880 +6:57:5 - IMT 1920 Ja 25 +7 - +07 1930 Jun 21 +8 R +08/+09 1991 Mar 31 2s +7 R +07/+08 1992 Ja 19 2s +8 R +08/+09 2011 Mar 27 2s +9 - +09 2014 O 26 2s +8 - +08 +Z Asia/Chita 7:33:52 - LMT 1919 D 15 +8 - +08 1930 Jun 21 +9 R +09/+10 1991 Mar 31 2s +8 R +08/+09 1992 Ja 19 2s +9 R +09/+10 2011 Mar 27 2s +10 - +10 2014 O 26 2s +8 - +08 2016 Mar 27 2 +9 - +09 +Z 
Asia/Yakutsk 8:38:58 - LMT 1919 D 15 +8 - +08 1930 Jun 21 +9 R +09/+10 1991 Mar 31 2s +8 R +08/+09 1992 Ja 19 2s +9 R +09/+10 2011 Mar 27 2s +10 - +10 2014 O 26 2s +9 - +09 +Z Asia/Vladivostok 8:47:31 - LMT 1922 N 15 +9 - +09 1930 Jun 21 +10 R +10/+11 1991 Mar 31 2s +9 R +09/+10 1992 Ja 19 2s +10 R +10/+11 2011 Mar 27 2s +11 - +11 2014 O 26 2s +10 - +10 +Z Asia/Khandyga 9:2:13 - LMT 1919 D 15 +8 - +08 1930 Jun 21 +9 R +09/+10 1991 Mar 31 2s +8 R +08/+09 1992 Ja 19 2s +9 R +09/+10 2004 +10 R +10/+11 2011 Mar 27 2s +11 - +11 2011 S 13 0s +10 - +10 2014 O 26 2s +9 - +09 +Z Asia/Sakhalin 9:30:48 - LMT 1905 Au 23 +9 - +09 1945 Au 25 +11 R +11/+12 1991 Mar 31 2s +10 R +10/+11 1992 Ja 19 2s +11 R +11/+12 1997 Mar lastSu 2s +10 R +10/+11 2011 Mar 27 2s +11 - +11 2014 O 26 2s +10 - +10 2016 Mar 27 2s +11 - +11 +Z Asia/Magadan 10:3:12 - LMT 1924 May 2 +10 - +10 1930 Jun 21 +11 R +11/+12 1991 Mar 31 2s +10 R +10/+11 1992 Ja 19 2s +11 R +11/+12 2011 Mar 27 2s +12 - +12 2014 O 26 2s +10 - +10 2016 Ap 24 2s +11 - +11 +Z Asia/Srednekolymsk 10:14:52 - LMT 1924 May 2 +10 - +10 1930 Jun 21 +11 R +11/+12 1991 Mar 31 2s +10 R +10/+11 1992 Ja 19 2s +11 R +11/+12 2011 Mar 27 2s +12 - +12 2014 O 26 2s +11 - +11 +Z Asia/Ust-Nera 9:32:54 - LMT 1919 D 15 +8 - +08 1930 Jun 21 +9 R +09/+10 1981 Ap +11 R +11/+12 1991 Mar 31 2s +10 R +10/+11 1992 Ja 19 2s +11 R +11/+12 2011 Mar 27 2s +12 - +12 2011 S 13 0s +11 - +11 2014 O 26 2s +10 - +10 +Z Asia/Kamchatka 10:34:36 - LMT 1922 N 10 +11 - +11 1930 Jun 21 +12 R +12/+13 1991 Mar 31 2s +11 R +11/+12 1992 Ja 19 2s +12 R +12/+13 2010 Mar 28 2s +11 R +11/+12 2011 Mar 27 2s +12 - +12 +Z Asia/Anadyr 11:49:56 - LMT 1924 May 2 +12 - +12 1930 Jun 21 +13 R +13/+14 1982 Ap 1 0s +12 R +12/+13 1991 Mar 31 2s +11 R +11/+12 1992 Ja 19 2s +12 R +12/+13 2010 Mar 28 2s +11 R +11/+12 2011 Mar 27 2s +12 - +12 +Z Europe/Belgrade 1:22 - LMT 1884 +1 - CET 1941 Ap 18 23 +1 c CE%sT 1945 +1 - CET 1945 May 8 2s +1 1 CEST 1945 S 16 2s +1 - CET 1982 N 27 +1 E CE%sT +L Europe/Belgrade Europe/Ljubljana +L Europe/Belgrade Europe/Podgorica +L Europe/Belgrade Europe/Sarajevo +L Europe/Belgrade Europe/Skopje +L Europe/Belgrade Europe/Zagreb +L Europe/Prague Europe/Bratislava +R s 1918 o - Ap 15 23 1 S +R s 1918 1919 - O 6 24s 0 - +R s 1919 o - Ap 6 23 1 S +R s 1924 o - Ap 16 23 1 S +R s 1924 o - O 4 24s 0 - +R s 1926 o - Ap 17 23 1 S +R s 1926 1929 - O Sa>=1 24s 0 - +R s 1927 o - Ap 9 23 1 S +R s 1928 o - Ap 15 0 1 S +R s 1929 o - Ap 20 23 1 S +R s 1937 o - Jun 16 23 1 S +R s 1937 o - O 2 24s 0 - +R s 1938 o - Ap 2 23 1 S +R s 1938 o - Ap 30 23 2 M +R s 1938 o - O 2 24 1 S +R s 1939 o - O 7 24s 0 - +R s 1942 o - May 2 23 1 S +R s 1942 o - S 1 1 0 - +R s 1943 1946 - Ap Sa>=13 23 1 S +R s 1943 1944 - O Su>=1 1 0 - +R s 1945 1946 - S lastSu 1 0 - +R s 1949 o - Ap 30 23 1 S +R s 1949 o - O 2 1 0 - +R s 1974 1975 - Ap Sa>=12 23 1 S +R s 1974 1975 - O Su>=1 1 0 - +R s 1976 o - Mar 27 23 1 S +R s 1976 1977 - S lastSu 1 0 - +R s 1977 o - Ap 2 23 1 S +R s 1978 o - Ap 2 2s 1 S +R s 1978 o - O 1 2s 0 - +R Sp 1967 o - Jun 3 12 1 S +R Sp 1967 o - O 1 0 0 - +R Sp 1974 o - Jun 24 0 1 S +R Sp 1974 o - S 1 0 0 - +R Sp 1976 1977 - May 1 0 1 S +R Sp 1976 o - Au 1 0 0 - +R Sp 1977 o - S 28 0 0 - +R Sp 1978 o - Jun 1 0 1 S +R Sp 1978 o - Au 4 0 0 - +Z Europe/Madrid -0:14:44 - LMT 1900 D 31 23:45:16 +0 s WE%sT 1940 Mar 16 23 +1 s CE%sT 1979 +1 E CE%sT +Z Africa/Ceuta -0:21:16 - LMT 1900 D 31 23:38:44 +0 - WET 1918 May 6 23 +0 1 WEST 1918 O 7 23 +0 - WET 1924 +0 s WE%sT 1929 +0 - WET 1967 +0 Sp WE%sT 1984 Mar 16 +1 - CET 1986 +1 
E CE%sT +Z Atlantic/Canary -1:1:36 - LMT 1922 Mar +-1 - -01 1946 S 30 1 +0 - WET 1980 Ap 6 0s +0 1 WEST 1980 S 28 1u +0 E WE%sT +Z Europe/Stockholm 1:12:12 - LMT 1879 +1:0:14 - SET 1900 +1 - CET 1916 May 14 23 +1 1 CEST 1916 O 1 1 +1 - CET 1980 +1 E CE%sT +R CH 1941 1942 - May M>=1 1 1 S +R CH 1941 1942 - O M>=1 2 0 - +Z Europe/Zurich 0:34:8 - LMT 1853 Jul 16 +0:29:46 - BMT 1894 Jun +1 CH CE%sT 1981 +1 E CE%sT +R T 1916 o - May 1 0 1 S +R T 1916 o - O 1 0 0 - +R T 1920 o - Mar 28 0 1 S +R T 1920 o - O 25 0 0 - +R T 1921 o - Ap 3 0 1 S +R T 1921 o - O 3 0 0 - +R T 1922 o - Mar 26 0 1 S +R T 1922 o - O 8 0 0 - +R T 1924 o - May 13 0 1 S +R T 1924 1925 - O 1 0 0 - +R T 1925 o - May 1 0 1 S +R T 1940 o - Jul 1 0 1 S +R T 1940 o - O 6 0 0 - +R T 1940 o - D 1 0 1 S +R T 1941 o - S 21 0 0 - +R T 1942 o - Ap 1 0 1 S +R T 1945 o - O 8 0 0 - +R T 1946 o - Jun 1 0 1 S +R T 1946 o - O 1 0 0 - +R T 1947 1948 - Ap Su>=16 0 1 S +R T 1947 1951 - O Su>=2 0 0 - +R T 1949 o - Ap 10 0 1 S +R T 1950 o - Ap 16 0 1 S +R T 1951 o - Ap 22 0 1 S +R T 1962 o - Jul 15 0 1 S +R T 1963 o - O 30 0 0 - +R T 1964 o - May 15 0 1 S +R T 1964 o - O 1 0 0 - +R T 1973 o - Jun 3 1 1 S +R T 1973 1976 - O Su>=31 2 0 - +R T 1974 o - Mar 31 2 1 S +R T 1975 o - Mar 22 2 1 S +R T 1976 o - Mar 21 2 1 S +R T 1977 1978 - Ap Su>=1 2 1 S +R T 1977 1978 - O Su>=15 2 0 - +R T 1978 o - Jun 29 0 0 - +R T 1983 o - Jul 31 2 1 S +R T 1983 o - O 2 2 0 - +R T 1985 o - Ap 20 1s 1 S +R T 1985 o - S 28 1s 0 - +R T 1986 1993 - Mar lastSu 1s 1 S +R T 1986 1995 - S lastSu 1s 0 - +R T 1994 o - Mar 20 1s 1 S +R T 1995 2006 - Mar lastSu 1s 1 S +R T 1996 2006 - O lastSu 1s 0 - +Z Europe/Istanbul 1:55:52 - LMT 1880 +1:56:56 - IMT 1910 O +2 T EE%sT 1978 Jun 29 +3 T +03/+04 1984 N 1 2 +2 T EE%sT 2007 +2 E EE%sT 2011 Mar 27 1u +2 - EET 2011 Mar 28 1u +2 E EE%sT 2014 Mar 30 1u +2 - EET 2014 Mar 31 1u +2 E EE%sT 2015 O 25 1u +2 1 EEST 2015 N 8 1u +2 E EE%sT 2016 S 7 +3 - +03 +L Europe/Istanbul Asia/Istanbul +Z Europe/Kiev 2:2:4 - LMT 1880 +2:2:4 - KMT 1924 May 2 +2 - EET 1930 Jun 21 +3 - MSK 1941 S 20 +1 c CE%sT 1943 N 6 +3 R MSK/MSD 1990 Jul 1 2 +2 1 EEST 1991 S 29 3 +2 e EE%sT 1995 +2 E EE%sT +Z Europe/Uzhgorod 1:29:12 - LMT 1890 O +1 - CET 1940 +1 c CE%sT 1944 O +1 1 CEST 1944 O 26 +1 - CET 1945 Jun 29 +3 R MSK/MSD 1990 +3 - MSK 1990 Jul 1 2 +1 - CET 1991 Mar 31 3 +2 - EET 1992 +2 e EE%sT 1995 +2 E EE%sT +Z Europe/Zaporozhye 2:20:40 - LMT 1880 +2:20 - +0220 1924 May 2 +2 - EET 1930 Jun 21 +3 - MSK 1941 Au 25 +1 c CE%sT 1943 O 25 +3 R MSK/MSD 1991 Mar 31 2 +2 e EE%sT 1995 +2 E EE%sT +R u 1918 1919 - Mar lastSu 2 1 D +R u 1918 1919 - O lastSu 2 0 S +R u 1942 o - F 9 2 1 W +R u 1945 o - Au 14 23u 1 P +R u 1945 o - S 30 2 0 S +R u 1967 2006 - O lastSu 2 0 S +R u 1967 1973 - Ap lastSu 2 1 D +R u 1974 o - Ja 6 2 1 D +R u 1975 o - F lastSu 2 1 D +R u 1976 1986 - Ap lastSu 2 1 D +R u 1987 2006 - Ap Su>=1 2 1 D +R u 2007 ma - Mar Su>=8 2 1 D +R u 2007 ma - N Su>=1 2 0 S +Z EST -5 - EST +Z MST -7 - MST +Z HST -10 - HST +Z EST5EDT -5 u E%sT +Z CST6CDT -6 u C%sT +Z MST7MDT -7 u M%sT +Z PST8PDT -8 u P%sT +R NY 1920 o - Mar lastSu 2 1 D +R NY 1920 o - O lastSu 2 0 S +R NY 1921 1966 - Ap lastSu 2 1 D +R NY 1921 1954 - S lastSu 2 0 S +R NY 1955 1966 - O lastSu 2 0 S +Z America/New_York -4:56:2 - LMT 1883 N 18 12:3:58 +-5 u E%sT 1920 +-5 NY E%sT 1942 +-5 u E%sT 1946 +-5 NY E%sT 1967 +-5 u E%sT +R Ch 1920 o - Jun 13 2 1 D +R Ch 1920 1921 - O lastSu 2 0 S +R Ch 1921 o - Mar lastSu 2 1 D +R Ch 1922 1966 - Ap lastSu 2 1 D +R Ch 1922 1954 - S lastSu 2 0 S +R Ch 1955 1966 - O lastSu 
2 0 S +Z America/Chicago -5:50:36 - LMT 1883 N 18 12:9:24 +-6 u C%sT 1920 +-6 Ch C%sT 1936 Mar 1 2 +-5 - EST 1936 N 15 2 +-6 Ch C%sT 1942 +-6 u C%sT 1946 +-6 Ch C%sT 1967 +-6 u C%sT +Z America/North_Dakota/Center -6:45:12 - LMT 1883 N 18 12:14:48 +-7 u M%sT 1992 O 25 2 +-6 u C%sT +Z America/North_Dakota/New_Salem -6:45:39 - LMT 1883 N 18 12:14:21 +-7 u M%sT 2003 O 26 2 +-6 u C%sT +Z America/North_Dakota/Beulah -6:47:7 - LMT 1883 N 18 12:12:53 +-7 u M%sT 2010 N 7 2 +-6 u C%sT +R De 1920 1921 - Mar lastSu 2 1 D +R De 1920 o - O lastSu 2 0 S +R De 1921 o - May 22 2 0 S +R De 1965 1966 - Ap lastSu 2 1 D +R De 1965 1966 - O lastSu 2 0 S +Z America/Denver -6:59:56 - LMT 1883 N 18 12:0:4 +-7 u M%sT 1920 +-7 De M%sT 1942 +-7 u M%sT 1946 +-7 De M%sT 1967 +-7 u M%sT +R CA 1948 o - Mar 14 2:1 1 D +R CA 1949 o - Ja 1 2 0 S +R CA 1950 1966 - Ap lastSu 1 1 D +R CA 1950 1961 - S lastSu 2 0 S +R CA 1962 1966 - O lastSu 2 0 S +Z America/Los_Angeles -7:52:58 - LMT 1883 N 18 12:7:2 +-8 u P%sT 1946 +-8 CA P%sT 1967 +-8 u P%sT +Z America/Juneau 15:2:19 - LMT 1867 O 19 15:33:32 +-8:57:41 - LMT 1900 Au 20 12 +-8 - PST 1942 +-8 u P%sT 1946 +-8 - PST 1969 +-8 u P%sT 1980 Ap 27 2 +-9 u Y%sT 1980 O 26 2 +-8 u P%sT 1983 O 30 2 +-9 u Y%sT 1983 N 30 +-9 u AK%sT +Z America/Sitka 14:58:47 - LMT 1867 O 19 15:30 +-9:1:13 - LMT 1900 Au 20 12 +-8 - PST 1942 +-8 u P%sT 1946 +-8 - PST 1969 +-8 u P%sT 1983 O 30 2 +-9 u Y%sT 1983 N 30 +-9 u AK%sT +Z America/Metlakatla 15:13:42 - LMT 1867 O 19 15:44:55 +-8:46:18 - LMT 1900 Au 20 12 +-8 - PST 1942 +-8 u P%sT 1946 +-8 - PST 1969 +-8 u P%sT 1983 O 30 2 +-8 - PST 2015 N 1 2 +-9 u AK%sT 2018 N 4 2 +-8 - PST 2019 Ja 20 2 +-9 u AK%sT +Z America/Yakutat 14:41:5 - LMT 1867 O 19 15:12:18 +-9:18:55 - LMT 1900 Au 20 12 +-9 - YST 1942 +-9 u Y%sT 1946 +-9 - YST 1969 +-9 u Y%sT 1983 N 30 +-9 u AK%sT +Z America/Anchorage 14:0:24 - LMT 1867 O 19 14:31:37 +-9:59:36 - LMT 1900 Au 20 12 +-10 - AST 1942 +-10 u A%sT 1967 Ap +-10 - AHST 1969 +-10 u AH%sT 1983 O 30 2 +-9 u Y%sT 1983 N 30 +-9 u AK%sT +Z America/Nome 12:58:22 - LMT 1867 O 19 13:29:35 +-11:1:38 - LMT 1900 Au 20 12 +-11 - NST 1942 +-11 u N%sT 1946 +-11 - NST 1967 Ap +-11 - BST 1969 +-11 u B%sT 1983 O 30 2 +-9 u Y%sT 1983 N 30 +-9 u AK%sT +Z America/Adak 12:13:22 - LMT 1867 O 19 12:44:35 +-11:46:38 - LMT 1900 Au 20 12 +-11 - NST 1942 +-11 u N%sT 1946 +-11 - NST 1967 Ap +-11 - BST 1969 +-11 u B%sT 1983 O 30 2 +-10 u AH%sT 1983 N 30 +-10 u H%sT +Z Pacific/Honolulu -10:31:26 - LMT 1896 Ja 13 12 +-10:30 - HST 1933 Ap 30 2 +-10:30 1 HDT 1933 May 21 12 +-10:30 u H%sT 1947 Jun 8 2 +-10 - HST +Z America/Phoenix -7:28:18 - LMT 1883 N 18 11:31:42 +-7 u M%sT 1944 Ja 1 0:1 +-7 - MST 1944 Ap 1 0:1 +-7 u M%sT 1944 O 1 0:1 +-7 - MST 1967 +-7 u M%sT 1968 Mar 21 +-7 - MST +Z America/Boise -7:44:49 - LMT 1883 N 18 12:15:11 +-8 u P%sT 1923 May 13 2 +-7 u M%sT 1974 +-7 - MST 1974 F 3 2 +-7 u M%sT +R In 1941 o - Jun 22 2 1 D +R In 1941 1954 - S lastSu 2 0 S +R In 1946 1954 - Ap lastSu 2 1 D +Z America/Indiana/Indianapolis -5:44:38 - LMT 1883 N 18 12:15:22 +-6 u C%sT 1920 +-6 In C%sT 1942 +-6 u C%sT 1946 +-6 In C%sT 1955 Ap 24 2 +-5 - EST 1957 S 29 2 +-6 - CST 1958 Ap 27 2 +-5 - EST 1969 +-5 u E%sT 1971 +-5 - EST 2006 +-5 u E%sT +R Ma 1951 o - Ap lastSu 2 1 D +R Ma 1951 o - S lastSu 2 0 S +R Ma 1954 1960 - Ap lastSu 2 1 D +R Ma 1954 1960 - S lastSu 2 0 S +Z America/Indiana/Marengo -5:45:23 - LMT 1883 N 18 12:14:37 +-6 u C%sT 1951 +-6 Ma C%sT 1961 Ap 30 2 +-5 - EST 1969 +-5 u E%sT 1974 Ja 6 2 +-6 1 CDT 1974 O 27 2 +-5 u E%sT 1976 +-5 - EST 2006 +-5 u E%sT +R V 
1946 o - Ap lastSu 2 1 D +R V 1946 o - S lastSu 2 0 S +R V 1953 1954 - Ap lastSu 2 1 D +R V 1953 1959 - S lastSu 2 0 S +R V 1955 o - May 1 0 1 D +R V 1956 1963 - Ap lastSu 2 1 D +R V 1960 o - O lastSu 2 0 S +R V 1961 o - S lastSu 2 0 S +R V 1962 1963 - O lastSu 2 0 S +Z America/Indiana/Vincennes -5:50:7 - LMT 1883 N 18 12:9:53 +-6 u C%sT 1946 +-6 V C%sT 1964 Ap 26 2 +-5 - EST 1969 +-5 u E%sT 1971 +-5 - EST 2006 Ap 2 2 +-6 u C%sT 2007 N 4 2 +-5 u E%sT +R Pe 1955 o - May 1 0 1 D +R Pe 1955 1960 - S lastSu 2 0 S +R Pe 1956 1963 - Ap lastSu 2 1 D +R Pe 1961 1963 - O lastSu 2 0 S +Z America/Indiana/Tell_City -5:47:3 - LMT 1883 N 18 12:12:57 +-6 u C%sT 1946 +-6 Pe C%sT 1964 Ap 26 2 +-5 - EST 1967 O 29 2 +-6 u C%sT 1969 Ap 27 2 +-5 u E%sT 1971 +-5 - EST 2006 Ap 2 2 +-6 u C%sT +R Pi 1955 o - May 1 0 1 D +R Pi 1955 1960 - S lastSu 2 0 S +R Pi 1956 1964 - Ap lastSu 2 1 D +R Pi 1961 1964 - O lastSu 2 0 S +Z America/Indiana/Petersburg -5:49:7 - LMT 1883 N 18 12:10:53 +-6 u C%sT 1955 +-6 Pi C%sT 1965 Ap 25 2 +-5 - EST 1966 O 30 2 +-6 u C%sT 1977 O 30 2 +-5 - EST 2006 Ap 2 2 +-6 u C%sT 2007 N 4 2 +-5 u E%sT +R St 1947 1961 - Ap lastSu 2 1 D +R St 1947 1954 - S lastSu 2 0 S +R St 1955 1956 - O lastSu 2 0 S +R St 1957 1958 - S lastSu 2 0 S +R St 1959 1961 - O lastSu 2 0 S +Z America/Indiana/Knox -5:46:30 - LMT 1883 N 18 12:13:30 +-6 u C%sT 1947 +-6 St C%sT 1962 Ap 29 2 +-5 - EST 1963 O 27 2 +-6 u C%sT 1991 O 27 2 +-5 - EST 2006 Ap 2 2 +-6 u C%sT +R Pu 1946 1960 - Ap lastSu 2 1 D +R Pu 1946 1954 - S lastSu 2 0 S +R Pu 1955 1956 - O lastSu 2 0 S +R Pu 1957 1960 - S lastSu 2 0 S +Z America/Indiana/Winamac -5:46:25 - LMT 1883 N 18 12:13:35 +-6 u C%sT 1946 +-6 Pu C%sT 1961 Ap 30 2 +-5 - EST 1969 +-5 u E%sT 1971 +-5 - EST 2006 Ap 2 2 +-6 u C%sT 2007 Mar 11 2 +-5 u E%sT +Z America/Indiana/Vevay -5:40:16 - LMT 1883 N 18 12:19:44 +-6 u C%sT 1954 Ap 25 2 +-5 - EST 1969 +-5 u E%sT 1973 +-5 - EST 2006 +-5 u E%sT +R v 1921 o - May 1 2 1 D +R v 1921 o - S 1 2 0 S +R v 1941 o - Ap lastSu 2 1 D +R v 1941 o - S lastSu 2 0 S +R v 1946 o - Ap lastSu 0:1 1 D +R v 1946 o - Jun 2 2 0 S +R v 1950 1961 - Ap lastSu 2 1 D +R v 1950 1955 - S lastSu 2 0 S +R v 1956 1961 - O lastSu 2 0 S +Z America/Kentucky/Louisville -5:43:2 - LMT 1883 N 18 12:16:58 +-6 u C%sT 1921 +-6 v C%sT 1942 +-6 u C%sT 1946 +-6 v C%sT 1961 Jul 23 2 +-5 - EST 1968 +-5 u E%sT 1974 Ja 6 2 +-6 1 CDT 1974 O 27 2 +-5 u E%sT +Z America/Kentucky/Monticello -5:39:24 - LMT 1883 N 18 12:20:36 +-6 u C%sT 1946 +-6 - CST 1968 +-6 u C%sT 2000 O 29 2 +-5 u E%sT +R Dt 1948 o - Ap lastSu 2 1 D +R Dt 1948 o - S lastSu 2 0 S +Z America/Detroit -5:32:11 - LMT 1905 +-6 - CST 1915 May 15 2 +-5 - EST 1942 +-5 u E%sT 1946 +-5 Dt E%sT 1967 Jun 14 0:1 +-5 u E%sT 1969 +-5 - EST 1973 +-5 u E%sT 1975 +-5 - EST 1975 Ap 27 2 +-5 u E%sT +R Me 1946 o - Ap lastSu 2 1 D +R Me 1946 o - S lastSu 2 0 S +R Me 1966 o - Ap lastSu 2 1 D +R Me 1966 o - O lastSu 2 0 S +Z America/Menominee -5:50:27 - LMT 1885 S 18 12 +-6 u C%sT 1946 +-6 Me C%sT 1969 Ap 27 2 +-5 - EST 1973 Ap 29 2 +-6 u C%sT +R C 1918 o - Ap 14 2 1 D +R C 1918 o - O 27 2 0 S +R C 1942 o - F 9 2 1 W +R C 1945 o - Au 14 23u 1 P +R C 1945 o - S 30 2 0 S +R C 1974 1986 - Ap lastSu 2 1 D +R C 1974 2006 - O lastSu 2 0 S +R C 1987 2006 - Ap Su>=1 2 1 D +R C 2007 ma - Mar Su>=8 2 1 D +R C 2007 ma - N Su>=1 2 0 S +R j 1917 o - Ap 8 2 1 D +R j 1917 o - S 17 2 0 S +R j 1919 o - May 5 23 1 D +R j 1919 o - Au 12 23 0 S +R j 1920 1935 - May Su>=1 23 1 D +R j 1920 1935 - O lastSu 23 0 S +R j 1936 1941 - May M>=9 0 1 D +R j 1936 1941 - O M>=2 0 0 S +R j 
1946 1950 - May Su>=8 2 1 D +R j 1946 1950 - O Su>=2 2 0 S +R j 1951 1986 - Ap lastSu 2 1 D +R j 1951 1959 - S lastSu 2 0 S +R j 1960 1986 - O lastSu 2 0 S +R j 1987 o - Ap Su>=1 0:1 1 D +R j 1987 2006 - O lastSu 0:1 0 S +R j 1988 o - Ap Su>=1 0:1 2 DD +R j 1989 2006 - Ap Su>=1 0:1 1 D +R j 2007 2011 - Mar Su>=8 0:1 1 D +R j 2007 2010 - N Su>=1 0:1 0 S +Z America/St_Johns -3:30:52 - LMT 1884 +-3:30:52 j N%sT 1918 +-3:30:52 C N%sT 1919 +-3:30:52 j N%sT 1935 Mar 30 +-3:30 j N%sT 1942 May 11 +-3:30 C N%sT 1946 +-3:30 j N%sT 2011 N +-3:30 C N%sT +Z America/Goose_Bay -4:1:40 - LMT 1884 +-3:30:52 - NST 1918 +-3:30:52 C N%sT 1919 +-3:30:52 - NST 1935 Mar 30 +-3:30 - NST 1936 +-3:30 j N%sT 1942 May 11 +-3:30 C N%sT 1946 +-3:30 j N%sT 1966 Mar 15 2 +-4 j A%sT 2011 N +-4 C A%sT +R H 1916 o - Ap 1 0 1 D +R H 1916 o - O 1 0 0 S +R H 1920 o - May 9 0 1 D +R H 1920 o - Au 29 0 0 S +R H 1921 o - May 6 0 1 D +R H 1921 1922 - S 5 0 0 S +R H 1922 o - Ap 30 0 1 D +R H 1923 1925 - May Su>=1 0 1 D +R H 1923 o - S 4 0 0 S +R H 1924 o - S 15 0 0 S +R H 1925 o - S 28 0 0 S +R H 1926 o - May 16 0 1 D +R H 1926 o - S 13 0 0 S +R H 1927 o - May 1 0 1 D +R H 1927 o - S 26 0 0 S +R H 1928 1931 - May Su>=8 0 1 D +R H 1928 o - S 9 0 0 S +R H 1929 o - S 3 0 0 S +R H 1930 o - S 15 0 0 S +R H 1931 1932 - S M>=24 0 0 S +R H 1932 o - May 1 0 1 D +R H 1933 o - Ap 30 0 1 D +R H 1933 o - O 2 0 0 S +R H 1934 o - May 20 0 1 D +R H 1934 o - S 16 0 0 S +R H 1935 o - Jun 2 0 1 D +R H 1935 o - S 30 0 0 S +R H 1936 o - Jun 1 0 1 D +R H 1936 o - S 14 0 0 S +R H 1937 1938 - May Su>=1 0 1 D +R H 1937 1941 - S M>=24 0 0 S +R H 1939 o - May 28 0 1 D +R H 1940 1941 - May Su>=1 0 1 D +R H 1946 1949 - Ap lastSu 2 1 D +R H 1946 1949 - S lastSu 2 0 S +R H 1951 1954 - Ap lastSu 2 1 D +R H 1951 1954 - S lastSu 2 0 S +R H 1956 1959 - Ap lastSu 2 1 D +R H 1956 1959 - S lastSu 2 0 S +R H 1962 1973 - Ap lastSu 2 1 D +R H 1962 1973 - O lastSu 2 0 S +Z America/Halifax -4:14:24 - LMT 1902 Jun 15 +-4 H A%sT 1918 +-4 C A%sT 1919 +-4 H A%sT 1942 F 9 2s +-4 C A%sT 1946 +-4 H A%sT 1974 +-4 C A%sT +Z America/Glace_Bay -3:59:48 - LMT 1902 Jun 15 +-4 C A%sT 1953 +-4 H A%sT 1954 +-4 - AST 1972 +-4 H A%sT 1974 +-4 C A%sT +R o 1933 1935 - Jun Su>=8 1 1 D +R o 1933 1935 - S Su>=8 1 0 S +R o 1936 1938 - Jun Su>=1 1 1 D +R o 1936 1938 - S Su>=1 1 0 S +R o 1939 o - May 27 1 1 D +R o 1939 1941 - S Sa>=21 1 0 S +R o 1940 o - May 19 1 1 D +R o 1941 o - May 4 1 1 D +R o 1946 1972 - Ap lastSu 2 1 D +R o 1946 1956 - S lastSu 2 0 S +R o 1957 1972 - O lastSu 2 0 S +R o 1993 2006 - Ap Su>=1 0:1 1 D +R o 1993 2006 - O lastSu 0:1 0 S +Z America/Moncton -4:19:8 - LMT 1883 D 9 +-5 - EST 1902 Jun 15 +-4 C A%sT 1933 +-4 o A%sT 1942 +-4 C A%sT 1946 +-4 o A%sT 1973 +-4 C A%sT 1993 +-4 o A%sT 2007 +-4 C A%sT +Z America/Blanc-Sablon -3:48:28 - LMT 1884 +-4 C A%sT 1970 +-4 - AST +R t 1919 o - Mar 30 23:30 1 D +R t 1919 o - O 26 0 0 S +R t 1920 o - May 2 2 1 D +R t 1920 o - S 26 0 0 S +R t 1921 o - May 15 2 1 D +R t 1921 o - S 15 2 0 S +R t 1922 1923 - May Su>=8 2 1 D +R t 1922 1926 - S Su>=15 2 0 S +R t 1924 1927 - May Su>=1 2 1 D +R t 1927 1937 - S Su>=25 2 0 S +R t 1928 1937 - Ap Su>=25 2 1 D +R t 1938 1940 - Ap lastSu 2 1 D +R t 1938 1939 - S lastSu 2 0 S +R t 1945 1946 - S lastSu 2 0 S +R t 1946 o - Ap lastSu 2 1 D +R t 1947 1949 - Ap lastSu 0 1 D +R t 1947 1948 - S lastSu 0 0 S +R t 1949 o - N lastSu 0 0 S +R t 1950 1973 - Ap lastSu 2 1 D +R t 1950 o - N lastSu 2 0 S +R t 1951 1956 - S lastSu 2 0 S +R t 1957 1973 - O lastSu 2 0 S +Z America/Toronto -5:17:32 - LMT 1895 +-5 C E%sT 
1919 +-5 t E%sT 1942 F 9 2s +-5 C E%sT 1946 +-5 t E%sT 1974 +-5 C E%sT +Z America/Thunder_Bay -5:57 - LMT 1895 +-6 - CST 1910 +-5 - EST 1942 +-5 C E%sT 1970 +-5 t E%sT 1973 +-5 - EST 1974 +-5 C E%sT +Z America/Nipigon -5:53:4 - LMT 1895 +-5 C E%sT 1940 S 29 +-5 1 EDT 1942 F 9 2s +-5 C E%sT +Z America/Rainy_River -6:18:16 - LMT 1895 +-6 C C%sT 1940 S 29 +-6 1 CDT 1942 F 9 2s +-6 C C%sT +Z America/Atikokan -6:6:28 - LMT 1895 +-6 C C%sT 1940 S 29 +-6 1 CDT 1942 F 9 2s +-6 C C%sT 1945 S 30 2 +-5 - EST +R W 1916 o - Ap 23 0 1 D +R W 1916 o - S 17 0 0 S +R W 1918 o - Ap 14 2 1 D +R W 1918 o - O 27 2 0 S +R W 1937 o - May 16 2 1 D +R W 1937 o - S 26 2 0 S +R W 1942 o - F 9 2 1 W +R W 1945 o - Au 14 23u 1 P +R W 1945 o - S lastSu 2 0 S +R W 1946 o - May 12 2 1 D +R W 1946 o - O 13 2 0 S +R W 1947 1949 - Ap lastSu 2 1 D +R W 1947 1949 - S lastSu 2 0 S +R W 1950 o - May 1 2 1 D +R W 1950 o - S 30 2 0 S +R W 1951 1960 - Ap lastSu 2 1 D +R W 1951 1958 - S lastSu 2 0 S +R W 1959 o - O lastSu 2 0 S +R W 1960 o - S lastSu 2 0 S +R W 1963 o - Ap lastSu 2 1 D +R W 1963 o - S 22 2 0 S +R W 1966 1986 - Ap lastSu 2s 1 D +R W 1966 2005 - O lastSu 2s 0 S +R W 1987 2005 - Ap Su>=1 2s 1 D +Z America/Winnipeg -6:28:36 - LMT 1887 Jul 16 +-6 W C%sT 2006 +-6 C C%sT +R r 1918 o - Ap 14 2 1 D +R r 1918 o - O 27 2 0 S +R r 1930 1934 - May Su>=1 0 1 D +R r 1930 1934 - O Su>=1 0 0 S +R r 1937 1941 - Ap Su>=8 0 1 D +R r 1937 o - O Su>=8 0 0 S +R r 1938 o - O Su>=1 0 0 S +R r 1939 1941 - O Su>=8 0 0 S +R r 1942 o - F 9 2 1 W +R r 1945 o - Au 14 23u 1 P +R r 1945 o - S lastSu 2 0 S +R r 1946 o - Ap Su>=8 2 1 D +R r 1946 o - O Su>=8 2 0 S +R r 1947 1957 - Ap lastSu 2 1 D +R r 1947 1957 - S lastSu 2 0 S +R r 1959 o - Ap lastSu 2 1 D +R r 1959 o - O lastSu 2 0 S +R Sw 1957 o - Ap lastSu 2 1 D +R Sw 1957 o - O lastSu 2 0 S +R Sw 1959 1961 - Ap lastSu 2 1 D +R Sw 1959 o - O lastSu 2 0 S +R Sw 1960 1961 - S lastSu 2 0 S +Z America/Regina -6:58:36 - LMT 1905 S +-7 r M%sT 1960 Ap lastSu 2 +-6 - CST +Z America/Swift_Current -7:11:20 - LMT 1905 S +-7 C M%sT 1946 Ap lastSu 2 +-7 r M%sT 1950 +-7 Sw M%sT 1972 Ap lastSu 2 +-6 - CST +R Ed 1918 1919 - Ap Su>=8 2 1 D +R Ed 1918 o - O 27 2 0 S +R Ed 1919 o - May 27 2 0 S +R Ed 1920 1923 - Ap lastSu 2 1 D +R Ed 1920 o - O lastSu 2 0 S +R Ed 1921 1923 - S lastSu 2 0 S +R Ed 1942 o - F 9 2 1 W +R Ed 1945 o - Au 14 23u 1 P +R Ed 1945 o - S lastSu 2 0 S +R Ed 1947 o - Ap lastSu 2 1 D +R Ed 1947 o - S lastSu 2 0 S +R Ed 1972 1986 - Ap lastSu 2 1 D +R Ed 1972 2006 - O lastSu 2 0 S +Z America/Edmonton -7:33:52 - LMT 1906 S +-7 Ed M%sT 1987 +-7 C M%sT +R Va 1918 o - Ap 14 2 1 D +R Va 1918 o - O 27 2 0 S +R Va 1942 o - F 9 2 1 W +R Va 1945 o - Au 14 23u 1 P +R Va 1945 o - S 30 2 0 S +R Va 1946 1986 - Ap lastSu 2 1 D +R Va 1946 o - S 29 2 0 S +R Va 1947 1961 - S lastSu 2 0 S +R Va 1962 2006 - O lastSu 2 0 S +Z America/Vancouver -8:12:28 - LMT 1884 +-8 Va P%sT 1987 +-8 C P%sT +Z America/Dawson_Creek -8:0:56 - LMT 1884 +-8 C P%sT 1947 +-8 Va P%sT 1972 Au 30 2 +-7 - MST +Z America/Fort_Nelson -8:10:47 - LMT 1884 +-8 Va P%sT 1946 +-8 - PST 1947 +-8 Va P%sT 1987 +-8 C P%sT 2015 Mar 8 2 +-7 - MST +Z America/Creston -7:46:4 - LMT 1884 +-7 - MST 1916 O +-8 - PST 1918 Jun 2 +-7 - MST +R Y 1918 o - Ap 14 2 1 D +R Y 1918 o - O 27 2 0 S +R Y 1919 o - May 25 2 1 D +R Y 1919 o - N 1 0 0 S +R Y 1942 o - F 9 2 1 W +R Y 1945 o - Au 14 23u 1 P +R Y 1945 o - S 30 2 0 S +R Y 1965 o - Ap lastSu 0 2 DD +R Y 1965 o - O lastSu 2 0 S +R Y 1980 1986 - Ap lastSu 2 1 D +R Y 1980 2006 - O lastSu 2 0 S +R Y 1987 2006 - Ap Su>=1 2 1 
D +Z America/Pangnirtung 0 - -00 1921 +-4 Y A%sT 1995 Ap Su>=1 2 +-5 C E%sT 1999 O 31 2 +-6 C C%sT 2000 O 29 2 +-5 C E%sT +Z America/Iqaluit 0 - -00 1942 Au +-5 Y E%sT 1999 O 31 2 +-6 C C%sT 2000 O 29 2 +-5 C E%sT +Z America/Resolute 0 - -00 1947 Au 31 +-6 Y C%sT 2000 O 29 2 +-5 - EST 2001 Ap 1 3 +-6 C C%sT 2006 O 29 2 +-5 - EST 2007 Mar 11 3 +-6 C C%sT +Z America/Rankin_Inlet 0 - -00 1957 +-6 Y C%sT 2000 O 29 2 +-5 - EST 2001 Ap 1 3 +-6 C C%sT +Z America/Cambridge_Bay 0 - -00 1920 +-7 Y M%sT 1999 O 31 2 +-6 C C%sT 2000 O 29 2 +-5 - EST 2000 N 5 +-6 - CST 2001 Ap 1 3 +-7 C M%sT +Z America/Yellowknife 0 - -00 1935 +-7 Y M%sT 1980 +-7 C M%sT +Z America/Inuvik 0 - -00 1953 +-8 Y P%sT 1979 Ap lastSu 2 +-7 Y M%sT 1980 +-7 C M%sT +Z America/Whitehorse -9:0:12 - LMT 1900 Au 20 +-9 Y Y%sT 1967 May 28 +-8 Y P%sT 1980 +-8 C P%sT +Z America/Dawson -9:17:40 - LMT 1900 Au 20 +-9 Y Y%sT 1973 O 28 +-8 Y P%sT 1980 +-8 C P%sT +R m 1939 o - F 5 0 1 D +R m 1939 o - Jun 25 0 0 S +R m 1940 o - D 9 0 1 D +R m 1941 o - Ap 1 0 0 S +R m 1943 o - D 16 0 1 W +R m 1944 o - May 1 0 0 S +R m 1950 o - F 12 0 1 D +R m 1950 o - Jul 30 0 0 S +R m 1996 2000 - Ap Su>=1 2 1 D +R m 1996 2000 - O lastSu 2 0 S +R m 2001 o - May Su>=1 2 1 D +R m 2001 o - S lastSu 2 0 S +R m 2002 ma - Ap Su>=1 2 1 D +R m 2002 ma - O lastSu 2 0 S +Z America/Cancun -5:47:4 - LMT 1922 Ja 1 0:12:56 +-6 - CST 1981 D 23 +-5 m E%sT 1998 Au 2 2 +-6 m C%sT 2015 F 1 2 +-5 - EST +Z America/Merida -5:58:28 - LMT 1922 Ja 1 0:1:32 +-6 - CST 1981 D 23 +-5 - EST 1982 D 2 +-6 m C%sT +Z America/Matamoros -6:40 - LMT 1921 D 31 23:20 +-6 - CST 1988 +-6 u C%sT 1989 +-6 m C%sT 2010 +-6 u C%sT +Z America/Monterrey -6:41:16 - LMT 1921 D 31 23:18:44 +-6 - CST 1988 +-6 u C%sT 1989 +-6 m C%sT +Z America/Mexico_City -6:36:36 - LMT 1922 Ja 1 0:23:24 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 m C%sT 2001 S 30 2 +-6 - CST 2002 F 20 +-6 m C%sT +Z America/Ojinaga -6:57:40 - LMT 1922 Ja 1 0:2:20 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 - CST 1996 +-6 m C%sT 1998 +-6 - CST 1998 Ap Su>=1 3 +-7 m M%sT 2010 +-7 u M%sT +Z America/Chihuahua -7:4:20 - LMT 1921 D 31 23:55:40 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 - CST 1996 +-6 m C%sT 1998 +-6 - CST 1998 Ap Su>=1 3 +-7 m M%sT +Z America/Hermosillo -7:23:52 - LMT 1921 D 31 23:36:8 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 - CST 1942 Ap 24 +-7 - MST 1949 Ja 14 +-8 - PST 1970 +-7 m M%sT 1999 +-7 - MST +Z America/Mazatlan -7:5:40 - LMT 1921 D 31 23:54:20 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 - CST 1942 Ap 24 +-7 - MST 1949 Ja 14 +-8 - PST 1970 +-7 m M%sT +Z America/Bahia_Banderas -7:1 - LMT 1921 D 31 23:59 +-7 - MST 1927 Jun 10 23 +-6 - CST 1930 N 15 +-7 - MST 1931 May 1 23 +-6 - CST 1931 O +-7 - MST 1932 Ap +-6 - CST 1942 Ap 24 +-7 - MST 1949 Ja 14 +-8 - PST 1970 +-7 m M%sT 2010 Ap 4 2 +-6 m C%sT +Z America/Tijuana -7:48:4 - LMT 1922 Ja 1 0:11:56 +-7 - MST 1924 +-8 - PST 1927 Jun 10 23 +-7 - MST 1930 N 15 +-8 - PST 1931 Ap +-8 1 PDT 1931 S 30 +-8 - PST 1942 Ap 24 +-8 1 PWT 1945 Au 14 23u +-8 1 PPT 1945 N 12 +-8 - PST 1948 Ap 5 +-8 1 PDT 1949 Ja 14 +-8 - PST 1954 +-8 CA P%sT 1961 +-8 - PST 1976 +-8 u P%sT 1996 +-8 m P%sT 2001 +-8 u P%sT 2002 F 20 +-8 m P%sT 2010 +-8 u P%sT +R BS 1964 1975 - O lastSu 2 0 S +R BS 
1964 1975 - Ap lastSu 2 1 D +Z America/Nassau -5:9:30 - LMT 1912 Mar 2 +-5 BS E%sT 1976 +-5 u E%sT +R BB 1977 o - Jun 12 2 1 D +R BB 1977 1978 - O Su>=1 2 0 S +R BB 1978 1980 - Ap Su>=15 2 1 D +R BB 1979 o - S 30 2 0 S +R BB 1980 o - S 25 2 0 S +Z America/Barbados -3:58:29 - LMT 1924 +-3:58:29 - BMT 1932 +-4 BB A%sT +R BZ 1918 1942 - O Su>=2 0 0:30 -0530 +R BZ 1919 1943 - F Su>=9 0 0 CST +R BZ 1973 o - D 5 0 1 CDT +R BZ 1974 o - F 9 0 0 CST +R BZ 1982 o - D 18 0 1 CDT +R BZ 1983 o - F 12 0 0 CST +Z America/Belize -5:52:48 - LMT 1912 Ap +-6 BZ %s +Z Atlantic/Bermuda -4:19:18 - LMT 1930 Ja 1 2 +-4 - AST 1974 Ap 28 2 +-4 C A%sT 1976 +-4 u A%sT +R CR 1979 1980 - F lastSu 0 1 D +R CR 1979 1980 - Jun Su>=1 0 0 S +R CR 1991 1992 - Ja Sa>=15 0 1 D +R CR 1991 o - Jul 1 0 0 S +R CR 1992 o - Mar 15 0 0 S +Z America/Costa_Rica -5:36:13 - LMT 1890 +-5:36:13 - SJMT 1921 Ja 15 +-6 CR C%sT +R Q 1928 o - Jun 10 0 1 D +R Q 1928 o - O 10 0 0 S +R Q 1940 1942 - Jun Su>=1 0 1 D +R Q 1940 1942 - S Su>=1 0 0 S +R Q 1945 1946 - Jun Su>=1 0 1 D +R Q 1945 1946 - S Su>=1 0 0 S +R Q 1965 o - Jun 1 0 1 D +R Q 1965 o - S 30 0 0 S +R Q 1966 o - May 29 0 1 D +R Q 1966 o - O 2 0 0 S +R Q 1967 o - Ap 8 0 1 D +R Q 1967 1968 - S Su>=8 0 0 S +R Q 1968 o - Ap 14 0 1 D +R Q 1969 1977 - Ap lastSu 0 1 D +R Q 1969 1971 - O lastSu 0 0 S +R Q 1972 1974 - O 8 0 0 S +R Q 1975 1977 - O lastSu 0 0 S +R Q 1978 o - May 7 0 1 D +R Q 1978 1990 - O Su>=8 0 0 S +R Q 1979 1980 - Mar Su>=15 0 1 D +R Q 1981 1985 - May Su>=5 0 1 D +R Q 1986 1989 - Mar Su>=14 0 1 D +R Q 1990 1997 - Ap Su>=1 0 1 D +R Q 1991 1995 - O Su>=8 0s 0 S +R Q 1996 o - O 6 0s 0 S +R Q 1997 o - O 12 0s 0 S +R Q 1998 1999 - Mar lastSu 0s 1 D +R Q 1998 2003 - O lastSu 0s 0 S +R Q 2000 2003 - Ap Su>=1 0s 1 D +R Q 2004 o - Mar lastSu 0s 1 D +R Q 2006 2010 - O lastSu 0s 0 S +R Q 2007 o - Mar Su>=8 0s 1 D +R Q 2008 o - Mar Su>=15 0s 1 D +R Q 2009 2010 - Mar Su>=8 0s 1 D +R Q 2011 o - Mar Su>=15 0s 1 D +R Q 2011 o - N 13 0s 0 S +R Q 2012 o - Ap 1 0s 1 D +R Q 2012 ma - N Su>=1 0s 0 S +R Q 2013 ma - Mar Su>=8 0s 1 D +Z America/Havana -5:29:28 - LMT 1890 +-5:29:36 - HMT 1925 Jul 19 12 +-5 Q C%sT +R DO 1966 o - O 30 0 1 EDT +R DO 1967 o - F 28 0 0 EST +R DO 1969 1973 - O lastSu 0 0:30 -0430 +R DO 1970 o - F 21 0 0 EST +R DO 1971 o - Ja 20 0 0 EST +R DO 1972 1974 - Ja 21 0 0 EST +Z America/Santo_Domingo -4:39:36 - LMT 1890 +-4:40 - SDMT 1933 Ap 1 12 +-5 DO %s 1974 O 27 +-4 - AST 2000 O 29 2 +-5 u E%sT 2000 D 3 1 +-4 - AST +R SV 1987 1988 - May Su>=1 0 1 D +R SV 1987 1988 - S lastSu 0 0 S +Z America/El_Salvador -5:56:48 - LMT 1921 +-6 SV C%sT +R GT 1973 o - N 25 0 1 D +R GT 1974 o - F 24 0 0 S +R GT 1983 o - May 21 0 1 D +R GT 1983 o - S 22 0 0 S +R GT 1991 o - Mar 23 0 1 D +R GT 1991 o - S 7 0 0 S +R GT 2006 o - Ap 30 0 1 D +R GT 2006 o - O 1 0 0 S +Z America/Guatemala -6:2:4 - LMT 1918 O 5 +-6 GT C%sT +R HT 1983 o - May 8 0 1 D +R HT 1984 1987 - Ap lastSu 0 1 D +R HT 1983 1987 - O lastSu 0 0 S +R HT 1988 1997 - Ap Su>=1 1s 1 D +R HT 1988 1997 - O lastSu 1s 0 S +R HT 2005 2006 - Ap Su>=1 0 1 D +R HT 2005 2006 - O lastSu 0 0 S +R HT 2012 2015 - Mar Su>=8 2 1 D +R HT 2012 2015 - N Su>=1 2 0 S +R HT 2017 ma - Mar Su>=8 2 1 D +R HT 2017 ma - N Su>=1 2 0 S +Z America/Port-au-Prince -4:49:20 - LMT 1890 +-4:49 - PPMT 1917 Ja 24 12 +-5 HT E%sT +R HN 1987 1988 - May Su>=1 0 1 D +R HN 1987 1988 - S lastSu 0 0 S +R HN 2006 o - May Su>=1 0 1 D +R HN 2006 o - Au M>=1 0 0 S +Z America/Tegucigalpa -5:48:52 - LMT 1921 Ap +-6 HN C%sT +Z America/Jamaica -5:7:10 - LMT 1890 +-5:7:10 - KMT 1912 F +-5 - EST 
1974 +-5 u E%sT 1984 +-5 - EST +Z America/Martinique -4:4:20 - LMT 1890 +-4:4:20 - FFMT 1911 May +-4 - AST 1980 Ap 6 +-4 1 ADT 1980 S 28 +-4 - AST +R NI 1979 1980 - Mar Su>=16 0 1 D +R NI 1979 1980 - Jun M>=23 0 0 S +R NI 2005 o - Ap 10 0 1 D +R NI 2005 o - O Su>=1 0 0 S +R NI 2006 o - Ap 30 2 1 D +R NI 2006 o - O Su>=1 1 0 S +Z America/Managua -5:45:8 - LMT 1890 +-5:45:12 - MMT 1934 Jun 23 +-6 - CST 1973 May +-5 - EST 1975 F 16 +-6 NI C%sT 1992 Ja 1 4 +-5 - EST 1992 S 24 +-6 - CST 1993 +-5 - EST 1997 +-6 NI C%sT +Z America/Panama -5:18:8 - LMT 1890 +-5:19:36 - CMT 1908 Ap 22 +-5 - EST +L America/Panama America/Cayman +Z America/Puerto_Rico -4:24:25 - LMT 1899 Mar 28 12 +-4 - AST 1942 May 3 +-4 u A%sT 1946 +-4 - AST +Z America/Miquelon -3:44:40 - LMT 1911 May 15 +-4 - AST 1980 May +-3 - -03 1987 +-3 C -03/-02 +Z America/Grand_Turk -4:44:32 - LMT 1890 +-5:7:10 - KMT 1912 F +-5 - EST 1979 +-5 u E%sT 2015 N Su>=1 2 +-4 - AST 2018 Mar 11 3 +-5 u E%sT +R A 1930 o - D 1 0 1 - +R A 1931 o - Ap 1 0 0 - +R A 1931 o - O 15 0 1 - +R A 1932 1940 - Mar 1 0 0 - +R A 1932 1939 - N 1 0 1 - +R A 1940 o - Jul 1 0 1 - +R A 1941 o - Jun 15 0 0 - +R A 1941 o - O 15 0 1 - +R A 1943 o - Au 1 0 0 - +R A 1943 o - O 15 0 1 - +R A 1946 o - Mar 1 0 0 - +R A 1946 o - O 1 0 1 - +R A 1963 o - O 1 0 0 - +R A 1963 o - D 15 0 1 - +R A 1964 1966 - Mar 1 0 0 - +R A 1964 1966 - O 15 0 1 - +R A 1967 o - Ap 2 0 0 - +R A 1967 1968 - O Su>=1 0 1 - +R A 1968 1969 - Ap Su>=1 0 0 - +R A 1974 o - Ja 23 0 1 - +R A 1974 o - May 1 0 0 - +R A 1988 o - D 1 0 1 - +R A 1989 1993 - Mar Su>=1 0 0 - +R A 1989 1992 - O Su>=15 0 1 - +R A 1999 o - O Su>=1 0 1 - +R A 2000 o - Mar 3 0 0 - +R A 2007 o - D 30 0 1 - +R A 2008 2009 - Mar Su>=15 0 0 - +R A 2008 o - O Su>=15 0 1 - +Z America/Argentina/Buenos_Aires -3:53:48 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 A -03/-02 +Z America/Argentina/Cordoba -4:16:48 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar 3 +-4 - -04 1991 O 20 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 A -03/-02 +Z America/Argentina/Salta -4:21:40 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar 3 +-4 - -04 1991 O 20 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z America/Argentina/Tucuman -4:20:52 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar 3 +-4 - -04 1991 O 20 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 Jun +-4 - -04 2004 Jun 13 +-3 A -03/-02 +Z America/Argentina/La_Rioja -4:27:24 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar +-4 - -04 1991 May 7 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 Jun +-4 - -04 2004 Jun 20 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z America/Argentina/San_Juan -4:34:4 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar +-4 - -04 1991 May 7 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 May 31 +-4 - -04 2004 Jul 25 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z America/Argentina/Jujuy -4:21:12 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1990 Mar 4 +-4 - -04 1990 O 28 +-4 1 -03 1991 Mar 17 +-4 - -04 1991 O 6 +-3 1 -02 1992 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z 
America/Argentina/Catamarca -4:23:8 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1991 Mar 3 +-4 - -04 1991 O 20 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 Jun +-4 - -04 2004 Jun 20 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z America/Argentina/Mendoza -4:35:16 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1990 Mar 4 +-4 - -04 1990 O 15 +-4 1 -03 1991 Mar +-4 - -04 1991 O 15 +-4 1 -03 1992 Mar +-4 - -04 1992 O 18 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 May 23 +-4 - -04 2004 S 26 +-3 A -03/-02 2008 O 18 +-3 - -03 +R Sa 2008 2009 - Mar Su>=8 0 0 - +R Sa 2007 2008 - O Su>=8 0 1 - +Z America/Argentina/San_Luis -4:25:24 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1990 +-3 1 -02 1990 Mar 14 +-4 - -04 1990 O 15 +-4 1 -03 1991 Mar +-4 - -04 1991 Jun +-3 - -03 1999 O 3 +-4 1 -03 2000 Mar 3 +-3 - -03 2004 May 31 +-4 - -04 2004 Jul 25 +-3 A -03/-02 2008 Ja 21 +-4 Sa -04/-03 2009 O 11 +-3 - -03 +Z America/Argentina/Rio_Gallegos -4:36:52 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 Jun +-4 - -04 2004 Jun 20 +-3 A -03/-02 2008 O 18 +-3 - -03 +Z America/Argentina/Ushuaia -4:33:12 - LMT 1894 O 31 +-4:16:48 - CMT 1920 May +-4 - -04 1930 D +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1999 O 3 +-4 A -04/-03 2000 Mar 3 +-3 - -03 2004 May 30 +-4 - -04 2004 Jun 20 +-3 A -03/-02 2008 O 18 +-3 - -03 +L America/Curacao America/Aruba +Z America/La_Paz -4:32:36 - LMT 1890 +-4:32:36 - CMT 1931 O 15 +-4:32:36 1 BST 1932 Mar 21 +-4 - -04 +R B 1931 o - O 3 11 1 - +R B 1932 1933 - Ap 1 0 0 - +R B 1932 o - O 3 0 1 - +R B 1949 1952 - D 1 0 1 - +R B 1950 o - Ap 16 1 0 - +R B 1951 1952 - Ap 1 0 0 - +R B 1953 o - Mar 1 0 0 - +R B 1963 o - D 9 0 1 - +R B 1964 o - Mar 1 0 0 - +R B 1965 o - Ja 31 0 1 - +R B 1965 o - Mar 31 0 0 - +R B 1965 o - D 1 0 1 - +R B 1966 1968 - Mar 1 0 0 - +R B 1966 1967 - N 1 0 1 - +R B 1985 o - N 2 0 1 - +R B 1986 o - Mar 15 0 0 - +R B 1986 o - O 25 0 1 - +R B 1987 o - F 14 0 0 - +R B 1987 o - O 25 0 1 - +R B 1988 o - F 7 0 0 - +R B 1988 o - O 16 0 1 - +R B 1989 o - Ja 29 0 0 - +R B 1989 o - O 15 0 1 - +R B 1990 o - F 11 0 0 - +R B 1990 o - O 21 0 1 - +R B 1991 o - F 17 0 0 - +R B 1991 o - O 20 0 1 - +R B 1992 o - F 9 0 0 - +R B 1992 o - O 25 0 1 - +R B 1993 o - Ja 31 0 0 - +R B 1993 1995 - O Su>=11 0 1 - +R B 1994 1995 - F Su>=15 0 0 - +R B 1996 o - F 11 0 0 - +R B 1996 o - O 6 0 1 - +R B 1997 o - F 16 0 0 - +R B 1997 o - O 6 0 1 - +R B 1998 o - Mar 1 0 0 - +R B 1998 o - O 11 0 1 - +R B 1999 o - F 21 0 0 - +R B 1999 o - O 3 0 1 - +R B 2000 o - F 27 0 0 - +R B 2000 2001 - O Su>=8 0 1 - +R B 2001 2006 - F Su>=15 0 0 - +R B 2002 o - N 3 0 1 - +R B 2003 o - O 19 0 1 - +R B 2004 o - N 2 0 1 - +R B 2005 o - O 16 0 1 - +R B 2006 o - N 5 0 1 - +R B 2007 o - F 25 0 0 - +R B 2007 o - O Su>=8 0 1 - +R B 2008 2017 - O Su>=15 0 1 - +R B 2008 2011 - F Su>=15 0 0 - +R B 2012 o - F Su>=22 0 0 - +R B 2013 2014 - F Su>=15 0 0 - +R B 2015 o - F Su>=22 0 0 - +R B 2016 2019 - F Su>=15 0 0 - +R B 2018 o - N Su>=1 0 1 - +Z America/Noronha -2:9:40 - LMT 1914 +-2 B -02/-01 1990 S 17 +-2 - -02 1999 S 30 +-2 B -02/-01 2000 O 15 +-2 - -02 2001 S 13 +-2 B -02/-01 2002 O +-2 - -02 +Z America/Belem -3:13:56 - LMT 1914 +-3 B -03/-02 1988 S 12 +-3 - -03 +Z America/Santarem -3:38:48 - LMT 1914 +-4 B -04/-03 1988 S 12 +-4 - -04 2008 Jun 24 +-3 - -03 +Z 
America/Fortaleza -2:34 - LMT 1914 +-3 B -03/-02 1990 S 17 +-3 - -03 1999 S 30 +-3 B -03/-02 2000 O 22 +-3 - -03 2001 S 13 +-3 B -03/-02 2002 O +-3 - -03 +Z America/Recife -2:19:36 - LMT 1914 +-3 B -03/-02 1990 S 17 +-3 - -03 1999 S 30 +-3 B -03/-02 2000 O 15 +-3 - -03 2001 S 13 +-3 B -03/-02 2002 O +-3 - -03 +Z America/Araguaina -3:12:48 - LMT 1914 +-3 B -03/-02 1990 S 17 +-3 - -03 1995 S 14 +-3 B -03/-02 2003 S 24 +-3 - -03 2012 O 21 +-3 B -03/-02 2013 S +-3 - -03 +Z America/Maceio -2:22:52 - LMT 1914 +-3 B -03/-02 1990 S 17 +-3 - -03 1995 O 13 +-3 B -03/-02 1996 S 4 +-3 - -03 1999 S 30 +-3 B -03/-02 2000 O 22 +-3 - -03 2001 S 13 +-3 B -03/-02 2002 O +-3 - -03 +Z America/Bahia -2:34:4 - LMT 1914 +-3 B -03/-02 2003 S 24 +-3 - -03 2011 O 16 +-3 B -03/-02 2012 O 21 +-3 - -03 +Z America/Sao_Paulo -3:6:28 - LMT 1914 +-3 B -03/-02 1963 O 23 +-3 1 -02 1964 +-3 B -03/-02 +Z America/Campo_Grande -3:38:28 - LMT 1914 +-4 B -04/-03 +Z America/Cuiaba -3:44:20 - LMT 1914 +-4 B -04/-03 2003 S 24 +-4 - -04 2004 O +-4 B -04/-03 +Z America/Porto_Velho -4:15:36 - LMT 1914 +-4 B -04/-03 1988 S 12 +-4 - -04 +Z America/Boa_Vista -4:2:40 - LMT 1914 +-4 B -04/-03 1988 S 12 +-4 - -04 1999 S 30 +-4 B -04/-03 2000 O 15 +-4 - -04 +Z America/Manaus -4:0:4 - LMT 1914 +-4 B -04/-03 1988 S 12 +-4 - -04 1993 S 28 +-4 B -04/-03 1994 S 22 +-4 - -04 +Z America/Eirunepe -4:39:28 - LMT 1914 +-5 B -05/-04 1988 S 12 +-5 - -05 1993 S 28 +-5 B -05/-04 1994 S 22 +-5 - -05 2008 Jun 24 +-4 - -04 2013 N 10 +-5 - -05 +Z America/Rio_Branco -4:31:12 - LMT 1914 +-5 B -05/-04 1988 S 12 +-5 - -05 2008 Jun 24 +-4 - -04 2013 N 10 +-5 - -05 +R x 1927 1931 - S 1 0 1 - +R x 1928 1932 - Ap 1 0 0 - +R x 1968 o - N 3 4u 1 - +R x 1969 o - Mar 30 3u 0 - +R x 1969 o - N 23 4u 1 - +R x 1970 o - Mar 29 3u 0 - +R x 1971 o - Mar 14 3u 0 - +R x 1970 1972 - O Su>=9 4u 1 - +R x 1972 1986 - Mar Su>=9 3u 0 - +R x 1973 o - S 30 4u 1 - +R x 1974 1987 - O Su>=9 4u 1 - +R x 1987 o - Ap 12 3u 0 - +R x 1988 1990 - Mar Su>=9 3u 0 - +R x 1988 1989 - O Su>=9 4u 1 - +R x 1990 o - S 16 4u 1 - +R x 1991 1996 - Mar Su>=9 3u 0 - +R x 1991 1997 - O Su>=9 4u 1 - +R x 1997 o - Mar 30 3u 0 - +R x 1998 o - Mar Su>=9 3u 0 - +R x 1998 o - S 27 4u 1 - +R x 1999 o - Ap 4 3u 0 - +R x 1999 2010 - O Su>=9 4u 1 - +R x 2000 2007 - Mar Su>=9 3u 0 - +R x 2008 o - Mar 30 3u 0 - +R x 2009 o - Mar Su>=9 3u 0 - +R x 2010 o - Ap Su>=1 3u 0 - +R x 2011 o - May Su>=2 3u 0 - +R x 2011 o - Au Su>=16 4u 1 - +R x 2012 2014 - Ap Su>=23 3u 0 - +R x 2012 2014 - S Su>=2 4u 1 - +R x 2016 2018 - May Su>=9 3u 0 - +R x 2016 2018 - Au Su>=9 4u 1 - +R x 2019 ma - Ap Su>=2 3u 0 - +R x 2019 ma - S Su>=2 4u 1 - +Z America/Santiago -4:42:46 - LMT 1890 +-4:42:46 - SMT 1910 Ja 10 +-5 - -05 1916 Jul +-4:42:46 - SMT 1918 S 10 +-4 - -04 1919 Jul +-4:42:46 - SMT 1927 S +-5 x -05/-04 1932 S +-4 - -04 1942 Jun +-5 - -05 1942 Au +-4 - -04 1946 Jul 15 +-4 1 -03 1946 S +-4 - -04 1947 Ap +-5 - -05 1947 May 21 23 +-4 x -04/-03 +Z America/Punta_Arenas -4:43:40 - LMT 1890 +-4:42:46 - SMT 1910 Ja 10 +-5 - -05 1916 Jul +-4:42:46 - SMT 1918 S 10 +-4 - -04 1919 Jul +-4:42:46 - SMT 1927 S +-5 x -05/-04 1932 S +-4 - -04 1942 Jun +-5 - -05 1942 Au +-4 - -04 1947 Ap +-5 - -05 1947 May 21 23 +-4 x -04/-03 2016 D 4 +-3 - -03 +Z Pacific/Easter -7:17:28 - LMT 1890 +-7:17:28 - EMT 1932 S +-7 x -07/-06 1982 Mar 14 3u +-6 x -06/-05 +Z Antarctica/Palmer 0 - -00 1965 +-4 A -04/-03 1969 O 5 +-3 A -03/-02 1982 May +-4 x -04/-03 2016 D 4 +-3 - -03 +R CO 1992 o - May 3 0 1 - +R CO 1993 o - Ap 4 0 0 - +Z America/Bogota -4:56:16 - LMT 1884 Mar 13 
+-4:56:16 - BMT 1914 N 23 +-5 CO -05/-04 +Z America/Curacao -4:35:47 - LMT 1912 F 12 +-4:30 - -0430 1965 +-4 - AST +L America/Curacao America/Lower_Princes +L America/Curacao America/Kralendijk +R EC 1992 o - N 28 0 1 - +R EC 1993 o - F 5 0 0 - +Z America/Guayaquil -5:19:20 - LMT 1890 +-5:14 - QMT 1931 +-5 EC -05/-04 +Z Pacific/Galapagos -5:58:24 - LMT 1931 +-5 - -05 1986 +-6 EC -06/-05 +R FK 1937 1938 - S lastSu 0 1 - +R FK 1938 1942 - Mar Su>=19 0 0 - +R FK 1939 o - O 1 0 1 - +R FK 1940 1942 - S lastSu 0 1 - +R FK 1943 o - Ja 1 0 0 - +R FK 1983 o - S lastSu 0 1 - +R FK 1984 1985 - Ap lastSu 0 0 - +R FK 1984 o - S 16 0 1 - +R FK 1985 2000 - S Su>=9 0 1 - +R FK 1986 2000 - Ap Su>=16 0 0 - +R FK 2001 2010 - Ap Su>=15 2 0 - +R FK 2001 2010 - S Su>=1 2 1 - +Z Atlantic/Stanley -3:51:24 - LMT 1890 +-3:51:24 - SMT 1912 Mar 12 +-4 FK -04/-03 1983 May +-3 FK -03/-02 1985 S 15 +-4 FK -04/-03 2010 S 5 2 +-3 - -03 +Z America/Cayenne -3:29:20 - LMT 1911 Jul +-4 - -04 1967 O +-3 - -03 +Z America/Guyana -3:52:40 - LMT 1915 Mar +-3:45 - -0345 1975 Jul 31 +-3 - -03 1991 +-4 - -04 +R y 1975 1988 - O 1 0 1 - +R y 1975 1978 - Mar 1 0 0 - +R y 1979 1991 - Ap 1 0 0 - +R y 1989 o - O 22 0 1 - +R y 1990 o - O 1 0 1 - +R y 1991 o - O 6 0 1 - +R y 1992 o - Mar 1 0 0 - +R y 1992 o - O 5 0 1 - +R y 1993 o - Mar 31 0 0 - +R y 1993 1995 - O 1 0 1 - +R y 1994 1995 - F lastSu 0 0 - +R y 1996 o - Mar 1 0 0 - +R y 1996 2001 - O Su>=1 0 1 - +R y 1997 o - F lastSu 0 0 - +R y 1998 2001 - Mar Su>=1 0 0 - +R y 2002 2004 - Ap Su>=1 0 0 - +R y 2002 2003 - S Su>=1 0 1 - +R y 2004 2009 - O Su>=15 0 1 - +R y 2005 2009 - Mar Su>=8 0 0 - +R y 2010 ma - O Su>=1 0 1 - +R y 2010 2012 - Ap Su>=8 0 0 - +R y 2013 ma - Mar Su>=22 0 0 - +Z America/Asuncion -3:50:40 - LMT 1890 +-3:50:40 - AMT 1931 O 10 +-4 - -04 1972 O +-3 - -03 1974 Ap +-4 y -04/-03 +R PE 1938 o - Ja 1 0 1 - +R PE 1938 o - Ap 1 0 0 - +R PE 1938 1939 - S lastSu 0 1 - +R PE 1939 1940 - Mar Su>=24 0 0 - +R PE 1986 1987 - Ja 1 0 1 - +R PE 1986 1987 - Ap 1 0 0 - +R PE 1990 o - Ja 1 0 1 - +R PE 1990 o - Ap 1 0 0 - +R PE 1994 o - Ja 1 0 1 - +R PE 1994 o - Ap 1 0 0 - +Z America/Lima -5:8:12 - LMT 1890 +-5:8:36 - LMT 1908 Jul 28 +-5 PE -05/-04 +Z Atlantic/South_Georgia -2:26:8 - LMT 1890 +-2 - -02 +Z America/Paramaribo -3:40:40 - LMT 1911 +-3:40:52 - PMT 1935 +-3:40:36 - PMT 1945 O +-3:30 - -0330 1984 O +-3 - -03 +Z America/Port_of_Spain -4:6:4 - LMT 1912 Mar 2 +-4 - AST +L America/Port_of_Spain America/Anguilla +L America/Port_of_Spain America/Antigua +L America/Port_of_Spain America/Dominica +L America/Port_of_Spain America/Grenada +L America/Port_of_Spain America/Guadeloupe +L America/Port_of_Spain America/Marigot +L America/Port_of_Spain America/Montserrat +L America/Port_of_Spain America/St_Barthelemy +L America/Port_of_Spain America/St_Kitts +L America/Port_of_Spain America/St_Lucia +L America/Port_of_Spain America/St_Thomas +L America/Port_of_Spain America/St_Vincent +L America/Port_of_Spain America/Tortola +R U 1923 1925 - O 1 0 0:30 - +R U 1924 1926 - Ap 1 0 0 - +R U 1933 1938 - O lastSu 0 0:30 - +R U 1934 1941 - Mar lastSa 24 0 - +R U 1939 o - O 1 0 0:30 - +R U 1940 o - O 27 0 0:30 - +R U 1941 o - Au 1 0 0:30 - +R U 1942 o - D 14 0 0:30 - +R U 1943 o - Mar 14 0 0 - +R U 1959 o - May 24 0 0:30 - +R U 1959 o - N 15 0 0 - +R U 1960 o - Ja 17 0 1 - +R U 1960 o - Mar 6 0 0 - +R U 1965 o - Ap 4 0 1 - +R U 1965 o - S 26 0 0 - +R U 1968 o - May 27 0 0:30 - +R U 1968 o - D 1 0 0 - +R U 1970 o - Ap 25 0 1 - +R U 1970 o - Jun 14 0 0 - +R U 1972 o - Ap 23 0 1 - +R U 1972 o - Jul 16 0 0 
- +R U 1974 o - Ja 13 0 1:30 - +R U 1974 o - Mar 10 0 0:30 - +R U 1974 o - S 1 0 0 - +R U 1974 o - D 22 0 1 - +R U 1975 o - Mar 30 0 0 - +R U 1976 o - D 19 0 1 - +R U 1977 o - Mar 6 0 0 - +R U 1977 o - D 4 0 1 - +R U 1978 1979 - Mar Su>=1 0 0 - +R U 1978 o - D 17 0 1 - +R U 1979 o - Ap 29 0 1 - +R U 1980 o - Mar 16 0 0 - +R U 1987 o - D 14 0 1 - +R U 1988 o - F 28 0 0 - +R U 1988 o - D 11 0 1 - +R U 1989 o - Mar 5 0 0 - +R U 1989 o - O 29 0 1 - +R U 1990 o - F 25 0 0 - +R U 1990 1991 - O Su>=21 0 1 - +R U 1991 1992 - Mar Su>=1 0 0 - +R U 1992 o - O 18 0 1 - +R U 1993 o - F 28 0 0 - +R U 2004 o - S 19 0 1 - +R U 2005 o - Mar 27 2 0 - +R U 2005 o - O 9 2 1 - +R U 2006 2015 - Mar Su>=8 2 0 - +R U 2006 2014 - O Su>=1 2 1 - +Z America/Montevideo -3:44:51 - LMT 1908 Jun 10 +-3:44:51 - MMT 1920 May +-4 - -04 1923 O +-3:30 U -0330/-03 1942 D 14 +-3 U -03/-0230 1960 +-3 U -03/-02 1968 +-3 U -03/-0230 1970 +-3 U -03/-02 1974 +-3 U -03/-0130 1974 Mar 10 +-3 U -03/-0230 1974 D 22 +-3 U -03/-02 +Z America/Caracas -4:27:44 - LMT 1890 +-4:27:40 - CMT 1912 F 12 +-4:30 - -0430 1965 +-4 - -04 2007 D 9 3 +-4:30 - -0430 2016 May 1 2:30 +-4 - -04 +Z Etc/GMT 0 - GMT +Z Etc/UTC 0 - UTC +L Etc/GMT GMT +L Etc/UTC Etc/Universal +L Etc/UTC Etc/Zulu +L Etc/GMT Etc/Greenwich +L Etc/GMT Etc/GMT-0 +L Etc/GMT Etc/GMT+0 +L Etc/GMT Etc/GMT0 +Z Etc/GMT-14 14 - +14 +Z Etc/GMT-13 13 - +13 +Z Etc/GMT-12 12 - +12 +Z Etc/GMT-11 11 - +11 +Z Etc/GMT-10 10 - +10 +Z Etc/GMT-9 9 - +09 +Z Etc/GMT-8 8 - +08 +Z Etc/GMT-7 7 - +07 +Z Etc/GMT-6 6 - +06 +Z Etc/GMT-5 5 - +05 +Z Etc/GMT-4 4 - +04 +Z Etc/GMT-3 3 - +03 +Z Etc/GMT-2 2 - +02 +Z Etc/GMT-1 1 - +01 +Z Etc/GMT+1 -1 - -01 +Z Etc/GMT+2 -2 - -02 +Z Etc/GMT+3 -3 - -03 +Z Etc/GMT+4 -4 - -04 +Z Etc/GMT+5 -5 - -05 +Z Etc/GMT+6 -6 - -06 +Z Etc/GMT+7 -7 - -07 +Z Etc/GMT+8 -8 - -08 +Z Etc/GMT+9 -9 - -09 +Z Etc/GMT+10 -10 - -10 +Z Etc/GMT+11 -11 - -11 +Z Etc/GMT+12 -12 - -12 +Z Factory 0 - -00 +L Africa/Nairobi Africa/Asmera +L Africa/Abidjan Africa/Timbuktu +L America/Argentina/Catamarca America/Argentina/ComodRivadavia +L America/Adak America/Atka +L America/Argentina/Buenos_Aires America/Buenos_Aires +L America/Argentina/Catamarca America/Catamarca +L America/Atikokan America/Coral_Harbour +L America/Argentina/Cordoba America/Cordoba +L America/Tijuana America/Ensenada +L America/Indiana/Indianapolis America/Fort_Wayne +L America/Indiana/Indianapolis America/Indianapolis +L America/Argentina/Jujuy America/Jujuy +L America/Indiana/Knox America/Knox_IN +L America/Kentucky/Louisville America/Louisville +L America/Argentina/Mendoza America/Mendoza +L America/Toronto America/Montreal +L America/Rio_Branco America/Porto_Acre +L America/Argentina/Cordoba America/Rosario +L America/Tijuana America/Santa_Isabel +L America/Denver America/Shiprock +L America/Port_of_Spain America/Virgin +L Pacific/Auckland Antarctica/South_Pole +L Asia/Ashgabat Asia/Ashkhabad +L Asia/Kolkata Asia/Calcutta +L Asia/Shanghai Asia/Chongqing +L Asia/Shanghai Asia/Chungking +L Asia/Dhaka Asia/Dacca +L Asia/Shanghai Asia/Harbin +L Asia/Urumqi Asia/Kashgar +L Asia/Kathmandu Asia/Katmandu +L Asia/Macau Asia/Macao +L Asia/Yangon Asia/Rangoon +L Asia/Ho_Chi_Minh Asia/Saigon +L Asia/Jerusalem Asia/Tel_Aviv +L Asia/Thimphu Asia/Thimbu +L Asia/Makassar Asia/Ujung_Pandang +L Asia/Ulaanbaatar Asia/Ulan_Bator +L Atlantic/Faroe Atlantic/Faeroe +L Europe/Oslo Atlantic/Jan_Mayen +L Australia/Sydney Australia/ACT +L Australia/Sydney Australia/Canberra +L Australia/Lord_Howe Australia/LHI +L Australia/Sydney Australia/NSW +L 
Australia/Darwin Australia/North +L Australia/Brisbane Australia/Queensland +L Australia/Adelaide Australia/South +L Australia/Hobart Australia/Tasmania +L Australia/Melbourne Australia/Victoria +L Australia/Perth Australia/West +L Australia/Broken_Hill Australia/Yancowinna +L America/Rio_Branco Brazil/Acre +L America/Noronha Brazil/DeNoronha +L America/Sao_Paulo Brazil/East +L America/Manaus Brazil/West +L America/Halifax Canada/Atlantic +L America/Winnipeg Canada/Central +L America/Toronto Canada/Eastern +L America/Edmonton Canada/Mountain +L America/St_Johns Canada/Newfoundland +L America/Vancouver Canada/Pacific +L America/Regina Canada/Saskatchewan +L America/Whitehorse Canada/Yukon +L America/Santiago Chile/Continental +L Pacific/Easter Chile/EasterIsland +L America/Havana Cuba +L Africa/Cairo Egypt +L Europe/Dublin Eire +L Etc/UTC Etc/UCT +L Europe/London Europe/Belfast +L Europe/Chisinau Europe/Tiraspol +L Europe/London GB +L Europe/London GB-Eire +L Etc/GMT GMT+0 +L Etc/GMT GMT-0 +L Etc/GMT GMT0 +L Etc/GMT Greenwich +L Asia/Hong_Kong Hongkong +L Atlantic/Reykjavik Iceland +L Asia/Tehran Iran +L Asia/Jerusalem Israel +L America/Jamaica Jamaica +L Asia/Tokyo Japan +L Pacific/Kwajalein Kwajalein +L Africa/Tripoli Libya +L America/Tijuana Mexico/BajaNorte +L America/Mazatlan Mexico/BajaSur +L America/Mexico_City Mexico/General +L Pacific/Auckland NZ +L Pacific/Chatham NZ-CHAT +L America/Denver Navajo +L Asia/Shanghai PRC +L Pacific/Honolulu Pacific/Johnston +L Pacific/Pohnpei Pacific/Ponape +L Pacific/Pago_Pago Pacific/Samoa +L Pacific/Chuuk Pacific/Truk +L Pacific/Chuuk Pacific/Yap +L Europe/Warsaw Poland +L Europe/Lisbon Portugal +L Asia/Taipei ROC +L Asia/Seoul ROK +L Asia/Singapore Singapore +L Europe/Istanbul Turkey +L Etc/UTC UCT +L America/Anchorage US/Alaska +L America/Adak US/Aleutian +L America/Phoenix US/Arizona +L America/Chicago US/Central +L America/Indiana/Indianapolis US/East-Indiana +L America/New_York US/Eastern +L Pacific/Honolulu US/Hawaii +L America/Indiana/Knox US/Indiana-Starke +L America/Detroit US/Michigan +L America/Denver US/Mountain +L America/Los_Angeles US/Pacific +L Pacific/Pago_Pago US/Samoa +L Etc/UTC UTC +L Etc/UTC Universal +L Europe/Moscow W-SU +L Etc/UTC Zulu diff --git a/venv/Lib/site-packages/pytz/zoneinfo/zone.tab b/venv/Lib/site-packages/pytz/zoneinfo/zone.tab new file mode 100644 index 0000000..408fcb2 --- /dev/null +++ b/venv/Lib/site-packages/pytz/zoneinfo/zone.tab @@ -0,0 +1,452 @@ +# tzdb timezone descriptions (deprecated version) +# +# This file is in the public domain, so clarified as of +# 2009-05-17 by Arthur David Olson. +# +# From Paul Eggert (2018-06-27): +# This file is intended as a backward-compatibility aid for older programs. +# New programs should use zone1970.tab. This file is like zone1970.tab (see +# zone1970.tab's comments), but with the following additional restrictions: +# +# 1. This file contains only ASCII characters. +# 2. The first data column contains exactly one country code. +# +# Because of (2), each row stands for an area that is the intersection +# of a region identified by a country code and of a timezone where civil +# clocks have agreed since 1970; this is a narrower definition than +# that of zone1970.tab. +# +# This table is intended as an aid for users, to help them select timezones +# appropriate for their practical needs. It is not intended to take or +# endorse any position on legal or territorial claims. 
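As a rough illustration of the table layout described above, a minimal Python sketch for reading the vendored zone.tab could look like the following; the path assumes the pytz copy added in this diff, and the helper is an illustration only, not one of the project's scripts:

```python
# Rough sketch: iterate over the tab-separated zone.tab rows described above.
# Columns: country code, ISO 6709 coordinates, TZ name, optional comment.
# The path below assumes the vendored pytz copy added in this diff.
import csv

ZONE_TAB = 'venv/Lib/site-packages/pytz/zoneinfo/zone.tab'

with open(ZONE_TAB, encoding='ascii') as f:
    for row in csv.reader(f, delimiter='\t'):
        if not row or row[0].startswith('#'):
            continue  # skip blank lines and comment lines
        country, coords, tz = row[0], row[1], row[2]
        comment = row[3] if len(row) > 3 else ''
        print(country, tz, comment)
```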
+# +#country- +#code coordinates TZ comments +AD +4230+00131 Europe/Andorra +AE +2518+05518 Asia/Dubai +AF +3431+06912 Asia/Kabul +AG +1703-06148 America/Antigua +AI +1812-06304 America/Anguilla +AL +4120+01950 Europe/Tirane +AM +4011+04430 Asia/Yerevan +AO -0848+01314 Africa/Luanda +AQ -7750+16636 Antarctica/McMurdo New Zealand time - McMurdo, South Pole +AQ -6617+11031 Antarctica/Casey Casey +AQ -6835+07758 Antarctica/Davis Davis +AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville +AQ -6736+06253 Antarctica/Mawson Mawson +AQ -6448-06406 Antarctica/Palmer Palmer +AQ -6734-06808 Antarctica/Rothera Rothera +AQ -690022+0393524 Antarctica/Syowa Syowa +AQ -720041+0023206 Antarctica/Troll Troll +AQ -7824+10654 Antarctica/Vostok Vostok +AR -3436-05827 America/Argentina/Buenos_Aires Buenos Aires (BA, CF) +AR -3124-06411 America/Argentina/Cordoba Argentina (most areas: CB, CC, CN, ER, FM, MN, SE, SF) +AR -2447-06525 America/Argentina/Salta Salta (SA, LP, NQ, RN) +AR -2411-06518 America/Argentina/Jujuy Jujuy (JY) +AR -2649-06513 America/Argentina/Tucuman Tucuman (TM) +AR -2828-06547 America/Argentina/Catamarca Catamarca (CT); Chubut (CH) +AR -2926-06651 America/Argentina/La_Rioja La Rioja (LR) +AR -3132-06831 America/Argentina/San_Juan San Juan (SJ) +AR -3253-06849 America/Argentina/Mendoza Mendoza (MZ) +AR -3319-06621 America/Argentina/San_Luis San Luis (SL) +AR -5138-06913 America/Argentina/Rio_Gallegos Santa Cruz (SC) +AR -5448-06818 America/Argentina/Ushuaia Tierra del Fuego (TF) +AS -1416-17042 Pacific/Pago_Pago +AT +4813+01620 Europe/Vienna +AU -3133+15905 Australia/Lord_Howe Lord Howe Island +AU -5430+15857 Antarctica/Macquarie Macquarie Island +AU -4253+14719 Australia/Hobart Tasmania (most areas) +AU -3956+14352 Australia/Currie Tasmania (King Island) +AU -3749+14458 Australia/Melbourne Victoria +AU -3352+15113 Australia/Sydney New South Wales (most areas) +AU -3157+14127 Australia/Broken_Hill New South Wales (Yancowinna) +AU -2728+15302 Australia/Brisbane Queensland (most areas) +AU -2016+14900 Australia/Lindeman Queensland (Whitsunday Islands) +AU -3455+13835 Australia/Adelaide South Australia +AU -1228+13050 Australia/Darwin Northern Territory +AU -3157+11551 Australia/Perth Western Australia (most areas) +AU -3143+12852 Australia/Eucla Western Australia (Eucla) +AW +1230-06958 America/Aruba +AX +6006+01957 Europe/Mariehamn +AZ +4023+04951 Asia/Baku +BA +4352+01825 Europe/Sarajevo +BB +1306-05937 America/Barbados +BD +2343+09025 Asia/Dhaka +BE +5050+00420 Europe/Brussels +BF +1222-00131 Africa/Ouagadougou +BG +4241+02319 Europe/Sofia +BH +2623+05035 Asia/Bahrain +BI -0323+02922 Africa/Bujumbura +BJ +0629+00237 Africa/Porto-Novo +BL +1753-06251 America/St_Barthelemy +BM +3217-06446 Atlantic/Bermuda +BN +0456+11455 Asia/Brunei +BO -1630-06809 America/La_Paz +BQ +120903-0681636 America/Kralendijk +BR -0351-03225 America/Noronha Atlantic islands +BR -0127-04829 America/Belem Para (east); Amapa +BR -0343-03830 America/Fortaleza Brazil (northeast: MA, PI, CE, RN, PB) +BR -0803-03454 America/Recife Pernambuco +BR -0712-04812 America/Araguaina Tocantins +BR -0940-03543 America/Maceio Alagoas, Sergipe +BR -1259-03831 America/Bahia Bahia +BR -2332-04637 America/Sao_Paulo Brazil (southeast: GO, DF, MG, ES, RJ, SP, PR, SC, RS) +BR -2027-05437 America/Campo_Grande Mato Grosso do Sul +BR -1535-05605 America/Cuiaba Mato Grosso +BR -0226-05452 America/Santarem Para (west) +BR -0846-06354 America/Porto_Velho Rondonia +BR +0249-06040 America/Boa_Vista Roraima +BR -0308-06001 America/Manaus 
Amazonas (east) +BR -0640-06952 America/Eirunepe Amazonas (west) +BR -0958-06748 America/Rio_Branco Acre +BS +2505-07721 America/Nassau +BT +2728+08939 Asia/Thimphu +BW -2439+02555 Africa/Gaborone +BY +5354+02734 Europe/Minsk +BZ +1730-08812 America/Belize +CA +4734-05243 America/St_Johns Newfoundland; Labrador (southeast) +CA +4439-06336 America/Halifax Atlantic - NS (most areas); PE +CA +4612-05957 America/Glace_Bay Atlantic - NS (Cape Breton) +CA +4606-06447 America/Moncton Atlantic - New Brunswick +CA +5320-06025 America/Goose_Bay Atlantic - Labrador (most areas) +CA +5125-05707 America/Blanc-Sablon AST - QC (Lower North Shore) +CA +4339-07923 America/Toronto Eastern - ON, QC (most areas) +CA +4901-08816 America/Nipigon Eastern - ON, QC (no DST 1967-73) +CA +4823-08915 America/Thunder_Bay Eastern - ON (Thunder Bay) +CA +6344-06828 America/Iqaluit Eastern - NU (most east areas) +CA +6608-06544 America/Pangnirtung Eastern - NU (Pangnirtung) +CA +484531-0913718 America/Atikokan EST - ON (Atikokan); NU (Coral H) +CA +4953-09709 America/Winnipeg Central - ON (west); Manitoba +CA +4843-09434 America/Rainy_River Central - ON (Rainy R, Ft Frances) +CA +744144-0944945 America/Resolute Central - NU (Resolute) +CA +624900-0920459 America/Rankin_Inlet Central - NU (central) +CA +5024-10439 America/Regina CST - SK (most areas) +CA +5017-10750 America/Swift_Current CST - SK (midwest) +CA +5333-11328 America/Edmonton Mountain - AB; BC (E); SK (W) +CA +690650-1050310 America/Cambridge_Bay Mountain - NU (west) +CA +6227-11421 America/Yellowknife Mountain - NT (central) +CA +682059-1334300 America/Inuvik Mountain - NT (west) +CA +4906-11631 America/Creston MST - BC (Creston) +CA +5946-12014 America/Dawson_Creek MST - BC (Dawson Cr, Ft St John) +CA +5848-12242 America/Fort_Nelson MST - BC (Ft Nelson) +CA +4916-12307 America/Vancouver Pacific - BC (most areas) +CA +6043-13503 America/Whitehorse Pacific - Yukon (south) +CA +6404-13925 America/Dawson Pacific - Yukon (north) +CC -1210+09655 Indian/Cocos +CD -0418+01518 Africa/Kinshasa Dem. Rep. of Congo (west) +CD -1140+02728 Africa/Lubumbashi Dem. Rep. 
of Congo (east) +CF +0422+01835 Africa/Bangui +CG -0416+01517 Africa/Brazzaville +CH +4723+00832 Europe/Zurich +CI +0519-00402 Africa/Abidjan +CK -2114-15946 Pacific/Rarotonga +CL -3327-07040 America/Santiago Chile (most areas) +CL -5309-07055 America/Punta_Arenas Region of Magallanes +CL -2709-10926 Pacific/Easter Easter Island +CM +0403+00942 Africa/Douala +CN +3114+12128 Asia/Shanghai Beijing Time +CN +4348+08735 Asia/Urumqi Xinjiang Time +CO +0436-07405 America/Bogota +CR +0956-08405 America/Costa_Rica +CU +2308-08222 America/Havana +CV +1455-02331 Atlantic/Cape_Verde +CW +1211-06900 America/Curacao +CX -1025+10543 Indian/Christmas +CY +3510+03322 Asia/Nicosia Cyprus (most areas) +CY +3507+03357 Asia/Famagusta Northern Cyprus +CZ +5005+01426 Europe/Prague +DE +5230+01322 Europe/Berlin Germany (most areas) +DE +4742+00841 Europe/Busingen Busingen +DJ +1136+04309 Africa/Djibouti +DK +5540+01235 Europe/Copenhagen +DM +1518-06124 America/Dominica +DO +1828-06954 America/Santo_Domingo +DZ +3647+00303 Africa/Algiers +EC -0210-07950 America/Guayaquil Ecuador (mainland) +EC -0054-08936 Pacific/Galapagos Galapagos Islands +EE +5925+02445 Europe/Tallinn +EG +3003+03115 Africa/Cairo +EH +2709-01312 Africa/El_Aaiun +ER +1520+03853 Africa/Asmara +ES +4024-00341 Europe/Madrid Spain (mainland) +ES +3553-00519 Africa/Ceuta Ceuta, Melilla +ES +2806-01524 Atlantic/Canary Canary Islands +ET +0902+03842 Africa/Addis_Ababa +FI +6010+02458 Europe/Helsinki +FJ -1808+17825 Pacific/Fiji +FK -5142-05751 Atlantic/Stanley +FM +0725+15147 Pacific/Chuuk Chuuk/Truk, Yap +FM +0658+15813 Pacific/Pohnpei Pohnpei/Ponape +FM +0519+16259 Pacific/Kosrae Kosrae +FO +6201-00646 Atlantic/Faroe +FR +4852+00220 Europe/Paris +GA +0023+00927 Africa/Libreville +GB +513030-0000731 Europe/London +GD +1203-06145 America/Grenada +GE +4143+04449 Asia/Tbilisi +GF +0456-05220 America/Cayenne +GG +492717-0023210 Europe/Guernsey +GH +0533-00013 Africa/Accra +GI +3608-00521 Europe/Gibraltar +GL +6411-05144 America/Godthab Greenland (most areas) +GL +7646-01840 America/Danmarkshavn National Park (east coast) +GL +7029-02158 America/Scoresbysund Scoresbysund/Ittoqqortoormiit +GL +7634-06847 America/Thule Thule/Pituffik +GM +1328-01639 Africa/Banjul +GN +0931-01343 Africa/Conakry +GP +1614-06132 America/Guadeloupe +GQ +0345+00847 Africa/Malabo +GR +3758+02343 Europe/Athens +GS -5416-03632 Atlantic/South_Georgia +GT +1438-09031 America/Guatemala +GU +1328+14445 Pacific/Guam +GW +1151-01535 Africa/Bissau +GY +0648-05810 America/Guyana +HK +2217+11409 Asia/Hong_Kong +HN +1406-08713 America/Tegucigalpa +HR +4548+01558 Europe/Zagreb +HT +1832-07220 America/Port-au-Prince +HU +4730+01905 Europe/Budapest +ID -0610+10648 Asia/Jakarta Java, Sumatra +ID -0002+10920 Asia/Pontianak Borneo (west, central) +ID -0507+11924 Asia/Makassar Borneo (east, south); Sulawesi/Celebes, Bali, Nusa Tengarra; Timor (west) +ID -0232+14042 Asia/Jayapura New Guinea (West Papua / Irian Jaya); Malukus/Moluccas +IE +5320-00615 Europe/Dublin +IL +314650+0351326 Asia/Jerusalem +IM +5409-00428 Europe/Isle_of_Man +IN +2232+08822 Asia/Kolkata +IO -0720+07225 Indian/Chagos +IQ +3321+04425 Asia/Baghdad +IR +3540+05126 Asia/Tehran +IS +6409-02151 Atlantic/Reykjavik +IT +4154+01229 Europe/Rome +JE +491101-0020624 Europe/Jersey +JM +175805-0764736 America/Jamaica +JO +3157+03556 Asia/Amman +JP +353916+1394441 Asia/Tokyo +KE -0117+03649 Africa/Nairobi +KG +4254+07436 Asia/Bishkek +KH +1133+10455 Asia/Phnom_Penh +KI +0125+17300 Pacific/Tarawa Gilbert Islands +KI -0308-17105 
Pacific/Enderbury Phoenix Islands +KI +0152-15720 Pacific/Kiritimati Line Islands +KM -1141+04316 Indian/Comoro +KN +1718-06243 America/St_Kitts +KP +3901+12545 Asia/Pyongyang +KR +3733+12658 Asia/Seoul +KW +2920+04759 Asia/Kuwait +KY +1918-08123 America/Cayman +KZ +4315+07657 Asia/Almaty Kazakhstan (most areas) +KZ +4448+06528 Asia/Qyzylorda Qyzylorda/Kyzylorda/Kzyl-Orda +KZ +5312+06337 Asia/Qostanay Qostanay/Kostanay/Kustanay +KZ +5017+05710 Asia/Aqtobe Aqtobe/Aktobe +KZ +4431+05016 Asia/Aqtau Mangghystau/Mankistau +KZ +4707+05156 Asia/Atyrau Atyrau/Atirau/Gur'yev +KZ +5113+05121 Asia/Oral West Kazakhstan +LA +1758+10236 Asia/Vientiane +LB +3353+03530 Asia/Beirut +LC +1401-06100 America/St_Lucia +LI +4709+00931 Europe/Vaduz +LK +0656+07951 Asia/Colombo +LR +0618-01047 Africa/Monrovia +LS -2928+02730 Africa/Maseru +LT +5441+02519 Europe/Vilnius +LU +4936+00609 Europe/Luxembourg +LV +5657+02406 Europe/Riga +LY +3254+01311 Africa/Tripoli +MA +3339-00735 Africa/Casablanca +MC +4342+00723 Europe/Monaco +MD +4700+02850 Europe/Chisinau +ME +4226+01916 Europe/Podgorica +MF +1804-06305 America/Marigot +MG -1855+04731 Indian/Antananarivo +MH +0709+17112 Pacific/Majuro Marshall Islands (most areas) +MH +0905+16720 Pacific/Kwajalein Kwajalein +MK +4159+02126 Europe/Skopje +ML +1239-00800 Africa/Bamako +MM +1647+09610 Asia/Yangon +MN +4755+10653 Asia/Ulaanbaatar Mongolia (most areas) +MN +4801+09139 Asia/Hovd Bayan-Olgiy, Govi-Altai, Hovd, Uvs, Zavkhan +MN +4804+11430 Asia/Choibalsan Dornod, Sukhbaatar +MO +221150+1133230 Asia/Macau +MP +1512+14545 Pacific/Saipan +MQ +1436-06105 America/Martinique +MR +1806-01557 Africa/Nouakchott +MS +1643-06213 America/Montserrat +MT +3554+01431 Europe/Malta +MU -2010+05730 Indian/Mauritius +MV +0410+07330 Indian/Maldives +MW -1547+03500 Africa/Blantyre +MX +1924-09909 America/Mexico_City Central Time +MX +2105-08646 America/Cancun Eastern Standard Time - Quintana Roo +MX +2058-08937 America/Merida Central Time - Campeche, Yucatan +MX +2540-10019 America/Monterrey Central Time - Durango; Coahuila, Nuevo Leon, Tamaulipas (most areas) +MX +2550-09730 America/Matamoros Central Time US - Coahuila, Nuevo Leon, Tamaulipas (US border) +MX +2313-10625 America/Mazatlan Mountain Time - Baja California Sur, Nayarit, Sinaloa +MX +2838-10605 America/Chihuahua Mountain Time - Chihuahua (most areas) +MX +2934-10425 America/Ojinaga Mountain Time US - Chihuahua (US border) +MX +2904-11058 America/Hermosillo Mountain Standard Time - Sonora +MX +3232-11701 America/Tijuana Pacific Time US - Baja California +MX +2048-10515 America/Bahia_Banderas Central Time - Bahia de Banderas +MY +0310+10142 Asia/Kuala_Lumpur Malaysia (peninsula) +MY +0133+11020 Asia/Kuching Sabah, Sarawak +MZ -2558+03235 Africa/Maputo +NA -2234+01706 Africa/Windhoek +NC -2216+16627 Pacific/Noumea +NE +1331+00207 Africa/Niamey +NF -2903+16758 Pacific/Norfolk +NG +0627+00324 Africa/Lagos +NI +1209-08617 America/Managua +NL +5222+00454 Europe/Amsterdam +NO +5955+01045 Europe/Oslo +NP +2743+08519 Asia/Kathmandu +NR -0031+16655 Pacific/Nauru +NU -1901-16955 Pacific/Niue +NZ -3652+17446 Pacific/Auckland New Zealand (most areas) +NZ -4357-17633 Pacific/Chatham Chatham Islands +OM +2336+05835 Asia/Muscat +PA +0858-07932 America/Panama +PE -1203-07703 America/Lima +PF -1732-14934 Pacific/Tahiti Society Islands +PF -0900-13930 Pacific/Marquesas Marquesas Islands +PF -2308-13457 Pacific/Gambier Gambier Islands +PG -0930+14710 Pacific/Port_Moresby Papua New Guinea (most areas) +PG -0613+15534 Pacific/Bougainville Bougainville 
+PH +1435+12100 Asia/Manila +PK +2452+06703 Asia/Karachi +PL +5215+02100 Europe/Warsaw +PM +4703-05620 America/Miquelon +PN -2504-13005 Pacific/Pitcairn +PR +182806-0660622 America/Puerto_Rico +PS +3130+03428 Asia/Gaza Gaza Strip +PS +313200+0350542 Asia/Hebron West Bank +PT +3843-00908 Europe/Lisbon Portugal (mainland) +PT +3238-01654 Atlantic/Madeira Madeira Islands +PT +3744-02540 Atlantic/Azores Azores +PW +0720+13429 Pacific/Palau +PY -2516-05740 America/Asuncion +QA +2517+05132 Asia/Qatar +RE -2052+05528 Indian/Reunion +RO +4426+02606 Europe/Bucharest +RS +4450+02030 Europe/Belgrade +RU +5443+02030 Europe/Kaliningrad MSK-01 - Kaliningrad +RU +554521+0373704 Europe/Moscow MSK+00 - Moscow area +# The obsolescent zone.tab format cannot represent Europe/Simferopol well. +# Put it in RU section and list as UA. See "territorial claims" above. +# Programs should use zone1970.tab instead; see above. +UA +4457+03406 Europe/Simferopol MSK+00 - Crimea +RU +5836+04939 Europe/Kirov MSK+00 - Kirov +RU +4621+04803 Europe/Astrakhan MSK+01 - Astrakhan +RU +4844+04425 Europe/Volgograd MSK+01 - Volgograd +RU +5134+04602 Europe/Saratov MSK+01 - Saratov +RU +5420+04824 Europe/Ulyanovsk MSK+01 - Ulyanovsk +RU +5312+05009 Europe/Samara MSK+01 - Samara, Udmurtia +RU +5651+06036 Asia/Yekaterinburg MSK+02 - Urals +RU +5500+07324 Asia/Omsk MSK+03 - Omsk +RU +5502+08255 Asia/Novosibirsk MSK+04 - Novosibirsk +RU +5322+08345 Asia/Barnaul MSK+04 - Altai +RU +5630+08458 Asia/Tomsk MSK+04 - Tomsk +RU +5345+08707 Asia/Novokuznetsk MSK+04 - Kemerovo +RU +5601+09250 Asia/Krasnoyarsk MSK+04 - Krasnoyarsk area +RU +5216+10420 Asia/Irkutsk MSK+05 - Irkutsk, Buryatia +RU +5203+11328 Asia/Chita MSK+06 - Zabaykalsky +RU +6200+12940 Asia/Yakutsk MSK+06 - Lena River +RU +623923+1353314 Asia/Khandyga MSK+06 - Tomponsky, Ust-Maysky +RU +4310+13156 Asia/Vladivostok MSK+07 - Amur River +RU +643337+1431336 Asia/Ust-Nera MSK+07 - Oymyakonsky +RU +5934+15048 Asia/Magadan MSK+08 - Magadan +RU +4658+14242 Asia/Sakhalin MSK+08 - Sakhalin Island +RU +6728+15343 Asia/Srednekolymsk MSK+08 - Sakha (E); North Kuril Is +RU +5301+15839 Asia/Kamchatka MSK+09 - Kamchatka +RU +6445+17729 Asia/Anadyr MSK+09 - Bering Sea +RW -0157+03004 Africa/Kigali +SA +2438+04643 Asia/Riyadh +SB -0932+16012 Pacific/Guadalcanal +SC -0440+05528 Indian/Mahe +SD +1536+03232 Africa/Khartoum +SE +5920+01803 Europe/Stockholm +SG +0117+10351 Asia/Singapore +SH -1555-00542 Atlantic/St_Helena +SI +4603+01431 Europe/Ljubljana +SJ +7800+01600 Arctic/Longyearbyen +SK +4809+01707 Europe/Bratislava +SL +0830-01315 Africa/Freetown +SM +4355+01228 Europe/San_Marino +SN +1440-01726 Africa/Dakar +SO +0204+04522 Africa/Mogadishu +SR +0550-05510 America/Paramaribo +SS +0451+03137 Africa/Juba +ST +0020+00644 Africa/Sao_Tome +SV +1342-08912 America/El_Salvador +SX +180305-0630250 America/Lower_Princes +SY +3330+03618 Asia/Damascus +SZ -2618+03106 Africa/Mbabane +TC +2128-07108 America/Grand_Turk +TD +1207+01503 Africa/Ndjamena +TF -492110+0701303 Indian/Kerguelen +TG +0608+00113 Africa/Lome +TH +1345+10031 Asia/Bangkok +TJ +3835+06848 Asia/Dushanbe +TK -0922-17114 Pacific/Fakaofo +TL -0833+12535 Asia/Dili +TM +3757+05823 Asia/Ashgabat +TN +3648+01011 Africa/Tunis +TO -2110-17510 Pacific/Tongatapu +TR +4101+02858 Europe/Istanbul +TT +1039-06131 America/Port_of_Spain +TV -0831+17913 Pacific/Funafuti +TW +2503+12130 Asia/Taipei +TZ -0648+03917 Africa/Dar_es_Salaam +UA +5026+03031 Europe/Kiev Ukraine (most areas) +UA +4837+02218 Europe/Uzhgorod Ruthenia +UA +4750+03510 Europe/Zaporozhye 
Zaporozh'ye/Zaporizhia; Lugansk/Luhansk (east) +UG +0019+03225 Africa/Kampala +UM +2813-17722 Pacific/Midway Midway Islands +UM +1917+16637 Pacific/Wake Wake Island +US +404251-0740023 America/New_York Eastern (most areas) +US +421953-0830245 America/Detroit Eastern - MI (most areas) +US +381515-0854534 America/Kentucky/Louisville Eastern - KY (Louisville area) +US +364947-0845057 America/Kentucky/Monticello Eastern - KY (Wayne) +US +394606-0860929 America/Indiana/Indianapolis Eastern - IN (most areas) +US +384038-0873143 America/Indiana/Vincennes Eastern - IN (Da, Du, K, Mn) +US +410305-0863611 America/Indiana/Winamac Eastern - IN (Pulaski) +US +382232-0862041 America/Indiana/Marengo Eastern - IN (Crawford) +US +382931-0871643 America/Indiana/Petersburg Eastern - IN (Pike) +US +384452-0850402 America/Indiana/Vevay Eastern - IN (Switzerland) +US +415100-0873900 America/Chicago Central (most areas) +US +375711-0864541 America/Indiana/Tell_City Central - IN (Perry) +US +411745-0863730 America/Indiana/Knox Central - IN (Starke) +US +450628-0873651 America/Menominee Central - MI (Wisconsin border) +US +470659-1011757 America/North_Dakota/Center Central - ND (Oliver) +US +465042-1012439 America/North_Dakota/New_Salem Central - ND (Morton rural) +US +471551-1014640 America/North_Dakota/Beulah Central - ND (Mercer) +US +394421-1045903 America/Denver Mountain (most areas) +US +433649-1161209 America/Boise Mountain - ID (south); OR (east) +US +332654-1120424 America/Phoenix MST - Arizona (except Navajo) +US +340308-1181434 America/Los_Angeles Pacific +US +611305-1495401 America/Anchorage Alaska (most areas) +US +581807-1342511 America/Juneau Alaska - Juneau area +US +571035-1351807 America/Sitka Alaska - Sitka area +US +550737-1313435 America/Metlakatla Alaska - Annette Island +US +593249-1394338 America/Yakutat Alaska - Yakutat +US +643004-1652423 America/Nome Alaska (west) +US +515248-1763929 America/Adak Aleutian Islands +US +211825-1575130 Pacific/Honolulu Hawaii +UY -345433-0561245 America/Montevideo +UZ +3940+06648 Asia/Samarkand Uzbekistan (west) +UZ +4120+06918 Asia/Tashkent Uzbekistan (east) +VA +415408+0122711 Europe/Vatican +VC +1309-06114 America/St_Vincent +VE +1030-06656 America/Caracas +VG +1827-06437 America/Tortola +VI +1821-06456 America/St_Thomas +VN +1045+10640 Asia/Ho_Chi_Minh +VU -1740+16825 Pacific/Efate +WF -1318-17610 Pacific/Wallis +WS -1350-17144 Pacific/Apia +YE +1245+04512 Asia/Aden +YT -1247+04514 Indian/Mayotte +ZA -2615+02800 Africa/Johannesburg +ZM -1525+02817 Africa/Lusaka +ZW -1750+03103 Africa/Harare diff --git a/venv/Lib/site-packages/pytz/zoneinfo/zone1970.tab b/venv/Lib/site-packages/pytz/zoneinfo/zone1970.tab new file mode 100644 index 0000000..822ffa1 --- /dev/null +++ b/venv/Lib/site-packages/pytz/zoneinfo/zone1970.tab @@ -0,0 +1,384 @@ +# tzdb timezone descriptions +# +# This file is in the public domain. +# +# From Paul Eggert (2018-06-27): +# This file contains a table where each row stands for a timezone where +# civil timestamps have agreed since 1970. Columns are separated by +# a single tab. Lines beginning with '#' are comments. All text uses +# UTF-8 encoding. The columns of the table are as follows: +# +# 1. The countries that overlap the timezone, as a comma-separated list +# of ISO 3166 2-character country codes. See the file 'iso3166.tab'. +# 2. 
Latitude and longitude of the timezone's principal location +# in ISO 6709 sign-degrees-minutes-seconds format, +# either ±DDMM±DDDMM or ±DDMMSS±DDDMMSS, +# first latitude (+ is north), then longitude (+ is east). +# 3. Timezone name used in value of TZ environment variable. +# Please see the theory.html file for how these names are chosen. +# If multiple timezones overlap a country, each has a row in the +# table, with each column 1 containing the country code. +# 4. Comments; present if and only if a country has multiple timezones. +# +# If a timezone covers multiple countries, the most-populous city is used, +# and that country is listed first in column 1; any other countries +# are listed alphabetically by country code. The table is sorted +# first by country code, then (if possible) by an order within the +# country that (1) makes some geographical sense, and (2) puts the +# most populous timezones first, where that does not contradict (1). +# +# This table is intended as an aid for users, to help them select timezones +# appropriate for their practical needs. It is not intended to take or +# endorse any position on legal or territorial claims. +# +#country- +#codes coordinates TZ comments +AD +4230+00131 Europe/Andorra +AE,OM +2518+05518 Asia/Dubai +AF +3431+06912 Asia/Kabul +AL +4120+01950 Europe/Tirane +AM +4011+04430 Asia/Yerevan +AQ -6617+11031 Antarctica/Casey Casey +AQ -6835+07758 Antarctica/Davis Davis +AQ -6640+14001 Antarctica/DumontDUrville Dumont-d'Urville +AQ -6736+06253 Antarctica/Mawson Mawson +AQ -6448-06406 Antarctica/Palmer Palmer +AQ -6734-06808 Antarctica/Rothera Rothera +AQ -690022+0393524 Antarctica/Syowa Syowa +AQ -720041+0023206 Antarctica/Troll Troll +AQ -7824+10654 Antarctica/Vostok Vostok +AR -3436-05827 America/Argentina/Buenos_Aires Buenos Aires (BA, CF) +AR -3124-06411 America/Argentina/Cordoba Argentina (most areas: CB, CC, CN, ER, FM, MN, SE, SF) +AR -2447-06525 America/Argentina/Salta Salta (SA, LP, NQ, RN) +AR -2411-06518 America/Argentina/Jujuy Jujuy (JY) +AR -2649-06513 America/Argentina/Tucuman Tucumán (TM) +AR -2828-06547 America/Argentina/Catamarca Catamarca (CT); Chubut (CH) +AR -2926-06651 America/Argentina/La_Rioja La Rioja (LR) +AR -3132-06831 America/Argentina/San_Juan San Juan (SJ) +AR -3253-06849 America/Argentina/Mendoza Mendoza (MZ) +AR -3319-06621 America/Argentina/San_Luis San Luis (SL) +AR -5138-06913 America/Argentina/Rio_Gallegos Santa Cruz (SC) +AR -5448-06818 America/Argentina/Ushuaia Tierra del Fuego (TF) +AS,UM -1416-17042 Pacific/Pago_Pago Samoa, Midway +AT +4813+01620 Europe/Vienna +AU -3133+15905 Australia/Lord_Howe Lord Howe Island +AU -5430+15857 Antarctica/Macquarie Macquarie Island +AU -4253+14719 Australia/Hobart Tasmania (most areas) +AU -3956+14352 Australia/Currie Tasmania (King Island) +AU -3749+14458 Australia/Melbourne Victoria +AU -3352+15113 Australia/Sydney New South Wales (most areas) +AU -3157+14127 Australia/Broken_Hill New South Wales (Yancowinna) +AU -2728+15302 Australia/Brisbane Queensland (most areas) +AU -2016+14900 Australia/Lindeman Queensland (Whitsunday Islands) +AU -3455+13835 Australia/Adelaide South Australia +AU -1228+13050 Australia/Darwin Northern Territory +AU -3157+11551 Australia/Perth Western Australia (most areas) +AU -3143+12852 Australia/Eucla Western Australia (Eucla) +AZ +4023+04951 Asia/Baku +BB +1306-05937 America/Barbados +BD +2343+09025 Asia/Dhaka +BE +5050+00420 Europe/Brussels +BG +4241+02319 Europe/Sofia +BM +3217-06446 Atlantic/Bermuda +BN +0456+11455 Asia/Brunei +BO 
-1630-06809 America/La_Paz +BR -0351-03225 America/Noronha Atlantic islands +BR -0127-04829 America/Belem Pará (east); Amapá +BR -0343-03830 America/Fortaleza Brazil (northeast: MA, PI, CE, RN, PB) +BR -0803-03454 America/Recife Pernambuco +BR -0712-04812 America/Araguaina Tocantins +BR -0940-03543 America/Maceio Alagoas, Sergipe +BR -1259-03831 America/Bahia Bahia +BR -2332-04637 America/Sao_Paulo Brazil (southeast: GO, DF, MG, ES, RJ, SP, PR, SC, RS) +BR -2027-05437 America/Campo_Grande Mato Grosso do Sul +BR -1535-05605 America/Cuiaba Mato Grosso +BR -0226-05452 America/Santarem Pará (west) +BR -0846-06354 America/Porto_Velho Rondônia +BR +0249-06040 America/Boa_Vista Roraima +BR -0308-06001 America/Manaus Amazonas (east) +BR -0640-06952 America/Eirunepe Amazonas (west) +BR -0958-06748 America/Rio_Branco Acre +BS +2505-07721 America/Nassau +BT +2728+08939 Asia/Thimphu +BY +5354+02734 Europe/Minsk +BZ +1730-08812 America/Belize +CA +4734-05243 America/St_Johns Newfoundland; Labrador (southeast) +CA +4439-06336 America/Halifax Atlantic - NS (most areas); PE +CA +4612-05957 America/Glace_Bay Atlantic - NS (Cape Breton) +CA +4606-06447 America/Moncton Atlantic - New Brunswick +CA +5320-06025 America/Goose_Bay Atlantic - Labrador (most areas) +CA +5125-05707 America/Blanc-Sablon AST - QC (Lower North Shore) +CA +4339-07923 America/Toronto Eastern - ON, QC (most areas) +CA +4901-08816 America/Nipigon Eastern - ON, QC (no DST 1967-73) +CA +4823-08915 America/Thunder_Bay Eastern - ON (Thunder Bay) +CA +6344-06828 America/Iqaluit Eastern - NU (most east areas) +CA +6608-06544 America/Pangnirtung Eastern - NU (Pangnirtung) +CA +484531-0913718 America/Atikokan EST - ON (Atikokan); NU (Coral H) +CA +4953-09709 America/Winnipeg Central - ON (west); Manitoba +CA +4843-09434 America/Rainy_River Central - ON (Rainy R, Ft Frances) +CA +744144-0944945 America/Resolute Central - NU (Resolute) +CA +624900-0920459 America/Rankin_Inlet Central - NU (central) +CA +5024-10439 America/Regina CST - SK (most areas) +CA +5017-10750 America/Swift_Current CST - SK (midwest) +CA +5333-11328 America/Edmonton Mountain - AB; BC (E); SK (W) +CA +690650-1050310 America/Cambridge_Bay Mountain - NU (west) +CA +6227-11421 America/Yellowknife Mountain - NT (central) +CA +682059-1334300 America/Inuvik Mountain - NT (west) +CA +4906-11631 America/Creston MST - BC (Creston) +CA +5946-12014 America/Dawson_Creek MST - BC (Dawson Cr, Ft St John) +CA +5848-12242 America/Fort_Nelson MST - BC (Ft Nelson) +CA +4916-12307 America/Vancouver Pacific - BC (most areas) +CA +6043-13503 America/Whitehorse Pacific - Yukon (south) +CA +6404-13925 America/Dawson Pacific - Yukon (north) +CC -1210+09655 Indian/Cocos +CH,DE,LI +4723+00832 Europe/Zurich Swiss time +CI,BF,GM,GN,ML,MR,SH,SL,SN,TG +0519-00402 Africa/Abidjan +CK -2114-15946 Pacific/Rarotonga +CL -3327-07040 America/Santiago Chile (most areas) +CL -5309-07055 America/Punta_Arenas Region of Magallanes +CL -2709-10926 Pacific/Easter Easter Island +CN +3114+12128 Asia/Shanghai Beijing Time +CN +4348+08735 Asia/Urumqi Xinjiang Time +CO +0436-07405 America/Bogota +CR +0956-08405 America/Costa_Rica +CU +2308-08222 America/Havana +CV +1455-02331 Atlantic/Cape_Verde +CW,AW,BQ,SX +1211-06900 America/Curacao +CX -1025+10543 Indian/Christmas +CY +3510+03322 Asia/Nicosia Cyprus (most areas) +CY +3507+03357 Asia/Famagusta Northern Cyprus +CZ,SK +5005+01426 Europe/Prague +DE +5230+01322 Europe/Berlin Germany (most areas) +DK +5540+01235 Europe/Copenhagen +DO +1828-06954 America/Santo_Domingo +DZ 
+3647+00303 Africa/Algiers +EC -0210-07950 America/Guayaquil Ecuador (mainland) +EC -0054-08936 Pacific/Galapagos Galápagos Islands +EE +5925+02445 Europe/Tallinn +EG +3003+03115 Africa/Cairo +EH +2709-01312 Africa/El_Aaiun +ES +4024-00341 Europe/Madrid Spain (mainland) +ES +3553-00519 Africa/Ceuta Ceuta, Melilla +ES +2806-01524 Atlantic/Canary Canary Islands +FI,AX +6010+02458 Europe/Helsinki +FJ -1808+17825 Pacific/Fiji +FK -5142-05751 Atlantic/Stanley +FM +0725+15147 Pacific/Chuuk Chuuk/Truk, Yap +FM +0658+15813 Pacific/Pohnpei Pohnpei/Ponape +FM +0519+16259 Pacific/Kosrae Kosrae +FO +6201-00646 Atlantic/Faroe +FR +4852+00220 Europe/Paris +GB,GG,IM,JE +513030-0000731 Europe/London +GE +4143+04449 Asia/Tbilisi +GF +0456-05220 America/Cayenne +GH +0533-00013 Africa/Accra +GI +3608-00521 Europe/Gibraltar +GL +6411-05144 America/Godthab Greenland (most areas) +GL +7646-01840 America/Danmarkshavn National Park (east coast) +GL +7029-02158 America/Scoresbysund Scoresbysund/Ittoqqortoormiit +GL +7634-06847 America/Thule Thule/Pituffik +GR +3758+02343 Europe/Athens +GS -5416-03632 Atlantic/South_Georgia +GT +1438-09031 America/Guatemala +GU,MP +1328+14445 Pacific/Guam +GW +1151-01535 Africa/Bissau +GY +0648-05810 America/Guyana +HK +2217+11409 Asia/Hong_Kong +HN +1406-08713 America/Tegucigalpa +HT +1832-07220 America/Port-au-Prince +HU +4730+01905 Europe/Budapest +ID -0610+10648 Asia/Jakarta Java, Sumatra +ID -0002+10920 Asia/Pontianak Borneo (west, central) +ID -0507+11924 Asia/Makassar Borneo (east, south); Sulawesi/Celebes, Bali, Nusa Tengarra; Timor (west) +ID -0232+14042 Asia/Jayapura New Guinea (West Papua / Irian Jaya); Malukus/Moluccas +IE +5320-00615 Europe/Dublin +IL +314650+0351326 Asia/Jerusalem +IN +2232+08822 Asia/Kolkata +IO -0720+07225 Indian/Chagos +IQ +3321+04425 Asia/Baghdad +IR +3540+05126 Asia/Tehran +IS +6409-02151 Atlantic/Reykjavik +IT,SM,VA +4154+01229 Europe/Rome +JM +175805-0764736 America/Jamaica +JO +3157+03556 Asia/Amman +JP +353916+1394441 Asia/Tokyo +KE,DJ,ER,ET,KM,MG,SO,TZ,UG,YT -0117+03649 Africa/Nairobi +KG +4254+07436 Asia/Bishkek +KI +0125+17300 Pacific/Tarawa Gilbert Islands +KI -0308-17105 Pacific/Enderbury Phoenix Islands +KI +0152-15720 Pacific/Kiritimati Line Islands +KP +3901+12545 Asia/Pyongyang +KR +3733+12658 Asia/Seoul +KZ +4315+07657 Asia/Almaty Kazakhstan (most areas) +KZ +4448+06528 Asia/Qyzylorda Qyzylorda/Kyzylorda/Kzyl-Orda +KZ +5312+06337 Asia/Qostanay Qostanay/Kostanay/Kustanay +KZ +5017+05710 Asia/Aqtobe Aqtöbe/Aktobe +KZ +4431+05016 Asia/Aqtau Mangghystaū/Mankistau +KZ +4707+05156 Asia/Atyrau Atyraū/Atirau/Gur'yev +KZ +5113+05121 Asia/Oral West Kazakhstan +LB +3353+03530 Asia/Beirut +LK +0656+07951 Asia/Colombo +LR +0618-01047 Africa/Monrovia +LT +5441+02519 Europe/Vilnius +LU +4936+00609 Europe/Luxembourg +LV +5657+02406 Europe/Riga +LY +3254+01311 Africa/Tripoli +MA +3339-00735 Africa/Casablanca +MC +4342+00723 Europe/Monaco +MD +4700+02850 Europe/Chisinau +MH +0709+17112 Pacific/Majuro Marshall Islands (most areas) +MH +0905+16720 Pacific/Kwajalein Kwajalein +MM +1647+09610 Asia/Yangon +MN +4755+10653 Asia/Ulaanbaatar Mongolia (most areas) +MN +4801+09139 Asia/Hovd Bayan-Ölgii, Govi-Altai, Hovd, Uvs, Zavkhan +MN +4804+11430 Asia/Choibalsan Dornod, Sükhbaatar +MO +221150+1133230 Asia/Macau +MQ +1436-06105 America/Martinique +MT +3554+01431 Europe/Malta +MU -2010+05730 Indian/Mauritius +MV +0410+07330 Indian/Maldives +MX +1924-09909 America/Mexico_City Central Time +MX +2105-08646 America/Cancun Eastern Standard Time - Quintana Roo +MX 
+2058-08937 America/Merida Central Time - Campeche, Yucatán +MX +2540-10019 America/Monterrey Central Time - Durango; Coahuila, Nuevo León, Tamaulipas (most areas) +MX +2550-09730 America/Matamoros Central Time US - Coahuila, Nuevo León, Tamaulipas (US border) +MX +2313-10625 America/Mazatlan Mountain Time - Baja California Sur, Nayarit, Sinaloa +MX +2838-10605 America/Chihuahua Mountain Time - Chihuahua (most areas) +MX +2934-10425 America/Ojinaga Mountain Time US - Chihuahua (US border) +MX +2904-11058 America/Hermosillo Mountain Standard Time - Sonora +MX +3232-11701 America/Tijuana Pacific Time US - Baja California +MX +2048-10515 America/Bahia_Banderas Central Time - Bahía de Banderas +MY +0310+10142 Asia/Kuala_Lumpur Malaysia (peninsula) +MY +0133+11020 Asia/Kuching Sabah, Sarawak +MZ,BI,BW,CD,MW,RW,ZM,ZW -2558+03235 Africa/Maputo Central Africa Time +NA -2234+01706 Africa/Windhoek +NC -2216+16627 Pacific/Noumea +NF -2903+16758 Pacific/Norfolk +NG,AO,BJ,CD,CF,CG,CM,GA,GQ,NE +0627+00324 Africa/Lagos West Africa Time +NI +1209-08617 America/Managua +NL +5222+00454 Europe/Amsterdam +NO,SJ +5955+01045 Europe/Oslo +NP +2743+08519 Asia/Kathmandu +NR -0031+16655 Pacific/Nauru +NU -1901-16955 Pacific/Niue +NZ,AQ -3652+17446 Pacific/Auckland New Zealand time +NZ -4357-17633 Pacific/Chatham Chatham Islands +PA,KY +0858-07932 America/Panama +PE -1203-07703 America/Lima +PF -1732-14934 Pacific/Tahiti Society Islands +PF -0900-13930 Pacific/Marquesas Marquesas Islands +PF -2308-13457 Pacific/Gambier Gambier Islands +PG -0930+14710 Pacific/Port_Moresby Papua New Guinea (most areas) +PG -0613+15534 Pacific/Bougainville Bougainville +PH +1435+12100 Asia/Manila +PK +2452+06703 Asia/Karachi +PL +5215+02100 Europe/Warsaw +PM +4703-05620 America/Miquelon +PN -2504-13005 Pacific/Pitcairn +PR +182806-0660622 America/Puerto_Rico +PS +3130+03428 Asia/Gaza Gaza Strip +PS +313200+0350542 Asia/Hebron West Bank +PT +3843-00908 Europe/Lisbon Portugal (mainland) +PT +3238-01654 Atlantic/Madeira Madeira Islands +PT +3744-02540 Atlantic/Azores Azores +PW +0720+13429 Pacific/Palau +PY -2516-05740 America/Asuncion +QA,BH +2517+05132 Asia/Qatar +RE,TF -2052+05528 Indian/Reunion Réunion, Crozet, Scattered Islands +RO +4426+02606 Europe/Bucharest +RS,BA,HR,ME,MK,SI +4450+02030 Europe/Belgrade +RU +5443+02030 Europe/Kaliningrad MSK-01 - Kaliningrad +RU +554521+0373704 Europe/Moscow MSK+00 - Moscow area +# Mention RU and UA alphabetically. See "territorial claims" above. 
+RU,UA +4457+03406 Europe/Simferopol MSK+00 - Crimea +RU +5836+04939 Europe/Kirov MSK+00 - Kirov +RU +4621+04803 Europe/Astrakhan MSK+01 - Astrakhan +RU +4844+04425 Europe/Volgograd MSK+01 - Volgograd +RU +5134+04602 Europe/Saratov MSK+01 - Saratov +RU +5420+04824 Europe/Ulyanovsk MSK+01 - Ulyanovsk +RU +5312+05009 Europe/Samara MSK+01 - Samara, Udmurtia +RU +5651+06036 Asia/Yekaterinburg MSK+02 - Urals +RU +5500+07324 Asia/Omsk MSK+03 - Omsk +RU +5502+08255 Asia/Novosibirsk MSK+04 - Novosibirsk +RU +5322+08345 Asia/Barnaul MSK+04 - Altai +RU +5630+08458 Asia/Tomsk MSK+04 - Tomsk +RU +5345+08707 Asia/Novokuznetsk MSK+04 - Kemerovo +RU +5601+09250 Asia/Krasnoyarsk MSK+04 - Krasnoyarsk area +RU +5216+10420 Asia/Irkutsk MSK+05 - Irkutsk, Buryatia +RU +5203+11328 Asia/Chita MSK+06 - Zabaykalsky +RU +6200+12940 Asia/Yakutsk MSK+06 - Lena River +RU +623923+1353314 Asia/Khandyga MSK+06 - Tomponsky, Ust-Maysky +RU +4310+13156 Asia/Vladivostok MSK+07 - Amur River +RU +643337+1431336 Asia/Ust-Nera MSK+07 - Oymyakonsky +RU +5934+15048 Asia/Magadan MSK+08 - Magadan +RU +4658+14242 Asia/Sakhalin MSK+08 - Sakhalin Island +RU +6728+15343 Asia/Srednekolymsk MSK+08 - Sakha (E); North Kuril Is +RU +5301+15839 Asia/Kamchatka MSK+09 - Kamchatka +RU +6445+17729 Asia/Anadyr MSK+09 - Bering Sea +SA,KW,YE +2438+04643 Asia/Riyadh +SB -0932+16012 Pacific/Guadalcanal +SC -0440+05528 Indian/Mahe +SD +1536+03232 Africa/Khartoum +SE +5920+01803 Europe/Stockholm +SG +0117+10351 Asia/Singapore +SR +0550-05510 America/Paramaribo +SS +0451+03137 Africa/Juba +ST +0020+00644 Africa/Sao_Tome +SV +1342-08912 America/El_Salvador +SY +3330+03618 Asia/Damascus +TC +2128-07108 America/Grand_Turk +TD +1207+01503 Africa/Ndjamena +TF -492110+0701303 Indian/Kerguelen Kerguelen, St Paul Island, Amsterdam Island +TH,KH,LA,VN +1345+10031 Asia/Bangkok Indochina (most areas) +TJ +3835+06848 Asia/Dushanbe +TK -0922-17114 Pacific/Fakaofo +TL -0833+12535 Asia/Dili +TM +3757+05823 Asia/Ashgabat +TN +3648+01011 Africa/Tunis +TO -2110-17510 Pacific/Tongatapu +TR +4101+02858 Europe/Istanbul +TT,AG,AI,BL,DM,GD,GP,KN,LC,MF,MS,VC,VG,VI +1039-06131 America/Port_of_Spain +TV -0831+17913 Pacific/Funafuti +TW +2503+12130 Asia/Taipei +UA +5026+03031 Europe/Kiev Ukraine (most areas) +UA +4837+02218 Europe/Uzhgorod Ruthenia +UA +4750+03510 Europe/Zaporozhye Zaporozh'ye/Zaporizhia; Lugansk/Luhansk (east) +UM +1917+16637 Pacific/Wake Wake Island +US +404251-0740023 America/New_York Eastern (most areas) +US +421953-0830245 America/Detroit Eastern - MI (most areas) +US +381515-0854534 America/Kentucky/Louisville Eastern - KY (Louisville area) +US +364947-0845057 America/Kentucky/Monticello Eastern - KY (Wayne) +US +394606-0860929 America/Indiana/Indianapolis Eastern - IN (most areas) +US +384038-0873143 America/Indiana/Vincennes Eastern - IN (Da, Du, K, Mn) +US +410305-0863611 America/Indiana/Winamac Eastern - IN (Pulaski) +US +382232-0862041 America/Indiana/Marengo Eastern - IN (Crawford) +US +382931-0871643 America/Indiana/Petersburg Eastern - IN (Pike) +US +384452-0850402 America/Indiana/Vevay Eastern - IN (Switzerland) +US +415100-0873900 America/Chicago Central (most areas) +US +375711-0864541 America/Indiana/Tell_City Central - IN (Perry) +US +411745-0863730 America/Indiana/Knox Central - IN (Starke) +US +450628-0873651 America/Menominee Central - MI (Wisconsin border) +US +470659-1011757 America/North_Dakota/Center Central - ND (Oliver) +US +465042-1012439 America/North_Dakota/New_Salem Central - ND (Morton rural) +US +471551-1014640 
America/North_Dakota/Beulah Central - ND (Mercer) +US +394421-1045903 America/Denver Mountain (most areas) +US +433649-1161209 America/Boise Mountain - ID (south); OR (east) +US +332654-1120424 America/Phoenix MST - Arizona (except Navajo) +US +340308-1181434 America/Los_Angeles Pacific +US +611305-1495401 America/Anchorage Alaska (most areas) +US +581807-1342511 America/Juneau Alaska - Juneau area +US +571035-1351807 America/Sitka Alaska - Sitka area +US +550737-1313435 America/Metlakatla Alaska - Annette Island +US +593249-1394338 America/Yakutat Alaska - Yakutat +US +643004-1652423 America/Nome Alaska (west) +US +515248-1763929 America/Adak Aleutian Islands +US,UM +211825-1575130 Pacific/Honolulu Hawaii +UY -345433-0561245 America/Montevideo +UZ +3940+06648 Asia/Samarkand Uzbekistan (west) +UZ +4120+06918 Asia/Tashkent Uzbekistan (east) +VE +1030-06656 America/Caracas +VN +1045+10640 Asia/Ho_Chi_Minh Vietnam (south) +VU -1740+16825 Pacific/Efate +WF -1318-17610 Pacific/Wallis +WS -1350-17144 Pacific/Apia +ZA,LS,SZ -2615+02800 Africa/Johannesburg diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/INSTALLER b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/INSTALLER new file mode 100644 index 0000000..a1b589e --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/INSTALLER @@ -0,0 +1 @@ +pip diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/LICENSE b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/LICENSE new file mode 100644 index 0000000..54acf3c --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/LICENSE @@ -0,0 +1,76 @@ +There are two licenses associated with xlrd. This one relates to the bulk of +the work done on the library:: + + Portions copyright © 2005-2009, Stephen John Machin, Lingfo Pty Ltd + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. None of the names of Stephen John Machin, Lingfo Pty Ltd and any + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. + +This one covers some earlier work:: + + /*- + * Copyright (c) 2001 David Giffin. + * All rights reserved. + * + * Based on the the Java version: Andrew Khan Copyright (c) 2000. 
+ * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by + * David Giffin ." + * + * 4. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by + * David Giffin ." + * + * THIS SOFTWARE IS PROVIDED BY DAVID GIFFIN ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVID GIFFIN OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/METADATA b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/METADATA new file mode 100644 index 0000000..2deef97 --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/METADATA @@ -0,0 +1,30 @@ +Metadata-Version: 2.1 +Name: xlrd +Version: 1.2.0 +Summary: Library for developers to extract data from Microsoft Excel (tm) spreadsheet files +Home-page: http://www.python-excel.org/ +Author: John Machin +Author-email: sjmachin@lexicon.net +License: BSD +Keywords: xls,excel,spreadsheet,workbook +Platform: Any platform -- don't need Windows +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Operating System :: OS Independent +Classifier: Topic :: Database +Classifier: Topic :: Office/Business +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.* + +Extract data from Excel spreadsheets (.xls and .xlsx, versions 2.0 onwards) on any platform. Pure Python (2.7, 3.4+). Strong support for Excel dates. Unicode-aware. 
+ + diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/RECORD b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/RECORD new file mode 100644 index 0000000..1d57417 --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/RECORD @@ -0,0 +1,30 @@ +xlrd/__init__.py,sha256=P29xU7eI4XcTWtxDcbbxfdhcN9NELnYxZ-5-w9TzjW0,6491 +xlrd/biffh.py,sha256=GjtBoI9PNIEdbZmGTvfDQ8HuhZiLJd2ofdo2qmunzmA,16651 +xlrd/book.py,sha256=FSEuCDloH5uwn7h_GPxiv9t6UQcWH7KS2zXJjiBUaYA,57073 +xlrd/compdoc.py,sha256=kEGiURvnuO5lAqNzdkGh01Cqm5ApALyH_R4RDlg9800,20924 +xlrd/formatting.py,sha256=hJC4oFTyf1SGX-nSzYrwe1ry1v1dqHRQTb5C06dDHUs,45546 +xlrd/formula.py,sha256=VskzjScNbPKqDxOa_wOmGZ2u_5xac3lqFgDM0dTGJ30,94455 +xlrd/info.py,sha256=PDnD1B_6qWNNA4JHKdGzGrSQkzgDKclnU69akbpVqLI,36 +xlrd/sheet.py,sha256=Aupht6xtk_U_SN1k2jyPHb--HWyRAfXYxstZBp5weyw,106146 +xlrd/timemachine.py,sha256=7CtU7FY6mQW7HSP5FjM1K9pIfNexVCt_PJXG2M3FEAc,1757 +xlrd/xldate.py,sha256=S-uUaLX8kauGy2SkX_M-S-FBqqQ6H9EfTAVpv5FMX0I,7934 +xlrd/xlsx.py,sha256=TEaj4-zMFkMPz12FixTGjPTPIyC9Iu6rqh-3hrvguLw,33527 +../../Scripts/runxlrd.py,sha256=_UVlzMIXl_JqoBrlSBqzGkwUn0A-bIhB4ZC77IzbOHw,16274 +xlrd-1.2.0.dist-info/LICENSE,sha256=taXbzmAmXjBagVpsuD7QfyRRnYumRPKjB5lEiLztiBU,3771 +xlrd-1.2.0.dist-info/METADATA,sha256=pOF_QooL1wz3ajxkUAAiTidMfIFxt1eXEtg6XcvliTg,1288 +xlrd-1.2.0.dist-info/WHEEL,sha256=_wJFdOYk7i3xxT8ElOkUJvOdOvfNGbR9g-bf6UQT6sU,110 +xlrd-1.2.0.dist-info/top_level.txt,sha256=nZ-t3Sc_CqNsctS3L-V36qXwC5DuAFvkB1inJApF_vM,5 +xlrd-1.2.0.dist-info/RECORD,, +xlrd-1.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +xlrd/__pycache__/biffh.cpython-37.pyc,, +xlrd/__pycache__/book.cpython-37.pyc,, +xlrd/__pycache__/compdoc.cpython-37.pyc,, +xlrd/__pycache__/formatting.cpython-37.pyc,, +xlrd/__pycache__/formula.cpython-37.pyc,, +xlrd/__pycache__/info.cpython-37.pyc,, +xlrd/__pycache__/sheet.cpython-37.pyc,, +xlrd/__pycache__/timemachine.cpython-37.pyc,, +xlrd/__pycache__/xldate.cpython-37.pyc,, +xlrd/__pycache__/xlsx.cpython-37.pyc,, +xlrd/__pycache__/__init__.cpython-37.pyc,, +../../Scripts/__pycache__/runxlrd.cpython-37.pyc,, diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/WHEEL b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/WHEEL new file mode 100644 index 0000000..c4bde30 --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.32.3) +Root-Is-Purelib: true +Tag: py2-none-any +Tag: py3-none-any + diff --git a/venv/Lib/site-packages/xlrd-1.2.0.dist-info/top_level.txt b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/top_level.txt new file mode 100644 index 0000000..1a6410a --- /dev/null +++ b/venv/Lib/site-packages/xlrd-1.2.0.dist-info/top_level.txt @@ -0,0 +1 @@ +xlrd diff --git a/venv/Lib/site-packages/xlrd/__init__.py b/venv/Lib/site-packages/xlrd/__init__.py new file mode 100644 index 0000000..078c211 --- /dev/null +++ b/venv/Lib/site-packages/xlrd/__init__.py @@ -0,0 +1,187 @@ +# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. +import os +import pprint +import sys +import zipfile + +from . 
import timemachine +from .biffh import ( + XL_CELL_BLANK, XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_EMPTY, XL_CELL_ERROR, + XL_CELL_NUMBER, XL_CELL_TEXT, XLRDError, biff_text_from_num, + error_text_from_code, +) +from .book import Book, colname +from .formula import * # is constrained by __all__ +from .info import __VERSION__, __version__ +from .sheet import empty_cell +from .xldate import XLDateError, xldate_as_datetime, xldate_as_tuple +from .xlsx import X12Book + +if sys.version.startswith("IronPython"): + # print >> sys.stderr, "...importing encodings" + import encodings + +try: + import mmap + MMAP_AVAILABLE = 1 +except ImportError: + MMAP_AVAILABLE = 0 +USE_MMAP = MMAP_AVAILABLE + +def open_workbook(filename=None, + logfile=sys.stdout, + verbosity=0, + use_mmap=USE_MMAP, + file_contents=None, + encoding_override=None, + formatting_info=False, + on_demand=False, + ragged_rows=False): + """ + Open a spreadsheet file for data extraction. + + :param filename: The path to the spreadsheet file to be opened. + + :param logfile: An open file to which messages and diagnostics are written. + + :param verbosity: Increases the volume of trace material written to the + logfile. + + :param use_mmap: + + Whether to use the mmap module is determined heuristically. + Use this arg to override the result. + + Current heuristic: mmap is used if it exists. + + :param file_contents: + + A string or an :class:`mmap.mmap` object or some other behave-alike + object. If ``file_contents`` is supplied, ``filename`` will not be used, + except (possibly) in messages. + + :param encoding_override: + + Used to overcome missing or bad codepage information + in older-version files. See :doc:`unicode`. + + :param formatting_info: + + The default is ``False``, which saves memory. + In this case, "Blank" cells, which are those with their own formatting + information but no data, are treated as empty by ignoring the file's + ``BLANK`` and ``MULBLANK`` records. + This cuts off any bottom or right "margin" of rows of empty or blank + cells. + Only :meth:`~xlrd.sheet.Sheet.cell_value` and + :meth:`~xlrd.sheet.Sheet.cell_type` are available. + + When ``True``, formatting information will be read from the spreadsheet + file. This provides all cells, including empty and blank cells. + Formatting information is available for each cell. + + Note that this will raise a NotImplementedError when used with an + xlsx file. + + :param on_demand: + + Governs whether sheets are all loaded initially or when demanded + by the caller. See :doc:`on_demand`. + + :param ragged_rows: + + The default of ``False`` means all rows are padded out with empty cells so + that all rows have the same size as found in + :attr:`~xlrd.sheet.Sheet.ncols`. + + ``True`` means that there are no empty cells at the ends of rows. + This can result in substantial memory savings if rows are of widely + varying sizes. See also the :meth:`~xlrd.sheet.Sheet.row_len` method. + + :returns: An instance of the :class:`~xlrd.book.Book` class. + """ + + peeksz = 4 + if file_contents: + peek = file_contents[:peeksz] + else: + filename = os.path.expanduser(filename) + with open(filename, "rb") as f: + peek = f.read(peeksz) + if peek == b"PK\x03\x04": # a ZIP file + if file_contents: + zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents)) + else: + zf = zipfile.ZipFile(filename) + + # Workaround for some third party files that use forward slashes and + # lower case names. We map the expected name in lowercase to the + # actual filename in the zip container. 
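# === Editorial usage sketch (added in review; not part of the vendored xlrd diff) ===
# A minimal example of calling the open_workbook() API documented above from the
# project's own code. Only the file name 'nCoV_data.xls' is a hypothetical
# placeholder; every function and attribute used here is defined in this module or
# documented further down in this diff.
import xlrd

book = xlrd.open_workbook('nCoV_data.xls', on_demand=True, ragged_rows=True)
print(book.nsheets, book.sheet_names())   # sheet count/names are available without loading sheets
first = book.sheet_by_index(0)            # loaded lazily because on_demand=True
print(first.nrows, first.ncols)
book.release_resources()                  # release the mmap/byte string once finished
# =====================================================================================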
+ component_names = dict([(X12Book.convert_filename(name), name) + for name in zf.namelist()]) + + if verbosity: + logfile.write('ZIP component_names:\n') + pprint.pprint(component_names, logfile) + if 'xl/workbook.xml' in component_names: + from . import xlsx + bk = xlsx.open_workbook_2007_xml( + zf, + component_names, + logfile=logfile, + verbosity=verbosity, + use_mmap=use_mmap, + formatting_info=formatting_info, + on_demand=on_demand, + ragged_rows=ragged_rows, + ) + return bk + if 'xl/workbook.bin' in component_names: + raise XLRDError('Excel 2007 xlsb file; not supported') + if 'content.xml' in component_names: + raise XLRDError('Openoffice.org ODS file; not supported') + raise XLRDError('ZIP file contents not a known type of workbook') + + from . import book + bk = book.open_workbook_xls( + filename=filename, + logfile=logfile, + verbosity=verbosity, + use_mmap=use_mmap, + file_contents=file_contents, + encoding_override=encoding_override, + formatting_info=formatting_info, + on_demand=on_demand, + ragged_rows=ragged_rows, + ) + return bk + + +def dump(filename, outfile=sys.stdout, unnumbered=False): + """ + For debugging: dump an XLS file's BIFF records in char & hex. + + :param filename: The path to the file to be dumped. + :param outfile: An open file, to which the dump is written. + :param unnumbered: If true, omit offsets (for meaningful diffs). + """ + from .biffh import biff_dump + bk = Book() + bk.biff2_8_load(filename=filename, logfile=outfile, ) + biff_dump(bk.mem, bk.base, bk.stream_len, 0, outfile, unnumbered) + + +def count_records(filename, outfile=sys.stdout): + """ + For debugging and analysis: summarise the file's BIFF records. + ie: produce a sorted file of ``(record_name, count)``. + + :param filename: The path to the file to be summarised. + :param outfile: An open file, to which the summary is written. + """ + from .biffh import biff_count_records + bk = Book() + bk.biff2_8_load(filename=filename, logfile=outfile, ) + biff_count_records(bk.mem, bk.base, bk.stream_len, outfile) diff --git a/venv/Lib/site-packages/xlrd/biffh.py b/venv/Lib/site-packages/xlrd/biffh.py new file mode 100644 index 0000000..07ac629 --- /dev/null +++ b/venv/Lib/site-packages/xlrd/biffh.py @@ -0,0 +1,643 @@ +# -*- coding: utf-8 -*- +# Portions copyright © 2005-2010 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. +from __future__ import print_function + +import sys +from struct import unpack + +from .timemachine import * + +DEBUG = 0 + + + +class XLRDError(Exception): + """ + An exception indicating problems reading data from an Excel file. + """ + + +class BaseObject(object): + """ + Parent of almost all other classes in the package. Defines a common + :meth:`dump` method for debugging. 
+ """ + + _repr_these = [] + + + def dump(self, f=None, header=None, footer=None, indent=0): + """ + :param f: open file object, to which the dump is written + :param header: text to write before the dump + :param footer: text to write after the dump + :param indent: number of leading spaces (for recursive calls) + """ + if f is None: + f = sys.stderr + if hasattr(self, "__slots__"): + alist = [] + for attr in self.__slots__: + alist.append((attr, getattr(self, attr))) + else: + alist = self.__dict__.items() + alist = sorted(alist) + pad = " " * indent + if header is not None: print(header, file=f) + list_type = type([]) + dict_type = type({}) + for attr, value in alist: + if getattr(value, 'dump', None) and attr != 'book': + value.dump(f, + header="%s%s (%s object):" % (pad, attr, value.__class__.__name__), + indent=indent+4) + elif (attr not in self._repr_these and + (isinstance(value, list_type) or isinstance(value, dict_type))): + print("%s%s: %s, len = %d" % (pad, attr, type(value), len(value)), file=f) + else: + fprintf(f, "%s%s: %r\n", pad, attr, value) + if footer is not None: print(footer, file=f) + +FUN, FDT, FNU, FGE, FTX = range(5) # unknown, date, number, general, text +DATEFORMAT = FDT +NUMBERFORMAT = FNU + +( + XL_CELL_EMPTY, + XL_CELL_TEXT, + XL_CELL_NUMBER, + XL_CELL_DATE, + XL_CELL_BOOLEAN, + XL_CELL_ERROR, + XL_CELL_BLANK, # for use in debugging, gathering stats, etc +) = range(7) + +biff_text_from_num = { + 0: "(not BIFF)", + 20: "2.0", + 21: "2.1", + 30: "3", + 40: "4S", + 45: "4W", + 50: "5", + 70: "7", + 80: "8", + 85: "8X", +} + +#: This dictionary can be used to produce a text version of the internal codes +#: that Excel uses for error cells. +error_text_from_code = { + 0x00: '#NULL!', # Intersection of two cell ranges is empty + 0x07: '#DIV/0!', # Division by zero + 0x0F: '#VALUE!', # Wrong type of operand + 0x17: '#REF!', # Illegal or deleted cell reference + 0x1D: '#NAME?', # Wrong function or range name + 0x24: '#NUM!', # Value range overflow + 0x2A: '#N/A', # Argument or function not available +} + +BIFF_FIRST_UNICODE = 80 + +XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x5 +XL_WORKBOOK_GLOBALS_4W = 0x100 +XL_WORKSHEET = WRKSHEET = 0x10 + +XL_BOUNDSHEET_WORKSHEET = 0x00 +XL_BOUNDSHEET_CHART = 0x02 +XL_BOUNDSHEET_VB_MODULE = 0x06 + +# XL_RK2 = 0x7e +XL_ARRAY = 0x0221 +XL_ARRAY2 = 0x0021 +XL_BLANK = 0x0201 +XL_BLANK_B2 = 0x01 +XL_BOF = 0x809 +XL_BOOLERR = 0x205 +XL_BOOLERR_B2 = 0x5 +XL_BOUNDSHEET = 0x85 +XL_BUILTINFMTCOUNT = 0x56 +XL_CF = 0x01B1 +XL_CODEPAGE = 0x42 +XL_COLINFO = 0x7D +XL_COLUMNDEFAULT = 0x20 # BIFF2 only +XL_COLWIDTH = 0x24 # BIFF2 only +XL_CONDFMT = 0x01B0 +XL_CONTINUE = 0x3c +XL_COUNTRY = 0x8C +XL_DATEMODE = 0x22 +XL_DEFAULTROWHEIGHT = 0x0225 +XL_DEFCOLWIDTH = 0x55 +XL_DIMENSION = 0x200 +XL_DIMENSION2 = 0x0 +XL_EFONT = 0x45 +XL_EOF = 0x0a +XL_EXTERNNAME = 0x23 +XL_EXTERNSHEET = 0x17 +XL_EXTSST = 0xff +XL_FEAT11 = 0x872 +XL_FILEPASS = 0x2f +XL_FONT = 0x31 +XL_FONT_B3B4 = 0x231 +XL_FORMAT = 0x41e +XL_FORMAT2 = 0x1E # BIFF2, BIFF3 +XL_FORMULA = 0x6 +XL_FORMULA3 = 0x206 +XL_FORMULA4 = 0x406 +XL_GCW = 0xab +XL_HLINK = 0x01B8 +XL_QUICKTIP = 0x0800 +XL_HORIZONTALPAGEBREAKS = 0x1b +XL_INDEX = 0x20b +XL_INTEGER = 0x2 # BIFF2 only +XL_IXFE = 0x44 # BIFF2 only +XL_LABEL = 0x204 +XL_LABEL_B2 = 0x04 +XL_LABELRANGES = 0x15f +XL_LABELSST = 0xfd +XL_LEFTMARGIN = 0x26 +XL_TOPMARGIN = 0x28 +XL_RIGHTMARGIN = 0x27 +XL_BOTTOMMARGIN = 0x29 +XL_HEADER = 0x14 +XL_FOOTER = 0x15 +XL_HCENTER = 0x83 +XL_VCENTER = 0x84 +XL_MERGEDCELLS = 0xE5 +XL_MSO_DRAWING = 0x00EC 
+XL_MSO_DRAWING_GROUP = 0x00EB +XL_MSO_DRAWING_SELECTION = 0x00ED +XL_MULRK = 0xbd +XL_MULBLANK = 0xbe +XL_NAME = 0x18 +XL_NOTE = 0x1c +XL_NUMBER = 0x203 +XL_NUMBER_B2 = 0x3 +XL_OBJ = 0x5D +XL_PAGESETUP = 0xA1 +XL_PALETTE = 0x92 +XL_PANE = 0x41 +XL_PRINTGRIDLINES = 0x2B +XL_PRINTHEADERS = 0x2A +XL_RK = 0x27e +XL_ROW = 0x208 +XL_ROW_B2 = 0x08 +XL_RSTRING = 0xd6 +XL_SCL = 0x00A0 +XL_SHEETHDR = 0x8F # BIFF4W only +XL_SHEETPR = 0x81 +XL_SHEETSOFFSET = 0x8E # BIFF4W only +XL_SHRFMLA = 0x04bc +XL_SST = 0xfc +XL_STANDARDWIDTH = 0x99 +XL_STRING = 0x207 +XL_STRING_B2 = 0x7 +XL_STYLE = 0x293 +XL_SUPBOOK = 0x1AE # aka EXTERNALBOOK in OOo docs +XL_TABLEOP = 0x236 +XL_TABLEOP2 = 0x37 +XL_TABLEOP_B2 = 0x36 +XL_TXO = 0x1b6 +XL_UNCALCED = 0x5e +XL_UNKNOWN = 0xffff +XL_VERTICALPAGEBREAKS = 0x1a +XL_WINDOW2 = 0x023E +XL_WINDOW2_B2 = 0x003E +XL_WRITEACCESS = 0x5C +XL_WSBOOL = XL_SHEETPR +XL_XF = 0xe0 +XL_XF2 = 0x0043 # BIFF2 version of XF record +XL_XF3 = 0x0243 # BIFF3 version of XF record +XL_XF4 = 0x0443 # BIFF4 version of XF record + +boflen = {0x0809: 8, 0x0409: 6, 0x0209: 6, 0x0009: 4} +bofcodes = (0x0809, 0x0409, 0x0209, 0x0009) + +XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206) + +_cell_opcode_list = [ + XL_BOOLERR, + XL_FORMULA, + XL_FORMULA3, + XL_FORMULA4, + XL_LABEL, + XL_LABELSST, + XL_MULRK, + XL_NUMBER, + XL_RK, + XL_RSTRING, +] +_cell_opcode_dict = {} +for _cell_opcode in _cell_opcode_list: + _cell_opcode_dict[_cell_opcode] = 1 + +def is_cell_opcode(c): + return c in _cell_opcode_dict + +def upkbits(tgt_obj, src, manifest, local_setattr=setattr): + for n, mask, attr in manifest: + local_setattr(tgt_obj, attr, (src & mask) >> n) + +def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int): + for n, mask, attr in manifest: + local_setattr(tgt_obj, attr, local_int((src & mask) >> n)) + +def unpack_string(data, pos, encoding, lenlen=1): + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + pos += lenlen + return unicode(data[pos:pos+nchars], encoding) + +def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None): + if known_len is not None: + # On a NAME record, the length byte is detached from the front of the string. + nchars = known_len + else: + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + pos += lenlen + newpos = pos + nchars + return (unicode(data[pos:newpos], encoding), newpos) + +def unpack_unicode(data, pos, lenlen=2): + "Return unicode_strg" + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + if not nchars: + # Ambiguous whether 0-length string should have an "options" byte. + # Avoid crash if missing. + return UNICODE_LITERAL("") + pos += lenlen + options = BYTES_ORD(data[pos]) + pos += 1 + # phonetic = options & 0x04 + # richtext = options & 0x08 + if options & 0x08: + # rt = unpack(' endpos=%d pos=%d endsub=%d substrg=%r\n', + ofs, dlen, base, endpos, pos, endsub, substrg) + break + hexd = ''.join("%02x " % BYTES_ORD(c) for c in substrg) + + chard = '' + for c in substrg: + c = chr(BYTES_ORD(c)) + if c == '\0': + c = '~' + elif not (' ' <= c <= '~'): + c = '?' 
+ chard += c + if numbered: + num_prefix = "%5d: " % (base+pos-ofs) + + fprintf(fout, "%s %-48s %s\n", num_prefix, hexd, chard) + pos = endsub + +def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout, unnumbered=False): + pos = stream_offset + stream_end = stream_offset + stream_len + adj = base - stream_offset + dummies = 0 + numbered = not unnumbered + num_prefix = '' + while stream_end - pos >= 4: + rc, length = unpack('') + if numbered: + num_prefix = "%5d: " % (adj + pos) + fprintf(fout, "%s%04x %s len = %04x (%d)\n", num_prefix, rc, recname, length, length) + pos += 4 + hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered) + pos += length + if dummies: + if numbered: + num_prefix = "%5d: " % (adj + savpos) + fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies) + if pos < stream_end: + if numbered: + num_prefix = "%5d: " % (adj + pos) + fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix) + hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered) + elif pos > stream_end: + fprintf(fout, "Last dumped record has length (%d) that is too large\n", length) + +def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout): + pos = stream_offset + stream_end = stream_offset + stream_len + tally = {} + while stream_end - pos >= 4: + rc, length = unpack(' 1: + fprintf( + bk.logfile, + "*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n" + "*** Book-level data will be that of the last worksheet.\n", + bk.nsheets + ) + if TOGGLE_GC: + if orig_gc_enabled: + gc.enable() + t2 = perf_counter() + bk.load_time_stage_2 = t2 - t1 + except: + bk.release_resources() + raise + # normal exit + if not on_demand: + bk.release_resources() + return bk + + +class Name(BaseObject): + """ + Information relating to a named reference, formula, macro, etc. + + .. note:: + + Name information is **not** extracted from files older than + Excel 5.0 (``Book.biff_version < 50``) + """ + _repr_these = ['stack'] + book = None # parent + + #: 0 = Visible; 1 = Hidden + hidden = 0 + + #: 0 = Command macro; 1 = Function macro. Relevant only if macro == 1 + func = 0 + + #: 0 = Sheet macro; 1 = VisualBasic macro. Relevant only if macro == 1 + vbasic = 0 + + #: 0 = Standard name; 1 = Macro name + macro = 0 + + #: 0 = Simple formula; 1 = Complex formula (array formula or user defined). + #: + #: .. note:: No examples have been sighted. + complex = 0 + + #: 0 = User-defined name; 1 = Built-in name + #: + #: Common examples: ``Print_Area``, ``Print_Titles``; see OOo docs for + #: full list + builtin = 0 + + #: Function group. Relevant only if macro == 1; see OOo docs for values. + funcgroup = 0 + + #: 0 = Formula definition; 1 = Binary data + #: + #: .. note:: No examples have been sighted. + binary = 0 + + #: The index of this object in book.name_obj_list + name_index = 0 + + # A Unicode string. If builtin, decoded as per OOo docs. + name = UNICODE_LITERAL("") + + #: An 8-bit string. + raw_formula = b'' + + #: ``-1``: + #: The name is global (visible in all calculation sheets). + #: ``-2``: + #: The name belongs to a macro sheet or VBA sheet. + #: ``-3``: + #: The name is invalid. + #: ``0 <= scope < book.nsheets``: + #: The name is local to the sheet whose index is scope. + scope = -1 + + #: The result of evaluating the formula, if any. + #: If no formula, or evaluation of the formula encountered problems, + #: the result is ``None``. Otherwise the result is a single instance of the + #: :class:`~xlrd.formula.Operand` class. 
+ # + result = None + + def cell(self): + """ + This is a convenience method for the frequent use case where the name + refers to a single cell. + + :returns: An instance of the :class:`~xlrd.sheet.Cell` class. + + :raises xlrd.biffh.XLRDError: + The name is not a constant absolute reference + to a single cell. + """ + res = self.result + if res: + # result should be an instance of the Operand class + kind = res.kind + value = res.value + if kind == oREF and len(value) == 1: + ref3d = value[0] + if (0 <= ref3d.shtxlo == ref3d.shtxhi - 1 and + ref3d.rowxlo == ref3d.rowxhi - 1 and + ref3d.colxlo == ref3d.colxhi - 1): + sh = self.book.sheet_by_index(ref3d.shtxlo) + return sh.cell(ref3d.rowxlo, ref3d.colxlo) + self.dump( + self.book.logfile, + header="=== Dump of Name object ===", + footer="======= End of dump =======", + ) + raise XLRDError("Not a constant absolute reference to a single cell") + + def area2d(self, clipped=True): + """ + This is a convenience method for the use case where the name + refers to one rectangular area in one worksheet. + + :param clipped: + If ``True``, the default, the returned rectangle is clipped + to fit in ``(0, sheet.nrows, 0, sheet.ncols)``. + it is guaranteed that ``0 <= rowxlo <= rowxhi <= sheet.nrows`` and + that the number of usable rows in the area (which may be zero) is + ``rowxhi - rowxlo``; likewise for columns. + + :returns: a tuple ``(sheet_object, rowxlo, rowxhi, colxlo, colxhi)``. + + :raises xlrd.biffh.XLRDError: + The name is not a constant absolute reference + to a single area in a single sheet. + """ + res = self.result + if res: + # result should be an instance of the Operand class + kind = res.kind + value = res.value + if kind == oREF and len(value) == 1: # only 1 reference + ref3d = value[0] + if 0 <= ref3d.shtxlo == ref3d.shtxhi - 1: # only 1 usable sheet + sh = self.book.sheet_by_index(ref3d.shtxlo) + if not clipped: + return sh, ref3d.rowxlo, ref3d.rowxhi, ref3d.colxlo, ref3d.colxhi + rowxlo = min(ref3d.rowxlo, sh.nrows) + rowxhi = max(rowxlo, min(ref3d.rowxhi, sh.nrows)) + colxlo = min(ref3d.colxlo, sh.ncols) + colxhi = max(colxlo, min(ref3d.colxhi, sh.ncols)) + assert 0 <= rowxlo <= rowxhi <= sh.nrows + assert 0 <= colxlo <= colxhi <= sh.ncols + return sh, rowxlo, rowxhi, colxlo, colxhi + self.dump( + self.book.logfile, + header="=== Dump of Name object ===", + footer="======= End of dump =======", + ) + raise XLRDError("Not a constant absolute reference to a single area in a single sheet") + + +class Book(BaseObject): + """ + Contents of a "workbook". + + .. warning:: + + You should not instantiate this class yourself. You use the :class:`Book` + object that was returned when you called :func:`~xlrd.open_workbook`. + """ + + #: The number of worksheets present in the workbook file. + #: This information is available even when no sheets have yet been loaded. + nsheets = 0 + + #: Which date system was in force when this file was last saved. + #: + #: 0: + #: 1900 system (the Excel for Windows default). + #: + #: 1: + #: 1904 system (the Excel for Macintosh default). + #: + #: Defaults to 0 in case it's not specified in the file. + datemode = 0 + + #: Version of BIFF (Binary Interchange File Format) used to create the file. + #: Latest is 8.0 (represented here as 80), introduced with Excel 97. + #: Earliest supported by this module: 2.0 (represented as 20). + biff_version = 0 + + #: List containing a :class:`Name` object for each ``NAME`` record in the + #: workbook. + #: + #: .. 
versionadded:: 0.6.0 + name_obj_list = [] + + #: An integer denoting the character set used for strings in this file. + #: For BIFF 8 and later, this will be 1200, meaning Unicode; + #: more precisely, UTF_16_LE. + #: For earlier versions, this is used to derive the appropriate Python + #: encoding to be used to convert to Unicode. + #: Examples: ``1252 -> 'cp1252'``, ``10000 -> 'mac_roman'`` + codepage = None + + #: The encoding that was derived from the codepage. + encoding = None + + #: A tuple containing the telephone country code for: + #: + #: ``[0]``: + #: the user-interface setting when the file was created. + #: + #: ``[1]``: + #: the regional settings. + #: + #: Example: ``(1, 61)`` meaning ``(USA, Australia)``. + #: + #: This information may give a clue to the correct encoding for an + #: unknown codepage. For a long list of observed values, refer to the + #: OpenOffice.org documentation for the ``COUNTRY`` record. + countries = (0, 0) + + #: What (if anything) is recorded as the name of the last user to + #: save the file. + user_name = UNICODE_LITERAL('') + + #: A list of :class:`~xlrd.formatting.Font` class instances, + #: each corresponding to a FONT record. + #: + #: .. versionadded:: 0.6.1 + font_list = [] + + #: A list of :class:`~xlrd.formatting.XF` class instances, + #: each corresponding to an ``XF`` record. + #: + #: .. versionadded:: 0.6.1 + xf_list = [] + + #: A list of :class:`~xlrd.formatting.Format` objects, each corresponding to + #: a ``FORMAT`` record, in the order that they appear in the input file. + #: It does *not* contain builtin formats. + #: + #: If you are creating an output file using (for example) :mod:`xlwt`, + #: use this list. + #: + #: The collection to be used for all visual rendering purposes is + #: :attr:`format_map`. + #: + #: .. versionadded:: 0.6.1 + format_list = [] + + ## + #: The mapping from :attr:`~xlrd.formatting.XF.format_key` to + #: :class:`~xlrd.formatting.Format` object. + #: + #: .. versionadded:: 0.6.1 + format_map = {} + + #: This provides access via name to the extended format information for + #: both built-in styles and user-defined styles. + #: + #: It maps ``name`` to ``(built_in, xf_index)``, where + #: ``name`` is either the name of a user-defined style, + #: or the name of one of the built-in styles. Known built-in names are + #: Normal, RowLevel_1 to RowLevel_7, + #: ColLevel_1 to ColLevel_7, Comma, Currency, Percent, "Comma [0]", + #: "Currency [0]", Hyperlink, and "Followed Hyperlink". + #: + #: ``built_in`` has the following meanings + #: + #: 1: + #: built-in style + #: + #: 0: + #: user-defined + #: + #: ``xf_index`` is an index into :attr:`Book.xf_list`. + #: + #: References: OOo docs s6.99 (``STYLE`` record); Excel UI Format/Style + #: + #: .. versionadded:: 0.6.1 + #: + #: Extracted only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. versionadded:: 0.7.4 + style_name_map = {} + + #: This provides definitions for colour indexes. Please refer to + #: :ref:`palette` for an explanation + #: of how colours are represented in Excel. + #: + #: Colour indexes into the palette map into ``(red, green, blue)`` tuples. + #: "Magic" indexes e.g. ``0x7FFF`` map to ``None``. + #: + #: :attr:`colour_map` is what you need if you want to render cells on screen + #: or in a PDF file. If you are writing an output XLS file, use + #: :attr:`palette_record`. + #: + #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. 
versionadded:: 0.6.1 + colour_map = {} + + #: If the user has changed any of the colours in the standard palette, the + #: XLS file will contain a ``PALETTE`` record with 56 (16 for Excel 4.0 and + #: earlier) RGB values in it, and this list will be e.g. + #: ``[(r0, b0, g0), ..., (r55, b55, g55)]``. + #: Otherwise this list will be empty. This is what you need if you are + #: writing an output XLS file. If you want to render cells on screen or in a + #: PDF file, use :attr:`colour_map`. + #: + #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. versionadded:: 0.6.1 + palette_record = [] + + #: Time in seconds to extract the XLS image as a contiguous string + #: (or mmap equivalent). + load_time_stage_1 = -1.0 + + #: Time in seconds to parse the data from the contiguous string + #: (or mmap equivalent). + load_time_stage_2 = -1.0 + + def sheets(self): + """ + :returns: A list of all sheets in the book. + + All sheets not already loaded will be loaded. + """ + for sheetx in xrange(self.nsheets): + if not self._sheet_list[sheetx]: + self.get_sheet(sheetx) + return self._sheet_list[:] + + def sheet_by_index(self, sheetx): + """ + :param sheetx: Sheet index in ``range(nsheets)`` + :returns: A :class:`~xlrd.sheet.Sheet`. + """ + return self._sheet_list[sheetx] or self.get_sheet(sheetx) + + def sheet_by_name(self, sheet_name): + """ + :param sheet_name: Name of the sheet required. + :returns: A :class:`~xlrd.sheet.Sheet`. + """ + try: + sheetx = self._sheet_names.index(sheet_name) + except ValueError: + raise XLRDError('No sheet named <%r>' % sheet_name) + return self.sheet_by_index(sheetx) + + def sheet_names(self): + """ + :returns: + A list of the names of all the worksheets in the workbook file. + This information is available even when no sheets have yet been + loaded. + """ + return self._sheet_names[:] + + def sheet_loaded(self, sheet_name_or_index): + """ + :param sheet_name_or_index: Name or index of sheet enquired upon + :returns: ``True`` if sheet is loaded, ``False`` otherwise. + + .. versionadded:: 0.7.1 + """ + if isinstance(sheet_name_or_index, int): + sheetx = sheet_name_or_index + else: + try: + sheetx = self._sheet_names.index(sheet_name_or_index) + except ValueError: + raise XLRDError('No sheet named <%r>' % sheet_name_or_index) + return bool(self._sheet_list[sheetx]) + + def unload_sheet(self, sheet_name_or_index): + """ + :param sheet_name_or_index: Name or index of sheet to be unloaded. + + .. versionadded:: 0.7.1 + """ + if isinstance(sheet_name_or_index, int): + sheetx = sheet_name_or_index + else: + try: + sheetx = self._sheet_names.index(sheet_name_or_index) + except ValueError: + raise XLRDError('No sheet named <%r>' % sheet_name_or_index) + self._sheet_list[sheetx] = None + + def release_resources(self): + """ + This method has a dual purpose. You can call it to release + memory-consuming objects and (possibly) a memory-mapped file + (:class:`mmap.mmap` object) when you have finished loading sheets in + ``on_demand`` mode, but still require the :class:`Book` object to + examine the loaded sheets. It is also called automatically (a) when + :func:`~xlrd.open_workbook` + raises an exception and (b) if you are using a ``with`` statement, when + the ``with`` block is exited. Calling this method multiple times on the + same object has no ill effect. 
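# === Editorial usage sketch (added in review; not part of the vendored xlrd diff) ===
# The sheet-access methods and the datemode attribute documented above, combined with
# the cell-type constants from biffh. The file name 'china_daily.xls' is a
# hypothetical placeholder.
import xlrd
from xlrd import XL_CELL_DATE, XL_CELL_ERROR, error_text_from_code, xldate_as_tuple

book = xlrd.open_workbook('china_daily.xls')
sheet = book.sheet_by_name(book.sheet_names()[0])   # same sheet as sheet_by_index(0)
for rowx in range(sheet.nrows):
    cell = sheet.cell(rowx, 0)
    if cell.ctype == XL_CELL_DATE:
        # serial date numbers need Book.datemode (1900 vs 1904 system) to decode
        print(xldate_as_tuple(cell.value, book.datemode))
    elif cell.ctype == XL_CELL_ERROR:
        print(error_text_from_code.get(cell.value, '#UNKNOWN!'))
# =====================================================================================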
+ """ + self._resources_released = 1 + if hasattr(self.mem, "close"): + # must be a mmap.mmap object + self.mem.close() + self.mem = None + if hasattr(self.filestr, "close"): + self.filestr.close() + self.filestr = None + self._sharedstrings = None + self._rich_text_runlist_map = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + self.release_resources() + # return false + + #: A mapping from ``(lower_case_name, scope)`` to a single :class:`Name` + #: object. + #: + #: .. versionadded:: 0.6.0 + name_and_scope_map = {} + + #: A mapping from `lower_case_name` to a list of :class:`Name` objects. + #: The list is sorted in scope order. Typically there will be one item + #: (of global scope) in the list. + #: + #: .. versionadded:: 0.6.0 + name_map = {} + + def __init__(self): + self._sheet_list = [] + self._sheet_names = [] + self._sheet_visibility = [] # from BOUNDSHEET record + self.nsheets = 0 + self._sh_abs_posn = [] # sheet's absolute position in the stream + self._sharedstrings = [] + self._rich_text_runlist_map = {} + self.raw_user_name = False + self._sheethdr_count = 0 # BIFF 4W only + self.builtinfmtcount = -1 # unknown as yet. BIFF 3, 4S, 4W + self.initialise_format_info() + self._all_sheets_count = 0 # includes macro & VBA sheets + self._supbook_count = 0 + self._supbook_locals_inx = None + self._supbook_addins_inx = None + self._all_sheets_map = [] # maps an all_sheets index to a calc-sheets index (or -1) + self._externsheet_info = [] + self._externsheet_type_b57 = [] + self._extnsht_name_from_num = {} + self._sheet_num_from_name = {} + self._extnsht_count = 0 + self._supbook_types = [] + self._resources_released = 0 + self.addin_func_names = [] + self.name_obj_list = [] + self.colour_map = {} + self.palette_record = [] + self.xf_list = [] + self.style_name_map = {} + self.mem = b'' + self.filestr = b'' + + def biff2_8_load(self, filename=None, file_contents=None, + logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, + encoding_override=None, + formatting_info=False, + on_demand=False, + ragged_rows=False): + # DEBUG = 0 + self.logfile = logfile + self.verbosity = verbosity + self.use_mmap = use_mmap and MMAP_AVAILABLE + self.encoding_override = encoding_override + self.formatting_info = formatting_info + self.on_demand = on_demand + self.ragged_rows = ragged_rows + + if not file_contents: + with open(filename, "rb") as f: + f.seek(0, 2) # EOF + size = f.tell() + f.seek(0, 0) # BOF + if size == 0: + raise XLRDError("File size is 0 bytes") + if self.use_mmap: + self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) + self.stream_len = size + else: + self.filestr = f.read() + self.stream_len = len(self.filestr) + else: + self.filestr = file_contents + self.stream_len = len(file_contents) + + self.base = 0 + if self.filestr[:8] != compdoc.SIGNATURE: + # got this one at the antique store + self.mem = self.filestr + else: + cd = compdoc.CompDoc(self.filestr, logfile=self.logfile) + if USE_FANCY_CD: + for qname in ['Workbook', 'Book']: + self.mem, self.base, self.stream_len = \ + cd.locate_named_stream(UNICODE_LITERAL(qname)) + if self.mem: break + else: + raise XLRDError("Can't find workbook in OLE2 compound document") + else: + for qname in ['Workbook', 'Book']: + self.mem = cd.get_named_stream(UNICODE_LITERAL(qname)) + if self.mem: break + else: + raise XLRDError("Can't find workbook in OLE2 compound document") + self.stream_len = len(self.mem) + del cd + if self.mem is not self.filestr: + if hasattr(self.filestr, "close"): 
+ self.filestr.close() + self.filestr = b'' + self._position = self.base + if DEBUG: + print("mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile) + + def initialise_format_info(self): + # needs to be done once per sheet for BIFF 4W :-( + self.format_map = {} + self.format_list = [] + self.xfcount = 0 + self.actualfmtcount = 0 # number of FORMAT records seen so far + self._xf_index_to_xl_type_map = {0: XL_CELL_NUMBER} + self._xf_epilogue_done = 0 + self.xf_list = [] + self.font_list = [] + + def get2bytes(self): + pos = self._position + buff_two = self.mem[pos:pos+2] + lenbuff = len(buff_two) + self._position += lenbuff + if lenbuff < 2: + return MY_EOF + lo, hi = buff_two + return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo) + + def get_record_parts(self): + pos = self._position + mem = self.mem + code, length = unpack('= 2: + fprintf(self.logfile, + "BOUNDSHEET: inx=%d vis=%r sheet_name=%r abs_posn=%d sheet_type=0x%02x\n", + self._all_sheets_count, visibility, sheet_name, abs_posn, sheet_type) + self._all_sheets_count += 1 + if sheet_type != XL_BOUNDSHEET_WORKSHEET: + self._all_sheets_map.append(-1) + descr = { + 1: 'Macro sheet', + 2: 'Chart', + 6: 'Visual Basic module', + }.get(sheet_type, 'UNKNOWN') + + if DEBUG or self.verbosity >= 1: + fprintf(self.logfile, + "NOTE *** Ignoring non-worksheet data named %r (type 0x%02x = %s)\n", + sheet_name, sheet_type, descr) + else: + snum = len(self._sheet_names) + self._all_sheets_map.append(snum) + self._sheet_names.append(sheet_name) + self._sh_abs_posn.append(abs_posn) + self._sheet_visibility.append(visibility) + self._sheet_num_from_name[sheet_name] = snum + + def handle_builtinfmtcount(self, data): + ### N.B. This count appears to be utterly useless. + # DEBUG = 1 + builtinfmtcount = unpack('= 2: + fprintf(self.logfile, "*** No CODEPAGE record; assuming 1200 (utf_16_le)\n") + else: + codepage = self.codepage + if codepage in encoding_from_codepage: + encoding = encoding_from_codepage[codepage] + elif 300 <= codepage <= 1999: + encoding = 'cp' + str(codepage) + else: + encoding = 'unknown_codepage_' + str(codepage) + if DEBUG or (self.verbosity and encoding != self.encoding) : + fprintf(self.logfile, "CODEPAGE: codepage %r -> encoding %r\n", codepage, encoding) + self.encoding = encoding + if self.codepage != 1200: # utf_16_le + # If we don't have a codec that can decode ASCII into Unicode, + # we're well & truly stuffed -- let the punter know ASAP. 
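# === Editorial usage sketch (added in review; not part of the vendored xlrd diff) ===
# When the derive_encoding() logic shown here meets a missing or wrong CODEPAGE
# record, the usual caller-side workaround is the encoding_override argument of
# open_workbook(), which is intended to take precedence over the codepage found in
# the file. The file name and the 'gb2312' codec are hypothetical placeholders.
import xlrd

book = xlrd.open_workbook('old_export.xls', encoding_override='gb2312')
print(book.encoding)   # the encoding the Book ends up using for byte strings
# =====================================================================================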
+ try: + unicode(b'trial', self.encoding) + except BaseException as e: + fprintf(self.logfile, + "ERROR *** codepage %r -> encoding %r -> %s: %s\n", + self.codepage, self.encoding, type(e).__name__.split(".")[-1], e) + raise + if self.raw_user_name: + strg = unpack_string(self.user_name, 0, self.encoding, lenlen=1) + strg = strg.rstrip() + # if DEBUG: + # print "CODEPAGE: user name decoded from %r to %r" % (self.user_name, strg) + self.user_name = strg + self.raw_user_name = False + return self.encoding + + def handle_codepage(self, data): + # DEBUG = 0 + codepage = unpack('= 2 + if self.biff_version >= 80: + option_flags, other_info =unpack("= 1 + blah2 = DEBUG or self.verbosity >= 2 + if self.biff_version >= 80: + num_refs = unpack("= 2: + logf = self.logfile + fprintf(logf, "FILEPASS:\n") + hex_char_dump(data, 0, len(data), base=0, fout=logf) + if self.biff_version >= 80: + kind1, = unpack('= 2 + bv = self.biff_version + if bv < 50: + return + self.derive_encoding() + # print + # hex_char_dump(data, 0, len(data), fout=self.logfile) + ( + option_flags, kb_shortcut, name_len, fmla_len, extsht_index, sheet_index, + menu_text_len, description_text_len, help_topic_text_len, status_bar_text_len, + ) = unpack("> nshift) + + macro_flag = " M"[nobj.macro] + if bv < 80: + internal_name, pos = unpack_string_update_pos(data, 14, self.encoding, known_len=name_len) + else: + internal_name, pos = unpack_unicode_update_pos(data, 14, known_len=name_len) + nobj.extn_sheet_num = extsht_index + nobj.excel_sheet_index = sheet_index + nobj.scope = None # patched up in the names_epilogue() method + if blah: + fprintf( + self.logfile, + "NAME[%d]:%s oflags=%d, name_len=%d, fmla_len=%d, extsht_index=%d, sheet_index=%d, name=%r\n", + name_index, macro_flag, option_flags, name_len, + fmla_len, extsht_index, sheet_index, internal_name) + name = internal_name + if nobj.builtin: + name = builtin_name_from_code.get(name, "??Unknown??") + if blah: print(" builtin: %s" % name, file=self.logfile) + nobj.name = name + nobj.raw_formula = data[pos:] + nobj.basic_formula_len = fmla_len + nobj.evaluated = 0 + if blah: + nobj.dump( + self.logfile, + header="--- handle_name: name[%d] ---" % name_index, + footer="-------------------", + ) + + def names_epilogue(self): + blah = self.verbosity >= 2 + f = self.logfile + if blah: + print("+++++ names_epilogue +++++", file=f) + print("_all_sheets_map", REPR(self._all_sheets_map), file=f) + print("_extnsht_name_from_num", REPR(self._extnsht_name_from_num), file=f) + print("_sheet_num_from_name", REPR(self._sheet_num_from_name), file=f) + num_names = len(self.name_obj_list) + for namex in range(num_names): + nobj = self.name_obj_list[namex] + # Convert from excel_sheet_index to scope. + # This is done here because in BIFF7 and earlier, the + # BOUNDSHEET records (from which _all_sheets_map is derived) + # come after the NAME records. + if self.biff_version >= 80: + sheet_index = nobj.excel_sheet_index + if sheet_index == 0: + intl_sheet_index = -1 # global + elif 1 <= sheet_index <= len(self._all_sheets_map): + intl_sheet_index = self._all_sheets_map[sheet_index-1] + if intl_sheet_index == -1: # maps to a macro or VBA sheet + intl_sheet_index = -2 # valid sheet reference but not useful + else: + # huh? 
+ intl_sheet_index = -3 # invalid + elif 50 <= self.biff_version <= 70: + sheet_index = nobj.extn_sheet_num + if sheet_index == 0: + intl_sheet_index = -1 # global + else: + sheet_name = self._extnsht_name_from_num[sheet_index] + intl_sheet_index = self._sheet_num_from_name.get(sheet_name, -2) + nobj.scope = intl_sheet_index + + for namex in range(num_names): + nobj = self.name_obj_list[namex] + # Parse the formula ... + if nobj.macro or nobj.binary: continue + if nobj.evaluated: continue + evaluate_name_formula(self, nobj, namex, blah=blah) + + if self.verbosity >= 2: + print("---------- name object dump ----------", file=f) + for namex in range(num_names): + nobj = self.name_obj_list[namex] + nobj.dump(f, header="--- name[%d] ---" % namex) + print("--------------------------------------", file=f) + # + # Build some dicts for access to the name objects + # + name_and_scope_map = {} # (name.lower(), scope): Name_object + name_map = {} # name.lower() : list of Name_objects (sorted in scope order) + for namex in range(num_names): + nobj = self.name_obj_list[namex] + name_lcase = nobj.name.lower() + key = (name_lcase, nobj.scope) + if key in name_and_scope_map and self.verbosity: + fprintf(f, 'Duplicate entry %r in name_and_scope_map\n', key) + name_and_scope_map[key] = nobj + sort_data = (nobj.scope, namex, nobj) + # namex (a temp unique ID) ensures the Name objects will not + # be compared (fatal in py3) + if name_lcase in name_map: + name_map[name_lcase].append(sort_data) + else: + name_map[name_lcase] = [sort_data] + for key in name_map.keys(): + alist = name_map[key] + alist.sort() + name_map[key] = [x[2] for x in alist] + self.name_and_scope_map = name_and_scope_map + self.name_map = name_map + + def handle_obj(self, data): + # Not doing much handling at all. + # Worrying about embedded (BOF ... EOF) substreams is done elsewhere. 
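# === Editorial usage sketch (added in review; not part of the vendored xlrd diff) ===
# Reading a defined name through Book.name_map (built in names_epilogue() above) and
# the Name.cell() convenience method documented earlier in book.py. The workbook and
# the defined name 'total_confirmed' are hypothetical placeholders.
import xlrd

book = xlrd.open_workbook('summary.xls')
for nobj in book.name_map.get('total_confirmed', []):   # keys are lower-cased names
    cell = nobj.cell()   # raises XLRDError unless the name is an absolute single-cell reference
    print(nobj.scope, cell.value)
# =====================================================================================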
+ # DEBUG = 1 + obj_type, obj_id = unpack(' handle_obj type=%d id=0x%08x" % (obj_type, obj_id) + + def handle_supbook(self, data): + # aka EXTERNALBOOK in OOo docs + self._supbook_types.append(None) + blah = DEBUG or self.verbosity >= 2 + if blah: + print("SUPBOOK:", file=self.logfile) + hex_char_dump(data, 0, len(data), fout=self.logfile) + num_sheets = unpack("= 2: + fprintf(self.logfile, "SST: unique strings: %d\n", uniquestrings) + while 1: + code, nb, data = self.get_record_parts_conditional(XL_CONTINUE) + if code is None: + break + nbt += nb + if DEBUG >= 2: + fprintf(self.logfile, "CONTINUE: adding %d bytes to SST -> %d\n", nb, nbt) + strlist.append(data) + self._sharedstrings, rt_runlist = unpack_SST_table(strlist, uniquestrings) + if self.formatting_info: + self._rich_text_runlist_map = rt_runlist + if DEBUG: + t1 = perf_counter() + print("SST processing took %.2f seconds" % (t1 - t0, ), file=self.logfile) + + def handle_writeaccess(self, data): + DEBUG = 0 + if self.biff_version < 80: + if not self.encoding: + self.raw_user_name = True + self.user_name = data + return + strg = unpack_string(data, 0, self.encoding, lenlen=1) + else: + strg = unpack_unicode(data, 0, lenlen=2) + if DEBUG: fprintf(self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg) + strg = strg.rstrip() + self.user_name = strg + + def parse_globals(self): + # DEBUG = 0 + # no need to position, just start reading (after the BOF) + formatting.initialise_book(self) + while 1: + rc, length, data = self.get_record_parts() + if DEBUG: print("parse_globals: record code is 0x%04x" % rc, file=self.logfile) + if rc == XL_SST: + self.handle_sst(data) + elif rc == XL_FONT or rc == XL_FONT_B3B4: + self.handle_font(data) + elif rc == XL_FORMAT: # XL_FORMAT2 is BIFF <= 3.0, can't appear in globals + self.handle_format(data) + elif rc == XL_XF: + self.handle_xf(data) + elif rc == XL_BOUNDSHEET: + self.handle_boundsheet(data) + elif rc == XL_DATEMODE: + self.handle_datemode(data) + elif rc == XL_CODEPAGE: + self.handle_codepage(data) + elif rc == XL_COUNTRY: + self.handle_country(data) + elif rc == XL_EXTERNNAME: + self.handle_externname(data) + elif rc == XL_EXTERNSHEET: + self.handle_externsheet(data) + elif rc == XL_FILEPASS: + self.handle_filepass(data) + elif rc == XL_WRITEACCESS: + self.handle_writeaccess(data) + elif rc == XL_SHEETSOFFSET: + self.handle_sheetsoffset(data) + elif rc == XL_SHEETHDR: + self.handle_sheethdr(data) + elif rc == XL_SUPBOOK: + self.handle_supbook(data) + elif rc == XL_NAME: + self.handle_name(data) + elif rc == XL_PALETTE: + self.handle_palette(data) + elif rc == XL_STYLE: + self.handle_style(data) + elif rc & 0xff == 9 and self.verbosity: + fprintf(self.logfile, "*** Unexpected BOF at posn %d: 0x%04x len=%d data=%r\n", + self._position - length - 4, rc, length, data) + elif rc == XL_EOF: + self.xf_epilogue() + self.names_epilogue() + self.palette_epilogue() + if not self.encoding: + self.derive_encoding() + if self.biff_version == 45: + # DEBUG = 0 + if DEBUG: print("global EOF: position", self._position, file=self.logfile) + # if DEBUG: + # pos = self._position - 4 + # print repr(self.mem[pos:pos+40]) + return + else: + # if DEBUG: + # print >> self.logfile, "parse_globals: ignoring record code 0x%04x" % rc + pass + + def read(self, pos, length): + data = self.mem[pos:pos+length] + self._position = pos + len(data) + return data + + def getbof(self, rqd_stream): + # DEBUG = 1 + # if DEBUG: print >> self.logfile, "getbof(): position", self._position + if DEBUG: 
print("reqd: 0x%04x" % rqd_stream, file=self.logfile) + + def bof_error(msg): + raise XLRDError('Unsupported format, or corrupt file: ' + msg) + savpos = self._position + opcode = self.get2bytes() + if opcode == MY_EOF: + bof_error('Expected BOF record; met end of file') + if opcode not in bofcodes: + bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8]) + length = self.get2bytes() + if length == MY_EOF: + bof_error('Incomplete BOF record[1]; met end of file') + if not (4 <= length <= 20): + bof_error( + 'Invalid length (%d) for BOF record type 0x%04x' + % (length, opcode)) + padding = b'\0' * max(0, boflen[opcode] - length) + data = self.read(self._position, length) + if DEBUG: fprintf(self.logfile, "\ngetbof(): data=%r\n", data) + if len(data) < length: + bof_error('Incomplete BOF record[2]; met end of file') + data += padding + version1 = opcode >> 8 + version2, streamtype = unpack('= 2: + print("BOF: op=0x%04x vers=0x%04x stream=0x%04x buildid=%d buildyr=%d -> BIFF%d" + % (opcode, version2, streamtype, build, year, version), file=self.logfile) + got_globals = streamtype == XL_WORKBOOK_GLOBALS or ( + version == 45 and streamtype == XL_WORKBOOK_GLOBALS_4W) + if (rqd_stream == XL_WORKBOOK_GLOBALS and got_globals) or streamtype == rqd_stream: + return version + if version < 50 and streamtype == XL_WORKSHEET: + return version + if version >= 50 and streamtype == 0x0100: + bof_error("Workspace file -- no spreadsheet data") + bof_error( + 'BOF not workbook/worksheet: op=0x%04x vers=0x%04x strm=0x%04x build=%d year=%d -> BIFF%d' + % (opcode, version2, streamtype, build, year, version) + ) + +# === helper functions + +def expand_cell_address(inrow, incol): + # Ref : OOo docs, "4.3.4 Cell Addresses in BIFF8" + outrow = inrow + if incol & 0x8000: + if outrow >= 32768: + outrow -= 65536 + relrow = 1 + else: + relrow = 0 + outcol = incol & 0xFF + if incol & 0x4000: + if outcol >= 128: + outcol -= 256 + relcol = 1 + else: + relcol = 0 + return outrow, outcol, relrow, relcol + +def colname(colx, _A2Z="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + assert colx >= 0 + name = UNICODE_LITERAL('') + while 1: + quot, rem = divmod(colx, 26) + name = _A2Z[rem] + name + if not quot: + return name + colx = quot - 1 + +def display_cell_address(rowx, colx, relrow, relcol): + if relrow: + rowpart = "(*%s%d)" % ("+-"[rowx < 0], abs(rowx)) + else: + rowpart = "$%d" % (rowx+1,) + if relcol: + colpart = "(*%s%d)" % ("+-"[colx < 0], abs(colx)) + else: + colpart = "$" + colname(colx) + return colpart + rowpart + +def unpack_SST_table(datatab, nstrings): + "Return list of strings" + datainx = 0 + ndatas = len(datatab) + data = datatab[0] + datalen = len(data) + pos = 8 + strings = [] + strappend = strings.append + richtext_runs = {} + local_unpack = unpack + local_min = min + local_BYTES_ORD = BYTES_ORD + latin_1 = "latin_1" + for _unused_i in xrange(nstrings): + nchars = local_unpack('> 1, charsneed) + rawstrg = data[pos:pos+2*charsavail] + # if DEBUG: print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) + try: + accstrg += unicode(rawstrg, "utf_16_le") + except: + # print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) + # Probable cause: dodgy data e.g. unfinished surrogate pair. + # E.g. file unicode2.xls in pyExcelerator's examples has cells containing + # unichr(i) for i in range(0x100000) + # so this will include 0xD800 etc + raise + pos += 2*charsavail + else: + # Note: this is COMPRESSED (not ASCII!) encoding!!! 
+ charsavail = local_min(datalen - pos, charsneed) + rawstrg = data[pos:pos+charsavail] + # if DEBUG: print "SST CMPRSD: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) + accstrg += unicode(rawstrg, latin_1) + pos += charsavail + charsgot += charsavail + if charsgot == nchars: + break + datainx += 1 + data = datatab[datainx] + datalen = len(data) + options = local_BYTES_ORD(data[0]) + pos = 1 + + if rtcount: + runs = [] + for runindex in xrange(rtcount): + if pos == datalen: + pos = 0 + datainx += 1 + data = datatab[datainx] + datalen = len(data) + runs.append(local_unpack("= datalen: + # adjust to correct position in next record + pos = pos - datalen + datainx += 1 + if datainx < ndatas: + data = datatab[datainx] + datalen = len(data) + else: + assert _unused_i == nstrings - 1 + strappend(accstrg) + return strings, richtext_runs diff --git a/venv/Lib/site-packages/xlrd/compdoc.py b/venv/Lib/site-packages/xlrd/compdoc.py new file mode 100644 index 0000000..b4632dc --- /dev/null +++ b/venv/Lib/site-packages/xlrd/compdoc.py @@ -0,0 +1,483 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. +# No part of the content of this file was derived from the works of +# David Giffin. +""" +Implements the minimal functionality required +to extract a "Workbook" or "Book" stream (as one big string) +from an OLE2 Compound Document file. +""" +from __future__ import print_function + +import array +import sys +from struct import unpack + +from .timemachine import * + +#: Magic cookie that should appear in the first 8 bytes of the file. +SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" + +EOCSID = -2 +FREESID = -1 +SATSID = -3 +MSATSID = -4 +EVILSID = -5 + +class CompDocError(Exception): + pass + +class DirNode(object): + + def __init__(self, DID, dent, DEBUG=0, logfile=sys.stdout): + # dent is the 128-byte directory entry + self.DID = DID + self.logfile = logfile + (cbufsize, self.etype, self.colour, self.left_DID, self.right_DID, + self.root_DID) = \ + unpack(' 20: # allows for 2**20 bytes i.e. 1MB + print("WARNING: sector size (2**%d) is preposterous; assuming 512 and continuing ..." + % ssz, file=logfile) + ssz = 9 + if sssz > ssz: + print("WARNING: short stream sector size (2**%d) is preposterous; assuming 64 and continuing ..." 
+ % sssz, file=logfile) + sssz = 6 + self.sec_size = sec_size = 1 << ssz + self.short_sec_size = 1 << sssz + if self.sec_size != 512 or self.short_sec_size != 64: + print("@@@@ sec_size=%d short_sec_size=%d" % (self.sec_size, self.short_sec_size), file=logfile) + ( + SAT_tot_secs, self.dir_first_sec_sid, _unused, self.min_size_std_stream, + SSAT_first_sec_sid, SSAT_tot_secs, + MSATX_first_sec_sid, MSATX_tot_secs, + ) = unpack(' 1: + print('MSATX: sid=%d (0x%08X)' % (sid, sid), file=logfile) + if sid >= mem_data_secs: + msg = "MSAT extension: accessing sector %d but only %d in file" % (sid, mem_data_secs) + if DEBUG > 1: + print(msg, file=logfile) + break + raise CompDocError(msg) + elif sid < 0: + raise CompDocError("MSAT extension: invalid sector id: %d" % sid) + if seen[sid]: + raise CompDocError("MSAT corruption: seen[%d] == %d" % (sid, seen[sid])) + seen[sid] = 1 + actual_MSATX_sectors += 1 + if DEBUG and actual_MSATX_sectors > expected_MSATX_sectors: + print("[1]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile) + offset = 512 + sec_size * sid + MSAT.extend(unpack(fmt, mem[offset:offset+sec_size])) + sid = MSAT.pop() # last sector id is sid of next sector in the chain + + if DEBUG and actual_MSATX_sectors != expected_MSATX_sectors: + print("[2]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile) + if DEBUG: + print("MSAT: len =", len(MSAT), file=logfile) + dump_list(MSAT, 10, logfile) + # + # === build the SAT === + # + self.SAT = [] + actual_SAT_sectors = 0 + dump_again = 0 + for msidx in xrange(len(MSAT)): + msid = MSAT[msidx] + if msid in (FREESID, EOCSID): + # Specification: the MSAT array may be padded with trailing FREESID entries. + # Toleration: a FREESID or EOCSID entry anywhere in the MSAT array will be ignored. 
+ continue + if msid >= mem_data_secs: + if not trunc_warned: + print("WARNING *** File is truncated, or OLE2 MSAT is corrupt!!", file=logfile) + print("INFO: Trying to access sector %d but only %d available" + % (msid, mem_data_secs), file=logfile) + trunc_warned = 1 + MSAT[msidx] = EVILSID + dump_again = 1 + continue + elif msid < -2: + raise CompDocError("MSAT: invalid sector id: %d" % msid) + if seen[msid]: + raise CompDocError("MSAT extension corruption: seen[%d] == %d" % (msid, seen[msid])) + seen[msid] = 2 + actual_SAT_sectors += 1 + if DEBUG and actual_SAT_sectors > SAT_sectors_reqd: + print("[3]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, actual_SAT_sectors, msid, file=logfile) + offset = 512 + sec_size * msid + self.SAT.extend(unpack(fmt, mem[offset:offset+sec_size])) + + if DEBUG: + print("SAT: len =", len(self.SAT), file=logfile) + dump_list(self.SAT, 10, logfile) + # print >> logfile, "SAT ", + # for i, s in enumerate(self.SAT): + # print >> logfile, "entry: %4d offset: %6d, next entry: %4d" % (i, 512 + sec_size * i, s) + # print >> logfile, "%d:%d " % (i, s), + print(file=logfile) + if DEBUG and dump_again: + print("MSAT: len =", len(MSAT), file=logfile) + dump_list(MSAT, 10, logfile) + for satx in xrange(mem_data_secs, len(self.SAT)): + self.SAT[satx] = EVILSID + print("SAT: len =", len(self.SAT), file=logfile) + dump_list(self.SAT, 10, logfile) + # + # === build the directory === + # + dbytes = self._get_stream( + self.mem, 512, self.SAT, self.sec_size, self.dir_first_sec_sid, + name="directory", seen_id=3) + dirlist = [] + did = -1 + for pos in xrange(0, len(dbytes), 128): + did += 1 + dirlist.append(DirNode(did, dbytes[pos:pos+128], 0, logfile)) + self.dirlist = dirlist + _build_family_tree(dirlist, 0, dirlist[0].root_DID) # and stand well back ... + if DEBUG: + for d in dirlist: + d.dump(DEBUG) + # + # === get the SSCS === + # + sscs_dir = self.dirlist[0] + assert sscs_dir.etype == 5 # root entry + if sscs_dir.first_SID < 0 or sscs_dir.tot_size == 0: + # Problem reported by Frank Hoffsuemmer: some software was + # writing -1 instead of -2 (EOCSID) for the first_SID + # when the SCCS was empty. Not having EOCSID caused assertion + # failure in _get_stream. + # Solution: avoid calling _get_stream in any case when the + # SCSS appears to be empty. 
+ self.SSCS = "" + else: + self.SSCS = self._get_stream( + self.mem, 512, self.SAT, sec_size, sscs_dir.first_SID, + sscs_dir.tot_size, name="SSCS", seen_id=4) + # if DEBUG: print >> logfile, "SSCS", repr(self.SSCS) + # + # === build the SSAT === + # + self.SSAT = [] + if SSAT_tot_secs > 0 and sscs_dir.tot_size == 0: + print("WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero", file=logfile) + if sscs_dir.tot_size > 0: + sid = SSAT_first_sec_sid + nsecs = SSAT_tot_secs + while sid >= 0 and nsecs > 0: + if seen[sid]: + raise CompDocError("SSAT corruption: seen[%d] == %d" % (sid, seen[sid])) + seen[sid] = 5 + nsecs -= 1 + start_pos = 512 + sid * sec_size + news = list(unpack(fmt, mem[start_pos:start_pos+sec_size])) + self.SSAT.extend(news) + sid = self.SAT[sid] + if DEBUG: print("SSAT last sid %d; remaining sectors %d" % (sid, nsecs), file=logfile) + assert nsecs == 0 and sid == EOCSID + if DEBUG: + print("SSAT", file=logfile) + dump_list(self.SSAT, 10, logfile) + if DEBUG: + print("seen", file=logfile) + dump_list(seen, 20, logfile) + + def _get_stream(self, mem, base, sat, sec_size, start_sid, size=None, name='', seen_id=None): + # print >> self.logfile, "_get_stream", base, sec_size, start_sid, size + sectors = [] + s = start_sid + if size is None: + # nothing to check against + while s >= 0: + if seen_id is not None: + if self.seen[s]: + raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s])) + self.seen[s] = seen_id + start_pos = base + s * sec_size + sectors.append(mem[start_pos:start_pos+sec_size]) + try: + s = sat[s] + except IndexError: + raise CompDocError( + "OLE2 stream %r: sector allocation table invalid entry (%d)" % + (name, s) + ) + assert s == EOCSID + else: + todo = size + while s >= 0: + if seen_id is not None: + if self.seen[s]: + raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s])) + self.seen[s] = seen_id + start_pos = base + s * sec_size + grab = sec_size + if grab > todo: + grab = todo + todo -= grab + sectors.append(mem[start_pos:start_pos+grab]) + try: + s = sat[s] + except IndexError: + raise CompDocError( + "OLE2 stream %r: sector allocation table invalid entry (%d)" % + (name, s) + ) + assert s == EOCSID + if todo != 0: + fprintf(self.logfile, + "WARNING *** OLE2 stream %r: expected size %d, actual size %d\n", + name, size, size - todo) + + return b''.join(sectors) + + def _dir_search(self, path, storage_DID=0): + # Return matching DirNode instance, or None + head = path[0] + tail = path[1:] + dl = self.dirlist + for child in dl[storage_DID].children: + if dl[child].name.lower() == head.lower(): + et = dl[child].etype + if et == 2: + return dl[child] + if et == 1: + if not tail: + raise CompDocError("Requested component is a 'storage'") + return self._dir_search(tail, child) + dl[child].dump(1) + raise CompDocError("Requested stream is not a 'user stream'") + return None + + + def get_named_stream(self, qname): + """ + Interrogate the compound document's directory; return the stream as a + string if found, otherwise return ``None``. + + :param qname: + Name of the desired stream e.g. ``'Workbook'``. + Should be in Unicode or convertible thereto. 
+ """ + d = self._dir_search(qname.split("/")) + if d is None: + return None + if d.tot_size >= self.min_size_std_stream: + return self._get_stream( + self.mem, 512, self.SAT, self.sec_size, d.first_SID, + d.tot_size, name=qname, seen_id=d.DID+6) + else: + return self._get_stream( + self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID, + d.tot_size, name=qname + " (from SSCS)", seen_id=None) + + def locate_named_stream(self, qname): + """ + Interrogate the compound document's directory. + + If the named stream is not found, ``(None, 0, 0)`` will be returned. + + If the named stream is found and is contiguous within the original + byte sequence (``mem``) used when the document was opened, + then ``(mem, offset_to_start_of_stream, length_of_stream)`` is returned. + + Otherwise a new string is built from the fragments and + ``(new_string, 0, length_of_stream)`` is returned. + + :param qname: + Name of the desired stream e.g. ``'Workbook'``. + Should be in Unicode or convertible thereto. + """ + d = self._dir_search(qname.split("/")) + if d is None: + return (None, 0, 0) + if d.tot_size > self.mem_data_len: + raise CompDocError("%r stream length (%d bytes) > file data size (%d bytes)" + % (qname, d.tot_size, self.mem_data_len)) + if d.tot_size >= self.min_size_std_stream: + result = self._locate_stream( + self.mem, 512, self.SAT, self.sec_size, d.first_SID, + d.tot_size, qname, d.DID+6) + if self.DEBUG: + print("\nseen", file=self.logfile) + dump_list(self.seen, 20, self.logfile) + return result + else: + return ( + self._get_stream( + self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID, + d.tot_size, qname + " (from SSCS)", None), + 0, + d.tot_size, + ) + + def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id): + # print >> self.logfile, "_locate_stream", base, sec_size, start_sid, expected_stream_size + s = start_sid + if s < 0: + raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid) + p = -99 # dummy previous SID + start_pos = -9999 + end_pos = -8888 + slices = [] + tot_found = 0 + found_limit = (expected_stream_size + sec_size - 1) // sec_size + while s >= 0: + if self.seen[s]: + print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile) + raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s])) + self.seen[s] = seen_id + tot_found += 1 + if tot_found > found_limit: + # Note: expected size rounded up to higher sector + raise CompDocError( + "%s: size exceeds expected %d bytes; corrupt?" + % (qname, found_limit * sec_size) + ) + if s == p+1: + # contiguous sectors + end_pos += sec_size + else: + # start new slice + if p >= 0: + # not first time + slices.append((start_pos, end_pos)) + start_pos = base + s * sec_size + end_pos = start_pos + sec_size + p = s + s = sat[s] + assert s == EOCSID + assert tot_found == found_limit + # print >> self.logfile, "_locate_stream(%s): seen" % qname; dump_list(self.seen, 20, self.logfile) + if not slices: + # The stream is contiguous ... just what we like! 
+ return (mem, start_pos, expected_stream_size) + slices.append((start_pos, end_pos)) + # print >> self.logfile, "+++>>> %d fragments" % len(slices) + return (b''.join(mem[start_pos:end_pos] for start_pos, end_pos in slices), 0, expected_stream_size) + +# ========================================================================================== +def x_dump_line(alist, stride, f, dpos, equal=0): + print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f) + for value in alist[dpos:dpos + stride]: + print(str(value), end=' ', file=f) + print(file=f) + +def dump_list(alist, stride, f=sys.stdout): + def _dump_line(dpos, equal=0): + print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f) + for value in alist[dpos:dpos + stride]: + print(str(value), end=' ', file=f) + print(file=f) + pos = None + oldpos = None + for pos in xrange(0, len(alist), stride): + if oldpos is None: + _dump_line(pos) + oldpos = pos + elif alist[pos:pos+stride] != alist[oldpos:oldpos+stride]: + if pos - oldpos > stride: + _dump_line(pos - stride, equal=1) + _dump_line(pos) + oldpos = pos + if oldpos is not None and pos is not None and pos != oldpos: + _dump_line(pos, equal=1) diff --git a/venv/Lib/site-packages/xlrd/formatting.py b/venv/Lib/site-packages/xlrd/formatting.py new file mode 100644 index 0000000..9e4db6a --- /dev/null +++ b/venv/Lib/site-packages/xlrd/formatting.py @@ -0,0 +1,1321 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. +# No part of the content of this file was derived from the works of +# David Giffin. +""" +Module for formatting information. +""" + +from __future__ import print_function + +import re +from struct import unpack + +from .biffh import ( + FDT, FGE, FNU, FTX, FUN, XL_CELL_DATE, XL_CELL_NUMBER, XL_CELL_TEXT, + XL_FORMAT, XL_FORMAT2, BaseObject, XLRDError, fprintf, unpack_string, + unpack_unicode, upkbits, upkbitsL, +) +from .timemachine import * + +DEBUG = 0 + +_cellty_from_fmtty = { + FNU: XL_CELL_NUMBER, + FUN: XL_CELL_NUMBER, + FGE: XL_CELL_NUMBER, + FDT: XL_CELL_DATE, + FTX: XL_CELL_NUMBER, # Yes, a number can be formatted as text. +} + +excel_default_palette_b5 = ( + ( 0, 0, 0), (255, 255, 255), (255, 0, 0), ( 0, 255, 0), + ( 0, 0, 255), (255, 255, 0), (255, 0, 255), ( 0, 255, 255), + (128, 0, 0), ( 0, 128, 0), ( 0, 0, 128), (128, 128, 0), + (128, 0, 128), ( 0, 128, 128), (192, 192, 192), (128, 128, 128), + (153, 153, 255), (153, 51, 102), (255, 255, 204), (204, 255, 255), + (102, 0, 102), (255, 128, 128), ( 0, 102, 204), (204, 204, 255), + ( 0, 0, 128), (255, 0, 255), (255, 255, 0), ( 0, 255, 255), + (128, 0, 128), (128, 0, 0), ( 0, 128, 128), ( 0, 0, 255), + ( 0, 204, 255), (204, 255, 255), (204, 255, 204), (255, 255, 153), + (153, 204, 255), (255, 153, 204), (204, 153, 255), (227, 227, 227), + ( 51, 102, 255), ( 51, 204, 204), (153, 204, 0), (255, 204, 0), + (255, 153, 0), (255, 102, 0), (102, 102, 153), (150, 150, 150), + ( 0, 51, 102), ( 51, 153, 102), ( 0, 51, 0), ( 51, 51, 0), + (153, 51, 0), (153, 51, 102), ( 51, 51, 153), ( 51, 51, 51), +) + +excel_default_palette_b2 = excel_default_palette_b5[:16] + +# Following table borrowed from Gnumeric 1.4 source. +# Checked against OOo docs and MS docs. 
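+# Illustrative aside, not part of the xlrd source: the palettes below seed
+# Book.colour_map, which maps a colour index to an (R, G, B) tuple or None.
+# A typical lookup, assuming a BIFF file opened with formatting_info=True
+# ('demo.xls' is a placeholder name):
+#
+#     import xlrd
+#     book = xlrd.open_workbook('demo.xls', formatting_info=True)
+#     xf = book.xf_list[book.sheet_by_index(0).cell_xf_index(0, 0)]
+#     font = book.font_list[xf.font_index]
+#     rgb = book.colour_map[font.colour_index]   # (red, green, blue), or None for system colours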
+excel_default_palette_b8 = ( # (red, green, blue) + ( 0, 0, 0), (255,255,255), (255, 0, 0), ( 0,255, 0), # 0 + ( 0, 0,255), (255,255, 0), (255, 0,255), ( 0,255,255), # 4 + (128, 0, 0), ( 0,128, 0), ( 0, 0,128), (128,128, 0), # 8 + (128, 0,128), ( 0,128,128), (192,192,192), (128,128,128), # 12 + (153,153,255), (153, 51,102), (255,255,204), (204,255,255), # 16 + (102, 0,102), (255,128,128), ( 0,102,204), (204,204,255), # 20 + ( 0, 0,128), (255, 0,255), (255,255, 0), ( 0,255,255), # 24 + (128, 0,128), (128, 0, 0), ( 0,128,128), ( 0, 0,255), # 28 + ( 0,204,255), (204,255,255), (204,255,204), (255,255,153), # 32 + (153,204,255), (255,153,204), (204,153,255), (255,204,153), # 36 + ( 51,102,255), ( 51,204,204), (153,204, 0), (255,204, 0), # 40 + (255,153, 0), (255,102, 0), (102,102,153), (150,150,150), # 44 + ( 0, 51,102), ( 51,153,102), ( 0, 51, 0), ( 51, 51, 0), # 48 + (153, 51, 0), (153, 51,102), ( 51, 51,153), ( 51, 51, 51), # 52 +) + +default_palette = { + 80: excel_default_palette_b8, + 70: excel_default_palette_b5, + 50: excel_default_palette_b5, + 45: excel_default_palette_b2, + 40: excel_default_palette_b2, + 30: excel_default_palette_b2, + 21: excel_default_palette_b2, + 20: excel_default_palette_b2, +} + +# 00H = Normal +# 01H = RowLevel_lv (see next field) +# 02H = ColLevel_lv (see next field) +# 03H = Comma +# 04H = Currency +# 05H = Percent +# 06H = Comma [0] (BIFF4-BIFF8) +# 07H = Currency [0] (BIFF4-BIFF8) +# 08H = Hyperlink (BIFF8) +# 09H = Followed Hyperlink (BIFF8) +built_in_style_names = [ + "Normal", + "RowLevel_", + "ColLevel_", + "Comma", + "Currency", + "Percent", + "Comma [0]", + "Currency [0]", + "Hyperlink", + "Followed Hyperlink", +] + +def initialise_colour_map(book): + book.colour_map = {} + book.colour_indexes_used = {} + if not book.formatting_info: + return + # Add the 8 invariant colours + for i in xrange(8): + book.colour_map[i] = excel_default_palette_b8[i] + # Add the default palette depending on the version + dpal = default_palette[book.biff_version] + ndpal = len(dpal) + for i in xrange(ndpal): + book.colour_map[i+8] = dpal[i] + # Add the specials -- None means the RGB value is not known + # System window text colour for border lines + book.colour_map[ndpal+8] = None + # System window background colour for pattern background + book.colour_map[ndpal+8+1] = None + # System ToolTip text colour (used in note objects) + book.colour_map[0x51] = None + # 32767, system window text colour for fonts + book.colour_map[0x7FFF] = None + + +def nearest_colour_index(colour_map, rgb, debug=0): + """ + General purpose function. Uses Euclidean distance. + So far used only for pre-BIFF8 ``WINDOW2`` record. + Doesn't have to be fast. + Doesn't have to be fancy. + """ + best_metric = 3 * 256 * 256 + best_colourx = 0 + for colourx, cand_rgb in colour_map.items(): + if cand_rgb is None: + continue + metric = 0 + for v1, v2 in zip(rgb, cand_rgb): + metric += (v1 - v2) * (v1 - v2) + if metric < best_metric: + best_metric = metric + best_colourx = colourx + if metric == 0: + break + if 0 and debug: + print("nearest_colour_index for %r is %r -> %r; best_metric is %d" + % (rgb, best_colourx, colour_map[best_colourx], best_metric)) + return best_colourx + +class EqNeAttrs(object): + """ + This mixin class exists solely so that :class:`Format`, :class:`Font`, and + :class:`XF` objects can be compared by value of their attributes. 
+ """ + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __ne__(self, other): + return self.__dict__ != other.__dict__ + +class Font(BaseObject, EqNeAttrs): + """ + An Excel "font" contains the details of not only what is normally + considered a font, but also several other display attributes. + Items correspond to those in the Excel UI's Format -> Cells -> Font tab. + + .. versionadded:: 0.6.1 + """ + + #: 1 = Characters are bold. Redundant; see "weight" attribute. + bold = 0 + + #: Values:: + #: + #: 0 = ANSI Latin + #: 1 = System default + #: 2 = Symbol, + #: 77 = Apple Roman, + #: 128 = ANSI Japanese Shift-JIS, + #: 129 = ANSI Korean (Hangul), + #: 130 = ANSI Korean (Johab), + #: 134 = ANSI Chinese Simplified GBK, + #: 136 = ANSI Chinese Traditional BIG5, + #: 161 = ANSI Greek, + #: 162 = ANSI Turkish, + #: 163 = ANSI Vietnamese, + #: 177 = ANSI Hebrew, + #: 178 = ANSI Arabic, + #: 186 = ANSI Baltic, + #: 204 = ANSI Cyrillic, + #: 222 = ANSI Thai, + #: 238 = ANSI Latin II (Central European), + #: 255 = OEM Latin I + character_set = 0 + + #: An explanation of "colour index" is given in :ref:`palette`. + colour_index = 0 + + #: 1 = Superscript, 2 = Subscript. + escapement = 0 + + #: Values:: + #: + #: 0 = None (unknown or don't care) + #: 1 = Roman (variable width, serifed) + #: 2 = Swiss (variable width, sans-serifed) + #: 3 = Modern (fixed width, serifed or sans-serifed) + #: 4 = Script (cursive) + #: 5 = Decorative (specialised, for example Old English, Fraktur) + family = 0 + + #: The 0-based index used to refer to this Font() instance. + #: Note that index 4 is never used; xlrd supplies a dummy place-holder. + font_index = 0 + + #: Height of the font (in twips). A twip = 1/20 of a point. + height = 0 + + #: 1 = Characters are italic. + italic = 0 + + #: The name of the font. Example: ``"Arial"``. + name = UNICODE_LITERAL("") + + #: 1 = Characters are struck out. + struck_out = 0 + + #: Values:: + #: + #: 0 = None + #: 1 = Single; 0x21 (33) = Single accounting + #: 2 = Double; 0x22 (34) = Double accounting + underline_type = 0 + + #: 1 = Characters are underlined. Redundant; see + #: :attr:`underline_type` attribute. + underlined = 0 + + #: Font weight (100-1000). Standard values are 400 for normal text + #: and 700 for bold text. 
+    weight = 400
+
+    #: 1 = Font is outline style (Macintosh only)
+    outline = 0
+
+    #: 1 = Font is shadow style (Macintosh only)
+    shadow = 0
+
+def handle_efont(book, data): # BIFF2 only
+    if not book.formatting_info:
+        return
+    book.font_list[-1].colour_index = unpack('<H', data)[0]
+
+def handle_font(book, data):
+    if not book.formatting_info:
+        return
+    if not book.encoding:
+        book.derive_encoding()
+    blah = DEBUG or book.verbosity >= 2
+    bv = book.biff_version
+    k = len(book.font_list)
+    if k == 4:
+        f = Font()
+        f.name = UNICODE_LITERAL('Dummy Font')
+        f.font_index = k
+        book.font_list.append(f)
+        k += 1
+    f = Font()
+    f.font_index = k
+    book.font_list.append(f)
+    if bv >= 50:
+        (
+            f.height, option_flags, f.colour_index, f.weight,
+            f.escapement, f.underline_type, f.family,
+            f.character_set,
+        ) = unpack('<HHHHHBBB', data[0:13])
+        f.bold = option_flags & 1
+        f.italic = (option_flags & 2) >> 1
+        f.underlined = (option_flags & 4) >> 2
+        f.struck_out = (option_flags & 8) >> 3
+        f.outline = (option_flags & 16) >> 4
+        f.shadow = (option_flags & 32) >> 5
+        if bv >= 80:
+            f.name = unpack_unicode(data, 14, lenlen=1)
+        else:
+            f.name = unpack_string(data, 14, book.encoding, lenlen=1)
+    elif bv >= 30:
+        f.height, option_flags, f.colour_index = unpack('<HHH', data[0:6])
+        f.bold = option_flags & 1
+        f.italic = (option_flags & 2) >> 1
+        f.underlined = (option_flags & 4) >> 2
+        f.struck_out = (option_flags & 8) >> 3
+        f.outline = (option_flags & 16) >> 4
+        f.shadow = (option_flags & 32) >> 5
+        f.name = unpack_string(data, 6, book.encoding, lenlen=1)
+        # Now cook up the remaining attributes ...
+        f.weight = [400, 700][f.bold]
+        f.escapement = 0 # None
+        f.underline_type = f.underlined # None or Single
+        f.family = 0 # Unknown / don't care
+        f.character_set = 1 # System default (0 means "ANSI Latin")
+    else: # BIFF2
+        f.height, option_flags = unpack('<HH', data[0:4])
+        f.colour_index = 0x7FFF # "system window text colour"
+        f.bold = option_flags & 1
+        f.italic = (option_flags & 2) >> 1
+        f.underlined = (option_flags & 4) >> 2
+        f.struck_out = (option_flags & 8) >> 3
+        f.outline = 0
+        f.shadow = 0
+        f.name = unpack_string(data, 4, book.encoding, lenlen=1)
+        # Now cook up the remaining attributes ...
+        f.weight = [400, 700][f.bold]
+        f.escapement = 0 # None
+        f.underline_type = f.underlined # None or Single
+        f.family = 0 # Unknown / don't care
+        f.character_set = 1 # System default (0 means "ANSI Latin")
+    if blah:
+        f.dump(
+            book.logfile,
+            header="--- handle_font: font[%d] ---" % f.font_index,
+            footer="-------------------",
+        )
+
+# === "Number formats" ===
+
+class Format(BaseObject, EqNeAttrs):
+    """
+    "Number format" information from a ``FORMAT`` record.
+
+    .. versionadded:: 0.6.1
+    """
+
+    #: The key into :attr:`~xlrd.book.Book.format_map`
+    format_key = 0
+
+    #: A classification that has been inferred from the format string.
+    #: Currently, this is used only to distinguish between numbers and dates.
+    #: Values::
+    #:
+    #:     FUN = 0 # unknown
+    #:     FDT = 1 # date
+    #:     FNU = 2 # number
+    #:     FGE = 3 # general
+    #:     FTX = 4 # text
+    type = FUN
+
+    #: The format string
+    format_str = UNICODE_LITERAL('')
+
+    def __init__(self, format_key, ty, format_str):
+        self.format_key = format_key
+        self.type = ty
+        self.format_str = format_str
+
+std_format_strings = {
+    # "std" == "standard for US English locale"
+    # #### TODO ... a lot of work to tailor these to the user's locale.
+    # See e.g.
gnumeric-1.x.y/src/formats.c + 0x00: "General", + 0x01: "0", + 0x02: "0.00", + 0x03: "#,##0", + 0x04: "#,##0.00", + 0x05: "$#,##0_);($#,##0)", + 0x06: "$#,##0_);[Red]($#,##0)", + 0x07: "$#,##0.00_);($#,##0.00)", + 0x08: "$#,##0.00_);[Red]($#,##0.00)", + 0x09: "0%", + 0x0a: "0.00%", + 0x0b: "0.00E+00", + 0x0c: "# ?/?", + 0x0d: "# ??/??", + 0x0e: "m/d/yy", + 0x0f: "d-mmm-yy", + 0x10: "d-mmm", + 0x11: "mmm-yy", + 0x12: "h:mm AM/PM", + 0x13: "h:mm:ss AM/PM", + 0x14: "h:mm", + 0x15: "h:mm:ss", + 0x16: "m/d/yy h:mm", + 0x25: "#,##0_);(#,##0)", + 0x26: "#,##0_);[Red](#,##0)", + 0x27: "#,##0.00_);(#,##0.00)", + 0x28: "#,##0.00_);[Red](#,##0.00)", + 0x29: "_(* #,##0_);_(* (#,##0);_(* \"-\"_);_(@_)", + 0x2a: "_($* #,##0_);_($* (#,##0);_($* \"-\"_);_(@_)", + 0x2b: "_(* #,##0.00_);_(* (#,##0.00);_(* \"-\"??_);_(@_)", + 0x2c: "_($* #,##0.00_);_($* (#,##0.00);_($* \"-\"??_);_(@_)", + 0x2d: "mm:ss", + 0x2e: "[h]:mm:ss", + 0x2f: "mm:ss.0", + 0x30: "##0.0E+0", + 0x31: "@", +} + +fmt_code_ranges = [ # both-inclusive ranges of "standard" format codes + # Source: the openoffice.org doc't + # and the OOXML spec Part 4, section 3.8.30 + ( 0, 0, FGE), + ( 1, 13, FNU), + (14, 22, FDT), + (27, 36, FDT), # CJK date formats + (37, 44, FNU), + (45, 47, FDT), + (48, 48, FNU), + (49, 49, FTX), + # Gnumeric assumes (or assumed) that built-in formats finish at 49, not at 163 + (50, 58, FDT), # CJK date formats + (59, 62, FNU), # Thai number (currency?) formats + (67, 70, FNU), # Thai number (currency?) formats + (71, 81, FDT), # Thai date formats +] + +std_format_code_types = {} +for lo, hi, ty in fmt_code_ranges: + for x in xrange(lo, hi+1): + std_format_code_types[x] = ty +del lo, hi, ty, x + +date_chars = UNICODE_LITERAL('ymdhs') # year, month/minute, day, hour, second +date_char_dict = {} +for _c in date_chars + date_chars.upper(): + date_char_dict[_c] = 5 +del _c, date_chars + +skip_char_dict = {} +for _c in UNICODE_LITERAL('$-+/(): '): + skip_char_dict[_c] = 1 + +num_char_dict = { + UNICODE_LITERAL('0'): 5, + UNICODE_LITERAL('#'): 5, + UNICODE_LITERAL('?'): 5, +} + +non_date_formats = { + UNICODE_LITERAL('0.00E+00'):1, + UNICODE_LITERAL('##0.0E+0'):1, + UNICODE_LITERAL('General') :1, + UNICODE_LITERAL('GENERAL') :1, # OOo Calc 1.1.4 does this. + UNICODE_LITERAL('general') :1, # pyExcelerator 0.6.3 does this. + UNICODE_LITERAL('@') :1, +} + +fmt_bracketed_sub = re.compile(r'\[[^]]*\]').sub + +# Boolean format strings (actual cases) +# '"Yes";"Yes";"No"' +# '"True";"True";"False"' +# '"On";"On";"Off"' + +def is_date_format_string(book, fmt): + # Heuristics: + # Ignore "text" and [stuff in square brackets (aarrgghh -- see below)]. + # Handle backslashed-escaped chars properly. + # E.g. hh\hmm\mss\s should produce a display like 23h59m59s + # Date formats have one or more of ymdhs (caseless) in them. + # Numeric formats have # and 0. + # N.B. 'General"."' hence get rid of "text" first. 
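+    # Illustrative aside, not part of the xlrd source: expected behaviour of the
+    # heuristic described above, assuming `book` is a Book instance (verbosity 0):
+    #
+    #     for f in ('d-mmm-yy', '#,##0.00', '[h]:mm:ss', '"Total"0.00'):
+    #         print(f, is_date_format_string(book, f))
+    #     # -> True, False, True, False  (bracketed/quoted text is ignored,
+    #     #    y/m/d/h/s characters vote "date", #/0/? vote "number")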
+    # TODO: Find where formats are interpreted in Gnumeric
+    # TODO: '[h]\\ \\h\\o\\u\\r\\s' ([h] means don't care about hours > 23)
+    state = 0
+    s = ''
+
+    for c in fmt:
+        if state == 0:
+            if c == UNICODE_LITERAL('"'):
+                state = 1
+            elif c in UNICODE_LITERAL(r"\_*"):
+                state = 2
+            elif c in skip_char_dict:
+                pass
+            else:
+                s += c
+        elif state == 1:
+            if c == UNICODE_LITERAL('"'):
+                state = 0
+        elif state == 2:
+            # Ignore char after backslash, underscore or asterisk
+            state = 0
+        assert 0 <= state <= 2
+    if book.verbosity >= 4:
+        print("is_date_format_string: reduced format is %s" % REPR(s), file=book.logfile)
+    s = fmt_bracketed_sub('', s)
+    if s in non_date_formats:
+        return False
+    state = 0
+    separator = ";"
+    got_sep = 0
+    date_count = num_count = 0
+    for c in s:
+        if c in date_char_dict:
+            date_count += date_char_dict[c]
+        elif c in num_char_dict:
+            num_count += num_char_dict[c]
+        elif c == separator:
+            got_sep = 1
+    # print num_count, date_count, repr(fmt)
+    if date_count and not num_count:
+        return True
+    if num_count and not date_count:
+        return False
+    if date_count:
+        if book.verbosity:
+            fprintf(book.logfile,
+                'WARNING *** is_date_format: ambiguous d=%d n=%d fmt=%r\n',
+                date_count, num_count, fmt)
+    elif not got_sep:
+        if book.verbosity:
+            fprintf(book.logfile,
+                "WARNING *** format %r produces constant result\n",
+                fmt)
+    return date_count > num_count
+
+def handle_format(self, data, rectype=XL_FORMAT):
+    DEBUG = 0
+    bv = self.biff_version
+    if rectype == XL_FORMAT2:
+        bv = min(bv, 30)
+    if not self.encoding:
+        self.derive_encoding()
+    strpos = 2
+    if bv >= 50:
+        fmtkey = unpack('<H', data[0:2])[0]
+    else:
+        fmtkey = self.actualfmtcount
+        if bv <= 30:
+            strpos = 0
+    self.actualfmtcount += 1
+    if bv >= 80:
+        unistrg = unpack_unicode(data, 2)
+    else:
+        unistrg = unpack_string(data, strpos, self.encoding, lenlen=1)
+    blah = DEBUG or self.verbosity >= 3
+    if blah:
+        fprintf(self.logfile,
+            "FORMAT: count=%d fmtkey=0x%04x (%d) s=%r\n",
+            self.actualfmtcount, fmtkey, fmtkey, unistrg)
+    is_date_s = self.is_date_format_string(unistrg)
+    ty = [FGE, FDT][is_date_s]
+    if not(fmtkey > 163 or bv < 50):
+        # user_defined if fmtkey > 163
+        # N.B.
Gnumeric incorrectly starts these at 50 instead of 164 :-( + # if earlier than BIFF 5, standard info is useless + std_ty = std_format_code_types.get(fmtkey, FUN) + # print "std ty", std_ty + is_date_c = std_ty == FDT + if self.verbosity and 0 < fmtkey < 50 and (is_date_c ^ is_date_s): + DEBUG = 2 + fprintf(self.logfile, + "WARNING *** Conflict between " + "std format key %d and its format string %r\n", + fmtkey, unistrg) + if DEBUG == 2: + fprintf(self.logfile, + "ty: %d; is_date_c: %r; is_date_s: %r; fmt_strg: %r", + ty, is_date_c, is_date_s, unistrg) + fmtobj = Format(fmtkey, ty, unistrg) + if blah: + fmtobj.dump(self.logfile, + header="--- handle_format [%d] ---" % (self.actualfmtcount-1, )) + self.format_map[fmtkey] = fmtobj + self.format_list.append(fmtobj) + +# ============================================================================= + +def handle_palette(book, data): + if not book.formatting_info: + return + blah = DEBUG or book.verbosity >= 2 + n_colours, = unpack('= 50] + if (DEBUG or book.verbosity >= 1) and n_colours != expected_n_colours: + fprintf(book.logfile, + "NOTE *** Expected %d colours in PALETTE record, found %d\n", + expected_n_colours, n_colours) + elif blah: + fprintf(book.logfile, + "PALETTE record with %d colours\n", n_colours) + fmt = '> 8) & 0xff + blue = (c >> 16) & 0xff + old_rgb = book.colour_map[8+i] + new_rgb = (red, green, blue) + book.palette_record.append(new_rgb) + book.colour_map[8+i] = new_rgb + if blah: + if new_rgb != old_rgb: + print("%2d: %r -> %r" % (i, old_rgb, new_rgb), file=book.logfile) + +def palette_epilogue(book): + # Check colour indexes in fonts etc. + # This must be done here as FONT records + # come *before* the PALETTE record :-( + for font in book.font_list: + if font.font_index == 4: # the missing font record + continue + cx = font.colour_index + if cx == 0x7fff: # system window text colour + continue + if cx in book.colour_map: + book.colour_indexes_used[cx] = 1 + elif book.verbosity: + print("Size of colour table:", len(book.colour_map), file=book.logfile) + fprintf(book.logfile, "*** Font #%d (%r): colour index 0x%04x is unknown\n", + font.font_index, font.name, cx) + if book.verbosity >= 1: + used = sorted(book.colour_indexes_used.keys()) + print("\nColour indexes used:\n%r\n" % used, file=book.logfile) + +def handle_style(book, data): + if not book.formatting_info: + return + blah = DEBUG or book.verbosity >= 2 + bv = book.biff_version + flag_and_xfx, built_in_id, level = unpack('= 80: + try: + name = unpack_unicode(data, 2, lenlen=2) + except UnicodeDecodeError: + print("STYLE: built_in=%d xf_index=%d built_in_id=%d level=%d" + % (built_in, xf_index, built_in_id, level), file=book.logfile) + print("raw bytes:", repr(data[2:]), file=book.logfile) + raise + else: + name = unpack_string(data, 2, book.encoding, lenlen=1) + if blah and not name: + print("WARNING *** A user-defined style has a zero-length name", file=book.logfile) + book.style_name_map[name] = (built_in, xf_index) + if blah: + fprintf(book.logfile, "STYLE: built_in=%d xf_index=%d built_in_id=%d level=%d name=%r\n", + built_in, xf_index, built_in_id, level, name) + +def check_colour_indexes_in_obj(book, obj, orig_index): + alist = sorted(obj.__dict__.items()) + for attr, nobj in alist: + if hasattr(nobj, 'dump'): + check_colour_indexes_in_obj(book, nobj, orig_index) + elif attr.find('colour_index') >= 0: + if nobj in book.colour_map: + book.colour_indexes_used[nobj] = 1 + continue + oname = obj.__class__.__name__ + print("*** xf #%d : %s.%s = 0x%04x (unknown)" + 
% (orig_index, oname, attr, nobj), file=book.logfile) + +def fill_in_standard_formats(book): + for x in std_format_code_types.keys(): + if x not in book.format_map: + ty = std_format_code_types[x] + # Note: many standard format codes (mostly CJK date formats) have + # format strings that vary by locale; xlrd does not (yet) + # handle those; the type (date or numeric) is recorded but the fmt_str will be None. + fmt_str = std_format_strings.get(x) + fmtobj = Format(x, ty, fmt_str) + book.format_map[x] = fmtobj + +def handle_xf(self, data): + # self is a Book instance + # DEBUG = 0 + blah = DEBUG or self.verbosity >= 3 + bv = self.biff_version + xf = XF() + xf.alignment = XFAlignment() + xf.alignment.indent_level = 0 + xf.alignment.shrink_to_fit = 0 + xf.alignment.text_direction = 0 + xf.border = XFBorder() + xf.border.diag_up = 0 + xf.border.diag_down = 0 + xf.border.diag_colour_index = 0 + xf.border.diag_line_style = 0 # no line + xf.background = XFBackground() + xf.protection = XFProtection() + # fill in the known standard formats + if bv >= 50 and not self.xfcount: + # i.e. do this once before we process the first XF record + fill_in_standard_formats(self) + if bv >= 80: + unpack_fmt = '> 2 + attr_stems = [ + 'format', + 'font', + 'alignment', + 'border', + 'background', + 'protection', + ] + for attr_stem in attr_stems: + attr = "_" + attr_stem + "_flag" + setattr(xf, attr, reg & 1) + reg >>= 1 + upkbitsL(xf.border, pkd_brdbkg1, ( + (0, 0x0000000f, 'left_line_style'), + (4, 0x000000f0, 'right_line_style'), + (8, 0x00000f00, 'top_line_style'), + (12, 0x0000f000, 'bottom_line_style'), + (16, 0x007f0000, 'left_colour_index'), + (23, 0x3f800000, 'right_colour_index'), + (30, 0x40000000, 'diag_down'), + (31, 0x80000000, 'diag_up'), + )) + upkbits(xf.border, pkd_brdbkg2, ( + (0, 0x0000007F, 'top_colour_index'), + (7, 0x00003F80, 'bottom_colour_index'), + (14, 0x001FC000, 'diag_colour_index'), + (21, 0x01E00000, 'diag_line_style'), + )) + upkbitsL(xf.background, pkd_brdbkg2, ( + (26, 0xFC000000, 'fill_pattern'), + )) + upkbits(xf.background, pkd_brdbkg3, ( + (0, 0x007F, 'pattern_colour_index'), + (7, 0x3F80, 'background_colour_index'), + )) + elif bv >= 50: + unpack_fmt = '> 2 + attr_stems = [ + 'format', + 'font', + 'alignment', + 'border', + 'background', + 'protection', + ] + for attr_stem in attr_stems: + attr = "_" + attr_stem + "_flag" + setattr(xf, attr, reg & 1) + reg >>= 1 + upkbitsL(xf.background, pkd_brdbkg1, ( + ( 0, 0x0000007F, 'pattern_colour_index'), + ( 7, 0x00003F80, 'background_colour_index'), + (16, 0x003F0000, 'fill_pattern'), + )) + upkbitsL(xf.border, pkd_brdbkg1, ( + (22, 0x01C00000, 'bottom_line_style'), + (25, 0xFE000000, 'bottom_colour_index'), + )) + upkbits(xf.border, pkd_brdbkg2, ( + ( 0, 0x00000007, 'top_line_style'), + ( 3, 0x00000038, 'left_line_style'), + ( 6, 0x000001C0, 'right_line_style'), + ( 9, 0x0000FE00, 'top_colour_index'), + (16, 0x007F0000, 'left_colour_index'), + (23, 0x3F800000, 'right_colour_index'), + )) + elif bv >= 40: + unpack_fmt = '> 6 + xf.alignment.rotation = [0, 255, 90, 180][orientation] + reg = pkd_used >> 2 + attr_stems = [ + 'format', + 'font', + 'alignment', + 'border', + 'background', + 'protection', + ] + for attr_stem in attr_stems: + attr = "_" + attr_stem + "_flag" + setattr(xf, attr, reg & 1) + reg >>= 1 + upkbits(xf.background, pkd_bkg_34, ( + ( 0, 0x003F, 'fill_pattern'), + ( 6, 0x07C0, 'pattern_colour_index'), + (11, 0xF800, 'background_colour_index'), + )) + upkbitsL(xf.border, pkd_brd_34, ( + ( 0, 0x00000007, 
'top_line_style'), + ( 3, 0x000000F8, 'top_colour_index'), + ( 8, 0x00000700, 'left_line_style'), + (11, 0x0000F800, 'left_colour_index'), + (16, 0x00070000, 'bottom_line_style'), + (19, 0x00F80000, 'bottom_colour_index'), + (24, 0x07000000, 'right_line_style'), + (27, 0xF8000000, 'right_colour_index'), + )) + elif bv == 30: + unpack_fmt = '> 2 + attr_stems = [ + 'format', + 'font', + 'alignment', + 'border', + 'background', + 'protection', + ] + for attr_stem in attr_stems: + attr = "_" + attr_stem + "_flag" + setattr(xf, attr, reg & 1) + reg >>= 1 + upkbits(xf.background, pkd_bkg_34, ( + ( 0, 0x003F, 'fill_pattern'), + ( 6, 0x07C0, 'pattern_colour_index'), + (11, 0xF800, 'background_colour_index'), + )) + upkbitsL(xf.border, pkd_brd_34, ( + ( 0, 0x00000007, 'top_line_style'), + ( 3, 0x000000F8, 'top_colour_index'), + ( 8, 0x00000700, 'left_line_style'), + (11, 0x0000F800, 'left_colour_index'), + (16, 0x00070000, 'bottom_line_style'), + (19, 0x00F80000, 'bottom_colour_index'), + (24, 0x07000000, 'right_line_style'), + (27, 0xF8000000, 'right_colour_index'), + )) + xf.alignment.vert_align = 2 # bottom + xf.alignment.rotation = 0 + elif bv == 21: + ## Warning: incomplete treatment; formatting_info not fully supported. + ## Probably need to offset incoming BIFF2 XF[n] to BIFF8-like XF[n+16], + ## and create XF[0:16] like the standard ones in BIFF8 *AND* add 16 to + ## all XF references in cell records :-( + (xf.font_index, format_etc, halign_etc) = unpack('= 3 + blah1 = DEBUG or self.verbosity >= 1 + if blah: + fprintf(self.logfile, "xf_epilogue called ...\n") + + def check_same(book_arg, xf_arg, parent_arg, attr): + # the _arg caper is to avoid a Warning msg from Python 2.1 :-( + if getattr(xf_arg, attr) != getattr(parent_arg, attr): + fprintf(book_arg.logfile, + "NOTE !!! XF[%d] parent[%d] %s different\n", + xf_arg.xf_index, parent_arg.xf_index, attr) + + for xfx in xrange(num_xfs): + xf = self.xf_list[xfx] + + try: + fmt = self.format_map[xf.format_key] + cellty = _cellty_from_fmtty[fmt.type] + except KeyError: + cellty = XL_CELL_TEXT + self._xf_index_to_xl_type_map[xf.xf_index] = cellty + # Now for some assertions etc + if not self.formatting_info: + continue + if xf.is_style: + continue + if not(0 <= xf.parent_style_index < num_xfs): + if blah1: + fprintf(self.logfile, + "WARNING *** XF[%d]: is_style=%d but parent_style_index=%d\n", + xf.xf_index, xf.is_style, xf.parent_style_index) + # make it conform + xf.parent_style_index = 0 + if self.biff_version >= 30: + if blah1: + if xf.parent_style_index == xf.xf_index: + fprintf(self.logfile, + "NOTE !!! XF[%d]: parent_style_index is also %d\n", + xf.xf_index, xf.parent_style_index) + elif not self.xf_list[xf.parent_style_index].is_style: + fprintf(self.logfile, + "NOTE !!! XF[%d]: parent_style_index is %d; style flag not set\n", + xf.xf_index, xf.parent_style_index) + if blah1 and xf.parent_style_index > xf.xf_index: + fprintf(self.logfile, + "NOTE !!! 
XF[%d]: parent_style_index is %d; out of order?\n", + xf.xf_index, xf.parent_style_index) + parent = self.xf_list[xf.parent_style_index] + if not xf._alignment_flag and not parent._alignment_flag: + if blah1: check_same(self, xf, parent, 'alignment') + if not xf._background_flag and not parent._background_flag: + if blah1: check_same(self, xf, parent, 'background') + if not xf._border_flag and not parent._border_flag: + if blah1: check_same(self, xf, parent, 'border') + if not xf._protection_flag and not parent._protection_flag: + if blah1: check_same(self, xf, parent, 'protection') + if not xf._format_flag and not parent._format_flag: + if blah1 and xf.format_key != parent.format_key: + fprintf(self.logfile, + "NOTE !!! XF[%d] fmtk=%d, parent[%d] fmtk=%r\n%r / %r\n", + xf.xf_index, xf.format_key, parent.xf_index, parent.format_key, + self.format_map[xf.format_key].format_str, + self.format_map[parent.format_key].format_str) + if not xf._font_flag and not parent._font_flag: + if blah1 and xf.font_index != parent.font_index: + fprintf(self.logfile, + "NOTE !!! XF[%d] fontx=%d, parent[%d] fontx=%r\n", + xf.xf_index, xf.font_index, parent.xf_index, parent.font_index) + +def initialise_book(book): + initialise_colour_map(book) + book._xf_epilogue_done = 0 + methods = ( + handle_font, + handle_efont, + handle_format, + is_date_format_string, + handle_palette, + palette_epilogue, + handle_style, + handle_xf, + xf_epilogue, + ) + for method in methods: + setattr(book.__class__, method.__name__, method) + +class XFBorder(BaseObject, EqNeAttrs): + """ + A collection of the border-related attributes of an ``XF`` record. + Items correspond to those in the Excel UI's Format -> Cells -> Border tab. + + An explanations of "colour index" is given in :ref:`palette`. + + There are five line style attributes; possible values and the + associated meanings are:: + + 0 = No line, + 1 = Thin, + 2 = Medium, + 3 = Dashed, + 4 = Dotted, + 5 = Thick, + 6 = Double, + 7 = Hair, + 8 = Medium dashed, + 9 = Thin dash-dotted, + 10 = Medium dash-dotted, + 11 = Thin dash-dot-dotted, + 12 = Medium dash-dot-dotted, + 13 = Slanted medium dash-dotted. + + The line styles 8 to 13 appear in BIFF8 files (Excel 97 and later) only. + For pictures of the line styles, refer to OOo docs s3.10 (p22) + "Line Styles for Cell Borders (BIFF3-BIFF8)".

+ + .. versionadded:: 0.6.1 + """ + + #: The colour index for the cell's top line + top_colour_index = 0 + #: The colour index for the cell's bottom line + bottom_colour_index = 0 + + #: The colour index for the cell's left line + left_colour_index = 0 + + #: The colour index for the cell's right line + right_colour_index = 0 + + #: The colour index for the cell's diagonal lines, if any + diag_colour_index = 0 + + #: The line style for the cell's top line + top_line_style = 0 + + #: The line style for the cell's bottom line + bottom_line_style = 0 + + #: The line style for the cell's left line + left_line_style = 0 + + #: The line style for the cell's right line + right_line_style = 0 + + #: The line style for the cell's diagonal lines, if any + diag_line_style = 0 + + #: 1 = draw a diagonal from top left to bottom right + diag_down = 0 + + #: 1 = draw a diagonal from bottom left to top right + diag_up = 0 + +class XFBackground(BaseObject, EqNeAttrs): + """ + A collection of the background-related attributes of an ``XF`` record. + Items correspond to those in the Excel UI's Format -> Cells -> Patterns tab. + + An explanations of "colour index" is given in :ref:`palette`. + + .. versionadded:: 0.6.1 + """ + + #: See section 3.11 of the OOo docs. + fill_pattern = 0 + + #: See section 3.11 of the OOo docs. + background_colour_index = 0 + + #: See section 3.11 of the OOo docs. + pattern_colour_index = 0 + + +class XFAlignment(BaseObject, EqNeAttrs): + """ + A collection of the alignment and similar attributes of an ``XF`` record. + Items correspond to those in the Excel UI's Format -> Cells -> Alignment tab. + + .. versionadded:: 0.6.1 + """ + + #: Values: section 6.115 (p 214) of OOo docs + hor_align = 0 + + #: Values: section 6.115 (p 215) of OOo docs + vert_align = 0 + + #: Values: section 6.115 (p 215) of OOo docs. + #: + #: .. note:: + #: file versions BIFF7 and earlier use the documented + #: :attr:`orientation` attribute; this will be mapped (without loss) + #: into :attr:`rotation`. + rotation = 0 + + #: 1 = text is wrapped at right margin + text_wrapped = 0 + + #: A number in ``range(15)``. + indent_level = 0 + + #: 1 = shrink font size to fit text into cell. + shrink_to_fit = 0 + + #: 0 = according to context; 1 = left-to-right; 2 = right-to-left + text_direction = 0 + +class XFProtection(BaseObject, EqNeAttrs): + """ + A collection of the protection-related attributes of an ``XF`` record. + Items correspond to those in the Excel UI's Format -> Cells -> Protection tab. + Note the OOo docs include the "cell or style" bit in this bundle of + attributes. This is incorrect; the bit is used in determining which bundles + to use. + + .. versionadded:: 0.6.1 + """ + + #: 1 = Cell is prevented from being changed, moved, resized, or deleted + #: (only if the sheet is protected). + cell_locked = 0 + + #: 1 = Hide formula so that it doesn't appear in the formula bar when + #: the cell is selected (only if the sheet is protected). + formula_hidden = 0 + +class XF(BaseObject): + """ + eXtended Formatting information for cells, rows, columns and styles. + + Each of the 6 flags below describes the validity of + a specific group of attributes. + + In cell XFs: + + - ``flag==0`` means the attributes of the parent style ``XF`` are + used, (but only if the attributes are valid there); + + - ``flag==1`` means the attributes of this ``XF`` are used. + + In style XFs: + + - ``flag==0`` means the attribute setting is valid; + - ``flag==1`` means the attribute should be ignored. + + .. 
note:: + the API provides both "raw" XFs and "computed" XFs. In the latter case, + cell XFs have had the above inheritance mechanism applied. + + .. versionadded:: 0.6.1 + """ + + #: 0 = cell XF, 1 = style XF + is_style = 0 + + #: cell XF: Index into Book.xf_list of this XF's style XF + #: + #: style XF: 0xFFF + parent_style_index = 0 + + # + _format_flag = 0 + + # + _font_flag = 0 + + # + _alignment_flag = 0 + + # + _border_flag = 0 + + # + _background_flag = 0 + + _protection_flag = 0 + + #: Index into :attr:`~xlrd.book.Book.xf_list` + xf_index = 0 + + #: Index into :attr:`~xlrd.book.Book.font_list` + font_index = 0 + + #: Key into :attr:`~xlrd.book.Book.format_map` + #: + #: .. warning:: + #: OOo docs on the XF record call this "Index to FORMAT record". + #: It is not an index in the Python sense. It is a key to a map. + #: It is true *only* for Excel 4.0 and earlier files + #: that the key into format_map from an XF instance + #: is the same as the index into format_list, and *only* + #: if the index is less than 164. + format_key = 0 + + #: An instance of an :class:`XFProtection` object. + protection = None + + #: An instance of an :class:`XFBackground` object. + background = None + + #: An instance of an :class:`XFAlignment` object. + alignment = None + + #: An instance of an :class:`XFBorder` object. + border = None diff --git a/venv/Lib/site-packages/xlrd/formula.py b/venv/Lib/site-packages/xlrd/formula.py new file mode 100644 index 0000000..e26639b --- /dev/null +++ b/venv/Lib/site-packages/xlrd/formula.py @@ -0,0 +1,2190 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. +# No part of the content of this file was derived from the works of +# David Giffin. +""" +Module for parsing/evaluating Microsoft Excel formulas. 
+""" + +from __future__ import print_function + +import copy +import operator as opr +from struct import unpack + +from .biffh import ( + BaseObject, XLRDError, error_text_from_code, hex_char_dump, + unpack_string_update_pos, unpack_unicode_update_pos, +) +from .timemachine import * + +__all__ = [ + 'oBOOL', 'oERR', 'oNUM', 'oREF', 'oREL', 'oSTRG', 'oUNK', + 'decompile_formula', + 'dump_formula', + 'evaluate_name_formula', + 'okind_dict', + 'rangename3d', 'rangename3drel', 'cellname', 'cellnameabs', 'colname', + 'FMLA_TYPE_CELL', + 'FMLA_TYPE_SHARED', + 'FMLA_TYPE_ARRAY', + 'FMLA_TYPE_COND_FMT', + 'FMLA_TYPE_DATA_VAL', + 'FMLA_TYPE_NAME', + 'Operand', 'Ref3D', +] + +FMLA_TYPE_CELL = 1 +FMLA_TYPE_SHARED = 2 +FMLA_TYPE_ARRAY = 4 +FMLA_TYPE_COND_FMT = 8 +FMLA_TYPE_DATA_VAL = 16 +FMLA_TYPE_NAME = 32 +ALL_FMLA_TYPES = 63 + + +FMLA_TYPEDESCR_MAP = { + 1 : 'CELL', + 2 : 'SHARED', + 4 : 'ARRAY', + 8 : 'COND-FMT', + 16: 'DATA-VAL', + 32: 'NAME', +} + +_TOKEN_NOT_ALLOWED = { + 0x01: ALL_FMLA_TYPES - FMLA_TYPE_CELL, # tExp + 0x02: ALL_FMLA_TYPES - FMLA_TYPE_CELL, # tTbl + 0x0F: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tIsect + 0x10: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tUnion/List + 0x11: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tRange + 0x20: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tArray + 0x23: FMLA_TYPE_SHARED, # tName + 0x39: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tNameX + 0x3A: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tRef3d + 0x3B: FMLA_TYPE_SHARED + FMLA_TYPE_COND_FMT + FMLA_TYPE_DATA_VAL, # tArea3d + 0x2C: FMLA_TYPE_CELL + FMLA_TYPE_ARRAY, # tRefN + 0x2D: FMLA_TYPE_CELL + FMLA_TYPE_ARRAY, # tAreaN + # plus weird stuff like tMem* +}.get + +oBOOL = 3 +oERR = 4 +oMSNG = 5 # tMissArg +oNUM = 2 +oREF = -1 +oREL = -2 +oSTRG = 1 +oUNK = 0 + +okind_dict = { + -2: "oREL", + -1: "oREF", + 0 : "oUNK", + 1 : "oSTRG", + 2 : "oNUM", + 3 : "oBOOL", + 4 : "oERR", + 5 : "oMSNG", +} + +listsep = ',' #### probably should depend on locale + + +# sztabN[opcode] -> the number of bytes to consume. +# -1 means variable +# -2 means this opcode not implemented in this version. +# Which N to use? Depends on biff_version; see szdict. 
+sztab0 = [-2, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -2, -1, 8, 4, 2, 2, 3, 9, 8, 2, 3, 8, 4, 7, 5, 5, 5, 2, 4, 7, 4, 7, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, 3, -2, -2, -2, -2, -2, -2, -2] +sztab1 = [-2, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -2, -1, 11, 5, 2, 2, 3, 9, 9, 2, 3, 11, 4, 7, 7, 7, 7, 3, 4, 7, 4, 7, 3, 3, -2, -2, -2, -2, -2, -2, -2, -2, 3, -2, -2, -2, -2, -2, -2, -2] +sztab2 = [-2, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -2, -1, 11, 5, 2, 2, 3, 9, 9, 3, 4, 11, 4, 7, 7, 7, 7, 3, 4, 7, 4, 7, 3, 3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2] +sztab3 = [-2, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -2, -1, -2, -2, 2, 2, 3, 9, 9, 3, 4, 15, 4, 7, 7, 7, 7, 3, 4, 7, 4, 7, 3, 3, -2, -2, -2, -2, -2, -2, -2, -2, -2, 25, 18, 21, 18, 21, -2, -2] +sztab4 = [-2, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -2, -2, 2, 2, 3, 9, 9, 3, 4, 5, 5, 9, 7, 7, 7, 3, 5, 9, 5, 9, 3, 3, -2, -2, -2, -2, -2, -2, -2, -2, -2, 7, 7, 11, 7, 11, -2, -2] + +szdict = { + 20 : sztab0, + 21 : sztab0, + 30 : sztab1, + 40 : sztab2, + 45 : sztab2, + 50 : sztab3, + 70 : sztab3, + 80 : sztab4, +} + +# For debugging purposes ... the name for each opcode +# (without the prefix "t" used on OOo docs) +onames = ['Unk00', 'Exp', 'Tbl', 'Add', 'Sub', 'Mul', 'Div', 'Power', 'Concat', 'LT', 'LE', 'EQ', 'GE', 'GT', 'NE', 'Isect', 'List', 'Range', 'Uplus', 'Uminus', 'Percent', 'Paren', 'MissArg', 'Str', 'Extended', 'Attr', 'Sheet', 'EndSheet', 'Err', 'Bool', 'Int', 'Num', 'Array', 'Func', 'FuncVar', 'Name', 'Ref', 'Area', 'MemArea', 'MemErr', 'MemNoMem', 'MemFunc', 'RefErr', 'AreaErr', 'RefN', 'AreaN', 'MemAreaN', 'MemNoMemN', '', '', '', '', '', '', '', '', 'FuncCE', 'NameX', 'Ref3d', 'Area3d', 'RefErr3d', 'AreaErr3d', '', ''] + +func_defs = { + # index: (name, min#args, max#args, flags, #known_args, return_type, kargs) + 0 : ('COUNT', 0, 30, 0x04, 1, 'V', 'R'), + 1 : ('IF', 2, 3, 0x04, 3, 'V', 'VRR'), + 2 : ('ISNA', 1, 1, 0x02, 1, 'V', 'V'), + 3 : ('ISERROR', 1, 1, 0x02, 1, 'V', 'V'), + 4 : ('SUM', 0, 30, 0x04, 1, 'V', 'R'), + 5 : ('AVERAGE', 1, 30, 0x04, 1, 'V', 'R'), + 6 : ('MIN', 1, 30, 0x04, 1, 'V', 'R'), + 7 : ('MAX', 1, 30, 0x04, 1, 'V', 'R'), + 8 : ('ROW', 0, 1, 0x04, 1, 'V', 'R'), + 9 : ('COLUMN', 0, 1, 0x04, 1, 'V', 'R'), + 10 : ('NA', 0, 0, 0x02, 0, 'V', ''), + 11 : ('NPV', 2, 30, 0x04, 2, 'V', 'VR'), + 12 : ('STDEV', 1, 30, 0x04, 1, 'V', 'R'), + 13 : ('DOLLAR', 1, 2, 0x04, 1, 'V', 'V'), + 14 : ('FIXED', 2, 3, 0x04, 3, 'V', 'VVV'), + 15 : ('SIN', 1, 1, 0x02, 1, 'V', 'V'), + 16 : ('COS', 1, 1, 0x02, 1, 'V', 'V'), + 17 : ('TAN', 1, 1, 0x02, 1, 'V', 'V'), + 18 : ('ATAN', 1, 1, 0x02, 1, 'V', 'V'), + 19 : ('PI', 0, 0, 0x02, 0, 'V', ''), + 20 : ('SQRT', 1, 1, 0x02, 1, 'V', 'V'), + 21 : ('EXP', 1, 1, 0x02, 1, 'V', 'V'), + 22 : ('LN', 1, 1, 0x02, 1, 'V', 'V'), + 23 : ('LOG10', 1, 1, 0x02, 1, 'V', 'V'), + 24 : ('ABS', 1, 1, 0x02, 1, 'V', 'V'), + 25 : ('INT', 1, 1, 0x02, 1, 'V', 'V'), + 26 : ('SIGN', 1, 1, 0x02, 1, 'V', 'V'), + 27 : ('ROUND', 2, 2, 0x02, 2, 'V', 'VV'), + 28 : ('LOOKUP', 2, 3, 0x04, 2, 'V', 'VR'), + 29 : ('INDEX', 2, 4, 0x0c, 4, 'R', 'RVVV'), + 30 : ('REPT', 2, 2, 0x02, 2, 'V', 'VV'), + 31 : ('MID', 3, 3, 0x02, 3, 'V', 'VVV'), + 32 : ('LEN', 1, 1, 0x02, 1, 'V', 'V'), + 33 : ('VALUE', 1, 1, 0x02, 1, 'V', 'V'), + 34 : ('TRUE', 0, 0, 0x02, 0, 'V', ''), + 35 : ('FALSE', 0, 0, 0x02, 0, 'V', ''), + 36 : ('AND', 1, 30, 0x04, 1, 'V', 'R'), + 37 : 
('OR', 1, 30, 0x04, 1, 'V', 'R'), + 38 : ('NOT', 1, 1, 0x02, 1, 'V', 'V'), + 39 : ('MOD', 2, 2, 0x02, 2, 'V', 'VV'), + 40 : ('DCOUNT', 3, 3, 0x02, 3, 'V', 'RRR'), + 41 : ('DSUM', 3, 3, 0x02, 3, 'V', 'RRR'), + 42 : ('DAVERAGE', 3, 3, 0x02, 3, 'V', 'RRR'), + 43 : ('DMIN', 3, 3, 0x02, 3, 'V', 'RRR'), + 44 : ('DMAX', 3, 3, 0x02, 3, 'V', 'RRR'), + 45 : ('DSTDEV', 3, 3, 0x02, 3, 'V', 'RRR'), + 46 : ('VAR', 1, 30, 0x04, 1, 'V', 'R'), + 47 : ('DVAR', 3, 3, 0x02, 3, 'V', 'RRR'), + 48 : ('TEXT', 2, 2, 0x02, 2, 'V', 'VV'), + 49 : ('LINEST', 1, 4, 0x04, 4, 'A', 'RRVV'), + 50 : ('TREND', 1, 4, 0x04, 4, 'A', 'RRRV'), + 51 : ('LOGEST', 1, 4, 0x04, 4, 'A', 'RRVV'), + 52 : ('GROWTH', 1, 4, 0x04, 4, 'A', 'RRRV'), + 56 : ('PV', 3, 5, 0x04, 5, 'V', 'VVVVV'), + 57 : ('FV', 3, 5, 0x04, 5, 'V', 'VVVVV'), + 58 : ('NPER', 3, 5, 0x04, 5, 'V', 'VVVVV'), + 59 : ('PMT', 3, 5, 0x04, 5, 'V', 'VVVVV'), + 60 : ('RATE', 3, 6, 0x04, 6, 'V', 'VVVVVV'), + 61 : ('MIRR', 3, 3, 0x02, 3, 'V', 'RVV'), + 62 : ('IRR', 1, 2, 0x04, 2, 'V', 'RV'), + 63 : ('RAND', 0, 0, 0x0a, 0, 'V', ''), + 64 : ('MATCH', 2, 3, 0x04, 3, 'V', 'VRR'), + 65 : ('DATE', 3, 3, 0x02, 3, 'V', 'VVV'), + 66 : ('TIME', 3, 3, 0x02, 3, 'V', 'VVV'), + 67 : ('DAY', 1, 1, 0x02, 1, 'V', 'V'), + 68 : ('MONTH', 1, 1, 0x02, 1, 'V', 'V'), + 69 : ('YEAR', 1, 1, 0x02, 1, 'V', 'V'), + 70 : ('WEEKDAY', 1, 2, 0x04, 2, 'V', 'VV'), + 71 : ('HOUR', 1, 1, 0x02, 1, 'V', 'V'), + 72 : ('MINUTE', 1, 1, 0x02, 1, 'V', 'V'), + 73 : ('SECOND', 1, 1, 0x02, 1, 'V', 'V'), + 74 : ('NOW', 0, 0, 0x0a, 0, 'V', ''), + 75 : ('AREAS', 1, 1, 0x02, 1, 'V', 'R'), + 76 : ('ROWS', 1, 1, 0x02, 1, 'V', 'R'), + 77 : ('COLUMNS', 1, 1, 0x02, 1, 'V', 'R'), + 78 : ('OFFSET', 3, 5, 0x04, 5, 'R', 'RVVVV'), + 82 : ('SEARCH', 2, 3, 0x04, 3, 'V', 'VVV'), + 83 : ('TRANSPOSE', 1, 1, 0x02, 1, 'A', 'A'), + 86 : ('TYPE', 1, 1, 0x02, 1, 'V', 'V'), + 92 : ('SERIESSUM', 4, 4, 0x02, 4, 'V', 'VVVA'), + 97 : ('ATAN2', 2, 2, 0x02, 2, 'V', 'VV'), + 98 : ('ASIN', 1, 1, 0x02, 1, 'V', 'V'), + 99 : ('ACOS', 1, 1, 0x02, 1, 'V', 'V'), + 100: ('CHOOSE', 2, 30, 0x04, 2, 'V', 'VR'), + 101: ('HLOOKUP', 3, 4, 0x04, 4, 'V', 'VRRV'), + 102: ('VLOOKUP', 3, 4, 0x04, 4, 'V', 'VRRV'), + 105: ('ISREF', 1, 1, 0x02, 1, 'V', 'R'), + 109: ('LOG', 1, 2, 0x04, 2, 'V', 'VV'), + 111: ('CHAR', 1, 1, 0x02, 1, 'V', 'V'), + 112: ('LOWER', 1, 1, 0x02, 1, 'V', 'V'), + 113: ('UPPER', 1, 1, 0x02, 1, 'V', 'V'), + 114: ('PROPER', 1, 1, 0x02, 1, 'V', 'V'), + 115: ('LEFT', 1, 2, 0x04, 2, 'V', 'VV'), + 116: ('RIGHT', 1, 2, 0x04, 2, 'V', 'VV'), + 117: ('EXACT', 2, 2, 0x02, 2, 'V', 'VV'), + 118: ('TRIM', 1, 1, 0x02, 1, 'V', 'V'), + 119: ('REPLACE', 4, 4, 0x02, 4, 'V', 'VVVV'), + 120: ('SUBSTITUTE', 3, 4, 0x04, 4, 'V', 'VVVV'), + 121: ('CODE', 1, 1, 0x02, 1, 'V', 'V'), + 124: ('FIND', 2, 3, 0x04, 3, 'V', 'VVV'), + 125: ('CELL', 1, 2, 0x0c, 2, 'V', 'VR'), + 126: ('ISERR', 1, 1, 0x02, 1, 'V', 'V'), + 127: ('ISTEXT', 1, 1, 0x02, 1, 'V', 'V'), + 128: ('ISNUMBER', 1, 1, 0x02, 1, 'V', 'V'), + 129: ('ISBLANK', 1, 1, 0x02, 1, 'V', 'V'), + 130: ('T', 1, 1, 0x02, 1, 'V', 'R'), + 131: ('N', 1, 1, 0x02, 1, 'V', 'R'), + 140: ('DATEVALUE', 1, 1, 0x02, 1, 'V', 'V'), + 141: ('TIMEVALUE', 1, 1, 0x02, 1, 'V', 'V'), + 142: ('SLN', 3, 3, 0x02, 3, 'V', 'VVV'), + 143: ('SYD', 4, 4, 0x02, 4, 'V', 'VVVV'), + 144: ('DDB', 4, 5, 0x04, 5, 'V', 'VVVVV'), + 148: ('INDIRECT', 1, 2, 0x0c, 2, 'R', 'VV'), + 162: ('CLEAN', 1, 1, 0x02, 1, 'V', 'V'), + 163: ('MDETERM', 1, 1, 0x02, 1, 'V', 'A'), + 164: ('MINVERSE', 1, 1, 0x02, 1, 'A', 'A'), + 165: ('MMULT', 2, 2, 0x02, 2, 'A', 'AA'), + 167: ('IPMT', 4, 6, 
0x04, 6, 'V', 'VVVVVV'), + 168: ('PPMT', 4, 6, 0x04, 6, 'V', 'VVVVVV'), + 169: ('COUNTA', 0, 30, 0x04, 1, 'V', 'R'), + 183: ('PRODUCT', 0, 30, 0x04, 1, 'V', 'R'), + 184: ('FACT', 1, 1, 0x02, 1, 'V', 'V'), + 189: ('DPRODUCT', 3, 3, 0x02, 3, 'V', 'RRR'), + 190: ('ISNONTEXT', 1, 1, 0x02, 1, 'V', 'V'), + 193: ('STDEVP', 1, 30, 0x04, 1, 'V', 'R'), + 194: ('VARP', 1, 30, 0x04, 1, 'V', 'R'), + 195: ('DSTDEVP', 3, 3, 0x02, 3, 'V', 'RRR'), + 196: ('DVARP', 3, 3, 0x02, 3, 'V', 'RRR'), + 197: ('TRUNC', 1, 2, 0x04, 2, 'V', 'VV'), + 198: ('ISLOGICAL', 1, 1, 0x02, 1, 'V', 'V'), + 199: ('DCOUNTA', 3, 3, 0x02, 3, 'V', 'RRR'), + 204: ('USDOLLAR', 1, 2, 0x04, 2, 'V', 'VV'), + 205: ('FINDB', 2, 3, 0x04, 3, 'V', 'VVV'), + 206: ('SEARCHB', 2, 3, 0x04, 3, 'V', 'VVV'), + 207: ('REPLACEB', 4, 4, 0x02, 4, 'V', 'VVVV'), + 208: ('LEFTB', 1, 2, 0x04, 2, 'V', 'VV'), + 209: ('RIGHTB', 1, 2, 0x04, 2, 'V', 'VV'), + 210: ('MIDB', 3, 3, 0x02, 3, 'V', 'VVV'), + 211: ('LENB', 1, 1, 0x02, 1, 'V', 'V'), + 212: ('ROUNDUP', 2, 2, 0x02, 2, 'V', 'VV'), + 213: ('ROUNDDOWN', 2, 2, 0x02, 2, 'V', 'VV'), + 214: ('ASC', 1, 1, 0x02, 1, 'V', 'V'), + 215: ('DBCS', 1, 1, 0x02, 1, 'V', 'V'), + 216: ('RANK', 2, 3, 0x04, 3, 'V', 'VRV'), + 219: ('ADDRESS', 2, 5, 0x04, 5, 'V', 'VVVVV'), + 220: ('DAYS360', 2, 3, 0x04, 3, 'V', 'VVV'), + 221: ('TODAY', 0, 0, 0x0a, 0, 'V', ''), + 222: ('VDB', 5, 7, 0x04, 7, 'V', 'VVVVVVV'), + 227: ('MEDIAN', 1, 30, 0x04, 1, 'V', 'R'), + 228: ('SUMPRODUCT', 1, 30, 0x04, 1, 'V', 'A'), + 229: ('SINH', 1, 1, 0x02, 1, 'V', 'V'), + 230: ('COSH', 1, 1, 0x02, 1, 'V', 'V'), + 231: ('TANH', 1, 1, 0x02, 1, 'V', 'V'), + 232: ('ASINH', 1, 1, 0x02, 1, 'V', 'V'), + 233: ('ACOSH', 1, 1, 0x02, 1, 'V', 'V'), + 234: ('ATANH', 1, 1, 0x02, 1, 'V', 'V'), + 235: ('DGET', 3, 3, 0x02, 3, 'V', 'RRR'), + 244: ('INFO', 1, 1, 0x02, 1, 'V', 'V'), + 247: ('DB', 4, 5, 0x04, 5, 'V', 'VVVVV'), + 252: ('FREQUENCY', 2, 2, 0x02, 2, 'A', 'RR'), + 261: ('ERROR.TYPE', 1, 1, 0x02, 1, 'V', 'V'), + 269: ('AVEDEV', 1, 30, 0x04, 1, 'V', 'R'), + 270: ('BETADIST', 3, 5, 0x04, 1, 'V', 'V'), + 271: ('GAMMALN', 1, 1, 0x02, 1, 'V', 'V'), + 272: ('BETAINV', 3, 5, 0x04, 1, 'V', 'V'), + 273: ('BINOMDIST', 4, 4, 0x02, 4, 'V', 'VVVV'), + 274: ('CHIDIST', 2, 2, 0x02, 2, 'V', 'VV'), + 275: ('CHIINV', 2, 2, 0x02, 2, 'V', 'VV'), + 276: ('COMBIN', 2, 2, 0x02, 2, 'V', 'VV'), + 277: ('CONFIDENCE', 3, 3, 0x02, 3, 'V', 'VVV'), + 278: ('CRITBINOM', 3, 3, 0x02, 3, 'V', 'VVV'), + 279: ('EVEN', 1, 1, 0x02, 1, 'V', 'V'), + 280: ('EXPONDIST', 3, 3, 0x02, 3, 'V', 'VVV'), + 281: ('FDIST', 3, 3, 0x02, 3, 'V', 'VVV'), + 282: ('FINV', 3, 3, 0x02, 3, 'V', 'VVV'), + 283: ('FISHER', 1, 1, 0x02, 1, 'V', 'V'), + 284: ('FISHERINV', 1, 1, 0x02, 1, 'V', 'V'), + 285: ('FLOOR', 2, 2, 0x02, 2, 'V', 'VV'), + 286: ('GAMMADIST', 4, 4, 0x02, 4, 'V', 'VVVV'), + 287: ('GAMMAINV', 3, 3, 0x02, 3, 'V', 'VVV'), + 288: ('CEILING', 2, 2, 0x02, 2, 'V', 'VV'), + 289: ('HYPGEOMDIST', 4, 4, 0x02, 4, 'V', 'VVVV'), + 290: ('LOGNORMDIST', 3, 3, 0x02, 3, 'V', 'VVV'), + 291: ('LOGINV', 3, 3, 0x02, 3, 'V', 'VVV'), + 292: ('NEGBINOMDIST', 3, 3, 0x02, 3, 'V', 'VVV'), + 293: ('NORMDIST', 4, 4, 0x02, 4, 'V', 'VVVV'), + 294: ('NORMSDIST', 1, 1, 0x02, 1, 'V', 'V'), + 295: ('NORMINV', 3, 3, 0x02, 3, 'V', 'VVV'), + 296: ('NORMSINV', 1, 1, 0x02, 1, 'V', 'V'), + 297: ('STANDARDIZE', 3, 3, 0x02, 3, 'V', 'VVV'), + 298: ('ODD', 1, 1, 0x02, 1, 'V', 'V'), + 299: ('PERMUT', 2, 2, 0x02, 2, 'V', 'VV'), + 300: ('POISSON', 3, 3, 0x02, 3, 'V', 'VVV'), + 301: ('TDIST', 3, 3, 0x02, 3, 'V', 'VVV'), + 302: ('WEIBULL', 4, 4, 0x02, 4, 'V', 'VVVV'), 
+ 303: ('SUMXMY2', 2, 2, 0x02, 2, 'V', 'AA'), + 304: ('SUMX2MY2', 2, 2, 0x02, 2, 'V', 'AA'), + 305: ('SUMX2PY2', 2, 2, 0x02, 2, 'V', 'AA'), + 306: ('CHITEST', 2, 2, 0x02, 2, 'V', 'AA'), + 307: ('CORREL', 2, 2, 0x02, 2, 'V', 'AA'), + 308: ('COVAR', 2, 2, 0x02, 2, 'V', 'AA'), + 309: ('FORECAST', 3, 3, 0x02, 3, 'V', 'VAA'), + 310: ('FTEST', 2, 2, 0x02, 2, 'V', 'AA'), + 311: ('INTERCEPT', 2, 2, 0x02, 2, 'V', 'AA'), + 312: ('PEARSON', 2, 2, 0x02, 2, 'V', 'AA'), + 313: ('RSQ', 2, 2, 0x02, 2, 'V', 'AA'), + 314: ('STEYX', 2, 2, 0x02, 2, 'V', 'AA'), + 315: ('SLOPE', 2, 2, 0x02, 2, 'V', 'AA'), + 316: ('TTEST', 4, 4, 0x02, 4, 'V', 'AAVV'), + 317: ('PROB', 3, 4, 0x04, 3, 'V', 'AAV'), + 318: ('DEVSQ', 1, 30, 0x04, 1, 'V', 'R'), + 319: ('GEOMEAN', 1, 30, 0x04, 1, 'V', 'R'), + 320: ('HARMEAN', 1, 30, 0x04, 1, 'V', 'R'), + 321: ('SUMSQ', 0, 30, 0x04, 1, 'V', 'R'), + 322: ('KURT', 1, 30, 0x04, 1, 'V', 'R'), + 323: ('SKEW', 1, 30, 0x04, 1, 'V', 'R'), + 324: ('ZTEST', 2, 3, 0x04, 2, 'V', 'RV'), + 325: ('LARGE', 2, 2, 0x02, 2, 'V', 'RV'), + 326: ('SMALL', 2, 2, 0x02, 2, 'V', 'RV'), + 327: ('QUARTILE', 2, 2, 0x02, 2, 'V', 'RV'), + 328: ('PERCENTILE', 2, 2, 0x02, 2, 'V', 'RV'), + 329: ('PERCENTRANK', 2, 3, 0x04, 2, 'V', 'RV'), + 330: ('MODE', 1, 30, 0x04, 1, 'V', 'A'), + 331: ('TRIMMEAN', 2, 2, 0x02, 2, 'V', 'RV'), + 332: ('TINV', 2, 2, 0x02, 2, 'V', 'VV'), + 336: ('CONCATENATE', 0, 30, 0x04, 1, 'V', 'V'), + 337: ('POWER', 2, 2, 0x02, 2, 'V', 'VV'), + 342: ('RADIANS', 1, 1, 0x02, 1, 'V', 'V'), + 343: ('DEGREES', 1, 1, 0x02, 1, 'V', 'V'), + 344: ('SUBTOTAL', 2, 30, 0x04, 2, 'V', 'VR'), + 345: ('SUMIF', 2, 3, 0x04, 3, 'V', 'RVR'), + 346: ('COUNTIF', 2, 2, 0x02, 2, 'V', 'RV'), + 347: ('COUNTBLANK', 1, 1, 0x02, 1, 'V', 'R'), + 350: ('ISPMT', 4, 4, 0x02, 4, 'V', 'VVVV'), + 351: ('DATEDIF', 3, 3, 0x02, 3, 'V', 'VVV'), + 352: ('DATESTRING', 1, 1, 0x02, 1, 'V', 'V'), + 353: ('NUMBERSTRING', 2, 2, 0x02, 2, 'V', 'VV'), + 354: ('ROMAN', 1, 2, 0x04, 2, 'V', 'VV'), + 358: ('GETPIVOTDATA', 2, 2, 0x02, 2, 'V', 'RV'), + 359: ('HYPERLINK', 1, 2, 0x04, 2, 'V', 'VV'), + 360: ('PHONETIC', 1, 1, 0x02, 1, 'V', 'V'), + 361: ('AVERAGEA', 1, 30, 0x04, 1, 'V', 'R'), + 362: ('MAXA', 1, 30, 0x04, 1, 'V', 'R'), + 363: ('MINA', 1, 30, 0x04, 1, 'V', 'R'), + 364: ('STDEVPA', 1, 30, 0x04, 1, 'V', 'R'), + 365: ('VARPA', 1, 30, 0x04, 1, 'V', 'R'), + 366: ('STDEVA', 1, 30, 0x04, 1, 'V', 'R'), + 367: ('VARA', 1, 30, 0x04, 1, 'V', 'R'), + 368: ('BAHTTEXT', 1, 1, 0x02, 1, 'V', 'V'), + 369: ('THAIDAYOFWEEK', 1, 1, 0x02, 1, 'V', 'V'), + 370: ('THAIDIGIT', 1, 1, 0x02, 1, 'V', 'V'), + 371: ('THAIMONTHOFYEAR', 1, 1, 0x02, 1, 'V', 'V'), + 372: ('THAINUMSOUND', 1, 1, 0x02, 1, 'V', 'V'), + 373: ('THAINUMSTRING', 1, 1, 0x02, 1, 'V', 'V'), + 374: ('THAISTRINGLENGTH', 1, 1, 0x02, 1, 'V', 'V'), + 375: ('ISTHAIDIGIT', 1, 1, 0x02, 1, 'V', 'V'), + 376: ('ROUNDBAHTDOWN', 1, 1, 0x02, 1, 'V', 'V'), + 377: ('ROUNDBAHTUP', 1, 1, 0x02, 1, 'V', 'V'), + 378: ('THAIYEAR', 1, 1, 0x02, 1, 'V', 'V'), + 379: ('RTD', 2, 5, 0x04, 1, 'V', 'V'), +} + +tAttrNames = { + 0x00: "Skip??", # seen in SAMPLES.XLS which shipped with Excel 5.0 + 0x01: "Volatile", + 0x02: "If", + 0x04: "Choose", + 0x08: "Skip", + 0x10: "Sum", + 0x20: "Assign", + 0x40: "Space", + 0x41: "SpaceVolatile", +} + +error_opcodes = set([0x07, 0x08, 0x0A, 0x0B, 0x1C, 0x1D, 0x2F]) + +tRangeFuncs = (min, max, min, max, min, max) +tIsectFuncs = (max, min, max, min, max, min) + +def do_box_funcs(box_funcs, boxa, boxb): + return tuple( + func(numa, numb) + for func, numa, numb in zip(box_funcs, boxa.coords, boxb.coords) + 
) + +def adjust_cell_addr_biff8(rowval, colval, reldelta, browx=None, bcolx=None): + row_rel = (colval >> 15) & 1 + col_rel = (colval >> 14) & 1 + rowx = rowval + colx = colval & 0xff + if reldelta: + if row_rel and rowx >= 32768: + rowx -= 65536 + if col_rel and colx >= 128: + colx -= 256 + else: + if row_rel: + rowx -= browx + if col_rel: + colx -= bcolx + return rowx, colx, row_rel, col_rel + +def adjust_cell_addr_biff_le7( + rowval, colval, reldelta, browx=None, bcolx=None): + row_rel = (rowval >> 15) & 1 + col_rel = (rowval >> 14) & 1 + rowx = rowval & 0x3fff + colx = colval + if reldelta: + if row_rel and rowx >= 8192: + rowx -= 16384 + if col_rel and colx >= 128: + colx -= 256 + else: + if row_rel: + rowx -= browx + if col_rel: + colx -= bcolx + return rowx, colx, row_rel, col_rel + +def get_cell_addr(data, pos, bv, reldelta, browx=None, bcolx=None): + if bv >= 80: + rowval, colval = unpack("= 80: + row1val, row2val, col1val, col2val = unpack(" addins %r" % (refx, info), file=bk.logfile) + assert ref_first_sheetx == 0xFFFE == ref_last_sheetx + return (-5, -5) + if ref_recordx != bk._supbook_locals_inx: + if blah: + print("/// get_externsheet_local_range(refx=%d) -> external %r" % (refx, info), file=bk.logfile) + return (-4, -4) # external reference + if ref_first_sheetx == 0xFFFE == ref_last_sheetx: + if blah: + print("/// get_externsheet_local_range(refx=%d) -> unspecified sheet %r" % (refx, info), file=bk.logfile) + return (-1, -1) # internal reference, any sheet + if ref_first_sheetx == 0xFFFF == ref_last_sheetx: + if blah: + print("/// get_externsheet_local_range(refx=%d) -> deleted sheet(s)" % (refx, ), file=bk.logfile) + return (-2, -2) # internal reference, deleted sheet(s) + nsheets = len(bk._all_sheets_map) + if not(0 <= ref_first_sheetx <= ref_last_sheetx < nsheets): + if blah: + print("/// get_externsheet_local_range(refx=%d) -> %r" % (refx, info), file=bk.logfile) + print("--- first/last sheet not in range(%d)" % nsheets, file=bk.logfile) + return (-102, -102) # stuffed up somewhere :-( + xlrd_sheetx1 = bk._all_sheets_map[ref_first_sheetx] + xlrd_sheetx2 = bk._all_sheets_map[ref_last_sheetx] + if not(0 <= xlrd_sheetx1 <= xlrd_sheetx2): + return (-3, -3) # internal reference, but to a macro sheet + return xlrd_sheetx1, xlrd_sheetx2 + +def get_externsheet_local_range_b57( + bk, raw_extshtx, ref_first_sheetx, ref_last_sheetx, blah=0): + if raw_extshtx > 0: + if blah: + print("/// get_externsheet_local_range_b57(raw_extshtx=%d) -> external" % raw_extshtx, file=bk.logfile) + return (-4, -4) # external reference + if ref_first_sheetx == -1 and ref_last_sheetx == -1: + return (-2, -2) # internal reference, deleted sheet(s) + nsheets = len(bk._all_sheets_map) + if not(0 <= ref_first_sheetx <= ref_last_sheetx < nsheets): + if blah: + print("/// get_externsheet_local_range_b57(%d, %d, %d) -> ???" + % (raw_extshtx, ref_first_sheetx, ref_last_sheetx), file=bk.logfile) + print("--- first/last sheet not in range(%d)" % nsheets, file=bk.logfile) + return (-103, -103) # stuffed up somewhere :-( + xlrd_sheetx1 = bk._all_sheets_map[ref_first_sheetx] + xlrd_sheetx2 = bk._all_sheets_map[ref_last_sheetx] + if not(0 <= xlrd_sheetx1 <= xlrd_sheetx2): + return (-3, -3) # internal reference, but to a macro sheet + return xlrd_sheetx1, xlrd_sheetx2 + +class FormulaError(Exception): + pass + + +class Operand(object): + """ + Used in evaluating formulas. + The following table describes the kinds and how their values + are represented. + + .. 
raw:: html
+
+        <table border="1">
+        <tr><th>Kind symbol</th><th>Kind number</th><th>Value representation</th></tr>
+        <tr><td>oBOOL</td><td>3</td><td>integer: 0 => False; 1 => True</td></tr>
+        <tr><td>oERR</td><td>4</td><td>None, or an int error code (same as XL_CELL_ERROR in the Cell class).</td></tr>
+        <tr><td>oMSNG</td><td>5</td><td>Used by Excel as a placeholder for a missing (not supplied) function
+        argument. Should *not* appear as a final formula result. Value is None.</td></tr>
+        <tr><td>oNUM</td><td>2</td><td>A float. Note that there is no way of distinguishing dates.</td></tr>
+        <tr><td>oREF</td><td>-1</td><td>The value is either None or a non-empty list of
+        absolute Ref3D instances.</td></tr>
+        <tr><td>oREL</td><td>-2</td><td>The value is None or a non-empty list of
+        fully or partially relative Ref3D instances.</td></tr>
+        <tr><td>oSTRG</td><td>1</td><td>A Unicode string.</td></tr>
+        <tr><td>oUNK</td><td>0</td><td>The kind is unknown or ambiguous. The value is None.</td></tr>
+        </table>
+ """ + + #: None means that the actual value of the operand is a variable + #: (depends on cell data), not a constant. + value = None + + #: oUNK means that the kind of operand is not known unambiguously. + kind = oUNK + + #: The reconstituted text of the original formula. Function names will be + #: in English irrespective of the original language, which doesn't seem + #: to be recorded anywhere. The separator is ",", not ";" or whatever else + #: might be more appropriate for the end-user's locale; patches welcome. + text = '?' + + def __init__(self, akind=None, avalue=None, arank=0, atext='?'): + if akind is not None: + self.kind = akind + if avalue is not None: + self.value = avalue + self.rank = arank + # rank is an internal gizmo (operator precedence); + # it's used in reconstructing formula text. + self.text = atext + + def __repr__(self): + kind_text = okind_dict.get(self.kind, "?Unknown kind?") + return "Operand(kind=%s, value=%r, text=%r)" \ + % (kind_text, self.value, self.text) + + +class Ref3D(tuple): + """ + Represents an absolute or relative 3-dimensional reference to a box + of one or more cells. + + The ``coords`` attribute is a tuple of the form:: + + (shtxlo, shtxhi, rowxlo, rowxhi, colxlo, colxhi) + + where ``0 <= thingxlo <= thingx < thingxhi``. + + .. note:: + It is quite possible to have ``thingx > nthings``; for example + ``Print_Titles`` could have ``colxhi == 256`` and/or ``rowxhi == 65536`` + irrespective of how many columns/rows are actually used in the worksheet. + The caller will need to decide how to handle this situation. + Keyword: :class:`IndexError` :-) + + The components of the coords attribute are also available as individual + attributes: ``shtxlo``, ``shtxhi``, ``rowxlo``, ``rowxhi``, ``colxlo``, and + ``colxhi``. + + The ``relflags`` attribute is a 6-tuple of flags which indicate whether + the corresponding (sheet|row|col)(lo|hi) is relative (1) or absolute (0). + + .. note:: + There is necessarily no information available as to what cell(s) + the reference could possibly be relative to. The caller must decide what + if any use to make of ``oREL`` operands. + + .. note: + A partially relative reference may well be a typo. + For example, define name ``A1Z10`` as ``$a$1:$z10`` (missing ``$`` after + ``z``) while the cursor is on cell ``Sheet3!A27``. + + The resulting :class:`Ref3D` instance will have + ``coords = (2, 3, 0, -16, 0, 26)`` + and ``relflags = (0, 0, 0, 1, 0, 0).
+ + So far, only one possibility of a sheet-relative component in + a reference has been noticed: a 2D reference located in the + "current sheet". + + This will appear as ``coords = (0, 1, ...)`` and + ``relflags = (1, 1, ...)``. + + .. versionadded:: 0.6.0 + """ + + def __init__(self, atuple): + self.coords = atuple[0:6] + self.relflags = atuple[6:12] + if not self.relflags: + self.relflags = (0, 0, 0, 0, 0, 0) + (self.shtxlo, self.shtxhi, + self.rowxlo, self.rowxhi, + self.colxlo, self.colxhi) = self.coords + + def __repr__(self): + if not self.relflags or self.relflags == (0, 0, 0, 0, 0, 0): + return "Ref3D(coords=%r)" % (self.coords, ) + else: + return "Ref3D(coords=%r, relflags=%r)" \ + % (self.coords, self.relflags) + +tAdd = 0x03 +tSub = 0x04 +tMul = 0x05 +tDiv = 0x06 +tPower = 0x07 +tConcat = 0x08 +tLT, tLE, tEQ, tGE, tGT, tNE = range(0x09, 0x0F) + + +def nop(x): + return x + +def _opr_pow(x, y): return x ** y + +def _opr_lt(x, y): return x < y +def _opr_le(x, y): return x <= y +def _opr_eq(x, y): return x == y +def _opr_ge(x, y): return x >= y +def _opr_gt(x, y): return x > y +def _opr_ne(x, y): return x != y + +def num2strg(num): + """ + Attempt to emulate Excel's default conversion from number to string. + """ + s = str(num) + if s.endswith(".0"): + s = s[:-2] + return s + +_arith_argdict = {oNUM: nop, oSTRG: float} +_cmp_argdict = {oNUM: nop, oSTRG: nop} +# Seems no conversions done on relops; in Excel, "1" > 9 produces TRUE. +_strg_argdict = {oNUM:num2strg, oSTRG:nop} +binop_rules = { + tAdd: (_arith_argdict, oNUM, opr.add, 30, '+'), + tSub: (_arith_argdict, oNUM, opr.sub, 30, '-'), + tMul: (_arith_argdict, oNUM, opr.mul, 40, '*'), + tDiv: (_arith_argdict, oNUM, opr.truediv, 40, '/'), + tPower: (_arith_argdict, oNUM, _opr_pow, 50, '^',), + tConcat:(_strg_argdict, oSTRG, opr.add, 20, '&'), + tLT: (_cmp_argdict, oBOOL, _opr_lt, 10, '<'), + tLE: (_cmp_argdict, oBOOL, _opr_le, 10, '<='), + tEQ: (_cmp_argdict, oBOOL, _opr_eq, 10, '='), + tGE: (_cmp_argdict, oBOOL, _opr_ge, 10, '>='), + tGT: (_cmp_argdict, oBOOL, _opr_gt, 10, '>'), + tNE: (_cmp_argdict, oBOOL, _opr_ne, 10, '<>'), +} + +unop_rules = { + 0x13: (lambda x: -x, 70, '-', ''), # unary minus + 0x12: (lambda x: x, 70, '+', ''), # unary plus + 0x14: (lambda x: x / 100.0, 60, '', '%'),# percent +} + +LEAF_RANK = 90 +FUNC_RANK = 90 + +STACK_ALARM_LEVEL = 5 +STACK_PANIC_LEVEL = 10 + +def evaluate_name_formula(bk, nobj, namex, blah=0, level=0): + if level > STACK_ALARM_LEVEL: + blah = 1 + data = nobj.raw_formula + fmlalen = nobj.basic_formula_len + bv = bk.biff_version + reldelta = 1 # All defined name formulas use "Method B" [OOo docs] + if blah: + print("::: evaluate_name_formula %r %r %d %d %r level=%d" + % (namex, nobj.name, fmlalen, bv, data, level), file=bk.logfile) + hex_char_dump(data, 0, fmlalen, fout=bk.logfile) + if level > STACK_PANIC_LEVEL: + raise XLRDError("Excessive indirect references in NAME formula") + sztab = szdict[bv] + pos = 0 + stack = [] + any_rel = 0 + any_err = 0 + any_external = 0 + unk_opnd = Operand(oUNK, None) + error_opnd = Operand(oERR, None) + spush = stack.append + + def do_binop(opcd, stk): + assert len(stk) >= 2 + bop = stk.pop() + aop = stk.pop() + argdict, result_kind, func, rank, sym = binop_rules[opcd] + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + resop = Operand(result_kind, None, rank, otext) + try: + bconv = argdict[bop.kind] + aconv = argdict[aop.kind] + except KeyError: + 
stk.append(resop) + return + if bop.value is None or aop.value is None: + stk.append(resop) + return + bval = bconv(bop.value) + aval = aconv(aop.value) + result = func(aval, bval) + if result_kind == oBOOL: + result = 1 if result else 0 + resop.value = result + stk.append(resop) + + def do_unaryop(opcode, result_kind, stk): + assert len(stk) >= 1 + aop = stk.pop() + val = aop.value + func, rank, sym1, sym2 = unop_rules[opcode] + otext = ''.join([ + sym1, + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym2, + ]) + if val is not None: + val = func(val) + stk.append(Operand(result_kind, val, rank, otext)) + + def not_in_name_formula(op_arg, oname_arg): + msg = "ERROR *** Token 0x%02x (%s) found in NAME formula" \ + % (op_arg, oname_arg) + raise FormulaError(msg) + + if fmlalen == 0: + stack = [unk_opnd] + + while 0 <= pos < fmlalen: + op = BYTES_ORD(data[pos]) + opcode = op & 0x1f + optype = (op & 0x60) >> 5 + if optype: + opx = opcode + 32 + else: + opx = opcode + oname = onames[opx] # + [" RVA"][optype] + sz = sztab[opx] + if blah: + print("Pos:%d Op:0x%02x Name:t%s Sz:%d opcode:%02xh optype:%02xh" + % (pos, op, oname, sz, opcode, optype), file=bk.logfile) + print("Stack =", stack, file=bk.logfile) + if sz == -2: + msg = 'ERROR *** Unexpected token 0x%02x ("%s"); biff_version=%d' \ + % (op, oname, bv) + raise FormulaError(msg) + if not optype: + if 0x00 <= opcode <= 0x02: # unk_opnd, tExp, tTbl + not_in_name_formula(op, oname) + elif 0x03 <= opcode <= 0x0E: + # Add, Sub, Mul, Div, Power + # tConcat + # tLT, ..., tNE + do_binop(opcode, stack) + elif opcode == 0x0F: # tIsect + if blah: print("tIsect pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + sym = ' ' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF) + res.text = otext + if bop.kind == oERR or aop.kind == oERR: + res.kind = oERR + elif bop.kind == oUNK or aop.kind == oUNK: + # This can happen with undefined + # (go search in the current sheet) labels. + # For example =Bob Sales + # Each label gets a NAME record with an empty formula (!) 
+ # Evaluation of the tName token classifies it as oUNK + # res.kind = oREF + pass + elif bop.kind == oREF == aop.kind: + if aop.value is not None and bop.value is not None: + assert len(aop.value) == 1 + assert len(bop.value) == 1 + coords = do_box_funcs( + tIsectFuncs, aop.value[0], bop.value[0]) + res.value = [Ref3D(coords)] + elif bop.kind == oREL == aop.kind: + res.kind = oREL + if aop.value is not None and bop.value is not None: + assert len(aop.value) == 1 + assert len(bop.value) == 1 + coords = do_box_funcs( + tIsectFuncs, aop.value[0], bop.value[0]) + relfa = aop.value[0].relflags + relfb = bop.value[0].relflags + if relfa == relfb: + res.value = [Ref3D(coords + relfa)] + else: + pass + spush(res) + if blah: print("tIsect post", stack, file=bk.logfile) + elif opcode == 0x10: # tList + if blah: print("tList pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + sym = ',' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF, None, rank, otext) + if bop.kind == oERR or aop.kind == oERR: + res.kind = oERR + elif bop.kind in (oREF, oREL) and aop.kind in (oREF, oREL): + res.kind = oREF + if aop.kind == oREL or bop.kind == oREL: + res.kind = oREL + if aop.value is not None and bop.value is not None: + assert len(aop.value) >= 1 + assert len(bop.value) == 1 + res.value = aop.value + bop.value + else: + pass + spush(res) + if blah: print("tList post", stack, file=bk.logfile) + elif opcode == 0x11: # tRange + if blah: print("tRange pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + sym = ':' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF, None, rank, otext) + if bop.kind == oERR or aop.kind == oERR: + res = oERR + elif bop.kind == oREF == aop.kind: + if aop.value is not None and bop.value is not None: + assert len(aop.value) == 1 + assert len(bop.value) == 1 + coords = do_box_funcs( + tRangeFuncs, aop.value[0], bop.value[0]) + res.value = [Ref3D(coords)] + elif bop.kind == oREL == aop.kind: + res.kind = oREL + if aop.value is not None and bop.value is not None: + assert len(aop.value) == 1 + assert len(bop.value) == 1 + coords = do_box_funcs( + tRangeFuncs, aop.value[0], bop.value[0]) + relfa = aop.value[0].relflags + relfb = bop.value[0].relflags + if relfa == relfb: + res.value = [Ref3D(coords + relfa)] + else: + pass + spush(res) + if blah: print("tRange post", stack, file=bk.logfile) + elif 0x12 <= opcode <= 0x14: # tUplus, tUminus, tPercent + do_unaryop(opcode, oNUM, stack) + elif opcode == 0x15: # tParen + # source cosmetics + pass + elif opcode == 0x16: # tMissArg + spush(Operand(oMSNG, None, LEAF_RANK, '')) + elif opcode == 0x17: # tStr + if bv <= 70: + strg, newpos = unpack_string_update_pos( + data, pos+1, bk.encoding, lenlen=1) + else: + strg, newpos = unpack_unicode_update_pos( + data, pos+1, lenlen=1) + sz = newpos - pos + if blah: print(" sz=%d strg=%r" % (sz, strg), file=bk.logfile) + text = '"' + strg.replace('"', '""') + '"' + spush(Operand(oSTRG, strg, LEAF_RANK, text)) + elif opcode == 0x18: # tExtended + # new with BIFF 8 + assert bv >= 80 + # not in OOo docs + raise FormulaError("tExtended token not implemented") + elif opcode == 0x19: # tAttr + subop, nc = 
unpack("= 1 + aop = stack[-1] + otext = 'SUM(%s)' % aop.text + stack[-1] = Operand(oNUM, None, FUNC_RANK, otext) + else: + sz = 4 + if blah: + print(" subop=%02xh subname=t%s sz=%d nc=%02xh" + % (subop, subname, sz, nc), file=bk.logfile) + elif 0x1A <= opcode <= 0x1B: # tSheet, tEndSheet + assert bv < 50 + raise FormulaError("tSheet & tEndsheet tokens not implemented") + elif 0x1C <= opcode <= 0x1F: # tErr, tBool, tInt, tNum + inx = opcode - 0x1C + nb = [1, 1, 2, 8][inx] + kind = [oERR, oBOOL, oNUM, oNUM][inx] + value, = unpack("<" + "BBHd"[inx], data[pos+1:pos+1+nb]) + if inx == 2: # tInt + value = float(value) + text = str(value) + elif inx == 3: # tNum + text = str(value) + elif inx == 1: # tBool + text = ('FALSE', 'TRUE')[value] + else: + text = '"' +error_text_from_code[value] + '"' + spush(Operand(kind, value, LEAF_RANK, text)) + else: + raise FormulaError("Unhandled opcode: 0x%02x" % opcode) + if sz <= 0: + raise FormulaError("Size not set for opcode 0x%02x" % opcode) + pos += sz + continue + if opcode == 0x00: # tArray + spush(unk_opnd) + elif opcode == 0x01: # tFunc + nb = 1 + int(bv >= 40) + funcx = unpack("<" + " BH"[nb], data[pos+1:pos+1+nb])[0] + func_attrs = func_defs.get(funcx, None) + if not func_attrs: + print("*** formula/tFunc unknown FuncID:%d" + % funcx, file=bk.logfile) + spush(unk_opnd) + else: + func_name, nargs = func_attrs[:2] + if blah: + print(" FuncID=%d name=%s nargs=%d" + % (funcx, func_name, nargs), file=bk.logfile) + assert len(stack) >= nargs + if nargs: + argtext = listsep.join(arg.text for arg in stack[-nargs:]) + otext = "%s(%s)" % (func_name, argtext) + del stack[-nargs:] + else: + otext = func_name + "()" + res = Operand(oUNK, None, FUNC_RANK, otext) + spush(res) + elif opcode == 0x02: #tFuncVar + nb = 1 + int(bv >= 40) + nargs, funcx = unpack("= nargs + assert len(stack) >= nargs + argtext = listsep.join(arg.text for arg in stack[-nargs:]) + otext = "%s(%s)" % (func_name, argtext) + res = Operand(oUNK, None, FUNC_RANK, otext) + if funcx == 1: # IF + testarg = stack[-nargs] + if testarg.kind not in (oNUM, oBOOL): + if blah and testarg.kind != oUNK: + print("IF testarg kind?", file=bk.logfile) + elif testarg.value not in (0, 1): + if blah and testarg.value is not None: + print("IF testarg value?", file=bk.logfile) + else: + if nargs == 2 and not testarg.value: + # IF(FALSE, tv) => FALSE + res.kind, res.value = oBOOL, 0 + else: + respos = -nargs + 2 - int(testarg.value) + chosen = stack[respos] + if chosen.kind == oMSNG: + res.kind, res.value = oNUM, 0 + else: + res.kind, res.value = chosen.kind, chosen.value + if blah: + print("$$$$$$ IF => constant", file=bk.logfile) + elif funcx == 100: # CHOOSE + testarg = stack[-nargs] + if testarg.kind == oNUM: + if 1 <= testarg.value < nargs: + chosen = stack[-nargs + int(testarg.value)] + if chosen.kind == oMSNG: + res.kind, res.value = oNUM, 0 + else: + res.kind, res.value = chosen.kind, chosen.value + del stack[-nargs:] + spush(res) + elif opcode == 0x03: #tName + tgtnamex = unpack("> bk.logfile, " ", res + # spush(res) + elif opcode == 0x0D: #tAreaN + not_in_name_formula(op, oname) + # res = get_cell_range_addr(data, pos+1, bv, reldelta=1) + # # note *ALL* tAreaN usage has signed offset for relative addresses + # any_rel = 1 + # if blah: print >> bk.logfile, " ", res + elif opcode == 0x1A: # tRef3d + if bv >= 80: + res = get_cell_addr(data, pos+3, bv, reldelta) + refx = unpack("= 80: + res1, res2 = get_cell_range_addr(data, pos+3, bv, reldelta) + refx = unpack("= 80: + refx, tgtnamex = unpack(" 0: + refx -= 1 
+ elif refx < 0: + refx = -refx - 1 + else: + dodgy = 1 + if blah: + print(" origrefx=%d refx=%d tgtnamex=%d dodgy=%d" + % (origrefx, refx, tgtnamex, dodgy), file=bk.logfile) + if tgtnamex == namex: + if blah: print("!!!! Self-referential !!!!", file=bk.logfile) + dodgy = any_err = 1 + if not dodgy: + if bv >= 80: + shx1, shx2 = get_externsheet_local_range(bk, refx, blah) + elif origrefx > 0: + shx1, shx2 = (-4, -4) # external ref + else: + exty = bk._externsheet_type_b57[refx] + if exty == 4: # non-specific sheet in own doc't + shx1, shx2 = (-1, -1) # internal, any sheet + else: + shx1, shx2 = (-666, -666) + if dodgy or shx1 < -1: + otext = "<>" \ + % (tgtnamex, origrefx) + res = Operand(oUNK, None, LEAF_RANK, otext) + else: + tgtobj = bk.name_obj_list[tgtnamex] + if not tgtobj.evaluated: + ### recursive ### + evaluate_name_formula(bk, tgtobj, tgtnamex, blah, level+1) + if tgtobj.macro or tgtobj.binary or tgtobj.any_err: + if blah: + tgtobj.dump( + bk.logfile, + header="!!! bad tgtobj !!!", + footer="------------------", + ) + res = Operand(oUNK, None) + any_err = any_err or tgtobj.macro or tgtobj.binary or tgtobj.any_err + any_rel = any_rel or tgtobj.any_rel + else: + assert len(tgtobj.stack) == 1 + res = copy.deepcopy(tgtobj.stack[0]) + res.rank = LEAF_RANK + if tgtobj.scope == -1: + res.text = tgtobj.name + else: + res.text = "%s!%s" \ + % (bk._sheet_names[tgtobj.scope], tgtobj.name) + if blah: + print(" tNameX: setting text to", repr(res.text), file=bk.logfile) + spush(res) + elif opcode in error_opcodes: + any_err = 1 + spush(error_opnd) + else: + if blah: + print("FORMULA: /// Not handled yet: t" + oname, file=bk.logfile) + any_err = 1 + if sz <= 0: + raise FormulaError("Fatal: token size is not positive") + pos += sz + any_rel = not not any_rel + if blah: + fprintf(bk.logfile, "End of formula. 
level=%d any_rel=%d any_err=%d stack=%r\n", + level, not not any_rel, any_err, stack) + if len(stack) >= 2: + print("*** Stack has unprocessed args", file=bk.logfile) + print(file=bk.logfile) + nobj.stack = stack + if len(stack) != 1: + nobj.result = None + else: + nobj.result = stack[0] + nobj.any_rel = any_rel + nobj.any_err = any_err + nobj.any_external = any_external + nobj.evaluated = 1 + +#### under construction ############################################################################# +def decompile_formula(bk, fmla, fmlalen, + fmlatype=None, browx=None, bcolx=None, + blah=0, level=0, r1c1=0): + if level > STACK_ALARM_LEVEL: + blah = 1 + reldelta = fmlatype in (FMLA_TYPE_SHARED, FMLA_TYPE_NAME, FMLA_TYPE_COND_FMT, FMLA_TYPE_DATA_VAL) + data = fmla + bv = bk.biff_version + if blah: + print("::: decompile_formula len=%d fmlatype=%r browx=%r bcolx=%r reldelta=%d %r level=%d" + % (fmlalen, fmlatype, browx, bcolx, reldelta, data, level), file=bk.logfile) + hex_char_dump(data, 0, fmlalen, fout=bk.logfile) + if level > STACK_PANIC_LEVEL: + raise XLRDError("Excessive indirect references in formula") + sztab = szdict[bv] + pos = 0 + stack = [] + any_rel = 0 + any_err = 0 + unk_opnd = Operand(oUNK, None) + error_opnd = Operand(oERR, None) + spush = stack.append + + def do_binop(opcd, stk): + assert len(stk) >= 2 + bop = stk.pop() + aop = stk.pop() + argdict, result_kind, func, rank, sym = binop_rules[opcd] + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + resop = Operand(result_kind, None, rank, otext) + stk.append(resop) + + def do_unaryop(opcode, result_kind, stk): + assert len(stk) >= 1 + aop = stk.pop() + func, rank, sym1, sym2 = unop_rules[opcode] + otext = ''.join([ + sym1, + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym2, + ]) + stk.append(Operand(result_kind, None, rank, otext)) + + def unexpected_opcode(op_arg, oname_arg): + msg = "ERROR *** Unexpected token 0x%02x (%s) found in formula type %s" \ + % (op_arg, oname_arg, FMLA_TYPEDESCR_MAP[fmlatype]) + print(msg, file=bk.logfile) + # raise FormulaError(msg) + + if fmlalen == 0: + stack = [unk_opnd] + + while 0 <= pos < fmlalen: + op = BYTES_ORD(data[pos]) + opcode = op & 0x1f + optype = (op & 0x60) >> 5 + if optype: + opx = opcode + 32 + else: + opx = opcode + oname = onames[opx] # + [" RVA"][optype] + sz = sztab[opx] + if blah: + print("Pos:%d Op:0x%02x opname:t%s Sz:%d opcode:%02xh optype:%02xh" + % (pos, op, oname, sz, opcode, optype), file=bk.logfile) + print("Stack =", stack, file=bk.logfile) + if sz == -2: + msg = 'ERROR *** Unexpected token 0x%02x ("%s"); biff_version=%d' \ + % (op, oname, bv) + raise FormulaError(msg) + if _TOKEN_NOT_ALLOWED(opx, 0) & fmlatype: + unexpected_opcode(op, oname) + if not optype: + if opcode <= 0x01: # tExp + if bv >= 30: + fmt = '= 2 + bop = stack.pop() + aop = stack.pop() + sym = ' ' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF) + res.text = otext + if bop.kind == oERR or aop.kind == oERR: + res.kind = oERR + elif bop.kind == oUNK or aop.kind == oUNK: + # This can happen with undefined + # (go search in the current sheet) labels. + # For example =Bob Sales + # Each label gets a NAME record with an empty formula (!) 
+ # Evaluation of the tName token classifies it as oUNK + # res.kind = oREF + pass + elif bop.kind == oREF == aop.kind: + pass + elif bop.kind == oREL == aop.kind: + res.kind = oREL + else: + pass + spush(res) + if blah: print("tIsect post", stack, file=bk.logfile) + elif opcode == 0x10: # tList + if blah: print("tList pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + sym = ',' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF, None, rank, otext) + if bop.kind == oERR or aop.kind == oERR: + res.kind = oERR + elif bop.kind in (oREF, oREL) and aop.kind in (oREF, oREL): + res.kind = oREF + if aop.kind == oREL or bop.kind == oREL: + res.kind = oREL + else: + pass + spush(res) + if blah: print("tList post", stack, file=bk.logfile) + elif opcode == 0x11: # tRange + if blah: print("tRange pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + sym = ':' + rank = 80 ########## check ####### + otext = ''.join([ + '('[:aop.rank < rank], + aop.text, + ')'[:aop.rank < rank], + sym, + '('[:bop.rank < rank], + bop.text, + ')'[:bop.rank < rank], + ]) + res = Operand(oREF, None, rank, otext) + if bop.kind == oERR or aop.kind == oERR: + res = oERR + elif bop.kind == oREF == aop.kind: + pass + else: + pass + spush(res) + if blah: print("tRange post", stack, file=bk.logfile) + elif 0x12 <= opcode <= 0x14: # tUplus, tUminus, tPercent + do_unaryop(opcode, oNUM, stack) + elif opcode == 0x15: # tParen + # source cosmetics + pass + elif opcode == 0x16: # tMissArg + spush(Operand(oMSNG, None, LEAF_RANK, '')) + elif opcode == 0x17: # tStr + if bv <= 70: + strg, newpos = unpack_string_update_pos( + data, pos+1, bk.encoding, lenlen=1) + else: + strg, newpos = unpack_unicode_update_pos( + data, pos+1, lenlen=1) + sz = newpos - pos + if blah: print(" sz=%d strg=%r" % (sz, strg), file=bk.logfile) + text = '"' + strg.replace('"', '""') + '"' + spush(Operand(oSTRG, None, LEAF_RANK, text)) + elif opcode == 0x18: # tExtended + # new with BIFF 8 + assert bv >= 80 + # not in OOo docs, don't even know how to determine its length + raise FormulaError("tExtended token not implemented") + elif opcode == 0x19: # tAttr + subop, nc = unpack("= 1 + aop = stack[-1] + otext = 'SUM(%s)' % aop.text + stack[-1] = Operand(oNUM, None, FUNC_RANK, otext) + else: + sz = 4 + if blah: + print(" subop=%02xh subname=t%s sz=%d nc=%02xh" + % (subop, subname, sz, nc), file=bk.logfile) + elif 0x1A <= opcode <= 0x1B: # tSheet, tEndSheet + assert bv < 50 + raise FormulaError("tSheet & tEndsheet tokens not implemented") + elif 0x1C <= opcode <= 0x1F: # tErr, tBool, tInt, tNum + inx = opcode - 0x1C + nb = [1, 1, 2, 8][inx] + kind = [oERR, oBOOL, oNUM, oNUM][inx] + value, = unpack("<" + "BBHd"[inx], data[pos+1:pos+1+nb]) + if inx == 2: # tInt + value = float(value) + text = str(value) + elif inx == 3: # tNum + text = str(value) + elif inx == 1: # tBool + text = ('FALSE', 'TRUE')[value] + else: + text = '"' +error_text_from_code[value] + '"' + spush(Operand(kind, None, LEAF_RANK, text)) + else: + raise FormulaError("Unhandled opcode: 0x%02x" % opcode) + if sz <= 0: + raise FormulaError("Size not set for opcode 0x%02x" % opcode) + pos += sz + continue + if opcode == 0x00: # tArray + spush(unk_opnd) + elif opcode == 0x01: # tFunc + nb = 1 + int(bv >= 40) + funcx = unpack("<" + " BH"[nb], data[pos+1:pos+1+nb])[0] 
+ func_attrs = func_defs.get(funcx, None) + if not func_attrs: + print("*** formula/tFunc unknown FuncID:%d" % funcx, file=bk.logfile) + spush(unk_opnd) + else: + func_name, nargs = func_attrs[:2] + if blah: + print(" FuncID=%d name=%s nargs=%d" + % (funcx, func_name, nargs), file=bk.logfile) + assert len(stack) >= nargs + if nargs: + argtext = listsep.join(arg.text for arg in stack[-nargs:]) + otext = "%s(%s)" % (func_name, argtext) + del stack[-nargs:] + else: + otext = func_name + "()" + res = Operand(oUNK, None, FUNC_RANK, otext) + spush(res) + elif opcode == 0x02: #tFuncVar + nb = 1 + int(bv >= 40) + nargs, funcx = unpack("= nargs + assert len(stack) >= nargs + argtext = listsep.join(arg.text for arg in stack[-nargs:]) + otext = "%s(%s)" % (func_name, argtext) + res = Operand(oUNK, None, FUNC_RANK, otext) + del stack[-nargs:] + spush(res) + elif opcode == 0x03: #tName + tgtnamex = unpack("> bk.logfile, " ", res + res1, res2 = get_cell_range_addr( + data, pos+1, bv, reldelta, browx, bcolx) + if blah: print(" ", res1, res2, file=bk.logfile) + rowx1, colx1, row_rel1, col_rel1 = res1 + rowx2, colx2, row_rel2, col_rel2 = res2 + coords = (rowx1, rowx2+1, colx1, colx2+1) + relflags = (row_rel1, row_rel2, col_rel1, col_rel2) + if sum(relflags): # relative + okind = oREL + else: + okind = oREF + if blah: print(" ", coords, relflags, file=bk.logfile) + otext = rangename2drel(coords, relflags, browx, bcolx, r1c1) + res = Operand(okind, None, LEAF_RANK, otext) + spush(res) + elif opcode == 0x1A: # tRef3d + if bv >= 80: + res = get_cell_addr(data, pos+3, bv, reldelta, browx, bcolx) + refx = unpack("= 80: + res1, res2 = get_cell_range_addr(data, pos+3, bv, reldelta) + refx = unpack("= 80: + refx, tgtnamex = unpack(" 0: + refx -= 1 + elif refx < 0: + refx = -refx - 1 + else: + dodgy = 1 + if blah: + print(" origrefx=%d refx=%d tgtnamex=%d dodgy=%d" + % (origrefx, refx, tgtnamex, dodgy), file=bk.logfile) + # if tgtnamex == namex: + # if blah: print >> bk.logfile, "!!!! Self-referential !!!!" + # dodgy = any_err = 1 + if not dodgy: + if bv >= 80: + shx1, shx2 = get_externsheet_local_range(bk, refx, blah) + elif origrefx > 0: + shx1, shx2 = (-4, -4) # external ref + else: + exty = bk._externsheet_type_b57[refx] + if exty == 4: # non-specific sheet in own doc't + shx1, shx2 = (-1, -1) # internal, any sheet + else: + shx1, shx2 = (-666, -666) + okind = oUNK + ovalue = None + if shx1 == -5: # addin func name + okind = oSTRG + ovalue = bk.addin_func_names[tgtnamex] + otext = '"' + ovalue.replace('"', '""') + '"' + elif dodgy or shx1 < -1: + otext = "<>" \ + % (tgtnamex, origrefx) + else: + tgtobj = bk.name_obj_list[tgtnamex] + if tgtobj.scope == -1: + otext = tgtobj.name + else: + otext = "%s!%s" \ + % (bk._sheet_names[tgtobj.scope], tgtobj.name) + if blah: + print(" tNameX: setting text to", repr(res.text), file=bk.logfile) + res = Operand(okind, ovalue, LEAF_RANK, otext) + spush(res) + elif opcode in error_opcodes: + any_err = 1 + spush(error_opnd) + else: + if blah: + print("FORMULA: /// Not handled yet: t" + oname, file=bk.logfile) + any_err = 1 + if sz <= 0: + raise FormulaError("Fatal: token size is not positive") + pos += sz + any_rel = not not any_rel + if blah: + print("End of formula. 
level=%d any_rel=%d any_err=%d stack=%r" % + (level, not not any_rel, any_err, stack), file=bk.logfile) + if len(stack) >= 2: + print("*** Stack has unprocessed args", file=bk.logfile) + print(file=bk.logfile) + + if len(stack) != 1: + result = None + else: + result = stack[0].text + return result + +#### under deconstruction ### +def dump_formula(bk, data, fmlalen, bv, reldelta, blah=0, isname=0): + if blah: + print("dump_formula", fmlalen, bv, len(data), file=bk.logfile) + hex_char_dump(data, 0, fmlalen, fout=bk.logfile) + assert bv >= 80 #### this function needs updating #### + sztab = szdict[bv] + pos = 0 + stack = [] + any_rel = 0 + any_err = 0 + spush = stack.append + while 0 <= pos < fmlalen: + op = BYTES_ORD(data[pos]) + opcode = op & 0x1f + optype = (op & 0x60) >> 5 + if optype: + opx = opcode + 32 + else: + opx = opcode + oname = onames[opx] # + [" RVA"][optype] + + sz = sztab[opx] + if blah: + print("Pos:%d Op:0x%02x Name:t%s Sz:%d opcode:%02xh optype:%02xh" + % (pos, op, oname, sz, opcode, optype), file=bk.logfile) + if not optype: + if 0x01 <= opcode <= 0x02: # tExp, tTbl + # reference to a shared formula or table record + rowx, colx = unpack("= 2 + bop = stack.pop() + aop = stack.pop() + spush(aop + bop) + if blah: print("tlist post", stack, file=bk.logfile) + elif opcode == 0x11: # tRange + if blah: print("tRange pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + assert len(aop) == 1 + assert len(bop) == 1 + result = do_box_funcs(tRangeFuncs, aop[0], bop[0]) + spush(result) + if blah: print("tRange post", stack, file=bk.logfile) + elif opcode == 0x0F: # tIsect + if blah: print("tIsect pre", stack, file=bk.logfile) + assert len(stack) >= 2 + bop = stack.pop() + aop = stack.pop() + assert len(aop) == 1 + assert len(bop) == 1 + result = do_box_funcs(tIsectFuncs, aop[0], bop[0]) + spush(result) + if blah: print("tIsect post", stack, file=bk.logfile) + elif opcode == 0x19: # tAttr + subop, nc = unpack("= 40) + funcx = unpack("<" + " BH"[nb], data[pos+1:pos+1+nb]) + if blah: print(" FuncID=%d" % funcx, file=bk.logfile) + elif opcode == 0x02: #tFuncVar + nb = 1 + int(bv >= 40) + nargs, funcx = unpack("= 2: + print("*** Stack has unprocessed args", file=bk.logfile) + +# === Some helper functions for displaying cell references === + +# I'm aware of only one possibility of a sheet-relative component in +# a reference: a 2D reference located in the "current sheet". +# xlrd stores this internally with bounds of (0, 1, ...) and +# relative flags of (1, 1, ...). These functions display the +# sheet component as empty, just like Excel etc. 
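[Editorial note, not part of the committed file] The helper functions defined just below convert zero-based row/column indexes into A1-style or R1C1-style reference text. As a minimal, hedged usage sketch (module path `xlrd.formula` assumed from the file being added in this diff; expected strings taken from the helpers' own docstrings):

    # Hypothetical usage sketch of the A1/R1C1 display helpers defined below.
    from xlrd.formula import cellname, cellnameabs, colname, rangename2d

    assert colname(7) == 'H'                            # column index 7  -> 'H'
    assert colname(27) == 'AB'                          # column index 27 -> 'AB'
    assert cellname(5, 7) == 'H6'                       # (rowx=5, colx=7) -> 'H6'
    assert cellnameabs(5, 7) == '$H$6'                   # absolute A1-style reference
    assert cellnameabs(5, 7, r1c1=1) == 'R6C8'           # same cell in R1C1 mode
    assert rangename2d(5, 20, 7, 10) == '$H$6:$J$20'     # half-open row/col bounds

Note that the row/column upper bounds passed to rangename2d are exclusive, which is why (5, 20, 7, 10) renders as $H$6:$J$20.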
+ +def rownamerel(rowx, rowxrel, browx=None, r1c1=0): + # if no base rowx is provided, we have to return r1c1 + if browx is None: + r1c1 = True + if not rowxrel: + if r1c1: + return "R%d" % (rowx+1) + return "$%d" % (rowx+1) + if r1c1: + if rowx: + return "R[%d]" % rowx + return "R" + return "%d" % ((browx + rowx) % 65536 + 1) + +def colnamerel(colx, colxrel, bcolx=None, r1c1=0): + # if no base colx is provided, we have to return r1c1 + if bcolx is None: + r1c1 = True + if not colxrel: + if r1c1: + return "C%d" % (colx + 1) + return "$" + colname(colx) + if r1c1: + if colx: + return "C[%d]" % colx + return "C" + return colname((bcolx + colx) % 256) + +def cellname(rowx, colx): + """Utility function: ``(5, 7)`` => ``'H6'``""" + return "%s%d" % (colname(colx), rowx+1) + +def cellnameabs(rowx, colx, r1c1=0): + """Utility function: ``(5, 7)`` => ``'$H$6'``""" + if r1c1: + return "R%dC%d" % (rowx+1, colx+1) + return "$%s$%d" % (colname(colx), rowx+1) + +def cellnamerel(rowx, colx, rowxrel, colxrel, browx=None, bcolx=None, r1c1=0): + if not rowxrel and not colxrel: + return cellnameabs(rowx, colx, r1c1) + if (rowxrel and browx is None) or (colxrel and bcolx is None): + # must flip the whole cell into R1C1 mode + r1c1 = True + c = colnamerel(colx, colxrel, bcolx, r1c1) + r = rownamerel(rowx, rowxrel, browx, r1c1) + if r1c1: + return r + c + return c + r + +def colname(colx): + """Utility function: ``7`` => ``'H'``, ``27`` => ``'AB'``""" + alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + if colx <= 25: + return alphabet[colx] + else: + xdiv26, xmod26 = divmod(colx, 26) + return alphabet[xdiv26 - 1] + alphabet[xmod26] + +def rangename2d(rlo, rhi, clo, chi, r1c1=0): + """ ``(5, 20, 7, 10)`` => ``'$H$6:$J$20'`` """ + if r1c1: + return + if rhi == rlo+1 and chi == clo+1: + return cellnameabs(rlo, clo, r1c1) + return "%s:%s" % (cellnameabs(rlo, clo, r1c1), cellnameabs(rhi-1, chi-1, r1c1)) + +def rangename2drel(rlo_rhi_clo_chi, rlorel_rhirel_clorel_chirel, browx=None, bcolx=None, r1c1=0): + rlo, rhi, clo, chi = rlo_rhi_clo_chi + rlorel, rhirel, clorel, chirel = rlorel_rhirel_clorel_chirel + if (rlorel or rhirel) and browx is None: + r1c1 = True + if (clorel or chirel) and bcolx is None: + r1c1 = True + return "%s:%s" % ( + cellnamerel(rlo, clo, rlorel, clorel, browx, bcolx, r1c1), + cellnamerel(rhi-1, chi-1, rhirel, chirel, browx, bcolx, r1c1), + ) + + +def rangename3d(book, ref3d): + """ + Utility function: + ``Ref3D(1, 4, 5, 20, 7, 10)`` => + ``'Sheet2:Sheet3!$H$6:$J$20'`` + (assuming Excel's default sheetnames) + """ + coords = ref3d.coords + return "%s!%s" % ( + sheetrange(book, *coords[:2]), + rangename2d(*coords[2:6])) + +def rangename3drel(book, ref3d, browx=None, bcolx=None, r1c1=0): + """ + Utility function: + ``Ref3D(coords=(0, 1, -32, -22, -13, 13), relflags=(0, 0, 1, 1, 1, 1))`` + + In R1C1 mode => ``'Sheet1!R[-32]C[-13]:R[-23]C[12]'`` + + In A1 mode => depends on base cell ``(browx, bcolx)`` + """ + coords = ref3d.coords + relflags = ref3d.relflags + shdesc = sheetrangerel(book, coords[:2], relflags[:2]) + rngdesc = rangename2drel(coords[2:6], relflags[2:6], browx, bcolx, r1c1) + if not shdesc: + return rngdesc + return "%s!%s" % (shdesc, rngdesc) + +def quotedsheetname(shnames, shx): + if shx >= 0: + shname = shnames[shx] + else: + shname = { + -1: "?internal; any sheet?", + -2: "internal; deleted sheet", + -3: "internal; macro sheet", + -4: "<>", + }.get(shx, "?error %d?" 
% shx) + if "'" in shname: + return "'" + shname.replace("'", "''") + "'" + if " " in shname: + return "'" + shname + "'" + return shname + +def sheetrange(book, slo, shi): + shnames = book.sheet_names() + shdesc = quotedsheetname(shnames, slo) + if slo != shi-1: + shdesc += ":" + quotedsheetname(shnames, shi-1) + return shdesc + +def sheetrangerel(book, srange, srangerel): + slo, shi = srange + slorel, shirel = srangerel + if not slorel and not shirel: + return sheetrange(book, slo, shi) + assert (slo == 0 == shi-1) and slorel and shirel + return "" + +# ============================================================== diff --git a/venv/Lib/site-packages/xlrd/info.py b/venv/Lib/site-packages/xlrd/info.py new file mode 100644 index 0000000..f63f017 --- /dev/null +++ b/venv/Lib/site-packages/xlrd/info.py @@ -0,0 +1 @@ +__version__ = __VERSION__ = "1.2.0" diff --git a/venv/Lib/site-packages/xlrd/sheet.py b/venv/Lib/site-packages/xlrd/sheet.py new file mode 100644 index 0000000..79c200d --- /dev/null +++ b/venv/Lib/site-packages/xlrd/sheet.py @@ -0,0 +1,2469 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2005-2013 Stephen John Machin, Lingfo Pty Ltd +# This module is part of the xlrd package, which is released under a +# BSD-style licence. + +from __future__ import print_function + +from array import array +from struct import calcsize, unpack + +from .biffh import * +from .formatting import Format, nearest_colour_index +from .formula import ( + FMLA_TYPE_CELL, FMLA_TYPE_SHARED, decompile_formula, dump_formula, + rangename2d, +) +from .timemachine import * + +DEBUG = 0 +OBJ_MSO_DEBUG = 0 + +_WINDOW2_options = ( + # Attribute names and initial values to use in case + # a WINDOW2 record is not written. + ("show_formulas", 0), + ("show_grid_lines", 1), + ("show_sheet_headers", 1), + ("panes_are_frozen", 0), + ("show_zero_values", 1), + ("automatic_grid_line_colour", 1), + ("columns_from_right_to_left", 0), + ("show_outline_symbols", 1), + ("remove_splits_if_pane_freeze_is_removed", 0), + # Multiple sheets can be selected, but only one can be active + # (hold down Ctrl and click multiple tabs in the file in OOo) + ("sheet_selected", 0), + # "sheet_visible" should really be called "sheet_active" + # and is 1 when this sheet is the sheet displayed when the file + # is open. More than likely only one sheet should ever be set as + # visible. + # This would correspond to the Book's sheet_active attribute, but + # that doesn't exist as WINDOW1 records aren't currently processed. + # The real thing is the visibility attribute from the BOUNDSHEET record. + ("sheet_visible", 0), + ("show_in_page_break_preview", 0), +) + + + +class Sheet(BaseObject): + """ + Contains the data for one worksheet. + + In the cell access functions, ``rowx`` is a row index, counting from + zero, and ``colx`` is a column index, counting from zero. + Negative values for row/column indexes and slice positions are supported in + the expected fashion. + + For information about cell types and cell values, refer to the documentation + of the :class:`Cell` class. + + .. warning:: + + You don't instantiate this class yourself. You access :class:`Sheet` + objects via the :class:`~xlrd.book.Book` object that + was returned when you called :func:`xlrd.open_workbook`. + """ + + #: Name of sheet. + name = '' + + #: A reference to the :class:`~xlrd.book.Book` object to which this sheet + #: belongs. + #: + #: Example usage: ``some_sheet.book.datemode`` + book = None + + #: Number of rows in sheet. A row index is in ``range(thesheet.nrows)``. 
+ nrows = 0 + + #: Nominal number of columns in sheet. It is one more than the maximum + #: column index found, ignoring trailing empty cells. + #: See also the ``ragged_rows`` parameter to :func:`~xlrd.open_workbook` + #: and :meth:`~xlrd.sheet.Sheet.row_len`. + ncols = 0 + + + #: The map from a column index to a :class:`Colinfo` object. Often there is + #: an entry in ``COLINFO`` records for all column indexes in ``range(257)``. + #: + #: .. note:: + #: xlrd ignores the entry for the non-existent + #: 257th column. + #: + #: On the other hand, there may be no entry for unused columns. + #: + #: .. versionadded:: 0.6.1 + #: + #: Populated only if ``open_workbook(..., formatting_info=True)`` + colinfo_map = {} + + #: The map from a row index to a :class:`Rowinfo` object. + #: + #: ..note:: + #: It is possible to have missing entries -- at least one source of + #: XLS files doesn't bother writing ``ROW`` records. + #: + #: .. versionadded:: 0.6.1 + #: + #: Populated only if ``open_workbook(..., formatting_info=True)`` + rowinfo_map = {} + + #: List of address ranges of cells containing column labels. + #: These are set up in Excel by Insert > Name > Labels > Columns. + #: + #: .. versionadded:: 0.6.0 + #: + #: How to deconstruct the list: + #: + #: .. code-block:: python + #: + #: for crange in thesheet.col_label_ranges: + #: rlo, rhi, clo, chi = crange + #: for rx in xrange(rlo, rhi): + #: for cx in xrange(clo, chi): + #: print "Column label at (rowx=%d, colx=%d) is %r" \ + #: (rx, cx, thesheet.cell_value(rx, cx)) + col_label_ranges = [] + + #: List of address ranges of cells containing row labels. + #: For more details, see :attr:`col_label_ranges`. + #: + #: .. versionadded:: 0.6.0 + row_label_ranges = [] + + #: List of address ranges of cells which have been merged. + #: These are set up in Excel by Format > Cells > Alignment, then ticking + #: the "Merge cells" box. + #: + #: .. note:: + #: The upper limits are exclusive: i.e. ``[2, 3, 7, 9]`` only + #: spans two cells. + #: + #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. versionadded:: 0.6.1 + #: + #: How to deconstruct the list: + #: + #: .. code-block:: python + #: + #: for crange in thesheet.merged_cells: + #: rlo, rhi, clo, chi = crange + #: for rowx in xrange(rlo, rhi): + #: for colx in xrange(clo, chi): + #: # cell (rlo, clo) (the top left one) will carry the data + #: # and formatting info; the remainder will be recorded as + #: # blank cells, but a renderer will apply the formatting info + #: # for the top left cell (e.g. border, pattern) to all cells in + #: # the range. + merged_cells = [] + + #: Mapping of ``(rowx, colx)`` to list of ``(offset, font_index)`` tuples. + #: The offset defines where in the string the font begins to be used. + #: Offsets are expected to be in ascending order. + #: If the first offset is not zero, the meaning is that the cell's ``XF``'s + #: font should be used from offset 0. + #: + #: This is a sparse mapping. There is no entry for cells that are not + #: formatted with rich text. + #: + #: How to use: + #: + #: .. code-block:: python + #: + #: runlist = thesheet.rich_text_runlist_map.get((rowx, colx)) + #: if runlist: + #: for offset, font_index in runlist: + #: # do work here. + #: pass + #: + #: .. versionadded:: 0.7.2 + #: + #: Populated only if ``open_workbook(..., formatting_info=True)`` + rich_text_runlist_map = {} + + #: Default column width from ``DEFCOLWIDTH`` record, else ``None``. 
+ #: From the OOo docs: + #: + #: Column width in characters, using the width of the zero character + #: from default font (first FONT record in the file). Excel adds some + #: extra space to the default width, depending on the default font and + #: default font size. The algorithm how to exactly calculate the resulting + #: column width is not known. + #: Example: The default width of 8 set in this record results in a column + #: width of 8.43 using Arial font with a size of 10 points. + #: + #: For the default hierarchy, refer to the :class:`Colinfo` class. + #: + #: .. versionadded:: 0.6.1 + defcolwidth = None + + #: Default column width from ``STANDARDWIDTH`` record, else ``None``. + #: + #: From the OOo docs: + #: + #: Default width of the columns in 1/256 of the width of the zero + #: character, using default font (first FONT record in the file). + #: + #: For the default hierarchy, refer to the :class:`Colinfo` class. + #: + #: .. versionadded:: 0.6.1 + standardwidth = None + + #: Default value to be used for a row if there is + #: no ``ROW`` record for that row. + #: From the *optional* ``DEFAULTROWHEIGHT`` record. + default_row_height = None + + #: Default value to be used for a row if there is + #: no ``ROW`` record for that row. + #: From the *optional* ``DEFAULTROWHEIGHT`` record. + default_row_height_mismatch = None + + #: Default value to be used for a row if there is + #: no ``ROW`` record for that row. + #: From the *optional* ``DEFAULTROWHEIGHT`` record. + default_row_hidden = None + + #: Default value to be used for a row if there is + #: no ``ROW`` record for that row. + #: From the *optional* ``DEFAULTROWHEIGHT`` record. + default_additional_space_above = None + + #: Default value to be used for a row if there is + #: no ``ROW`` record for that row. + #: From the *optional* ``DEFAULTROWHEIGHT`` record. + default_additional_space_below = None + + #: Visibility of the sheet:: + #: + #: 0 = visible + #: 1 = hidden (can be unhidden by user -- Format -> Sheet -> Unhide) + #: 2 = "very hidden" (can be unhidden only by VBA macro). + visibility = 0 + + #: A 256-element tuple corresponding to the contents of the GCW record for + #: this sheet. If no such record, treat as all bits zero. + #: Applies to BIFF4-7 only. See docs of the :class:`Colinfo` class for + #: discussion. + gcw = (0, ) * 256 + + #: A list of :class:`Hyperlink` objects corresponding to ``HLINK`` records + #: found in the worksheet. + #: + #: .. versionadded:: 0.7.2 + hyperlink_list = [] + + #: A sparse mapping from ``(rowx, colx)`` to an item in + #: :attr:`~xlrd.sheet.Sheet.hyperlink_list`. + #: Cells not covered by a hyperlink are not mapped. + #: It is possible using the Excel UI to set up a hyperlink that + #: covers a larger-than-1x1 rectangle of cells. + #: Hyperlink rectangles may overlap (Excel doesn't check). + #: When a multiply-covered cell is clicked on, the hyperlink that is + #: activated + #: (and the one that is mapped here) is the last in + #: :attr:`~xlrd.sheet.Sheet.hyperlink_list`. + #: + #: .. versionadded:: 0.7.2 + hyperlink_map = {} + + #: A sparse mapping from ``(rowx, colx)`` to a :class:`Note` object. + #: Cells not containing a note ("comment") are not mapped. + #: + #: .. 
versionadded:: 0.7.2 + cell_note_map = {} + + #: Number of columns in left pane (frozen panes; for split panes, see + #: comments in code) + vert_split_pos = 0 + + #: Number of rows in top pane (frozen panes; for split panes, see comments + #: in code) + horz_split_pos = 0 + + #: Index of first visible row in bottom frozen/split pane + horz_split_first_visible = 0 + + #: Index of first visible column in right frozen/split pane + vert_split_first_visible = 0 + + #: Frozen panes: ignore it. Split panes: explanation and diagrams in + #: OOo docs. + split_active_pane = 0 + + #: Boolean specifying if a ``PANE`` record was present, ignore unless you're + #: ``xlutils.copy`` + has_pane_record = 0 + + #: A list of the horizontal page breaks in this sheet. + #: Breaks are tuples in the form + #: ``(index of row after break, start col index, end col index)``. + #: + #: Populated only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. versionadded:: 0.7.2 + horizontal_page_breaks = [] + + #: A list of the vertical page breaks in this sheet. + #: Breaks are tuples in the form + #: ``(index of col after break, start row index, end row index)``. + #: + #: Populated only if ``open_workbook(..., formatting_info=True)`` + #: + #: .. versionadded:: 0.7.2 + vertical_page_breaks = [] + + def __init__(self, book, position, name, number): + self.book = book + self.biff_version = book.biff_version + self._position = position + self.logfile = book.logfile + self.bt = array('B', [XL_CELL_EMPTY]) + self.bf = array('h', [-1]) + self.name = name + self.number = number + self.verbosity = book.verbosity + self.formatting_info = book.formatting_info + self.ragged_rows = book.ragged_rows + if self.ragged_rows: + self.put_cell = self.put_cell_ragged + else: + self.put_cell = self.put_cell_unragged + self._xf_index_to_xl_type_map = book._xf_index_to_xl_type_map + self.nrows = 0 # actual, including possibly empty cells + self.ncols = 0 + self._maxdatarowx = -1 # highest rowx containing a non-empty cell + self._maxdatacolx = -1 # highest colx containing a non-empty cell + self._dimnrows = 0 # as per DIMENSIONS record + self._dimncols = 0 + self._cell_values = [] + self._cell_types = [] + self._cell_xf_indexes = [] + self.defcolwidth = None + self.standardwidth = None + self.default_row_height = None + self.default_row_height_mismatch = 0 + self.default_row_hidden = 0 + self.default_additional_space_above = 0 + self.default_additional_space_below = 0 + self.colinfo_map = {} + self.rowinfo_map = {} + self.col_label_ranges = [] + self.row_label_ranges = [] + self.merged_cells = [] + self.rich_text_runlist_map = {} + self.horizontal_page_breaks = [] + self.vertical_page_breaks = [] + self._xf_index_stats = [0, 0, 0, 0] + self.visibility = book._sheet_visibility[number] # from BOUNDSHEET record + for attr, defval in _WINDOW2_options: + setattr(self, attr, defval) + self.first_visible_rowx = 0 + self.first_visible_colx = 0 + self.gridline_colour_index = 0x40 + self.gridline_colour_rgb = None # pre-BIFF8 + self.hyperlink_list = [] + self.hyperlink_map = {} + self.cell_note_map = {} + + # Values calculated by xlrd to predict the mag factors that + # will actually be used by Excel to display your worksheet. + # Pass these values to xlwt when writing XLS files. + # Warning 1: Behaviour of OOo Calc and Gnumeric has been observed to differ from Excel's. + # Warning 2: A value of zero means almost exactly what it says. Your sheet will be + # displayed as a very tiny speck on the screen. 
xlwt will reject attempts to set + # a mag_factor that is not (10 <= mag_factor <= 400). + self.cooked_page_break_preview_mag_factor = 60 + self.cooked_normal_view_mag_factor = 100 + + # Values (if any) actually stored on the XLS file + self.cached_page_break_preview_mag_factor = 0 # default (60%), from WINDOW2 record + self.cached_normal_view_mag_factor = 0 # default (100%), from WINDOW2 record + self.scl_mag_factor = None # from SCL record + + self._ixfe = None # BIFF2 only + self._cell_attr_to_xfx = {} # BIFF2.0 only + + if self.biff_version >= 80: + self.utter_max_rows = 65536 + else: + self.utter_max_rows = 16384 + self.utter_max_cols = 256 + + self._first_full_rowx = -1 + + # self._put_cell_exceptions = 0 + # self._put_cell_row_widenings = 0 + # self._put_cell_rows_appended = 0 + # self._put_cell_cells_appended = 0 + + def cell(self, rowx, colx): + """ + :class:`Cell` object in the given row and column. + """ + if self.formatting_info: + xfx = self.cell_xf_index(rowx, colx) + else: + xfx = None + return Cell( + self._cell_types[rowx][colx], + self._cell_values[rowx][colx], + xfx, + ) + + def cell_value(self, rowx, colx): + "Value of the cell in the given row and column." + return self._cell_values[rowx][colx] + + def cell_type(self, rowx, colx): + """ + Type of the cell in the given row and column. + + Refer to the documentation of the :class:`Cell` class. + """ + return self._cell_types[rowx][colx] + + def cell_xf_index(self, rowx, colx): + """ + XF index of the cell in the given row and column. + This is an index into :attr:`~xlrd.book.Book.xf_list`. + + .. versionadded:: 0.6.1 + """ + self.req_fmt_info() + xfx = self._cell_xf_indexes[rowx][colx] + if xfx > -1: + self._xf_index_stats[0] += 1 + return xfx + # Check for a row xf_index + try: + xfx = self.rowinfo_map[rowx].xf_index + if xfx > -1: + self._xf_index_stats[1] += 1 + return xfx + except KeyError: + pass + # Check for a column xf_index + try: + xfx = self.colinfo_map[colx].xf_index + if xfx == -1: xfx = 15 + self._xf_index_stats[2] += 1 + return xfx + except KeyError: + # If all else fails, 15 is used as hardwired global default xf_index. + self._xf_index_stats[3] += 1 + return 15 + + def row_len(self, rowx): + """ + Returns the effective number of cells in the given row. For use with + ``open_workbook(ragged_rows=True)`` which is likely to produce rows + with fewer than :attr:`~Sheet.ncols` cells. + + .. versionadded:: 0.7.2 + """ + return len(self._cell_values[rowx]) + + def row(self, rowx): + """ + Returns a sequence of the :class:`Cell` objects in the given row. + """ + return [ + self.cell(rowx, colx) + for colx in xrange(len(self._cell_values[rowx])) + ] + + def get_rows(self): + "Returns a generator for iterating through each row." + return (self.row(index) for index in range(self.nrows)) + + def row_types(self, rowx, start_colx=0, end_colx=None): + """ + Returns a slice of the types of the cells in the given row. + """ + if end_colx is None: + return self._cell_types[rowx][start_colx:] + return self._cell_types[rowx][start_colx:end_colx] + + def row_values(self, rowx, start_colx=0, end_colx=None): + """ + Returns a slice of the values of the cells in the given row. + """ + if end_colx is None: + return self._cell_values[rowx][start_colx:] + return self._cell_values[rowx][start_colx:end_colx] + + def row_slice(self, rowx, start_colx=0, end_colx=None): + """ + Returns a slice of the :class:`Cell` objects in the given row. 
+ """ + nc = len(self._cell_values[rowx]) + if start_colx < 0: + start_colx += nc + if start_colx < 0: + start_colx = 0 + if end_colx is None or end_colx > nc: + end_colx = nc + elif end_colx < 0: + end_colx += nc + return [ + self.cell(rowx, colx) + for colx in xrange(start_colx, end_colx) + ] + + def col_slice(self, colx, start_rowx=0, end_rowx=None): + """ + Returns a slice of the :class:`Cell` objects in the given column. + """ + nr = self.nrows + if start_rowx < 0: + start_rowx += nr + if start_rowx < 0: + start_rowx = 0 + if end_rowx is None or end_rowx > nr: + end_rowx = nr + elif end_rowx < 0: + end_rowx += nr + return [ + self.cell(rowx, colx) + for rowx in xrange(start_rowx, end_rowx) + ] + + def col_values(self, colx, start_rowx=0, end_rowx=None): + """ + Returns a slice of the values of the cells in the given column. + """ + nr = self.nrows + if start_rowx < 0: + start_rowx += nr + if start_rowx < 0: + start_rowx = 0 + if end_rowx is None or end_rowx > nr: + end_rowx = nr + elif end_rowx < 0: + end_rowx += nr + return [ + self._cell_values[rowx][colx] + for rowx in xrange(start_rowx, end_rowx) + ] + + def col_types(self, colx, start_rowx=0, end_rowx=None): + """ + Returns a slice of the types of the cells in the given column. + """ + nr = self.nrows + if start_rowx < 0: + start_rowx += nr + if start_rowx < 0: + start_rowx = 0 + if end_rowx is None or end_rowx > nr: + end_rowx = nr + elif end_rowx < 0: + end_rowx += nr + return [ + self._cell_types[rowx][colx] + for rowx in xrange(start_rowx, end_rowx) + ] + + col = col_slice + + # === Following methods are used in building the worksheet. + # === They are not part of the API. + + def tidy_dimensions(self): + if self.verbosity >= 3: + fprintf( + self.logfile, + "tidy_dimensions: nrows=%d ncols=%d \n", + self.nrows, self.ncols, + ) + if 1 and self.merged_cells: + nr = nc = 0 + umaxrows = self.utter_max_rows + umaxcols = self.utter_max_cols + for crange in self.merged_cells: + rlo, rhi, clo, chi = crange + if not (0 <= rlo < rhi <= umaxrows) or not (0 <= clo < chi <= umaxcols): + fprintf(self.logfile, + "*** WARNING: sheet #%d (%r), MERGEDCELLS bad range %r\n", + self.number, self.name, crange) + if rhi > nr: nr = rhi + if chi > nc: nc = chi + if nc > self.ncols: + self.ncols = nc + self._first_full_rowx = -2 + if nr > self.nrows: + # we put one empty cell at (nr-1,0) to make sure + # we have the right number of rows. The ragged rows + # will sort out the rest if needed. 
+ self.put_cell(nr-1, 0, XL_CELL_EMPTY, UNICODE_LITERAL(''), -1) + if (self.verbosity >= 1 and + (self.nrows != self._dimnrows or self.ncols != self._dimncols)): + fprintf( + self.logfile, + "NOTE *** sheet %d (%r): DIMENSIONS R,C = %d,%d should be %d,%d\n", + self.number, + self.name, + self._dimnrows, + self._dimncols, + self.nrows, + self.ncols, + ) + if not self.ragged_rows: + # fix ragged rows + ncols = self.ncols + s_cell_types = self._cell_types + s_cell_values = self._cell_values + s_cell_xf_indexes = self._cell_xf_indexes + s_fmt_info = self.formatting_info + # for rowx in xrange(self.nrows): + if self._first_full_rowx == -2: + ubound = self.nrows + else: + ubound = self._first_full_rowx + for rowx in xrange(ubound): + trow = s_cell_types[rowx] + rlen = len(trow) + nextra = ncols - rlen + if nextra > 0: + s_cell_values[rowx][rlen:] = [UNICODE_LITERAL('')] * nextra + trow[rlen:] = self.bt * nextra + if s_fmt_info: + s_cell_xf_indexes[rowx][rlen:] = self.bf * nextra + + def put_cell_ragged(self, rowx, colx, ctype, value, xf_index): + if ctype is None: + # we have a number, so look up the cell type + ctype = self._xf_index_to_xl_type_map[xf_index] + assert 0 <= colx < self.utter_max_cols + assert 0 <= rowx < self.utter_max_rows + fmt_info = self.formatting_info + + try: + nr = rowx + 1 + if self.nrows < nr: + + scta = self._cell_types.append + scva = self._cell_values.append + scxa = self._cell_xf_indexes.append + bt = self.bt + bf = self.bf + for _unused in xrange(self.nrows, nr): + scta(bt * 0) + scva([]) + if fmt_info: + scxa(bf * 0) + self.nrows = nr + + types_row = self._cell_types[rowx] + values_row = self._cell_values[rowx] + if fmt_info: + fmt_row = self._cell_xf_indexes[rowx] + ltr = len(types_row) + if colx >= self.ncols: + self.ncols = colx + 1 + num_empty = colx - ltr + if not num_empty: + # most common case: colx == previous colx + 1 + # self._put_cell_cells_appended += 1 + types_row.append(ctype) + values_row.append(value) + if fmt_info: + fmt_row.append(xf_index) + return + if num_empty > 0: + num_empty += 1 + # self._put_cell_row_widenings += 1 + # types_row.extend(self.bt * num_empty) + # values_row.extend([UNICODE_LITERAL('')] * num_empty) + # if fmt_info: + # fmt_row.extend(self.bf * num_empty) + types_row[ltr:] = self.bt * num_empty + values_row[ltr:] = [UNICODE_LITERAL('')] * num_empty + if fmt_info: + fmt_row[ltr:] = self.bf * num_empty + types_row[colx] = ctype + values_row[colx] = value + if fmt_info: + fmt_row[colx] = xf_index + except: + print("put_cell", rowx, colx, file=self.logfile) + raise + + def put_cell_unragged(self, rowx, colx, ctype, value, xf_index): + if ctype is None: + # we have a number, so look up the cell type + ctype = self._xf_index_to_xl_type_map[xf_index] + # assert 0 <= colx < self.utter_max_cols + # assert 0 <= rowx < self.utter_max_rows + try: + self._cell_types[rowx][colx] = ctype + self._cell_values[rowx][colx] = value + if self.formatting_info: + self._cell_xf_indexes[rowx][colx] = xf_index + except IndexError: + # print >> self.logfile, "put_cell extending", rowx, colx + # self.extend_cells(rowx+1, colx+1) + # self._put_cell_exceptions += 1 + nr = rowx + 1 + nc = colx + 1 + assert 1 <= nc <= self.utter_max_cols + assert 1 <= nr <= self.utter_max_rows + if nc > self.ncols: + self.ncols = nc + # The row self._first_full_rowx and all subsequent rows + # are guaranteed to have length == self.ncols. Thus the + # "fix ragged rows" section of the tidy_dimensions method + # doesn't need to examine them. 
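+                # (_first_full_rowx starts at -1, records the first row known to be full width,
+                # and is forced to -2 when that guarantee is broken, so that tidy_dimensions
+                # falls back to scanning every row.)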
+ if nr < self.nrows: + # cell data is not in non-descending row order *AND* + # self.ncols has been bumped up. + # This very rare case ruins this optmisation. + self._first_full_rowx = -2 + elif rowx > self._first_full_rowx > -2: + self._first_full_rowx = rowx + if nr <= self.nrows: + # New cell is in an existing row, so extend that row (if necessary). + # Note that nr < self.nrows means that the cell data + # is not in ascending row order!! + trow = self._cell_types[rowx] + nextra = self.ncols - len(trow) + if nextra > 0: + # self._put_cell_row_widenings += 1 + trow.extend(self.bt * nextra) + if self.formatting_info: + self._cell_xf_indexes[rowx].extend(self.bf * nextra) + self._cell_values[rowx].extend([UNICODE_LITERAL('')] * nextra) + else: + scta = self._cell_types.append + scva = self._cell_values.append + scxa = self._cell_xf_indexes.append + fmt_info = self.formatting_info + nc = self.ncols + bt = self.bt + bf = self.bf + for _unused in xrange(self.nrows, nr): + # self._put_cell_rows_appended += 1 + scta(bt * nc) + scva([UNICODE_LITERAL('')] * nc) + if fmt_info: + scxa(bf * nc) + self.nrows = nr + # === end of code from extend_cells() + try: + self._cell_types[rowx][colx] = ctype + self._cell_values[rowx][colx] = value + if self.formatting_info: + self._cell_xf_indexes[rowx][colx] = xf_index + except: + print("put_cell", rowx, colx, file=self.logfile) + raise + except: + print("put_cell", rowx, colx, file=self.logfile) + raise + + + # === Methods after this line neither know nor care about how cells are stored. + + def read(self, bk): + global rc_stats + DEBUG = 0 + blah = DEBUG or self.verbosity >= 2 + blah_rows = DEBUG or self.verbosity >= 4 + blah_formulas = 0 and blah + r1c1 = 0 + oldpos = bk._position + bk._position = self._position + XL_SHRFMLA_ETC_ETC = ( + XL_SHRFMLA, XL_ARRAY, XL_TABLEOP, XL_TABLEOP2, + XL_ARRAY2, XL_TABLEOP_B2, + ) + self_put_cell = self.put_cell + local_unpack = unpack + bk_get_record_parts = bk.get_record_parts + bv = self.biff_version + fmt_info = self.formatting_info + do_sst_rich_text = fmt_info and bk._rich_text_runlist_map + rowinfo_sharing_dict = {} + txos = {} + eof_found = 0 + while 1: + # if DEBUG: print "SHEET.READ: about to read from position %d" % bk._position + rc, data_len, data = bk_get_record_parts() + # if rc in rc_stats: + # rc_stats[rc] += 1 + # else: + # rc_stats[rc] = 1 + # if DEBUG: print "SHEET.READ: op 0x%04x, %d bytes %r" % (rc, data_len, data) + if rc == XL_NUMBER: + # [:14] in following stmt ignores extraneous rubbish at end of record. + # Sample file testEON-8.xls supplied by Jan Kraus. 
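+                # (A NUMBER record is row (H), column (H), XF index (H) and an IEEE double (d),
+                # i.e. a little-endian '<HHHd' struct occupying those first 14 bytes.)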
+ rowx, colx, xf_index, d = local_unpack('> 15) & 1 + r.outline_level = bits2 & 7 + r.outline_group_starts_ends = (bits2 >> 4) & 1 + r.hidden = (bits2 >> 5) & 1 + r.height_mismatch = (bits2 >> 6) & 1 + r.has_default_xf_index = (bits2 >> 7) & 1 + r.xf_index = (bits2 >> 16) & 0xfff + r.additional_space_above = (bits2 >> 28) & 1 + r.additional_space_below = (bits2 >> 29) & 1 + if not r.has_default_xf_index: + r.xf_index = -1 + self.rowinfo_map[rowx] = r + if 0 and r.xf_index > -1: + fprintf(self.logfile, + "**ROW %d %d %d\n", + self.number, rowx, r.xf_index) + if blah_rows: + print('ROW', rowx, bits1, bits2, file=self.logfile) + r.dump(self.logfile, + header="--- sh #%d, rowx=%d ---" % (self.number, rowx)) + elif rc in XL_FORMULA_OPCODES: # 06, 0206, 0406 + # DEBUG = 1 + # if DEBUG: print "FORMULA: rc: 0x%04x data: %r" % (rc, data) + if bv >= 50: + rowx, colx, xf_index, result_str, flags = local_unpack('= 30: + rowx, colx, xf_index, result_str, flags = local_unpack(' 255: break # Excel does 0 to 256 inclusive + self.colinfo_map[colx] = c + if 0: + fprintf(self.logfile, + "**COL %d %d %d\n", + self.number, colx, c.xf_index) + if blah: + fprintf( + self.logfile, + "COLINFO sheet #%d cols %d-%d: wid=%d xf_index=%d flags=0x%04x\n", + self.number, first_colx, last_colx, c.width, c.xf_index, flags, + ) + c.dump(self.logfile, header='===') + elif rc == XL_DEFCOLWIDTH: + self.defcolwidth, = local_unpack(">= 1 + self.gcw = tuple(gcw) + if 0: + showgcw = "".join(map(lambda x: "F "[x], gcw)).rstrip().replace(' ', '.') + print("GCW:", showgcw, file=self.logfile) + elif rc == XL_BLANK: + if not fmt_info: continue + rowx, colx, xf_index = local_unpack('> self.logfile, "BLANK", rowx, colx, xf_index + self_put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) + elif rc == XL_MULBLANK: # 00BE + if not fmt_info: continue + nitems = data_len >> 1 + result = local_unpack("<%dH" % nitems, data) + rowx, mul_first = result[:2] + mul_last = result[-1] + # print >> self.logfile, "MULBLANK", rowx, mul_first, mul_last, data_len, nitems, mul_last + 4 - mul_first + assert nitems == mul_last + 4 - mul_first + pos = 2 + for colx in xrange(mul_first, mul_last + 1): + self_put_cell(rowx, colx, XL_CELL_BLANK, '', result[pos]) + pos += 1 + elif rc == XL_DIMENSION or rc == XL_DIMENSION2: + if data_len == 0: + # Four zero bytes after some other record. See github issue 64. + continue + # if data_len == 10: + # Was crashing on BIFF 4.0 file w/o the two trailing unused bytes. + # Reported by Ralph Heimburger. 
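+                    # (DIMENSIONS carries the sheet's declared row/column bounds -- 16-bit row
+                    # counts in BIFF versions before 8, 32-bit in BIFF8; they end up in
+                    # self._dimnrows and self._dimncols for the sanity check in tidy_dimensions.)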
+ if bv < 80: + dim_tuple = local_unpack(' found EOF", file=self.logfile) + elif rc == XL_COUNTRY: + bk.handle_country(data) + elif rc == XL_LABELRANGES: + pos = 0 + pos = unpack_cell_range_address_list_update_pos( + self.row_label_ranges, data, pos, bv, addr_size=8, + ) + pos = unpack_cell_range_address_list_update_pos( + self.col_label_ranges, data, pos, bv, addr_size=8, + ) + assert pos == data_len + elif rc == XL_ARRAY: + row1x, rownx, col1x, colnx, array_flags, tokslen = \ + local_unpack("= 80 + num_CFs, needs_recalc, browx1, browx2, bcolx1, bcolx2 = \ + unpack("<6H", data[0:12]) + if self.verbosity >= 1: + fprintf( + self.logfile, + "\n*** WARNING: Ignoring CONDFMT (conditional formatting) record\n" + "*** in Sheet %d (%r).\n" + "*** %d CF record(s); needs_recalc_or_redraw = %d\n" + "*** Bounding box is %s\n", + self.number, self.name, num_CFs, needs_recalc, + rangename2d(browx1, browx2+1, bcolx1, bcolx2+1), + ) + olist = [] # updated by the function + pos = unpack_cell_range_address_list_update_pos( + olist, data, 12, bv, addr_size=8) + # print >> self.logfile, repr(result), len(result) + if self.verbosity >= 1: + fprintf( + self.logfile, + "*** %d individual range(s):\n" + "*** %s\n", + len(olist), + ", ".join(rangename2d(*coords) for coords in olist), + ) + elif rc == XL_CF: + if not fmt_info: continue + cf_type, cmp_op, sz1, sz2, flags = unpack("> 26) & 1 + bord_block = (flags >> 28) & 1 + patt_block = (flags >> 29) & 1 + if self.verbosity >= 1: + fprintf( + self.logfile, + "\n*** WARNING: Ignoring CF (conditional formatting) sub-record.\n" + "*** cf_type=%d, cmp_op=%d, sz1=%d, sz2=%d, flags=0x%08x\n" + "*** optional data blocks: font=%d, border=%d, pattern=%d\n", + cf_type, cmp_op, sz1, sz2, flags, + font_block, bord_block, patt_block, + ) + # hex_char_dump(data, 0, data_len, fout=self.logfile) + pos = 12 + if font_block: + (font_height, font_options, weight, escapement, underline, + font_colour_index, two_bits, font_esc, font_underl) = unpack("<64x i i H H B 3x i 4x i i i 18x", data[pos:pos+118]) + font_style = (two_bits > 1) & 1 + posture = (font_options > 1) & 1 + font_canc = (two_bits > 7) & 1 + cancellation = (font_options > 7) & 1 + if self.verbosity >= 1: + fprintf( + self.logfile, + "*** Font info: height=%d, weight=%d, escapement=%d,\n" + "*** underline=%d, colour_index=%d, esc=%d, underl=%d,\n" + "*** style=%d, posture=%d, canc=%d, cancellation=%d\n", + font_height, weight, escapement, underline, + font_colour_index, font_esc, font_underl, + font_style, posture, font_canc, cancellation, + ) + pos += 118 + if bord_block: + pos += 8 + if patt_block: + pos += 4 + fmla1 = data[pos:pos+sz1] + pos += sz1 + if blah and sz1: + fprintf(self.logfile, "*** formula 1:\n") + dump_formula(bk, fmla1, sz1, bv, reldelta=0, blah=1) + fmla2 = data[pos:pos+sz2] + pos += sz2 + assert pos == data_len + if blah and sz2: + fprintf(self.logfile, "*** formula 2:\n") + dump_formula(bk, fmla2, sz2, bv, reldelta=0, blah=1) + elif rc == XL_DEFAULTROWHEIGHT: + if data_len == 4: + bits, self.default_row_height = unpack("> 1) & 1 + self.default_additional_space_above = (bits >> 2) & 1 + self.default_additional_space_below = (bits >> 3) & 1 + elif rc == XL_MERGEDCELLS: + if not fmt_info: continue + pos = unpack_cell_range_address_list_update_pos( + self.merged_cells, data, 0, bv, addr_size=8) + if blah: + fprintf(self.logfile, + "MERGEDCELLS: %d ranges\n", (pos - 2) // 8) + assert pos == data_len, \ + "MERGEDCELLS: pos=%d data_len=%d" % (pos, data_len) + elif rc == XL_WINDOW2: + if bv >= 80 and 
data_len >= 14: + ( + options, + self.first_visible_rowx, self.first_visible_colx, + self.gridline_colour_index, + self.cached_page_break_preview_mag_factor, + self.cached_normal_view_mag_factor + ) = unpack("= 30 # BIFF3-7 + ( + options, + self.first_visible_rowx, self.first_visible_colx, + ) = unpack(">= 1 + elif rc == XL_SCL: + num, den = unpack("= 0: + print( + "WARNING *** SCL rcd sheet %d: should have 0.1 <= num/den <= 4; got %d/%d" + % (self.number, num, den), + file=self.logfile, + ) + result = 100 + self.scl_mag_factor = result + elif rc == XL_PANE: + ( + self.vert_split_pos, + self.horz_split_pos, + self.horz_split_first_visible, + self.vert_split_first_visible, + self.split_active_pane, + ) = unpack("= 80)) + 2 == data_len + pos = 2 + if bv < 80: + while pos < data_len: + self.horizontal_page_breaks.append((local_unpack("= 80)) + 2 == data_len + pos = 2 + if bv < 80: + while pos < data_len: + self.vertical_page_breaks.append((local_unpack("> 15) & 1 + r.has_default_xf_index = bits2 & 1 + r.xf_index = xf_index + # r.outline_level = 0 # set in __init__ + # r.outline_group_starts_ends = 0 # set in __init__ + # r.hidden = 0 # set in __init__ + # r.height_mismatch = 0 # set in __init__ + # r.additional_space_above = 0 # set in __init__ + # r.additional_space_below = 0 # set in __init__ + self.rowinfo_map[rowx] = r + if 0 and r.xf_index > -1: + fprintf(self.logfile, + "**ROW %d %d %d\n", + self.number, rowx, r.xf_index) + if blah_rows: + print('ROW_B2', rowx, bits1, file=self.logfile) + r.dump(self.logfile, + header="--- sh #%d, rowx=%d ---" % (self.number, rowx)) + elif rc == XL_COLWIDTH: # BIFF2 only + if not fmt_info: continue + first_colx, last_colx, width\ + = local_unpack("= 30) + 1 + nchars_expected = unpack("<" + "BH"[lenlen - 1], data[:lenlen])[0] + offset = lenlen + if bv < 80: + enc = bk.encoding or bk.derive_encoding() + nchars_found = 0 + result = UNICODE_LITERAL("") + while 1: + if bv >= 80: + flag = BYTES_ORD(data[offset]) & 1 + enc = ("latin_1", "utf_16_le")[flag] + offset += 1 + chunk = unicode(data[offset:], enc) + result += chunk + nchars_found += len(chunk) + if nchars_found == nchars_expected: + return result + if nchars_found > nchars_expected: + msg = ("STRING/CONTINUE: expected %d chars, found %d" + % (nchars_expected, nchars_found)) + raise XLRDError(msg) + rc, _unused_len, data = bk.get_record_parts() + if rc != XL_CONTINUE: + raise XLRDError( + "Expected CONTINUE record; found record-type 0x%04X" % rc) + offset = 0 + + def update_cooked_mag_factors(self): + # Cached values are used ONLY for the non-active view mode. + # When the user switches to the non-active view mode, + # if the cached value for that mode is not valid, + # Excel pops up a window which says: + # "The number must be between 10 and 400. Try again by entering a number in this range." + # When the user hits OK, it drops into the non-active view mode + # but uses the magn from the active mode. + # NOTE: definition of "valid" depends on mode ... 
see below + blah = DEBUG or self.verbosity > 0 + if self.show_in_page_break_preview: + if self.scl_mag_factor is None: # no SCL record + self.cooked_page_break_preview_mag_factor = 100 # Yes, 100, not 60, NOT a typo + else: + self.cooked_page_break_preview_mag_factor = self.scl_mag_factor + zoom = self.cached_normal_view_mag_factor + if not (10 <= zoom <=400): + if blah: + print( + "WARNING *** WINDOW2 rcd sheet %d: Bad cached_normal_view_mag_factor: %d" + % (self.number, self.cached_normal_view_mag_factor), + file=self.logfile, + ) + zoom = self.cooked_page_break_preview_mag_factor + self.cooked_normal_view_mag_factor = zoom + else: + # normal view mode + if self.scl_mag_factor is None: # no SCL record + self.cooked_normal_view_mag_factor = 100 + else: + self.cooked_normal_view_mag_factor = self.scl_mag_factor + zoom = self.cached_page_break_preview_mag_factor + if not zoom: + # VALID, defaults to 60 + zoom = 60 + elif not (10 <= zoom <= 400): + if blah: + print( + "WARNING *** WINDOW2 rcd sheet %r: Bad cached_page_break_preview_mag_factor: %r" + % (self.number, self.cached_page_break_preview_mag_factor), + file=self.logfile, + ) + zoom = self.cooked_normal_view_mag_factor + self.cooked_page_break_preview_mag_factor = zoom + + def fixed_BIFF2_xfindex(self, cell_attr, rowx, colx, true_xfx=None): + DEBUG = 0 + blah = DEBUG or self.verbosity >= 2 + if self.biff_version == 21: + if self.book.xf_list: + if true_xfx is not None: + xfx = true_xfx + else: + xfx = BYTES_ORD(cell_attr[0]) & 0x3F + if xfx == 0x3F: + if self._ixfe is None: + raise XLRDError("BIFF2 cell record has XF index 63 but no preceding IXFE record.") + xfx = self._ixfe + # OOo docs are capable of interpretation that each + # cell record is preceded immediately by its own IXFE record. + # Empirical evidence is that (sensibly) an IXFE record applies to all + # following cell records until another IXFE comes along. + return xfx + # Have either Excel 2.0, or broken 2.1 w/o XF records -- same effect. + self.biff_version = self.book.biff_version = 20 + #### check that XF slot in cell_attr is zero + xfx_slot = BYTES_ORD(cell_attr[0]) & 0x3F + assert xfx_slot == 0 + xfx = self._cell_attr_to_xfx.get(cell_attr) + if xfx is not None: + return xfx + if blah: + fprintf(self.logfile, "New cell_attr %r at (%r, %r)\n", cell_attr, rowx, colx) + if not self.book.xf_list: + for xfx in xrange(16): + self.insert_new_BIFF20_xf(cell_attr=b"\x40\x00\x00", style=xfx < 15) + xfx = self.insert_new_BIFF20_xf(cell_attr=cell_attr) + return xfx + + def insert_new_BIFF20_xf(self, cell_attr, style=0): + DEBUG = 0 + blah = DEBUG or self.verbosity >= 2 + book = self.book + xfx = len(book.xf_list) + xf = self.fake_XF_from_BIFF20_cell_attr(cell_attr, style) + xf.xf_index = xfx + book.xf_list.append(xf) + if blah: + xf.dump(self.logfile, header="=== Faked XF %d ===" % xfx, footer="======") + if xf.format_key not in book.format_map: + if xf.format_key: + msg = "ERROR *** XF[%d] unknown format key (%d, 0x%04x)\n" + fprintf(self.logfile, msg, + xf.xf_index, xf.format_key, xf.format_key) + fmt = Format(xf.format_key, FUN, UNICODE_LITERAL("General")) + book.format_map[xf.format_key] = fmt + book.format_list.append(fmt) + cellty_from_fmtty = { + FNU: XL_CELL_NUMBER, + FUN: XL_CELL_NUMBER, + FGE: XL_CELL_NUMBER, + FDT: XL_CELL_DATE, + FTX: XL_CELL_NUMBER, # Yes, a number can be formatted as text. 
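+            # (This fmt-type -> cell-type map feeds _xf_index_to_xl_type_map, which the
+            # put_cell_* methods consult for numeric cell records: such records carry no
+            # explicit date flag, so the attached number format decides DATE vs NUMBER.)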
+ } + fmt = book.format_map[xf.format_key] + cellty = cellty_from_fmtty[fmt.type] + self._xf_index_to_xl_type_map[xf.xf_index] = cellty + self._cell_attr_to_xfx[cell_attr] = xfx + return xfx + + def fake_XF_from_BIFF20_cell_attr(self, cell_attr, style=0): + from .formatting import XF, XFAlignment, XFBorder, XFBackground, XFProtection + xf = XF() + xf.alignment = XFAlignment() + xf.alignment.indent_level = 0 + xf.alignment.shrink_to_fit = 0 + xf.alignment.text_direction = 0 + xf.border = XFBorder() + xf.border.diag_up = 0 + xf.border.diag_down = 0 + xf.border.diag_colour_index = 0 + xf.border.diag_line_style = 0 # no line + xf.background = XFBackground() + xf.protection = XFProtection() + (prot_bits, font_and_format, halign_etc) = unpack('> 6 + upkbits(xf.protection, prot_bits, ( + (6, 0x40, 'cell_locked'), + (7, 0x80, 'formula_hidden'), + )) + xf.alignment.hor_align = halign_etc & 0x07 + for mask, side in ((0x08, 'left'), (0x10, 'right'), (0x20, 'top'), (0x40, 'bottom')): + if halign_etc & mask: + colour_index, line_style = 8, 1 # black, thin + else: + colour_index, line_style = 0, 0 # none, none + setattr(xf.border, side + '_colour_index', colour_index) + setattr(xf.border, side + '_line_style', line_style) + bg = xf.background + if halign_etc & 0x80: + bg.fill_pattern = 17 + else: + bg.fill_pattern = 0 + bg.background_colour_index = 9 # white + bg.pattern_colour_index = 8 # black + xf.parent_style_index = (0x0FFF, 0)[style] + xf.alignment.vert_align = 2 # bottom + xf.alignment.rotation = 0 + attr_stems = [ + 'format', + 'font', + 'alignment', + 'border', + 'background', + 'protection', + ] + for attr_stem in attr_stems: + attr = "_" + attr_stem + "_flag" + setattr(xf, attr, 1) + return xf + + def req_fmt_info(self): + if not self.formatting_info: + raise XLRDError("Feature requires open_workbook(..., formatting_info=True)") + + def computed_column_width(self, colx): + """ + Determine column display width. + + :param colx: + Index of the queried column, range 0 to 255. + Note that it is possible to find out the width that will be used to + display columns with no cell information e.g. column IV (colx=255). + + :return: + The column width that will be used for displaying + the given column by Excel, in units of 1/256th of the width of a + standard character (the digit zero in the first font). + + .. versionadded:: 0.6.1 + """ + self.req_fmt_info() + if self.biff_version >= 80: + colinfo = self.colinfo_map.get(colx, None) + if colinfo is not None: + return colinfo.width + if self.standardwidth is not None: + return self.standardwidth + elif self.biff_version >= 40: + if self.gcw[colx]: + if self.standardwidth is not None: + return self.standardwidth + else: + colinfo = self.colinfo_map.get(colx, None) + if colinfo is not None: + return colinfo.width + elif self.biff_version == 30: + colinfo = self.colinfo_map.get(colx, None) + if colinfo is not None: + return colinfo.width + # All roads lead to Rome and the DEFCOLWIDTH ... 
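+        # (DEFCOLWIDTH is stored in whole characters, hence the * 256 scaling to the
+        # 1/256-character units this method returns; 8 characters is Excel's default
+        # when no DEFCOLWIDTH record is present.)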
+ if self.defcolwidth is not None: + return self.defcolwidth * 256 + return 8 * 256 # 8 is what Excel puts in a DEFCOLWIDTH record + + def handle_hlink(self, data): + # DEBUG = 1 + if DEBUG: print("\n=== hyperlink ===", file=self.logfile) + record_size = len(data) + h = Hyperlink() + h.frowx, h.lrowx, h.fcolx, h.lcolx, guid0, dummy, options = unpack(' 0: + fprintf( + self.logfile, + "*** WARNING: hyperlink at R%dC%d has %d extra data bytes: %s\n", + h.frowx + 1, + h.fcolx + 1, + extra_nbytes, + REPR(data[-extra_nbytes:]), + ) + # Seen: b"\x00\x00" also b"A\x00", b"V\x00" + elif extra_nbytes < 0: + raise XLRDError("Bug or corrupt file, send copy of input file for debugging") + + self.hyperlink_list.append(h) + for rowx in xrange(h.frowx, h.lrowx+1): + for colx in xrange(h.fcolx, h.lcolx+1): + self.hyperlink_map[rowx, colx] = h + + def handle_quicktip(self, data): + rcx, frowx, lrowx, fcolx, lcolx = unpack('<5H', data[:10]) + assert rcx == XL_QUICKTIP + assert self.hyperlink_list + h = self.hyperlink_list[-1] + assert (frowx, lrowx, fcolx, lcolx) == (h.frowx, h.lrowx, h.fcolx, h.lcolx) + assert data[-2:] == b'\x00\x00' + h.quicktip = unicode(data[10:-2], 'utf_16_le') + + def handle_msodrawingetc(self, recid, data_len, data): + if not OBJ_MSO_DEBUG: + return + DEBUG = 1 + if self.biff_version < 80: + return + o = MSODrawing() + pos = 0 + while pos < data_len: + tmp, fbt, cb = unpack('> 4) & 0xFFF + if ver == 0xF: + ndb = 0 # container + else: + ndb = cb + if DEBUG: + hex_char_dump(data, pos, ndb + 8, base=0, fout=self.logfile) + fprintf(self.logfile, + "fbt:0x%04X inst:%d ver:0x%X cb:%d (0x%04X)\n", + fbt, inst, ver, cb, cb) + if fbt == 0xF010: # Client Anchor + assert ndb == 18 + (o.anchor_unk, + o.anchor_colx_lo, o.anchor_rowx_lo, + o.anchor_colx_hi, o.anchor_rowx_hi) = unpack(' 0: + rc2, data2_len, data2 = self.book.get_record_parts() + assert rc2 == XL_NOTE + dummy_rowx, nb = unpack('> 1) & 1 + o.row_hidden = (option_flags >> 7) & 1 + o.col_hidden = (option_flags >> 8) & 1 + # XL97 dev kit book says NULL [sic] bytes padding between string count and string data + # to ensure that string is word-aligned. Appears to be nonsense. + o.author, endpos = unpack_unicode_update_pos(data, 8, lenlen=2) + # There is a random/undefined byte after the author string (not counted in the + # string length). + # Issue 4 on github: Google Spreadsheet doesn't write the undefined byte. + assert (data_len - endpos) in (0, 1) + if OBJ_MSO_DEBUG: + o.dump(self.logfile, header="=== Note ===", footer= " ") + txo = txos.get(o._object_id) + if txo: + o.text = txo.text + o.rich_text_runlist = txo.rich_text_runlist + self.cell_note_map[o.rowx, o.colx] = o + + def handle_txo(self, data): + if self.biff_version < 80: + return + o = MSTxo() + fmt = '